forked from jdwor/gendercitation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStep5.5_TrimPapers.R
237 lines (212 loc) · 9.85 KB
/
Step5.5_TrimPapers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
setwd("/Users/erteich/Bassett/ldi_fellows/gendercitation") # Change to your project folder path
source("HelperFunctions.R")
library(readxl); library(stringr)
library(scholar); library(rcrossref)
library(parallel)
library(pbmcapply); library(pbapply)
## load data
load("df5_articledata_matchednames.RData")
## load the list of authors whose papers you are interested in
df = data.frame(read_excel("../ldi_senior_fellows.xlsx"))
cores=detectCores()
get_middle_init = function(middle_initials) {
if(!is.na(middle_initials)){
if(str_length(middle_initials) > 1){
return(paste0(" ", substr(middle_initials,1,1),"."))
} else {
return(paste0(" ",middle_initials,"."))
}
}else{
return("")
}
}
## shamelessly taken from HelperFunctions get.preferred
last_name_noMI=function(x){
## added str_to_title because last names or first can be all upper case
if(grepl("\\.",x)==TRUE & str_to_title(x)!=toupper(x)){
name=gsub("\\."," ",x)
name=strsplit(name," ")[[1]]
## added str_to_title because last names or first can be all upper case
notup=which(str_to_title(name)!=toupper(name))
name=name[notup][1]
if(substr(name,1,1)!="-"){
return(name)
}else{
return(substr(name,2,nchar(name)))
}
}else{
return(x)
}
}
## as is this
is_normal_name=function(x){
## added str_to_title because last names or first can be all upper case
if(grepl("\\.",x)==TRUE & str_to_title(x)!=toupper(x)){
name=gsub("\\."," ",x)
name=strsplit(name," ")[[1]]
## added str_to_title because last names or first can be all upper case
notup=which(str_to_title(name)!=toupper(name))
## if the full name is after the beginning it is not "normal"
if(notup > 1){
return(FALSE)
}
}
return(TRUE)
}
compare_inits = function(target.inits, query.inits){
# if initials lengths are the same, both must match
if(str_length(query.inits)==str_length(target.inits)){
return(query.inits == target.inits)
# if there is a mismatch in length, just the first initial of each should match
}else{
return(unlist(str_split(query.inits,""))[1] == unlist(str_split(target.inits,""))[1])
}
}
compare_first = function(x, target.inits, target.preferred, target.first,
query.inits, query.preferred, query.isinits, query.first){
## if the queried name is just initials, only make sure initials match
if(query.isinits[x]){
return(compare_inits(target.inits, query.inits[x]))
}else{
## if the queried name is a full name, it must match the full name of the target
match = tolower(query.preferred[x]) == tolower(target.preferred)
## if the queried preferred name is full and the same as the queried first name, the initials must ALSO match
## otherwise, however, initials do not have to match, for cases of M.Kit matching Kit, for example.
if(is_normal_name(query.first[x]) & is_normal_name(target.first)) {
match = match & compare_inits(target.inits, query.inits[x])
}
return(match)
}
}
## operates on each individual entry of article.data$AF
match_name = function(x, ln, fn){
target.first = fn
target.inits = extract.initials(fn)
target.preferred = get.preferred(fn)
auths = unlist(str_split(x,"; "))
alllasts = unlist(lapply(1:length(auths), get.all.family, authlist=auths))
allfirsts = unlist(lapply(1:length(auths), get.all.given, authlist=auths))
## this converts M. Ulrich to Ulrich, for example, when middle initials are stuck in the last name
alllasts = unlist(lapply(alllasts,last_name_noMI))
samelast_mask = tolower(alllasts) == tolower(ln)
## if there's a matching last name, look at first names
if(sum(samelast_mask) > 0){
query.first = allfirsts[samelast_mask]
query.isinits = unlist(lapply(allfirsts[samelast_mask], is.initials))
query.inits = unlist(lapply(allfirsts[samelast_mask], extract.initials))
query.preferred = unlist(lapply(allfirsts[samelast_mask], get.preferred))
matching_mask = unlist(lapply(1:length(query.inits), compare_first, target.inits=target.inits,
target.preferred=target.preferred, target.first=target.first,
query.inits=query.inits, query.preferred=query.preferred,
query.isinits=query.isinits, query.first=query.first))
## if matching last names and matching first names exist, the name matches
if(sum(matching_mask) > 0){
return(TRUE)
}
}
## if the last names did not match or the first names did not match, then there is no matching name
return(FALSE)
}
## operates on article.data$AF collectively
## some code taken from Jason Kim's get.maps and get.ma.overrep2 functions
match_name_fast = function(all_firsts, all_lasts, AF2PA, ln, fn){
target.first = fn
target.inits = extract.initials(fn)
target.preferred = get.preferred(fn)
samelast_mask = tolower(all_lasts) == tolower(ln)
## if there's a matching last name, look at first names
if(sum(samelast_mask) > 0){
query.first = all_firsts[samelast_mask]
query.isinits = unlist(lapply(all_firsts[samelast_mask], is.initials))
query.inits = unlist(lapply(all_firsts[samelast_mask], extract.initials))
query.preferred = unlist(lapply(all_firsts[samelast_mask], get.preferred))
matching_mask = unlist(lapply(1:length(query.inits), compare_first, target.inits=target.inits,
target.preferred=target.preferred, target.first=target.first,
query.inits=query.inits, query.preferred=query.preferred,
query.isinits=query.isinits, query.first=query.first))
## if matching last names and matching first names exist, the name matches
if(sum(matching_mask) > 0){
## these are indices into all_firsts, all_lasts
inds = which(samelast_mask)[matching_mask]
## these are indices into papers
p_inds = AF2PA[inds]
return(p_inds)
}
}
## if the last names did not match or the first names did not match, then there is no matching name
return(list())
}
## Outstanding issues:
## 1. Papers written by "Mucio Delgado" are not given to fellow "M. Kit Delgado"
## 2. Papers written by last name "Fernandez Lynch" are not given to fellow "Holly Fernandez Lynch",
## since Fernandez is stored as her middle name.
## some code taken from Jason Kim's get.maps and get.ma.overrep2 functions
AF = lapply(article.data$AF,authsplit)
AFUnlist = unlist(lapply(article.data$AF,authsplit))
all_firsts = unlist(lapply(1:length(AFUnlist), get.all.given, authlist=AFUnlist))
all_lasts = unlist(lapply(1:length(AFUnlist), get.all.family, authlist=AFUnlist))
## this converts M. Ulrich to Ulrich, for example, when middle initials are stuck in the last name
all_lasts = unlist(lapply(all_lasts,last_name_noMI))
## size of author list for each paper
AFLens = lengths(AF)
## map of indices of AFUnlist, all_firsts, all_lasts to indices of papers
AF2PA = unlist(pblapply(1:length(AFLens), function(x) rep(x, AFLens[x])))
middle_inits = unlist(lapply(df$Middle.Initial, get_middle_init))
ldi_firsts = paste0(df$First.Name, middle_inits)
ldi_firsts = str_to_title(ldi_firsts[!is.na(df$Last.Name)])
ldi_lasts = df$Last.Name
ldi_lasts = str_to_title(ldi_lasts[!is.na(df$Last.Name)])
ldi_papers = vector()
for(i in 1:length(ldi_firsts)){
print(i)
## if somehow multiple valid variants of an author are on the same paper, it will repeat in the below list,
## and therefore the repeats must be removed by unique()
fast_p = unique(match_name_fast(all_firsts, all_lasts, AF2PA, ldi_lasts[i], ldi_firsts[i]))
## every so often, double check against the slower method as a sanity check
## this looked okay, so commenting out for now
# if(i%%10 == 0){
# mask_p = unlist(pbmclapply(article.data$AF, match_name, ln=ldi_lasts[i], fn=ldi_firsts[i], mc.cores=4))
# stopifnot(sum(which(mask_p) != fast_p)==0)
# }
ldi_papers = append(ldi_papers, list(fast_p))
}
# # This is too stringent... rejects articles (incorrectly) if the crossref title is a subset of the google title, for example (see DOI 10.1378/chest.14-0501)
# # Or rejects articles (incorrectly) if accidentally the wrong google scholar profile was queried.... (see Mark Pauly)
# # Running this on a dataset of 26,285 articles results in only 8,633 "kept" articles...
# compare_to_google = function(x, df, article.data, ldi_papers){
# tryCatch({
# id = get_scholar_id(last_name=df$Last.Name[x], first_name = df$First.Name[x], affiliation="University of Pennsylvania")
# pub = get_publications(id)
# ## get all DOIs in current dataset nominally belonging to the LDI fellow
# dois = article.data$DI[ldi_papers[[x]]]
# ## query crossref for article.data paper titles
# titles = cr_works(dois=dois)$data$title
# ## are the article.data papers in the google scholar profile?
# mask = str_to_title(titles)%in%str_to_title(pub$title)
# ## return only those article.data papers in the google scholar profile
# return(ldi_papers[[x]][mask])
# },error=function(cond){
# message(cond)
# ## if google scholar produced nothing, just return all the article.data papers.
# return(ldi_papers[[x]])
# })
# }
#
# rows_keep_ind = unlist(lapply(1:length(ldi_names), compare_to_google, df=df, article.data=article.data, ldi_papers=ldi_papers))
# rows_keep_ind = unique(rows_keep_ind)
rows_keep_ind = unique(unlist(ldi_papers))
rows_keep = rep(FALSE, length(article.data$AF))
rows_keep[rows_keep_ind] = TRUE
article.data.keep = article.data[rows_keep,]
article.data.reject = article.data[!rows_keep,]
## coauthorship network among LDI fellows
L = matrix(nrow=length(ldi_papers), ncol=length(ldi_papers))
for(i in 1:length(ldi_papers)){
pi = ldi_papers[[i]]
for(j in i:length(ldi_papers)){
pj = ldi_papers[[j]]
L[i,j] = length(intersect(pi,pj))
L[j,i] = length(intersect(pi,pj))
}
}
write.csv(L, "LDI_coauthor_network.csv", row.names=paste(ldi_firsts, ldi_lasts))