Step5.5_TrimPapers.R

setwd("/Users/erteich/Bassett/ldi_fellows/gendercitation") # Change to your project folder path
source("HelperFunctions.R")
library(readxl); library(stringr)
library(scholar); library(rcrossref)
library(parallel)
library(pbmcapply); library(pbapply)

## load data
load("df5_articledata_matchednames.RData")

## load the list of authors whose papers you are interested in
df = data.frame(read_excel("../ldi_senior_fellows.xlsx"))

cores=detectCores()

get_middle_init = function(middle_initials) {
  if(!is.na(middle_initials)){
    if(str_length(middle_initials) > 1){
        return(paste0(" ", substr(middle_initials,1,1),"."))
      } else {
        return(paste0(" ",middle_initials,"."))
      }
    }else{
      return("")
  }
}

## shamelessly taken from HelperFunctions get.preferred
last_name_noMI=function(x){
  ## added str_to_title because last names or first can be all upper case
  if(grepl("\\.",x)==TRUE  & str_to_title(x)!=toupper(x)){
    name=gsub("\\."," ",x)
    name=strsplit(name," ")[[1]]
    ## added str_to_title because last names or first can be all upper case
    notup=which(str_to_title(name)!=toupper(name))
    name=name[notup][1]
    if(substr(name,1,1)!="-"){
      return(name)
    }else{
      return(substr(name,2,nchar(name)))
    }
  }else{
    return(x)
  }
}
## as is this
is_normal_name=function(x){
  ## added str_to_title because last names or first can be all upper case
  if(grepl("\\.",x)==TRUE & str_to_title(x)!=toupper(x)){
    name=gsub("\\."," ",x)
    name=strsplit(name," ")[[1]]
    ## added str_to_title because last names or first can be all upper case
    notup=which(str_to_title(name)!=toupper(name))
    ## if the full name is after the beginning it is not "normal"
    if(notup > 1){
      return(FALSE)
    }
  }
  return(TRUE)
}

compare_inits = function(target.inits, query.inits){
  # if initials lengths are the same, both must match
  if(str_length(query.inits)==str_length(target.inits)){
    return(query.inits == target.inits)
    # if there is a mismatch in length, just the first initial of each should match
  }else{
    return(unlist(str_split(query.inits,""))[1] == unlist(str_split(target.inits,""))[1])
  }
}

compare_first = function(x, target.inits, target.preferred, target.first, 
                         query.inits, query.preferred, query.isinits, query.first){
  ## if the queried name is just initials, only make sure initials match 
  if(query.isinits[x]){
    return(compare_inits(target.inits, query.inits[x]))
  }else{
    ## if the queried name is a full name, it must match the full name of the target
    match = tolower(query.preferred[x]) == tolower(target.preferred)
    ## if the queried preferred name is full and the same as the queried first name, the initials must ALSO match
    ## otherwise, however, initials do not have to match, for cases of M.Kit matching Kit, for example.
    if(is_normal_name(query.first[x]) & is_normal_name(target.first)) {
      match = match & compare_inits(target.inits, query.inits[x])
    }
    return(match)
  }
}

## operates on each individual entry of article.data$AF
match_name = function(x, ln, fn){
  target.first = fn
  target.inits = extract.initials(fn)
  target.preferred = get.preferred(fn)

  auths = unlist(str_split(x,"; "))
  alllasts = unlist(lapply(1:length(auths), get.all.family, authlist=auths))
  allfirsts = unlist(lapply(1:length(auths), get.all.given, authlist=auths))
  ## this converts M. Ulrich to Ulrich, for example, when middle initials are stuck in the last name
  alllasts = unlist(lapply(alllasts,last_name_noMI))
  
  samelast_mask = tolower(alllasts) == tolower(ln)
  ## if there's a matching last name, look at first names
  if(sum(samelast_mask) > 0){
    query.first = allfirsts[samelast_mask]
    query.isinits = unlist(lapply(allfirsts[samelast_mask], is.initials))
    query.inits = unlist(lapply(allfirsts[samelast_mask], extract.initials))
    query.preferred = unlist(lapply(allfirsts[samelast_mask], get.preferred))
    matching_mask = unlist(lapply(1:length(query.inits), compare_first, target.inits=target.inits, 
                                  target.preferred=target.preferred, target.first=target.first,
                                  query.inits=query.inits, query.preferred=query.preferred, 
                                  query.isinits=query.isinits, query.first=query.first))
    ## if matching last names and matching first names exist, the name matches
    if(sum(matching_mask) > 0){
      return(TRUE)
    } 
  }
  ## if the last names did not match or the first names did not match, then there is no matching name
  return(FALSE)
}

## operates on article.data$AF collectively
## some code taken from Jason Kim's get.maps and get.ma.overrep2 functions
match_name_fast = function(all_firsts, all_lasts, AF2PA, ln, fn){
  target.first = fn
  target.inits = extract.initials(fn)
  target.preferred = get.preferred(fn)
  
  samelast_mask = tolower(all_lasts) == tolower(ln)
  
  ## if there's a matching last name, look at first names
  if(sum(samelast_mask) > 0){
    query.first = all_firsts[samelast_mask]
    query.isinits = unlist(lapply(all_firsts[samelast_mask], is.initials))
    query.inits = unlist(lapply(all_firsts[samelast_mask], extract.initials))
    query.preferred = unlist(lapply(all_firsts[samelast_mask], get.preferred))
    matching_mask = unlist(lapply(1:length(query.inits), compare_first, target.inits=target.inits, 
                                  target.preferred=target.preferred, target.first=target.first,
                                  query.inits=query.inits, query.preferred=query.preferred, 
                                  query.isinits=query.isinits, query.first=query.first))
    ## if matching last names and matching first names exist, the name matches
    if(sum(matching_mask) > 0){
      ## these are indices into all_firsts, all_lasts
      inds = which(samelast_mask)[matching_mask]
      ## these are indices into papers
      p_inds = AF2PA[inds]
      return(p_inds)
    } 
  }
  ## if the last names did not match or the first names did not match, then there is no matching name
  return(list())
}

## Outstanding issues: 
## 1. Papers written by "Mucio Delgado" are not given to fellow "M. Kit Delgado"
## 2. Papers written by last name "Fernandez Lynch" are not given to fellow "Holly Fernandez Lynch", 
##    since Fernandez is stored as her middle name.

## some code taken from Jason Kim's get.maps and get.ma.overrep2 functions
AF = lapply(article.data$AF,authsplit)
AFUnlist = unlist(lapply(article.data$AF,authsplit))
all_firsts = unlist(lapply(1:length(AFUnlist), get.all.given, authlist=AFUnlist))
all_lasts = unlist(lapply(1:length(AFUnlist), get.all.family, authlist=AFUnlist))
## this converts M. Ulrich to Ulrich, for example, when middle initials are stuck in the last name
all_lasts = unlist(lapply(all_lasts,last_name_noMI))

## size of author list for each paper
AFLens = lengths(AF)
## map of indices of AFUnlist, all_firsts, all_lasts to indices of papers
AF2PA = unlist(pblapply(1:length(AFLens), function(x) rep(x, AFLens[x])))

middle_inits = unlist(lapply(df$Middle.Initial, get_middle_init))
ldi_firsts = paste0(df$First.Name, middle_inits)
ldi_firsts = str_to_title(ldi_firsts[!is.na(df$Last.Name)])
ldi_lasts = df$Last.Name
ldi_lasts = str_to_title(ldi_lasts[!is.na(df$Last.Name)])

ldi_papers = vector()
for(i in 1:length(ldi_firsts)){ 
  print(i)
  ## if somehow multiple valid variants of an author are on the same paper, it will repeat in the below list, 
  ## and therefore the repeats must be removed by unique()
  fast_p = unique(match_name_fast(all_firsts, all_lasts, AF2PA, ldi_lasts[i], ldi_firsts[i]))
  ## every so often, double check against the slower method as a sanity check
  ## this looked okay, so commenting out for now
  # if(i%%10 == 0){
  #   mask_p = unlist(pbmclapply(article.data$AF, match_name, ln=ldi_lasts[i], fn=ldi_firsts[i], mc.cores=4))
  #   stopifnot(sum(which(mask_p) != fast_p)==0)
  # }
  
  ldi_papers = append(ldi_papers, list(fast_p))
}

# # This is too stringent... rejects articles (incorrectly)  if the crossref title is a subset of the google title, for example (see DOI 10.1378/chest.14-0501)
# # Or rejects articles (incorrectly) if accidentally the wrong google scholar profile was queried.... (see Mark Pauly)
# # Running this on a dataset of 26,285 articles results in only 8,633 "kept" articles...
# compare_to_google = function(x, df, article.data, ldi_papers){
#   tryCatch({
#     id = get_scholar_id(last_name=df$Last.Name[x], first_name = df$First.Name[x], affiliation="University of Pennsylvania")
#     pub = get_publications(id)
#     ## get all DOIs in current dataset nominally belonging to the LDI fellow
#     dois = article.data$DI[ldi_papers[[x]]]
#     ## query crossref for article.data paper titles
#     titles = cr_works(dois=dois)$data$title
#     ## are the article.data papers in the google scholar profile?
#     mask = str_to_title(titles)%in%str_to_title(pub$title)
#     ## return only those article.data papers in the google scholar profile
#     return(ldi_papers[[x]][mask])
#   },error=function(cond){
#     message(cond)
#     ## if google scholar produced nothing, just return all the article.data papers.
#     return(ldi_papers[[x]])
#   })
# }
# 
# rows_keep_ind = unlist(lapply(1:length(ldi_names), compare_to_google, df=df, article.data=article.data, ldi_papers=ldi_papers))
# rows_keep_ind = unique(rows_keep_ind)

rows_keep_ind = unique(unlist(ldi_papers))

rows_keep = rep(FALSE, length(article.data$AF))
rows_keep[rows_keep_ind] = TRUE

article.data.keep = article.data[rows_keep,]
article.data.reject = article.data[!rows_keep,]

## coauthorship network among LDI fellows
L = matrix(nrow=length(ldi_papers), ncol=length(ldi_papers))
for(i in 1:length(ldi_papers)){
  pi = ldi_papers[[i]]
  for(j in i:length(ldi_papers)){
    pj = ldi_papers[[j]]
    L[i,j] = length(intersect(pi,pj))
    L[j,i] = length(intersect(pi,pj))
  }
}

write.csv(L, "LDI_coauthor_network.csv", row.names=paste(ldi_firsts, ldi_lasts))