forked from jdwor/gendercitation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStep1.5_GetAffiliationInfo.R
117 lines (98 loc) · 4.53 KB
/
Step1.5_GetAffiliationInfo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Change to your project folder path
setwd("/Users/erteich/Bassett/ldi_fellows/gendercitation")

# Attach packages (attach order determines which package masks which)
library(bibliometrix)
library(pbmcapply)
library(countrycode)
library(dplyr)

# Load the project's helper functions
source("HelperFunctions.R")
# Change to the names of your journal folders within project folder
journal_folders <- c("raw_data")

# Create and fill data.frame of affiliation information from WoS files.
# `affil.data` starts as NULL and gains the rows of one journal per iteration.
affil.data <- NULL
for (journal_dir in journal_folders) {
  # For each journal, find all data files within folder (with folder prefix)
  wos_files <- list.files(journal_dir, full.names = TRUE)

  # For each file, read data in, convert to data frame, and concatenate.
  # The accumulator is not called `data.frame`, so it cannot be confused
  # with the base function of the same name.
  journal_data <- NULL
  for (wos_file in wos_files) {
    file_data <- createdf(readFiles(wos_file))
    if (is.null(journal_data)) {
      journal_data <- file_data
    } else {
      journal_data <- merge(journal_data, file_data, all = TRUE, sort = FALSE)
    }
  }

  # Fail with a clear message instead of an opaque error inside select()
  if (is.null(journal_data)) {
    stop("No data files found in journal folder: ", journal_dir,
         call. = FALSE)
  }

  # Select relevant variables
  # AF=authors, DI=doi, C1=affiliations, RP=corresponding info,
  # SO=journal, TI=title, CR=references, DT=document type,
  # PD=month, PY=year
  journal_data <- journal_data %>%
    select(AF, DI, C1, RP, SO, TI, CR, DT, PD, PY)

  # Convert PD row by row with get.date(), then store it as numeric.
  # seq_len() yields an empty index for zero rows, unlike 1:nrow().
  journal_data$PD <- as.numeric(unlist(lapply(
    seq_len(nrow(journal_data)), get.date, pd = journal_data$PD
  )))

  # Optional: subset data by date if you did so in the full data
  # e.g., journal_data=journal_data[journal_data$PY>=1995,]
  # e.g., journal_data=journal_data[journal_data$DT%in%c("Article","Review"),]

  # Append this new data to the full dataset
  affil.data <- rbind(affil.data, journal_data)
}

# Clean up environment; only remove temporaries that were actually created,
# so rm() does not warn when a loop body never ran
rm(list = intersect(
  c("journal_dir", "wos_file", "wos_files", "journal_data", "file_data",
    "journal_folders"),
  ls()
))
# Clean up capitalization and punctuation:
# author names are lower-cased; affiliation (C1) and corresponding-author (RP)
# fields are lower-cased and have every period removed
affil.data$AF <- tolower(affil.data$AF)
affil.data$C1 <- gsub("\\.", "", tolower(affil.data$C1))
affil.data$RP <- gsub("\\.", "", tolower(affil.data$RP))
# Save number of desired cores for parallelization.
# detectCores() returns NA when the core count cannot be determined; fall
# back to a single core so `mc.cores` always receives a usable number.
cores <- detectCores()
if (is.na(cores)) {
  cores <- 1L
}

# Separate out author names and save first/last authors' family names.
# Each AF entry is split on "; "; get.all.family() is then called once per
# paper index, with the full list of split names passed as `authlist`.
all_auth_names <- lapply(as.list(affil.data$AF), strsplit, split = "; ")
last_names <- pbmclapply(seq_along(all_auth_names), get.all.family,
                         authlist = all_auth_names, mc.cores = cores)

# First and last element of each paper's family-name result
fas <- unlist(lapply(last_names, head, 1))
las <- unlist(lapply(last_names, tail, 1))
# Locate country names within affiliation data. Returns the following:
# 1) country named in corresponding author affiliation,
# 2-3) countries linked with first and last author affiliations,
# 4) all countries listed, and 5) first country listed in affiliations
# getCountries() is called once per row index of affil.data; seq_len() keeps
# the index empty when there are no rows, unlike 1:nrow().
countries <- pbmclapply(seq_len(nrow(affil.data)), getCountries,
                        affil.data, fas, las, mc.cores = cores)

# Stack the per-paper results into one table with one row per paper
countries <- do.call(rbind, countries)
countries <- data.frame(countries, stringsAsFactors = FALSE)
colnames(countries) <- c("Corresponding", "FirstAuthor", "LastAuthor",
                         "AllListed", "FirstListed")

# Check output manually
head(countries)
head(sort(table(countries$Corresponding), decreasing = TRUE), 10)
# Pull out all unique countries listed within affiliations:
# join every row's list into one string, then split it back into single names
all.countries <- strsplit(paste(countries$AllListed, collapse = ", "),
                          ", ")[[1]]
unique.countries <- unique(all.countries)

# Create dataset that links countries to continents/regions
# (two columns: country name, continent)
country.to.cont <- countrycode(sourcevar = unique.countries,
                               origin = "country.name",
                               destination = "continent")
country.to.cont <- cbind(unique.countries, country.to.cont)

# Fill in regions for countries not matched in countrycode
manual_regions <- c(england = "Europe", scotland = "Europe", wales = "Europe",
                    africa = "Africa", yugoslavia = "Europe",
                    antilles = "Americas")
for (place in names(manual_regions)) {
  country.to.cont[country.to.cont[, 1] == place, 2] <- manual_regions[[place]]
}
# Drop the temporaries so the workspace holds the same objects as before
rm(manual_regions, place)

# Check output manually
head(country.to.cont)
# Match countries to corresponding continents/regions. Returns the following:
# 1) region of corresponding author affiliation,
# 2-3) regions linked with first and last author affiliations,
# 4) all regions listed, and 5) first region in affiliations
# getContinents() is called once per row index of `countries`; seq_len() keeps
# the index empty when there are no rows, unlike 1:nrow().
continents <- pbmclapply(seq_len(nrow(countries)), getContinents,
                         countries, country.to.cont, mc.cores = cores)

# Stack the per-paper results into one table with one row per paper
continents <- do.call(rbind, continents)
continents <- data.frame(continents, stringsAsFactors = FALSE)
colnames(continents) <- c("Corresponding", "FirstAuthor", "LastAuthor",
                          "AllListed", "FirstListed")

# Check output manually
head(continents)

# Save affiliation datasets
save(affil.data, countries, continents, file = "df1.5_affiliationdata.RData")