# Forked from jdwor/gendercitation
# Step7_AssignGenders.R
# Change to your project folder path
setwd("/Users/erteich/Desktop/gendercitation")
source("HelperFunctions.R")
library(rjson)
library(pbmcapply)
# Step 5 is skipped, so its output is left unloaded:
# load("df5_articledata_matchednames.RData")
# The article dataset is read from the df4 file instead
load("df4_articledata_cleannames.RData")
# Gender dataset produced by step 6
load("df6_namegends.RData")
# Save number of cores on machine.
# detectCores() returns NA when the count cannot be determined; NA is not a
# valid mc.cores value, so fall back to a single core in that case.
cores <- detectCores()
if (is.na(cores)) {
  cores <- 1L
}
# Isolate first names of first- and last-authors
# Each article's AF field is split on "; " into its individual author names
all_auth_names <- lapply(as.list(article.data$AF), strsplit, split = "; ")
# seq_along() gives an empty index for an empty dataset, whereas
# 1:length(...) would produce c(1, 0) and index entries that do not exist.
first_names <- pbmclapply(seq_along(all_auth_names), get.all.given,
                          authlist = all_auth_names, mc.cores = cores)
first_last_auths <- pbmclapply(first_names, get.first.last, mc.cores = cores)
# Assign probabilistic genders to author names
# 'threshold' gives the probability above which you will assign a given gender
# This returns combinations of "M"=man, "W"=woman, and "U"=unknown
# giving e.g., "MW", "WM", "WU", "UU", etc. for each article
# mc.cores is passed explicitly, matching the other pbmclapply calls in this
# script; without it pbmclapply falls back to getOption("mc.cores", 2L).
article_auth_gends <- pbmclapply(first_last_auths, gend.to.auths,
                                 namegends, threshold = 0.7,
                                 mc.cores = cores)
# Every article must yield exactly one gender code. A missing result would
# shorten the unlisted vector, and data-frame column assignment silently
# recycles a shorter vector whenever its length divides the row count.
stopifnot(all(lengths(article_auth_gends) == 1L))
# Create new variable in article.data that gives author gender category
article.data$AG <- unlist(article_auth_gends)
# Share of articles whose AG code contains no "U", i.e. a gender could be
# assigned to both the first and the last author (TRUE) versus not (FALSE)
tt <- table(!grepl("U", article.data$AG)) / nrow(article.data)
print(tt)
# Write the article data, now carrying the gender categories, to disk
save(article.data, file = "df7_articledata_withgenders.RData")
## Create gender category vectors
# transform.cat.4 is defined outside this script (presumably in
# HelperFunctions.R -- confirm); it is applied to each article's AG code.
gend_group_4 <- unlist(lapply(article.data$AG, transform.cat.4))
# `levels` is spelled out in full rather than relying on partial matching
# of the abbreviated argument name `lev`.
gend_group_4 <- factor(gend_group_4,
                       levels = c("MM", "WM", "MW", "WW", "NA"))
# Print the number of articles in each category, in level order
# (MM, WM, MW, WW, NA). na.rm = TRUE keeps the counts usable if a value
# falls outside the level set and therefore becomes a missing factor entry;
# without it a single such entry would turn every count into NA.
for (gend_level in levels(gend_group_4)) {
  print(sum(gend_group_4 == gend_level, na.rm = TRUE))
}