# wiki_people.py
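"""Build the `wikitrends` Postgres table from previously imported Wikipedia data.

For every person with a "<year> births" category, the script parses the page's
wikitext, estimates gender from pronoun counts, resolves a birth or death place
to a country and continent, maps categories and the infobox type to a field and
occupation, and writes one row per person.
"""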
import argparse
import json
import os
import re
from collections import Counter, defaultdict

import geopandas as gpd
import mwparserfromhell
import psycopg2
import psycopg2.extras
import pycountry
import yaml
from shapely import wkt

WORD_RE = re.compile(r'\w+')
# Birth and death years are encoded in links like "Category:1879 births".
CAT_PREFIX = 'Category:'
DIED_POSTFIX = ' deaths'
BIRTH_POSTFIX = ' births'


def setup_db(connection_string):
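    """Drop and recreate the `wikitrends` table, with indexes on query columns."""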
conn = psycopg2.connect(connection_string)
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
cursor.execute('DROP TABLE IF EXISTS wikitrends')
cursor.execute('CREATE TABLE wikitrends ('
' person_name TEXT PRIMARY KEY,'
' view_count INTEGER,'
' word_count INTEGER,'
' year_born INTEGER,'
' year_died INTEGER, '
' gender TEXT,'
' occupation TEXT,'
' field TEXT,'
' country_code TEXT,'
' continent TEXT'
')')
cursor.execute('CREATE INDEX wikitrends_view_count ON wikitrends(view_count)')
cursor.execute('CREATE INDEX wikitrends_word_count ON wikitrends(word_count)')
cursor.execute('CREATE INDEX wikitrends_year_born ON wikitrends(year_born)')
cursor.execute('CREATE INDEX wikitrends_gender ON wikitrends(gender)')
cursor.execute('CREATE INDEX wikitrends_country_code ON wikitrends(country_code)')
cursor.execute('CREATE INDEX wikitrends_continent ON wikitrends(continent)')
cursor.execute('CREATE INDEX wikitrends_occupation ON wikitrends(occupation)')
cursor.execute('CREATE INDEX wikitrends_field ON wikitrends(field)')
return conn, cursor


def tolerant_int(s):
    # Strip every non-digit and parse what remains; -1 signals "no year found".
    try:
        return int(re.sub(r'\D', '', s))
    except ValueError:
        return -1


def parse_person(rec):
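    """Pull the fields we need out of one raw `wikipedia` row."""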
wikitext = rec['wikitext']
parsed = mwparserfromhell.parse(wikitext)
words = [w.lower() for w in WORD_RE.findall(parsed.strip_code())]
word_count = len(words)
word_counts = Counter(words)
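    # Raw pronoun counts; assign_genders() later turns these into a label.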
gender_words = {w: word_counts[w] for w in ('him', 'his', 'he', 'her', 'she')}
res = {}
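    # Flatten every infobox parameter into a name -> value dict.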
for template in parsed.filter_templates():
if template.name.lower().startswith('infobox'):
for param in template.params:
res[param.name.strip().lower()] = param.value
wikilinks = [str(x.title) for x in parsed.filter_wikilinks()]
locations = []
for k in 'birth_place', 'death_place':
if k in res:
locations += [str(x.title) for x in res[k].filter_wikilinks()]
born = None
died = None
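    # Birth and death years come from "Category:<year> births/deaths" links.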
for wl in parsed.filter_wikilinks():
title = str(wl.title)
if title.startswith(CAT_PREFIX):
if title.endswith(BIRTH_POSTFIX):
born = tolerant_int(title[len(CAT_PREFIX): -len(BIRTH_POSTFIX)])
if title.endswith(DIED_POSTFIX):
died = tolerant_int(title[len(CAT_PREFIX): -len(DIED_POSTFIX)])
return {'person_name': rec['title'],
'wiki_id': rec['wiki_id'],
'infobox': rec['infobox'],
'locations': locations,
'word_count': word_count,
'gender_words': gender_words,
'view_count': rec['viewcount'],
'wikilinks': wikilinks,
'born': born,
'died': died}


def fetch_people(json_dir, cursor, max_year, min_year):
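    """Load parsed people for each birth year, caching results as <year>.json."""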
all_people = []
for year in range(min_year, max_year):
print(year)
json_path = os.path.join(json_dir, str(year) + '.json')
        if os.path.isfile(json_path):
            with open(json_path) as fin:
                people = json.load(fin)
        else:
            cat = '%d births' % year
            # Let psycopg2 substitute the category value rather than splicing
            # it into the SQL string, which would break on quoted titles.
            q = ("SELECT wikipedia.*, wikistats.viewcount "
                 "FROM wikipedia LEFT JOIN wikistats ON wikipedia.title = wikistats.title "
                 "WHERE categories @> ARRAY[%s]")
            cursor.execute(q, (cat,))
            people = []
            for r in cursor.fetchall():
                if len(people) % 100 == 0:
                    print(' ', r['title'])
                people.append(parse_person(r))
            with open(json_path, 'w') as fout:
                json.dump(people, fout, indent=2)
all_people += people
return all_people


def assign_genders(people):
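    """Estimate gender from pronoun counts, smoothed by first-name statistics.

    Each article gets a score (positive when male pronouns dominate); the
    average score per first name then nudges articles with few pronouns.
    """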
weights = {'he': 2, 'his': 1, 'him': 1, 'she': -2, 'her': -1}
for p in people:
gw = p['gender_words']
g = sum(weights[w] * gw[w] for w in weights)
s = sum(abs(weights[w]) * gw[w] for w in weights)
if s > 0:
g /= s
p['gender'] = g
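    # Average the score per first name so that articles with few pronouns can
    # borrow signal from other people sharing that name.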
gender_by_first_name = defaultdict(list)
for p in people:
first_name = p['person_name'].split(' ', 1)[0]
p['first_name'] = first_name
gender_by_first_name[first_name].append(p['gender'])
gender_by_first_name = {n: sum(v) / len(v) for n, v in gender_by_first_name.items()}
for p in people:
g = p['gender'] + gender_by_first_name[p['first_name']] * 0.3
p['gender'] = 'female' if g < 0 else 'male'


def find_locations(people, cursor):
    """Resolve each person's first known place to a country code and continent."""
    alpha_3_to_2 = {c.alpha_3: c.alpha_2 for c in pycountry.countries}
    # NOTE: geopandas removed its bundled datasets in 1.0; this call needs an
    # older geopandas, or read_file() pointed at a local Natural Earth copy.
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    locations = set()
    for p in people:
        if p['locations']:
            locations.add(p['locations'][0])
    # psycopg2 adapts a Python tuple into a properly quoted SQL list, which is
    # safer than assembling the IN clause by hand.
    cursor.execute(
        'SELECT wikipedia.title, ST_AsText(wikipedia.lng_lat) '
        'FROM wikipedia WHERE title IN %s',
        (tuple(l for l in locations if l),))
    loc_by_lat_lng = {
        t: None if p is None else wkt.loads(p)
        for t, p in cursor.fetchall()
    }
    locs_with_coos = [k for k in loc_by_lat_lng if loc_by_lat_lng[k]]
    coos_for_loc = [loc_by_lat_lng[k] for k in locs_with_coos]
    loc_df = gpd.GeoDataFrame([{'location': k} for k in locs_with_coos],
                              geometry=coos_for_loc, crs='EPSG:4326')
    # Spatial join: each point picks up the country polygon that contains it.
    locs_with_countries = gpd.sjoin(world, loc_df, how='right')
    locs_with_countries = {rec['location']: (rec['iso_a3'], rec['name'], rec['continent'])
                           for _, rec in locs_with_countries.iterrows()}
missing = set()
for p in people:
lat_lng = None
iso_a3 = ''
country = ''
continent = ''
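        # Use the first linked place (birth/death place first, then any
        # wikilink) that has coordinates in the database.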
for loc in p['locations'] + p['wikilinks']:
lat_lng = loc_by_lat_lng.get(loc)
if lat_lng:
                iso_a3, country, continent = locs_with_countries.get(loc)
                # Natural Earth marks France and Norway with iso_a3 '-99';
                # patch in the real codes so the alpha-2 lookup works.
                if country == 'France':
                    iso_a3 = 'FRA'
                elif country == 'Norway':
                    iso_a3 = 'NOR'
break
p['lat_lng'] = lat_lng
p['country_code'] = alpha_3_to_2.get(iso_a3, '')
if not p['country_code']:
missing.add(iso_a3)
p['country'] = country
p['continent'] = continent
    print('ISO codes without an alpha-2 mapping:', missing)


def add_fields(people):
    """Assign each person a broad field and a specific occupation."""
    with open('field_mapping.yaml') as f:
        field_mapping = yaml.safe_load(f)
    with open('infobox_mapping.yaml') as f:
        infobox_mapping = yaml.safe_load(f)
for p in people:
field_count = Counter()
occupation_count = Counter()
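        # The last word of a category name ("American jazz pianists" ->
        # "pianists") is looked up in the field mapping.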
for l in p['wikilinks']:
if l.startswith(CAT_PREFIX):
last_word = l[len(CAT_PREFIX):].lower().rsplit(' ', 1)[-1]
f = field_mapping.get(last_word)
if f:
field_count[f] += 1
occupation_count[last_word] += 1
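        # Weight the infobox type higher than any single category.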
infobox_field = infobox_mapping.get(p['infobox'])
if infobox_field:
field_count[infobox_field] += 3
p['field'] = field_count.most_common(1)[0][0] if field_count else ''
p['occupation'] = occupation_count.most_common(1)[0][0] if occupation_count else ''


def main(json_dir, cursor, min_year=-2000, max_year=2000):
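    """Run the full pipeline and fill the wikitrends table."""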
people = fetch_people(json_dir, cursor, max_year, min_year)
print('assigning genders')
assign_genders(people)
print('finding locations')
    find_locations(people, cursor)
print('adding fields')
add_fields(people)
print('inserting data')
    seen = set()
    for p in people:
        # Skip duplicate titles and implausibly early years; `born` may also
        # be None when no birth-category link was found in the wikitext.
        if (p['person_name'] in seen
                or p['born'] is None or p['born'] < 200
                or (p['died'] and p['died'] < 200)):
            continue
        seen.add(p['person_name'])
cursor.execute("INSERT INTO wikitrends "
"(person_name, view_count, year_born, year_died, word_count, gender, "
"continent, country_code, occupation, field) "
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
(p['person_name'], p['view_count'],
p['born'], p['died'],
p['word_count'], p['gender'],
p['continent'], p['country_code'], p['occupation'], p['field']))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Import Wikipedia people data into Postgres')
    parser.add_argument('--postgres', type=str, help='postgres connection string')
    parser.add_argument('json_dir', type=str, help='directory to store intermediate jsons')
    args = parser.parse_args()
    conn, cursor = setup_db(args.postgres)
    if not os.path.isdir(args.json_dir):
        os.makedirs(args.json_dir)
    main(args.json_dir, cursor)
    conn.commit()
    conn.close()
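
# Example invocation (connection string and cache directory are illustrative):
#   python wiki_people.py --postgres "dbname=wiki user=postgres" people_cache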