forked from Kandy16/people-networks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprofile-reading.py
92 lines (74 loc) · 3.59 KB
/
profile-reading.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pickle
import pandas as pd
from mwclient import Site
import datetime
import time
import os
from create_profile_reading_tracker import create_profile_reading_tracker
file_name = 'politician-data-100-200'
tracker_file = file_name+'-tracker.csv' # make sure the file is in parallel to this program
'''
to store the profile data. if the folder not present then it will create it
no path delimiters please. don't use // '''
base_path = 'profile-data'
user_agent = 'Uni Koblenz-Landau student, [email protected]'
wiki = Site(host='en.wikipedia.org', clients_useragent=user_agent)
if(not os.path.exists(base_path)):
os.makedirs(base_path)
#if the tracker file does not exists then create one
if not os.path.exists(tracker_file):
create_profile_reading_tracker(file_name, tracker_file)
def read_profile_tracker() :
global profile_tracker
profile_tracker = pd.read_csv(tracker_file)
profile_tracker = profile_tracker[['handle','finished_reading','time_taken_in_mins']]
def get_unread_profile(profile_tracker):
data_to_be_read = profile_tracker[profile_tracker['finished_reading'] == False]
if(data_to_be_read.shape[0] > 0) :
return data_to_be_read.iloc[0]['handle']
else:
return None
def write_read_profile(profile_tracker, profile,time_taken):
# this could be improved - rather than filtering two times - get the row handle and update it
profile_tracker.loc[profile_tracker['handle'] == profile, 'finished_reading'] = True
profile_tracker.loc[profile_tracker['handle'] == profile, 'time_taken_in_mins'] = time_taken
profile_tracker.to_csv(tracker_file)
start_time = time.time()
profile_count = 0
profile_tracker = None
read_profile_tracker()
while True:
# get the profile to be read
# read them completely
#jump on to the next profile until everything is over
start_profile_time = time.time()
unread_profile = get_unread_profile(profile_tracker)
if (unread_profile):
profile_page = wiki.pages[unread_profile]
profile_list = {}
# go through all the years and all months and read a single article (or the latest previous article)
for year in range(2001,2017):
for mon in range(1,13):
dateObj = datetime.date(year, mon, 1)
#startDate = dateObj.isoformat()+'T00:00:00Z'
endDate = dateObj.isoformat()+'T23:59:59Z'
#look for article from the last minute of 1 st day of month.
#Start from there and go back until you find a article which will give the state of article at that point
for article in profile_page.revisions(start=endDate, prop='ids|timestamp|content',dir='older', limit=1):
profile_list[str(year)+'-'+str(mon)] = article
break
profile_count += 1
end_profile_time = time.time()
print(profile_count,') '+unread_profile+' is read. '+'time taken(mins) - ',(end_profile_time - start_profile_time)/60)
#push the content to the file and update the tracker file (this is done because even if program crashes - we will be able to resume)
pickle.dump(profile_list,open(base_path + '/'+unread_profile,'wb'))
write_read_profile(profile_tracker, unread_profile, (end_profile_time - start_profile_time)/60 )
else :
print('Hooray!!!!!!! The job is over.')
break
end_time = time.time()
print('Total Time taken (in mins)-',(end_time - start_time) / 60)
print('No. of profiles read :',profile_count)
#profile_list
#unread_profile
#pickle.dump(profile_list,open('profile-data//'+unread_profile,'wb'))