-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathwiki2schedule.py
executable file
·637 lines (545 loc) · 22.6 KB
/
wiki2schedule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
#
# Imports sessions from a semantic mediawiki instance,
# and can also be used as library – compare schedule_36C3.py
#
import requests
import json
from collections import OrderedDict
from datetime import datetime
import pytz
import os
import sys
import traceback
import optparse
from lxml import etree as ET
import html
import urllib.parse
# some functions used in multiple files of this collection
import voc.tools
from voc.schedule import Schedule, Event
# Timezone used to localize the event start times read from the wiki.
tz = pytz.timezone("Europe/Amsterdam")
time_stamp_offset = -3600  # Workaround until MediaWiki server will be fixed

# Command line switches: every one is a plain boolean flag that defaults to False.
parser = optparse.OptionParser()
for _flag, _dest in (
    ("--online", "online"),
    ("--show-assembly-warnings", "show_assembly_warnings"),
    ("--fail", "exit_when_exception_occours"),
    ("--git", "git"),
    ("--debug", "debug"),
):
    parser.add_option(_flag, action="store_true", dest=_dest, default=False)
del _flag, _dest  # keep the module namespace free of loop helpers

# NOTE: the command line is parsed at import time, i.e. also in library use.
options, args = parser.parse_args()

# `local` is switched to True by the __main__ setup when the secondary
# (relative) output directory is used.
local = False
use_offline_frab_schedules = False
only_workshops = False
if __name__ == "__main__":
    # Script mode only: derive the congress specific settings and change into
    # the directory that receives all generated files.
    congress_nr = 35
    year = str(1983 + congress_nr)
    xc3 = "{x}C3".format(x=congress_nr)

    wiki_url = "https://events.ccc.de/congress/{year}/wiki".format(year=year)

    output_dir = "/srv/www/" + xc3
    secondary_output_dir = "./" + xc3

    # Use the positional arguments left over by optparse instead of sys.argv:
    # counting sys.argv entries mistook a lone flag (e.g. `--git`) for the
    # output directory and ignored the directory when combined with a flag.
    if len(args) == 1:
        output_dir = args[0]

    if not os.path.exists(output_dir):
        try:
            if not os.path.exists(secondary_output_dir):
                os.mkdir(output_dir)
            else:
                # fall back to the already existing relative directory
                output_dir = secondary_output_dir
                local = True
        except OSError:
            # only file system errors are expected here; a bare except would
            # also swallow KeyboardInterrupt / SystemExit
            print(
                "Please create directory named {} if you want to run in local mode".format(
                    secondary_output_dir
                )
            )
            sys.exit(-1)
    os.chdir(output_dir)

    if not os.path.exists("events"):
        os.mkdir("events")
# This list is required to sort the events in the schedule.xml in the correct way:
# it is handed to Schedule.add_rooms() for both generated schedules.
# Other rooms/assemblies are added at the end on demand.
rooms = [
    "Lecture room 11",
    "Seminar room 14-15",
    "Seminar room 13",
    "Lecture room M1",
    "Lecture room M2",
    "Lecture room M3",
]
def print_json(x):
    """Pretty-print *x* as JSON (2 space indent) for debugging.

    Values that cannot be serialised are printed as-is, prefixed with
    ``Fallback: ``, instead of raising.
    """
    try:
        print(json.dumps(x, indent=2))
    except Exception:
        # Best-effort debug helper: any serialisation problem falls back to the
        # plain representation, but KeyboardInterrupt / SystemExit still propagate
        # (the former bare `except:` swallowed them).
        print("Fallback: ", x)
def generate_wiki_schedules(wiki_url):
    """Query the wiki at *wiki_url* and export the wiki and workshop schedules.

    Fills the module globals ``wiki_schedule``, ``workshop_schedule`` and
    ``sessions_complete``, writes ``sessions_complete.json`` and exports both
    schedules ("wiki" and "workshops") into the current working directory.

    Relies on the module globals ``congress_nr``, ``rooms`` and ``options``.

    Returns the generated wiki schedule.
    """
    global wiki_schedule, workshop_schedule, sessions_complete

    data = Wiki(wiki_url)

    print("Processing...")

    wiki_schedule = Schedule.from_XC3_template("Wiki", congress_nr, 27, 4)
    wiki_schedule.add_rooms(rooms)

    # workshops are all events from the wiki, which are in workshop rooms – starting from day 0 (the 26.)
    workshop_schedule = Schedule.from_XC3_template("Workshops", congress_nr, 26, 5)
    workshop_schedule.add_rooms(rooms)

    print("Combining data...")

    sessions_complete = OrderedDict()

    # process_wiki_events() fills the global sessions_complete and both schedules.
    # The parsed CLI options have to be handed over explicitly: the function's
    # own `options` parameter shadows the module global and defaults to None,
    # which made every access to e.g. options.show_assembly_warnings fail.
    process_wiki_events(data, wiki_schedule, workshop_schedule, options=options)

    # write imported data from wiki to one merged file
    with open("sessions_complete.json", "w") as fp:
        json.dump(sessions_complete, fp, indent=2)

    wiki_schedule.export("wiki")
    # write all sessions in workshop rooms to an additional schedule.json/xml
    workshop_schedule.export("workshops")

    print("done")
    return wiki_schedule
# Warning bookkeeping used by process_wiki_events() and its nested warn() helper:
# `warnings` tells whether the event currently being processed already produced a
# warning (it is reset for every event), the two counters hold the number of events
# with warnings in total and of those located in workshop rooms.
warnings = False
events_with_warnings = 0
events_in_halls_with_warnings = 0
def _fetch_page_wikitext(wiki_parsetree_url):
    """Request the parse tree of a wiki page and return its free text.

    The text is assembled from the tails of all ``template`` elements found
    directly below the root of the parse tree.

    Returns the (possibly empty) text, or None when the request failed three
    times or the response does not contain a parse tree.
    """
    content_r = None
    # Retry up to three times
    for _ in range(3):
        content_r = requests.get(wiki_parsetree_url, timeout=5)
        if content_r.ok:
            print("Page {0} requested successfully!".format(wiki_parsetree_url))
            break
        print(".")

    if not content_r.ok:
        print(
            " Requesting {1} failed, HTTP {0}.".format(
                content_r.status_code, wiki_parsetree_url
            )
        )
        return None

    try:
        parsetree = voc.tools.parse_json(content_r.text)["parse"]["parsetree"]
    except KeyError:
        # No parse tree in the response: report the URL and give up for this page
        # (previously a tree left over from an earlier page was parsed instead).
        print(wiki_parsetree_url)
        return None

    if not isinstance(parsetree, str):
        # the XML may also be wrapped in an object, stored under the "*" key
        parsetree = parsetree["*"]
    wiki_text_tree = ET.fromstring(parsetree.replace("\n", ""))

    return "".join(
        html.unescape(element.tail)
        for element in wiki_text_tree.iterfind("template")
        if element.tail is not None
    )


# This function is also exported to be used as a library function, which is why
# we started to reduce its dependency on global variables.
def process_wiki_events(
    wiki,
    wiki_schedule,
    workshop_schedule=None,
    timestamp_offset=None,
    options=None,
    fetch_wikitext=True,
):
    """Convert all events of *wiki* into schedule events.

    Every entry of ``wiki.events`` is merged with its parent session and turned
    into an ``Event``. The event is added to *wiki_schedule* unless it starts
    on the 26th or the module flag ``only_workshops`` is set, and additionally
    to *workshop_schedule* when its location is a ``Room:`` page. Events
    without room, start time or duration, or longer than 24h, are skipped.

    Parameters:
        wiki: provides ``events``, ``wiki_url`` and ``parent_of_event()``.
        wiki_schedule: schedule receiving the events; also maps start times to
            conference days.
        workshop_schedule: optional schedule for events in workshop rooms.
        timestamp_offset: if not None, replaces the module wide
            ``time_stamp_offset`` that is added to the wiki timestamps.
        options: optional object with the boolean attributes ``debug``,
            ``show_assembly_warnings`` and ``exit_when_exception_occours``;
            everything missing (or ``options=None``) counts as False.
        fetch_wikitext: when true, the free text of pages modified since the
            last run is requested from the wiki and appended to the description.

    Side effects: resets the global ``sessions_complete``, updates the warning
    counters, stores the id and last-edited mappings via ``store_sos_ids()`` /
    ``store_last_edited()`` and prints progress and a summary to stdout.
    """
    global sessions_complete, warnings, time_stamp_offset

    if timestamp_offset is not None:
        time_stamp_offset = timestamp_offset

    sessions_complete = OrderedDict()
    events_total = 0
    events_successful = 0
    events_in_halls = 0  # aka workshops
    used_guids = []

    # `options` is optional (library usage), so resolve the flags once, safely.
    def flag(name):
        return bool(options is not None and getattr(options, name, False))

    debug = flag("debug")
    show_assembly_warnings = flag("show_assembly_warnings")
    exit_when_exception_occurs = flag("exit_when_exception_occours")

    def warn(msg, force=False):
        """Print *msg* for the current event; the event header only once."""
        global warnings, events_with_warnings, events_in_halls_with_warnings
        visible = is_workshop_room_session or show_assembly_warnings or force

        if not warnings:
            warnings = True
            events_with_warnings += 1
            if is_workshop_room_session:
                events_in_halls_with_warnings += 1

            if visible:
                print("")
                print(event_wiki_name)
                if start_time is not None:
                    print(" at " + start_time.isoformat())
                print(" in " + room)
                print(" " + wiki_edit_url)

        if visible:
            print(msg)

    for event_wiki_name, event_r in wiki.events.items():
        warnings = False
        # Reset everything warn() and the error handler look at, so that they
        # never see unbound names or values left over from the previous event.
        event = None
        event_n = None
        room = ""
        start_time = None
        is_workshop_room_session = False
        wiki_edit_url = ""
        sys.stdout.write(".")

        try:
            wiki_page_name = event_wiki_name.split("#")[0].replace(
                " ", "_"
            )  # or see fullurl property
            quoted_page_name = urllib.parse.quote_plus(wiki_page_name)
            wiki_edit_url = (
                wiki.wiki_url + "/index.php?title=" + quoted_page_name + "&action=edit"
            )
            wiki_parsetree_url = (
                wiki.wiki_url
                + "/api.php?action=parse&format=json&page="
                + quoted_page_name
                + "&prop=parsetree"
            )

            session = wiki.parent_of_event(event_wiki_name)
            event = event_r["printouts"]
            events_total += 1

            locations = event["Has session location"]
            if len(locations) == 0:
                warn(" has no room yet, skipping...")
                continue

            # One event can take place in multiple rooms...
            # WORKAROUND: if that is the case just pick the first one
            room = locations[0]["fulltext"]
            if room.split(":", 1)[0] == "Room":
                is_workshop_room_session = True
                room = Wiki.remove_prefix(room)
            if len(locations) > 1:
                warn(" WARNING: has multiple rooms ???, just picking the first one…")
                # (formerly the room itself stayed empty in this case)
                event["Has session location"] = locations[0]["fulltext"]

            # http://stackoverflow.com/questions/22698244/how-to-merge-two-json-string-in-python
            # This will only work if there are unique keys in each json string.
            combined = session.copy()
            combined.update(event)
            sessions_complete[event_wiki_name] = combined

            if len(event["Has start time"]) < 1:
                warn(" has no start time")
                day = None
            else:
                date_time = datetime.fromtimestamp(
                    int(event["Has start time"][0]["timestamp"]) + time_stamp_offset
                )
                start_time = tz.localize(date_time)
                day = wiki_schedule.get_day_from_time(start_time)

            # events without a known day or without duration are silently left out
            if day is None or not event["Has duration"]:
                continue

            duration = event["Has duration"][0]
            if not isinstance(duration, int) and "value" in duration:
                duration = duration["value"]

            if duration > 60 * 24:
                warn(" event takes longer than 24h, skipping...")
                continue

            lang = ""
            if session["Held in language"]:
                lang = session["Held in language"][0].split(" - ", 1)[0]

            if len(event["GUID"]) > 0:
                guid = event["GUID"][0]
                if not isinstance(guid, str):
                    # format instead of `+`: concatenating a non-string raised a
                    # TypeError that hid the actual problem
                    raise Exception("GUID is not string, but {!r}".format(guid))
            else:
                guid = voc.tools.gen_uuid(
                    session["fullurl"] + str(event["Has start time"][0])
                )
                warn(
                    " GUID was empty, generated one for now. Not shure if its stable..."
                )

            if guid in used_guids:
                warn(
                    " GUID {} was already used before, generated a random one for now. Please fix the session wiki page to ensure users can stay subscribed to event!".format(
                        guid
                    ),
                    force=True,
                )
                guid = voc.tools.gen_uuid(
                    session["fullurl"] + str(event["Has start time"][0])
                )
            used_guids.append(guid)

            local_id = voc.tools.get_id(guid)
            description = ("\n".join(session["Has description"])).strip()

            if fetch_wikitext:
                modified = int(session["Modification date"][0]["timestamp"])
                description_in_sync = True
                if modified > voc.tools.last_edited.get(guid, 0):
                    wiki_text = _fetch_page_wikitext(wiki_parsetree_url)
                    if wiki_text is None:
                        description_in_sync = False
                    else:
                        description = description + "\n" + wiki_text
                else:  # unmodified
                    event_file = "events/{guid}.json".format(guid=guid)
                    if os.path.isfile(event_file):
                        with open(event_file, "r") as fp:
                            old_event = json.load(fp)
                        description = old_event["description"]

                # Only remember the modification date when the text really was
                # obtained; otherwise a failed request would never be retried
                # until the page is edited again.
                if description_in_sync:
                    voc.tools.last_edited[guid] = modified

            event_n = Event(
                [
                    ("id", local_id),
                    ("guid", guid),
                    # TODO: add ensure_url() which adds "https:" prefix when necessary
                    ("url", session["fullurl"]),
                    ("logo", None),
                    ("date", start_time.isoformat()),
                    ("start", start_time.strftime("%H:%M")),
                    ("duration", "%d:%02d" % divmod(duration, 60)),
                    ("room", room),
                    (
                        "slug",
                        "{slug}-{id}-{name}".format(
                            slug=wiki_schedule.conference()["acronym"].lower(),
                            id=local_id,
                            name=voc.tools.normalise_string(
                                session["wiki_name"].lower()
                            ),
                        ),
                    ),
                    ("title", session["Has title"][0]),
                    ("subtitle", "\n".join(event["Has subtitle"])),
                    ("track", "self organized sessions"),
                    ("type", " ".join(session["Has session type"]).lower()),
                    ("language", lang),
                    ("abstract", ""),
                    ("description", description),
                    (
                        "persons",
                        [
                            OrderedDict(
                                [
                                    ("id", 0),
                                    # sometimes a https: is needed...
                                    ("url", p["fullurl"]),
                                    # must be last element so that transformation to xml works
                                    ("public_name", Wiki.remove_prefix(p["fulltext"])),
                                ]
                            )
                            for p in session["Is organized by"]
                        ],
                    ),
                    (
                        "links",
                        [
                            OrderedDict(
                                [
                                    # TODO sometimes a https:// is needed...
                                    ("url", url),
                                    ("title", url),
                                ]
                            )
                            for url in session["Has website"]
                        ],
                    ),
                ],
                start_time,
            )

            # Break if conference day date and event date do not match
            conference_day_start = wiki_schedule.day(day).start
            conference_day_end = wiki_schedule.day(day).end
            if not conference_day_start <= event_n.start < conference_day_end:
                raise Exception(
                    "Current conference day from {0} to {1} does not match current event {2} with date {3}.".format(
                        conference_day_start,
                        conference_day_end,
                        event_n["id"],
                        event_n.start,
                    )
                )

            # Events from day 0 (26. December) do not go into the full schedule
            if start_time.day != 26 and not only_workshops:
                wiki_schedule.add_event(event_n)

            if workshop_schedule and is_workshop_room_session:
                events_in_halls += 1
                workshop_schedule.add_event(event_n)

            events_successful += 1
        except Warning as w:
            warn(w)
        except Exception:
            # Report and go on with the next event. `except Exception` keeps
            # Ctrl-C and sys.exit() working, unlike the former bare except.
            if event_n is not None:
                print(event_n)
            if event is not None:
                print(json.dumps(event, indent=2))
            print(" unexpected error: " + str(sys.exc_info()[0]))
            traceback.print_exc()
            if exit_when_exception_occurs:
                sys.exit()

    store_sos_ids()
    store_last_edited()

    if debug:
        with open("sessions_complete.json", "w") as fp:
            json.dump(sessions_complete, fp, indent=2)

    print(
        "\nFrom %d total events (%d in halls) where %d successful, while %d (%d in halls) produced warnings"
        % (
            events_total,
            events_in_halls,
            events_successful,
            events_with_warnings,
            events_in_halls_with_warnings,
        )
    )
    if not show_assembly_warnings:
        print(" (use --show-assembly-warnings cli option to show all warnings)")
class Wiki:
    """
    Container for the self-organized sessions of a Semantic MediaWiki instance.

    A session can take place once or several times; every occurrence is an
    event (aka slot). Both collections are queried when the object is created.
    """

    wiki_url = None
    sessions = []
    events = []

    def __init__(self, wiki_url):
        """Query all sessions and events of the wiki located at *wiki_url*."""
        self.wiki_url = wiki_url

        session_printouts = [
            "?Has description",
            "?Has session type",
            "?Held in language",
            "?Is organized by",
            "?Has website",
            "?Modification date",
        ]
        self.sessions = self.query("[[Category:Session]]", session_printouts)

        event_printouts = [
            "?Has subtitle",
            "?Has start time",
            "?Has end time",
            "?Has duration",
            "?Has session location",
            "?Has event track",
            "?Has color",
            "?GUID",
        ]
        self.events = self.query("[[Has object type::Event]]", event_printouts)

    def query(self, q, po):
        """Run the ask query *q* with the printouts *po* and return all results.

        Results are requested in pages of 500 and merged, in order, into one
        OrderedDict. Raises an Exception when a page still fails after three
        attempts.
        """
        page_size = 500
        ask_url = self.wiki_url + "/index.php?title=Special:Ask"
        printouts = "\r\n".join(po)

        results = OrderedDict()
        offset = 0
        while True:
            print("Requesting wiki " + q)

            # Retry up to three times
            r = None
            attempt = 0
            while attempt < 3:
                attempt += 1
                r = requests.get(
                    ask_url,
                    params=(
                        ("q", q),
                        ("po", printouts),
                        ("p[format]", "json"),
                        ("p[limit]", page_size),
                        ("p[offset]", offset),
                    ),
                )
                if r.ok is True:
                    break
                print(".")

            if r.ok is False:
                raise Exception(" Requesting failed, HTTP {0}.".format(r.status_code))

            # parse via voc.tools instead of r.json()['results'] to maintain
            # the same order as in the input
            page = voc.tools.parse_json(r.text)["results"]
            results.update(page)

            # anything but a completely filled page means this was the last one
            if len(page) != page_size:
                return results
            offset += page_size

    def parent_of_event(self, event_wiki_name):
        """Return the printouts of the session the given event belongs to.

        The returned dict is the one stored in ``self.sessions``; it is
        extended in place by ``fullurl``, ``wiki_name`` and ``Has title``.
        Raises Warning when no session matches the event name.
        """
        session_wiki_name = event_wiki_name.partition("# ")[0]

        if session_wiki_name not in self.sessions:
            # is_workshop_room_session = True # workaround/don't ask
            # This happens for imported events like these at the bottom of [[Static:Schedule]]
            raise Warning(" event without session? -> ignore event")

        wiki_session = self.sessions[session_wiki_name]
        session = wiki_session["printouts"]
        session["fullurl"] = wiki_session["fullurl"]
        session["wiki_name"] = session_wiki_name
        try:
            title = Wiki.remove_prefix(session_wiki_name)
            session["Has title"] = [title]
        except IndexError:
            raise Warning(
                " Skipping malformed session wiki name {0}.".format(session_wiki_name)
            )
        return session

    @classmethod
    def remove_prefix(cls, foo):
        """Return *foo* without everything up to and including its first colon."""
        _, colon, remainder = foo.partition(":")
        return remainder if colon else foo
def load_sos_ids():
    """Restore ``voc.tools.sos_ids`` from ``_sos_ids.json`` if that file exists.

    ``voc.tools.next_id`` is set to the largest stored id plus one. Without the
    file, or with an empty mapping in it, ``next_id`` is left untouched.
    """
    if not os.path.isfile("_sos_ids.json"):
        return

    with open("_sos_ids.json", "r") as fp:
        # maintain order from file
        voc.tools.sos_ids = json.load(fp, object_pairs_hook=OrderedDict)

    # max() of an empty mapping raises ValueError, which used to abort the
    # import of this module when the file only contained `{}`.
    if voc.tools.sos_ids:
        voc.tools.next_id = max(voc.tools.sos_ids.values()) + 1
def store_sos_ids():
    """Write ``voc.tools.sos_ids`` to ``_sos_ids.json`` in the working directory."""
    # Serialise first: opening the file truncates it, so an error during
    # serialisation would otherwise destroy the previously stored ids.
    serialized = json.dumps(voc.tools.sos_ids, indent=4)
    with open("_sos_ids.json", "w") as fp:
        fp.write(serialized)
def load_last_edited():
    """Restore ``voc.tools.last_edited`` from ``_last_edited.json`` if it exists."""
    if not os.path.isfile("_last_edited.json"):
        return

    with open("_last_edited.json", "r") as fp:
        # the pairs hook maintains the order from the file
        voc.tools.last_edited = json.load(fp, object_pairs_hook=OrderedDict)
def store_last_edited():
    """Write ``voc.tools.last_edited`` to ``_last_edited.json`` in the working directory."""
    # Serialise first: opening the file truncates it, so an error during
    # serialisation would otherwise destroy the previously stored timestamps.
    serialized = json.dumps(voc.tools.last_edited, indent=4)
    with open("_last_edited.json", "w") as fp:
        fp.write(serialized)
# The persisted state is loaded at import time, i.e. also when this file is
# used as a library; both files are looked up in the current working directory.
load_sos_ids()
load_last_edited()

if __name__ == "__main__":
    generate_wiki_schedules(wiki_url)

    # commit the generated files, unless running in local mode without --git
    if options.git or not local:
        os.system("/usr/bin/env git add *.json *.xml")
        commit_command = "/usr/bin/env git commit -m 'updates from {}'".format(
            datetime.now()
        )
        os.system(commit_command)
        # os.system("/usr/bin/env git push")