################################################################################
################################################################################
##################### #########################
##################### Release Our Data #########################
##################### #########################
##################### a HelloSilo Project #########################
##################### <[email protected]> #########################
################################################################################
## ##
## Copyright 2010 ##
## ##
## Parker Phinney @gameguy43 <[email protected]> ##
## Seth Woodworth @sethish <[email protected]> ##
## ##
## ##
## Licensed under the GPLv3 or later, ##
## see PERMISSION for copying permission ##
## and COPYING for the GPL License ##
## ##
################################################################################
################################################################################
import urllib2
import os.path
import imp
#import Queue
import sys
#import threading
import gevent
from gevent.queue import Queue
import traceback
import db
import shutil
from gevent import monkey
monkey.patch_all()
#import libs.cdc_phil_lib as imglib
# .scraper
# .parser
# .data_storer
import config
### General Utilities
def load_module(name):
fp, pathname, description = imp.find_module(name)
return imp.load_module(name, fp, pathname, description)
def mkdir(dirname):
if not os.path.isdir(dirname + "/"):
os.makedirs(dirname + "/")
def get_filename_base_for_id(id):
return str(id).zfill(5)
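# example: get_filename_base_for_id(42) -> "00042" (ids are zero-padded to 5 digits)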
def get_extension_from_path(path):
basename, extension = os.path.splitext(path)
return extension
### Utils for filesystem stuff
def floorify(id):
## mod 100 the image id numbers to make smarter folders
floor = id - id % 100
floored = str(floor).zfill(5)[0:3]+"XX"
return floored
def ceilingify(id):
## mod 100 the image id numbers to make smarter folders
ceiling = id - id % 100 + 100
ceilinged = str(ceiling).zfill(5)[0:3]+"XX"
return ceilinged
def get_subdir_for_id(id):
return floorify(id) + '-' + ceilingify(id) + '/'
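# example: get_subdir_for_id(1234) -> "012XX-013XX/", i.e. images are bucketed into folders of 100 ids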
### HIGH-LEVEL FUNCTIONS
## For Automating
def nightly(dl_images=True, from_hd=False):
scrape_all_sites(dl_images=dl_images, from_hd=from_hd)
def scrape_all_sites(dl_images=True, from_hd=False):
image_databases = config.image_databases
for name, data in image_databases.items():
myscraper = mkscraper(name)
myscraper.scrape_all(dl_images=dl_images, from_hd=from_hd)
## For Testing
def generate_test_dataset(dl_images=True, from_hd=False, to_test_db=False):
image_databases = config.image_databases
for name, data in image_databases.items():
myscraper = mkscraper(name, test=to_test_db)
indeces = myscraper.imglib.tests.known_good_indeces
myscraper.scrape_indeces(indeces, dl_images=dl_images, from_hd=from_hd)
def drop_all_tables():
image_databases = config.image_databases
for name, data in image_databases.items():
myscraper = mkscraper(name)
myscraper.db.truncate_all_tables()
break
# make a scraper based on the config
def mkscraper(image_db_key, test=False):
kwargs = {}
if test:
data_root_dir = config.test_data_root_dir
kwargs['db_url'] = config.test_db_url
print data_root_dir
else:
data_root_dir = config.data_root_dir
kwargs['db_url'] = config.db_url
img_db_config = config.image_databases[image_db_key]
data_base_dir = data_root_dir + img_db_config['data_subdir']
img_libraries_metalib = config.img_libraries_metalib
kwargs['imglib'] = getattr(img_libraries_metalib, img_db_config['python_lib'])
# we keep this around so that we can construct a different data path for the web
kwargs['data_library_subdir'] = img_db_config['data_subdir']
kwargs['data_dir'] = data_base_dir
kwargs['html_subdir'] = config.html_subdir
kwargs['data_table_prefix'] = img_db_config['data_table_prefix']
kwargs['max_daemons'] = config.max_daemons
kwargs['max_filesize'] = config.max_filesize
kwargs['web_data_base_dir'] = config.web_data_base_dir
kwargs['long_name'] = img_db_config['long_name']
kwargs['homepage'] = img_db_config['homepage']
kwargs['code_url'] = img_db_config['code_url']
kwargs['abbrev'] = image_db_key
kwargs['test'] = test
return Scraper(**kwargs)
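# Minimal usage sketch (assumes an image-db key such as 'fws' defined in config.image_databases):
#   myscraper = mkscraper('fws')
#   myscraper.scrape_indeces([2, 1234], dl_images=False, from_hd=False)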
class Scraper:
def __init__(self, imglib, db_url, data_dir, html_subdir, data_table_prefix, data_library_subdir, max_daemons=10, max_filesize=None, web_data_base_dir=None, long_name='', homepage='', code_url='', abbrev='', test=False):
self.imglib = imglib
self.resolutions = imglib.data_schema.resolutions
self.max_daemons = max_daemons
self.max_filesize = max_filesize
self.long_name = long_name
self.homepage = homepage
self.code_url = code_url
self.abbrev = abbrev
self.data_dir = data_dir
self.html_dir = data_dir + html_subdir
self.testing = test
# we keep this around so that we can construct a different data path for the web
self.data_library_subdir = data_library_subdir
if web_data_base_dir:
self.web_data_base_dir = web_data_base_dir + self.data_library_subdir
metadata_table_name = data_table_prefix + "metadata"
db_kwargs = {
'data_schema' : self.imglib.data_schema,
'db_url' : db_url,
'metadata_table_name' : metadata_table_name,
'scraper' : self,
}
self.db = db.DB(**db_kwargs)
### HIGH-LEVEL
def scrape_all(self, dl_images=True, from_hd=False):
floor = self.db.get_highest_id_in_our_db()
ceiling = self.imglib.scraper.get_highest_id()
indeces = range(floor, ceiling+1)
self.scrape_indeces(indeces, dl_images=dl_images, from_hd=from_hd)
### FILESYSTEM STUFF
    # if this gets heavy, we could move it to its own file, like db.py
## READING
def get_resolution_download_dir(self, resolution):
return self.data_dir + self.resolutions[resolution]['subdir']
def get_resolution_local_image_location(self, resolution, id, remote_url=None):
extension = self.get_resolution_extension(resolution, id)
return self.get_resolution_download_dir(resolution) + get_subdir_for_id(id) + get_filename_base_for_id(id) + extension
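    # example (sketch): for id 1234 this composes something like
    #   <data_dir><resolution subdir>012XX-013XX/01234.jpg
    # where the extension comes from the remote URL stored in the db when known,
    # otherwise the resolution's default extension (see get_resolution_extension)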
def get_local_html_file_location(self, id):
filename_base = get_filename_base_for_id(id)
subdir = get_subdir_for_id(id)
#TODO: let's not do this any more
mkdir(self.html_dir + subdir)
return self.html_dir + subdir + filename_base + '.html'
    def get_local_raw_html(self, id):
        local_html_file_location = self.get_local_html_file_location(id)
        fp = open(local_html_file_location, 'r')
        html = fp.read()
        fp.close()
        return html
## WRITING
def store_raw_html(self, id, html):
local_html_file_location = self.get_local_html_file_location(id)
fp = open(local_html_file_location, 'w')
fp.write(html)
fp.close()
# bootstrapping: make the right data subdirs
def make_directories_if_necessary(self, ids, root_dir):
## directories for image downloads
subdirs = map(get_subdir_for_id, ids)
# this removes duplicates
subdirs = set(subdirs)
# convert the floors into strings of format like 015XX
# also, make the effing directories
map((lambda dirname: mkdir(root_dir + dirname)), subdirs)
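    # example: ids [1, 150] would create <root_dir>000XX-001XX/ and <root_dir>001XX-002XX/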
### DOWNLOADING
# download all images that haven't already been downloaded
    # (according to our db, not according to the filesystem)
def download_all_images(self):
for resolution, resolution_data in self.resolutions.items():
self.download_resolution_images(resolution)
# (helper for above)
def download_resolution_images(self, resolution):
# get the list of (id, url) tuples corresponding to the images we need to download
dl_these_tuples = self.db.get_set_images_to_dl(resolution)
# bootstrap file structure for download
ids = map(lambda tuple: tuple[0], dl_these_tuples)
root_dir = self.get_resolution_download_dir(resolution)
self.make_directories_if_necessary(ids, root_dir)
# populate the queue
q = Queue()
map(q.put, dl_these_tuples)
# make the downloaders and get to it!
dlers = [gevent.spawn(self.image_downloader, q, resolution) for i in range(self.max_daemons)]
gevent.joinall(dlers)
# this is run as a gevent "green thread"
def image_downloader(self, q, resolution):
while True:
# if the queue is empty, we're done!
if q.empty():
break
# grab url/id tuple from queue
id_url_tuple = q.get()
try:
print id_url_tuple
self.dl_image(resolution, id_url_tuple[0], id_url_tuple[1])
except KeyboardInterrupt:
sys.exit(0)
except:
print "ERROR: trouble dling image apparently... " + str(id)
traceback.print_exc()
continue
    def dl_image(self, resolution, id, url):
        local_filename = self.get_resolution_local_image_location(resolution, id, url)
        # if we already know the file is too big
        if (self.db.get_is_marked_as_too_big(id, resolution)):
            print "this file is marked as too big (won't dl): " + local_filename
            return False
remote = urllib2.urlopen(url)
filesize = int(remote.info().getheaders("Content-Length")[0])
# if we're just noticing that the file is too big
if (self.max_filesize and filesize > self.max_filesize):
print "looks like this file is too big (won't dl): " + local_filename
# signal to db that we're done downloading
self.db.mark_img_as_too_big(id, resolution)
print "finished marking as too big " + url
return False
# if the file isn't too big
else:
# download it!
            local = open(local_filename, 'wb')  # binary mode for image data
local.write(remote.read())
local.close()
print "finished downloading " + url
# signal to db that we're done downloading
self.db.mark_img_as_downloaded(id, resolution)
print "finished marking as downloaded " + url
return True
    # download to disk the html files for these indeces
    # useful for grabbing html files to toss into a site's /samples
def dl_html_for_indeces(self, indeces):
# populate the queue
q = Queue()
map(q.put, indeces)
# make the downloaders and get to it!
dlers = [gevent.spawn(self.html_downloader, q) for i in range(self.max_daemons)]
gevent.joinall(dlers)
# this is run as a gevent "green thread"
def html_downloader(self, q):
while True:
# if the queue is empty, we're done!
if q.empty():
break
# grab the id from the queue
id = q.get()
try:
print id
self.dl_html(id)
except KeyboardInterrupt:
sys.exit(0)
except:
print "ERROR: trouble dling html apparently... " + str(id)
traceback.print_exc()
continue
def dl_html(self, id):
html = self.imglib.scraper.scrape_out_img_page(id)
self.store_raw_html(id, html)
return True
### PARSING
def parse_indeces(self, indeces):
for id in indeces:
try:
self.parse_id(id)
except:
continue
def parse_id(self, id):
# get the html
try:
html = self.get_local_raw_html(id)
except:
print "ERROR: couldn't grab local raw HTML for id " + str(id)
traceback.print_exc()
return False
try:
metadata = self.imglib.parser.parse_img_html_page(html)
metadata = self.imglib.parser.post_processing(metadata)
except:
print "ERROR: couldn't parse raw html for id " + str(id)
metadata = {
'id': id,
'we_couldnt_parse_it': True,
}
self.db.store_metadata_row(metadata)
print "we just recorded in the DB the fact that we couldn't parse this one"
traceback.print_exc()
return False
if not metadata or metadata == {}:
print "ERROR: we thought we parsed raw html for id " + str(id) + ", but we got a blank dict back"
metadata = {
'id': id,
'we_couldnt_parse_it': True,
}
self.db.store_metadata_row(metadata)
print "we just recorded in the DB the fact that we couldn't parse this one"
traceback.print_exc()
return False
try:
self.db.store_metadata_row(metadata)
except:
print "ERROR: couldn't store metadata for id " + str(id)
traceback.print_exc()
return False
return True
def scrape_indeces(self, indeces, dl_images=True, from_hd=False):
## download the html
if not from_hd:
self.dl_html_for_indeces(indeces)
## parse the html
self.parse_indeces(indeces)
## download the images
if dl_images:
self.download_all_images()
'''
def scrape_indeces(self, indeces, dl_images=True, from_hd=False):
## main glue function
failed_indices = []
if not from_hd:
try:
#TODO: i think i can cut this cookie stuff
#cookiejar = self.imglib.scraper.get_me_a_cookie()
cookiejar = None
except KeyboardInterrupt:
sys.exit(0)
except:
print "ERROR: WE COULDN'T EVEN GET A COOKIE"
traceback.print_exc()
return None
for current_id in indeces:
print "STARTING: " + str(current_id)
if not from_hd:
try:
# 1: fetching html of id, for store and parse
print "downloading html for " + str(current_id) + " ..."
html = self.imglib.scraper.scrape_out_img_page(current_id, cookiejar)
print "downloaded"
except KeyboardInterrupt:
sys.exit(0)
except:
print "ERROR: couldn't scrape out html for id " + str(current_id)
failed_indices.append(current_id)
traceback.print_exc()
continue
# if we got a session error page:
#TODO: pretty sure this doesn't work any more. we won't try the id again if we get a session error page
if self.imglib.scraper.is_session_expired_page(html):
times_to_try_getting_cookie = 3
print "SESSION error. Getting a new cookie...we'll give this " + str(times_to_try_getting_cookie) + " tries..."
try_num = 1
while try_num <= times_to_try_getting_cookie:
try:
cookiejar = get_me_a_cookie()
except KeyboardInterrupt:
sys.exit(0)
except:
print "eep, no luck. giving it another shot..."
try_num+=1
continue
# refreshed cookie, returning to loop
print "SESSION success. got a new cookie."
break
# but if we didn't get a session error page
else:
try:
# 2: write html to disk
self.store_raw_html(current_id, html)
except KeyboardInterrupt:
sys.exit(0)
except:
print "ERROR: couldn't store raw html for id " + str(current_id)
failed_indices.append(current_id)
traceback.print_exc()
continue
# if we wanted to get the html from disk
else:
try:
html = self.get_local_raw_html(current_id)
except:
traceback.print_exc()
continue
try:
# 3: parse the metadata out of their html
metadata = self.imglib.parser.parse_img_html_page(html)
metadata = self.imglib.parser.post_processing(metadata)
except KeyboardInterrupt:
sys.exit(0)
except:
print "ERROR: couldn't parse raw html for id " + str(current_id)
metadata = {
'id': current_id,
'we_couldnt_parse_it': True,
}
self.db.store_metadata_row(metadata)
print "we just recorded in the DB the fact that we couldn't parse this one"
failed_indices.append(current_id)
traceback.print_exc()
continue
if not metadata or metadata == {}:
print "ERROR: we thought we parsed raw html for id " + str(current_id) + ", but we got a blank dict back"
metadata = {
'id': current_id,
'we_couldnt_parse_it': True,
}
self.db.store_metadata_row(metadata)
print "we just recorded in the DB the fact that we couldn't parse this one"
failed_indices.append(current_id)
traceback.print_exc()
continue
try:
self.db.store_metadata_row(metadata)
except KeyboardInterrupt:
sys.exit(0)
except:
print "ERROR: couldn't store metadata for id " + str(current_id)
failed_indices.append(current_id)
traceback.print_exc()
continue
            # These lines will only run if everything went according to plan
print "SUCCESS: everything went according to plan for id " + str(current_id)
#print "bootstrapping status tables..."
#self.bootstrap_status_tables()
print "HOLY CRAP WE ARE DONE"
if not len(failed_indices) == 0:
print "Failed at " + str(len(failed_indices)) + " indices :"
print failed_indices
if dl_images:
print "k, trying to get the images now"
self.download_all_images()
'''
### MISC
def get_resolution_extension(self, resolution, id):
try:
remote_url = self.db.get_resolution_url(resolution, id)
extension = get_extension_from_path(remote_url)
except:
extension = self.resolutions[resolution]['extension']
return extension
### CROSS-POSTING STUFF
def upload_to_wikicommons_if_unique(self, id):
import wikiuploader
myuploader = wikiuploader.WikiUploader(self, testing=self.testing)
metadata = self.db.get_image_metadata_dict(id)
myuploader.upload_to_wikicommons_if_unique(metadata)
#### WEB STUFF
def set_web_vars(self, web_data_base_dir):
self.web_data_base_dir = web_data_base_dir + self.data_library_subdir
def get_web_resolution_local_image_location(self, resolution, id, remote_url=None):
extension = self.get_resolution_extension(resolution, id)
return self.web_data_base_dir + self.resolutions[resolution]['subdir'] + get_subdir_for_id(id) + get_filename_base_for_id(id) + extension
def get_image_html_repr(self, id):
kwargs = {
'image_as_dict' : self.db.get_image_metadata_dict(id),
'image_resolution_to_local_file_location_fxn' :
lambda resolution: self.get_web_resolution_local_image_location(resolution, id),
}
html = self.db.repr_as_html(**kwargs)
return html
### TESTING/SETUP/ETC UTILS
# update the download statuses in our DB based on the FS
# useful if you accidentally kill your database
# NOTE: this will add rows even for ids that we don't have rows for yet
def update_download_statuses_based_on_fs(self, ceiling_id=50000):
for resolution, res_data in self.resolutions.items():
self.update_resolution_download_status_based_on_fs(resolution, ceiling_id)
# helper for above
def update_resolution_download_status_based_on_fs(self, resolution, ceiling_id=50000):
## go through ids and check if we have them
        # TODO: this doesn't work currently because we don't know the extension of the downloaded file
root_dir = self.get_resolution_download_dir(resolution)
ids = range(1, ceiling_id+1)
for id in ids:
print "checking " + str(id)
local_file_location = self.get_resolution_local_image_location(resolution, id)
we_have_it = os.access(local_file_location,os.F_OK)
print str(we_have_it)
if we_have_it:
self.db.mark_img_as_downloaded(id, resolution)
print "marked as downloaded"
else:
self.db.mark_img_as_not_downloaded(id, resolution)
print "marked as not downloaded"
def clear_all_data(self):
# clear out the mysql data
self.db.truncate_all_tables()
# create path to move the old data to (good to have a backup)
if self.data_dir[-1] == '/':
backup_data_dir = self.data_dir[0:-1] + '_old'
else:
backup_data_dir = self.data_dir + '_old'
# move the data dir to a backup one
# first, nuke the destination
if os.path.isdir(backup_data_dir):
shutil.rmtree(backup_data_dir)
if os.path.isdir(self.data_dir):
shutil.move(self.data_dir, backup_data_dir)
if __name__ == '__main__':
do_this = 'nightly'
if do_this == 'nightly':
dl_images = True
from_hd = False
nightly(dl_images, from_hd)
elif do_this == 'testing':
generate_test_dataset(dl_images=True, from_hd=False)
#generate_test_dataset(dl_images=False, from_hd=True)
elif do_this == 'update_download_statuses':
for name, data in config.image_databases.items():
myscraper = mkscraper(name)
ceiling_id = myscraper.db.get_highest_id_in_our_db()
myscraper.update_download_statuses_based_on_fs(ceiling_id)
elif do_this == 'dl_indeces':
name = 'fws'
indeces = [2,1234]
myscraper = mkscraper(name)
myscraper.dl_html_for_indeces(indeces)
else:
pass