# tests.py
import unittest2 as unittest
import scraper
import config
import os


class TestScraperFunctions(unittest.TestCase):

    def setUp(self):
        self.maxDiff = None
        # instantiate a scraper for the test image library
        self.imglib_name = 'fema'
        print "SETTING UP"
        self.myscraper = scraper.mkscraper(self.imglib_name, test=True)
        self.imglib = self.myscraper.imglib
        # clear out everything that's already in the test DB for this site,
        # so every test starts from a clean slate
        self.myscraper.clear_all_data()

    '''
    def test_db_creation(self):
        db = self.myscraper.get_or_create_db()
        # TODO: assert things about the database:
        #   - it has the right tables
        #   - insert something and then grab it back out
    '''
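
    # A possible shape for test_db_creation once it can be re-enabled (a sketch
    # only -- it assumes get_or_create_db() returns the same db object used
    # elsewhere in this file, i.e. one exposing metadata_table):
    #
    #   db = self.myscraper.get_or_create_db()
    #   self.assertTrue(hasattr(db, 'metadata_table'))
    #   # freshly cleared by setUp, so the metadata table should start empty
    #   self.assertEqual(len(db.metadata_table.all()), 0)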

    def scrape_known_good_indeces(self):
        # NOTE: not auto-discovered by unittest (the name doesn't start with
        # 'test_'); test_scrape below repeats the same steps inline.
        # grab the known good indeces from the image library's test config
        known_good_indeces = self.imglib.tests.known_good_indeces
        known_good_indeces.sort()
        max_known_good_index = known_good_indeces.pop()
        # whoops--now we're missing the last one. better put that back.
        known_good_indeces.append(max_known_good_index)
        # do a scrape on them
        #self.myscraper.scrape_indeces(known_good_indeces, dl_images=False, from_hd=True) #DEBUG TODO
        self.myscraper.scrape_indeces(known_good_indeces, dl_images=True, from_hd=False)
        #TODO: test the scraping-from-hd functionality

    def test_scrape(self):
        # grab the known good indeces
        known_good_indeces = self.imglib.tests.known_good_indeces
        known_good_indeces.sort()
        max_known_good_index = known_good_indeces.pop()
        # whoops--now we're missing the last one. better put that back.
        known_good_indeces.append(max_known_good_index)
        # do a scrape on them
        #self.myscraper.scrape_indeces(known_good_indeces, dl_images=False, from_hd=True) #DEBUG TODO
        self.myscraper.scrape_indeces(known_good_indeces, dl_images=True, from_hd=False)
        # check that we have the right number of rows in the database
        rows = self.myscraper.db.metadata_table.all()
        import pprint
        pprint.pprint(rows)
        pprint.pprint(known_good_indeces)
        self.assertEqual(len(known_good_indeces), len(rows))
        # check that the ids in the rows are right
        all_rows = self.myscraper.db.metadata_table.all()
        ids = map(lambda row: int(row.id), all_rows)
        self.assertEqual(set(ids), set(known_good_indeces))
        # check that the rows for the known metadata mappings actually have the right data
        known_metadata_mappings = self.imglib.tests.known_metadata_mappings
        for id, known_metadata_mapping in known_metadata_mappings.items():
            # this is kind of hackey, but makes sense: if we didn't put the
            # known good data through the same encoding and decoding process,
            # we could hit mismatches like a list of tuples instead of a dict
            known_metadata_mapping = self.myscraper.db.re_objectify_data(
                self.myscraper.db.prep_data_for_insertion(known_metadata_mapping))
            in_db_data = self.myscraper.db.get_image_metadata_dict(id)
            for key, known_data in known_metadata_mapping.items():
                print key
                self.assertEqual(known_data, in_db_data[key])
        # check that all the images are marked as downloaded
        # first, do it by hand
        check_this_id = known_good_indeces[0]
        check_this_id = str(check_this_id)
        self.assertTrue(self.myscraper.db.get_image_metadata_dict(check_this_id)['thumb_status'])
        # then do it the modular way
        num_statuses_checked = 0
        for id in known_good_indeces:
            metadata = self.myscraper.db.get_image_metadata_dict(id)
            for resolution, resolution_info in self.myscraper.resolutions.items():
                if not self.myscraper.db.get_is_marked_as_too_big(id, resolution):
                    num_statuses_checked += 1
                    self.assertTrue(metadata[resolution_info['status_column_name']])
        # make sure that we're at least checking a reasonable number of statuses
        self.assertGreater(num_statuses_checked, 3)
        # make sure that we have the HTML and image files for each of the known good indeces
        for id in known_good_indeces:
            html_file = self.myscraper.get_local_html_file_location(id)
            print "making sure we have " + html_file
            self.assertTrue(os.access(html_file, os.F_OK))
            for resolution in self.myscraper.resolutions:
                if not self.myscraper.db.get_is_marked_as_too_big(id, resolution):
                    remote_url = self.myscraper.db.get_resolution_image_url(id, resolution)
                    extension = scraper.get_extension_from_path(remote_url)
                    file = self.myscraper.get_resolution_local_image_location(resolution, id, remote_url)
                    print "making sure we have " + file
                    self.assertTrue(os.access(file, os.F_OK))
        #TODO: test that all the images are there too
        #      (not sure how to handle)
        #      manually check that a specific one of the rows has all the right content? maybe
        #TODO: check that update_resolution_download_status_based_on_fs(self, resolution, ceiling_id=50000) works

    #TODO: this test should be uncommented. However, right now it generates:
    #      ArgumentError: this Column already has a table!
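    # (That ArgumentError looks like SQLAlchemy objecting to a Column instance
    # being attached to a second Table -- e.g. if the schema gets declared again
    # when a second scraper/db object is created -- but that's a guess and would
    # need to be confirmed against the scraper module.)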

    def test_next_id(self):
        known_good_indeces = self.imglib.tests.known_good_indeces
        for id in known_good_indeces:
            print id
            self.assertTrue(isinstance(self.myscraper.db.get_next_successful_image_id(id), int))
            self.assertTrue(isinstance(self.myscraper.db.get_prev_successful_image_id(id), int))
        # order the good indeces
        known_good_indeces.sort()
        # we'll download good1 and good3, but not good2
        good1 = known_good_indeces[0]
        good2 = known_good_indeces[1]
        good3 = known_good_indeces[2]
        self.myscraper.scrape_indeces([good1], dl_images=True, from_hd=False)
        self.myscraper.scrape_indeces([good3], dl_images=True, from_hd=False)
        local_file_we_dont_dl = self.myscraper.get_resolution_local_image_location(self.myscraper.resolutions.keys()[0], good2)
        # touch the file we won't dl, to trick our scraper into thinking we've downloaded it
        #open(local_file_we_dont_dl, 'a')
        self.myscraper.update_download_statuses_based_on_fs(ceiling_id=good3)
        self.assertEqual(self.myscraper.db.get_next_successful_image_id(good1), good3)
        self.assertEqual(self.myscraper.db.get_prev_successful_image_id(good3), good1)

    def test_update_download_statuses(self):
        #TODO
        pass
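        # One possible sketch for this test (a sketch only -- it reuses calls
        # that already appear elsewhere in this file, and their exact semantics
        # would need to be confirmed before enabling it):
        #
        #   known_good_indeces = self.imglib.tests.known_good_indeces
        #   self.myscraper.scrape_indeces(known_good_indeces[0:1], dl_images=True, from_hd=False)
        #   id = known_good_indeces[0]
        #   resolution = self.myscraper.resolutions.keys()[0]
        #   # lie to the DB, then let the filesystem scan correct it
        #   self.myscraper.db.mark_img_as_not_downloaded(id, resolution)
        #   self.myscraper.update_download_statuses_based_on_fs(ceiling_id=id)
        #   status_column = self.myscraper.resolutions[resolution]['status_column_name']
        #   self.assertTrue(self.myscraper.db.get_image_metadata_dict(id)[status_column])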

    def test_get_set_images_to_dl(self):
        resolution = self.myscraper.resolutions.keys()[0]
        known_good_indeces = self.imglib.tests.known_good_indeces
        self.myscraper.scrape_indeces(known_good_indeces[0:2+1], dl_images=True, from_hd=False)
        a = known_good_indeces[0]
        b = known_good_indeces[1]
        c = known_good_indeces[2]
        self.assertNotEqual(a, b)
        self.assertNotEqual(b, c)
        a_url = self.myscraper.db.get_resolution_url(resolution, a)
        b_url = self.myscraper.db.get_resolution_url(resolution, b)
        c_url = self.myscraper.db.get_resolution_url(resolution, c)
        # mark a as downloaded, b as not downloaded, and c as too big
        self.myscraper.db.mark_img_as_downloaded(a, resolution)
        self.myscraper.db.mark_img_as_not_downloaded(b, resolution)
        self.myscraper.db.mark_img_as_too_big(c, resolution)
        reported_images_to_dl = self.myscraper.db.get_set_images_to_dl(resolution)
        reported_images_to_dl.sort()
        # we should only want to download b
        expected_images_to_dl = [(b, b_url)]
        expected_images_to_dl.sort()
        self.assertEqual(reported_images_to_dl, expected_images_to_dl)
        not_expected_images_to_dl = [(a, a_url)]
        not_expected_images_to_dl.sort()
        self.assertNotEqual(reported_images_to_dl, not_expected_images_to_dl)

    #TODO: case: * grab the highest index in the database
    # maybe. or just make this a test that lives in the individual image libraries
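
# The suite can be run directly (sketch of usage; assumes scraper and config
# are importable from the working directory):
#
#   python tests.py
#
# The verbosity=2 passed to unittest.main() below prints one line per test.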

if __name__ == '__main__':
    unittest.main(verbosity=2)