-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmklist-archive-org-search
executable file
·182 lines (169 loc) · 6.69 KB
/
mklist-archive-org-search
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python
"""
Search for movies in the Internet Archive, and output the list with
IMDB title IDs similar to the other list of free movies. Look for
IMDB title references in the description and other relevant tags.
It is based on how the Butter plugin search for movies, but extended
to get more hits.
If the search result do not return a IMDB ID, see if the wikidata set
or the manually created data set have such ID and use this one.
"""
import copy
import json
import movielib
import re
import time
import urllib
import urllib2
def fetch(page=1, require_year=False):
    """
    Fetch one page of search results from the Internet Archive.

    Based on the search URL found in
    https://github.com/butterproviders/butter-provider-archive

    The search result is less than 1000 entries 2017-11-05.
    Removing the 'year' requirement results in several thousand entries more.

    page is the 1-based result page passed to advancedsearch.php (the
    URL asks for up to 100000 rows per page).  When require_year is
    true, " AND year" is appended to the query, as in the original
    Butter search; it is off by default.

    Returns the decoded JSON response, or None if the server answered
    with an HTTP error (the error is printed).  Other exceptions, such
    as connection failures or invalid JSON, propagate to the caller.
    """
    print("Fetching page %d" % page)

    # Search term:
    term = """
(collection:moviesandfilms
OR collection:animationandcartoons
OR collection:classic_tv
OR collection:classic_cartoons)
AND NOT collection:movie_trailers
AND NOT collection:sabucat_trailers
AND NOT collection:stock_footage
AND NOT collection:home_movies
AND NOT collection:prelinger_mashups
AND NOT collection:brick_films
AND -mediatype:collection
AND format:"Archive BitTorrent"
"""
    # Limiting to entries with year was part of the original Butter
    # search, but reduces around 17000 entries to around 500.
    if require_year:
        term = term + " AND year"

    url = 'https://archive.org/advancedsearch.php?sort%%5B%%5D=&sort%%5B%%5D=&sort%%5B%%5D=&output=json&rows=100000&page=%d' % page
    url = url + '&q=' + urllib.quote_plus(term)
    headers = {"Accept": "application/json"}
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        # urllib2 responses are not context managers in Python 2, so
        # close the connection explicitly once the body is read.
        try:
            jsondata = response.read()
        finally:
            response.close()
    except urllib2.HTTPError as e:
        # Use %-formatting: without print_function, print("a", b)
        # would output a tuple under Python 2.
        print("Error: %s" % e)
        return None
    return json.loads(jsondata)
# IMDB title URL.  "[w.]*" tolerates an optional "www." host prefix, and
# the title ID is limited to "tt" followed by digits so that trailing
# punctuation, HTML markup or following lines are never made part of it.
_IMDB_TITLE_RE = re.compile(r'https?://[w.]*imdb\.com/title/(tt\d+)')


def locate_imdb_refs(text):
    """
    Return the IMDB title URLs referenced in text.

    text is either a string or a list/tuple of strings (joined with
    spaces before searching).  None or an empty value yields [].

    Every URL found is returned in the normalized form
    'http://www.imdb.com/title/tt<digits>/' (http scheme, www host,
    trailing slash), in order of appearance.  Duplicates are not
    removed.
    """
    if not text:
        return []
    if isinstance(text, (list, tuple)):
        text = " ".join(text)
    # Cheap substring test to skip the regex for the common case of
    # text without any IMDB reference.
    if 'imdb.com/title/tt' not in text:
        return []
    return ['http://www.imdb.com/title/%s/' % title_id
            for title_id in _IMDB_TITLE_RE.findall(text)]
# Entry keys that carry a URL: "archive" or "freenessurl", optionally
# followed by a sequence number of any size.
_URL_FIELD_RE = re.compile(r'^(archive|freenessurl)(\d*)$')


def _url_field_order(field):
    """Sort key giving archive, archive1, ... then freenessurl, freenessurl1, ..."""
    prefix, number = _URL_FIELD_RE.match(field).groups()
    return (prefix, int(number) if number else -1)


def loadlist(l, path):
    """
    Merge the movie list stored in the JSON file path into the dict l.

    The file is expected to hold an object mapping an ID to a dict of
    movie fields.  In every loaded entry the individual URL fields
    ('archive', 'archiveN', 'freenessurl', 'freenessurlN') are removed
    and their distinct values collected, in field order, into a single
    'freenessurls' list.  Any sequence number is accepted, so lists
    written with more numbered fields than usual are read back in full.

    IDs not yet in l are added as-is; for IDs already present only the
    URLs missing from l[id]['freenessurls'] are appended, the other
    fields of the existing entry are left alone.

    l is modified in place.  If the file cannot be read, a warning is
    printed, l is left unchanged and an empty dict is returned (as
    before); otherwise nothing is returned.
    """
    try:
        with open(path, 'rt') as infile:
            loaded = json.load(infile)
    except IOError as e:
        print("Warning: unable to read %s: %s" % (path, e))
        return {}

    for movie_id, entry in loaded.items():
        url_fields = sorted((f for f in entry if _URL_FIELD_RE.match(f)),
                            key=_url_field_order)
        freenessurls = []
        for field in url_fields:
            # Always drop the individual field, also when its URL is a
            # duplicate of one already collected.
            url = entry.pop(field)
            if url not in freenessurls:
                freenessurls.append(url)
        entry['freenessurls'] = freenessurls

        if movie_id not in l:
            l[movie_id] = entry
        else:
            known = l[movie_id]['freenessurls']
            for url in freenessurls:
                if url not in known:
                    known.append(url)
def main():
    """
    Build free-movies-archive-org-search.json from an archive.org search.

    Loads the wikidata and manually maintained lists, then walks the
    search result pages from fetch() until a page is empty or the fetch
    fails.  Each hit is keyed by the IMDB title URL(s) found in its
    'description' and 'stripped_tags' fields; failing that, by the ID(s)
    under which the loaded lists already know its archive.org URL; and
    failing that, by the archive.org URL itself.  Hits sharing a key are
    merged.  The result is written with movielib.savelist().
    """
    wplist = {}
    loadlist(wplist, 'free-movies-archive-org-wikidata.json')
    loadlist(wplist, 'free-movies-manual.json')
    print("C %d" % len(wplist))

    # Reverse index from freeness URL to the IDs using it, built once so
    # each search hit is a dictionary lookup instead of a scan of wplist.
    known_ids = {}
    for wpimdb, info in wplist.items():
        for url in info.get('freenessurls', ()):
            known_ids.setdefault(url, []).append(wpimdb)

    outlist = {}
    page = 1
    while True:
        result = fetch(page)
        if not result or not result['response']['docs']:
            break
        for doc in result['response']['docs']:
            freenessurl = "https://archive.org/details/%s" % doc['identifier']

            ids = set()
            for field in ('description', 'stripped_tags'):
                if field in doc:
                    ids.update(locate_imdb_refs(doc[field]))

            # Pick the first title if the result is a list
            title = doc['title']
            if isinstance(title, list):
                title = sorted(title)[0]

            fle = {
                'status': 'free',
                'freenessurls': [freenessurl],
                'title': title,
            }
            if 'year' in doc:
                fle['year'] = doc['year']

            # Check if the archive.org ID already have a known IMDB title ID
            if not ids:
                ids.update(known_ids.get(freenessurl, ()))
            # If not, use archive.org ID as unique ID
            if not ids:
                ids.add(freenessurl)

            for i in ids:
                if i not in outlist:
                    # Copy, as fle holds a list and may be stored under
                    # several IDs.
                    outlist[i] = copy.deepcopy(fle)
                    continue
                stored = outlist[i]
                # Make stored year predictable, select oldest one.
                if 'year' in fle:
                    if 'year' not in stored or stored['year'] > fle['year']:
                        stored['year'] = fle['year']
                if freenessurl not in stored['freenessurls']:
                    stored['freenessurls'].append(freenessurl)
                # Make stored title predictable, select one
                # using sorting order.
                if fle['title'] < stored['title']:
                    stored['title'] = fle['title']
        page = page + 1
        # Pause between page requests.
        time.sleep(3)

    # Rewrite freenessurls array to individual fields (freenessurl,
    # freenessurl1, freenessurl2...) in predictable/sorted order
    for entry in outlist.values():
        for seq, url in enumerate(sorted(entry['freenessurls'])):
            if 0 == seq:
                field = 'freenessurl'
            else:
                field = "freenessurl%d" % seq
            entry[field] = url
        del entry['freenessurls']

    movielib.savelist(outlist, 'free-movies-archive-org-search.json')
    print("Wrote %d" % len(outlist))


# Run the search only when executed as a script, not when imported.
if __name__ == '__main__':
    main()