-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmklist-publicdomainmovies-info
executable file
·89 lines (82 loc) · 2.87 KB
/
mklist-publicdomainmovies-info
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python
"""
Extract IMDB IDs of movies listed as in the public domain on
www.publicdomainmovies.info. Find IMDB title id by checking linked
Wikipedia info pages for links to IMDB.
"""
import lxml.html
import re
import time
import urllib2
import urlparse
import movielib
def _normalize_imdb_url(url):
    """
    Rewrite an IMDB link to use the www.imdb.com host, drop any query
    string and known sub-page suffix, and make it end with a slash.
    """
    url = url.replace('/imdb.com/', '/www.imdb.com/')
    url = url.replace('/us.imdb.com/', '/www.imdb.com/')
    # Cut the query string first so the suffix replacements below only
    # ever touch the path.
    url = url.split('?')[0]
    for subpage in ('/plotsummary', '/combined', '/fullcredits'):
        url = url.replace(subpage, '/')
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if not url.endswith('/'):
        url += '/'
    return url


def get_movie_info(entryurl):
    """
    Collect IMDB/Wikipedia links and the release year from a movie page.

    The page at entryurl is fetched with movielib.http_get_read() and
    parsed with lxml.  Every link on the page is inspected:

     - a link whose href contains "imdb.com" is normalized and stored
       under the 'imdb' key;
     - a link to "wikipedia.org/wiki" whose text contains "info"
       (case-insensitive) is stored under the 'wp' key and passed to
       movielib.wikipedia_lookup(); if that result has an 'imdb' entry
       it replaces the 'imdb' value.

    The year is the first four-digit run found in the text of a
    "span.post_tag a[rel='tag']" element; when several tags match, the
    last one wins.

    Returns a tuple (info, year) where info is a dict that may contain
    'imdb' and 'wp', and year is an int or None.  If fetching the page
    raises urllib2.HTTPError, ({}, None) is returned so that callers
    can always unpack the result.
    """
    try:
        root = lxml.html.fromstring(movielib.http_get_read(entryurl))
    except urllib2.HTTPError:
        # Keep the (info, year) shape even on failure; returning a bare
        # None here made tuple-unpacking callers crash.
        return {}, None

    info = {}
    for a in root.cssselect("a[href]"):
        href = a.attrib['href']
        if "imdb.com" in href:
            info['imdb'] = _normalize_imdb_url(href)
        if "wikipedia.org/wiki" in href \
           and 'info' in a.text_content().lower():
            info['wp'] = href
            wpinfo = movielib.wikipedia_lookup(href)
            if 'imdb' in wpinfo:
                info['imdb'] = wpinfo['imdb']

    year = None
    for a in root.cssselect("span.post_tag a[rel='tag']"):
        m = re.search(r"(\d{4})", a.text_content())
        if m:
            year = int(m.group(1))
    return info, year
def get_movie_list(summaryurl, l):
    """
    Add every movie linked from the listing page at summaryurl to l.

    summaryurl is fetched with movielib.http_get_read() and each
    "ul.wsp-posts-list li.wsp-post a" link is treated as one movie
    entry.  The entry page is passed to get_movie_info() to look for
    IMDB/Wikipedia links and a year.

    l is a dict that is updated in place.  The key is the IMDB URL when
    one was found, otherwise the entry page URL.  Each value holds
    'status', 'freenessurl' and 'title', plus 'year' and 'wp' when
    those are known.

    Returns the number of entries processed, or the string "n/a" if
    fetching the listing page raised urllib2.HTTPError.
    """
    print(summaryurl)
    try:
        root = lxml.html.fromstring(movielib.http_get_read(summaryurl))
    except urllib2.HTTPError:
        return "n/a"

    count = 0
    for a in root.cssselect("ul.wsp-posts-list li.wsp-post a"):
        title = a.text_content()
        entryurl = urlparse.urljoin(summaryurl, a.attrib['href'])
        info = get_movie_info(entryurl)
        if info is None:
            # The entry page could not be fetched; still record the
            # movie using what the listing page itself provides.
            e, year = {}, None
        else:
            e, year = info

        # There are no IMDB links at time of writing, so the entry URL
        # is the usual key.
        ref = e.get('imdb', entryurl)
        entry = {
            'status': 'free',
            'freenessurl': entryurl,
            'title': title,
        }
        if year:
            entry['year'] = year
        if 'wp' in e:
            entry['wp'] = e['wp']
        l[ref] = entry
        count += 1
    return count
def main():
    """
    Crawl the site's "all movies" listing and hand the collected
    entries to movielib.savelist().
    """
    movies = {}
    get_movie_list('https://publicdomainmovies.info/all-movies/', movies)
    movielib.savelist(movies, name='free-movies-publicdomainmovies-info.json')
def test_match():
    """
    Manual spot check: run get_movie_info() on two known entry pages
    and print the result for the second one.
    """
    silent_url = 'https://publicdomainmovies.info/amazing-mr-x-full-movie/'
    printed_url = 'https://publicdomainmovies.info/brain-wouldnt-die-full-movie/'
    get_movie_info(silent_url)
    result = get_movie_info(printed_url)
    print(result)
# Script entry point: only runs when the file is executed directly.
if __name__ == '__main__':
    # Uncomment the next line to spot-check two entry pages by hand.
    #test_match()
    main()