-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmklist-publicdomainmovies-net
executable file
·136 lines (127 loc) · 4.61 KB
/
mklist-publicdomainmovies-net
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Extract IMDB IDs of movies listed as being in the public domain on
www.publicdomainmovies.net.
"""
import argparse
import json
import lxml.html
import movielib
import re
import urllib2
import urlparse
def filter_imdb_ref(imdburl):
    """Normalise a link to an IMDB page into a canonical form.

    The host variants ``imdb.com`` and ``us.imdb.com`` are rewritten to
    ``www.imdb.com``, the query string and fragment are dropped, the
    ``/plotsummary``, ``/combined`` and ``/fullcredits`` sub-page
    suffixes are collapsed to ``/``, the ``/rg/title-tease//title/``
    prefix is reduced to ``/title/``, and a trailing slash is added if
    one is missing.

    :param imdburl: URL string to normalise.
    :returns: the normalised URL string; it always ends with ``/``.
    """
    imdburl = imdburl.replace('/imdb.com/', '/www.imdb.com/')
    imdburl = imdburl.replace('/us.imdb.com/', '/www.imdb.com/')
    # Drop the query string first, then any fragment.
    imdburl = imdburl.split('?')[0]
    imdburl = imdburl.split('#')[0]
    for subpage in ('/plotsummary', '/combined', '/fullcredits'):
        imdburl = imdburl.replace(subpage, '/')
    imdburl = imdburl.replace('/rg/title-tease//title/', '/title/')
    # endswith() rather than imdburl[-1]: the string can be empty here
    # (e.g. when the link starts with '?' or '#'), and indexing an empty
    # string raises IndexError.
    if not imdburl.endswith('/'):
        imdburl = imdburl + '/'
    return imdburl
def get_last_page(urlbase):
    """Look up the number of the last listing page for a category.

    Page 1 of the listing is fetched and the ``page=N`` value is read
    from the href of the "last" pager link (``ul li.pager-last a``).

    :param urlbase: URL template containing one ``%d`` placeholder for
        the page number.
    :returns: the page number as an int, or None if the page could not
        be fetched (HTTP error), has no "last" pager link, or the link
        carries no ``page=N`` value.
    """
    summaryurl = urlbase % 1
    try:
        root = lxml.html.fromstring(movielib.http_get_read(summaryurl))
    except urllib2.HTTPError:
        print("HTTP error for %s" % summaryurl)
        return None
    lastref = root.cssselect("ul li.pager-last a")
    if not lastref:
        # No pager on the page: the last page number is unknown.
        return None
    # The leading '.*' is greedy, so the last 'page=N' in the href wins.
    m = re.search(r".*page=(\d+)", lastref[0].attrib.get('href', ''))
    if m is None:
        return None
    return int(m.group(1))
def get_movie_list(args, l, urlbase, page=1):
    """Scrape one listing page and merge the movies found into ``l``.

    Each table row under ``div.view-content`` is treated as one movie.
    The entry is keyed by its normalised IMDB URL when the row links to
    imdb.com; otherwise by the result of an IMDB title/year search (only
    when ``args.imdblookup`` is set); otherwise by the URL of the
    movie's own page on the site.  When the key is already present, the
    page URL is recorded in the first unused ``freenessurl`` ..
    ``freenessurl4`` slot instead of replacing the entry.

    :param args: parsed command line options; only ``imdblookup`` is read.
    :param l: dict of collected movies, updated in place.
    :param urlbase: URL template containing one ``%d`` placeholder for
        the page number.
    :param page: page number to fetch.
    :returns: the number of table rows seen on the page, or the string
        ``"n/a"`` if fetching the page failed with an HTTP error.
    """
    summaryurl = urlbase % page
    print(summaryurl)
    try:
        root = lxml.html.fromstring(movielib.http_get_read(summaryurl))
    except urllib2.HTTPError:
        print("HTTP error for %s" % summaryurl)
        return "n/a"
    count = 0
    for row in root.cssselect("div.view-content table tbody > tr"):
        # Count every row seen, so the caller keeps paging even if a
        # row turns out to be unusable.
        count = count + 1
        title = row.cssselect("h1")[0].text_content()
        year = None
        yearstr = row.cssselect("div.date a")[0].text_content().strip()
        if yearstr:
            year = int(yearstr)
        #print(title, year)
        entryurl = None
        imdburl = None
        for a in row.cssselect("a[href]"):
            href = a.attrib['href']
            # The first link to a /movie/ page is the entry's own page.
            if entryurl is None and "/movie/" in href:
                entryurl = urlparse.urljoin(summaryurl, href)
            # If several IMDB links are present, the last one wins.
            if "imdb.com/" in href:
                imdburl = filter_imdb_ref(href)
        e = imdburl
        info = {
            'status' : 'free',
            'freenessurl' : entryurl,
            'title' : title,
        }
        if not e and args.imdblookup:
            try:
                imdburl = movielib.imdb_find_one(title, year)
            except KeyError:
                # hit this with mojibake and UTF-8 in 'Spring in a
                # Small Town (小城之春)'),
                # http://publicdomainmovies.net/movie/spring-in-a-small-town-%E5%B0%8F%E5%9F%8E%E4%B9%8B%E6%98%A5
                imdburl = None
            if imdburl:
                e = imdburl
                # year may be None here; '%d' would then raise TypeError.
                if year is not None:
                    info['imdblookup'] = '%s %d' % (title, year)
                else:
                    info['imdblookup'] = title
        if not e:
            e = entryurl
        if not e:
            # Neither an IMDB link nor a movie page link was found, so
            # there is nothing to key the entry on.
            print("No usable link found for '%s' on %s" % (title, summaryurl))
            continue
        if e in l:
            # Already known: remember this page URL in the first free
            # slot, unless it is recorded already.
            for f in ['freenessurl', 'freenessurl2',
                      'freenessurl3', 'freenessurl4']:
                if f in l[e] and l[e][f] == entryurl:
                    break
                if f not in l[e]:
                    l[e][f] = entryurl
                    break
        else:
            l[e] = info
        if year:
            l[e]['year'] = year
        #print e, l[e]
    return count
def main():
    """Scrape every category listing and save the collected movies.

    Each category is walked page by page, starting at page 1 and
    stopping after the last page reported by get_last_page() or at the
    first page without any rows.  The collected dict is handed to
    movielib.savelist() after every page and once more at the end
    (presumably written as JSON to the given path - confirm in
    movielib).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--imdblookup', action='store_true', default=False,
                        help='also find title IDs by searching for title/year in IMDB')
    args = parser.parse_args()

    # Not using the front page (http://publicdomainmovie.net/?page=%d),
    # as its ordering does not seem to be stable - leading to an
    # unpredictable subset of movies being extracted.
    #http://publicdomainmovies.net/movie/million-dollar-weekend-0
    urlbases = [
        'http://publicdomainmovie.net/cartoons?page=%d',
        'http://publicdomainmovie.net/comedy_movies?page=%d',
        'http://publicdomainmovie.net/drama_and_romance?page=%d',
        'http://publicdomainmovie.net/feature_movies?page=%d',
        'http://publicdomainmovie.net/science_fiction_and_horror?page=%d',
    ]
    path = 'free-movies-publicdomainmovies-net.json'

    l = {}
    for urlbase in urlbases:
        page = 1
        lastpage = get_last_page(urlbase)
        if lastpage is None:
            # get_last_page() returns None when the last page is not
            # known; formatting None with '%d' would raise TypeError,
            # so fall back to checking the first page only.
            print("Last page unknown for %s, checking first page only"
                  % (urlbase % page))
            lastpage = page
        print("Checking pages %d to %d" % (page, lastpage))
        while True:
            count = get_movie_list(args, l, urlbase, page=page)
            # get_movie_list() returns "n/a" on an HTTP error.  Such a
            # page is skipped and the walk continues with the next one;
            # only a page without rows ends the walk early.
            if count != "n/a" and count <= 0:
                break
            page = page + 1
            movielib.savelist(l, path)
            if page > lastpage:
                break
    movielib.savelist(l, path)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()