mklist-archive-org-search

#!/usr/bin/env python

"""
Search for movies in the Internet Archive, and output the list with
IMDB title IDs similar to the other list of free movies.  Look for
IMDB title references in the description and other relevant tags.

It is based on how the Butter plugin searches for movies, but extended
to get more hits.

If the search result does not return an IMDB ID, see if the wikidata set
or the manually created data set has such an ID and use that one.
"""

import copy
import json
import movielib
import re
import time
import urllib
import urllib2

def fetch(page=1, require_year=False):
    """
    Fetch one page of archive.org advanced search results for movies.

    Based on the search URL found in
    https://github.com/butterproviders/butter-provider-archive
    The search result is less than 1000 entries 2017-11-05.

    Removing the 'year' requirement results in several thousand entries more.

    Arguments:
      page         -- result page number to request (defaults to 1).
      require_year -- when true, append " AND year" to the query, as the
                      original Butter search did.

    Return the decoded JSON response, or None if the server answered
    with an HTTP error (the error is printed).
    """
    print("Fetching page %d" % page)
    # Search term:
    term = """
(collection:moviesandfilms
OR collection:animationandcartoons
OR collection:classic_tv
OR collection:classic_cartoons)
AND NOT collection:movie_trailers
AND NOT collection:sabucat_trailers
AND NOT collection:stock_footage
AND NOT collection:home_movies
AND NOT collection:prelinger_mashups
AND NOT collection:brick_films
AND -mediatype:collection
AND format:"Archive BitTorrent"
"""
    # Limiting to entries with year was part of the original Butter
    # search, but reduces around 17000 entries to around 500, so it is
    # off by default.
    if require_year:
        term = term + " AND year"

    url = 'https://archive.org/advancedsearch.php?sort%%5B%%5D=&sort%%5B%%5D=&sort%%5B%%5D=&output=json&rows=100000&page=%d' % page
    url = url + '&q=' + urllib.quote_plus(term)
    request = urllib2.Request(url, headers={"Accept": "application/json"})
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        # Use string formatting: print("Error:", ...) would print a
        # tuple when print is a statement (Python 2).
        print("Error: %s" % e)
        return None
    try:
        jsondata = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()
    return json.loads(jsondata)

# Matches http(s) links to an IMDB title page; group 1 is the title ID.
# The ID is restricted to "tt" followed by digits so that trailing
# punctuation, markup or newlines are not swallowed into the ID.
_IMDB_TITLE_RE = re.compile(r'https?://[w.]*imdb\.com/title/(tt\d+)')


def locate_imdb_refs(text):
    """
    Return the IMDB title URLs referenced in text.

    text is either a string or a list of strings (which are joined with
    spaces before searching).  Every match is returned in the
    normalized form 'http://www.imdb.com/title/<id>/', in the order
    found.  Duplicates are kept.  An empty list is returned when no
    reference is found.
    """
    if isinstance(text, list):
        text = " ".join(text)
    return ['http://www.imdb.com/title/%s/' % title_id
            for title_id in _IMDB_TITLE_RE.findall(text)]

def loadlist(l, path):
    """
    Merge the movie entries stored in the JSON file path into the dict l.

    The file is expected to hold a JSON object mapping an ID to an
    entry (a JSON object).  In every loaded entry, the individual URL
    fields ('archive', 'archive1'..'archive9', 'freenessurl',
    'freenessurl1'..'freenessurl14') are removed and collected, without
    duplicates, into a single 'freenessurls' list.

    IDs not yet in l are added as is.  For IDs already in l only the new
    URLs are appended to the existing 'freenessurls' list; the other
    fields of the existing entry are kept.

    Return {} if the file can not be opened or read, otherwise None.
    """
    url_fields = (['archive'] + ['archive%d' % i for i in range(1, 10)]
                  + ['freenessurl']
                  + ['freenessurl%d' % i for i in range(1, 15)])
    try:
        with open(path, 'rt') as infile:
            loaded = json.load(infile)
    except IOError:
        # A missing or unreadable data set is not fatal; just skip it.
        return {}
    for movie_id, entry in loaded.items():
        urls = []
        for field in url_fields:
            if field in entry:
                # Always drop the field, also when its URL duplicates an
                # earlier one, so no stray per-URL fields are left behind.
                url = entry.pop(field)
                if url not in urls:
                    urls.append(url)
        entry['freenessurls'] = urls
        if movie_id not in l:
            l[movie_id] = entry
        else:
            known = l[movie_id].setdefault('freenessurls', [])
            for url in urls:
                if url not in known:
                    known.append(url)

def _index_by_freenessurl(known):
    """Map every freeness URL found in known to the list of IDs using it."""
    index = {}
    for movie_id, entry in known.items():
        for url in entry.get('freenessurls', []):
            index.setdefault(url, []).append(movie_id)
    return index


def _merge_entry(existing, new, freenessurl):
    """Fold the entry new (found at freenessurl) into existing, in place."""
    # Make stored year predictable, select oldest one.
    if 'year' in new:
        if 'year' not in existing or existing['year'] > new['year']:
            existing['year'] = new['year']
    if freenessurl not in existing['freenessurls']:
        existing['freenessurls'].append(freenessurl)
    # Make stored title predictable, select one using sorting order.
    if new['title'] < existing['title']:
        existing['title'] = new['title']


def _flatten_freenessurls(entry):
    """Replace the 'freenessurls' list with freenessurl, freenessurl1, ..."""
    for seq, url in enumerate(sorted(entry['freenessurls'])):
        field = 'freenessurl' if 0 == seq else "freenessurl%d" % seq
        entry[field] = url
    del entry['freenessurls']


def main():
    """
    Build free-movies-archive-org-search.json from archive.org searches.

    Every search hit is stored under each IMDB title URL referenced in
    its description or stripped_tags.  Hits without such a reference
    use the IDs of the wikidata/manual data set entries listing the
    same archive.org URL, and as a last resort the archive.org URL
    itself.  Hits sharing an ID are merged into one entry.
    """
    wplist = {}
    loadlist(wplist, 'free-movies-archive-org-wikidata.json')
    loadlist(wplist, 'free-movies-manual.json')
    print("C %d" % len(wplist))
    # Build the URL lookup once instead of scanning all of wplist for
    # every search hit lacking an IMDB reference.
    known_by_url = _index_by_freenessurl(wplist)

    outlist = {}
    page = 1
    while True:
        result = fetch(page)
        if not result or not result['response']['docs']:
            break
        for doc in result['response']['docs']:
            freenessurl = "https://archive.org/details/%s" % doc['identifier']
            ids = set()
            for field in ('description', 'stripped_tags'):
                if field in doc:
                    ids.update(locate_imdb_refs(doc[field]))
            # Pick the first title (in sorting order) if the result is a list
            title = doc['title']
            if isinstance(title, list):
                title = sorted(title)[0]
            fle = {
                'status': 'free',
                'freenessurls': [freenessurl],
                'title': title,
            }
            if 'year' in doc:
                fle['year'] = doc['year']
            # Check if the archive.org ID already has a known IMDB title ID
            if not ids:
                ids.update(known_by_url.get(freenessurl, ()))
            # If not, use archive.org ID as unique ID
            if not ids:
                ids.add(freenessurl)
            for i in ids:
                if i not in outlist:
                    outlist[i] = copy.deepcopy(fle)
                else:
                    _merge_entry(outlist[i], fle, freenessurl)
        page = page + 1
        # Pause between requests to the search service.
        time.sleep(3)
    # Rewrite freenessurls array to individual fields (freenessurl,
    # freenessurl1, freenessurl2...) in predictable/sorted order
    for entry in outlist.values():
        _flatten_freenessurls(entry)
    movielib.savelist(outlist, 'free-movies-archive-org-search.json')

    print("Wrote %d" % len(outlist))

# Only run the search when executed as a script, not when imported.
if __name__ == '__main__':
    main()