# mklist-publicdomainreview

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Extract IMDB IDs for films listed on the Public Domain Review,
using the wikidata IMDB->archive.org map to look up IMDB IDs.
"""

import argparse
import json
import lxml.html
import movielib
import re
import urllib2
import urlparse

def get_movie_iaurl(entryurl):
    """Return the archive.org details URL linked from an entry page.

    Fetches entryurl, parses it as HTML and scans the anchors found
    inside table cells ("td a") for the first href that contains
    "archive.org/details/".  The returned URL has "http://" replaced
    with "https://".

    Returns None when the fetch raises urllib2.HTTPError, when the
    page yields no root element, or when no matching link exists.
    """
    try:
        root = lxml.html.fromstring(movielib.http_get_read(entryurl))
    except urllib2.HTTPError:
        return None
    if root is None:
        return None
    for a in root.cssselect("td a"):
        # Anchors without an href (e.g. named anchors) must not abort
        # the scan with a KeyError; just skip them.
        href = a.attrib.get('href')
        if href and "archive.org/details/" in href:
            return href.replace("http://", "https://")
    return None

def map2imdb(iaurl):
    """Return the wplist key whose 'freenessurl' equals iaurl.

    wplist is the module-level mapping of IMDB keys to info
    dictionaries.  The first entry whose 'freenessurl' value equals
    iaurl wins; its key is printed and returned.

    Returns None when iaurl is empty/None or when no entry matches.
    """
    global wplist
    if not iaurl:
        # Nothing to look up; also avoids "matching" an entry whose
        # freenessurl happens to be None.
        return None
    for wpimdb, entry in wplist.items():
        # .get() tolerates entries that carry no 'freenessurl' key.
        if entry.get('freenessurl') == iaurl:
            print("found %s in archive.org: %s" % (iaurl, wpimdb))
            return wpimdb
    return None

def _split_title_year(titleyear):
    """Split a heading like "Some Film (1928)" into (title, year).

    Surrounding whitespace is ignored.  When the text ends in a
    parenthesised number, returns the stripped title and the number as
    an int; otherwise returns the stripped text and None.
    """
    titleyear = titleyear.strip()
    m = re.search(r'^(.+)\s*\((\d+)\)$', titleyear)
    if m:
        return m.group(1).strip(), int(m.group(2))
    return titleyear, None


def get_movie_list(l, args, urlbase, page=1):
    """Record the films listed on one summary page into l.

    The summary URL is built as ``urlbase % page``.  For every
    "article.medium-film" element the title/year heading and the entry
    link are extracted, the entry page is searched for an archive.org
    link, and that link is mapped to an IMDB key via map2imdb().  If no
    key is found and args.imdblookup is set, movielib.imdb_find_one()
    is tried with the title and year.  The info dictionary is stored in
    l under the IMDB key when one was found, otherwise under the entry
    URL.

    Parameters:
        l        -- dictionary that receives the entries (mutated).
        args     -- parsed arguments; only args.imdblookup is read.
        urlbase  -- format string taking the page number.
        page     -- page number to fetch (default 1).

    Returns the number of film articles recorded from this page, or 0
    when the page fetch raises urllib2.HTTPError or yields no root.
    """
    summaryurl = urlbase % page
    print(summaryurl)
    try:
        root = lxml.html.fromstring(movielib.http_get_read(summaryurl))
    except urllib2.HTTPError:
        return 0
    if root is None:
        return 0
    count = 0
    for div in root.cssselect("article.medium-film"):
        links = div.cssselect("h2.entry-title a")
        if not links or 'href' not in links[0].attrib:
            # An article without a usable title link cannot be recorded;
            # skip it instead of aborting the whole crawl.
            continue
        link = links[0]
        title, year = _split_title_year(link.text_content())
        entryurl = urlparse.urljoin(summaryurl, link.attrib['href'])
        info = {
            'status' : 'free',
            'freenessurl' : entryurl,
            'title' : title,
        }
        iaurl = get_movie_iaurl(entryurl)
        # Only consult the map when the entry page had an archive.org link.
        imdburl = map2imdb(iaurl) if iaurl else None
        if imdburl:
            key = imdburl
        else:
            key = entryurl
            print("Unable to find IMDB id for %s" % entryurl)
            if args.imdblookup:
                try:
                    imdburl = movielib.imdb_find_one(title, year)
                except KeyError:
                    # Ignore titles with non-ascii characters, like
                    # http://publicdomainreview.org/collections/growing-things-a-film-lesson-in-nature-study-1928/
                    imdburl = None
                if imdburl:
                    key = imdburl
                    # '%d' cannot format None, so headings without a
                    # year record the title alone.
                    if year is None:
                        info['imdblookup'] = title
                    else:
                        info['imdblookup'] = '%s %d' % (title, year)
        if iaurl:
            info['archive'] = iaurl
        if year:
            info['year'] = year
        l[key] = info
        count = count + 1
    return count

def loadlist():
    """Load the wikidata IMDB->archive.org map from its JSON file.

    Reads 'free-movies-archive-org-wikidata.json' from the current
    directory and returns the decoded JSON value.  Returns an empty
    dictionary when an IOError occurs (for example, the file is
    missing).
    """
    source = 'free-movies-archive-org-wikidata.json'
    try:
        with open(source, 'rt') as handle:
            mapping = json.load(handle)
    except IOError:
        # No readable map file: carry on with an empty map.
        return {}
    return mapping

def main():
    """Crawl the film collection pages and save the resulting list.

    Parses the command line (--imdblookup), loads the wikidata map into
    the module-level wplist, then calls get_movie_list() for page 1, 2,
    ... until a page reports no entries.  The collected dictionary is
    written with movielib.savelist() to
    'free-movies-publicdomainreview.json'.
    """
    global wplist

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--imdblookup',
        action='store_true',
        default=False,
        help='also find title IDs by searching for title/year in IMDB',
    )
    args = parser.parse_args()

    wplist = loadlist()

    urlbase = 'http://publicdomainreview.org/collections/page/%d/?medium=film'
    movies = {}
    page = 1
    while True:
        found = get_movie_list(movies, args, urlbase=urlbase, page=page)
        if found <= 0:
            # A page without entries marks the end of the collection.
            break
        page += 1
    movielib.savelist(movies, 'free-movies-publicdomainreview.json')

# Run the crawl only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()