You are not logged in.

#1 2013-10-25 14:44:56

ben-arch
Member
Registered: 2005-12-17
Posts: 115

Griffith Movie Collection Manager and IMDB - partial fix. IMPROVED 30

Griffith seems to be no longer maintained but I've yet to find a proper replacement. So, to fix the IMDB plugin problems it seems I have only myself to count on.  Today, I've looked at the plugin and fixed, or partially fixed the following problems :

- Director : fixed
- Screenplay : fixed
- Cameramen : fixed (with a small problem.  You'll see)
- plot : fixed
- cast : fixed (with big whitespace problem) 30.10.2013 fixed with just one spare space at start of line

I will continue to look and improve and try to solve problems.

The complete code of PluginMovieIMDB.py follows.  Where I've made changes, the original code line is commented and begins with "#benel 25.10.2013" (or any other chabge date).  Changes were made to the line right after it.

(This message has also been posted in the old Griffith forum)

# -*- coding: UTF-8 -*-

__revision__ = '$Id$'

# Copyright (c) 2005-2013 Vasco Nunes, Piotr Ożarowski
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

# You may use and distribute this software under the terms of the
# GNU General Public License, version 2 or later

import gutils, movie
import string, re

plugin_name         = 'IMDb'
plugin_description  = 'Internet Movie Database'
plugin_url          = 'www.imdb.com'
plugin_language     = _('English')
plugin_author       = 'Vasco Nunes, Piotr Ożarowski'
plugin_author_email = 'griffith@griffith.cc'
plugin_version      = '1.14'

class Plugin(movie.Movie):
    def __init__(self, id):
        self.encode   = 'utf-8'
        self.movie_id = id
        self.url      = "http://imdb.com/title/tt%s" % self.movie_id

    def initialize(self):
        self.cast_page = self.open_page(url=self.url + '/fullcredits')
        self.plot_page = self.open_page(url=self.url + '/plotsummary')
        self.comp_page = self.open_page(url=self.url + '/companycredits')
        self.tagl_page = self.open_page(url=self.url + '/taglines')

    def get_image(self):
        tmp = gutils.trim(self.page, 'id="img_primary"', '</a>')
        self.image_url = gutils.trim(tmp, 'src="', '"')

    def get_o_title(self):
        self.o_title = gutils.regextrim(self.page, 'class="title-extra"[^>]*>', '<')
        if not self.o_title:
            self.o_title = gutils.regextrim(self.page, '<h1>', '([ ]|[&][#][0-9]+[;])<span')
        if not self.o_title:
            self.o_title = re.sub(' [(].*', '', gutils.trim(self.page, '<title>', '</title>'))

    def get_title(self):    # same as get_o_title()
        self.title = gutils.regextrim(self.page, '<h1>', '([ ]|[&][#][0-9]+[;])<span')
        if not self.title:
            self.title = re.sub(' [(].*', '', gutils.trim(self.page, '<title>', '</title>'))

    def get_director(self):
        self.director = ''
#benel 25.10.2013       parts = re.split('<a href=', gutils.trim(self.cast_page, '>Directed by<', '</table>'))
        parts = re.split('=ttfc_fc_dr."', gutils.trim(self.cast_page, '>Directed by&', '</table>'))
        if len(parts) > 1:
            for part in parts[1:]:
                director = gutils.trim(part, '>', '<')
#benel 25.10.2013                self.director = self.director + director + ', '
                self.director = self.director + director[:-1] + ', '
            self.director = self.director[0:len(self.director) - 2]

    def get_plot(self):
#benel 25.10.2013        self.plot = gutils.regextrim(self.page, '<h5>Plot:</h5>', '(</div>|<a href.*)')
        self.plot = gutils.regextrim(self.page, '<h1 class="header">Plot Summary</h1>', '(<div class="odd" id="plotSynopsis">|<a href.*)')
        self.plot = self.__before_more(self.plot)
#benel 25.10.2013        elements = string.split(self.plot_page, '<p class="plotpar">')
        elements = string.split(self.plot_page, '<p class="plotSummary">')
        if len(elements) > 1:
            self.plot = self.plot + '\n\n'
            elements[0] = ''
            for element in elements:
                if element <> '':
                    self.plot = self.plot + gutils.strip_tags(gutils.before(element, '</a>')) + '\n\n'

    def get_year(self):
        self.year = gutils.trim(self.page, '<a href="/year/', '</a>')
        self.year = gutils.after(self.year, '>')

    def get_runtime(self):
        self.runtime = gutils.regextrim(self.page, 'Runtime:<[^>]+>', ' min')

    def get_genre(self):
        self.genre = string.replace(gutils.regextrim(self.page, 'Genre[s]*:<[^>]+>', '</div>'), '\n', '')
        self.genre = self.__before_more(self.genre)

    def get_cast(self):
        self.cast = ''
#benel 25.10.2013        self.cast = gutils.trim(self.cast_page, '<table class="cast">', '</table>')
        self.cast = gutils.trim(self.cast_page, '<table class="cast_list">', '</table>')
        if self.cast == '':
#benel 25.10.2013            self.cast = gutils.trim(self.page, '<table class="cast">', '</table>')
            self.cast = gutils.trim(self.page, '<table class="cast_list">', '</table>')
        self.cast = string.replace(self.cast, ' ... ', _(' as '))
        self.cast = string.replace(self.cast, '...', _(' as '))
        self.cast = string.replace(self.cast, '</tr><tr>', "\n")
#benel 30.10.2013  V
        self.cast = string.replace(self.cast, '  ', "")
        self.cast = string.replace(self.cast, '\n', "")
#benel 30.10.2013  ^
        self.cast = re.sub('</tr>[ \t]*<tr[ \t]*class="even">', "\n", self.cast)
        self.cast = re.sub('</tr>[ \t]*<tr[ \t]*class="odd">', "\n", self.cast)
        self.cast = self.__before_more(self.cast)

    def get_classification(self):
        self.classification = gutils.trim(self.page, '(<a href="/mpaa">MPAA</a>)', '</div>')
        self.classification = gutils.trim(self.classification, 'Rated ', ' ')

    def get_studio(self):
        self.studio = ''
        tmp = gutils.regextrim(self.comp_page, 'Production Companies<[^>]+', '</ul>')
        tmp = string.split(tmp, 'href="')
        for entry in tmp:
            entry = gutils.trim(entry, '>', '<')
            if entry:
                self.studio = self.studio + entry + ', '
        if self.studio:
            self.studio = self.studio[:-2]

    def get_o_site(self):
        self.o_site = ''

    def get_site(self):
        self.site = "http://www.imdb.com/title/tt%s" % self.movie_id

    def get_trailer(self):
        self.trailer = "http://www.imdb.com/title/tt%s/trailers" % self.movie_id

    def get_country(self):
        self.country = '<' + gutils.trim(self.page, 'Country:<', '</div>')
        self.country = re.sub('[ ]+', ' ', re.sub('[\n]+', '', self.country))

    def get_rating(self):
        pattern = re.compile('>([0-9]([.][0-9])*)(<[^>]+>)+[/](<[^>]+>)[0-9][0-9]<')
        result = pattern.search(self.page)
        if result:
            self.rating = result.groups()[0]
            if self.rating:
                try:
                    self.rating = round(float(self.rating), 0)
                except Exception, e:
                    self.rating = 0
        else:
            self.rating = 0

    def get_notes(self):
        self.notes = ''
        language = gutils.regextrim(self.page, 'Language:<[^>]+>', '</div>')
        language = gutils.strip_tags(language)
        language = re.sub('[\n]+', '', language)
        language = re.sub('[ ]+', ' ', language)
        language = language.strip()
        color = gutils.regextrim(self.page, 'Color:<[^>]+>', '</div>')
        color = gutils.strip_tags(color)
        color = re.sub('[\n]+', '', color)
        color = re.sub('[ ]+', ' ', color)
        color = color.strip()
        sound = gutils.regextrim(self.page, 'Sound Mix:<[^>]+>', '</div>')
        sound = gutils.strip_tags(sound)
        sound = re.sub('[\n]+', '', sound)
        sound = re.sub('[ ]+', ' ', sound)
        sound = sound.strip()
        tagline = gutils.regextrim(self.tagl_page, 'Taglines for', 'Related Links')
        index = string.rfind(tagline, '</div>')
        if index > -1:
            taglines = string.split(tagline[index:], '<hr')
            tagline = ''
            for entry in taglines:
                entry = gutils.clean(gutils.after(entry, '>'))
                if entry:
                    tagline = tagline + entry + '\n'
        else:
            tagline = ''
        if len(language)>0:
            self.notes = "%s: %s\n" %(_('Language'), language)
        if len(sound)>0:
            self.notes += "%s: %s\n" %(gutils.strip_tags(_('<b>Audio</b>')), sound)
        if len(color)>0:
            self.notes += "%s: %s\n" %(_('Color'), color)
        if len(tagline)>0:
            self.notes += "%s: %s\n" %('Tagline', tagline)

    def get_screenplay(self):
        self.screenplay = ''
#benel 25.10.2013        parts = re.split('<a href=', gutils.trim(self.cast_page, '>Writing credits<', '</table>'))
        parts = re.split('=ttfc_fc_wr."', gutils.trim(self.cast_page, '>Writing Credits', '</table>'))
        if len(parts) > 1:
            for part in parts[1:]:
                screenplay = gutils.trim(part, '>', '<')
                if screenplay == 'WGA':
                    continue
                screenplay = screenplay.replace(' (written by)', '')
                screenplay = screenplay.replace(' and<', '<')
#benel 25.10.2013                self.screenplay = self.screenplay + screenplay + ', '
                self.screenplay = self.screenplay + screenplay[:-1] + ', '
            if len(self.screenplay) > 2:
                self.screenplay = self.screenplay[0:len(self.screenplay) - 2]

    def get_cameraman(self):
        self.cameraman = ''
#benel 25.10.2013        tmp = gutils.regextrim(self.cast_page, 'Cinematography by&nbsp;<[^>]+', '</table>')
        tmp = gutils.regextrim(self.cast_page, 'Cinematography by', '</table>')
#benel 25.10.2013        tmp = string.split(tmp, 'href="')
        tmp = string.split(tmp, 'ttfc_fc_cr')
        for entry in tmp:
            entry = gutils.trim(entry, '>', '<')
            if entry:
#benel 25.10.2013                self.cameraman = self.cameraman + entry + ', '
                self.cameraman = self.cameraman[:-1] + entry + ', '
        if self.cameraman:
            self.cameraman = self.cameraman[:-2]

    def __before_more(self, data):
        for element in ['>See more<', '>more<', '>Full summary<', '>Full synopsis<']:
            tmp = string.find(data, element)
            if tmp>0:
                data = data[:tmp] + '>'
        return data

class SearchPlugin(movie.SearchMovie):
    PATTERN = re.compile(r"""<a href=['"]/title/tt([0-9]+)/[^>]+[>](.*?)</td>""")
    PATTERN_DIRECT = re.compile(r"""value="/title/tt([0-9]+)""")

    def __init__(self):
        # http://www.imdb.com/List?words=
        # finds every title sorted alphabetically, first results are with a quote at
        # the beginning (episodes from tv series), no popular results at first
        # http://www.imdb.com/find?more=tt;q=
        # finds a whole bunch of results. if you look for "Rocky" you will get 903 results.
        # http://www.imdb.com/find?s=tt;q=
        # seems to give the best results. 88 results for "Rocky", popular titles first.
        self.original_url_search   = 'http://www.imdb.com/find?s=tt&q='
        self.translated_url_search = 'http://www.imdb.com/find?s=tt&q='
        self.encode                = 'utf8'

    def search(self,parent_window):
        if not self.open_search(parent_window):
            return None
        return self.page

    def get_searches(self):
        elements = string.split(self.page, '<tr')
        if len(elements):
            for element in elements[1:]:
                match = self.PATTERN.findall(element)
                if len(match) > 1:
                    tmp = re.sub('^[0-9]+[.]', '', gutils.clean(match[1][1]))
                    self.ids.append(match[1][0])
                    self.titles.append(tmp)
        if len(self.ids) < 2:
            # try to find a direct result
            match = self.PATTERN_DIRECT.findall(self.page)
            if len(match) > 0:
                self.ids.append(match[0])


#
# Plugin Test
#
class SearchPluginTest(SearchPlugin):
    #
    # Configuration for automated tests:
    # dict { movie_id -> [ expected result count for original url, expected result count for translated url ] }
    #
    test_configuration = {
        'Rocky Balboa'         : [ 10, 10 ],
        'Ein glückliches Jahr' : [ 3, 3 ]
    }

class PluginTest:
    #
    # Configuration for automated tests:
    # dict { movie_id -> dict { arribute -> value } }
    #
    # value: * True/False if attribute only should be tested for any value
    #        * or the expected value
    #
    test_configuration = {
        '0138097' : {
            'title'             : 'Shakespeare in Love',
            'o_title'           : 'Shakespeare in Love',
            'director'          : 'John Madden',
            'plot'              : True,
            'cast'              : 'Geoffrey Rush' + _(' as ') + 'Philip Henslowe\n\
Tom Wilkinson' + _(' as ') + 'Hugh Fennyman\n\
Steven O\'Donnell' + _(' as ') + 'Lambert\n\
Tim McMullan' + _(' as ') + 'Frees (as Tim McMullen)\n\
Joseph Fiennes' + _(' as ') + 'Will Shakespeare\n\
Steven Beard' + _(' as ') + 'Makepeace - the Preacher\n\
Antony Sher' + _(' as ') + 'Dr. Moth\n\
Patrick Barlow' + _(' as ') + 'Will Kempe\n\
Martin Clunes' + _(' as ') + 'Richard Burbage\n\
Sandra Reinton' + _(' as ') + 'Rosaline\n\
Simon Callow' + _(' as ') + 'Tilney - Master of the Revels\n\
Judi Dench' + _(' as ') + 'Queen Elizabeth\n\
Bridget McConnell' + _(' as ') + 'Lady in Waiting (as Bridget McConnel)\n\
Georgie Glen' + _(' as ') + 'Lady in Waiting\n\
Nicholas Boulton' + _(' as ') + 'Henry Condell\n\
Gwyneth Paltrow' + _(' as ') + 'Viola De Lesseps\n\
Imelda Staunton' + _(' as ') + 'Nurse\n\
Colin Firth' + _(' as ') + 'Lord Wessex\n\
Desmond McNamara' + _(' as ') + 'Crier\n\
Barnaby Kay' + _(' as ') + 'Nol\n\
Jim Carter' + _(' as ') + 'Ralph Bashford\n\
Paul Bigley' + _(' as ') + 'Peter - the Stage Manager\n\
Jason Round' + _(' as ') + 'Actor in Tavern\n\
Rupert Farley' + _(' as ') + 'Barman\n\
Adam Barker' + _(' as ') + 'First Auditionee\n\
Joe Roberts' + _(' as ') + 'John Webster\n\
Harry Gostelow' + _(' as ') + 'Second Auditionee\n\
Alan Cody' + _(' as ') + 'Third Auditionee\n\
Mark Williams' + _(' as ') + 'Wabash\n\
David Curtiz' + _(' as ') + 'John Hemmings\n\
Gregor Truter' + _(' as ') + 'James Hemmings\n\
Simon Day' + _(' as ') + 'First Boatman\n\
Jill Baker' + _(' as ') + 'Lady De Lesseps\n\
Amber Glossop' + _(' as ') + 'Scullery Maid\n\
Robin Davies' + _(' as ') + 'Master Plum\n\
Hywel Simons' + _(' as ') + 'Servant\n\
Nicholas Le Prevost' + _(' as ') + 'Sir Robert De Lesseps\n\
Ben Affleck' + _(' as ') + 'Ned Alleyn\n\
Timothy Kightley' + _(' as ') + 'Edward Pope\n\
Mark Saban' + _(' as ') + 'Augustine Philips\n\
Bob Barrett' + _(' as ') + 'George Bryan\n\
Roger Morlidge' + _(' as ') + 'James Armitage\n\
Daniel Brocklebank' + _(' as ') + 'Sam Gosse\n\
Roger Frost' + _(' as ') + 'Second Boatman\n\
Rebecca Charles' + _(' as ') + 'Chambermaid\n\
Richard Gold' + _(' as ') + 'Lord in Waiting\n\
Rachel Clarke' + _(' as ') + 'First Whore\n\
Lucy Speed' + _(' as ') + 'Second Whore\n\
Patricia Potter' + _(' as ') + 'Third Whore\n\
John Ramm' + _(' as ') + 'Makepeace\'s Neighbor\n\
Martin Neely' + _(' as ') + 'Paris / Lady Montague (as Martin Neeley)\n\
The Choir of St. George\'s School in Windsor' + _(' as ') + 'Choir (as The Choir of St. George\'s School Windsor) rest of cast listed alphabetically:\n\
Jason Canning' + _(' as ') + 'Nobleman (uncredited)\n\
Kelley Costigan' + _(' as ') + 'Theatregoer (uncredited)\n\
Rupert Everett' + _(' as ') + 'Christopher Marlowe (uncredited)\n\
John Inman' + _(' as ') + 'Character Player (uncredited)',
            'country'           : 'USA',
            'genre'             : 'Comedy | Drama | Romance',
            'classification'    : False,
            'studio'            : 'Universal Pictures, Miramax Films, Bedford Falls Productions',
            'o_site'            : False,
            'site'              : 'http://www.imdb.com/title/tt0138097',
            'trailer'           : 'http://www.imdb.com/title/tt0138097/trailers',
            'year'              : 1998,
            'notes'             : _('Language') + ': English\n'\
+ _('Audio') + ': Dolby Digital\n'\
+ _('Color') + ': Color\n\
Tagline: ...A Comedy About the Greatest Love Story Almost Never Told...\n\
Love is the only inspiration',
            'runtime'           : 123,
            'image'             : True,
            'rating'            : 7,
            'screenplay'        : 'Marc Norman, Tom Stoppard',
            'cameraman'         : 'Richard Greatrex',
            'barcode'           : False
        },
    }

Hope this helps someone ...

Ben

Last edited by ben-arch (2013-10-30 15:38:11)

Offline

#2 2013-10-26 08:18:01

ConnorBehan
Package Maintainer (PM)
From: Long Island NY
Registered: 2007-07-05
Posts: 1,359
Website

Re: Griffith Movie Collection Manager and IMDB - partial fix. IMPROVED 30

The beauty of open source. Maybe ask the maintainers of griffith AUR packages to apply your patch.


6EA3 F3F3 B908 2632 A9CB E931 D53A 0445 B47A 0DAB
Great things come in tar.xz packages.

Offline

Board footer

Powered by FluxBB