User:Bthfan/bot code.py

From Wikidata
Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Usage:

python scripts_harvest_template.py -template:"Infobox basketball biography" -lang:en -family:wikipedia team1 P54

Requires the patch from https://gerrit.wikimedia.org/r/#/c/125575/ to work properly.
Note that the patch has a bug, so this bot code currently does not work at all.
"""
#
# (C) Multichill, Amir, 2013
# (C) Pywikibot team, 2013-2014
#
# Distributed under the terms of MIT License.
#
__version__ = '$Id$'
#

import re
import pywikibot
from pywikibot import pagegenerators as pg, WikidataBot
import codecs
import sys
# Replace sys.stdout with a writer that encodes everything written to it as
# UTF-8.
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
import collections

# Maps the '&params;' placeholder to the page generators' parameter help text.
# NOTE(review): nothing in this file reads docuReplacements; presumably
# pywikibot's help output consumes it - confirm.
docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}


class HarvestRobot(WikidataBot):
    """
    A bot to add Wikidata claims
    """
    def __init__(self, generator, templateTitle, fields):
        """
        Set up the bot for one template.

        Arguments:
            * generator     - A generator that yields Page objects; it is
                              wrapped in a PreloadingGenerator.
            * templateTitle - The template to work on; underscores are
                              replaced by spaces.
            * fields        - A dictionary of fields that are of use to us

        """
        # TODO: Make it a list which also includes the redirects to the template
        normalized_title = templateTitle.replace(u'_', u' ')
        self.templateTitle = normalized_title
        self.fields = fields
        self.generator = pg.PreloadingGenerator(generator)
        self.repo = pywikibot.Site().data_repository()
        # cacheSources() is not defined in this class; presumably inherited
        # from WikidataBot. It is called last, once self.repo is set.
        self.cacheSources()

    def run(self):
        """
        Run the bot over every page of the generator.

        The template synonyms are looked up once before the loop. An
        exception raised while processing a page is logged with its
        traceback and the loop moves on to the next page.
        """
        synonyms = self.getTemplateSynonyms(self.templateTitle)
        self.templateTitles = synonyms
        for page in self.generator:
            try:
                self.processPage(page)
            except Exception:
                pywikibot.exception(tb=True)
                
    def listsEqual(self, list1, list2):
        """
        Return True if both lists hold the same items, ignoring order.

        Items are matched with ``==`` only, so they need to be neither
        hashable nor orderable (dictionaries work). Duplicates are
        respected: every item of list1 is paired with a distinct equal item
        of list2.

        Arguments:
            * list1 - first list
            * list2 - second list; it is not modified

        """
        if len(list1) != len(list2):
            return False

        # Match against a copy so the caller's list stays untouched. Sorting
        # is avoided on purpose: unorderable items (e.g. dicts on Python 3)
        # make sorted() raise, and identity-based ordering can put equal
        # items at different positions.
        unmatched = list(list2)
        for item in list1:
            try:
                unmatched.remove(item)
            except ValueError:
                # item has no (remaining) counterpart in list2
                return False

        return True
    
    def listsEqualQualifier(self, list1, list2):
        """
        Return True if two claim.qualifiers mappings are probably equal.

        Both arguments map a key (the qualifier property) to a list of
        objects that offer getTarget(). The mappings are considered equal
        when they have the same keys and, for every key, each target found
        on one side also occurs on the other side. Order is ignored.

        Arguments:
            * list1 - first qualifiers mapping
            * list2 - second qualifiers mapping

        """
        def targetsCovered(source, other):
            """Check that every target in source also occurs in other."""
            for prop, qualifiers in source.items():
                for qualifier in qualifiers:
                    target = qualifier.getTarget()
                    if not any(candidate.getTarget() == target
                               for candidate in other[prop]):
                        return False
            return True

        # Different sets of qualifier properties can never match. This also
        # guarantees that other[prop] above always refers to an existing key.
        if set(list1) != set(list2):
            return False

        # Check both directions so that extra targets on either side are
        # detected.
        return targetsCovered(list1, list2) and targetsCovered(list2, list1)

    def getTemplateSynonyms(self, title):
        """
        Fetch the redirects of the template, so we can check against them.

        Arguments:
            * title - title of the template (looked up in namespace 10)

        Returns a list of titles without namespace: every redirect to the
        template plus the template's own title. If title is itself a
        redirect, its target is used as the template.

        Terminates the program with exit status 1 if the template does not
        exist.
        """
        temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
        if not temp.exists():
            pywikibot.error(u'Template %s does not exist.' % temp.title())
            # sys.exit() instead of the site module's interactive exit()
            # helper; a non-zero status reports the failure to the caller.
            sys.exit(1)

        pywikibot.output('Finding redirects...')  # Put some output here since it can take a while
        if temp.isRedirectPage():
            temp = temp.getRedirectTarget()

        redirects = temp.getReferences(redirectsOnly=True, namespaces=[10],
                                       follow_redirects=False)
        titles = [page.title(withNamespace=False) for page in redirects]
        titles.append(temp.title(withNamespace=False))
        return titles

    def _template_link_target(self, item, link_text):
        """
        Resolve a link text to the Wikidata item of the linked page.

        Arguments:
            * item      - the item currently being processed
            * link_text - text of the link to resolve

        Returns the ItemPage of the linked page (after following a redirect).
        Returns None, after printing a message, if the page does not exist,
        has no Wikidata item, or its item has the same title as item.
        """
        target_page = pywikibot.Page(pywikibot.Link(link_text))

        if not target_page.exists():
            pywikibot.output(u'%s doesn\'t exist so it can\'t be linked. Skipping' % (target_page))
            return None

        if target_page.isRedirectPage():
            target_page = target_page.getRedirectTarget()

        target_item = pywikibot.ItemPage.fromPage(target_page)

        if not target_item.exists():
            pywikibot.output(u'%s doesn\'t have a wikidata item to link with. Skipping' % (target_page))
            return None

        links_to_itself = target_item.title() == item.title()
        if links_to_itself:
            pywikibot.output(u'%s links to itself. Skipping' % (target_page))
            return None

        return target_item

    def getSeasonsAndYears(self, yearstring):
        """
        Extract start/end years and NBA seasons from one "years" entry.

        Arguments:
            * yearstring - one entry of a template's "years" parameter,
                           e.g. "2002-2005", "2005" or "2010-present"

        Returns a tuple (year_before, year_after, nba_seasons, still_active):
            * year_before  - first year found, as a string
            * year_after   - last year found, as a string; equal to
                             year_before when only one year is given, None
                             for a "present"/"today" entry
            * nba_seasons  - list filled by addNBASeasons(); empty when the
                             entry uses no NBA-year template
            * still_active - match object for "present"/"today", else None

        (None, None, None, None) is returned for entries that have to be
        sorted out manually: no year at all, more than two years, a
        "present"/"today" entry with more or fewer than one year, or two
        years of which only one comes from an NBA-year template.
        """
        unparsable = (None, None, None, None)

        years = re.findall(r'\d{3,}', yearstring)
        # Years handed to an NBA-year template, e.g. "{{nbay|2002|start}}".
        # NOTE(review): assumes the year is the template's first positional
        # parameter - confirm against the templates' documentation.
        nba_years = [int(year) for year in re.findall(
            r'(?:NBA Year|nbay|Nbay)\s*\|\s*(\d{3,})', yearstring)]
        still_active = re.search('present|today', yearstring)
        nba_seasons = []

        if still_active is not None:
            if len(years) != 1:
                # "present"/"today" together with no or several years,
                # possibly a malformed string or something like
                # "2002, 2010-present [team name]".
                return unparsable
            # Looks like a "2010-today|present" string: open-ended stint.
            if len(nba_years) == 1:
                # HACK - hardcode last season here
                self.addNBASeasons(nba_years[0], 2013, nba_seasons)
            return (years[0], None, nba_seasons, still_active)

        if len(years) not in (1, 2):
            return unparsable

        # A single year is treated as start year == end year.
        year_before = years[0]
        year_after = years[-1]

        if len(nba_years) == 1:
            if len(years) == 2:
                # Could check which of the two years the NBA season belongs
                # to. For now, just fix this manually later.
                return unparsable
            # One year, one NBA template: exactly one season.
            self.addNBASeasons(nba_years[0], nba_years[0], nba_seasons)
        elif len(nba_years) == 2:
            # Two NBA entries found, this should be the standard case.
            self.addNBASeasons(nba_years[0], nba_years[1], nba_seasons)

        return (year_before, year_after, nba_seasons, None)

    def addQualifiers(self, claim, date_before, date_after, qualifier_before, qualifier_after):
        """
        Attach the date qualifiers to a claim.

        Arguments:
            * claim            - the claim that receives the qualifiers
            * date_before      - start date; nothing is added when it is None
            * date_after       - end date; None means the player is still
                                 playing at that club
            * qualifier_before - qualifier added whenever date_before is set
            * qualifier_after  - qualifier added only when both dates are set

        """
        if date_before is None:
            # Without a start date no qualifier is added at all.
            return

        claim.addQualifier(qualifier_before)
        if date_after is not None:
            claim.addQualifier(qualifier_after)

    def addNBASeasons(self, year_start, year_end, seasons):
        """
        Append the items of a range of NBA seasons to seasons.

        For every year from year_start to year_end (inclusive) the page
        "<year>-<next year, two digits> NBA season" (with an en dash) is
        looked up and its ItemPage is appended to seasons. 1999 is special:
        its page is titled "1999-2000 NBA season".

        Arguments:
            * year_start - first year, as an int or a numeric string
            * year_end   - last year (inclusive), int or numeric string
            * seasons    - list that receives one ItemPage per season

        """
        site = pywikibot.Site()
        for year in range(int(year_start), int(year_end) + 1):
            # Get NBA season item. \u2013 is the en dash used in the titles;
            # unicode literals avoid mixing byte strings with non-ASCII text.
            if year == 1999:
                # Special case when the year is 1999
                title = u"1999\u20132000 NBA season"
            else:
                title = u"%d\u2013%02d NBA season" % (year, (year + 1) % 100)

            # Season pages are articles, so look them up in the main
            # namespace (0) rather than the Template namespace (10).
            season_page = pywikibot.Page(site, title, ns=0)
            seasons.append(pywikibot.ItemPage.fromPage(season_page))

    def processPage(self, page):
        """
        Process a single page.

        Looks for the configured template (or one of its redirects) on the
        page. For every "years<N>"/"team<N>" parameter pair (N = 1..40) it
        resolves the team link to an item and, for each comma-separated
        entry of the years parameter, tries to add a P54 claim with P580 /
        P582 date qualifiers and a source, unless an existing P54 claim for
        that team already carries the same qualifiers.

        Arguments:
            * page - the Page to harvest

        NOTE(review): self.fields is not consulted here; the property P54
        and the parameter names "years<N>"/"team<N>" are hard-coded.
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
            #TODO FIXME: We should provide an option to create the page
            # Nothing can be added to a missing item, so stop here.
            return
        item.get()

        pagetext = page.get()
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Clean up template
            try:
                template = pywikibot.Page(page.site, template,
                                          ns=10).title(withNamespace=False)
            except pywikibot.exceptions.InvalidTitle:
                pywikibot.error(u"Failed parsing template; '%s' should be the template name." % template)
                continue
            if template not in self.templateTitles:
                continue

            # We found the template we were looking for
            for i in range(1, 41):
                years_key = "years" + str(i)
                team_key = "team" + str(i)
                if years_key not in fielddict or team_key not in fielddict:
                    # A years entry without its team cannot be linked.
                    continue
                value = fielddict[years_key].strip()
                # value = pywikibot.Site().expand_text(value)
                if not value:
                    continue

                # A comma in this field possibly means that we need to
                # create two or more claims here; empty parts are dropped.
                matches = [part.strip() for part in value.split(',')
                           if part.strip()]

                # This field contains something useful for us
                claim = pywikibot.Claim(self.repo, 'P54')
                # Try to extract a valid page; without a wiki link, try to
                # get the item by page title.
                team_text = fielddict[team_key]
                match = re.search(pywikibot.link_regex, team_text)
                link_text = match.group(1) if match else team_text

                linked_item = self._template_link_target(item, link_text)
                if not linked_item:
                    continue

                # NOTE(review): the same claim object is reused for every
                # entry of matches, so a second entry for the same team adds
                # its qualifiers to the claim of the first entry - confirm
                # whether one claim per entry was intended.
                for years_string in matches:
                    # For every year string we've found in the years* property
                    # do a new iteration.
                    # nba_seasons is returned but not used yet.
                    (year_before_raw, year_after_raw, nba_seasons, splityears2) = \
                        self.getSeasonsAndYears(years_string)

                    # Reset the per-entry state so nothing leaks over from
                    # the previous entry.
                    date_before = None
                    date_after = None
                    qualifier_before = None
                    qualifier_after = None
                    dont_add = False

                    if claim.getType() == 'wikibase-item':
                        if year_before_raw is not None:
                            print(year_before_raw)
                            qualifier_before = pywikibot.Claim(self.repo, u'P580')
                            date_before = pywikibot.WbTime(year=int(year_before_raw))
                            qualifier_before.setTarget(date_before)
                            if splityears2 is None and year_after_raw is not None:
                                # Player is no longer playing at that club
                                qualifier_after = pywikibot.Claim(self.repo, u'P582')
                                date_after = pywikibot.WbTime(year=int(year_after_raw))
                                qualifier_after.setTarget(date_after)

                        existing_claims = item.claims.setdefault('P54', [])
                        for j, existing_claim in enumerate(existing_claims):
                            if not existing_claim.getTarget() == linked_item:
                                continue

                            if len(existing_claim.qualifiers) > 0:
                                # Qualifiers might be the same, check for that
                                # Don't reuse that entry for now! Only check if
                                # the qualifiers are the same, then do nothing
                                # If they are not, then add a new claim

                                # Create object for comparison
                                test_claim = pywikibot.Claim(self.repo, 'P54')
                                print(test_claim.qualifiers)
                                if qualifier_before is not None:
                                    test_claim.qualifiers.setdefault(
                                        'P580', []).append(qualifier_before)
                                if qualifier_after is not None:
                                    test_claim.qualifiers.setdefault(
                                        'P582', []).append(qualifier_after)

                                if self.listsEqualQualifier(test_claim.qualifiers, existing_claim.qualifiers):
                                    pywikibot.output('setting dont_add to true')
                                    dont_add = True
                                    break

                                pywikibot.output(
                                    u'dont_add is %s' % dont_add)
                            else:
                                # No qualifiers for this item, so can overwrite it
                                # Refers to the same item, use it for
                                # new claim and move to the correct
                                # position
                                claim = existing_claim
                                # Mutating the list is safe here because the
                                # loop is left right afterwards.
                                existing_claims.insert(i - 1, existing_claims.pop(j))
                                break

                        if dont_add is False:
                            if claim.getTarget() is None:
                                # No existing claim was reused, set target
                                claim.setTarget(linked_item)
                                item.addClaim(claim)

                            # Save data and then add qualifiers
                            item.editEntity()
                            self.addQualifiers(claim, date_before, date_after, qualifier_before, qualifier_after)

                    elif claim.getType() == 'string':
                        claim.setTarget(value.strip())
                    elif claim.getType() == 'commonsMedia':
                        commonssite = pywikibot.Site("commons", "commons")
                        imagelink = pywikibot.Link(value, source=commonssite, defaultNamespace=6)
                        image = pywikibot.ImagePage(imagelink)
                        if image.isRedirectPage():
                            image = pywikibot.ImagePage(image.getRedirectTarget())
                        if not image.exists():
                            pywikibot.output('[[%s]] doesn\'t exist so I can\'t link to it' % (image.title(),))
                            continue
                        claim.setTarget(image)
                        item.addClaim(claim)
                    else:
                        pywikibot.output("%s is not a supported datatype." % claim.getType())
                        continue

                    pywikibot.output('Adding %s --> %s' % (claim.getID(), claim.getTarget()))
                    # A generator might yield pages from multiple sites
                    if dont_add is False:
                        # Duplicate some code from page.py to avoid a API network request
                        # we don't want/need this here
                        source = self.getSource(page.site)
                        pywikibot.output(claim.sources)
                        if source:
                            source_test = collections.defaultdict(list)
                            source_test[source.getID()].append(source)
                            test_claim = pywikibot.Claim(self.repo, 'P54')
                            test_claim.sources.append(source_test)
                            if not self.listsEqual(test_claim.sources, claim.sources):
                                # Can only add sources if they don't already exist
                                # otherwise you get an API error
                                # todo: Do a diff between existing and new sources
                                # and add the differing sources
                                claim.addSource(source, bot=True)


def main():
    """
    Parse the command line and run the bot.

    Recognized arguments:
        * -template[:title]   - template to work on; asked for interactively
                                when no title is given
        * -transcludes:title  - page generator argument that also selects
                                the template
        * any other argument accepted by the page generator factory
        * remaining arguments - consumed in pairs and stored as
                                fields[first] = second (the usage example
                                passes a template parameter followed by a
                                property, e.g. "team1 P54")

    Raises ValueError if the remaining arguments do not form pairs.
    """
    template_option = '-template'
    transcludes_option = u'-transcludes:'
    commandline_arguments = []
    template_title = u''

    # Process global args and prepare generator args parser
    local_args = pywikibot.handleArgs()
    gen = pg.GeneratorFactory()

    for arg in local_args:
        if arg.startswith(template_option):
            if len(arg) == len(template_option):
                template_title = pywikibot.input(
                    u'Please enter the template to work on:')
            else:
                # Skip the option name and the separator that follows it.
                template_title = arg[len(template_option) + 1:]
        elif gen.handleArg(arg):
            if arg.startswith(transcludes_option):
                template_title = arg[len(transcludes_option):]
        else:
            commandline_arguments.append(arg)

    if not template_title:
        pywikibot.error('Please specify either -template or -transcludes argument')
        return

    if len(commandline_arguments) % 2:
        raise ValueError(
            'Fields must be given in pairs, but an odd number of '
            'arguments was found: %r' % (commandline_arguments,))

    # Even positions are the keys, odd positions the matching values.
    fields = dict(zip(commandline_arguments[0::2],
                      commandline_arguments[1::2]))

    generator = gen.getCombinedGenerator()
    if not generator:
        # No generator given: default to the pages transcluding the template.
        gen.handleArg(transcludes_option + template_title)
        generator = gen.getCombinedGenerator()

    bot = HarvestRobot(generator, template_title, fields)
    bot.run()

# Run the bot only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()