User:Underlying lk/harvest template.py

From Wikidata
Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

"""
Usage:

python harvest_template.py -lang:nl -template:"Taxobox straalvinnige" orde P70 familie P71 geslacht P74

This will work on all pages that transclude the template in the article
namespace

You can use any typical pagegenerator to provide a list of pages:

python harvest_template.py -lang:nl -cat:Sisoridae -template:"Taxobox straalvinnige" -namespace:0 orde P70 familie P71 geslacht P74

&params;
"""
#
# (C) Multichill, Amir, 2013
# (C) Pywikibot team, 2013
#
# Distributed under the terms of MIT License.
#
__version__ = '$Id: 2507544f311b7164e04c7c83198a891f33e9f8ee $'
#

import re
import json
import pywikibot
from pywikibot import pagegenerators as pg
from datetime import datetime
from datetime import timedelta
from pywikibot import wdhelper

# Maps the '&params;' placeholder used in the usage text above to the
# generic page generator help text (presumably consumed by pywikibot's
# help output -- confirm against the framework).
docuReplacements = {'&params;': pg.parameterHelp}


class HarvestRobot:
    """
    A bot to add Wikidata claims

    For every page yielded by the generator the bot looks for uses of the
    configured template (or one of its redirects), maps the selected
    template fields to properties and adds or updates the matching claims
    on the item connected to the page.
    """
    def __init__(self, generator, templateTitle, fields, overwrite=False):
        """
        Arguments:
            * generator     - A generator that yields Page objects.
            * templateTitle - The template to work on
            * fields        - A dictionary of fields that are of use to us
                              (template field name -> property id)
            * overwrite     - if existing claims should be overwritten

        """
        self.generator = generator
        # Titles given on the command line may use underscores for spaces
        self.templateTitle = templateTitle.replace(u'_', u' ')
        # TODO: Make it a list which also includes the redirects to the template
        self.fields = fields
        self.overwrite = overwrite
        self.repo = pywikibot.Site().data_repository()
        self.cacheSources()

    def getSource(self, site):
        """
        Get the source for the specified site,
        if possible

        Returns a 'P143' claim whose target is the cached item for the
        site, or None when the site is not in the cached source list.
        """
        family_sources = self.source_values.get(site.family.name, {})
        if site.code not in family_sources:
            return None
        source = pywikibot.Claim(self.repo, 'P143')
        source.setTarget(family_sources[site.code])
        return source

    def cacheSources(self):
        """
        Fetches the sources from the onwiki list
        and stores it internally

        The page content is parsed as JSON into self.source_values, a
        mapping of family name -> {site code -> item id}; every item id is
        then replaced by the corresponding ItemPage.
        """
        page = pywikibot.Page(self.repo, u'List of wikis/python', ns=4)
        self.source_values = json.loads(page.get())
        # .values() works on both Python 2 and 3 (iteritems is 2-only).
        # Only values of existing keys are replaced, so the inner dict can
        # safely be updated while iterating over its keys.
        for family in self.source_values.values():
            for source_lang in family:
                family[source_lang] = pywikibot.ItemPage(self.repo,
                                                         family[source_lang])

    def run(self):
        """
        Starts the robot.

        Resolves the template redirects once and then processes every page
        of the generator. An error on one page is logged with its
        traceback and does not stop the run.
        """
        self.templateTitles = self.getTemplateSynonyms(self.templateTitle)
        for i, page in enumerate(self.generator):
            try:
                self.procesPage(i, page)
            except Exception:
                # Deliberately best-effort: log and go on with the next page
                pywikibot.exception(tb=True)

    def newItem(self, page, item):
        """
        Create item where none exists (from newitem.py by Multichill)

        Nothing is created for redirects, user pages, pages edited within
        the last self.lastEdit days, pages created within the last
        self.pageAge days, or pages that have language links.

        Arguments:
            * page - The page that has no item yet
            * item - The (not yet existing) ItemPage belonging to page
        """
        self.pageAge = 21
        self.pageAgeBefore = self.repo.getcurrenttime() - timedelta(days=self.pageAge)
        self.lastEdit = 7
        self.lastEditBefore = self.repo.getcurrenttime() - timedelta(days=self.lastEdit)

        if page.isRedirectPage():
            pywikibot.output('%s is a redirect page. Skipping.' % page)
            return
        if page.namespace() == 2:
            pywikibot.output('%s is a user page. Skipping.' % page)
            return
        if page.editTime() > self.lastEditBefore:
            pywikibot.output('Last edit on %s was on %s. Too recent. Skipping.' % (page, page.editTime().isoformat()))
            return

        # Oldest revision first: its timestamp is the page creation time
        revTimestamp = page.getVersionHistory(reverseOrder=True, total=1)[0][1]
        if revTimestamp > self.pageAgeBefore:
            # Report the creation time, not the time of the last edit
            pywikibot.output('Page creation of %s on %s is too recent. Skipping.' % (page, revTimestamp.isoformat()))
            return
        if page.langlinks():
            # FIXME: Implement this
            pywikibot.output('Found language links (interwiki links). Haven\'t implemented that yet so skipping.')
            return

        # FIXME: i18n
        summary = u'Bot: New item with sitelink from %s' % (page.title(asLink=True, insite=self.repo), )

        dbName = item.getdbName(page.site)
        language = page.site.lang
        title = page.title()
        data = {
            'sitelinks': {
                dbName: {'site': dbName, 'title': title},
            },
            'labels': {
                language: {'language': language, 'value': title},
            },
        }
        pywikibot.output(summary)
        item.editEntity(data, summary=summary)

    def getTemplateSynonyms(self, title):
        """
        Fetches redirects of the title, so we can check against them

        Returns a list with the titles (without namespace) of all
        redirects to the template plus the title of the template itself.
        If title is a redirect, its target is used as the template.
        """
        pywikibot.output('Finding redirects...')  # Put some output here since it can take a while
        temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
        if temp.isRedirectPage():
            temp = temp.getRedirectTarget()
        titles = [page.title(withNamespace=False)
                  for page
                  in temp.getReferences(redirectsOnly=True, namespaces=[10], follow_redirects=False)]
        titles.append(temp.title(withNamespace=False))
        return titles

    def procesPage(self, index, page):
        """
        Process a single page

        Arguments:
            * index - Running number of the page, only used for output
            * page  - The page to harvest the template fields from
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output(u'Processing No. %s: %s' % (index, page))
        if not item.exists():
            # create the page
            self.newItem(page, item)
            item = pywikibot.ItemPage.fromPage(page)
            if not item.exists():
                # The item was not created
                return
        pagetext = page.get()
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Clean up template
            template = pywikibot.Page(page.site, template,
                                      ns=10).title(withNamespace=False)
            if template not in self.templateTitles:
                continue
            # We found the template we were looking for
            for field, value in fielddict.items():
                field = field.strip()
                if field not in self.fields:
                    continue
                # This field contains something useful for us
                self._harvestField(page, item, self.fields[field],
                                   value.strip())

    def _harvestField(self, page, item, pid, value):
        """
        Turn one template field value into a claim on the item
        """
        # Check if the property isn't already set
        # TODO FIXME: This is a very crude way of dupe checking
        if not self.overwrite and pid in item.get().get('claims'):
            pywikibot.output(
                u'%s already exists (-overwrite to change it)'
                % pid)
            return
        if self._wasRecentlyRemoved(item, pid):
            pywikibot.output('%s cannot be added as it was recently removed from the item' % (pid,))
            return

        claim = wdhelper.matchDatatype(pid, value, page, item)
        if claim is None:
            return

        if claim.getType() == 'quantity':
            # temporary solution until quantities are properly supported
            # NOTE(review): the claim itself is not added to the item in
            # this branch, only a source is attached -- confirm intended.
            self._addSource(claim, page.site)
            return

        claimId = claim.getID()
        if claimId in item.get().get('claims'):
            # overwrite: only the first existing claim is considered
            claimToChange = item.claims[claimId][0]
            valueToChange = claimToChange.getTarget()
            valueNew = claim.getTarget()
            # only touch the claim if the value really differs
            if valueToChange != valueNew:
                pywikibot.output('Changing %s --> %s' % (valueToChange, valueNew))
                claimToChange.changeTarget(valueNew)
            else:
                pywikibot.output('Old value %s same as new value %s' % (valueToChange, valueNew))
        else:
            pywikibot.output('Adding %s --> %s' % (claimId, claim.getTarget()))
            item.addClaim(claim)
            self._addSource(claim, page.site)

    def _wasRecentlyRemoved(self, item, pid):
        """
        Tell whether one of the last 50 revisions removed a pid claim
        """
        # NOTE(review): this is a plain substring search, so 'P7' would
        # presumably also match a removal of 'P70' -- confirm the summary
        # format before tightening the marker.
        marker = "wbremoveclaims-remove:1| */ [[Property:" + pid
        # Membership test instead of comparing find() with "is not -1",
        # which tested object identity rather than equality.
        return marker in str(item.getVersionHistory(total=50))

    def _addSource(self, claim, site):
        """
        Attach the source for site to the claim, if one is known
        """
        # A generator might yield pages from multiple sites
        source = self.getSource(site)
        if source:
            claim.addSource(source, bot=True)


def main():
    """
    Parse the command line, build the page generator and run the bot.

    Recognised arguments:
        * -template[:title] - The template to work on (asked for
                              interactively when no title is given)
        * -overwrite        - Overwrite existing claims
        * anything accepted by the page generator factory
    All remaining arguments are read as "field property" pairs.

    Raises ValueError when no template is given or when the remaining
    arguments do not form complete field/property pairs.
    """
    gen = pg.GeneratorFactory()
    commandline_arguments = []
    templateTitle = u''
    overwrite = False
    template_option = '-template'
    for arg in pywikibot.handleArgs():
        if arg.startswith(template_option):
            if len(arg) == len(template_option):
                templateTitle = pywikibot.input(
                    u'Please enter the template to work on:')
            else:
                # Skip the option name and the ':' separator
                templateTitle = arg[len(template_option) + 1:]
        elif arg.startswith('-overwrite'):
            overwrite = True
        elif gen.handleArg(arg):
            continue
        else:
            commandline_arguments.append(arg)

    if not templateTitle:
        raise ValueError('No template given, use -template:"Title"')
    if len(commandline_arguments) % 2:
        raise ValueError(
            'Fields and properties must be given in pairs, got: %s'
            % ' '.join(commandline_arguments))

    # Even positions are template field names, odd positions the
    # properties they map to
    fields = dict(zip(commandline_arguments[::2],
                      commandline_arguments[1::2]))

    generator = gen.getCombinedGenerator()
    if not generator:
        # transcluding generator based on templateTitle
        transclusionPage = pywikibot.Page(
            pywikibot.Link(
                templateTitle, defaultNamespace=10, source=pywikibot.Site()
            )
        )
        generator = pywikibot.Site().page_embeddedin(
            transclusionPage, filterRedirects=None,
            namespaces=0, step=None, total=None, content=False
        )

    bot = HarvestRobot(generator, templateTitle, fields, overwrite)
    bot.run()

# Run the bot only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()