# Source: wiki page "User:Underlying lk/harvest template.py"
# (wiki navigation chrome removed so the file parses as Python)
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
"""
Usage:
python harvest_template.py -lang:nl -template:"Taxobox straalvinnige" orde P70 familie P71 geslacht P74
This will work on all pages that transclude the template in the article
namespace
You can use any typical pagegenerator to provide with a list of pages:
python harvest_template.py -lang:nl -cat:Sisoridae -template:"Taxobox straalvinnige" -namespace:0 orde P70 familie P71 geslacht P74
¶ms;
"""
#
# (C) Multichill, Amir, 2013
# (C) Pywikibot team, 2013
#
# Distributed under the terms of MIT License.
#
__version__ = '$Id: 2507544f311b7164e04c7c83198a891f33e9f8ee $'
#
import re
import json
import pywikibot
from pywikibot import pagegenerators as pg
from datetime import datetime
from datetime import timedelta
from pywikibot import wdhelper
# Substitutions applied to the module docstring by pywikibot's doc tooling:
# the '¶ms;' marker is expanded to the standard page-generator help text.
docuReplacements = {'¶ms;': pywikibot.pagegenerators.parameterHelp}
class HarvestRobot:

    """A bot to add Wikidata claims harvested from template parameters."""

    def __init__(self, generator, templateTitle, fields, overwrite=False):
        """
        Constructor.

        Arguments:
        * generator - A generator that yields Page objects.
        * templateTitle - The template to work on
        * fields - A dictionary mapping template parameter names to
          property ids ('P123')
        * overwrite - if existing claims should be overwritten
        """
        self.generator = generator
        # Page titles never contain underscores; normalize so the title
        # comparison in procesPage() matches.
        self.templateTitle = templateTitle.replace(u'_', u' ')
        # TODO: Make it a list which also includes the redirects to the template
        self.fields = fields
        self.overwrite = overwrite
        self.repo = pywikibot.Site().data_repository()
        self.cacheSources()

    def getSource(self, site):
        """
        Build a P143 ('imported from') source claim for the given site.

        Returns None (implicitly) when the site is not in the cached
        source list.
        """
        if site.family.name in self.source_values and site.code in self.source_values[site.family.name]:
            source = pywikibot.Claim(self.repo, 'P143')
            source.setTarget(self.source_values.get(site.family.name).get(site.code))
            return source

    def cacheSources(self):
        """
        Fetch the source wikis from the on-wiki JSON list and store them
        internally, replacing each item id string with an ItemPage.
        """
        page = pywikibot.Page(self.repo, u'List of wikis/python', ns=4)
        self.source_values = json.loads(page.get())
        # FIX: dict.iteritems() is Python-2-only; items() works everywhere.
        # Only values are reassigned here, so mutating during iteration is safe.
        for family_code, family in self.source_values.items():
            for source_lang in family:
                self.source_values[family_code][source_lang] = pywikibot.ItemPage(
                    self.repo, family[source_lang])

    def run(self):
        """Start the robot."""
        self.templateTitles = self.getTemplateSynonyms(self.templateTitle)
        for i, page in enumerate(self.generator):
            try:
                self.procesPage(i, page)
            except Exception:
                # Deliberate best-effort: log the traceback and keep
                # going with the next page.
                pywikibot.exception(tb=True)

    def newItem(self, page, item):
        """
        Create an item where none exists (from newitem.py by Multichill).

        Skips redirects, user pages, recently edited pages, recently
        created pages and pages that already have language links.
        """
        # Minimum page age (days) before an item is created for it.
        self.pageAge = 21
        self.pageAgeBefore = self.repo.getcurrenttime() - timedelta(days=self.pageAge)
        # Minimum time (days) since the last edit.
        self.lastEdit = 7
        self.lastEditBefore = self.repo.getcurrenttime() - timedelta(days=self.lastEdit)
        if page.isRedirectPage():
            pywikibot.output('%s is a redirect page. Skipping.' % page)
        elif page.namespace() == 2:
            pywikibot.output('%s is a user page. Skipping.' % page)
        elif page.editTime() > self.lastEditBefore:
            pywikibot.output('Last edit on %s was on %s. Too recent. Skipping.' % (page, page.editTime().isoformat()))
        else:
            (revId, revTimestamp, revUser, revComment) = page.getVersionHistory(reverseOrder=True, total=1)[0]
            if revTimestamp > self.pageAgeBefore:
                # FIX: report the creation timestamp that was actually
                # tested, not the last-edit time.
                pywikibot.output('Page creation of %s on %s is too recent. Skipping.' % (page, revTimestamp.isoformat()))
            elif page.langlinks():
                # FIXME: Implement this
                pywikibot.output('Found language links (interwiki links). Haven\'t implemented that yet so skipping.')
            else:
                # FIXME: i18n
                summary = u'Bot: New item with sitelink from %s' % (page.title(asLink=True, insite=self.repo), )
                data = {'sitelinks':
                        {item.getdbName(page.site):
                         {'site': item.getdbName(page.site),
                          'title': page.title()}
                         },
                        'labels':
                        {page.site.lang:
                         {'language': page.site.lang,
                          'value': page.title()}
                         }
                        }
                pywikibot.output(summary)
                item.editEntity(data, summary=summary)

    def getTemplateSynonyms(self, title):
        """
        Fetch redirects of the template title, so transclusions under any
        synonym are recognized.
        """
        pywikibot.output('Finding redirects...')  # Put some output here since it can take a while
        temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
        if temp.isRedirectPage():
            temp = temp.getRedirectTarget()
        titles = [page.title(withNamespace=False)
                  for page
                  in temp.getReferences(redirectsOnly=True, namespaces=[10], follow_redirects=False)]
        titles.append(temp.title(withNamespace=False))
        return titles

    def procesPage(self, index, page):
        """
        Process a single page: harvest the configured template parameters
        and add the corresponding claims to the page's item.
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output(u'Processing No. %s: %s' % (index, page))
        if not item.exists():
            # create the page
            self.newItem(page, item)
            item = pywikibot.ItemPage.fromPage(page)
            if not item.exists():
                # The item was not created
                return
        pagetext = page.get()
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Clean up template
            template = pywikibot.Page(page.site, template,
                                      ns=10).title(withNamespace=False)
            # We found the template we were looking for
            if template in self.templateTitles:
                for field, value in fielddict.items():
                    field = field.strip()
                    # FIX: 'field not in' instead of 'not field in'.
                    if field not in self.fields:
                        continue
                    # This field contains something useful for us
                    value = value.strip()
                    pid = self.fields[field]
                    # Check if the property isn't already set
                    if self.overwrite is False:
                        if pid in item.get().get('claims'):
                            pywikibot.output(
                                u'%s already exists (-overwrite to change it)'
                                % pid)
                            continue
                    # TODO FIXME: This is a very crude way of dupe
                    # checking
                    # FIX: compare ints with !=, never 'is not' (identity
                    # only works by CPython small-int caching accident).
                    if str(item.getVersionHistory(total=50)).find("wbremoveclaims-remove:1| */ [[Property:" + pid) != -1:
                        pywikibot.output('%s cannot be added as it was recently removed from the item' % (pid,))
                    else:
                        claim = wdhelper.matchDatatype(pid, value, page, item)
                        if claim is None:
                            continue
                        if claim.getType() == 'quantity':
                            # temporary solution until quantities are properly supported
                            # NOTE(review): only a source is attached here; the
                            # claim itself appears to be added elsewhere
                            # (presumably by wdhelper) - confirm.
                            source = self.getSource(page.site)
                            if source:
                                claim.addSource(source, bot=True)
                            continue
                        if claim.getID() in item.get().get('claims'):
                            # overwrite
                            item.get()
                            claimToChange = item.claims[claim.getID()][0]
                            valueToChange = claimToChange.getTarget()
                            valueNew = claim.getTarget()
                            # if the new and old claims are the same
                            if valueToChange != valueNew:
                                pywikibot.output('Changing %s --> %s' % (valueToChange, valueNew))
                                claimToChange.changeTarget(valueNew)
                            else:
                                pywikibot.output('Old value %s same as new value %s' % (valueToChange, valueNew))
                        else:
                            pywikibot.output('Adding %s --> %s' % (claim.getID(), claim.getTarget()))
                            item.addClaim(claim)
                            # A generator might yield pages from multiple sites
                            source = self.getSource(page.site)
                            if source:
                                claim.addSource(source, bot=True)
def main():
    """
    Parse command line arguments and run HarvestRobot.

    Positional arguments come in field/property pairs, e.g.
    'orde P70 familie P71'. Raises ValueError when the template name is
    missing or the pairs are unbalanced.
    """
    gen = pg.GeneratorFactory()
    commandline_arguments = list()
    templateTitle = u''
    overwrite = False
    for arg in pywikibot.handleArgs():
        if arg.startswith('-template'):
            if len(arg) == 9:
                # Bare '-template' with no value: prompt interactively.
                templateTitle = pywikibot.input(
                    u'Please enter the template to work on:')
            else:
                templateTitle = arg[10:]
        elif arg.startswith('-overwrite'):
            overwrite = True
        elif gen.handleArg(arg):
            continue
        else:
            commandline_arguments.append(arg)

    # FIX: raise with a message instead of a bare ValueError.
    if len(commandline_arguments) % 2 or not templateTitle:
        raise ValueError(
            'A template name and an even number of field/property '
            'arguments are required.')

    # Pair up (field, property) arguments.
    fields = dict(zip(commandline_arguments[::2], commandline_arguments[1::2]))

    generator = gen.getCombinedGenerator()
    if not generator:
        # transcluding generator based on templateTitle
        transclusionPage = pywikibot.Page(
            pywikibot.Link(
                templateTitle, defaultNamespace=10, source=pywikibot.Site()
            )
        )
        generator = pywikibot.Site().page_embeddedin(
            transclusionPage, filterRedirects=None,
            namespaces=0, step=None, total=None, content=False
        )

    bot = HarvestRobot(generator, templateTitle, fields, overwrite)
    bot.run()


if __name__ == "__main__":
    main()