# Source: wiki page "User:Bthfan/bot code.py" (MediaWiki navigation chrome
# "Jump to navigation" / "Jump to search" removed from the scraped copy).
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Harvest {{Infobox basketball biography}} team/years fields into Wikidata
P54 ("member of sports team") claims with start/end time qualifiers.

Usage:

    python scripts_harvest_template.py \
        -template:"Infobox basketball biography" -lang:en -family:wikipedia \
        team1 P54

Requires patch from https://gerrit.wikimedia.org/r/#/c/125575/ to work
properly (note the patch has a bug, so currently this bot code does not
work at all).

NOTE(review): this is Python 2 code (backtick repr, ``print`` statements,
``dict.has_key``); it will not run under Python 3 without porting.
"""
#
# (C) Multichill, Amir, 2013
# (C) Pywikibot team, 2013-2014
#
# Distributed under the terms of MIT License.
#
__version__ = '$Id$'
#
import re
import pywikibot
from pywikibot import pagegenerators as pg, WikidataBot
import codecs
import sys
# Force UTF-8 output on Python 2 stdout so non-ASCII page titles print cleanly.
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
import collections

# NOTE(review): the key '¶ms;' looks like mojibake for '&params;' (the
# conventional pywikibot docuReplacements token) — verify before relying on it.
docuReplacements = {'¶ms;': pywikibot.pagegenerators.parameterHelp}


class HarvestRobot(WikidataBot):
    """
    A bot to add Wikidata claims.

    Walks pages yielded by a generator, finds the configured infobox
    template, and turns its teamN/yearsN parameters into P54 claims with
    P580/P582 (start/end time) qualifiers.
    """

    def __init__(self, generator, templateTitle, fields):
        """
        Arguments:
            * generator     - A generator that yields Page objects.
            * templateTitle - The template to work on.
            * fields        - A dictionary of fields that are of use to us.
        """
        # Preload page text in batches to cut down on API round-trips.
        self.generator = pg.PreloadingGenerator(generator)
        self.templateTitle = templateTitle.replace(u'_', u' ')
        # TODO: Make it a list which also includes the redirects to the template
        self.fields = fields
        self.repo = pywikibot.Site().data_repository()
        self.cacheSources()

    def run(self):
        """
        Starts the robot: resolve template redirects once, then process
        every generated page, logging (but not aborting on) per-page errors.
        """
        self.templateTitles = self.getTemplateSynonyms(self.templateTitle)
        for page in self.generator:
            try:
                self.processPage(page)
            except Exception as e:
                # Keep going on any per-page failure; log with traceback.
                pywikibot.exception(tb=True)

    def listsEqual(self, list1, list2):
        """
        Returns true if the lists are probably equal, ignoring order.
        Works for lists of unhashable items (like dictionaries).
        """
        if len(list1) != len(list2):
            return False
        if sorted(list1) != sorted(list2):
            return False
        for item in list1:
            if item not in list2:
                return False
        return True

    def listsEqualQualifier(self, list1, list2):
        """
        Returns true if two claim.qualifiers are probably equal, ignoring
        order. Works for lists of unhashable items (like dictionaries).

        Both arguments are claim.qualifiers mappings: property-id -> list of
        qualifier Claim objects. Equality of individual qualifiers is judged
        by comparing their targets via getTarget().
        """
        if len(list1) != len(list2):
            return False
        if sorted(list1) != sorted(list2):
            return False
        for item in list1:
            if item not in list2:
                return False
        # Additional check needed!
        # Are all elements from list1 in list2?
        found_item = False
        for k1 in list1.keys():
            if list2.has_key(k1):
                for listobject1 in list1[k1]:
                    found_item = False
                    for listobject2 in list2[k1]:
                        if listobject2.getTarget() == listobject1.getTarget():
                            found_item = True
                    if found_item == False:
                        return False
            else:
                # Qualifier property from list1 missing in list2
                return False
        # Reverse check: Are all elements from list2 in list1?
        found_item = False
        for k1 in list2.keys():
            if list1.has_key(k1):
                for listobject1 in list2[k1]:
                    found_item = False
                    for listobject2 in list1[k1]:
                        # Debug output of the qualifier property ids being compared.
                        pywikibot.output(listobject2.getID())
                        pywikibot.output(listobject1.getID())
                        if listobject2.getTarget() == listobject1.getTarget():
                            found_item = True
                    if found_item == False:
                        return False
            else:
                # Qualifier property from list1 missing in list2
                return False
        return True

    def getTemplateSynonyms(self, title):
        """
        Fetches redirects of the title, so we can check against them.

        Returns a list of template titles (without namespace prefix):
        the resolved target template plus every redirect pointing at it.
        Exits the process if the template does not exist.
        """
        temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
        if not temp.exists():
            pywikibot.error(u'Template %s does not exist.' % temp.title())
            # NOTE(review): bare exit() kills the whole process here.
            exit()

        pywikibot.output('Finding redirects...')  # Put some output here since it can take a while
        if temp.isRedirectPage():
            temp = temp.getRedirectTarget()
        titles = [page.title(withNamespace=False)
                  for page
                  in temp.getReferences(redirectsOnly=True, namespaces=[10],
                                        follow_redirects=False)]
        titles.append(temp.title(withNamespace=False))
        return titles

    def _template_link_target(self, item, link_text):
        """
        Resolve link_text to the ItemPage of the linked article.

        Returns None (implicitly) when the page does not exist, has no
        Wikidata item, or the item would link to itself.
        """
        linked_page = None

        link = pywikibot.Link(link_text)
        linked_page = pywikibot.Page(link)

        if not linked_page.exists():
            pywikibot.output(u'%s doesn\'t exist so it can\'t be linked. Skipping' % (linked_page))
            return

        if linked_page.isRedirectPage():
            linked_page = linked_page.getRedirectTarget()

        linked_item = pywikibot.ItemPage.fromPage(linked_page)
        if not linked_item.exists():
            pywikibot.output(u'%s doesn\'t have a wikidata item to link with. Skipping' % (linked_page))
            return

        if linked_item.title() == item.title():
            pywikibot.output(u'%s links to itself. Skipping' % (linked_page))
            return

        return linked_item

    def getSeasonsAndYears(self, yearstring):
        """
        Parse a "yearsN" infobox value such as "2002-2004" or "2010-present".

        Returns a 4-tuple (year_before, year_after, nba_seasons, splityears2)
        where year_before/year_after are year strings (or unset/None on
        failure), nba_seasons is a list of season items, and splityears2 is
        the re.search match for "present|today" (None when absent).
        (None, None, None, None) signals an unparseable string.

        NOTE(review): several latent bugs here — nba_years is a list from
        re.findall yet .group(1) is called on it (lists have no .group);
        addNBASeasons is called without self. and so resolves to nothing in
        scope; year_before/year_after can be unbound on some paths, raising
        UnboundLocalError at the final return.
        """
        splityears = re.findall('\d{3,}', yearstring)
        nba_years = re.findall('(NBA Year|nbay|Nbay)|(\d{3,})|(start|end)', yearstring)
        splityears2 = re.search('present|today', yearstring)
        nba_seasons = []
        if splityears2 is None and (len(splityears) == 1 or len(splityears) == 2):
            # Found one or two year numbers in the date string
            # No "present" or "today" term was found, so we're looking at some
            # "complete" entry
            # date_before = pywikibot.WbTime(year=splityears[0])
            year_before = splityears[0]
            if len(splityears) > 1:
                year_after = splityears[1]
            if (nba_years is not None):
                # We found a NBA year template, get the NBA seaons used
                if len(nba_years) == 1:
                    # Only one NBA entry found, check for consistency
                    if len(splityears) == 1:
                        # Looks like we only have one year there, so assume this is
                        # start year=end year and one season only
                        date_before = pywikibot.WbTime(year=splityears[0])
                        date_after = pywikibot.WbTime(year=splityears[0])
                        year_used = nba_years.group(1)
                        addNBASeasons(nba_years.group(1), nba_years.group(1), nba_seasons)
                    elif len(splityears) == 2:
                        # Could check where NBA season belongs to
                        # For now, just fix this manually later
                        # for idx, val in enumarate(splityears):
                        #     if nba_years[0][1] == val:
                        return (None, None, None, None)
                elif len(nba_years) == 2:
                    # two NBA entries found, this should be the standard case
                    addNBASeasons(nba_years[0].group(1), nba_years[1].group(1), nba_seasons)
        elif splityears2 is None and len(splityears) > 2:
            # We found a "present" or "today" string, but also multiple
            # year entries, possibly this is a malformed string
            # or a string like "2002, 2010-present [team name]"
            # We need to sort this out manually
            return (None, None, None, None)
        elif splityears2 is not None and len(splityears) == 1:
            # Looks like a "2010-today|present" string, check...
            if len(nba_years) == 1:
                # We found a NBA year template, get the NBA seaons used
                # HACK - hardcode last season here
                addNBASeasons(nba_years.group(1), 2013, nba_seasons)
            else:
                # Not sure what we're dealing with here, could be a badly
                # formatted string
                return (None, None, None, None)
        return (year_before, year_after, nba_seasons, splityears2)

    def addQualifiers(self, claim, date_before, date_after,
                      qualifier_before, qualifier_after):
        """
        Attach start (P580) and, when known, end (P582) qualifiers to claim.

        Both dates present  -> add both qualifiers (completed stint).
        Only start present  -> player is still playing at that club.
        """
        if (date_before is not None and date_after is not None):
            claim.addQualifier(qualifier_before)
            claim.addQualifier(qualifier_after)
        elif (date_before is not None and date_after is None):
            # Player is still playing at that club
            claim.addQualifier(qualifier_before)

    def addNBASeasons(self, year_start, year_end, seasons):
        """
        Append the ItemPage of every "<year>–<yy> NBA season" article between
        year_start and year_end (inclusive) to the seasons list.

        NOTE(review): broken as written — `x == "1999"` compares an int from
        range() to a string and never matches; `year_used` is not defined in
        this scope (NameError); ns=10 builds a Template-namespace title for
        what looks like an article page — all need fixing before use.
        """
        for x in range(year_start, year_end+1):
            # Get NBA season item
            if (x == "1999"):
                # Special case when when the year is 1999
                title = "1999–2000 NBA season"
            else:
                title = year_used + "–" + year_used[-2:] + " NBA season"
            temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
            seasons.append(pywikibot.ItemPage.fromPage(temp))

    def processPage(self, page):
        """
        Process a single page: locate the target template, and for every
        teamN/yearsN pair create or update a P54 claim on the page's item,
        with start/end qualifiers and a source reference.
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
            #TODO FIXME: We should provide an option to create the page
        item.get()
        pagetext = page.get()
        templates = pywikibot.extract_templates_and_params(pagetext)
        for (template, fielddict) in templates:
            # Clean up template
            try:
                template = pywikibot.Page(page.site, template,
                                          ns=10).title(withNamespace=False)
            except pywikibot.exceptions.InvalidTitle as e:
                pywikibot.error(u"Failed parsing template; '%s' should be the template name." % template)
                continue
            # We found the template we were looking for
            if template in self.templateTitles:
                # Infobox basketball biography numbers its team/years
                # parameters team1..team40 / years1..years40.
                for i in range(1, 41):
                    if not "years" + `i` in fielddict.keys():
                        continue
                    value = fielddict["years" + `i`];
                    value = value.strip()
                    # value = pywikibot.Site().expand_text(value)
                    if not value:
                        continue

                    # Check if comma is used in this field, possibly we need
                    # to create two or more claims here
                    matches = []
                    comma_match = re.split(',', value)
                    if len(comma_match) > 1:
                        # Comma(s) found, add to list
                        for match in comma_match:
                            match = match.strip()
                            matches.append(match)
                    else:
                        # Use normal method for splitting this up
                        matches.append(value)

                    # This field contains something useful for us
                    # Check if the property isn't already set
                    claim = pywikibot.Claim(self.repo, 'P54')
                    # Try to extract a valid page
                    match = re.search(pywikibot.link_regex, fielddict["team" + `i`])
                    if not match:
                        # Try to get item by page title
                        link_text = fielddict["team" + `i`]
                    else:
                        link_text = match.group(1)
                    linked_item = self._template_link_target(item, link_text)
                    if not linked_item:
                        continue
                    for years_string in matches:
                        # For every year string we've found in the years* property
                        # do a new iteration
                        (year_before_raw, year_after_raw, nba_seasons, splityears2) = \
                            self.getSeasonsAndYears(years_string)
                        if claim.getType() == 'wikibase-item':
                            if splityears2 is None and year_before_raw is not None and year_after_raw is not None:
                                # Player is no longer playing at that club
                                print year_before_raw
                                qualifier_before = pywikibot.Claim(self.repo, u'P580')
                                date_before = pywikibot.WbTime(year=year_before_raw)
                                qualifier_before.setTarget(date_before)
                                qualifier_after = pywikibot.Claim(self.repo, u'P582')
                                date_after = pywikibot.WbTime(year=year_after_raw)
                                qualifier_after.setTarget(date_after)
                            # NOTE(review): qualifier_before/after and
                            # date_before/after are only bound inside the
                            # branch above; later uses can raise
                            # UnboundLocalError for other year patterns.
                            dont_add = False
                            for j in range(len(item.claims.setdefault('P54', []))):
                                # Check for any existing items with those qualifiers
                                # so this if-condition needs to be the first one
                                if item.claims['P54'][j].getTarget() == linked_item and len(item.claims['P54'][j].qualifiers) > 0:
                                    # Qualifiers might be the same, check for that
                                    # Don't reuse that entry for now! Only check if
                                    # the qualifiers are the same, then do nothing
                                    # If they are not, then add a new claim
                                    # Create object for comparison
                                    test_claim = pywikibot.Claim(self.repo, 'P54')
                                    print test_claim.qualifiers
                                    if (splityears2 is None and year_before_raw is not None):
                                        test_claim.qualifiers['P580'].append(qualifier_before)
                                        test_claim.qualifiers['P582'].append(qualifier_after)
                                    elif (splityears2 is not None and year_before_raw is not None):
                                        # Player is still playing at that club
                                        test_claim.qualifiers['P580'].append(qualifier_before)
                                    if self.listsEqualQualifier(test_claim.qualifiers, item.claims['P54'][j].qualifiers):
                                        pywikibot.output('setting dont_add to true')
                                        dont_add = True
                                        break
                                    pywikibot.output(u'dont_add is %s' % dont_add)
                                elif item.claims['P54'][j].getTarget() == linked_item and len(item.claims['P54'][j].qualifiers) == 0:
                                    # No qualifiers for this item, so can overwrite it
                                    # Refers to the same item, use it for
                                    # new claim and move to the correct
                                    # position
                                    claim = item.claims['P54'][j]
                                    item.claims['P54'].insert(i-1, item.claims['P54'].pop(j))
                                    break
                            if dont_add is False:
                                if claim.getTarget() is None:
                                    # No existing claim was reused, set target
                                    claim.setTarget(linked_item)
                                    item.addClaim(claim)
                                # Save data and then add qualifiers
                                item.editEntity()
                                self.addQualifiers(claim, date_before, date_after,
                                                   qualifier_before, qualifier_after)
                        elif claim.getType() == 'string':
                            claim.setTarget(value.strip())
                        elif claim.getType() == 'commonsMedia':
                            commonssite = pywikibot.Site("commons", "commons")
                            imagelink = pywikibot.Link(value, source=commonssite, defaultNamespace=6)
                            image = pywikibot.ImagePage(imagelink)
                            if image.isRedirectPage():
                                image = pywikibot.ImagePage(image.getRedirectTarget())
                            if not image.exists():
                                pywikibot.output('[[%s]] doesn\'t exist so I can\'t link to it' % (image.title(),))
                                continue
                            claim.setTarget(image)
                            item.addClaim(claim)
                        else:
                            pywikibot.output("%s is not a supported datatype." % claim.getType())
                            continue
                        pywikibot.output('Adding %s --> %s' % (claim.getID(), claim.getTarget()))
                        # A generator might yield pages from multiple sites
                        # NOTE(review): dont_add is only assigned in the
                        # wikibase-item branch; for string/commonsMedia
                        # claims this check can raise UnboundLocalError.
                        if dont_add is False:
                            # Duplicate some code from page.py to avoid a API network request
                            # we don't want/need this here
                            source = self.getSource(page.site)
                            pywikibot.output(claim.sources)
                            source_test = collections.defaultdict(list)
                            source_test[source.getID()].append(source)
                            test_claim = pywikibot.Claim(self.repo, 'P54')
                            test_claim.sources.append(source_test)
                            if source and not self.listsEqual(test_claim.sources, claim.sources):
                                # Can only add sources if they don't already exist
                                # otherwise you get an API error
                                # todo: Do a diff between existing and new sources
                                # and add the differing sources
                                claim.addSource(source, bot=True)


def main():
    """
    Parse command-line arguments, build the page generator, and run the bot.

    Positional arguments come in pairs (template field name, property id);
    -template/-transcludes selects the template to harvest.
    """
    commandline_arguments = list()
    template_title = u''

    # Process global args and prepare generator args parser
    local_args = pywikibot.handleArgs()
    gen = pg.GeneratorFactory()

    for arg in local_args:
        if arg.startswith('-template'):
            if len(arg) == 9:
                template_title = pywikibot.input(
                    u'Please enter the template to work on:')
            else:
                template_title = arg[10:]
        elif gen.handleArg(arg):
            if arg.startswith(u'-transcludes:'):
                template_title = arg[13:]
        else:
            commandline_arguments.append(arg)

    if not template_title:
        pywikibot.error('Please specify either -template or -transcludes argument')
        return

    if len(commandline_arguments) % 2:
        raise ValueError  # or something.
    fields = dict()

    # Pair up positional args: field name -> property id.
    for i in range(0, len(commandline_arguments), 2):
        fields[commandline_arguments[i]] = commandline_arguments[i + 1]

    generator = gen.getCombinedGenerator()
    if not generator:
        # Fall back to all pages transcluding the template.
        gen.handleArg(u'-transcludes:' + template_title)
        generator = gen.getCombinedGenerator()

    bot = HarvestRobot(generator, template_title, fields)
    bot.run()

if __name__ == "__main__":
    main()