jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/370664 )
Change subject: [IMPR] Share claimit.py logic with harvest_template.py
......................................................................
[IMPR] Share claimit.py logic with harvest_template.py
It is now possible to import another value for a property that the
item already has, using -exists:p. Each imported claim can have its
own 'exists' setting, or all claims can share the same (global)
one.
Additionally, because the presence of a property is no longer a reason
to skip adding a value, and also for performance reasons, I'm
dropping the safeguard that skips the page when the item already
has all the properties to be harvested.
Bug: T72702
Bug: T76391
Change-Id: I4b4bb9cd2f7427b2226c05212b27d0113163464f
---
M pywikibot/bot.py
M scripts/claimit.py
M scripts/harvest_template.py
3 files changed, 151 insertions(+), 134 deletions(-)
Approvals:
jenkins-bot: Verified
Xqt: Looks good to me, approved
Ejegg: Looks good to me, but someone else must approve
diff --git a/pywikibot/bot.py b/pywikibot/bot.py
index 1d95652..3444389 100644
--- a/pywikibot/bot.py
+++ b/pywikibot/bot.py
@@ -2000,6 +2000,69 @@
source.setTarget(item)
return source
+ def user_add_claim_unless_exists(
+ self, item, claim, exists_arg='', source=None,
+ logger_callback=log, **kwargs):
+ """
+ Decorator of L{user_add_claim}.
+
+ Before adding a new claim, it checks if we can add it, using provided
+ filters.
+
+ @see: documentation of L{claimit.py<scripts.claimit>}
+ @param exists_arg: pattern for merging existing claims with new ones
+ @type exists_arg: str
+ @param logger_callback: function logging the output of the method
+ @type logger_callback: callable
+ @return: whether the claim could be added
+ @rtype: bool
+ """
+ # Existing claims on page of same property
+ for existing in item.get().get('claims').get(claim.getID(), []):
+ # If claim with same property already exists...
+ if 'p' not in exists_arg:
+ logger_callback(
+ 'Skipping %s because claim with same property already
exists'
+ % (claim.getID(),))
+ log('Use -exists:p option to override this behavior')
+ return False
+ if not existing.target_equals(claim.getTarget()):
+ continue
+ # If some attribute of the claim being added
+ # matches some attribute in an existing claim of
+ # the same property, skip the claim, unless the
+ # 'exists' argument overrides it.
+ if 't' not in exists_arg:
+ logger_callback(
+ 'Skipping %s because claim with same target already exists'
+ % (claim.getID(),))
+ log("Append 't' to -exists argument to override this
behavior")
+ return False
+ if 'q' not in exists_arg and not existing.qualifiers:
+ logger_callback(
+ 'Skipping %s because claim without qualifiers already
exists'
+ % (claim.getID(),))
+ log("Append 'q' to -exists argument to override this
behavior")
+ return False
+ if ('s' not in exists_arg or not source) and not existing.sources:
+ logger_callback(
+ 'Skipping %s because claim without source already exists'
+ % (claim.getID(),))
+ log("Append 's' to -exists argument to override this
behavior")
+ return False
+ if ('s' not in exists_arg and source and
+ any(source.getID() in ref and
+ all(snak.target_equals(source.getTarget())
+ for snak in ref[source.getID()])
+ for ref in existing.sources)):
+ logger_callback(
+ 'Skipping %s because claim with the same source already
exists'
+ % (claim.getID(),))
+ log("Append 's' to -exists argument to override this
behavior")
+ return False
+
+ return self.user_add_claim(item, claim, source, **kwargs)
+
def create_item_for_page(self, page, data=None, summary=None, **kwargs):
"""
Create an ItemPage with the provided page as the sitelink.
diff --git a/scripts/claimit.py b/scripts/claimit.py
index 1f213d0..caf75ec 100755
--- a/scripts/claimit.py
+++ b/scripts/claimit.py
@@ -92,60 +92,10 @@
def treat_page_and_item(self, page, item):
"""Treat each page."""
- # The generator might yield pages from multiple sites
- source = self.getSource(page.site)
-
for claim in self.claims:
- # Existing claims on page of same property
- for existing in item.claims.get(claim.getID(), []):
- # If claim with same property already exists...
- if 'p' not in self.exists_arg:
- pywikibot.log(
- 'Skipping %s because claim with same property already
exists'
- % (claim.getID(),))
- pywikibot.log(
- 'Use -exists:p option to override this behavior')
- break
- if not existing.target_equals(claim.getTarget()):
- continue
- # If some attribute of the claim being added
- # matches some attribute in an existing claim of
- # the same property, skip the claim, unless the
- # 'exists' argument overrides it.
- if 't' not in self.exists_arg:
- pywikibot.log(
- 'Skipping %s because claim with same target already
exists'
- % (claim.getID(),))
- pywikibot.log(
- "Append 't' to -exists argument to override this
behavior")
- break
- if 'q' not in self.exists_arg and not existing.qualifiers:
- pywikibot.log(
- 'Skipping %s because claim without qualifiers already
exists'
- % (claim.getID(),))
- pywikibot.log(
- "Append 'q' to -exists argument to override this
behavior")
- break
- if ('s' not in self.exists_arg or not source) and not
existing.sources:
- pywikibot.log(
- 'Skipping %s because claim without source already
exists'
- % (claim.getID(),))
- pywikibot.log(
- "Append 's' to -exists argument to override this
behavior")
- break
- if ('s' not in self.exists_arg and source and
- any(source.getID() in ref and
- all(snak.target_equals(source.getTarget())
- for snak in ref[source.getID()])
- for ref in existing.sources)):
- pywikibot.log(
- 'Skipping %s because claim with the same source already
exists'
- % (claim.getID(),))
- pywikibot.log(
- "Append 's' to -exists argument to override this
behavior")
- break
- else:
- self.user_add_claim(item, claim, page.site)
+ # The generator might yield pages from multiple sites
+ self.user_add_claim_unless_exists(
+ item, claim, self.exists_arg, page.site)
def main(*args):
diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index 7303524..9d76d23 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -34,6 +34,12 @@
-islink Treat plain text values as links ("text" ->
"[[text]]").
+-exists If set to 'p', add a new value, even if the item already
+ has the imported property but not the imported value.
+ If set to 'pt', add a new value, even if the item already
+ has the imported property with the imported value and
+ some qualifiers.
+
Examples:
python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \
@@ -60,6 +66,14 @@
-template:"Infobox person" birth_place P19 -islink death_place P20
will do the same but only "birth_place" can be imported without a link.
+
+ python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \
+ -template:"Infobox person" occupation P106 -exists:p
+
+ will import an occupation from "occupation" parameter of "Infobox
+ person" on English Wikipedia as Wikidata property "P106" (occupation). The
+ page won't be skipped if the item already has that property but there is
+ not the new value.
"""
#
@@ -100,6 +114,7 @@
availableOptions = {
'islink': False,
+ 'exists': '',
}
@@ -121,10 +136,14 @@
@type islink: bool
@keyword create: Whether to create a new item if it's missing
@type create: bool
+ @keyword exists: pattern for merging existing claims with harvested
+ values
+ @type exists: str
"""
self.availableOptions.update({
'always': True,
'create': False,
+ 'exists': '',
'islink': False,
})
super(HarvestRobot, self).__init__(**kwargs)
@@ -197,25 +216,19 @@
"""
Compare bot's (global) and provided (local) options.
- @see: L{pywikibot.bot.OptionHandler.getOption}
-
- @rtype: bool
+ @see: L{OptionHandler.getOption}
"""
- # TODO: only works with booleans
default = self.getOption(option)
local = handler.getOption(option)
- return default is not local
+ if isinstance(default, bool) and isinstance(local, bool):
+ return default is not local
+ else:
+ return local or default
def treat_page_and_item(self, page, item):
"""Process a single page/item."""
if willstop:
raise KeyboardInterrupt
- item.get()
- if set(val[0] for val in self.fields.values()) <= set(
- item.claims.keys()):
- pywikibot.output('%s item %s has claims for all properties. '
- 'Skipping.' % (page, item.title()))
- return
templates = page.raw_extracted_templates
for (template, fielddict) in templates:
@@ -228,80 +241,71 @@
"Failed parsing template; '%s' should be the template
name."
% template)
continue
+
+ if template not in self.templateTitles:
+ continue
# We found the template we were looking for
- if template in self.templateTitles:
- for field, value in fielddict.items():
- field = field.strip()
- value = value.strip()
- if not field or not value:
+ for field, value in fielddict.items():
+ field = field.strip()
+ value = value.strip()
+ if not field or not value:
+ continue
+
+ if field not in self.fields:
+ continue
+
+ # This field contains something useful for us
+ prop, options = self.fields[field]
+ claim = pywikibot.Claim(self.repo, prop)
+ if claim.type == 'wikibase-item':
+ # Try to extract a valid page
+ match = pywikibot.link_regex.search(value)
+ if match:
+ link_text = match.group(1)
+ else:
+ if self._get_option_with_fallback(options, 'islink'):
+ link_text = value
+ else:
+ pywikibot.output(
+ '%s field %s value %s is not a wikilink. '
+ 'Skipping.' % (claim.getID(), field, value))
+ continue
+
+ linked_item = self._template_link_target(item, link_text)
+ if not linked_item:
continue
- # This field contains something useful for us
- if field in self.fields:
- prop, options = self.fields[field]
- # Check if the property isn't already set
- claim = pywikibot.Claim(self.repo, prop)
- if claim.getID() in item.get().get('claims'):
- pywikibot.output(
- 'A claim for %s already exists. Skipping.'
- % claim.getID())
- # TODO: Implement smarter approach to merging
- # harvested values with existing claims esp.
- # without overwriting humans unintentionally.
- else:
- if claim.type == 'wikibase-item':
- # Try to extract a valid page
- match = pywikibot.link_regex.search(value)
- if match:
- link_text = match.group(1)
- else:
- if self._get_option_with_fallback(
- options, 'islink'):
- link_text = value
- else:
- pywikibot.output(
- '%s field %s value %s is not a '
- 'wikilink. Skipping.'
- % (claim.getID(), field, value))
- continue
+ claim.setTarget(linked_item)
+ elif claim.type in ('string', 'external-id'):
+ claim.setTarget(value.strip())
+ elif claim.type == 'url':
+ match = self.linkR.search(value)
+ if not match:
+ continue
+ claim.setTarget(match.group('url'))
+ elif claim.type == 'commonsMedia':
+ commonssite = pywikibot.Site('commons', 'commons')
+ imagelink = pywikibot.Link(
+ value, source=commonssite, defaultNamespace=6)
+ image = pywikibot.FilePage(imagelink)
+ if image.isRedirectPage():
+ image = pywikibot.FilePage(image.getRedirectTarget())
+ if not image.exists():
+ pywikibot.output(
+ "{0} doesn't exist. I can't link to it"
+ ''.format(image.title(asLink=True)))
+ continue
+ claim.setTarget(image)
+ else:
+ pywikibot.output('%s is not a supported datatype.'
+ % claim.type)
+ continue
- linked_item = self._template_link_target(
- item, link_text)
- if not linked_item:
- continue
-
- claim.setTarget(linked_item)
- elif claim.type in ('string',
'external-id'):
- claim.setTarget(value.strip())
- elif claim.type == 'url':
- match = self.linkR.search(value)
- if not match:
- continue
- claim.setTarget(match.group('url'))
- elif claim.type == 'commonsMedia':
- commonssite = pywikibot.Site('commons',
- 'commons')
- imagelink = pywikibot.Link(value,
- source=commonssite,
- defaultNamespace=6)
- image = pywikibot.FilePage(imagelink)
- if image.isRedirectPage():
- image = pywikibot.FilePage(
- image.getRedirectTarget())
- if not image.exists():
- pywikibot.output(
- "{0} doesn't exist. I can't link to
it"
- ''.format(image.title(asLink=True)))
- continue
- claim.setTarget(image)
- else:
- pywikibot.output(
- '%s is not a supported datatype.'
- % claim.type)
- continue
-
- # A generator might yield pages from multiple sites
- self.user_add_claim(item, claim, page.site)
+ # A generator might yield pages from multiple sites
+ self.user_add_claim_unless_exists(
+ item, claim, self._get_option_with_fallback(
+ options, 'exists'),
+ page.site, pywikibot.output)
def main(*args):
--
To view, visit
https://gerrit.wikimedia.org/r/370664
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I4b4bb9cd2f7427b2226c05212b27d0113163464f
Gerrit-PatchSet: 10
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Matěj Suchánek <matejsuchanek97(a)gmail.com>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: Ejegg <ejegg(a)ejegg.com>
Gerrit-Reviewer: JAn Dudík <jan.dudik(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <Ladsgroup(a)gmail.com>
Gerrit-Reviewer: Magul <tomasz.magulski(a)gmail.com>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: XXN <dan10real(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>