jenkins-bot has submitted this change and it was merged.
Change subject: Migrate XmlDumpPageGenerator to pagegenerators
......................................................................
Migrate XmlDumpPageGenerator to pagegenerators
Four scripts now use a generic page generator class:
- noreferences
- reflinks
- template
- weblinkchecker
Two scripts still have their own XML page generator:
- redirect
- replace
Bug: T85334
Change-Id: I5b6268673f5db5cc9506bc0e24ab70f72d9af573
---
M pywikibot/pagegenerators.py
M pywikibot/tools/__init__.py
M scripts/noreferences.py
M scripts/reflinks.py
M scripts/template.py
M scripts/weblinkchecker.py
M tests/reflinks_tests.py
7 files changed, 126 insertions(+), 148 deletions(-)
Approvals:
XZise: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index d9a4ead..9f91111 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -42,10 +42,11 @@
issue_deprecation_warning,
DequeGenerator,
intersect_generators,
+ IteratorNextMixin,
filter_unique,
)
-from pywikibot import date, config, i18n
+from pywikibot import date, config, i18n, xmlreader
from pywikibot.comms import http
from pywikibot.exceptions import ArgumentDeprecationWarning
@@ -2331,6 +2332,62 @@
yield page
+class XMLDumpOldPageGenerator(IteratorNextMixin):
+
+    """Xml generator that yields Page objects with old text loaded."""
+
+ @deprecated_args(xmlFilename='filename', xmlStart='start')
+ def __init__(self, filename, start=None, namespaces=[], site=None,
+ text_predicate=None):
+ """Constructor."""
+ # xmlFilename and xmlStart mapped to not break git blame
+ # use filename and start on new/changed lines
+ xmlFilename = filename
+ xmlStart = start
+
+ if text_predicate is None:
+ text_predicate = lambda text: True
+ self.text_predicate = text_predicate
+
+ self.xmlStart = xmlStart
+ self.namespaces = namespaces
+ self.skipping = bool(xmlStart)
+ self.site = site or pywikibot.Site()
+
+ dump = xmlreader.XmlDump(xmlFilename)
+ self.parser = dump.parse()
+
+ def __next__(self):
+ """Get next Page."""
+ while True:
+ try:
+ entry = next(self.parser)
+ except StopIteration:
+ raise
+ if self.skipping:
+ if entry.title != self.xmlStart:
+ continue
+ self.skipping = False
+ page = pywikibot.Page(self.site, entry.title)
+ if not self.namespaces == []:
+ if page.namespace() not in self.namespaces:
+ continue
+ if self.text_predicate(entry.text):
+ page.text = entry.text
+ return page
+
+
+class XMLDumpPageGenerator(XMLDumpOldPageGenerator):
+
+    """Xml generator that yields Page objects without text loaded."""
+
+ def __next__(self):
+ """Get next Page from dump and remove the text."""
+ page = super(XMLDumpPageGenerator, self).__next__()
+ del page.text
+ return page
+
+
def YearPageGenerator(start=1, end=2050, site=None):
"""
Year page generator.
diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 4655d91..4a93ca4 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -854,24 +854,27 @@
"""Unicode string with SelfCallMixin."""
-class DequeGenerator(collections.deque):
+class IteratorNextMixin(collections.Iterator):
+
+ """Backwards compatibility for Iterators."""
+
+ if PY2:
+
+ def next(self):
+ """Python 2 next."""
+ return self.__next__()
+
+
+class DequeGenerator(IteratorNextMixin, collections.deque):
     """A generator that allows items to be added during generating."""
- def __iter__(self):
- """Return the object which will be iterated."""
- return self
-
- def next(self):
+ def __next__(self):
"""Python 3 iterator method."""
if len(self):
return self.popleft()
else:
raise StopIteration
-
- def __next__(self):
- """Python 3 iterator method."""
- return self.next()
class ContextManagerWrapper(object):
diff --git a/scripts/noreferences.py b/scripts/noreferences.py
index 42e6dd0..e2d793d 100755
--- a/scripts/noreferences.py
+++ b/scripts/noreferences.py
@@ -45,9 +45,14 @@
import re
+from functools import partial
+
import pywikibot
from pywikibot import i18n, pagegenerators, textlib, Bot
+from pywikibot.pagegenerators import (
+ XMLDumpPageGenerator,
+)
# This is required for the text that is shown when you run this script
# with the parameter -help.
@@ -440,37 +445,18 @@
maintenance_category = 'cite_error_refs_without_references_category'
+_ref_regex = re.compile('</ref>', re.IGNORECASE)
+_references_regex = re.compile('<references.*?/>', re.IGNORECASE)
-class XmlDumpNoReferencesPageGenerator(object):
- """
- Generator which will yield Pages that might lack a references tag.
+def _match_xml_page_text(text):
+ """Match page text."""
+ text = textlib.removeDisabledParts(text)
+ return _ref_regex.search(text) and not _references_regex.search(text)
- These pages will be retrieved from a local XML dump file
- (pages-articles or pages-meta-current).
- """
- def __init__(self, xmlFilename):
- """
- Constructor.
-
- Arguments:
- * xmlFilename - The dump's path, either absolute or relative
- """
- self.xmlFilename = xmlFilename
- self.refR = re.compile('</ref>', re.IGNORECASE)
- # The references tab can contain additional spaces and a group
- # attribute.
- self.referencesR = re.compile('<references.*?/>', re.IGNORECASE)
-
- def __iter__(self):
- """XML iterator."""
- from pywikibot import xmlreader
- dump = xmlreader.XmlDump(self.xmlFilename)
- for entry in dump.parse():
- text = textlib.removeDisabledParts(entry.text)
- if self.refR.search(text) and not self.referencesR.search(text):
- yield pywikibot.Page(pywikibot.Site(), entry.title)
+XmlDumpNoReferencesPageGenerator = partial(
+ XMLDumpPageGenerator, text_predicate=_match_xml_page_text)
class NoReferencesBot(Bot):
@@ -488,8 +474,8 @@
self.site = pywikibot.Site()
self.comment = i18n.twtranslate(self.site, 'noreferences-add-tag')
- self.refR = re.compile('</ref>', re.IGNORECASE)
- self.referencesR = re.compile('<references.*?/>', re.IGNORECASE)
+ self.refR = _ref_regex
+ self.referencesR = _references_regex
         self.referencesTagR = re.compile('<references>.*?</references>',
                                          re.IGNORECASE | re.DOTALL)
try:
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 3035fc1..328b904 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -56,14 +56,18 @@
import sys
import io
+from functools import partial
+
import pywikibot
-from pywikibot import i18n, pagegenerators, textlib, xmlreader, Bot
+from pywikibot import i18n, pagegenerators, textlib, Bot
+from pywikibot.pagegenerators import (
+ XMLDumpPageGenerator as _XMLDumpPageGenerator,
+)
from pywikibot.tools.formatter import color_format
from scripts import noreferences
-# TODO: Convert to httlib2
if sys.version_info[0] > 2:
from urllib.parse import quote
from urllib.request import urlopen
@@ -185,41 +189,8 @@
# ( maintained by User:Dispenser )
listof404pages = '404-links.txt'
-
-class XmlDumpPageGenerator(object):
-
-    """Xml generator that yields pages containing bare references."""
-
- def __init__(self, xmlFilename, xmlStart, namespaces, site=None):
- self.xmlStart = xmlStart
- self.namespaces = namespaces
- self.skipping = bool(xmlStart)
- self.site = site or pywikibot.Site()
-
- dump = xmlreader.XmlDump(xmlFilename)
- self.parser = dump.parse()
-
- def __iter__(self):
- return self
-
- def next(self):
- while True:
- try:
- entry = next(self.parser)
- except StopIteration:
- raise
- if self.skipping:
- if entry.title != self.xmlStart:
- continue
- self.skipping = False
- page = pywikibot.Page(self.site, entry.title)
- if not self.namespaces == []:
- if page.namespace() not in self.namespaces:
- continue
- if linksInRef.search(entry.text):
- return page
-
- __next__ = next
+XmlDumpPageGenerator = partial(
+ _XMLDumpPageGenerator, text_predicate=linksInRef.search)
class RefLink(object):
diff --git a/scripts/template.py b/scripts/template.py
index facda7c..e789176 100755
--- a/scripts/template.py
+++ b/scripts/template.py
@@ -118,12 +118,14 @@
import pywikibot
-from pywikibot import i18n, pagegenerators, xmlreader, Bot
+from pywikibot import i18n, pagegenerators, Bot
from pywikibot.exceptions import ArgumentDeprecationWarning
+from pywikibot.pagegenerators import XMLDumpPageGenerator
+
from scripts.replace import ReplaceRobot as ReplaceBot
-class XmlDumpTemplatePageGenerator(object):
+class XmlDumpTemplatePageGenerator(XMLDumpPageGenerator):
"""
Generator which yields Pages that transclude a template.
@@ -144,11 +146,7 @@
"""
self.templates = templates
self.xmlfilename = xmlfilename
-
- def __iter__(self):
-        """Yield page objects until the entire XML dump has been read."""
mysite = pywikibot.Site()
- dump = xmlreader.XmlDump(self.xmlfilename)
# regular expression to find the original template.
# {{vfd}} does the same thing as {{Vfd}}, so both will be found.
# The old syntax, {{msg:vfd}}, will also be found.
@@ -164,10 +162,9 @@
templateRegex = re.compile(
r'\{\{ *([mM][sS][gG]:)?(?:%s) *(?P<parameters>\|[^}]+|) *}}'
% '|'.join(templatePatterns))
- for entry in dump.parse():
- if templateRegex.search(entry.text):
- page = pywikibot.Page(mysite, entry.title)
- yield page
+
+ super(XmlDumpTemplatePageGenerator, self).__init__(
+ xmlfilename, site=mysite, text_predicate=templateRegex.search)
class TemplateRobot(ReplaceBot):
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index 23afde0..015f108 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -108,6 +108,7 @@
import time
import sys
+from functools import partial
from warnings import warn
try:
@@ -117,8 +118,11 @@
import pywikibot
-from pywikibot import i18n, config, pagegenerators, textlib, xmlreader, weblib
+from pywikibot import i18n, config, pagegenerators, textlib, weblib
from pywikibot.bot import ExistingPageBot, SingleSiteBot
+from pywikibot.pagegenerators import (
+ XMLDumpPageGenerator as _XMLDumpPageGenerator,
+)
from pywikibot.tools.formatter import color_format
# TODO: Convert to httlib2
@@ -247,48 +251,8 @@
yield m.group('urlb')
-class XmlDumpPageGenerator(object):
-
-    """Xml generator that yiels pages containing a web link."""
-
- def __init__(self, xmlFilename, xmlStart, namespaces):
- self.xmlStart = xmlStart
- self.namespaces = namespaces
- self.skipping = bool(xmlStart)
- self.site = pywikibot.Site()
-
- dump = xmlreader.XmlDump(xmlFilename)
- self.parser = dump.parse()
-
- def __iter__(self):
- return self
-
- def next(self):
- try:
- for entry in self.parser:
- if self.skipping:
- if entry.title != self.xmlStart:
- continue
- self.skipping = False
- page = pywikibot.Page(self.site, entry.title)
- if self.namespaces:
- if page.namespace() not in self.namespaces:
- continue
- found = False
- for url in weblinksIn(entry.text):
- found = True
- if found:
- return page
- except KeyboardInterrupt:
- try:
- if not self.skipping:
- pywikibot.output(
-                    u'To resume, use "-xmlstart:%s" on the command line.'
- % entry.title)
- except NameError:
- pass
-
- __next__ = next
+XmlDumpPageGenerator = partial(
+ _XMLDumpPageGenerator, text_predicate=weblinksIn)
class NotAnURLError(BaseException):
diff --git a/tests/reflinks_tests.py b/tests/reflinks_tests.py
index 603371f..f154738 100644
--- a/tests/reflinks_tests.py
+++ b/tests/reflinks_tests.py
@@ -27,8 +27,8 @@
def test_non_bare_ref_urls(self):
         """Test pages without bare references are not processed."""
gen = XmlDumpPageGenerator(
- xmlFilename=join_xml_data_path('article-pear-0.10.xml'),
- xmlStart=u'Pear',
+ filename=join_xml_data_path('article-pear-0.10.xml'),
+ start='Pear',
namespaces=[0, 1],
site=self.get_site())
pages = list(gen)
@@ -37,8 +37,8 @@
def test_simple_bare_refs(self):
         """Test simple bare references in multiple namespaces."""
gen = XmlDumpPageGenerator(
- xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
- xmlStart=u'Fake page',
+ filename=join_xml_data_path('dummy-reflinks.xml'),
+ start='Fake page',
namespaces=[0, 1],
site=self.get_site())
pages = list(gen)
@@ -48,8 +48,8 @@
def test_namespace_empty_list(self):
"""Test namespaces=[] processes all namespaces."""
gen = XmlDumpPageGenerator(
- xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
- xmlStart=u'Fake page',
+ filename=join_xml_data_path('dummy-reflinks.xml'),
+ start=u'Fake page',
namespaces=[],
site=self.get_site())
pages = list(gen)
@@ -60,8 +60,8 @@
def test_namespace_None(self):
         """Test namespaces=None processes all namespaces."""
gen = XmlDumpPageGenerator(
- xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
- xmlStart=u'Fake page',
+ filename=join_xml_data_path('dummy-reflinks.xml'),
+ start='Fake page',
namespaces=None,
site=self.get_site())
pages = list(gen)
@@ -72,8 +72,8 @@
def test_namespace_string_ids(self):
"""Test namespaces with ids as string."""
gen = XmlDumpPageGenerator(
- xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
- xmlStart=u'Fake page',
+ filename=join_xml_data_path('dummy-reflinks.xml'),
+ start='Fake page',
namespaces=["0", "1"],
site=self.get_site())
pages = list(gen)
@@ -83,8 +83,8 @@
def test_namespace_names(self):
"""Test namespaces with namespace names."""
gen = XmlDumpPageGenerator(
- xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
- xmlStart=u'Fake page',
+ filename=join_xml_data_path('dummy-reflinks.xml'),
+ start='Fake page',
namespaces=["Talk"],
site=self.get_site())
pages = list(gen)
@@ -95,8 +95,8 @@
def test_start_with_underscore(self):
"""Test with underscore in start page title."""
gen = XmlDumpPageGenerator(
- xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
- xmlStart=u'Fake_page',
+ filename=join_xml_data_path('dummy-reflinks.xml'),
+ start='Fake_page',
namespaces=[0, 1],
site=self.get_site())
pages = list(gen)
@@ -106,8 +106,8 @@
def test_without_start(self):
"""Test without a start page title."""
gen = XmlDumpPageGenerator(
- xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
- xmlStart=None,
+ filename=join_xml_data_path('dummy-reflinks.xml'),
+ start=None,
namespaces=[0, 1],
site=self.get_site())
pages = list(gen)
@@ -118,8 +118,8 @@
def test_start_prefix(self):
"""Test with a prefix as a start page title."""
gen = XmlDumpPageGenerator(
- xmlFilename=join_xml_data_path('dummy-reflinks.xml'),
- xmlStart='Fake',
+ filename=join_xml_data_path('dummy-reflinks.xml'),
+ start='Fake',
namespaces=[0, 1],
site=self.get_site())
pages = list(gen)
--
To view, visit
https://gerrit.wikimedia.org/r/239658
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I5b6268673f5db5cc9506bc0e24ab70f72d9af573
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>