jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
[FEAT] Add parser for <pages /> tag

Add parser for <pages /> tag, defined in ProofreadPage Extension.
This is used for Page transclusion in Wikisource.

https: //

Change-Id: I62f67ba7e77d3bc1322456be47164ef449f3e03f
M tests/
M pywikibot/
2 files changed, 357 insertions(+), 1 deletion(-)

diff --git a/pywikibot/ b/pywikibot/
index 56181aa..eaec537 100644
--- a/pywikibot/
+++ b/pywikibot/
@@ -26,6 +26,7 @@
# Distributed under the terms of the MIT license.
import json
import re
import time
@@ -33,6 +34,7 @@
from http import HTTPStatus
from typing import Any, Optional, Union
from urllib.parse import unquote
+from weakref import WeakKeyDictionary

from requests.exceptions import ReadTimeout

@@ -46,6 +48,7 @@
+ pairwise,
from pywikibot.comms import http
from import ListGenerator, Request
@@ -76,6 +79,248 @@
_IndexType = Tuple[Optional['IndexPage'], List['IndexPage']]

class TagAttr:

    """Tag attribute of <pages />.

    Represent a single attribute. It is used internally in
    PagesTagParser() and shall not be used stand-alone.

    It manages string formatting output and conversion str <--> int and
    quotes. Input value can only be str or int and shall have quotes or
    nothing.

    >>> a = TagAttr('to', 3.0)
    Traceback (most recent call last):
      ...
    TypeError: value=3.0 must be str or int.

    >>> a = TagAttr('to', 'A123"')
    Traceback (most recent call last):
      ...
    ValueError: value=A123" has wrong quotes.

    >>> a = TagAttr('to', 3)
    >>> a
    TagAttr('to', 3)
    >>> str(a)
    'to=3'
    >>> a.attr
    'to'
    >>> a.value
    3

    >>> a = TagAttr('to', '3')
    >>> a
    TagAttr('to', '3')
    >>> str(a)
    'to=3'
    >>> a.attr
    'to'
    >>> a.value
    3

    >>> a = TagAttr('to', '"3"')
    >>> a
    TagAttr('to', '"3"')
    >>> str(a)
    'to="3"'
    >>> a.value
    3

    >>> a = TagAttr('to', "'3'")
    >>> a
    TagAttr('to', "'3'")
    >>> str(a)
    "to='3'"
    >>> a.value
    3

    >>> a = TagAttr('to', 'A123')
    >>> a
    TagAttr('to', 'A123')
    >>> str(a)
    'to=A123'
    >>> a.value
    'A123'
    """

    def __init__(self, attr, value):
        """Initializer.

        :param attr: attribute name; 'ffrom' is rendered as 'from'
        :param value: attribute value as int, or as str with optional
            matching single or double quotes
        :raises TypeError: value is neither str nor int
        :raises ValueError: value has unbalanced quotes
        """
        self.attr = attr
        self._value = self._convert(value)

    def _convert(self, value):
        """Handle conversion from str to int and quotes.

        The raw input is remembered in ``_orig_value`` so that the
        string representation can give back the quoting style that was
        supplied. It is only stored once the input has been validated, so
        a rejected assignment leaves the previous state untouched.

        :return: the unquoted value, as int if it is a decimal number
        :raises TypeError: value is neither str nor int
        :raises ValueError: value has unbalanced quotes
        """
        if not isinstance(value, (str, int)):
            raise TypeError(f'value={value} must be str or int.')

        if isinstance(value, int):
            self._orig_value = value
            return value

        for quote in ('"', "'"):
            # A lone quote character is both the first and the last
            # character, so it has to be rejected explicitly.
            if (value.startswith(quote) != value.endswith(quote)
                    or value == quote):
                raise ValueError(f'value={value} has wrong quotes.')

        self._orig_value = value
        stripped = value.strip('"\'')
        # isdecimal() only accepts characters that int() can parse;
        # isdigit() also accepts e.g. superscript digits, which int()
        # rejects.
        return int(stripped) if stripped.isdecimal() else stripped

    @property
    def value(self):
        """Attribute value, without quotes and converted to int if numeric."""
        return self._value

    @value.setter
    def value(self, value):
        self._value = self._convert(value)

    def _tag_name(self):
        """Return the attribute name as written in the tag."""
        return 'from' if self.attr == 'ffrom' else self.attr

    def __str__(self):
        return f'{self._tag_name()}={self._orig_value}'

    def __repr__(self):
        return (f"{self.__class__.__name__}"
                f"('{self._tag_name()}', {self._orig_value!r})")
class TagAttrDesc:

    """A descriptor tag.

    Data descriptor that exposes the value of a TagAttr as a plain
    attribute of the owner instance. One TagAttr per owner instance is
    kept in a WeakKeyDictionary keyed by that instance.
    """

    def __init__(self):
        """Initializer."""
        self.attrs = WeakKeyDictionary()

    def __set_name__(self, owner, name):
        # Remember the name the descriptor is bound to in the owner class;
        # it is passed to TagAttr as the attribute name.
        self.public_name = name

    def __get__(self, obj, objtype=None):
        if obj is None:
            # Class-level access: None cannot be weakly referenced, so
            # looking it up in the WeakKeyDictionary would raise TypeError.
            return self
        attr = self.attrs.get(obj)
        return None if attr is None else attr.value

    def __set__(self, obj, value):
        attr = self.attrs.get(obj)
        if attr is None:
            self.attrs[obj] = TagAttr(self.public_name, value)
        else:
            attr.value = value

    def __delete__(self, obj):
        # Deleting an attribute that was never set is not an error.
        self.attrs.pop(obj, None)
# NOTE(review): the base-class list of this class is cut off in the
# reviewed text; confirm whether a base class has to be restored here.
class PagesTagParser:

    """Parser for tag <pages />.

    The tag is defined in the ProofreadPage extension.

    Parse text and extract the first <pages ... /> tag.
    Individual attributes will be accessible with dot notation.

    >>> tp = PagesTagParser(
    ... 'Text: <pages index="Index.pdf" from="first" to="last" />')
    >>> tp
    PagesTagParser('<pages index="Index.pdf" from="first" to="last" />')

    Attributes can be modified via dot notation.
    If an attribute is a number, it is converted to int.
    Note: 'from' is represented as 'ffrom' due to conflict with keyword.

    >>> tp.ffrom = 1; tp.to = '"3"'
    >>> tp.ffrom
    1
    >>> tp.to
    3

    Quotes are stripped in the value and added back in the str
    representation. Note that quotes are not mandatory.

    >>> tp
    PagesTagParser('<pages index="Index.pdf" from=1 to="3" />')

    Attributes can be added via dot notation.
    Order is fixed (same order as attribute definition in the class).

    >>> tp.fromsection = '"A"'
    >>> tp.fromsection
    'A'
    >>> tp
    PagesTagParser('<pages index="Index.pdf" from=1 to="3" fromsection="A" />')

    Attributes can be deleted.

    >>> del tp.fromsection
    >>> tp
    PagesTagParser('<pages index="Index.pdf" from=1 to="3" />')

    Attribute presence can be checked.

    >>> 'to' in tp
    True
    >>> 'step' in tp
    False
    """

    pat_tag = re.compile(r'<pages (?P<attrs>[^/]*?)/>')

    tokens = (
        'index',
        'from',
        'to',
        'include',
        'exclude',
        'step',
        'header',
        'tosection',
        'fromsection',
        'onlysection',
    )
    # Build one alternation that matches any '<name>=' prefix.
    tokens = '(' + '=|'.join(tokens) + '=)'
    pat_attr = re.compile(tokens)

    index = TagAttrDesc()
    ffrom = TagAttrDesc()
    to = TagAttrDesc()
    include = TagAttrDesc()
    exclude = TagAttrDesc()
    step = TagAttrDesc()
    header = TagAttrDesc()
    tosection = TagAttrDesc()
    fromsection = TagAttrDesc()
    onlysection = TagAttrDesc()

    def __init__(self, text):
        """Initializer.

        :param text: text containing a <pages ... /> tag
        :raises ValueError: no <pages ... /> tag is found in text, or an
            attribute value has unbalanced quotes
        """
        found = self.pat_tag.search(text)
        if found is None:
            raise ValueError(f'Invalid text={text}')

        tag = found['attrs']
        # Every attribute spans from the start of its 'name=' token to the
        # start of the next one (or to the end of the attribute string).
        starts = [hit.start() for hit in self.pat_attr.finditer(tag)]
        starts.append(len(tag))
        for begin, end in pairwise(starts):
            # Strip surrounding whitespace instead of dropping exactly one
            # character, so that a missing or repeated separator does not
            # corrupt the value.
            attr, _, value = tag[begin:end].strip().partition('=')
            if attr == 'from':
                attr = 'f' + attr
            setattr(self, attr, value)

    @classmethod
    def get_descriptors(cls):
        """Get TagAttrDesc descriptors.

        :return: dict mapping the attribute names defined in this class
            to their TagAttrDesc, in definition order
        """
        return {name: member for name, member in cls.__dict__.items()
                if isinstance(member, TagAttrDesc)}

    def __contains__(self, attr):
        # Accept the tag spelling 'from' as well as the Python name 'ffrom'
        # and only look at tag attributes, so that unknown names give
        # False instead of raising AttributeError.
        if attr == 'from':
            attr = 'ffrom'
        descriptor = self.get_descriptors().get(attr)
        return descriptor is not None and descriptor.attrs.get(self) is not None

    def __str__(self):
        stored = (desc.attrs.get(self)
                  for desc in self.get_descriptors().values())
        attrs = ' '.join(str(attr) for attr in stored if attr is not None)
        return f'<pages {attrs} />' if attrs else '<pages />'

    def __repr__(self):
        return f"{self.__class__.__name__}('{self}')"
def decompose(fn: Callable) -> Callable: # type: ignore
"""Decorator for ProofreadPage.

diff --git a/tests/ b/tests/
index 47b6965..44d6ef0 100755
--- a/tests/
+++ b/tests/
@@ -13,7 +13,12 @@
import pywikibot
from import api
from pywikibot.exceptions import UnknownExtensionError
-from pywikibot.proofreadpage import IndexPage, ProofreadPage
+from pywikibot.proofreadpage import (
+ IndexPage,
+ PagesTagParser,
+ ProofreadPage,
+ TagAttr,
from import has_module
from tests import unittest_print
from tests.aspects import TestCase, require_modules
@@ -24,6 +29,97 @@
from tests.utils import skipping

class TestPagesTagParser(TestCase):

    """Test TagAttr and PagesTagParser classes."""

    net = False

    def test_tag_attr_int(self):
        """Test TagAttr for int values."""
        attr = TagAttr('to', 3)
        self.assertEqual(repr(attr), "TagAttr('to', 3)")
        self.assertEqual(str(attr), 'to=3')
        self.assertEqual(attr.attr, 'to')
        self.assertEqual(attr.value, 3)

    def test_tag_attr_srt_int(self):
        """Test TagAttr for str values that can be converted to int."""
        attr = TagAttr('to', '3')
        self.assertEqual(repr(attr), "TagAttr('to', '3')")
        self.assertEqual(str(attr), 'to=3')
        self.assertEqual(attr.attr, 'to')
        self.assertEqual(attr.value, 3)

        attr.value = '"3"'
        self.assertEqual(str(attr), 'to="3"')
        self.assertEqual(repr(attr), """TagAttr('to', '"3"')""")
        self.assertEqual(attr.value, 3)

    def test_tag_attr_str(self):
        """Test TagAttr for str value."""
        attr = TagAttr('fromsection', 'A123')
        self.assertEqual(repr(attr), "TagAttr('fromsection', 'A123')")
        self.assertEqual(str(attr), 'fromsection=A123')
        self.assertEqual(attr.attr, 'fromsection')
        self.assertEqual(attr.value, 'A123')

        attr.value = '"A123"'
        self.assertEqual(repr(attr), """TagAttr('fromsection', '"A123"')""")
        self.assertEqual(str(attr), 'fromsection="A123"')
        self.assertEqual(attr.value, 'A123')

        attr.value = "'A123'"
        self.assertEqual(repr(attr), """TagAttr('fromsection', "'A123'")""")
        self.assertEqual(str(attr), "fromsection='A123'")
        self.assertEqual(attr.value, 'A123')

    def test_tag_attr_exceptions(self):
        """Test TagAttr for Exceptions."""
        self.assertRaises(ValueError, TagAttr, 'fromsection', 'A123"')
        self.assertRaises(TypeError, TagAttr, 'fromsection', 3.0)

    def test_pages_tag_parser(self):
        """Test PagesTagParser."""
        tp = PagesTagParser('Text: <pages />')
        self.assertEqual(repr(tp), "PagesTagParser('<pages />')")

        text = 'Text: <pages from="first" to="last" />'
        tp = PagesTagParser(text)
        self.assertEqual(
            repr(tp), """PagesTagParser('<pages from="first" to="last" />')""")
        self.assertEqual(tp.ffrom, 'first')
        self.assertEqual(tp.to, 'last')

        # Attributes set after parsing are rendered in class definition
        # order, so 'index' comes first.
        tp.index = '"Index.pdf"'
        self.assertEqual(tp.index, 'Index.pdf')
        tp.ffrom, tp.to = 1, '"3"'
        self.assertEqual(tp.ffrom, 1)
        self.assertEqual(tp.to, 3)
        self.assertEqual(str(tp), '<pages index="Index.pdf" from=1 to="3" />')

        del tp.index
        self.assertNotIn('index', tp)

        tp.to = "'3'"
        self.assertEqual(str(tp), """<pages from=1 to='3' />""")

        tp.step = 3
        self.assertEqual(str(tp), """<pages from=1 to='3' step=3 />""")
        self.assertIn('step', tp)

    def test_pages_tag_parser_exceptions(self):
        """Test PagesTagParser Exceptions."""
        # Missing closing quote.
        text = """Text: <pages index="Index.pdf />"""
        self.assertRaises(ValueError, PagesTagParser, text)

        # Opening and closing quotes differ.
        text = """Text: <pages index="Index.pdf' />"""
        self.assertRaises(ValueError, PagesTagParser, text)

        # Closing quote placed after the following attribute.
        text = """Text: <pages index="Index.pdf from=C" />"""
        self.assertRaises(ValueError, PagesTagParser, text)
class TestProofreadPageInvalidSite(TestCase):

"""Test ProofreadPage class."""

To view, visit change 871240. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I62f67ba7e77d3bc1322456be47164ef449f3e03f
Gerrit-Change-Number: 871240
Gerrit-PatchSet: 12
Gerrit-Owner: Mpaa <>
Gerrit-Reviewer: Xqt <>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged