[FEAT] Add parser for <pages /> tag

Add parser for <pages /> tag, defined in ProofreadPage Extension.
This is used for Page transclusion in Wikisource.

See:
https://www.mediawiki.org/wiki/Help:Extension:ProofreadPage/Pages_tag

Change-Id: I62f67ba7e77d3bc1322456be47164ef449f3e03f
---
M tests/proofreadpage_tests.py
M pywikibot/proofreadpage.py
2 files changed, 357 insertions(+), 1 deletion(-)

diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 56181aa..eaec537 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -26,6 +26,7 @@
 #
 # Distributed under the terms of the MIT license.
 #
+import collections.abc
 import json
 import re
 import time
@@ -33,6 +34,7 @@
 from http import HTTPStatus
 from typing import Any, Optional, Union
 from urllib.parse import unquote
+from weakref import WeakKeyDictionary
 
 from requests.exceptions import ReadTimeout
 
@@ -46,6 +48,7 @@
     Sequence,
     Set,
     Tuple,
+    pairwise,
 )
 from pywikibot.comms import http
 from pywikibot.data.api import ListGenerator, Request
@@ -76,6 +79,248 @@
 _IndexType = Tuple[Optional['IndexPage'], List['IndexPage']]
 
 
+class TagAttr:
+    """Tag attribute of <pages />.
+
+    Represent a single attribute.
+    It is used internally in PagesTagParser() and shall not be used
+    stand-alone.
+
+    It manages string formatting output and conversion str <--> int and quotes.
+    Input value can only be str or int and shall have quotes or nothing.
+
+    >>> a = TagAttr('to', 3.0)
+    Traceback (most recent call last):
+      ...
+    TypeError: value=3.0 must be str or int.
+
+    >>> a = TagAttr('to', 'A123"')
+    Traceback (most recent call last):
+      ...
+    ValueError: value=A123" has wrong quotes.
+
+    >>> a = TagAttr('to', 3)
+    >>> a
+    TagAttr('to', 3)
+    >>> str(a)
+    'to=3'
+    >>> a.attr
+    'to'
+    >>> a.value
+    3
+
+    >>> a = TagAttr('to', '3')
+    >>> a
+    TagAttr('to', '3')
+    >>> str(a)
+    'to=3'
+    >>> a.attr
+    'to'
+    >>> a.value
+    3
+
+    >>> a = TagAttr('to', '"3"')
+    >>> a
+    TagAttr('to', '"3"')
+    >>> str(a)
+    'to="3"'
+    >>> a.value
+    3
+
+    >>> a = TagAttr('to', "'3'")
+    >>> a
+    TagAttr('to', "'3'")
+    >>> str(a)
+    "to='3'"
+    >>> a.value
+    3
+
+    >>> a = TagAttr('to', 'A123')
+    >>> a
+    TagAttr('to', 'A123')
+    >>> str(a)
+    'to=A123'
+    >>> a.value
+    'A123'
+    """
+
+    def __init__(self, attr, value):
+        """Initializer."""
+        self.attr = attr
+        self._value = self._convert(value)
+
+    def _convert(self, value):
+        """Handle conversion from str to int and quotes."""
+        if not isinstance(value, (str, int)):
+            raise TypeError(f'value={value} must be str or int.')
+
+        self._orig_value = value
+
+        if isinstance(value, str):
+            if (value.startswith('"') != value.endswith('"')
+                    or value.startswith("'") != value.endswith("'")):
+                raise ValueError(f'value={value} has wrong quotes.')
+            value = value.strip('"\'')
+            value = int(value) if value.isdigit() else value
+
+        return value
+
+    @property
+    def value(self):
+        """Attribute value."""
+        return self._value
+
+    @value.setter
+    def value(self, value):
+        self._value = self._convert(value)
+
+    def __str__(self):
+        attr = 'from' if self.attr == 'ffrom' else self.attr
+        return f'{attr}={self._orig_value}'
+
+    def __repr__(self):
+        attr = 'from' if self.attr == 'ffrom' else self.attr
+        return f"{self.__class__.__name__}('{attr}', {repr(self._orig_value)})"
+
+
+class TagAttrDesc:
+    """Descriptor managing a single <pages /> tag attribute."""
+
+    def __init__(self):
+        """Initializer."""
+        self.attrs = WeakKeyDictionary()
+
+    def __set_name__(self, owner, name):
+        self.public_name = name
+
+    def __get__(self, obj, objtype=None):
+        attr = self.attrs.get(obj)
+        return attr.value if attr is not None else None
+
+    def __set__(self, obj, value):
+        attr = self.attrs.get(obj)
+        if attr is not None:
+            attr.value = value
+        else:
+            self.attrs[obj] = TagAttr(self.public_name, value)
+
+    def __delete__(self, obj):
+        self.attrs.pop(obj, None)
+
+
+class PagesTagParser(collections.abc.Container):
+    """Parser for tag <pages />.
+
+    See https://www.mediawiki.org/wiki/Help:Extension:ProofreadPage/Pages_tag
+
+    Parse text and extract the first <pages ... /> tag.
+    Individual attributes will be accessible with dot notation.
+
+    >>> tp = PagesTagParser(
+    ... 'Text: <pages index="Index.pdf" from="first" to="last" />')
+    >>> tp
+    PagesTagParser('<pages index="Index.pdf" from="first" to="last" />')
+
+    Attributes can be modified via dot notation.
+    If an attribute is a number, it is converted to int.
+    Note: 'from' is represented as 'ffrom' due to conflict with keyword.
+    >>> tp.ffrom = 1; tp.to = '"3"'
+    >>> tp.ffrom
+    1
+    >>> tp.to
+    3
+
+    Quotes are stripped in the value and added back in the str representation.
+    Note that quotes are not mandatory.
+    >>> tp
+    PagesTagParser('<pages index="Index.pdf" from=1 to="3" />')
+
+    Attributes can be added via dot notation.
+    Order is fixed (same order as attribute definition in the class).
+    >>> tp.fromsection = '"A"'
+    >>> tp.fromsection
+    'A'
+    >>> tp
+    PagesTagParser('<pages index="Index.pdf" from=1 to="3" fromsection="A" />')
+
+    Attributes can be deleted.
+    >>> del tp.fromsection
+    >>> tp
+    PagesTagParser('<pages index="Index.pdf" from=1 to="3" />')
+
+    Attribute presence can be checked.
+    >>> 'to' in tp
+    True
+
+    >>> 'step' in tp
+    False
+    """
+
+    pat_tag = re.compile(r'<pages (?P<attrs>[^/]*?)/>')
+    tokens = (
+        'index',
+        'from',
+        'to',
+        'include',
+        'exclude',
+        'step',
+        'header',
+        'tosection',
+        'fromsection',
+        'onlysection',
+    )
+    tokens = '(' + '=|'.join(tokens) + '=)'
+    pat_attr = re.compile(tokens)
+
+    index = TagAttrDesc()
+    ffrom = TagAttrDesc()
+    to = TagAttrDesc()
+    include = TagAttrDesc()
+    exclude = TagAttrDesc()
+    step = TagAttrDesc()
+    header = TagAttrDesc()
+    tosection = TagAttrDesc()
+    fromsection = TagAttrDesc()
+    onlysection = TagAttrDesc()
+
+    def __init__(self, text):
+        """Initializer."""
+        m = self.pat_tag.search(text)
+        if m is None:
+            raise ValueError(f'Invalid text={text}')
+
+        tag = m['attrs']
+        matches = list(self.pat_attr.finditer(tag))
+        positions = [m.span()[0] for m in matches] + [len(tag)]
+
+        for begin, end in pairwise(positions):
+            attribute = tag[begin:end - 1]
+            attr, _, value = attribute.partition('=')
+            if attr == 'from':
+                attr = 'f' + attr
+            setattr(self, attr, value)
+
+    @classmethod
+    def get_descriptors(cls):
+        """Get TagAttrDesc descriptors."""
+        res = {k: v for k, v in cls.__dict__.items()
+               if isinstance(v, TagAttrDesc)}
+        return res
+
+    def __contains__(self, attr):
+        return getattr(self, attr) is not None
+
+    def __str__(self):
+        descriptors = self.get_descriptors().items()
+        attrs = [v.attrs.get(self) for k, v in descriptors
+                 if v.attrs.get(self) is not None]
+        attrs = ' '.join(str(attr) for attr in attrs)
+        return f'<pages {attrs} />' if attrs else '<pages />'
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}('{self}')"
+
+
 def decompose(fn: Callable) -> Callable:  # type: ignore
     """Decorator for ProofreadPage.
 
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 47b6965..44d6ef0 100755
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -13,7 +13,12 @@
 import pywikibot
 from pywikibot.data import api
 from pywikibot.exceptions import UnknownExtensionError
-from pywikibot.proofreadpage import IndexPage, ProofreadPage
+from pywikibot.proofreadpage import (
+    IndexPage,
+    PagesTagParser,
+    ProofreadPage,
+    TagAttr,
+)
 from pywikibot.tools import has_module
 from tests import unittest_print
 from tests.aspects import TestCase, require_modules
@@ -24,6 +29,97 @@
 from tests.utils import skipping
 
 
+class TestPagesTagParser(TestCase):
+    """Test TagAttr and PagesTagParser classes."""
+
+    net = False
+
+    def test_tag_attr_int(self):
+        """Test TagAttr for int values."""
+        attr = TagAttr('to', 3)
+        self.assertEqual(repr(attr), "TagAttr('to', 3)")
+        self.assertEqual(str(attr), 'to=3')
+        self.assertEqual(attr.attr, 'to')
+        self.assertEqual(attr.value, 3)
+
+    def test_tag_attr_srt_int(self):
+        """Test TagAttr for str values that can be converted to int."""
+        attr = TagAttr('to', '3')
+        self.assertEqual(repr(attr), "TagAttr('to', '3')")
+        self.assertEqual(str(attr), 'to=3')
+        self.assertEqual(attr.attr, 'to')
+        self.assertEqual(attr.value, 3)
+
+        attr.value = '"3"'
+        self.assertEqual(str(attr), 'to="3"')
+        self.assertEqual(repr(attr), """TagAttr('to', '"3"')""")
+        self.assertEqual(attr.value, 3)
+
+    def test_tag_attr_str(self):
+        """Test TagAttr for str value."""
+        attr = TagAttr('fromsection', 'A123')
+        self.assertEqual(repr(attr), "TagAttr('fromsection', 'A123')")
+        self.assertEqual(str(attr), 'fromsection=A123')
+        self.assertEqual(attr.attr, 'fromsection')
+        self.assertEqual(attr.value, 'A123')
+
+        attr.value = '"A123"'
+        self.assertEqual(repr(attr), """TagAttr('fromsection', '"A123"')""")
+        self.assertEqual(str(attr), 'fromsection="A123"')
+        self.assertEqual(attr.value, 'A123')
+
+        attr.value = "'A123'"
+        self.assertEqual(repr(attr), """TagAttr('fromsection', "'A123'")""")
+        self.assertEqual(str(attr), "fromsection='A123'")
+        self.assertEqual(attr.value, 'A123')
+
+    def test_tag_attr_exceptions(self):
+        """Test TagAttr for Exceptions."""
+        self.assertRaises(ValueError, TagAttr, 'fromsection', 'A123"')
+        self.assertRaises(TypeError, TagAttr, 'fromsection', 3.0)
+
+    def test_pages_tag_parser(self):
+        """Test PagesTagParser."""
+        tp = PagesTagParser('Text: <pages />')
+        self.assertEqual(repr(tp), "PagesTagParser('<pages />')")
+
+        text = 'Text: <pages from="first" to="last" />'
+        tp = PagesTagParser(text)
+        self.assertEqual(
+            repr(tp), """PagesTagParser('<pages from="first" to="last" />')""")
+        self.assertEqual(tp.ffrom, 'first')
+        self.assertEqual(tp.to, 'last')
+
+        tp.index = '"Index.pdf"'
+        self.assertEqual(tp.index, 'Index.pdf')
+
+        tp.ffrom, tp.to = 1, '"3"'
+        self.assertEqual(tp.ffrom, 1)
+        self.assertEqual(tp.to, 3)
+        self.assertEqual(str(tp), '<pages index="Index.pdf" from=1 to="3" />')
+
+        del tp.index
+        self.assertNotIn('index', tp)
+
+        tp.to = "'3'"
+        self.assertEqual(str(tp), """<pages from=1 to='3' />""")
+
+        tp.step = 3
+        self.assertEqual(str(tp), """<pages from=1 to='3' step=3 />""")
+        self.assertIn('step', tp)
+
+    def test_pages_tag_parser_exceptions(self):
+        """Test PagesTagParser Exceptions."""
+        text = """Text: <pages index="Index.pdf />"""
+        self.assertRaises(ValueError, PagesTagParser, text)
+
+        text = """Text: <pages index="Index.pdf' />"""
+        self.assertRaises(ValueError, PagesTagParser, text)
+
+        text = """Text: <pages index="Index.pdf from=C" />"""
+        self.assertRaises(ValueError, PagesTagParser, text)
+
+
 class TestProofreadPageInvalidSite(TestCase):
 
     """Test ProofreadPage class."""