jenkins-bot has submitted this change and it was merged.
Change subject: Fix NESTED_TEMPLATE_REGEX
......................................................................
Fix NESTED_TEMPLATE_REGEX
Improve performance of NESTED_TEMPLATE_REGEX.
Also fixes bugs preventing the regex capturing templates:
* with names containing numbers.
* with values containing non-adjacent single brackets
* captures arbitrary template levels safely
Bug: T63024
Bug: T105621
Change-Id: I61a92fb1b6d893de31fab738ab883af231917f4c
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 72 insertions(+), 10 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index a0b4d99..367d175 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -55,7 +55,7 @@
# It exists for backwards compatibility to the old 'TEMP_REGEX'
# which was the _ETP_REGEX.
TEMP_REGEX = DeprecatedRegex(r"""
-{{\s*(?:msg:)?\s*
+{{\s*(?:msg:\s*)?
(?P<name>[^{\|]+?)\s*
(?:\|(?P<params>[^{]*
(?:(?:{}|{{[A-Z]+(?:\:[^}])?}}|{{{[^}]+}}}) [^{]*)*
@@ -72,18 +72,26 @@
# Prefix msg: is not included in the 'name' group, but all others are
# included for backwards compatibility with TEMP_REGEX.
# Only parser functions using # are excluded.
+# When more than two levels of templates are found, this regex will
+# capture from the beginning of the first {{ to the end of the last }},
+# with wikitext between templates as part of the parameters of the first
+# template in the wikitext.
+# This ensures it falls back to a safe mode for replaceExcept, as it
+# ensures that any replacement will not occur within template text.
NESTED_TEMPLATE_REGEX = re.compile(r"""
-{{\s*(?:msg:)?\s*
- (?P<name>[^{\|#0-9][^{\|#0-9]*?)\s*
- (?:\|(?P<params>[^{]*
- (({{{[^}]+}}}
- |{{[^}|]+\|?[^}]*}}
- |{}
- ) [^{]*
- )*
- )?
+{{\s*(?:msg:\s*)?
+ (?P<name>[^{\|#0-9][^{\|#]*?)\s*
+ (?:\|(?P<params> [^{]*?
+ (({{{[^{}]+?}}}
+ |{{[^{}]+?}}
+ |{[^{}]*?}
+ ) [^{]*?
+ )*?
+ )?
)?
}}
+|
+(?P<unhandled_depth>{{\s*[^{\|#0-9][^{\|#]*?\s* [^{]* {{ .* }})
""", re.VERBOSE)
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index d6b8b69..1de1fa9 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -314,6 +314,10 @@
self.assertEqual(func('{{{a|b}}X}'),
[('a', OrderedDict((('1', 'b'), )))])
+ # sf.net bug 1575: unclosed template
+ self.assertEqual(func('{{a'), [])
+ self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])
+
def _etp_regex_differs(self, func):
"""Common cases not handled the same by
ETP_REGEX."""
self.assertEqual(func('{{a| b=c}}'), [('a', OrderedDict((('
b', 'c'), )))])
@@ -350,6 +354,18 @@
self._order_differs(func)
self._etp_regex_differs(func)
+ self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
+ [('c', OrderedDict((('1', '{{d}}'),
))),
+ ('a', OrderedDict([('1',
'{{c|{{d}}}}')])),
+ ('d', OrderedDict())
+ ])
+
+ self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
+ [('c', OrderedDict((('1',
'{{d|}}'), ))),
+ ('a', OrderedDict([('1',
'{{c|{{d|}}}}')])),
+ ('d', OrderedDict([('1', '')]))
+ ])
+
def test_extract_templates_params_regex(self):
"""Test using many complex regexes."""
func = functools.partial(textlib.extract_templates_and_params_regex,
@@ -367,6 +383,20 @@
func = textlib.extract_templates_and_params_regex
self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
[('a', OrderedDict((('b', ''), )))])
+
+ # Identical to mwpfh
+ self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
+ [('c', OrderedDict((('1', '{{d}}'),
))),
+ ('a', OrderedDict([('1',
'{{c|{{d}}}}')])),
+ ('d', OrderedDict())
+ ])
+
+ # However fails to correctly handle three levels of balanced brackets
+ # with empty parameters
+ self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
+ [('c', OrderedDict((('1',
'{{d|}}}'), ))),
+ ('d', OrderedDict([('1', '}')]))
+ ])
def test_extract_templates_params(self):
"""Test that the normal entry point works."""
@@ -404,6 +434,13 @@
[(u'a', OrderedDict([('1', u'{{b'),
('2', u'c}}}'),
('3', u'd')]))])
+
+ # Safe fallback to handle arbitrary template levels
+ # by merging top level templates together.
+ # i.e. 'b' is not recognised as a template, and 'foo' is also
+ # consumed as part of 'a'.
+ self.assertEqual(func('{{a|{{c|{{d|{{e|}}}} }} }} foo {{b}}'),
+ [(None, OrderedDict())])
def test_regexes(self):
"""_ETP_REGEX, NESTED_TEMPLATE_REGEX and TEMP_REGEX
tests."""
@@ -493,6 +530,17 @@
self.assertIsNotNone(func('{{a|{{c}} }}'))
self.assertIsNotNone(func('{{a|{{c|d}} }}'))
+
+ # All templates are captured when template depth is greater than 2
+ m = func('{{a|{{c|{{d|}} }} | foo = bar }} foo {{bar}} baz')
+ self.assertIsNotNone(m)
+ self.assertIsNotNone(m.group(0))
+ self.assertIsNone(m.group('name'))
+ self.assertIsNone(m.group(1))
+ self.assertIsNone(m.group('params'))
+ self.assertIsNone(m.group(2))
+ self.assertIsNotNone(m.group('unhandled_depth'))
+ self.assertTrue(m.group(0).endswith('foo {{bar}}'))
class TestReplaceLinks(TestCase):
@@ -962,6 +1010,12 @@
['template'], site=self.site),
'X' + template_sample[1:])
+ # sf.net bug 1575: unclosed template
+ template_sample = template_sample[:-2]
+ self.assertEqual(textlib.replaceExcept(template_sample, 'a',
'X',
+ ['template'], site=self.site),
+ 'X' + template_sample[1:])
+
def test_replace_source_reference(self):
"""Test replacing in text which contains back
references."""
# Don't use a valid reference number in the original string, in case it
--
To view, visit
https://gerrit.wikimedia.org/r/226531
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I61a92fb1b6d893de31fab738ab883af231917f4c
Gerrit-PatchSet: 11
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>