jenkins-bot has submitted this change and it was merged.
( https://gerrit.wikimedia.org/r/424385 )
Change subject: Allow whitespace at end of html tags
......................................................................
Allow whitespace at end of html tags
Whitespace is allowed after the tag name in XML and HTML tags
(e.g. "<nowiki >" or "</nowiki >"), so the regexps should also accept it.
References:
https://www.w3.org/TR/REC-xml/#sec-starttags and
https://html.spec.whatwg.org/multipage/syntax.html#start-tags
Bug: T191559
Change-Id: I29d03e2ee7c1fc5278a8df5e05252529f10d5a5f
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 37 insertions(+), 33 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 8f3da8a..0c0940b 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -244,48 +244,48 @@
def _create_default_regexes():
"""Fill (and possibly overwrite) _regex_cache with default
regexes."""
_regex_cache.update({
- 'comment': re.compile(r'(?s)<!--.*?-->'),
+ 'comment': re.compile(r'(?s)<!--.*?-->'),
# section headers
- 'header': re.compile(r'(?m)^=+.+=+ *$'),
+ 'header': re.compile(r'(?m)^=+.+=+ *$'),
# preformatted text
- 'pre': re.compile(r'(?is)<pre[
>].*?</pre>'),
- 'source': re.compile(r'(?is)<source
.*?</source>'),
- 'score': re.compile(r'(?is)<score[
>].*?</score>'),
+ 'pre': re.compile(r'(?is)<pre[
>].*?</pre\s*>'),
+ 'source': re.compile(r'(?is)<source
.*?</source\s*>'),
+ 'score': re.compile(r'(?is)<score[
>].*?</score\s*>'),
# inline references
- 'ref': re.compile(r'(?is)<ref[
>].*?</ref>'),
- 'template': NESTED_TEMPLATE_REGEX,
+ 'ref': re.compile(r'(?is)<ref[ >].*?</ref>'),
+ 'template': NESTED_TEMPLATE_REGEX,
# lines that start with a space are shown in a monospace font and
# have whitespace preserved.
- 'startspace': re.compile(r'(?m)^ (.*?)$'),
+ 'startspace': re.compile(r'(?m)^ (.*?)$'),
# tables often have whitespace that is used to improve wiki
# source code readability.
# TODO: handle nested tables.
- 'table': re.compile(r'(?ims)^{\|.*?^\|}|<table[
>].*?</table>'),
- 'hyperlink': compileLinkR(),
- 'gallery':
re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
+ 'table': re.compile(r'(?ims)'
+ r'^{\|.*?^\|}|<table[
>].*?</table\s*>'),
+ 'hyperlink': compileLinkR(),
+ 'gallery':
re.compile(r'(?is)<gallery.*?>.*?</gallery\s*>'),
# this matches internal wikilinks, but also interwiki, categories, and
# images.
- 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
+ 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
# also finds links to foreign sites with preleading ":"
- 'interwiki': (r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*',
- lambda site: '|'.join(
- site.validLanguageLinks() +
- list(site.family.obsolete.keys()))),
+ 'interwiki': (r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*',
+ lambda site: '|'.join(
+ site.validLanguageLinks()
+ + list(site.family.obsolete.keys()))),
# Wikibase property inclusions
- 'property': (r'(?i)\{\{\s*\#(?:%s):\s*p\d+.*?\}\}',
- lambda site: '|'.join(
- site.getmagicwords('property'))),
+ 'property': (r'(?i)\{\{\s*\#(?:%s):\s*p\d+.*?\}\}',
+ lambda site:
'|'.join(site.getmagicwords('property'))),
# Module invocations (currently only Lua)
- 'invoke': (r'(?is)\{\{\s*\#(?:%s):.*?\}\}',
- lambda site:
'|'.join(site.getmagicwords('invoke'))),
+ 'invoke': (r'(?is)\{\{\s*\#(?:%s):.*?\}\}',
+ lambda site:
'|'.join(site.getmagicwords('invoke'))),
# categories
- 'category': (r'\[\[ *(?:%s)\s*:.*?\]\]',
- lambda site: '|'.join(site.namespaces[14])),
+ 'category': (r'\[\[ *(?:%s)\s*:.*?\]\]',
+ lambda site: '|'.join(site.namespaces[14])),
# files
- 'file': (FILE_LINK_REGEX,
- lambda site: '|'.join(site.namespaces[6])),
+ 'file': (FILE_LINK_REGEX,
+ lambda site: '|'.join(site.namespaces[6])),
# pagelist tag (used in Proofread extension).
- 'pagelist': re.compile(r'(?is)<pagelist.*?/>'),
+ 'pagelist': re.compile(r'(?is)<pagelist.*?/>'),
})
@@ -321,12 +321,12 @@
# nowiki, noinclude, includeonly, timeline, math and other
# extensions
_regex_cache[exc] = re.compile(
- r'(?is)<{0}>.*?</{0}>'.format(exc))
+ r'(?is)<{0}\s*>.*?</{0}\s*>'.format(exc))
result.append(_regex_cache[exc])
# handle alias
if exc == 'source':
dontTouchRegexes.append(re.compile(
- r'(?is)<syntaxhighlight .*?</syntaxhighlight>'))
+ r'(?is)<syntaxhighlight .*?</syntaxhighlight\s*>'))
else:
# assume it's a regular expression
dontTouchRegexes.append(exc)
@@ -469,11 +469,11 @@
"""
regexes = {
'comments': r'<!--.*?-->',
- 'includeonly':
r'<includeonly>.*?</includeonly>',
- 'nowiki': r'<nowiki>.*?</nowiki>',
- 'pre': r'<pre>.*?</pre>',
- 'source': r'<source .*?</source>',
- 'syntaxhighlight': r'<syntaxhighlight
.*?</syntaxhighlight>',
+ 'includeonly':
r'<includeonly\s*>.*?</includeonly\s*>',
+ 'nowiki': r'<nowiki\s*>.*?</nowiki\s*>',
+ 'pre': r'<pre\s*>.*?</pre\s*>',
+ 'source': r'<source .*?</source\s*>',
+ 'syntaxhighlight': r'<syntaxhighlight
.*?</syntaxhighlight\s*>',
}
if '*' in tags:
tags = list(regexes.keys())
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 073373c..bc9fa2e 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -1159,6 +1159,10 @@
self.assertEqual(textlib.replaceExcept('<pre>x</pre>',
'x', 'y',
['pre'], site=self.site),
'<pre>x</pre>')
+ self.assertEqual(textlib.replaceExcept('<nowiki >x</nowiki
>x',
+ 'x', 'y',
['nowiki'],
+ site=self.site),
+ '<nowiki >x</nowiki >y') # T191559
self.assertEqual(textlib.replaceExcept('<source
lang="xml">x</source>',
'x', 'y',
['source'],
site=self.site),
--
To view, visit
https://gerrit.wikimedia.org/r/424385
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I29d03e2ee7c1fc5278a8df5e05252529f10d5a5f
Gerrit-Change-Number: 424385
Gerrit-PatchSet: 4
Gerrit-Owner: Danmichaelo <danmichaelo(a)gmail.com>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: Zhuyifei1999 <zhuyifei1999(a)gmail.com>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444(a)gmail.com>
Gerrit-Reviewer: jenkins-bot <>