[Gerrit] pywikibot/core[master]: Allow whitespace at end of html tags - Pywikibot-commits

6 Apr 2018

jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/424385 )

Change subject: Allow whitespace at end of html tags
......................................................................

Allow whitespace at end of html tags

Whitespace is allowed after tag names in XML and HTML,
so regexps should also look for it.
Reg: https://www.w3.org/TR/REC-xml/#sec-starttags and
https://html.spec.whatwg.org/multipage/syntax.html#start-tags

Bug: T191559
Change-Id: I29d03e2ee7c1fc5278a8df5e05252529f10d5a5f
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 37 insertions(+), 33 deletions(-)

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 8f3da8a..0c0940b 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -244,48 +244,48 @@
 def _create_default_regexes():
     """Fill (and possibly overwrite) _regex_cache with default
regexes."""
     _regex_cache.update({
-        'comment':      re.compile(r'(?s)<!--.*?-->'),
+        'comment':    re.compile(r'(?s)<!--.*?-->'),
         # section headers
-        'header':       re.compile(r'(?m)^=+.+=+ *$'),
+        'header':     re.compile(r'(?m)^=+.+=+ *$'),
         # preformatted text
-        'pre':          re.compile(r'(?is)<pre[
>].*?</pre>'),
-        'source':       re.compile(r'(?is)<source
.*?</source>'),
-        'score':        re.compile(r'(?is)<score[
>].*?</score>'),
+        'pre':        re.compile(r'(?is)<pre[
>].*?</pre\s*>'),
+        'source':     re.compile(r'(?is)<source
.*?</source\s*>'),
+        'score':      re.compile(r'(?is)<score[
>].*?</score\s*>'),
         # inline references
-        'ref':          re.compile(r'(?is)<ref[
>].*?</ref>'),
-        'template':     NESTED_TEMPLATE_REGEX,
+        'ref':        re.compile(r'(?is)<ref[ >].*?</ref>'),
+        'template':   NESTED_TEMPLATE_REGEX,
         # lines that start with a space are shown in a monospace font and
         # have whitespace preserved.
-        'startspace':   re.compile(r'(?m)^ (.*?)$'),
+        'startspace': re.compile(r'(?m)^ (.*?)$'),
         # tables often have whitespace that is used to improve wiki
         # source code readability.
         # TODO: handle nested tables.
-        'table':        re.compile(r'(?ims)^{\|.*?^\|}|<table[
>].*?</table>'),
-        'hyperlink':    compileLinkR(),
-        'gallery':     
re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
+        'table':      re.compile(r'(?ims)'
+                                 r'^{\|.*?^\|}|<table[
>].*?</table\s*>'),
+        'hyperlink':  compileLinkR(),
+        'gallery':   
re.compile(r'(?is)<gallery.*?>.*?</gallery\s*>'),
         # this matches internal wikilinks, but also interwiki, categories, and
         # images.
-        'link':         re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
+        'link':       re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
         # also finds links to foreign sites with preleading ":"
-        'interwiki':    (r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*',
-                         lambda site: '|'.join(
-                             site.validLanguageLinks() +
-                             list(site.family.obsolete.keys()))),
+        'interwiki':  (r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*',
+                       lambda site: '|'.join(
+                           site.validLanguageLinks()
+                           + list(site.family.obsolete.keys()))),
         # Wikibase property inclusions
-        'property':     (r'(?i)\{\{\s*\#(?:%s):\s*p\d+.*?\}\}',
-                         lambda site: '|'.join(
-                             site.getmagicwords('property'))),
+        'property':   (r'(?i)\{\{\s*\#(?:%s):\s*p\d+.*?\}\}',
+                       lambda site:
'|'.join(site.getmagicwords('property'))),
         # Module invocations (currently only Lua)
-        'invoke':       (r'(?is)\{\{\s*\#(?:%s):.*?\}\}',
-                         lambda site:
'|'.join(site.getmagicwords('invoke'))),
+        'invoke':     (r'(?is)\{\{\s*\#(?:%s):.*?\}\}',
+                       lambda site:
'|'.join(site.getmagicwords('invoke'))),
         # categories
-        'category':     (r'\[\[ *(?:%s)\s*:.*?\]\]',
-                         lambda site: '|'.join(site.namespaces[14])),
+        'category':   (r'\[\[ *(?:%s)\s*:.*?\]\]',
+                       lambda site: '|'.join(site.namespaces[14])),
         # files
-        'file':         (FILE_LINK_REGEX,
-                         lambda site: '|'.join(site.namespaces[6])),
+        'file':       (FILE_LINK_REGEX,
+                       lambda site: '|'.join(site.namespaces[6])),
         # pagelist tag (used in Proofread extension).
-        'pagelist':      re.compile(r'(?is)<pagelist.*?/>'),
+        'pagelist':   re.compile(r'(?is)<pagelist.*?/>'),
     })
 
 
@@ -321,12 +321,12 @@
                 # nowiki, noinclude, includeonly, timeline, math and other
                 # extensions
                 _regex_cache[exc] = re.compile(
-                    r'(?is)<{0}>.*?</{0}>'.format(exc))
+                    r'(?is)<{0}\s*>.*?</{0}\s*>'.format(exc))
                 result.append(_regex_cache[exc])
             # handle alias
             if exc == 'source':
                 dontTouchRegexes.append(re.compile(
-                    r'(?is)<syntaxhighlight .*?</syntaxhighlight>'))
+                    r'(?is)<syntaxhighlight .*?</syntaxhighlight\s*>'))
         else:
             # assume it's a regular expression
             dontTouchRegexes.append(exc)
@@ -469,11 +469,11 @@
     """
     regexes = {
         'comments':        r'<!--.*?-->',
-        'includeonly':    
r'<includeonly>.*?</includeonly>',
-        'nowiki':          r'<nowiki>.*?</nowiki>',
-        'pre':             r'<pre>.*?</pre>',
-        'source':          r'<source .*?</source>',
-        'syntaxhighlight': r'<syntaxhighlight
.*?</syntaxhighlight>',
+        'includeonly':    
r'<includeonly\s*>.*?</includeonly\s*>',
+        'nowiki':          r'<nowiki\s*>.*?</nowiki\s*>',
+        'pre':             r'<pre\s*>.*?</pre\s*>',
+        'source':          r'<source .*?</source\s*>',
+        'syntaxhighlight': r'<syntaxhighlight
.*?</syntaxhighlight\s*>',
     }
     if '*' in tags:
         tags = list(regexes.keys())
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 073373c..bc9fa2e 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -1159,6 +1159,10 @@
         self.assertEqual(textlib.replaceExcept('<pre>x</pre>',
'x', 'y',
                                                ['pre'], site=self.site),
                          '<pre>x</pre>')
+        self.assertEqual(textlib.replaceExcept('<nowiki   >x</nowiki   
>x',
+                                               'x', 'y',
['nowiki'],
+                                               site=self.site),
+                         '<nowiki   >x</nowiki    >y')  # T191559
         self.assertEqual(textlib.replaceExcept('<source
lang="xml">x</source>',
                                                'x', 'y',
['source'],
                                                site=self.site),

-- 
To view, visit https://gerrit.wikimedia.org/r/424385
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I29d03e2ee7c1fc5278a8df5e05252529f10d5a5f
Gerrit-Change-Number: 424385
Gerrit-PatchSet: 4
Gerrit-Owner: Danmichaelo <danmichaelo(a)gmail.com>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: Zhuyifei1999 <zhuyifei1999(a)gmail.com>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444(a)gmail.com>
Gerrit-Reviewer: jenkins-bot <>