jenkins-bot has submitted this change and it was merged.
Change subject: Improve performance for replaceExcept
......................................................................
Improve performance for replaceExcept
Improving performance for replaceExcept in textlib:
* Avoid recompiling regexes. This is done using a cache.
* Early terminate the method if the replacement isn't relevant.
This avoids the large overhead of checking exceptions, parsing the new
text, etc., in the common case where the replacement isn't relevant.
Change-Id: I65196e3a5748f950dce2037d2cb72c775a4c07dc
---
M CREDITS
M pywikibot/textlib.py
2 files changed, 99 insertions(+), 61 deletions(-)
Approvals:
John Vandenberg: Looks good to me, but someone else must approve
XZise: Looks good to me, approved
jenkins-bot: Verified
diff --git a/CREDITS b/CREDITS
index 5807c00..205b51e 100644
--- a/CREDITS
+++ b/CREDITS
@@ -22,6 +22,7 @@
DMaggot
DrTrigon
Egon
+Eranroz
Erwin
Felix Reimann
Filnik
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 6a57668..170f1bd 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -33,6 +33,9 @@
from pywikibot.family import Family
from pywikibot.tools import OrderedDict
+# cache for replaceExcept to avoid recompiling regexes on each call
+_regex_cache = {}
+
TEMP_REGEX = re.compile(
r'{{(?:msg:)?(?P<name>[^{\|]+?)(?:\|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?))?}}')
@@ -79,6 +82,94 @@
return s
+def _create_default_regexes():
+ """Fill (and possibly overwrite) _regex_cache with default regexes."""
+ _regex_cache.update({
+ 'comment': re.compile(r'(?s)<!--.*?-->'),
+ # section headers
+ 'header': re.compile(r'\r?\n=+.+=+ *\r?\n'),
+ # preformatted text
+ 'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
+ 'source': re.compile(r'(?is)<source .*?</source>'),
+ # inline references
+ 'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
+ # lines that start with a space are shown in a monospace font and
+ # have whitespace preserved.
+ 'startspace': re.compile(r'(?m)^ (.*?)$'),
+ # tables often have whitespace that is used to improve wiki
+ # source code readability.
+ # TODO: handle nested tables.
+ 'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
+ 'hyperlink': compileLinkR(),
+ 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
+ # this matches internal wikilinks, but also interwiki, categories, and
+ # images.
+ 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
+    # also finds links to foreign sites with a leading ":"
+ 'interwiki': (r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*',
+ lambda site: '|'.join(
+ site.validLanguageLinks() +
+ list(site.family.obsolete.keys()))),
+ # Wikibase property inclusions
+ 'property': re.compile(r'(?i)\{\{\s*#property:\s*p\d+\s*\}\}'),
+ # Module invocations (currently only Lua)
+ 'invoke': re.compile(r'(?i)\{\{\s*#invoke:.*?}\}'),
+ # categories
+ 'category': ('\[\[ *(?:%s)\s*:.*?\]\]',
+ lambda site: '|'.join(site.namespaces[14])),
+ # files
+ 'file': ('\[\[ *(?:%s)\s*:.*?\]\]',
+ lambda site: '|'.join(site.namespaces[6])),
+ })
+
+
+def _get_regexes(keys, site):
+ """Fetch compiled regexes."""
+ if site is None:
+ site = pywikibot.Site()
+
+ if not _regex_cache:
+ _create_default_regexes()
+
+ result = []
+    # 'dontTouchRegexes' exists only to reduce the git blame diff.
+ dontTouchRegexes = result
+
+ for exc in keys:
+ if isinstance(exc, basestring):
+ # assume the string is a reference to a standard regex above,
+ # which may not yet have a site specific re compiled.
+ if exc in _regex_cache:
+ if type(_regex_cache[exc]) is tuple:
+ if (exc, site) not in _regex_cache:
+ re_text, re_var = _regex_cache[exc]
+ _regex_cache[(exc, site)] = re.compile(
+ re_text % re_var(site))
+
+ result.append(_regex_cache[(exc, site)])
+ else:
+ result.append(_regex_cache[exc])
+ elif exc == 'template':
+ # template is not supported by this method.
+ pass
+ else:
+            # nowiki, noinclude, includeonly, timeline, math and other
+ # extensions
+ if exc not in _regex_cache:
+ _regex_cache[exc] = re.compile(r'(?is)<%s>.*?</%s>'
+ % (exc, exc))
+ result.append(_regex_cache[exc])
+ # handle alias
+ if exc == 'source':
+ dontTouchRegexes.append(re.compile(
+ r'(?is)<syntaxhighlight .*?</syntaxhighlight>'))
+ else:
+ # assume it's a regular expression
+ dontTouchRegexes.append(exc)
+
+ return result
+
+
def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
allowoverlap=False, marker='', site=None):
"""
@@ -102,45 +193,6 @@
if nothing is changed, it is added at the end
"""
- if site is None:
- site = pywikibot.Site()
-
- exceptionRegexes = {
- 'comment': re.compile(r'(?s)<!--.*?-->'),
- # section headers
- 'header': re.compile(r'\r?\n=+.+=+ *\r?\n'),
- # preformatted text
- 'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
- 'source': re.compile(r'(?is)<source .*?</source>'),
- # inline references
- 'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
- # lines that start with a space are shown in a monospace font and
- # have whitespace preserved.
- 'startspace': re.compile(r'(?m)^ (.*?)$'),
- # tables often have whitespace that is used to improve wiki
- # source code readability.
- # TODO: handle nested tables.
- 'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
- 'hyperlink': compileLinkR(),
- 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
- # this matches internal wikilinks, but also interwiki, categories, and
- # images.
- 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
- # also finds links to foreign sites with preleading ":"
- 'interwiki': re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
- % '|'.join(site.validLanguageLinks() +
- list(site.family.obsolete.keys()))),
- # Wikibase property inclusions
- 'property': re.compile(r'(?i)\{\{\s*#property:\s*p\d+\s*\}\}'),
- # Module invocations (currently only Lua)
- 'invoke': re.compile(r'(?i)\{\{\s*#invoke:.*?}\}'),
- # categories
- 'category': re.compile(u'\[\[ *(?:%s)\s*:.*?\]\]' % u'|'.join(site.namespace(14, all=True))),
- # files
- 'file': re.compile(u'\[\[ *(?:%s)\s*:.*?\]\]' % u'|'.join(site.namespace(6, all=True))),
-
- }
-
# if we got a string, compile it as a regular expression
if isinstance(old, basestring):
if caseInsensitive:
@@ -148,28 +200,13 @@
else:
old = re.compile(old)
- dontTouchRegexes = []
- except_templates = False
- for exc in exceptions:
- if isinstance(exc, basestring):
- # assume it's a reference to the exceptionRegexes dictionary
- # defined above.
- if exc in exceptionRegexes:
- dontTouchRegexes.append(exceptionRegexes[exc])
- elif exc == 'template':
- except_templates = True
- else:
- # nowiki, noinclude, includeonly, timeline, math ond other
- # extensions
- dontTouchRegexes.append(re.compile(r'(?is)<%s>.*?</%s>'
- % (exc, exc)))
- # handle alias
- if exc == 'source':
- dontTouchRegexes.append(re.compile(
- r'(?is)<syntaxhighlight .*?</syntaxhighlight>'))
- else:
- # assume it's a regular expression
- dontTouchRegexes.append(exc)
+ # early termination if not relevant
+ if not old.search(text):
+ return text + marker
+
+ dontTouchRegexes = _get_regexes(exceptions, site)
+
+ except_templates = 'template' in exceptions
# mark templates
# don't care about mw variables and parser functions
--
To view, visit https://gerrit.wikimedia.org/r/206091
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I65196e3a5748f950dce2037d2cb72c775a4c07dc
Gerrit-PatchSet: 14
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Eranroz <eranroz89(a)gmail.com>
Gerrit-Reviewer: Eranroz <eranroz89(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Ricordisamoa <ricordisamoa(a)openmailbox.org>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>
jenkins-bot has submitted this change and it was merged.
Change subject: [IMPROV] Link: Use iwmap for parse_site
......................................................................
[IMPROV] Link: Use iwmap for parse_site
Instead of a statically configured dict Link.parse_site is using the
interwikimap reported by the API. This changes the output marginally as
the tuple can now contain None (when there is no such configured site).
Bug: T97932
Change-Id: I9a85925e26e1dadb1c60d00d45e7ad0cfad3272f
---
M pywikibot/family.py
M pywikibot/page.py
2 files changed, 17 insertions(+), 13 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/family.py b/pywikibot/family.py
index 5f5d1b2..e7f97e4 100644
--- a/pywikibot/family.py
+++ b/pywikibot/family.py
@@ -871,6 +871,9 @@
issue_deprecation_warning('nocapitalize',
"APISite.siteinfo['case'] or "
"Namespace.case == 'case-sensitive'", 2)
+ elif name == 'known_families':
+ issue_deprecation_warning('known_families',
+ 'APISite.interwiki(prefix)', 2)
return super(Family, self).__getattribute__(name)
@staticmethod
diff --git a/pywikibot/page.py b/pywikibot/page.py
index bd98c0c..5195f59 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -4426,14 +4426,17 @@
return "pywikibot.page.Link(%r, %r)" % (self.title, self.site)
def parse_site(self):
- """Parse only enough text to determine which site the link points to.
+ """
+ Parse only enough text to determine which site the link points to.
This method does not parse anything after the first ":"; links
with multiple interwiki prefixes (such as "wikt:fr:Parlais") need
to be re-parsed on the first linked wiki to get the actual site.
- @return: tuple of (family-name, language-code) for the linked site.
-
+ @return: The family name and site code for the linked site. If the site
+ is not supported by the configured families it returns None instead
+ of a str.
+ @rtype: str or None, str or None
"""
t = self._text
fam = self._source.family
@@ -4453,16 +4456,14 @@
if prefix in fam.langs:
# prefix is a language code within the source wiki family
return (fam.name, prefix)
- known = fam.get_known_families(site=self._source)
- if prefix in known:
- if known[prefix] == fam.name:
- # interwiki prefix links back to source family
- t = t[t.index(u":") + 1:].lstrip(u" ")
- # strip off the prefix and retry
- continue
- # prefix is a different wiki family
- return (known[prefix], code)
- break
+ try:
+ newsite = self._source.interwiki(prefix)
+ except KeyError:
+ break # text before : doesn't match any known prefix
+ except SiteDefinitionError:
+ return (None, None)
+ else:
+ return (newsite.family.name, newsite.code)
return (fam.name, code) # text before : doesn't match any known prefix
def parse(self):
--
To view, visit https://gerrit.wikimedia.org/r/208510
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9a85925e26e1dadb1c60d00d45e7ad0cfad3272f
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>
Build Update for wikimedia/pywikibot-core
-------------------------------------
Build: #2179
Status: Errored
Duration: 30 minutes and 22 seconds
Commit: 671c1d3 (master)
Author: John Vandenberg
Message: Consume global arg -dir in handle_args
-dir: is processed in config2.py
If not consumed by handle_args, it is returned as a local arg,
and breaks generate-user-files which reports an error when
there are any local args given.
Change-Id: Ic549617ec6204ae342e792c4baff7ee44f725c17
View the changeset: https://github.com/wikimedia/pywikibot-core/compare/3d62b59c0bc8...671c1d3d…
View the full build log and details: https://travis-ci.org/wikimedia/pywikibot-core/builds/61022104
--
You can configure recipients for build notifications in your .travis.yml file. See http://docs.travis-ci.com/user/notifications