jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/445761 )
Change subject: [IMPR] Provide additional information for deprecation warning
......................................................................
[IMPR] Provide additional information for deprecation warning
- also decrease stack level to show the line causing the warning
- use DiscussionPage.site when calling extract_sections in archivebot.py
Bug: T199605
Change-Id: I8282d06fe2e848e844e8c018704988d7d06b62b3
---
M pywikibot/textlib.py
M scripts/archivebot.py
2 files changed, 3 insertions(+), 2 deletions(-)
Approvals:
Dalba: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 9a83cd4..3c836cd 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -315,7 +315,8 @@
if not site and exc in ('interwiki', 'property', 'invoke',
'category', 'file'):
issue_deprecation_warning(
- 'site=None', 'a valid site', 3)
+ 'site=None',
+ "a valid site for '{}' regex".format(exc), 2)
site = pywikibot.Site()
if (exc, site) not in _regex_cache:
diff --git a/scripts/archivebot.py b/scripts/archivebot.py
index fe907da..fcc08de 100755
--- a/scripts/archivebot.py
+++ b/scripts/archivebot.py
@@ -462,7 +462,7 @@
text = re.sub(r'^===', marker + r'===', text, flags=re.M)
# Find threads, avoid archiving categories or interwiki
- header, threads, footer = extract_sections(text)
+ header, threads, footer = extract_sections(text, self.site)
header = header.replace(marker, '')
if header and footer:
self.header = '\n\n'.join((header.rstrip(), footer, ''))
--
To view, visit https://gerrit.wikimedia.org/r/445761
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I8282d06fe2e848e844e8c018704988d7d06b62b3
Gerrit-Change-Number: 445761
Gerrit-PatchSet: 3
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444(a)gmail.com>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/445768 )
Change subject: textlib.extract_sections: Not everything after a language link is footer
......................................................................
textlib.extract_sections: Not everything after a language link is footer
- Move some parts of extract_sections into independent functions:
_extract_sections and _extract_headings.
- Use namedtuples for readability.
- Remove the parts that assumed everything after a language link or
everything after a category is footer. (T199539) Instead the footer
is calculated as the last section's ending language links, categories,
or whitespace.
Bug: T199539
Change-Id: I74daaa9613522c4cbbc67eb7caea7e1b910c5739
---
M pywikibot/textlib.py
1 file changed, 44 insertions(+), 58 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 6544292..54968cf 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -13,7 +13,7 @@
#
from __future__ import absolute_import, unicode_literals
-from collections import OrderedDict
+from collections import OrderedDict, namedtuple
try:
from collections.abc import Sequence
except ImportError: # Python 2.7
@@ -828,6 +828,38 @@
# -------------------------------
# Functions dealing with sections
# -------------------------------
+_Heading = namedtuple('_Heading', ('text', 'start', 'end'))
+_Section = namedtuple('_Section', ('title', 'content'))
+
+
+def _extract_headings(text, site):
+ """Return _Heading objects."""
+ headings = []
+ heading_regex = _get_regexes(['header'], site)[0]
+ for match in heading_regex.finditer(text):
+ start, end = match.span()
+ if not isDisabled(text, start) and not isDisabled(text, end):
+ headings.append(_Heading(match.group(), start, end))
+ return headings
+
+
+def _extract_sections(text, headings):
+ """Return _Section objects."""
+ if headings:
+ # Assign them their contents
+ contents = []
+ for i, heading in enumerate(headings):
+ try:
+ next_heading = headings[i + 1]
+ except IndexError:
+ contents.append(text[heading.end:])
+ else:
+ contents.append(text[heading.end:next_heading.start])
+ return [_Section(heading.text, content)
+ for heading, content in zip(headings, contents)]
+ return []
+
+
def extract_sections(text, site=None):
"""
Return section headings and contents found in text.
@@ -859,63 +891,17 @@
@rtype: tuple of (str, list of tuples, str)
"""
- headings = []
- contents = []
- body = []
-
- # Find valid headings
- heading_regex = _get_regexes(['header'], site)[0]
- pos = 0
- while True:
- match = heading_regex.search(text[pos:])
- if not match:
- break
- start = pos + match.start()
- end = pos + match.end()
- if not (isDisabled(text, start)
- or isDisabled(text, end)):
- headings += [(match.group(), start, end)]
- pos = end
-
- if headings:
- # Assign them their contents
- for i, current in enumerate(headings):
- try:
- following = headings[i + 1]
- except IndexError:
- following = None
- if following:
- contents.append(text[current[2]:following[1]])
- else:
- contents.append(text[current[2]:])
- body = [(heading[0], section)
- for heading, section in zip(headings, contents)]
-
+ headings = _extract_headings(text, site)
+ sections = _extract_sections(text, headings)
# Find header and footer contents
- header = text[:headings[0][1]] if headings else text
-
- last_section = body[-1][1] if body else header
- skippings = ['category', 'interwiki']
- footer_regexes = _get_regexes(skippings, site)
- # we want only interwikis, not interlanguage links
- footer_regexes[1] = re.compile(
- footer_regexes[1].pattern.replace(':?', ''))
- # find where to cut
- positions = []
- for reg in footer_regexes:
- match = reg.search(last_section)
- if match:
- positions.append(match.start())
- pos = min(pos for pos in positions) if positions else len(last_section)
-
- # Strip footer from last section content
- last_section, footer = last_section[:pos], last_section[pos:]
- if body:
- body[-1] = (body[-1][0], last_section)
- else:
- header = last_section
-
- return header, body, footer
+ header = text[:headings[0].start] if headings else text
+ last_section_contents = sections[-1].content if sections else header
+ cat_regex, interwiki_regex = _get_regexes(('category', 'interwiki'), site)
+ langlink_pattern = interwiki_regex.pattern.replace(':?', '')
+ footer = re.search(
+ r'(%s)*\Z' % r'|'.join((langlink_pattern, cat_regex.pattern, r'\s+')),
+ last_section_contents).group().strip()
+ return header, sections, footer
# -----------------------------------------------
@@ -2006,7 +1992,7 @@
It does not care whether a section string may contain spaces or
underlines. Both will match.
- If a section parameter contains a internal link, it will match the
+ If a section parameter contains an internal link, it will match the
section with or without a preceding colon which is required for a
text link e.g. for categories and files.
--
To view, visit https://gerrit.wikimedia.org/r/445768
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I74daaa9613522c4cbbc67eb7caea7e1b910c5739
Gerrit-Change-Number: 445768
Gerrit-PatchSet: 5
Gerrit-Owner: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444(a)gmail.com>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/445763 )
Change subject: textlib.does_text_contain_section: Do not expect colon to be always escaped
......................................................................
textlib.does_text_contain_section: Do not expect colon to be always escaped
Python 3.7's re.escape does not escape the colon (':') character.
Bug: T199604
Change-Id: I1f8f2d0e254f421e76b97647fa93aface7ff9e72
---
M pywikibot/textlib.py
1 file changed, 1 insertion(+), 1 deletion(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 9a83cd4..6544292 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -2017,7 +2017,7 @@
"""
# match preceding colon for text links
- section = re.sub(r'\\\[\\\[(\\:)?', r'\[\[\:?', re.escape(section))
+ section = re.sub(r'\\\[\\\[(\\?:)?', r'\[\[\:?', re.escape(section))
# match underscores and white spaces
section = re.sub(r'\\?[ _]', '[ _]', section)
m = re.search("=+[ ']*%s[ ']*=+" % section, pagetext)
--
To view, visit https://gerrit.wikimedia.org/r/445763
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I1f8f2d0e254f421e76b97647fa93aface7ff9e72
Gerrit-Change-Number: 445763
Gerrit-PatchSet: 2
Gerrit-Owner: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444(a)gmail.com>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/441627 )
Change subject: [IMPR] Use newly created extract_sections from textlib in archivebot
......................................................................
[IMPR] Use newly created extract_sections from textlib in archivebot
+ avoid archiving categories and interwiki
Change-Id: I644f17dc6b1c775de134c5e795c456423cc40147
---
M scripts/archivebot.py
1 file changed, 21 insertions(+), 39 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/archivebot.py b/scripts/archivebot.py
index a0e229a..60b4285 100755
--- a/scripts/archivebot.py
+++ b/scripts/archivebot.py
@@ -111,8 +111,8 @@
from pywikibot.date import apply_month_delta
from pywikibot import i18n
-from pywikibot.textlib import TimeStripper, _get_regexes
-from pywikibot.textlib import to_local_digits
+from pywikibot.textlib import (extract_sections, findmarker, TimeStripper,
+ to_local_digits)
from pywikibot.tools import issue_deprecation_warning, FrozenDict
ZERO = datetime.timedelta(0)
@@ -455,46 +455,28 @@
self.threads = []
self.archives = {}
self.archived_threads = 0
+
+ # Exclude non-thread headings
text = self.get()
+ marker = findmarker(text)
+ text = re.sub(r'^===', marker + r'===', text, flags=re.M)
- # Replace text in following exceptions by spaces, but don't change line
- # numbers and character positions
- exceptions = ['comment', 'code', 'pre', 'source', 'nowiki']
- exc_regexes = _get_regexes(exceptions, self.site)
- stripped_text = text
- for regex in exc_regexes:
- for match in re.finditer(regex, stripped_text):
- before = stripped_text[:match.start()]
- restricted = stripped_text[match.start():match.end()]
- after = stripped_text[match.end():]
- restricted = re.sub(r'[^\n]', ' ', restricted)
- stripped_text = before + restricted + after
-
- # Find thread headers in stripped text and return their line numbers
- stripped_lines = stripped_text.split('\n')
- thread_headers = []
- for line_number, line in enumerate(stripped_lines, start=1):
- if re.search(r'^== *[^=].*? *== *$', line):
- thread_headers.append(line_number)
- # Fill self by original thread headers on returned line numbers
- lines = text.split('\n')
- found = False # Reading header
- cur_thread = None
- for line_number, line in enumerate(lines, start=1):
- if line_number in thread_headers:
- thread_header = re.search('^== *([^=].*?) *== *$', line)
- found = True # Reading threads now
- if cur_thread:
- self.threads.append(cur_thread)
- cur_thread = DiscussionThread(thread_header.group(1), self.now,
- self.timestripper)
- else:
- if found:
- cur_thread.feed_line(line)
- else:
- self.header += line + '\n'
- if cur_thread:
+ # Find threads, avoid archiving categories or interwiki
+ header, threads, footer = extract_sections(text)
+ header = header.replace(marker, '')
+ if header and footer:
+ self.header = '\n\n'.join((header.rstrip(), footer, ''))
+ else:
+ self.header = header + footer
+ for thread_heading, thread_content in threads:
+ cur_thread = DiscussionThread(thread_heading.strip('= '), self.now,
+ self.timestripper)
+ lines = thread_content.replace(marker, '').splitlines()
+ lines = lines[1:] # remove heading line
+ for line in lines:
+ cur_thread.feed_line(line)
self.threads.append(cur_thread)
+
# This extra info is not desirable when run under the unittest
# framework, which may be run either directly or via setup.py
if pywikibot.calledModuleName() not in ['archivebot_tests', 'setup']:
--
To view, visit https://gerrit.wikimedia.org/r/441627
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I644f17dc6b1c775de134c5e795c456423cc40147
Gerrit-Change-Number: 441627
Gerrit-PatchSet: 3
Gerrit-Owner: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: Framawiki <framawiki(a)tools.wmflabs.org>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: Zhuyifei1999 <zhuyifei1999(a)gmail.com>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444(a)gmail.com>
Gerrit-Reviewer: jenkins-bot