Xqt has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/594478 )
Change subject: [cleanup] Always use mwparserfromhell if present
......................................................................
[cleanup] Always use mwparserfromhell if present
use_mwparserfromhell is True by default and mwparserfromhell will
be used in that case if the package is installed.
The extract_templates_and_params_regex function is also available
if mwparserfromhell should not be used in rare cases. Note that
this fallback does not work for deeply nested templates (> 3).
Update tests
Change-Id: Id055618106eec13957c0038210cbabf19771ad5b
---
M pywikibot/config2.py
M pywikibot/textlib.py
M tests/textlib_tests.py
3 files changed, 22 insertions(+), 37 deletions(-)
Approvals:
Legoktm: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/config2.py b/pywikibot/config2.py
index b5e5a1c..641f5b2 100644
--- a/pywikibot/config2.py
+++ b/pywikibot/config2.py
@@ -110,10 +110,11 @@
# to other modules.
_private_values = {'authenticate', 'db_password'}
-_deprecated_variables = {'available_ssl_project', 'fake_user_agent',
- 'interwiki_contents_on_disk', 'line_separator', 'LS',
- 'panoramio', 'proxy', 'special_page_limit',
- 'sysopnames', 'use_SSL_onlogin', 'use_SSL_always'}
+_deprecated_variables = {
+ 'available_ssl_project', 'fake_user_agent', 'interwiki_contents_on_disk',
+ 'line_separator', 'LS', 'panoramio', 'proxy', 'special_page_limit',
+ 'sysopnames', 'use_mwparserfromhell', 'use_SSL_onlogin', 'use_SSL_always',
+}
# ############# ACCOUNT SETTINGS ##############
@@ -870,13 +871,6 @@
# processing. As higher this value this effect will decrease.
max_queue_size = 64
-# Settings to enable mwparserfromhell
-# <https://mwparserfromhell.readthedocs.org/en/latest/>
-# Currently used in textlib.extract_templates_and_params
-# This is more accurate than our current regex, but only works
-# if the user has already installed the library.
-use_mwparserfromhell = True
-
# Pickle protocol version to use for storing dumps.
# This config variable is not used for loading dumps.
# Version 2 is common to both Python 2 and 3, and should
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 36a5229..a52c569 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -22,7 +22,6 @@
import re
import pywikibot
-from pywikibot import config2 as config
from pywikibot.exceptions import InvalidTitle, SiteDefinitionError
from pywikibot.family import Family
from pywikibot.tools import (
@@ -1615,9 +1614,8 @@
parameters, and if this results multiple parameters with the same name
only the last value provided will be returned.
- This uses the package L{mwparserfromhell} (mwpfh) if it is installed
- and enabled by config.mwparserfromhell. Otherwise it falls back on a
- regex based implementation.
+ This uses the package L{mwparserfromhell} (mwpfh) if it is installed.
+ Otherwise it falls back on a regex based implementation.
There are minor differences between the two implementations.
@@ -1636,33 +1634,29 @@
@type text: str
@param remove_disabled_parts: Remove disabled wikitext such as comments
and pre. If None (default), this is enabled when mwparserfromhell
- is not available or is disabled in the config, and disabled if
- mwparserfromhell is present and enabled in the config.
+ is not available and disabled if mwparserfromhell is present.
@type remove_disabled_parts: bool or None
@param strip: if enabled, strip arguments and values of templates.
If None (default), this is enabled when mwparserfromhell
- is not available or is disabled in the config, and disabled if
- mwparserfromhell is present and enabled in the config.
+ is not available and disabled if mwparserfromhell is present.
@type strip: bool
@return: list of template name and params
@rtype: list of tuple
"""
- use_mwparserfromhell = (config.use_mwparserfromhell
- and not isinstance(mwparserfromhell, Exception))
+ use_regex = isinstance(mwparserfromhell, Exception)
if remove_disabled_parts is None:
- remove_disabled_parts = not use_mwparserfromhell
-
- if strip is None:
- strip = not use_mwparserfromhell
-
+ remove_disabled_parts = use_regex
if remove_disabled_parts:
text = removeDisabledParts(text)
- if use_mwparserfromhell:
- return extract_templates_and_params_mwpfh(text, strip)
- else:
+ if strip is None:
+ strip = use_regex
+
+ if use_regex:
return extract_templates_and_params_regex(text, False, strip)
+ else:
+ return extract_templates_and_params_mwpfh(text, strip)
def extract_templates_and_params_mwpfh(text, strip=False):
@@ -1673,8 +1667,7 @@
Use extract_templates_and_params, which will select this
mwparserfromhell implementation based on whether the
- mwparserfromhell package is installed and enabled by
- config.mwparserfromhell.
+ mwparserfromhell package is installed.
@param text: The wikitext from which templates are extracted
@type text: str
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 1cf7791..9fb9461 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -15,11 +15,11 @@
import pywikibot
import pywikibot.textlib as textlib
-from pywikibot.textlib import _MultiTemplateMatchBuilder, extract_sections
-from pywikibot import config, UnknownSite
from pywikibot.site import _IWEntry
+from pywikibot.textlib import _MultiTemplateMatchBuilder, extract_sections
from pywikibot.tools import suppress_warnings
+from pywikibot import UnknownSite
from tests.aspects import (
unittest, require_modules, TestCase, DefaultDrySiteTestCase,
@@ -725,7 +725,7 @@
def test_removing_disabled_parts_regex(self):
"""Test removing disabled parts when using the regex variant."""
- self.patch(config, 'use_mwparserfromhell', False)
+ self.patch(textlib, 'mwparserfromhell', Exception())
textlib.extract_templates_and_params('{{a<!-- -->}}', True)
self.assertEqual(self._text, '{{a}}')
self.assertFalse(self._mwpfh)
@@ -739,7 +739,6 @@
@require_modules('mwparserfromhell')
def test_removing_disabled_parts_mwpfh(self):
"""Test removing disabled parts when using the mwpfh variant."""
- self.patch(config, 'use_mwparserfromhell', True)
textlib.extract_templates_and_params('{{a<!-- -->}}', True)
self.assertEqual(self._text, '{{a}}')
self.assertTrue(self._mwpfh)
@@ -752,7 +751,7 @@
def test_strip_regex(self):
"""Test stripping values when using the regex variant."""
- self.patch(config, 'use_mwparserfromhell', False)
+ self.patch(textlib, 'mwparserfromhell', Exception())
textlib.extract_templates_and_params('{{a| foo }}', False, True)
self.assertEqual(self._args, (False, True))
self.assertFalse(self._mwpfh)
@@ -766,7 +765,6 @@
@require_modules('mwparserfromhell')
def test_strip_mwpfh(self):
"""Test stripping values when using the mwpfh variant."""
- self.patch(config, 'use_mwparserfromhell', True)
textlib.extract_templates_and_params('{{a| foo }}', None, True)
self.assertEqual(self._args, (True, ))
self.assertTrue(self._mwpfh)
--
To view, visit https://gerrit.wikimedia.org/r/594478
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: Id055618106eec13957c0038210cbabf19771ad5b
Gerrit-Change-Number: 594478
Gerrit-PatchSet: 4
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Legoktm <legoktm(a)member.fsf.org>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot (75)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/595861 )
Change subject: [flake8] Solve some flake8 issues
......................................................................
[flake8] Solve some flake8 issues
A new flake8 release was published today.
Change-Id: Id7d1594f588ce737a8b7798638a34b378743486d
---
M pywikibot/data/mysql.py
M pywikibot/date.py
M pywikibot/diff.py
M pywikibot/site_detect.py
M pywikibot/textlib.py
M scripts/casechecker.py
M scripts/imagecopy_self.py
M scripts/interwiki.py
M scripts/replicate_wiki.py
M scripts/solve_disambiguation.py
M tests/api_tests.py
11 files changed, 61 insertions(+), 63 deletions(-)
Approvals:
Dvorapa: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/data/mysql.py b/pywikibot/data/mysql.py
index 15a6ceb..05186c3 100644
--- a/pywikibot/data/mysql.py
+++ b/pywikibot/data/mysql.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""Miscellaneous helper functions for mysql queries."""
#
-# (C) Pywikibot team, 2016-2019
+# (C) Pywikibot team, 2016-2020
#
# Distributed under the terms of the MIT license.
#
@@ -71,7 +71,8 @@
if not isinstance(_query, UnicodeType):
_query = UnicodeType(_query, encoding='utf-8')
_query = _query.strip()
- _query = '\n'.join(' {0}'.format(l) for l in _query.splitlines())
+ _query = '\n'.join(' {0}'.format(line)
+ for line in _query.splitlines())
pywikibot.output('Executing query:\n' + _query)
cursor.execute(query, params)
diff --git a/pywikibot/date.py b/pywikibot/date.py
index 6ad13da..c4e0030 100644
--- a/pywikibot/date.py
+++ b/pywikibot/date.py
@@ -374,7 +374,7 @@
continue
if (len(s) in (2, 3) and s[0] == '%'
and s[-1] in _digitDecoders
- and(len(s) == 2 or s[1] in _decimalDigits)):
+ and (len(s) == 2 or s[1] in _decimalDigits)):
# Must match a "%2d" or "%d" style
dec = _digitDecoders[s[-1]]
if isinstance(dec, UnicodeType):
diff --git a/pywikibot/diff.py b/pywikibot/diff.py
index 002df37..1bc9a20 100644
--- a/pywikibot/diff.py
+++ b/pywikibot/diff.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""Diff module."""
#
-# (C) Pywikibot team, 2014-2019
+# (C) Pywikibot team, 2014-2020
#
# Distributed under the terms of the MIT license.
#
@@ -89,10 +89,10 @@
"""Generator of diff text for this hunk, without formatting."""
# make sure each line ends with '\n' to prevent
# behaviour like http://bugs.python.org/issue2142
- def check_line(l):
- if not l.endswith('\n'):
- return l + '\n'
- return l
+ def check_line(line):
+ if not line.endswith('\n'):
+ line += '\n'
+ return line
for tag, i1, i2, j1, j2 in self.group:
# equal/delete/insert add additional space after the sign as it's
diff --git a/pywikibot/site_detect.py b/pywikibot/site_detect.py
index bf1d231..ac5908b 100644
--- a/pywikibot/site_detect.py
+++ b/pywikibot/site_detect.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""Classes for detecting a MediaWiki site."""
#
-# (C) Pywikibot team, 2010-2019
+# (C) Pywikibot team, 2010-2020
#
# Distributed under the terms of the MIT license.
#
@@ -154,8 +154,8 @@
self.version = list(filter(
lambda x: x.startswith('MediaWiki'),
- (l.strip()
- for l in d['error']['*'].split('\n'))))[0].split()[1]
+ (line.strip()
+ for line in d['error']['*'].split('\n'))))[0].split()[1]
except Exception:
pass
else:
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 36a5229..d73510b 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -667,10 +667,10 @@
'a sequence, a Link or a basestring but '
'is "{0}"'.format(type(replacement)))
- def title_section(l):
- title = l.title
- if l.section:
- title += '#' + l.section
+ def title_section(link):
+ title = link.title
+ if link.section:
+ title += '#' + link.section
return title
if isinstance(replace, Sequence):
diff --git a/scripts/casechecker.py b/scripts/casechecker.py
index 7d6273f..f744990 100755
--- a/scripts/casechecker.py
+++ b/scripts/casechecker.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
"""Bot to find all pages on the wiki with mixed latin and cyrilic alphabets."""
#
-# (C) Pywikibot team, 2006-2019
+# (C) Pywikibot team, 2006-2020
#
# Distributed under the terms of the MIT license.
#
@@ -408,8 +408,8 @@
msg = []
foundSuggestions = False
- for l in allLinks:
- ltxt = l['title']
+ for link in allLinks:
+ ltxt = link['title']
err = self.ProcessTitle(ltxt)
if err:
if len(err[1]) > 0:
@@ -484,14 +484,14 @@
# See if it would make sense to treat the whole word as either
# cyrilic or latin
mightBeLat = mightBeLcl = True
- for l in badWord:
- if l in self.localLtr:
- if mightBeLat and l not in self.localSuspects:
+ for letter in badWord:
+ if letter in self.localLtr:
+ if mightBeLat and letter not in self.localSuspects:
mightBeLat = False
else:
- if mightBeLcl and l not in self.latinSuspects:
+ if mightBeLcl and letter not in self.latinSuspects:
mightBeLcl = False
- if l not in ascii_letters:
+ if letter not in ascii_letters:
raise ValueError('Assert failed')
# Some words are well known and frequently mixed-typed
@@ -645,16 +645,16 @@
res += self.lclClrFnt
else:
res += self.latClrFnt
- for l in word:
- if l in self.localLtr:
+ for letter in word:
+ if letter in self.localLtr:
if not lastIsCyr:
res += self.suffixClr + self.lclClrFnt
lastIsCyr = True
- elif l in ascii_letters:
+ elif letter in ascii_letters:
if lastIsCyr:
res += self.suffixClr + self.latClrFnt
lastIsCyr = False
- res += l
+ res += letter
return res + self.suffixClr + '</b>'
def _ColorCodeWordScreen(self, word):
@@ -664,16 +664,16 @@
res += self.colorFormatLocalColor
else:
res += self.colorFormatLatinColor
- for l in word:
- if l in self.localLtr:
+ for letter in word:
+ if letter in self.localLtr:
if not lastIsCyr:
res += self.colorFormatLocalColor
lastIsCyr = True
- elif l in self.latLtr:
+ elif letter in self.latLtr:
if lastIsCyr:
res += self.colorFormatLatinColor
lastIsCyr = False
- res += l
+ res += letter
return formatter.color_format(res + self.colorFormatSuffix)
def AddNoSuggestionTitle(self, title):
diff --git a/scripts/imagecopy_self.py b/scripts/imagecopy_self.py
index 885fc1c..4195a26 100644
--- a/scripts/imagecopy_self.py
+++ b/scripts/imagecopy_self.py
@@ -315,8 +315,8 @@
informationTemplate,
informationFields,
]
- for l in lists:
- if not l.get(lang):
+ for item in lists:
+ if not item.get(lang):
return False
return True
diff --git a/scripts/interwiki.py b/scripts/interwiki.py
index aceae76..9fd6a7d 100755
--- a/scripts/interwiki.py
+++ b/scripts/interwiki.py
@@ -1629,7 +1629,7 @@
# clone original newPages dictionary, so that we can modify it to the
# local page's needs
new = newPages.copy()
- interwikis = [pywikibot.Page(l) for l in page.iterlanglinks()]
+ interwikis = [pywikibot.Page(link) for link in page.iterlanglinks()]
# remove interwiki links to ignore
for iw in re.finditer(r'<!-- *\[\[(.*?:.*?)\]\] *-->', pagetext):
@@ -1852,8 +1852,8 @@
page = new[site]
if not page.section():
try:
- linkedPages = {pywikibot.Page(l)
- for l in page.iterlanglinks()}
+ linkedPages = {pywikibot.Page(link)
+ for link in page.iterlanglinks()}
except pywikibot.NoPage:
pywikibot.warning(
'Page {} does no longer exist?!'.format(page))
diff --git a/scripts/replicate_wiki.py b/scripts/replicate_wiki.py
index a7cc7cd..9dc870f 100755
--- a/scripts/replicate_wiki.py
+++ b/scripts/replicate_wiki.py
@@ -39,7 +39,7 @@
destination_wiki destination wiki(s)
"""
#
-# (C) Pywikibot team, 2012-2019
+# (C) Pywikibot team, 2012-2020
#
# Distributed under the terms of the MIT license.
#
@@ -169,16 +169,16 @@
.format(site.user()))
output = '== Pages that differ from original ==\n\n'
if self.differences[site]:
- output += ''.join('* [[:%s]]\n' % l for l in
- self.differences[site])
+ output += ''.join('* [[:{}]]\n'.format(page_title)
+ for page_title in self.differences[site])
else:
output += 'All important pages are the same'
output += (
'\n\n== Admins from original that are missing here ==\n\n')
if self.user_diff[site]:
- output += ''.join('* %s\n' % l.replace('_', ' ') for l in
- self.user_diff[site])
+ output += ''.join('* {}\n'.format(user_name.replace('_', ' '))
+ for user_name in self.user_diff[site])
else:
output += (
'All users from original are also present on this wiki')
diff --git a/scripts/solve_disambiguation.py b/scripts/solve_disambiguation.py
index e40b9cf..be0c757 100755
--- a/scripts/solve_disambiguation.py
+++ b/scripts/solve_disambiguation.py
@@ -721,9 +721,9 @@
"""
titles = {firstcap(t) for t in self.firstlinks(page)}
links = list(links)
- for l in links[:]: # uses a copy because of remove!
- if l.title() not in titles:
- links.remove(l)
+ for link in links[:]: # uses a copy because of remove!
+ if link.title() not in titles:
+ links.remove(link)
return links
def treat_links(self, refPage, disambPage):
@@ -1058,13 +1058,15 @@
links = disambPage2.linkedPages()
if self.first_only:
links = self.firstize(disambPage2, links)
- links = [correctcap(l, disambPage2.get()) for l in links]
+ links = [correctcap(link, disambPage2.get())
+ for link in links]
except pywikibot.NoPage:
pywikibot.output('No page at {0}, using redirect target.'
.format(disambTitle))
links = disambPage.linkedPages()[:1]
- links = [correctcap(l, disambPage.get(get_redirect=True))
- for l in links]
+ links = [correctcap(link,
+ disambPage.get(get_redirect=True))
+ for link in links]
self.alternatives += links
else:
try:
@@ -1096,22 +1098,22 @@
links = disambPage2.linkedPages()
if self.first_only:
links = self.firstize(disambPage2, links)
- links = [correctcap(l, disambPage2.get())
- for l in links]
+ links = [correctcap(link, disambPage2.get())
+ for link in links]
except pywikibot.NoPage:
pywikibot.output(
'Page does not exist; using first '
'link in page {0}.'.format(disambPage.title()))
links = disambPage.linkedPages()[:1]
- links = [correctcap(l, disambPage.get())
- for l in links]
+ links = [correctcap(link, disambPage.get())
+ for link in links]
else:
try:
links = disambPage.linkedPages()
if self.first_only:
links = self.firstize(disambPage, links)
- links = [correctcap(l, disambPage.get())
- for l in links]
+ links = [correctcap(link, disambPage.get())
+ for link in links]
except pywikibot.NoPage:
pywikibot.output('Page does not exist, skipping.')
return False
diff --git a/tests/api_tests.py b/tests/api_tests.py
index 1f7bd33..851eb1c 100644
--- a/tests/api_tests.py
+++ b/tests/api_tests.py
@@ -689,8 +689,7 @@
"""Test PropertyGenerator with prop 'info'."""
mainpage = self.get_mainpage()
links = list(self.site.pagelinks(mainpage, total=10))
- titles = [l.title(with_section=False)
- for l in links]
+ titles = [link.title(with_section=False) for link in links]
gen = api.PropertyGenerator(site=self.site,
prop='info',
parameters={'titles': '|'.join(titles)})
@@ -707,8 +706,7 @@
"""Test PropertyGenerator with prop 'revisions'."""
mainpage = self.get_mainpage()
links = list(self.site.pagelinks(mainpage, total=10))
- titles = [l.title(with_section=False)
- for l in links]
+ titles = [link.title(with_section=False) for link in links]
gen = api.PropertyGenerator(site=self.site,
prop='revisions',
parameters={'titles': '|'.join(titles)})
@@ -727,8 +725,7 @@
"""Test PropertyGenerator with prop 'revisions' and 'coordinates'."""
mainpage = self.get_mainpage()
links = list(self.site.pagelinks(mainpage, total=10))
- titles = [l.title(with_section=False)
- for l in links]
+ titles = [link.title(with_section=False) for link in links]
gen = api.PropertyGenerator(site=self.site,
prop='revisions|coordinates',
parameters={'titles': '|'.join(titles)})
@@ -747,8 +744,7 @@
"""Test PropertyGenerator with many limited props."""
mainpage = self.get_mainpage()
links = list(self.site.pagelinks(mainpage, total=30))
- titles = [l.title(with_section=False)
- for l in links]
+ titles = [link.title(with_section=False) for link in links]
params = {
'rvprop': 'ids|flags|timestamp|user|comment|content',
'titles': '|'.join(titles)}
@@ -775,8 +771,7 @@
"""Test PropertyGenerator with many limited props and continuations."""
mainpage = self.get_mainpage()
links = list(self.site.pagelinks(mainpage, total=30))
- titles = [l.title(with_section=False)
- for l in links]
+ titles = [link.title(with_section=False) for link in links]
gen = api.PropertyGenerator(
site=self.site, prop='info|categoryinfo|langlinks|templates',
parameters={'titles': '|'.join(titles)})
--
To view, visit https://gerrit.wikimedia.org/r/595861
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: Id7d1594f588ce737a8b7798638a34b378743486d
Gerrit-Change-Number: 595861
Gerrit-PatchSet: 3
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot (75)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/595302 )
Change subject: [doc] Minor typo in ROADMAP.rst
......................................................................
[doc] Minor typo in ROADMAP.rst
Change-Id: I3602d2447a687d90cb01fdc4f9c054ecea7abf00
---
M ROADMAP.rst
1 file changed, 1 insertion(+), 1 deletion(-)
Approvals:
Dvorapa: Looks good to me, approved
jenkins-bot: Verified
diff --git a/ROADMAP.rst b/ROADMAP.rst
index 1424385..ea8d2b0 100644
--- a/ROADMAP.rst
+++ b/ROADMAP.rst
@@ -7,7 +7,7 @@
Future releases
~~~~~~~~~~~~~~~
-* 3.0.20200508 Page.getVersionHistory and Page.fullVersionHistory() methods will be removed (T136513, T151110)
+* 3.0.20200508: Page.getVersionHistory and Page.fullVersionHistory() methods will be removed (T136513, T151110)
* 3.0.20200405: Site and Page methods deprecated for 10 years or longer will be removed
* 3.0.20200405: Usage of SkipPageError with BaseBot will be removed
* 3.0.20200326: Functions dealing with stars list will be removed
--
To view, visit https://gerrit.wikimedia.org/r/595302
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I3602d2447a687d90cb01fdc4f9c054ecea7abf00
Gerrit-Change-Number: 595302
Gerrit-PatchSet: 4
Gerrit-Owner: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: jenkins-bot (75)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/594535 )
Change subject: [IMPR] Add to docs in scripts/data_ingestion.py
......................................................................
[IMPR] Add to docs in scripts/data_ingestion.py
Improve the documentation in scripts/data_ingestion.py with additional
information including a description of what the script does, required
config files and parameters.
Bug: T250636
Change-Id: Ia0e9dd9f8a754b9a1a17a65bdf1f7c138bda3f57
---
M scripts/data_ingestion.py
1 file changed, 102 insertions(+), 5 deletions(-)
Approvals:
Dvorapa: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/data_ingestion.py b/scripts/data_ingestion.py
index ff5e555..56df789 100755
--- a/scripts/data_ingestion.py
+++ b/scripts/data_ingestion.py
@@ -1,11 +1,98 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
-"""
-A generic bot to do data ingestion (batch uploading).
-usage:
+r"""
+A generic bot to do data ingestion (batch uploading) of photos or other files.
- python pwb.py data_ingestion -csvdir:local_dir/ -page:config_page
+In addition it installs related metadata. The uploading is primarily from a url
+to a wiki-site.
+
+Required configuration files
+============================
+ - a 'Data ingestion' template on a wiki site that specifies the name of a
+ csv file, and csv configuration values.
+ - a csv file that specifies each file to upload, the file's copy-from URL
+ location, and some metadata.
+
+Required parameters
+===================
+The following parameters are required. The 'csvdir' and the 'page:csvFile' will
+be joined creating a path to a csv file that should contain specified
+information about files to upload.
+
+-csvdir A directory path to csv files
+
+-page A wiki path to templates. One of the templates at this
+ location must be a 'Data ingestion' template with the
+ following parameters.
+
+ Required parameters
+ csvFile
+
+ Optional parameters
+ sourceFormat
+ options: 'csv'
+
+ sourceFileKey
+ options: 'StockNumber'
+
+ csvDialect
+ options: 'excel', ''
+
+ csvDelimiter
+ options: any delimiter, ',' is most common
+
+ csvEncoding
+ options: 'utf8', 'Windows-1252'
+
+ formattingTemplate
+
+ titleFormat
+
+
+Example 'Data ingestion' template
+=================================
+.. code::
+
+ {{Data ingestion
+ |sourceFormat=csv
+ |csvFile=csv_ingestion.csv
+ |sourceFileKey=%(StockNumber)
+ |csvDialect=
+ |csvDelimiter=,
+ |csvEncoding=utf8
+ |formattingTemplate=Template:Data ingestion test configuration
+ |titleFormat=%(name)s - %(set)s.%(_ext)s
+ }}
+
+
+Csv file
+========
+A full example can be found at tests/data/csv_ingestion.csv
+The 'url' field is the location a file will be copied from.
+
+csv field Headers::
+
+ description.en,source,author,license,set,name,url
+
+
+Usage
+=====
+.. code::
+
+ python pwb.py data_ingestion -csvdir:<local_dir/> -page:<cfg_page_on_wiki>
+
+
+Example
+=======
+Warning! Put it in one line, otherwise it won't work correctly.
+
+.. code::
+
+ python pwb.py data_ingestion \
+ -csvdir:"test/data" \
+ -page:"User:<Your-Username>/data_ingestion_test_template"
+
"""
#
# (C) Pywikibot team, 2012-2020
@@ -186,7 +273,13 @@
self.generator = value
def treat(self, photo):
- """Process each page."""
+ """
+ Process each page.
+
+ 1. Check for existing duplicates on the wiki specified in self.site.
+ 2. If duplicates are found, then skip uploading.
+ 3. Download the file from photo.URL and upload the file to self.site.
+ """
duplicates = photo.findDuplicateImages()
if duplicates:
pywikibot.output('Skipping duplicate of {!r}'
@@ -254,6 +347,10 @@
"""
# Process global args and prepare generator args parser
local_args = pywikibot.handle_args(args)
+
+ # This factory is responsible for processing command line arguments
+ # that are also used by other scripts and that determine on which pages
+ # to work on.
genFactory = pagegenerators.GeneratorFactory()
csv_dir = None
--
To view, visit https://gerrit.wikimedia.org/r/594535
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: Ia0e9dd9f8a754b9a1a17a65bdf1f7c138bda3f57
Gerrit-Change-Number: 594535
Gerrit-PatchSet: 8
Gerrit-Owner: DC Slagel <dcs(a)mailworks.org>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: DC Slagel <dcs(a)mailworks.org>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: Isaacandy <isaac(a)iznd.xyz>
Gerrit-Reviewer: Siebrand <siebrand(a)kitano.nl>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot (75)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/593055 )
Change subject: [FEAT] pagegenerators: handle protocols in -weblink
......................................................................
[FEAT] pagegenerators: handle protocols in -weblink
Currently, http is hardcoded in -weblink. It's not possible
to use another protocol.
This patch allows you to specify a protocol in this CLI
parameter, in the form of a URL, such as
-weblink:https://wikipedia.org. Http is kept as default
if none is given.
Added tests for -weblink and LinksearchPageGenerator class
behind it.
Bug: T251310
Bug: T251308
Change-Id: I3804c46e3f037f1b03c3198404734a771a849f44
---
M pywikibot/pagegenerators.py
M pywikibot/site/__init__.py
M tests/pagegenerators_tests.py
3 files changed, 77 insertions(+), 13 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index d45c181..1521617 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -993,10 +993,6 @@
if not value:
value = pywikibot.input(
'Pages with which weblink should be processed?')
- # If url is * we make it None in order to search for every page
- # with any URL.
- if value == '*':
- value = None
return LinksearchPageGenerator(value, site=self.site)
def _handle_transcludes(self, value):
@@ -2583,10 +2579,10 @@
@deprecated_args(link='url', euprotocol='protocol', step=None)
def LinksearchPageGenerator(url, namespaces=None, total=None,
- site=None, protocol='http'):
+ site=None, protocol=None):
"""Yield all pages that link to a certain URL, like Special:Linksearch.
- @param url: The URL to search for (without the protocol prefix);
+    @param url: The URL to search for (with or without the protocol prefix);
this may include a '*' as a wildcard, only at the start of the
hostname
@type url: str
@@ -2594,8 +2590,11 @@
@type namespaces: list of int
@param total: Maximum number of pages to retrieve in total
@type total: int
- @param site: Site for generator results.
+ @param site: Site for generator results
@type site: L{pywikibot.site.BaseSite}
+ @param protocol: Protocol to search for, likely http or https, http by
+ default. Full list shown on Special:LinkSearch wikipage
+ @type protocol: str
"""
if site is None:
site = pywikibot.Site()
diff --git a/pywikibot/site/__init__.py b/pywikibot/site/__init__.py
index 5b78630..d8896cb 100644
--- a/pywikibot/site/__init__.py
+++ b/pywikibot/site/__init__.py
@@ -4610,18 +4610,39 @@
return bkgen
@deprecated_args(step=None)
- def exturlusage(self, url=None, protocol='http', namespaces=None,
+ def exturlusage(self, url=None, protocol=None, namespaces=None,
total=None, content=False):
"""Iterate Pages that contain links to the given URL.
@see: U{https://www.mediawiki.org/wiki/API:Exturlusage}
- @param url: The URL to search for (without the protocol prefix);
- this may include a '*' as a wildcard, only at the start of the
- hostname
- @param protocol: The protocol prefix (default: "http")
-
+        @param url: The URL to search for (with or without the protocol
+            prefix); this may include a '*' as a wildcard, only at the start
+ of the hostname
+ @type url: str
+ @param namespaces: list of namespace numbers to fetch contribs from
+ @type namespaces: list of int
+ @param total: Maximum number of pages to retrieve in total
+ @type total: int
+ @param protocol: Protocol to search for, likely http or https, http by
+ default. Full list shown on Special:LinkSearch wikipage
+ @type protocol: str
"""
+ separator = '://'
+ if separator in url:
+ found_protocol = url[:url.index(separator)]
+ url = url[url.index(separator) + len(separator):]
+ if protocol and protocol != found_protocol:
+ raise ValueError('Protocol was specified, but a different one '
+ 'was found in searched url')
+ protocol = found_protocol
+ if not protocol:
+ protocol = 'http'
+
+ # If url is * we make it None in order to search for every page
+ # with any URL.
+ if url == '*':
+ url = None
return self._generator(api.PageGenerator, type_arg='exturlusage',
geuquery=url, geuprotocol=protocol,
namespaces=namespaces,
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index ace3262..c2d1b3f 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -1626,6 +1626,50 @@
assert False # this shouldn't be reached
+class TestLinksearchPageGenerator(TestCase):
+
+ """Tests for pagegenerators.LinksearchPageGenerator."""
+
+ family = 'wikipedia'
+ code = 'en'
+
+ def test_weblink(self):
+ """Test -weblink."""
+ cases = (('wikipedia.org', 'http://wikipedia.org'),
+ ('en.wikipedia.org', 'http://en.wikipedia.org'),
+ ('https://fr.wikipedia.org', 'https://fr.wikipedia.org'),
+ ('ftp://*', 'ftp://'))
+
+ for search, expected in cases:
+ gf = pagegenerators.GeneratorFactory(site=self.site)
+ gf.handleArg('-weblink:%s' % search)
+ gf.handleArg('-ns:2')
+ gf.handleArg('-limit:1')
+ gen = gf.getCombinedGenerator()
+ genlist = list(gen)
+ self.assertLength(genlist, 1)
+
+ page = genlist[0]
+ self.assertIsInstance(page, pywikibot.Page)
+ self.assertTrue(page.exists())
+ self.assertEqual(page.namespace(), 2)
+ self.assertIn(expected, page.text)
+
+ def test_double_opposite_protocols(self):
+ """Test LinksearchPageGenerator with two opposite protocols."""
+ self.assertRaises(ValueError, pagegenerators.LinksearchPageGenerator,
+ 'http://w.wiki', protocol='https', site=self.site)
+
+ def test_double_same_protocols(self):
+ """Test LinksearchPageGenerator with two same protocols."""
+ gen = pagegenerators.LinksearchPageGenerator('https://w.wiki',
+ protocol='https',
+ site=self.site,
+ total=1)
+ self.assertIsInstance(gen, pywikibot.data.api.PageGenerator)
+ self.assertEqual(len(list(gen)), 1)
+
+
if __name__ == '__main__': # pragma: no cover
try:
unittest.main()
--
To view, visit https://gerrit.wikimedia.org/r/593055
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I3804c46e3f037f1b03c3198404734a771a849f44
Gerrit-Change-Number: 593055
Gerrit-PatchSet: 4
Gerrit-Owner: Framawiki <framawiki(a)tools.wmflabs.org>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: Framawiki <framawiki(a)tools.wmflabs.org>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot (75)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/594524 )
Change subject: [IMPR] Faster lookup for write actions
......................................................................
[IMPR] Faster lookup for write actions
Use a set instead of a tuple for lookup which is up to
20 times faster.
Change-Id: I5b767a94a4352ff376d369cf11dc19be14dda09e
---
M pywikibot/data/api.py
1 file changed, 2 insertions(+), 2 deletions(-)
Approvals:
Matěj Suchánek: Looks good to me, but someone else must approve
Framawiki: Looks good to me, but someone else must approve
Dvorapa: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/data/api.py b/pywikibot/data/api.py
index d65bf7e..cca8864 100644
--- a/pywikibot/data/api.py
+++ b/pywikibot/data/api.py
@@ -1221,7 +1221,7 @@
# Actions that imply database updates on the server, used for various
# things like throttling or skipping actions when we're in simulation
# mode
- self.write = self.action in (
+ self.write = self.action in {
'block', 'clearhasmsg', 'createaccount', 'delete', 'edit',
'emailuser', 'filerevert', 'flowthank', 'imagerotate', 'import',
'managetags', 'mergehistory', 'move', 'options', 'patrol',
@@ -1233,7 +1233,7 @@
'wbremovequalifiers', 'wbremovereferences', 'wbsetaliases',
'wbsetclaim', 'wbsetclaimvalue', 'wbsetdescription', 'wbsetlabel',
'wbsetqualifier', 'wbsetreference', 'wbsetsitelink',
- )
+ }
# Client side verification that the request is being performed
# by a logged in user, and warn if it isn't a config username.
if self.write:
--
To view, visit https://gerrit.wikimedia.org/r/594524
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I5b767a94a4352ff376d369cf11dc19be14dda09e
Gerrit-Change-Number: 594524
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: Framawiki <framawiki(a)tools.wmflabs.org>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot (75)