jenkins-bot has submitted this change and it was merged.
Change subject: Add Family.from_url support for generated families
......................................................................
Add Family.from_url support for generated families
Site.interwiki used Family._get_path_regex, which didn't support
family classes that have multiple sites and do not accept None
as the 'code' in methods Family.path and Family.nicepath.
Cache the Family url regex.
Also cache exceptions that occur while loading sites,
and include them in the outermost exception.
Bug: T85658
Change-Id: Idf16bf08db9dbea58197004a39c66296d2f6e713
---
M pywikibot/__init__.py
M pywikibot/families/wikia_family.py
M pywikibot/family.py
M pywikibot/page.py
M pywikibot/site.py
M tests/family_tests.py
6 files changed, 206 insertions(+), 54 deletions(-)
Approvals:
John Vandenberg: Looks good to me, but someone else must approve
XZise: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py
index db96621..89d748f 100644
--- a/pywikibot/__init__.py
+++ b/pywikibot/__init__.py
@@ -560,21 +560,25 @@
code = cached[0]
fam = cached[1]
else:
- raise Error("Unknown URL '{0}'.".format(url))
+ raise SiteDefinitionError("Unknown URL
'{0}'.".format(url))
else:
# Iterate through all families and look, which does apply to
# the given URL
for fam in config.family_files:
- family = pywikibot.family.Family.load(fam)
- code = family.from_url(url)
- if code:
- _url_cache[url] = (code, fam)
- break
+ try:
+ family = pywikibot.family.Family.load(fam)
+ code = family.from_url(url)
+ if code:
+ _url_cache[url] = (code, fam)
+ break
+ except Exception as e:
+ pywikibot.warning('Error in Family(%s).from_url: %s'
+ % (fam, e))
else:
_url_cache[url] = None
# TODO: As soon as AutoFamily is ready, try and use an
# AutoFamily
- raise Error("Unknown URL '{0}'.".format(url))
+ raise SiteDefinitionError("Unknown URL
'{0}'.".format(url))
else:
# Fallback to config defaults
code = code or config.mylang
diff --git a/pywikibot/families/wikia_family.py b/pywikibot/families/wikia_family.py
index 73e0f46..2121e9b 100644
--- a/pywikibot/families/wikia_family.py
+++ b/pywikibot/families/wikia_family.py
@@ -20,7 +20,7 @@
self.name = u'wikia'
self.langs = {
- u'wikia': None,
+ 'wikia': 'www.wikia.com',
}
def hostname(self, code):
diff --git a/pywikibot/family.py b/pywikibot/family.py
index f0d0d8f..032433d 100644
--- a/pywikibot/family.py
+++ b/pywikibot/family.py
@@ -29,7 +29,7 @@
from pywikibot import config2 as config
from pywikibot.tools import deprecated, deprecate_arg, issue_deprecation_warning
-from pywikibot.exceptions import Error, UnknownFamily, FamilyMaintenanceWarning
+from pywikibot.exceptions import UnknownFamily, FamilyMaintenanceWarning
logger = logging.getLogger("pywiki.wiki.family")
@@ -1080,40 +1080,115 @@
def nicepath(self, code):
return '/wiki/'
+ def _get_path_regex(self, code):
+ """
+ Return a regex matching a site URL path.
+
+ @return: regex string
+ @rtype: unicode
+ """
+ # The trailing slash after path(code) is optional.
+ return ('(?:%s?|%s)' %
+ (re.escape(self.path(code) + '/'),
+ re.escape(self.nicepath(code))))
+
+ def _get_url_regex(self, code):
+ """
+ Return a regex matching a site URL.
+
+ Regex match group 1 is the domain.
+
+ Does not make use of ssl_hostname or ssl_pathprefix.
+
+ @return: regex string
+ @rtype: unicode
+ """
+ return (r'(?:\/\/|%s\:\/\/)(%s)%s' %
+ (self.protocol(code),
+ re.escape(self.hostname(code)),
+ self._get_path_regex(code)))
+
def rcstream_host(self, code):
raise NotImplementedError("This family does not support RCStream")
def nice_get_address(self, code, title):
return '%s%s' % (self.nicepath(code), title)
- def _get_path_regex(self):
+ def _get_regex_all(self):
"""
- Return a regex matching the path after the domain.
+ Return a regex matching any site.
- It is using L{path} and L{nicepath} with code set to
- 'None'. If that returns a KeyError (L{scriptpath} probably
- using the C{langs} dictionary) it retries it with the key from
- C{langs} if it only contains one entry and throws an Error
- otherwise. In that case the Family instance should overwrite this
- method or supply code independent methods.
+ It is using Family methods with code set to 'None' initially.
+ That will raise KeyError if the Family methods use the code to
+ lookup the correct value in a dictionary such as C{langs}.
+ On KeyError, it retries it with each key from C{langs}.
- @raise Error: If it's not possible to automatically get a code
- independent regex.
+ @return: regex string
+ @rtype: unicode
"""
- def _get_coded_path_regex(code):
- return ('(?:' + re.escape(self.path(code) + '/') +
'|' +
- re.escape(self.nicepath(code)) + ')')
+ if hasattr(self, '_regex_all'):
+ return self._regex_all
+
try:
- return _get_coded_path_regex(None)
+ self._regex_all = self._get_url_regex(None)
+ return self._regex_all
except KeyError:
# Probably automatically generated family
- if len(self.langs) == 1:
- return _get_coded_path_regex(next(iter(self.langs.keys())))
- else:
- raise Error('Pywikibot is unable to generate an automatic '
- 'path regex for the family {0}. It is recommended '
- 'to overwrite "_get_path_regex" in that '
- 'family.'.format(self.name))
+ pass
+
+ # If there is only one code, use it.
+ if len(self.langs) == 1:
+ code = next(iter(self.langs.keys()))
+ self._regex_all = self._get_url_regex(code)
+ return self._regex_all
+
+ try:
+ protocol = self.protocol(None) + '\:\/\/'
+ except KeyError:
+ protocol = None
+
+ try:
+ hostname = re.escape(self.hostname(None))
+ except KeyError:
+ hostname = None
+
+ try:
+ path = self._get_path_regex(None)
+ except KeyError:
+ path = None
+
+ # If two or more of the three above varies, the regex cant be optimised
+ none_count = [protocol, hostname, path].count(None)
+
+ if none_count > 1:
+ self._regex_all = ('(?:%s)'
+ % '|'.join(self._get_url_regex(code)
+ for code in self.langs.keys()))
+ return self._regex_all
+
+ if not protocol:
+ protocols = set(self.protocol(code) + '\:\/\/'
+ for code in self.langs.keys())
+ protocol = '|'.join(protocols)
+
+ # Allow protocol neutral '//'
+ protocol = '(?:\/\/|%s)' % protocol
+
+ if not hostname:
+ hostnames = set(re.escape(self.hostname(code))
+ for code in self.langs.keys())
+ hostname = '|'.join(hostnames)
+
+ # capture hostname
+ hostname = '(' + hostname + ')'
+
+ if not path:
+ regexes = set(self._get_path_regex(code)
+ for code in self.langs.keys())
+ path = '(?:%s)' % '|'.join(regexes)
+
+ self._regex_all = protocol + hostname + path
+ return self._regex_all
def from_url(self, url):
"""
@@ -1123,27 +1198,35 @@
L{Family.nice_get_address} or L{Family.path}. If the protocol doesn't
match but is present in the interwikimap it'll log this.
- It uses L{Family._get_path_regex} to generate a regex defining the path
- after the domain.
+ It ignores $1 in the url, and anything that follows it.
@return: The language code of the url. None if that url is not from
this family.
@rtype: str or None
+ @raises RuntimeError: Mismatch between Family langs dictionary and
+ URL regex.
"""
- url_match = re.match(r'(?:(https?)://|//)?(.*){0}'
- '\$1'.format(self._get_path_regex()), url)
+ if '$1' in url:
+ url = url[:url.find('$1')]
+
+ url_match = re.match(self._get_regex_all(), url)
if not url_match:
return None
+
for code, domain in self.langs.items():
- if domain == url_match.group(2):
- break
- else:
- return None
- if url_match.group(1) and url_match.group(1) != self.protocol(code):
- pywikibot.log('The entry in the interwikimap uses {0} but the '
- 'family is configured to use {1}'.format(
- url_match.group(1), self.protocol(code)))
- return code
+ if domain is None:
+ warn('Family(%s): langs missing domain names' % self.name,
+ FamilyMaintenanceWarning)
+ elif domain == url_match.group(1):
+ return code
+
+ # if domain was None, this will return the only possible code.
+ if len(self.langs) == 1:
+ return next(iter(self.langs))
+
+ raise RuntimeError(
+ 'Family(%s): matched regex has not matched a domain in langs'
+ % self.name)
def maximum_GET_length(self, code):
return config.maximum_GET_length
diff --git a/pywikibot/page.py b/pywikibot/page.py
index 5f58dee..c77cc78 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -4498,11 +4498,11 @@
newsite = self._site.interwiki(prefix)
except KeyError:
break # text before : doesn't match any known prefix
- except SiteDefinitionError:
+ except SiteDefinitionError as e:
raise SiteDefinitionError(
u'{0} is not a local page on {1}, and the interwiki prefix '
- '{2} is not supported by PyWikiBot!'.format(
- self._text, self._site, prefix))
+ '{2} is not supported by PyWikiBot!:\n{3}'.format(
+ self._text, self._site, prefix, e))
else:
t = t[t.index(u":"):].lstrip(u":").lstrip(u" ")
if first_other_site:
diff --git a/pywikibot/site.py b/pywikibot/site.py
index 62b8c4f..bbac719 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -54,7 +54,6 @@
LockedNoPage,
NoPage,
UnknownSite,
- SiteDefinitionError,
FamilyMaintenanceWarning,
NoUsername,
SpamfilterError,
@@ -638,6 +637,9 @@
del new['_pagemutex']
if '_throttle' in new:
del new['_throttle']
+ # site cache contains exception information, which cant be pickled
+ if '_iw_sites' in new:
+ del new['_iw_sites']
return new
def __setstate__(self, attrs):
@@ -703,10 +705,10 @@
self._iw_sites = {}
for iw in self.siteinfo['interwikimap']:
try:
- site = (pywikibot.Site(url=iw['url']), 'local' in
iw)
- except Error:
- site = (None, False)
- self._iw_sites[iw['prefix']] = site
+ site = pywikibot.Site(url=iw['url'])
+ except Exception as e:
+ site = e
+ self._iw_sites[iw['prefix']] = (site, 'local' in iw)
def interwiki(self, prefix):
"""
@@ -719,11 +721,13 @@
self._cache_interwikimap()
if prefix in self._iw_sites:
site = self._iw_sites[prefix]
- if site[0]:
+ if isinstance(site[0], BaseSite):
return site[0]
+ elif isinstance(site[0], Exception):
+ raise site[0]
else:
- raise SiteDefinitionError(
- u"No family/site found for prefix
'{0}'".format(prefix))
+ raise TypeError('_iw_sites[%s] is wrong type: %s'
+ % (prefix, type(site[0])))
else:
raise KeyError(u"'{0}' is not an interwiki
prefix.".format(prefix))
diff --git a/tests/family_tests.py b/tests/family_tests.py
index 24e9c95..19c447d 100644
--- a/tests/family_tests.py
+++ b/tests/family_tests.py
@@ -11,6 +11,7 @@
from pywikibot.family import Family
from pywikibot.exceptions import UnknownFamily
+
import pywikibot.site
from tests.aspects import (
@@ -91,6 +92,66 @@
self.assertEqual(family.obsolete['ru-sib'], None)
+class TestFamilyUrlRegex(TestCase):
+
+ """Test family URL regex."""
+
+ net = False
+
+ def test_get_regex_wikipedia_precise(self):
+ """Test the family regex is optimal."""
+ f = Family.load('wikipedia')
+ regex = f._get_regex_all()
+
+ self.assertTrue(regex.startswith('(?:\/\/|https\:\/\/)('))
+ self.assertIn('vo\.wikipedia\.org', regex)
+ self.assertTrue(regex.endswith(')(?:\/w\/index\.php\/?|\/wiki\/)'))
+
+ def test_from_url_wikipedia_extra(self):
+ """Test various URLs against wikipedia regex."""
+ f = Family.load('wikipedia')
+
+ prefix = 'https://vo.wikipedia.org'
+
+ self.assertEqual(f.from_url(prefix + '/wiki/'), 'vo')
+ self.assertEqual(f.from_url(prefix + '/w/index.php'), 'vo')
+ self.assertEqual(f.from_url(prefix + '/w/index.php/'), 'vo')
+ self.assertEqual(f.from_url(prefix + '/w/index.php?title=$1'),
'vo')
+
+ self.assertEqual(f.from_url(prefix + '/wiki/$1'), 'vo')
+
self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1'), 'vo')
+
self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1/foo'),
'vo')
+ self.assertEqual(f.from_url(prefix + '/w/index.php/$1'), 'vo')
+
self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1'), 'vo')
+
self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1/foo'),
'vo')
+
+ # wrong protocol
+
self.assertIsNone(f.from_url('http://vo.wikipedia.org/wiki/$1'))
+
self.assertIsNone(f.from_url('ftp://vo.wikipedia.org/wiki/$1'))
+ # wrong code
+
self.assertIsNone(f.from_url('https://foobar.wikipedia.org/wiki/$1'))
+ # wrong family
+
self.assertIsNone(f.from_url('https://vo.wikibooks.org/wiki/$1'))
+
self.assertIsNone(f.from_url('http://vo.wikibooks.org/wiki/$1'))
+ # invalid path
+
self.assertIsNone(f.from_url('https://vo.wikipedia.org/wik/$1'))
+
self.assertIsNone(f.from_url('https://vo.wikipedia.org/index.php/$1'))
+
+ def test_each_family(self):
+ """Test each family builds a working regex."""
+ for family in pywikibot.config.family_files:
+ family = Family.load(family)
+ # Test family does not respond to from_url due to overlap
+ # with Wikipedia family.
+ if family.name == 'test':
+ continue
+ for code in family.langs:
+ url = ('%s://%s%s$1' % (family.protocol(code),
+ family.hostname(code),
+ family.path(code)))
+ self.assertEqual(family.from_url(url), code)
+
+
class TestOldFamilyMethod(DeprecationTestCase):
"""Test cases for old site.Family method."""
--
To view, visit
https://gerrit.wikimedia.org/r/182406
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Idf16bf08db9dbea58197004a39c66296d2f6e713
Gerrit-PatchSet: 10
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>