Revision: 7920
Author: xqt
Date: 2010-02-09 14:34:01 +0000 (Tue, 09 Feb 2010)
Log Message:
-----------
family/wikipedia: read redirect tags via API (dict removed)
cc: add fixArabicLetters
solve_disambiguation: update exception list
pywikibot: bugfixes
Modified Paths:
--------------
trunk/pywikipedia/cosmetic_changes.py
trunk/pywikipedia/family.py
trunk/pywikipedia/pywikibot/__init__.py
trunk/pywikipedia/pywikibot/textlib.py
trunk/pywikipedia/solve_disambiguation.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/cosmetic_changes.py
===================================================================
--- trunk/pywikipedia/cosmetic_changes.py 2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/cosmetic_changes.py 2010-02-09 14:34:01 UTC (rev 7920)
@@ -275,6 +275,7 @@
text = self.fixHtml(text)
text = self.fixStyle(text)
text = self.fixTypo(text)
+ text = self.fixArabicLetters(text)
try:
text = isbn.hyphenateIsbnNumbers(text)
except isbn.InvalidIsbnException, error:
@@ -679,6 +680,40 @@
text = pywikibot.replaceExcept(text, ur'º([CF])', ur'°\1',
exceptions)
return text
+ def fixArabicLetters(self, text):
+ if self.site.lang=='ckb':
+ exceptions = [
+ 'gallery',
+ 'hyperlink',
+ 'interwiki',
+ 'link',
+ 'math',
+ 'pre',
+ 'template',
+ 'timeline',
+ 'ref',
+ 'source',
+ 'startspace',
+ ]
+ text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
+ text = pywikibot.replaceExcept(text, ur'ه([.، ])', ur'ە\1', exceptions)
+ text = pywikibot.replaceExcept(text, u'ه', u'ە',
exceptions)
+ text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
+ text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
+ text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
+ # replace persian digits
+ for i in range(0,10):
+ text = pywikibot.replaceExcept(text, u'۰۱۲۳۴۵۶۷۸۹'[i], u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
+ # do not change digits in class, style and table params
+ pattern = re.compile(u'=".*?"', re.UNICODE)
+ exceptions.append(pattern)
+ # do not change digits inside html-tags
+ pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
+ exceptions.append(pattern)
+ for i in range(0,10):
+ text = pywikibot.replaceExcept(text, str(i), u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
+ return text
+
class CosmeticChangesBot:
def __init__(self, generator, acceptall = False, comment=u'Robot: Cosmetic changes'):
self.generator = generator
Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py 2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/family.py 2010-02-09 14:34:01 UTC (rev 7920)
@@ -3449,178 +3449,6 @@
def category_namespaces(self, code):
return self.namespace(code, 14, all = True)
- # Localised magic words for language code 'xyz' can be found in
- # the MediaWiki source code in the file
- # /mediawiki/trunk/phase3/languages/messages/MessagesXyz.php
- # in the 'magicwords' array
-
- # Localised redirect codes
-
- # Note that redirect codes are case-insensitive, so it is enough
- # to enter the code in lowercase here.
-
- # When creating a redirect page, only the first item is looked for.
- # When matching for redirects, default 'redirect' is always inserted
- # => if default redirect keyword used for a language is not 'redirect',
- # it is not necessary to add 'redirect' at the end of the list
- redirect = {
- 'ab': [u'перенаправление', u'перенапр', u'redirect'],
- 'ace': [u'alih'],
- 'af': [u'aanstuur'],
- 'aln': [u'ridrejto'],
- 'als': [u'weiterleitung'],
- 'an': [u'redirección'],
- 'ar': [u'تحويل'],
- 'arn': [u'redirección'],
- 'arz': [u'تحويل'],
- 'av': [u'перенаправление', u'перенапр'],
- 'ay': [u'redirección'],
- 'ba': [u'перенаправление', u'перенапр'],
- 'bar': [u'weiterleitung'],
- 'bat-smg': [u'peradresavimas'],
- 'bcc': [u'تغییرمسیر'],
- 'be-tarask': [u'перанакіраваньне'],
- 'be-x-old': [u'перанакіраваньне'],
- 'bg': [u'виж', u'пренасочване'],
- 'bm': [u'redirection'],
- 'bqi': [u'تغییرمسیر'],
- 'br': [u'adkas'],
- 'bug': [u'alih'],
- 'bs': [u'preusmjeri'],
- 'cbk-zam': [u'redirección'],
- 'ce': [u'перенаправление', u'перенапр'],
- 'cs': [u'přesměruj'],
- 'cu': [u'прѣнаправлєниѥ'],
- 'cv': [u'перенаправление', u'перенапр'],
- 'cy': [u'ail-cyfeirio', u'ailgyfeirio'],
- 'de': [u'weiterleitung'],
- 'de-at': [u'weiterleitung'],
- 'de-ch': [u'weiterleitung'],
- 'de-formal': [u'weiterleitung'],
- 'dsb': [u'weiterleitung'],
- 'el': [u'ανακατευθυνση'],
- 'eml': [u'rinvia', u'rinvio'],
- 'eo': [u'alidirektu'],
- 'es': [u'redirección'],
- 'et': [u'suuna'],
- 'eu': [u'birzuzendu'],
- 'fa': [u'تغییرمسیر'],
- 'ff': [u'redirection'],
- 'fi': [u'ohjaus', u'uudelleenohjaus'],
- 'fiu-vro': [u'saadaq'],
- 'fr': [u'redirection'],
- 'frp': [u'redirèccion', u'redirection'],
- 'fur': [u'rinvia', u'rinvio'],
- 'ga': [u'athsheoladh'],
- 'gag': [u'yönlendirme'],
- 'gl': [u'redirección'],
- 'glk': [u'تغییرمسیر'],
- 'gn': [u'redirección'],
- 'gsw': [u'weiterleitung'],
- 'he': [u'הפניה'],
- 'hr': [u'preusmjeri'],
- 'hsb': [u'weiterleitung'],
- 'ht': [u'redirection'],
- 'hu': [u'átirányítás'],
- 'hy': [u'վերահղում'],
- 'id': [u'alih'],
- 'inh': [u'перенаправление', u'перенапр'],
- 'is': [u'tilvísun'],
- 'it': [u'rinvia', u'rinvio'],
- 'ja': [u'転送', u'リダイレクト'],
- 'jv': [u'alih'],
- 'ka': [u'გადამისამართება'],
- 'kaa': [u'aýdaw', u'айдау'],
- 'kk': [u'айдау'],
- 'kk-arab': [u'ايداۋ'],
- 'kk-cyrl': [u'АЙДАУ'],
- 'kk-latn': [u'aýdaw', u'айдау'],
- 'km': [u'\u1794\u1789\u17d2\u1787\u17bc\u1793\u1794\u1793\u17d2\u178f',
- u'\u1794\u17d2\u178f\u17bc\u179a\u1791\u17b8\u178f\u17b6\u17c6\u1784',
- u'\u1794\u17d2\u178a\u17bc\u179a\u1785\u17c6\u178e\u1784\u1787\u17be\u1784',
- u'ប្តូរទីតាំងទៅ'],
- 'ko': [u'넘겨주기'],
- 'ksh': [u'ömleide op', u'ömleidung'],
- 'kv': [u'перенаправление', u'перенапр'],
- 'lad': [u'redirección'],
- 'lb': [u'weiterleitung'],
- 'lbe': [u'перенаправление', u'перенапр'],
- 'li': [u'doorverwijzing'],
- 'lij': [u'rinvia', u'rinvio'],
- 'lld': [u'rinvia', u'rinvio'],
- 'lmo': [u'rinvia', u'rinvio'],
- 'ln': [u'redirection'],
- 'lt': [u'peradresavimas'],
- 'map-bms': [u'alih'],
- 'mg': [u'redirection'],
- 'mhr': [u'перенаправление', u'перенапр'],
- 'mk': [u'пренасочување', u'види'],
- 'ml': [u'തിരിച്ചുവിടുക', u'തിരിച്ചുവിടല്'],
- 'mo': [u'redirecteaza'],
- 'mr': [u'पुनर्निर्देशन'],
- 'mt': [u'rindirizza'],
- 'mwl': [u'ancaminar'],
- 'myv': [u'перенаправление', u'перенапр'],
- 'mzn': [u'تغییرمسیر'],
- 'nah': [u'redirección'],
- 'nap': [u'rinvia'],
- 'nds': [u'wiederleiden', u'weiterleitung'],
- 'nds-nl': [u'deurverwiezing', u'doorverwijzing'],
- 'new': [u'पुनर्निर्देश'],
- 'nl': [u'doorverwijzing'],
- 'nn': [u'omdiriger'],
- 'no': [u'omdirigering'],
- 'oc': [u'redireccion'],
- 'os': [u'рарвыст', u'перенаправление', u'перенапр'],
- 'pdc': [u'weiterleitung'],
- 'pl': [u'patrz', u'przekieruj', u'tam'],
- 'pms': [u'rinvia', u'rinvio'],
- 'pt': [u'redirecionamento'],
- 'pt-br': [u'redirecionamento'],
- 'qu': [u'pusapuna', u'redirección'],
- 'rmy': [u'redirecteaza'],
- 'ro': [u'redirecteaza'],
- 'ru': [u'перенаправление', u'перенапр'],
- 'sa': [u'पुनर्निदेशन'],
- 'sah': [u'перенаправление', u'перенапр'],
- 'scn': [u'rinvia', u'rinvio'],
- 'sd': [u'چوريو'],
- 'sg': [u'redirection'],
- 'shi': [u'تحويل'],
- 'si': [u'යළියොමුව'],
- 'sk': [u'presmeruj'],
- 'sl': [u'preusmeritev'],
- 'sli': [u'weiterleitung'],
- 'sq': [u'ridrejto'],
- 'sr': [u'преусмери', u'преусмери'],
- 'sr-ec': [u'преусмери'],
- 'sr-el': [u'preusmeri'],
- 'srn': [u'stir', u'doorverwijzing'],
- 'stq': [u'weiterleitung'],
- 'su': [u'alih'],
- 'sv': [u'omdirigering'],
- 'szl': [u'patrz', u'przekieruj', u'tam'],
- 'ta': [u'வழிமாற்று'],
- 'te': [u'దారిమార్పు'],
- 'th': [u'เปลี่ยนทาง'],
- 'tr': [u'yönlendirme'],
- 'tt': [u'yünältü'],
- 'tt-latn': [u'yünältü'],
- 'tt-cyrl': [u'перенаправление', u'перенапр'],
- 'ty': [u'redirection'],
- 'udm': [u'перенаправление', u'перенапр'],
- 'uk': [u'перенаправлення', u'перенаправление', u'перенапр'],
- 'vec': [u'rinvia', u'rinvio'],
- 'vep': [u'suuna'],
- 'vi': [u'đổi', u'đổi'],
- 'vls': [u'doorverwijzing'],
- 'vro': [u'saadaq', u'suuna'],
- 'wa': [u'redirection'],
- 'wo': [u'redirection'],
- 'yi': [u'ווייטערפירן', u'הפניה'],
- 'zea': [u'doorverwijzing']
- }
-
# So can be pagename code
pagename = {
'bg': [u'СТРАНИЦА'],
Modified: trunk/pywikipedia/pywikibot/__init__.py
===================================================================
--- trunk/pywikipedia/pywikibot/__init__.py 2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/pywikibot/__init__.py 2010-02-09 14:34:01 UTC (rev 7920)
@@ -16,7 +16,9 @@
import wikipedia
+link_regex = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')
+
def showDiff(oldtext, newtext):
"""
Output a string showing the differences between oldtext and newtext.
Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py 2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/pywikibot/textlib.py 2010-02-09 14:34:01 UTC (rev 7920)
@@ -198,14 +198,19 @@
'parts' parameter, which defaults to all.
"""
regexes = {
- 'comments' : r'<!--.*?-->',
- 'includeonly': r'<includeonly>.*?</includeonly>',
- 'nowiki': r'<nowiki>.*?</nowiki>',
- 'pre': r'<pre>.*?</pre>',
- 'source': r'<source .*?</source>',
+ 'comments' : r'<!--.*?-->',
+ 'includeonly': r'<includeonly>.*?</includeonly>',
+ 'nowiki': r'<nowiki>.*?</nowiki>',
+ 'pre': r'<pre>.*?</pre>',
+ 'source': r'<source .*?</source>',
+ 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>',
}
if '*' in tags:
tags = regexes.keys()
+ # add alias
+ tags = set(tags)
+ if 'source' in tags:
+ tags.add('syntaxhighlight')
toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]),
re.IGNORECASE | re.DOTALL)
return toRemoveR.sub('', text)
@@ -254,9 +259,9 @@
marker = text[firstinseparator:firstinmarker] + marker
return marker
-
+#-------------------------------------------------
# Functions dealing with interwiki language links
-
+#-------------------------------------------------
# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
# interproject. These functions only deal with links to a
# corresponding page in another language on the same project (e.g.,
@@ -302,8 +307,8 @@
site = insite.getSite(code = lang)
try:
result[site] = pywikibot.Page(site, pagetitle, insite = insite)
- except InvalidTitle:
- output(
+ except pywikibot.InvalidTitle:
+ pywikibot.output(
u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
% (lang, pagetitle))
continue
@@ -486,8 +491,9 @@
sites = insite.interwiki_putfirst_doubled(sites) + sites
return sites
-
+#---------------------------------------
# Functions dealing with category links
+#---------------------------------------
def getCategoryLinks(text, site):
import catlib
@@ -665,6 +671,9 @@
#catLinks.sort()
return sep.join(catLinks) + '\r\n'
+#---------------------------------------
+# Functions dealing with external links
+#---------------------------------------
def compileLinkR(withoutBracketed=False, onlyBracketed=False):
"""Return a regex that matches external links."""
@@ -695,6 +704,9 @@
linkR = re.compile(regex)
return linkR
+#----------------------------------
+# Functions dealing with templates
+#----------------------------------
def extract_templates_and_params(text, get_redirect=False):
"""Return list of template calls found in text.
@@ -805,7 +817,9 @@
result.append((name, params))
return result
+#----------------
# I18N functions
+#----------------
# Languages to use for comment text after the actual language but before
# en:. For example, if for language 'xx', you want the preference of
Modified: trunk/pywikipedia/solve_disambiguation.py
===================================================================
--- trunk/pywikipedia/solve_disambiguation.py 2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/solve_disambiguation.py 2010-02-09 14:34:01 UTC (rev 7920)
@@ -266,12 +266,12 @@
u'Benutzer:SrbBot.*',
u'Benutzer:PortalBot/.+',
u'Benutzer:Xqbot/.+',
- u'Benutzer Diskussion:.+',
u'Lehnwort',
u'Liste griechischer Wortstämme in deutschen Fremdwörtern',
u'Liste von Gräzismen',
u'Portal:Abkürzungen/.+',
u'Portal:Astronomie/Moves',
+ u'Portal:Astronomie/Index/.+',
u'Wikipedia:Administratoren/Anfragen',
u'Wikipedia:Archiv/.+',
u'Wikipedia:Artikelwünsche/Ding-Liste/[A-Z]',
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-02-08 15:37:02 UTC (rev 7919)
+++ trunk/pywikipedia/wikipedia.py 2010-02-09 14:34:01 UTC (rev 7920)
@@ -7437,10 +7437,12 @@
special redirect tag.
"""
- if default:
- return self.family.redirect.get(self.lang, [u"REDIRECT"])[0]
- else:
- return self.family.redirect.get(self.lang, None)
+ tag = self.siteinfo('magicwords').get('redirect')[0][1:]
+ if tag:
+ # remove first "#" letter
+ return tag[0][1:]
+ elif default:
+ return u'REDIRECT'
def redirectRegex(self):
"""Return a compiled regular expression matching on redirect
pages.
@@ -7448,24 +7450,23 @@
Group 1 in the regex match object will be the target title.
"""
-
+ #NOTE: this is needed, since the API can give false positives!
+ default = 'REDIRECT'
try:
- redirKeywords = [u'redirect'] + self.family.redirect[self.lang]
- redirKeywordsR = r'(?:' + '|'.join(redirKeywords) + ')'
+ keywords = self.siteinfo('magicwords')['redirect']
+ pattern = r'(?:' + '|'.join(keywords) + ')'
except KeyError:
# no localized keyword for redirects
- redirKeywordsR = r'redirect'
-
- # A redirect starts with hash (#), followed by a keyword, then
- # arbitrary stuff, then a wikilink. The wikilink may contain
- # a label, although this is not useful.
-
+ pattern = r'#%s' % default
if self.versionnumber() > 12:
# in MW 1.13 (at least) a redirect directive can follow whitespace
prefix = r'\s*'
else:
prefix = r'[\r\n]*'
- return re.compile(prefix + '#' + redirKeywordsR
+ # A redirect starts with hash (#), followed by a keyword, then
+ # arbitrary stuff, then a wikilink. The wikilink may contain
+ # a label, although this is not useful.
+ return re.compile(prefix + pattern
+ '\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]',
re.IGNORECASE | re.UNICODE | re.DOTALL)