Revision: 8630
Author: xqt
Date: 2010-10-09 19:32:57 +0000 (Sat, 09 Oct 2010)
Log Message:
-----------
Import wikipedia as pywikibot to prepare these scripts for merging into the rewrite branch.
Modified Paths:
--------------
trunk/pywikipedia/reflinks.py
trunk/pywikipedia/revertbot.py
trunk/pywikipedia/selflink.py
trunk/pywikipedia/spamremove.py
trunk/pywikipedia/speedy_delete.py
trunk/pywikipedia/spellcheck.py
trunk/pywikipedia/standardize_interwiki.py
trunk/pywikipedia/standardize_notes.py
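
All eight scripts listed above receive the same mechanical change: the compat framework module wikipedia is imported under the alias pywikibot and every call site is switched to that alias, so the files already read like rewrite-branch code and should merge with fewer conflicts later. A minimal sketch of the pattern, assuming it runs inside the pywikipedia trunk with a configured user-config.py (the helper function and page title are illustrative only, not part of this commit):

    # Old style, as removed by this commit:
    #     import wikipedia, pagegenerators
    #     site = wikipedia.getSite()
    #
    # New style: the compat module is bound to the rewrite's name.
    import wikipedia as pywikibot

    def show_page(title=u'Example'):
        # Call sites use the pywikibot.* prefix, mirroring the renamed calls
        # in the diffs below (getSite, Page, output, title(asLink=True), ...).
        site = pywikibot.getSite()
        page = pywikibot.Page(site, title)
        pywikibot.output(u'Working on %s' % page.title(asLink=True))

    if __name__ == "__main__":
        try:
            show_page()
        finally:
            pywikibot.stopme()

Only the import alias and call-site prefixes change; script behaviour is intended to stay the same.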
Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py 2010-10-09 16:11:46 UTC (rev 8629)
+++ trunk/pywikipedia/reflinks.py 2010-10-09 19:32:57 UTC (rev 8630)
@@ -33,15 +33,19 @@
Basic pagegenerators commands, -page, etc...
"""
# (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
+# (C) Pywikipedia bot team, 2008-2010
#
-# Distributed under the terms of the GPL
-
+# Distributed under the terms of the MIT license.
+#
__version__ = '$Id$'
+#
-from BeautifulSoup import UnicodeDammit
import sys, re, urllib2, httplib, socket, codecs, ftplib
-import wikipedia, pagegenerators, noreferences
import subprocess, tempfile, os, gzip, StringIO
+import wikipedia as pywikibot
+from BeautifulSoup import UnicodeDammit
+import pagegenerators
+import noreferences
stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe',
@@ -90,9 +94,13 @@
'it':u'Titolo generato automaticamente',
}
-soft404 = re.compile(ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', re.IGNORECASE)
+soft404 = re.compile(
+ ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog',
+ re.IGNORECASE)
# matches an URL at the index of a website
-dirIndex = re.compile(ur'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', re.IGNORECASE)
+dirIndex = re.compile(
+    ur'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$',
+ re.IGNORECASE)
# Extracts the domain name
domain = re.compile(ur'^(\w+)://(?:www.|)([^/]+)')
@@ -156,7 +164,7 @@
self.xmlStart = xmlStart
self.namespaces = namespaces
self.skipping = bool(xmlStart)
- self.site = wikipedia.getSite()
+ self.site = pywikibot.getSite()
import xmlreader
dump = xmlreader.XmlDump(xmlFilename)
@@ -175,7 +183,7 @@
if entry.title != self.xmlStart:
continue
self.skipping = False
- page=wikipedia.Page(self.site, entry.title)
+ page=pywikibot.Page(self.site, entry.title)
if not self.namespaces == []:
if page.namespace() not in self.namespaces:
continue
@@ -188,14 +196,16 @@
def __init__(self, link, name):
self.refname = name
self.link = link
- self.site = wikipedia.getSite()
- self.linkComment = wikipedia.translate(self.site, comment)
+ self.site = pywikibot.getSite()
+ self.linkComment = pywikibot.translate(self.site, comment)
self.url = re.sub(u'#.*', '', self.link)
self.title = None
def refTitle(self):
"""Returns the <ref> with its new title"""
-        return '<ref%s>[%s %s<!-- %s -->]</ref>' % (self.refname, self.link, self.title, self.linkComment)
+        return '<ref%s>[%s %s<!-- %s -->]</ref>' % (self.refname, self.link,
+                                                    self.title,
+                                                    self.linkComment)
    def refLink(self):
        """No title has been found, return the unbracketed link"""
@@ -203,14 +213,14 @@
def refDead(self):
"""Dead link, tag it with a {{dead link}}"""
- tag = wikipedia.translate(self.site, deadLinkTag) % self.link
+ tag = pywikibot.translate(self.site, deadLinkTag) % self.link
return '<ref%s>%s</ref>' % (self.refname, tag)
def transform(self, ispdf = False):
"""Normalize the title"""
#convert html entities
if not ispdf:
- self.title = wikipedia.html2unicode(self.title)
+ self.title = pywikibot.html2unicode(self.title)
self.title = re.sub(r'-+', '-', self.title)
#remove formatting, i.e long useless strings
self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
@@ -228,7 +238,7 @@
        self.title = self.title.replace('}}', '}&#125;')
#prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\'&#39;')
- self.title = wikipedia.unicode2html(self.title, self.site.encoding())
+ self.title = pywikibot.unicode2html(self.title, self.site.encoding())
# TODO : remove HTML when both opening and closing tags are included
def avoid_uppercase(self):
@@ -257,10 +267,13 @@
"""
def __init__(self):
# Match references
-        self.REFS = re.compile(u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
-        self.NAMES = re.compile(u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*')
-        self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*')
- self.autogen = wikipedia.translate(wikipedia.getSite(), autogen)
+        self.REFS = re.compile(
+            u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
+        self.NAMES = re.compile(
+            u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*')
+        self.GROUPS = re.compile(
+            u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*')
+ self.autogen = pywikibot.translate(pywikibot.getSite(), autogen)
def process(self, text):
# keys are ref groups
@@ -299,7 +312,7 @@
#First name associated with this content
if name == 'population':
- wikipedia.output(content)
+ pywikibot.output(content)
if not name in foundRefNames:
# first time ever we meet this name
if name == 'population':
@@ -350,11 +363,13 @@
name = v[0]
if v[1]:
name = u'"%s"' % name
-            text = re.sub(u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, u'<ref name=%s />' % name, text)
+            text = re.sub(
+                u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k,
+ u'<ref name=%s />' % name, text)
return text
class ReferencesRobot:
- def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ):
+ def __init__(self, generator, acceptall=False, limit=None, ignorepdf=False):
"""
- generator : Page generator
- acceptall : boolean, is -always on ?
@@ -365,10 +380,11 @@
self.acceptall = acceptall
self.limit = limit
self.ignorepdf = ignorepdf
- self.site = wikipedia.getSite()
-        self.stopPage = wikipedia.Page(self.site, wikipedia.translate(self.site, stopPage))
+ self.site = pywikibot.getSite()
+ self.stopPage = pywikibot.Page(self.site,
+ pywikibot.translate(self.site, stopPage))
- local = wikipedia.translate(self.site, badtitles)
+ local = pywikibot.translate(self.site, badtitles)
if local:
bad = '(' + globalbadtitles + '|' + local + ')'
else:
@@ -380,9 +396,9 @@
try :
self.stopPageRevId = self.stopPage.latestRevision()
- except wikipedia.NoPage :
- wikipedia.output(u'The stop page %s does not exist'
- % self.stopPage.aslink())
+ except pywikibot.NoPage :
+ pywikibot.output(u'The stop page %s does not exist'
+ % self.stopPage.title(asLink=True))
raise
# Regex to grasp content-type meta HTML tag in HTML source
@@ -392,20 +408,22 @@
# Extract html title from page
        self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
# Matches content inside <script>/<style>/HTML comments
-        self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
+        self.NON_HTML = re.compile(
+            ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
# Authorized mime types for HTML pages
-        self.MIME = re.compile(ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
+ self.MIME = re.compile(
+ ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
def put_page(self, page, new):
"""
Prints diffs between orginal and new (text), puts new text for page
"""
-        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
% page.title())
- wikipedia.showDiff(page.get(), new)
+ pywikibot.showDiff(page.get(), new)
if not self.acceptall:
- choice = wikipedia.inputChoice(u'Do you want to accept ' +
+ choice = pywikibot.inputChoice(u'Do you want to accept ' +
u'these changes?',
['Yes', 'No', 'All'],
                                           ['y', 'N', 'a'], 'N')
@@ -416,22 +434,24 @@
if self.acceptall:
try:
page.put(new)
- except wikipedia.EditConflict:
- wikipedia.output(u'Skipping %s because of edit conflict'
+ except pywikibot.EditConflict:
+ pywikibot.output(u'Skipping %s because of edit conflict'
% (page.title(),))
- except wikipedia.SpamfilterError, e:
-                wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
- except wikipedia.PageNotSaved, error:
- wikipedia.output(u'Error putting page: %s' % (error.args,))
- except wikipedia.LockedPage:
- wikipedia.output(u'Skipping %s (locked page)'
+ except pywikibot.SpamfilterError, e:
+ pywikibot.output(
+ u'Cannot change %s because of blacklist entry %s'
+ % (page.title(), e.url))
+ except pywikibot.PageNotSaved, error:
+ pywikibot.output(u'Error putting page: %s' % (error.args,))
+ except pywikibot.LockedPage:
+ pywikibot.output(u'Skipping %s (locked page)'
% (page.title(),))
- except wikipedia.ServerError, e:
- wikipedia.output(u'Server Error : %s' % e)
+ except pywikibot.ServerError, e:
+ pywikibot.output(u'Server Error : %s' % e)
def httpError(self, err_num, link, pagetitleaslink):
"""Log HTTP Error"""
- wikipedia.output(u'HTTP error (%s) for %s on %s'
+ pywikibot.output(u'HTTP error (%s) for %s on %s'
% (err_num, link, pagetitleaslink),
toStdout = True)
@@ -440,24 +460,27 @@
Use pdfinfo to retrieve title from a PDF.
Unix-only, I'm afraid.
"""
- wikipedia.output( u'PDF file.' )
+ pywikibot.output( u'PDF file.' )
fd, infile = tempfile.mkstemp()
urlobj = os.fdopen(fd, 'r+w')
urlobj.write(f.read())
try:
-            pdfinfo_out = subprocess.Popen([r"pdfinfo","/dev/stdin"], stdin=urlobj, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False).communicate()[0]
+            pdfinfo_out = subprocess.Popen([r"pdfinfo","/dev/stdin"],
+                                             stdin=urlobj, stdout=subprocess.PIPE,
+                                             stderr=subprocess.PIPE,
+                                             shell=False).communicate()[0]
for aline in pdfinfo_out.splitlines():
if aline.lower().startswith('title'):
ref.title = aline.split(None)[1:]
ref.title = ' '.join(ref.title)
-                    if ref.title != '': wikipedia.output(u'title: ' +ref.title )
- wikipedia.output( u'PDF done.' )
+ if ref.title != '':
+ pywikibot.output(u'title: %s' % ref.title)
+ pywikibot.output(u'PDF done.')
except ValueError:
- wikipedia.output( u'pdfinfo value error.' )
+ pywikibot.output(u'pdfinfo value error.')
except OSError:
- wikipedia.output( u'pdfinfo OS error.' )
+ pywikibot.output(u'pdfinfo OS error.')
except: # Ignore errors
- wikipedia.output( u'PDF processing error.' )
+ pywikibot.output(u'PDF processing error.')
pass
finally:
urlobj.close()
@@ -467,11 +490,12 @@
"""
Runs the Bot
"""
- wikipedia.setAction(wikipedia.translate(self.site, msg))
+ pywikibot.setAction(pywikibot.translate(self.site, msg))
try:
            deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
except IOError:
-            wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory')
+            pywikibot.output(
+                'You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory')
raise
socket.setdefaulttimeout(30)
editedpages = 0
@@ -480,17 +504,18 @@
# Load the page's text from the wiki
new_text = page.get()
if not page.canBeEdited():
- wikipedia.output(u"You can't edit page %s"
- % page.aslink())
+ pywikibot.output(u"You can't edit page %s"
+ % page.title(asLink=True))
continue
- except wikipedia.NoPage:
- wikipedia.output(u'Page %s not found' % page.aslink())
+ except pywikibot.NoPage:
+ pywikibot.output(u'Page %s not found' % page.title(asLink=True))
continue
- except wikipedia.IsRedirectPage:
- wikipedia.output(u'Page %s is a redirect' % page.aslink())
+ except pywikibot.IsRedirectPage:
+ pywikibot.output(u'Page %s is a redirect'
+ % page.title(asLink=True))
continue
- for match in linksInRef.finditer(wikipedia.removeDisabledParts(page.get())):
+ for match in linksInRef.finditer(pywikibot.removeDisabledParts(page.get())):
#for each link to change
link = match.group(u'url')
#debugging purpose
@@ -508,17 +533,24 @@
headers = f.info()
contentType = headers.getheader('Content-Type')
if contentType and not self.MIME.search(contentType):
-                    if ref.link.lower().endswith('.pdf') and not self.ignorepdf:
+ if ref.link.lower().endswith('.pdf') and \
+ not self.ignorepdf:
# If file has a PDF suffix
self.getPDFTitle(ref, f)
else:
-                        wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link)
+                        pywikibot.output(
+                            u'\03{lightyellow}WARNING\03{default} : media : %s '
+                            % ref.link)
if ref.title:
-                        if not re.match('(?i) *microsoft (word|excel|visio)', ref.title):
+ if not re.match(
+ '(?i) *microsoft (word|excel|visio)',
+ ref.title):
ref.transform(ispdf=True)
repl = ref.refTitle()
else:
-                            wikipedia.output('\03{lightyellow}WARNING\03{default} : PDF title blacklisted : %s ' % ref.title)
+                            pywikibot.output(
+                                '\03{lightyellow}WARNING\03{default} : PDF title blacklisted : %s '
+                                % ref.title)
repl = ref.refLink()
else:
repl = ref.refLink()
@@ -526,12 +558,19 @@
continue
# Get the real url where we end (http redirects !)
redir = f.geturl()
-                if redir != ref.link and domain.findall(redir) == domain.findall(link):
- if soft404.search(redir) and not soft404.search(ref.link):
-                        wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect 404 : %s ' % ref.link)
+ if redir != ref.link and \
+ domain.findall(redir) == domain.findall(link):
+ if soft404.search(redir) and \
+ not soft404.search(ref.link):
+ pywikibot.output(
+                            u'\03{lightyellow}WARNING\03{default} : Redirect 404 : %s '
+ % ref.link)
continue
- if dirIndex.match(redir) and not dirIndex.match(ref.link):
-                        wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % ref.link)
+ if dirIndex.match(redir) and \
+ not dirIndex.match(ref.link):
+ pywikibot.output(
+                            u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s '
+ % ref.link)
continue
# uncompress if necessary
@@ -548,15 +587,21 @@
socket.setdefaulttimeout(None)
except UnicodeError:
-                #example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html in [[fr:Cyanure]]
-                wikipedia.output(u'\03{lightred}Bad link\03{default} : %s in %s' % (ref.url, page.aslink()))
+                #example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
+ # in [[fr:Cyanure]]
+ pywikibot.output(
+ u'\03{lightred}Bad link\03{default} : %s in %s'
+ % (ref.url, page.title(asLink=True)))
continue
except urllib2.HTTPError, e:
- wikipedia.output(u'HTTP error (%s) for %s on %s'
- % (e.code, ref.url, page.aslink()),
+ pywikibot.output(u'HTTP error (%s) for %s on %s'
+ % (e.code, ref.url,
+ page.title(asLink=True)),
toStdout = True)
- # 410 Gone, indicates that the resource has been purposely removed
-                if e.code == 410 or (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
+ # 410 Gone, indicates that the resource has been purposely
+ # removed
+ if e.code == 410 or \
+ (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
repl = ref.refDead()
new_text = new_text.replace(match.group(), repl)
continue
@@ -565,7 +610,8 @@
IOError,
httplib.error), e:
            #except (urllib2.URLError, socket.timeout, ftplib.error, httplib.error, socket.error), e:
-                wikipedia.output(u'Can\'t retrieve page %s : %s' % (ref.url, e))
+ pywikibot.output(u'Can\'t retrieve page %s : %s'
+ % (ref.url, e))
continue
except ValueError:
#Known bug of httplib, google for :
@@ -606,21 +652,25 @@
else:
enc.append(tmp)
else:
- wikipedia.output(u'No charset found for %s' % ref.link)
+ pywikibot.output(u'No charset found for %s' % ref.link)
#continue # do not process pages without charset
if not contentType:
- wikipedia.output(u'No content-type found for %s' % ref.link)
+ pywikibot.output(u'No content-type found for %s' % ref.link)
continue
elif not self.MIME.search(contentType):
-                wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link)
+ pywikibot.output(
+ u'\03{lightyellow}WARNING\03{default} : media : %s '
+ % ref.link)
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
continue
-            # Ugly hacks to try to survive when both server and page return no encoding.
+ # Ugly hacks to try to survive when both server and page
+ # return no encoding.
# Uses most used encodings for each national suffix
if u'.ru' in ref.link or u'.su' in ref.link:
-                # see http://www.sci.aha.ru/ATL/ra13a.htm : no server encoding, no page encoding
+                # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
+ # encoding, no page encoding
enc = enc + ['koi8-r', 'windows-1251']
elif u'.jp' in ref.link:
enc.append("shift jis 2004")
@@ -641,7 +691,7 @@
#Can't easily parse them. (~1 on 1000)
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
- wikipedia.output('%s : Hybrid encoding...' % ref.link)
+ pywikibot.output('%s : Hybrid encoding...' % ref.link)
continue
@@ -657,24 +707,29 @@
if not ref.title:
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
- wikipedia.output(u'%s : No title found...' % ref.link)
+ pywikibot.output(u'%s : No title found...' % ref.link)
continue
if enc and u.originalEncoding not in enc:
-                # BeautifulSoup thinks that the original encoding of our page was not one
- # of the encodings we specified. Output a warning.
-                wikipedia.output(u'\03{lightpurple}ENCODING\03{default} : %s (%s)' % (ref.link, ref.title))
+ # BeautifulSoup thinks that the original encoding of our
+ # page was not one of the encodings we specified. Output a
+ # warning.
+ pywikibot.output(
+ u'\03{lightpurple}ENCODING\03{default} : %s (%s)'
+ % (ref.link, ref.title))
# XXX Ugly hack
if u'é' in ref.title:
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
- wikipedia.output(u'%s : Hybrid encoding...' % ref.link)
+ pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
continue
if self.titleBlackList.match(ref.title):
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
-                wikipedia.output(u'\03{lightred}WARNING\03{default} %s : Blacklisted title (%s)' % (ref.link, ref.title))
+ pywikibot.output(
+                    u'\03{lightred}WARNING\03{default} %s : Blacklisted title (%s)'
+ % (ref.link, ref.title))
continue
# Truncate long titles. 175 is arbitrary
@@ -692,22 +747,25 @@
new_text = self.deduplicator.process(new_text)
if new_text == page.get():
- wikipedia.output('No changes were necessary in %s'
- % page.aslink())
+ pywikibot.output('No changes were necessary in %s'
+ % page.title(asLink=True))
continue
editedpages += 1
self.put_page(page, new_text)
if self.limit and editedpages >= self.limit:
- wikipedia.output('Edited %s pages, stopping.' % self.limit)
+ pywikibot.output('Edited %s pages, stopping.' % self.limit)
return
if editedpages % 20 == 0:
-                wikipedia.output('\03{lightgreen}Checking stop page...\03{default}')
+ pywikibot.output(
+ '\03{lightgreen}Checking stop page...\03{default}')
actualRev = self.stopPage.latestRevision()
if actualRev != self.stopPageRevId:
-                    wikipedia.output(u'[[%s]] has been edited : Someone wants us to stop.' % self.stopPage)
+ pywikibot.output(
+ u'[[%s]] has been edited : Someone wants us to stop.'
+ % self.stopPage)
return
def main():
@@ -720,14 +778,14 @@
limit = None
namespaces = []
generator = None
- for arg in wikipedia.handleArgs():
+ for arg in pywikibot.handleArgs():
if arg.startswith('-namespace:'):
try:
namespaces.append(int(arg[11:]))
except ValueError:
namespaces.append(arg[11:])
elif arg.startswith('-summary:'):
- wikipedia.setAction(arg[9:])
+ pywikibot.setAction(arg[9:])
elif arg == '-always':
always = True
elif arg == '-ignorepdf':
@@ -736,13 +794,13 @@
limit = int(arg[7:])
elif arg.startswith('-xmlstart'):
if len(arg) == 9:
- xmlStart = wikipedia.input(
+ xmlStart = pywikibot.input(
u'Please enter the dumped article to start with:')
else:
xmlStart = arg[10:]
elif arg.startswith('-xml'):
if len(arg) == 4:
- xmlFilename = wikipedia.input(
+ xmlFilename = pywikibot.input(
u'Please enter the XML dump\'s filename:')
else:
xmlFilename = arg[5:]
@@ -759,7 +817,7 @@
generator = genFactory.getCombinedGenerator()
if not generator:
# syntax error, show help text from the top of this file
- wikipedia.showHelp('reflinks')
+ pywikibot.showHelp('reflinks')
return
generator = pagegenerators.PreloadingGenerator(generator, pageNumber = 50)
generator = pagegenerators.RedirectFilterPageGenerator(generator)
@@ -770,4 +828,4 @@
try:
main()
finally:
- wikipedia.stopme()
+ pywikibot.stopme()
Modified: trunk/pywikipedia/revertbot.py
===================================================================
--- trunk/pywikipedia/revertbot.py 2010-10-09 16:11:46 UTC (rev 8629)
+++ trunk/pywikipedia/revertbot.py 2010-10-09 19:32:57 UTC (rev 8630)
@@ -1,13 +1,21 @@
-import wikipedia, query, userlib
-
-__version__ = '$Id$'
-
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
"""
- (c) Bryan Tong Minh, 2008
- (c) Pywikipedia team, 2008-2010
- Licensed under the terms of the MIT license.
"""
+#
+# (C) Bryan Tong Minh, 2008
+# (C) Pywikipedia bot team, 2008-2010
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+#
+import re
+import wikipedia as pywikibot
+import query, userlib
+
+
class BaseRevertBot(object):
""" Base revert bot
@@ -94,38 +102,39 @@
rev['user'], rev['timestamp'])
if self.comment: comment += ': ' + self.comment
- page = wikipedia.Page(self.site, item['title'])
-            wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.aslink(True, True))
+            page = pywikibot.Page(self.site, item['title'])
+            pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+                             % page.aslink(True, True))
old = page.get()
new = rev['*']
- wikipedia.showDiff(old, new)
+ pywikibot.showDiff(old, new)
page.put(new, comment)
return comment
def log(self, msg):
- wikipedia.output(msg)
+ pywikibot.output(msg)
-import re
class myRevertBot(BaseRevertBot):
def callback(self, item):
if 'top' in item:
- page = wikipedia.Page(self.site, item['title'])
+ page = pywikibot.Page(self.site, item['title'])
text=page.get()
pattern = re.compile(u'\[\[.+?:.+?\..+?\]\]', re.UNICODE)
return pattern.search(text) >= 0
return False
+
def main():
item = None
- for arg in wikipedia.handleArgs():
+ for arg in pywikibot.handleArgs():
continue
- bot = myRevertBot(site = wikipedia.getSite())
+ bot = myRevertBot(site = pywikibot.getSite())
bot.revert_contribs()
if __name__ == "__main__":
try:
main()
finally:
- wikipedia.stopme()
+ pywikibot.stopme()
Modified: trunk/pywikipedia/selflink.py
===================================================================
--- trunk/pywikipedia/selflink.py 2010-10-09 16:11:46 UTC (rev 8629)
+++ trunk/pywikipedia/selflink.py 2010-10-09 19:32:57 UTC (rev 8630)
@@ -25,12 +25,18 @@
All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.
"""
-
+#
+# (C) Pywikipedia bot team, 2006-2010
+#
+# Distributed under the terms of the MIT license.
+#
__version__='$Id$'
+#
-import wikipedia, pagegenerators, catlib
+import re, sys
+import wikipedia as pywikibot
+import pagegenerators, catlib
import editarticle
-import re, sys
# This is required for the text that is shown when you run this script
# with the parameter -help.
@@ -78,7 +84,7 @@
def __iter__(self):
import xmlreader
- mysite = wikipedia.getSite()
+ mysite = pywikibot.getSite()
dump = xmlreader.XmlDump(self.xmlFilename)
for entry in dump.parse():
if mysite.nocapitalize:
@@ -89,14 +95,14 @@
re.escape(entry.title[1:]))
selflinkR = re.compile(r'\[\[' + title + '(\|[^\]]*)?\]\]')
if selflinkR.search(entry.text):
- yield wikipedia.Page(mysite, entry.title)
+ yield pywikibot.Page(mysite, entry.title)
continue
class SelflinkBot:
def __init__(self, generator, always=False):
self.generator = generator
- linktrail = wikipedia.getSite().linktrail()
+ linktrail = pywikibot.getSite().linktrail()
# The regular expression which finds links. Results consist of four groups:
# group title is the target page title, that is, everything before | or ].
        # group section is the page section. It'll include the # to make life easier for us.
@@ -122,9 +128,9 @@
or match.group('section'):
return text, False
try:
- linkedPage = wikipedia.Page(page.site(), match.group('title'))
- except wikipedia.InvalidTitle, err:
- wikipedia.output(u'Warning: %s' % err)
+ linkedPage = pywikibot.Page(page.site(), match.group('title'))
+ except pywikibot.InvalidTitle, err:
+ pywikibot.output(u'Warning: %s' % err)
return text, False
# Check whether the link found is to the current page itself.
@@ -137,16 +143,16 @@
if self.always:
choice = 'a'
else:
- wikipedia.output(
+ pywikibot.output(
text[max(0, match.start() - context) : match.start()] \
+ '\03{lightred}' + text[match.start() : match.end()] \
                + '\03{default}' + text[match.end() : match.end() + context])
- choice = wikipedia.inputChoice(
+ choice = pywikibot.inputChoice(
u'\nWhat shall be done with this selflink?\n',
['unlink', 'make bold', 'skip',
'edit', 'more context',
'unlink all', 'quit'],
['U', 'b', 's', 'e', 'm',
'a', 'q'], 'u')
- wikipedia.output(u'')
+ pywikibot.output(u'')
if choice == 's':
# skip this link
@@ -161,7 +167,8 @@
return text, True
elif choice == 'm':
# show more context by recursive self-call
-            return self.handleNextLink(page, text, match, context = context + 100)
+ return self.handleNextLink(page, text, match,
+ context=context + 100)
elif choice == 'a':
self.always = True
elif choice == 'q':
@@ -178,14 +185,17 @@
def treat(self, page):
# Show the title of the page we're working on.
# Highlight the title in purple.
-        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
+        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % page.title())
try:
oldText = page.get()
# Inside image maps, don't touch selflinks, as they're used
# to create tooltip labels. See for example:
            # http://de.wikipedia.org/w/index.php?title=Innenstadt_%28Bautzen%29&diff…
if '<imagemap>' in oldText:
-                wikipedia.output(u'Skipping page %s because it contains an image map.' % page.aslink())
+ pywikibot.output(
+ u'Skipping page %s because it contains an image map.'
+ % page.title(asLink=True))
return
text = oldText
curpos = 0
@@ -193,27 +203,30 @@
match = self.linkR.search(text, pos = curpos)
if not match:
break
- # Make sure that next time around we will not find this same hit.
+ # Make sure that next time around we will not find this same
+ # hit.
curpos = match.start() + 1
text, jumpToBeginning = self.handleNextLink(page, text, match)
if jumpToBeginning:
curpos = 0
if oldText == text:
- wikipedia.output(u'No changes necessary.')
+ pywikibot.output(u'No changes necessary.')
else:
- wikipedia.showDiff(oldText, text)
+ pywikibot.showDiff(oldText, text)
page.put_async(text)
- except wikipedia.NoPage:
- wikipedia.output(u"Page %s does not exist?!" % page.aslink())
- except wikipedia.IsRedirectPage:
-            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
- except wikipedia.LockedPage:
- wikipedia.output(u"Page %s is locked?!" % page.aslink())
+ except pywikibot.NoPage:
+ pywikibot.output(u"Page %s does not exist?!"
+ % page.title(asLink=True))
+ except pywikibot.IsRedirectPage:
+ pywikibot.output(u"Page %s is a redirect; skipping."
+ % page.title(asLink=True))
+ except pywikibot.LockedPage:
+ pywikibot.output(u"Page %s is locked?!" % page.title(asLink=True))
def run(self):
- comment = wikipedia.translate(wikipedia.getSite(), msg)
- wikipedia.setAction(comment)
+ comment = pywikibot.translate(pywikibot.getSite(), msg)
+ pywikibot.setAction(comment)
for page in self.generator:
if self.done: break
@@ -234,10 +247,11 @@
genFactory = pagegenerators.GeneratorFactory()
always = False
- for arg in wikipedia.handleArgs():
+ for arg in pywikibot.handleArgs():
if arg.startswith('-xml'):
if len(arg) == 4:
-                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
+ xmlFilename = pywikibot.input(
+ u'Please enter the XML dump\'s filename:')
else:
xmlFilename = arg[5:]
gen = XmlDumpSelflinkPageGenerator(xmlFilename)
@@ -265,12 +279,12 @@
pageTitle.append(arg)
if pageTitle:
- page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
+ page = pywikibot.Page(pywikibot.getSite(), ' '.join(pageTitle))
gen = iter([page])
if not gen:
gen = genFactory.getCombinedGenerator()
if not gen:
- wikipedia.showHelp('selflink')
+ pywikibot.showHelp('selflink')
else:
if namespaces != []:
gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
@@ -282,4 +296,4 @@
try:
main()
finally:
- wikipedia.stopme()
+ pywikibot.stopme()
Modified: trunk/pywikipedia/spamremove.py
===================================================================
--- trunk/pywikipedia/spamremove.py 2010-10-09 16:11:46 UTC (rev 8629)
+++ trunk/pywikipedia/spamremove.py 2010-10-09 19:32:57 UTC (rev 8630)
@@ -23,11 +23,20 @@
"""
+#
+# (C) Pywikipedia bot team, 2007-2010
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+#
+
import sys
-import wikipedia, editarticle, pagegenerators
+import wikipedia as pywikibot
+import pagegenerators
+import editarticle
-__version__ = '$Id$'
-
def main():
automatic = False
namespaces = []
@@ -48,7 +57,7 @@
'zh': u'機器人: 移除廣告黑名單連結 %s',
}
spamSite = ''
- for arg in wikipedia.handleArgs():
+ for arg in pywikibot.handleArgs():
if arg.startswith("-automatic"):
automatic = True
elif arg.startswith('-namespace:'):
@@ -59,56 +68,61 @@
else:
spamSite = arg
if not automatic:
- wikipedia.put_throttle.setDelay(1)
+ pywikibot.put_throttle.setDelay(1)
if not spamSite:
- wikipedia.showHelp('spamremove')
- wikipedia.output(u"No spam site specified.")
+ pywikibot.showHelp('spamremove')
+ pywikibot.output(u"No spam site specified.")
sys.exit()
- mysite = wikipedia.getSite()
+ mysite = pywikibot.getSite()
pages = list(set(mysite.linksearch(spamSite)))
if namespaces:
-        pages = list(set(pagegenerators.NamespaceFilterPageGenerator(pages, namespaces)))
+ pages = list(set(pagegenerators.NamespaceFilterPageGenerator(pages,
+ namespaces)))
if len(pages) == 0:
- wikipedia.output('No page found.')
+ pywikibot.output('No page found.')
else:
- wikipedia.getall(mysite, pages)
+ pywikibot.getall(mysite, pages)
for p in pages:
text = p.get()
if not spamSite in text:
continue
# Show the title of the page we're working on.
# Highlight the title in purple.
-        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % p.title())
+        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % p.title())
lines = text.split('\n')
newpage = []
lastok = ""
for line in lines:
if spamSite in line:
if lastok:
- wikipedia.output(lastok)
- wikipedia.output('\03{lightred}%s\03{default}' % line)
+ pywikibot.output(lastok)
+ pywikibot.output('\03{lightred}%s\03{default}' % line)
lastok = None
else:
newpage.append(line)
if line.strip():
if lastok is None:
- wikipedia.output(line)
+ pywikibot.output(line)
lastok = line
if automatic:
answer = "y"
else:
-            answer = wikipedia.inputChoice(u'\nDelete the red lines?', ['yes', 'no', 'edit'], ['y', 'N', 'e'], 'n')
+            answer = pywikibot.inputChoice(u'\nDelete the red lines?',
+                                           ['yes', 'no', 'edit'],
+                                           ['y', 'N', 'e'], 'n')
if answer == "n":
continue
elif answer == "e":
editor = editarticle.TextEditor()
-            newtext = editor.edit(text, highlight = spamSite, jumpIndex = text.find(spamSite))
+ newtext = editor.edit(text, highlight=spamSite,
+ jumpIndex=text.find(spamSite))
else:
newtext = "\n".join(newpage)
if newtext != text:
- p.put(newtext, wikipedia.translate(mysite, msg) % spamSite)
+ p.put(newtext, pywikibot.translate(mysite, msg) % spamSite)
try:
main()
finally:
- wikipedia.stopme()
+ pywikibot.stopme()
Modified: trunk/pywikipedia/speedy_delete.py
===================================================================
--- trunk/pywikipedia/speedy_delete.py 2010-10-09 16:11:46 UTC (rev 8629)
+++ trunk/pywikipedia/speedy_delete.py 2010-10-09 19:32:57 UTC (rev 8630)
@@ -19,20 +19,25 @@
NOTE: This script currently only works for the Wikipedia project.
"""
-__version__ = '$Id$'
+
#
+# (C) Pywikipedia bot team, 2007-2010
+#
# Distributed under the terms of the MIT license.
#
-import wikipedia
+__version__ = '$Id$'
+#
+
+import wikipedia as pywikibot
import pagegenerators, catlib
import time
class SpeedyRobot:
+    """ This robot will load a list of pages from the category of candidates for
+ speedy deletion on the language's wiki and give the user an interactive
+ prompt to decide whether each should be deleted or not.
+
"""
- This robot will load a list of pages from the category of candidates for speedy
-    deletion on the language's wiki and give the user an interactive prompt to decide
- whether each should be deleted or not.
- """
csd_cat={
'wikipedia':{
@@ -452,28 +457,32 @@
Arguments:
none yet
"""
- self.mySite = wikipedia.getSite()
-        self.csdCat = catlib.Category(self.mySite, wikipedia.translate(self.mySite, self.csd_cat))
+ self.mySite = pywikibot.getSite()
+ self.csdCat = catlib.Category(self.mySite,
+ pywikibot.translate(self.mySite,
+ self.csd_cat))
self.savedProgress = None
self.preloadingGen = None
def guessReasonForDeletion(self, page):
reason = None
- # TODO: The following check loads the page 2 times. Find a better way to do it.
-        if page.isTalkPage() and (page.toggleTalkPage().isRedirectPage() or not page.toggleTalkPage().exists()):
+ # TODO: The following check loads the page 2 times. Find a better way to
+ # do it.
+ if page.isTalkPage() and (page.toggleTalkPage().isRedirectPage() or
+ not page.toggleTalkPage().exists()):
# This is probably a talk page that is orphaned because we
# just deleted the associated article.
- reason = wikipedia.translate(self.mySite, self.talk_deletion_msg)
+ reason = pywikibot.translate(self.mySite, self.talk_deletion_msg)
else:
# Try to guess reason by the template used
templateNames = page.templates()
- reasons = wikipedia.translate(self.mySite, self.deletion_messages)
+ reasons = pywikibot.translate(self.mySite, self.deletion_messages)
for templateName in templateNames:
if templateName in reasons:
if type(reasons[templateName]) is not unicode:
#Make alias to delete_reasons
-                        reason = wikipedia.translate(self.mySite, self.delete_reasons)[reasons[templateName]]
+                        reason = pywikibot.translate(self.mySite, self.delete_reasons)[reasons[templateName]]
else:
reason = reasons[templateName]
break
@@ -484,26 +493,32 @@
def getReasonForDeletion(self, page):
suggestedReason = self.guessReasonForDeletion(page)
-        wikipedia.output(u'The suggested reason is: \03{lightred}%s\03{default}' % suggestedReason)
+ pywikibot.output(
+ u'The suggested reason is: \03{lightred}%s\03{default}'
+ % suggestedReason)
- # We don't use wikipedia.translate() here because for some languages the
+ # We don't use pywikibot.translate() here because for some languages the
# entry is intentionally left out.
if self.mySite.family.name in self.delete_reasons:
if page.site().lang in self.delete_reasons[self.mySite.family.name]:
-                localReasons = wikipedia.translate(page.site().lang, self.delete_reasons)
- wikipedia.output(u'')
+ localReasons = pywikibot.translate(page.site().lang,
+ self.delete_reasons)
+ pywikibot.output(u'')
localReasoneKey = localReasons.keys()
localReasoneKey.sort()
for key in localReasoneKey:
- wikipedia.output((key + ':').ljust(8) + localReasons[key])
- wikipedia.output(u'')
-                reason = wikipedia.input(u'Please enter the reason for deletion, choose a default reason, or press enter for the suggested message:')
+ pywikibot.output((key + ':').ljust(8) + localReasons[key])
+ pywikibot.output(u'')
+ reason = pywikibot.input(
+                    u'Please enter the reason for deletion, choose a default reason, or press enter for the suggested message:')
if reason.strip() in localReasons:
reason = localReasons[reason]
else:
-                reason = wikipedia.input(u'Please enter the reason for deletion, or press enter for the suggested message:')
+ reason = pywikibot.input(
+                    u'Please enter the reason for deletion, or press enter for the suggested message:')
else:
-            reason = wikipedia.input(u'Please enter the reason for deletion, or press enter for the suggested message:')
+ reason = pywikibot.input(
+                u'Please enter the reason for deletion, or press enter for the suggested message:')
if not reason:
reason = suggestedReason
@@ -525,56 +540,69 @@
try:
pageText = page.get(get_redirect = True).split("\n")
count += 1
- except wikipedia.NoPage:
-                wikipedia.output(u'Page %s does not exist or has already been deleted, skipping.' % page.aslink())
+ except pywikibot.NoPage:
+ pywikibot.output(
+ u'Page %s does not exist or has already been deleted,
skipping.'
+ % page.title(asLink=True))
continue
# Show the title of the page we're working on.
# Highlight the title in purple.
-            wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
-            wikipedia.output(u'- - - - - - - - - ')
+            pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % page.title())
+ pywikibot.output(u'- - - - - - - - - ')
if len(pageText) > 75:
-                wikipedia.output('The page detail is too many lines, only output first 50 lines:')
- wikipedia.output(u'- - - - - - - - - ')
- wikipedia.output(u'\n'.join(pageText[:50]))
+ pywikibot.output(
+                    'The page detail is too many lines, only output first 50 lines:')
+ pywikibot.output(u'- - - - - - - - - ')
+ pywikibot.output(u'\n'.join(pageText[:50]))
else:
- wikipedia.output(u'\n'.join(pageText))
- wikipedia.output(u'- - - - - - - - - ')
-            choice = wikipedia.inputChoice(u'Input action?', ['delete', 'skip', 'update', 'quit'], ['d', 'S', 'u', 'q'], 'S')
+ pywikibot.output(u'\n'.join(pageText))
+ pywikibot.output(u'- - - - - - - - - ')
+ choice = pywikibot.inputChoice(u'Input action?',
+                                           ['delete', 'skip', 'update',
+ 'quit'],
+                                           ['d', 'S', 'u', 'q'], 'S')
if choice == 'q':
keepGoing = False
break
elif choice == 'u':
- wikipedia.output(u'Updating from CSD category.')
+ pywikibot.output(u'Updating from CSD category.')
self.savedProgress = page.title()
startFromBeginning = False
break
elif choice == 'd':
reason = self.getReasonForDeletion(page)
-                wikipedia.output(u'The chosen reason is: \03{lightred}%s\03{default}' % reason)
+ pywikibot.output(
+ u'The chosen reason is: \03{lightred}%s\03{default}'
+ % reason)
page.delete(reason, prompt = False)
else:
- wikipedia.output(u'Skipping page %s' % page.title())
+ pywikibot.output(u'Skipping page %s' % page.title())
startFromBeginning = True
if count == 0:
if startFromBeginning:
-                    wikipedia.output(u'There are no pages to delete.\nWaiting for 30 seconds or press Ctrl+C to quit...')
+ pywikibot.output(
+                        u'There are no pages to delete.\nWaiting for 30 seconds or press Ctrl+C to quit...')
try:
time.sleep(30)
except KeyboardInterrupt:
keepGoing = False
else:
startFromBeginning = True
- wikipedia.output(u'Quitting program.')
+ pywikibot.output(u'Quitting program.')
def refreshGenerator(self):
-        generator = pagegenerators.CategorizedPageGenerator(self.csdCat, start = self.savedProgress)
-        # wrap another generator around it so that we won't produce orphaned talk pages.
+ generator = pagegenerators.CategorizedPageGenerator(
+ self.csdCat, start=self.savedProgress)
+ # wrap another generator around it so that we won't produce orphaned
+ # talk pages.
generator2 = pagegenerators.PageWithTalkPageGenerator(generator)
-        self.preloadingGen = pagegenerators.PreloadingGenerator(generator2, pageNumber = 20)
+ self.preloadingGen = pagegenerators.PreloadingGenerator(generator2,
+ pageNumber=20)
def main():
# read command line parameters
- for arg in wikipedia.handleArgs():
+ for arg in pywikibot.handleArgs():
pass #No args yet
bot = SpeedyRobot()
@@ -584,4 +612,4 @@
try:
main()
finally:
- wikipedia.stopme()
+ pywikibot.stopme()
Modified: trunk/pywikipedia/spellcheck.py
===================================================================
--- trunk/pywikipedia/spellcheck.py 2010-10-09 16:11:46 UTC (rev 8629)
+++ trunk/pywikipedia/spellcheck.py 2010-10-09 19:32:57 UTC (rev 8630)
@@ -51,15 +51,17 @@
"""
#
# (C) Andre Engels, 2005
+# (C) Pywikipedia bot team, 2006-2010
#
# Distributed under the terms of the MIT license.
#
-
__version__ = '$Id$'
+#
import re, sys
-import wikipedia, pagegenerators
import string, codecs
+import wikipedia as pywikibot
+import pagegenerators
msg={
'ar':u'تدقيق إملائي بمساعدة البوت',
@@ -73,8 +75,9 @@
'pt':u'Bot de correção ortográfica',
}
+
class SpecialTerm(object):
- def __init__(self,text):
+ def __init__(self, text):
self.style = text
@@ -102,7 +105,8 @@
def getalternatives(string):
# Find possible correct words for the incorrect word string
-    basetext = wikipedia.input(u"Give a text that should occur in the words to be checked.\nYou can choose to give no text, but this will make searching slow:")
+    basetext = pywikibot.input(
+        u"Give a text that should occur in the words to be checked.\nYou can choose to give no text, but this will make searching slow:")
basetext = basetext.lower()
simwords = {}
for i in xrange(11):
@@ -140,26 +144,28 @@
def askAlternative(word,context=None):
correct = None
- wikipedia.output(u"="*60)
- wikipedia.output(u"Found unknown word '%s'"%word)
+ pywikibot.output(u"="*60)
+ pywikibot.output(u"Found unknown word '%s'"%word)
if context:
- wikipedia.output(u"Context:")
- wikipedia.output(u""+context)
- wikipedia.output(u"-"*60)
+ pywikibot.output(u"Context:")
+ pywikibot.output(u""+context)
+ pywikibot.output(u"-"*60)
while not correct:
for i in xrange(len(Word(word).getAlternatives())):
-        wikipedia.output(u"%s: Replace by '%s'"%(i+1,Word(word).getAlternatives()[i].replace('_',' ')))
- wikipedia.output(u"a: Add '%s' as correct"%word)
+ pywikibot.output(u"%s: Replace by '%s'"
+ % (i+1,
+                            Word(word).getAlternatives()[i].replace('_',' ')))
+ pywikibot.output(u"a: Add '%s' as correct"%word)
if word[0].isupper():
- wikipedia.output(u"c: Add '%s' as correct"%(uncap(word)))
- wikipedia.output(u"i: Ignore once (default)")
- wikipedia.output(u"p: Ignore on this page")
- wikipedia.output(u"r: Replace text")
- wikipedia.output(u"s: Replace text, but do not save as alternative")
- wikipedia.output(u"g: Guess (give me a list of similar words)")
- wikipedia.output(u"*: Edit by hand")
- wikipedia.output(u"x: Do not check the rest of this page")
- answer = wikipedia.input(u":")
+        pywikibot.output(u"c: Add '%s' as correct" % (uncap(word)))
+ pywikibot.output(u"i: Ignore once (default)")
+ pywikibot.output(u"p: Ignore on this page")
+ pywikibot.output(u"r: Replace text")
+ pywikibot.output(u"s: Replace text, but do not save as alternative")
+ pywikibot.output(u"g: Guess (give me a list of similar words)")
+ pywikibot.output(u"*: Edit by hand")
+ pywikibot.output(u"x: Do not check the rest of this page")
+ answer = pywikibot.input(u":")
if answer == "": answer = "i"
if answer in "aAiIpP":
correct = word
@@ -169,11 +175,13 @@
elif answer in "pP":
pageskip.append(word)
elif answer in "rRsS":
- correct = wikipedia.input(u"What should I replace it by?")
+ correct = pywikibot.input(u"What should I replace it by?")
if answer in "rR":
if correct_html_codes:
correct = removeHTML(correct)
- if correct != cap(word) and correct != uncap(word) and correct != word:
+ if correct != cap(word) and \
+ correct != uncap(word) and \
+ correct != word:
try:
knownwords[word] += [correct.replace(' ','_')]
except KeyError:
@@ -190,7 +198,7 @@
if possible:
print "Found alternatives:"
for pos in possible:
- wikipedia.output(" %s"%pos)
+ pywikibot.output(" %s"%pos)
else:
print "No similar words found."
elif answer=="*":
@@ -204,7 +212,8 @@
return correct
def removeHTML(page):
- # TODO: Consider removing this; this stuff can be done by cosmetic_changes.py
+ # TODO: Consider removing this; this stuff can be done by
+ # cosmetic_changes.py
result = page
    result = result.replace('&Auml;',u'Ä')
    result = result.replace('&auml;',u'ä')
@@ -266,13 +275,15 @@
loc += len(match.group(1))
bigword = Word(match.group(2))
smallword = bigword.derive()
-    if not Word(smallword).isCorrect(checkalternative = knownonly) and (checknames or not smallword[0].isupper()):
-        replacement = askAlternative(smallword,context=text[max(0,loc-40):loc+len(match.group(2))+40])
+ if not Word(smallword).isCorrect(checkalternative = knownonly) and \
+ (checknames or not smallword[0].isupper()):
+ replacement = askAlternative(smallword,
+                                     context=text[max(0,loc-40):loc + len(match.group(2))+40])
if replacement == edit:
import editarticle
editor = editarticle.TextEditor()
# TODO: Don't know to which index to jump
- newtxt = editor.edit(text, jumpIndex = 0, highlight = smallword)
+ newtxt = editor.edit(text, jumpIndex = 0, highlight=smallword)
if newtxt:
text = newtxt
elif replacement == endpage:
@@ -291,6 +302,7 @@
pageskip = []
return text
+
class Word(object):
def __init__(self,text):
self.word = text
@@ -348,13 +360,16 @@
if rep == self.derive():
return self.word
if self.derive() not in self.word:
-            return wikipedia.input(u"Please give the result of replacing %s by %s in %s:"%(self.derive(),rep,self.word))
+ return pywikibot.input(
+ u"Please give the result of replacing %s by %s in %s:"
+ % (self.derive(), rep, self.word))
return self.word.replace(self.derive(),rep)
def isCorrect(self,checkalternative = False):
# If checkalternative is True, the word will only be found incorrect if
# it is on the spelling list as a spelling error. Otherwise it will
- # be found incorrect if it is not on the list as a correctly spelled word.
+ # be found incorrect if it is not on the list as a correctly spelled
+ # word.
if self.word == "":
return True
if self.word in pageskip:
@@ -367,7 +382,7 @@
except KeyError:
pass
if self.word != uncap(self.word):
- return Word(uncap(self.word)).isCorrect(checkalternative = checkalternative)
+ return Word(uncap(self.word)).isCorrect(checkalternative=checkalternative)
else:
if checkalternative:
if checklang == 'nl' and self.word.endswith("'s"):
@@ -424,7 +439,7 @@
checklang = None
knownonly = False
- for arg in wikipedia.handleArgs():
+ for arg in pywikibot.handleArgs():
if arg.startswith("-start:"):
start = arg[7:]
elif arg.startswith("-newpages"):
@@ -446,11 +461,11 @@
else:
title.append(arg)
- mysite = wikipedia.getSite()
+ mysite = pywikibot.getSite()
if not checklang:
checklang = mysite.language()
- wikipedia.setAction(wikipedia.translate(mysite,msg))
- filename = wikipedia.config.datafilepath('spelling',
+ pywikibot.setAction(pywikibot.translate(mysite,msg))
+ filename = pywikibot.config.datafilepath('spelling',
'spelling-' + checklang + '.txt')
print "Getting wordlist"
try:
@@ -480,40 +495,43 @@
else:
print "Wordlist successfully loaded."
# This is a purely interactive bot, we therefore do not want to put-throttle
- wikipedia.put_throttle.setDelay(1)
+ pywikibot.put_throttle.setDelay(1)
except:
- wikipedia.stopme()
+ pywikibot.stopme()
raise
try:
if newpages:
-        for (page, date, length, loggedIn, user, comment) in wikipedia.getSite().newpages(1000):
+        for (page, date, length, loggedIn, user, comment) in pywikibot.getSite().newpages(1000):
try:
text = page.get()
- except wikipedia.Error:
+ except pywikibot.Error:
pass
else:
- text = spellcheck(text,checknames=checknames,knownonly=knownonly)
+ text = spellcheck(text, checknames=checknames,
+ knownonly=knownonly)
if text != page.get():
page.put(text)
elif start:
        for page in pagegenerators.PreloadingGenerator(pagegenerators.AllpagesPageGenerator(start=start,includeredirects=False)):
try:
text = page.get()
- except wikipedia.Error:
+ except pywikibot.Error:
pass
else:
- text = spellcheck(text,checknames=checknames,knownonly=knownonly)
+ text = spellcheck(text, checknames=checknames,
+ knownonly=knownonly)
if text != page.get():
page.put(text)
if longpages:
- for (page, length) in wikipedia.getSite().longpages(500):
+ for (page, length) in pywikibot.getSite().longpages(500):
try:
text = page.get()
- except wikipedia.Error:
+ except pywikibot.Error:
pass
else:
- text = spellcheck(text, checknames = checknames,knownonly=knownonly)
+ text = spellcheck(text, checknames=checknames,
+ knownonly=knownonly)
if text != page.get():
page.put(text)
@@ -521,20 +539,20 @@
title = ' '.join(title)
while title != '':
try:
- page = wikipedia.Page(mysite,title)
+ page = pywikibot.Page(mysite,title)
text = page.get()
- except wikipedia.NoPage:
+ except pywikibot.NoPage:
print "Page does not exist."
- except wikipedia.IsRedirectPage:
+ except pywikibot.IsRedirectPage:
print "Page is a redirect page"
else:
text = spellcheck(text,knownonly=knownonly)
if text != page.get():
page.put(text)
-            title = wikipedia.input(u"Which page to check now? (enter to stop)")
+            title = pywikibot.input(u"Which page to check now? (enter to stop)")
finally:
- wikipedia.stopme()
- filename = wikipedia.config.datafilepath('spelling',
+ pywikibot.stopme()
+ filename = pywikibot.config.datafilepath('spelling',
'spelling-' + checklang + '.txt')
if rebuild:
list = knownwords.keys()
@@ -547,7 +565,8 @@
if Word(word).isCorrect():
if word != uncap(word):
if Word(uncap(word)).isCorrect():
- # Capitalized form of a word that is in the list uncapitalized
+ # Capitalized form of a word that is in the list
+ # uncapitalized
continue
f.write("1 %s\n"%word)
else:
Modified: trunk/pywikipedia/standardize_interwiki.py
===================================================================
--- trunk/pywikipedia/standardize_interwiki.py 2010-10-09 16:11:46 UTC (rev 8629)
+++ trunk/pywikipedia/standardize_interwiki.py 2010-10-09 19:32:57 UTC (rev 8630)
@@ -9,7 +9,7 @@
"""
#
# (C) Rob W.W. Hooft, 2003
-# (C) Filnik, 2007
+# (C) Pywikipedia bot team, 2003-2010
#
# Distributed under the terms of the MIT license.
#
@@ -17,8 +17,9 @@
#
import os, sys
-import wikipedia, config
import difflib
+import wikipedia as pywikibot
+import config
# The summary that the Bot will use.
comment = {
@@ -47,47 +48,48 @@
nothing = False
# Load the default parameters and start
-for arg in wikipedia.handleArgs():
+for arg in pywikibot.handleArgs():
if arg.startswith('-start'):
if len(arg) == 6:
-            start = unicode(wikipedia.input(u'From what page do you want to start?'))
+ start = unicode(pywikibot.input(
+ u'From what page do you want to start?'))
else:
start = unicode(arg[7:])
-site = wikipedia.getSite()
-comm = wikipedia.translate(site, comment)
+site = pywikibot.getSite()
+comm = pywikibot.translate(site, comment)
# What follows is the main part of the code.
try:
for pl in site.allpages(start):
plname = pl.title()
- wikipedia.output(u'\nLoading %s...' % plname)
+ pywikibot.output(u'\nLoading %s...' % plname)
try:
oldtext = pl.get()
- except wikipedia.IsRedirectPage:
- wikipedia.output(u"%s is a redirect!" % plname)
+ except pywikibot.IsRedirectPage:
+ pywikibot.output(u"%s is a redirect!" % plname)
continue
old = pl.interwiki()
new = {}
for pl2 in old:
new[pl2.site()] = pl2
- newtext = wikipedia.replaceLanguageLinks(oldtext, new)
+ newtext = pywikibot.replaceLanguageLinks(oldtext, new)
if new:
if oldtext != newtext:
- wikipedia.showDiff(oldtext, newtext)
+ pywikibot.showDiff(oldtext, newtext)
# Submit changes
try:
status, reason, data = pl.put(newtext, comment=comm)
if str(status) != '302':
- wikipedia.output(status, reason)
- except wikipedia.LockedPage:
- wikipedia.output(u"%s is locked" % plname)
+ pywikibot.output(status, reason)
+ except pywikibot.LockedPage:
+ pywikibot.output(u"%s is locked" % plname)
continue
else:
- wikipedia.output(u'No changes needed.')
+ pywikibot.output(u'No changes needed.')
continue
else:
- wikipedia.output(u'No interwiki found.')
+ pywikibot.output(u'No interwiki found.')
continue
finally:
- wikipedia.stopme()
+ pywikibot.stopme()
Modified: trunk/pywikipedia/standardize_notes.py
===================================================================
--- trunk/pywikipedia/standardize_notes.py 2010-10-09 16:11:46 UTC (rev 8629)
+++ trunk/pywikipedia/standardize_notes.py 2010-10-09 19:32:57 UTC (rev 8630)
@@ -8,7 +8,7 @@
NOTE: This script is not capable of handling the <ref></ref> syntax. It just
handles the {{ref}} syntax, which is still used, but DEPRECATED on the English
-Wikipedia.
+wikipedia.
You can run the bot with the following commandline parameters:
@@ -20,7 +20,8 @@
-page - Only edit a single page.
Argument can also be given as "-page:pagename". You can give
this
parameter multiple times to edit multiple pages.
--regex - Make replacements using regular expressions. (Obsolete; always True)
+-regex - Make replacements using regular expressions.
+ (Obsolete; always True)
-except:XYZ - Ignore pages which contain XYZ. If the -regex argument is given,
XYZ will be regarded as a regular expression.
-namespace:n - Namespace to process. Works only with a sql dump
@@ -41,17 +42,18 @@
#
__version__ = '$Id$'
#
-# 2005-07-15: Find name of section containing citations: doFindRefSection(). (SEWilco)
+# 2005-07-15: Find name of section containing citations: doFindRefSection().
+# (SEWilco)
# 2005-07-15: Obey robots.txt restrictions. (SEWilco)
-# 2005-07-15: Build list of all sections which may contain citations: doFindAllCitationSections(). (SEWilco)
+# 2005-07-15: Build list of all sections which may contain citations:
+# doFindAllCitationSections(). (SEWilco)
#
-#from __future__ import generators
import subprocess, sys, re, random
import socket, urllib, robotparser
-import wikipedia, pagegenerators, config
-
from datetime import date
+import wikipedia as pywikibot
+import pagegenerators, config
# httpcache is optional
have_httpcache = True
@@ -77,7 +79,8 @@
}
fixes = {
-    # These replacements will convert alternate reference formats to format used by this tool.
+ # These replacements will convert alternate reference formats to format used
+ # by this tool.
'ALTREFS': {
'regex': True,
# We don't want to mess up pages which discuss HTML tags, so we skip
@@ -95,7 +98,8 @@
},
'replacements': [
# Everything case-insensitive (?i)
- # These translate variations of footnote templates to ref|note format.
+ # These translate variations of footnote templates to ref|note
+ # format.
(r'(?i){{an\|(.*?)}}', r"{{ref|\1}}"),
(r'(?i){{anb\|(.*?)}}', r"{{note|\1}}"),
(r'(?i){{endnote\|(.*?)}}', r"{{note|\1}}"),
@@ -141,50 +145,56 @@
# news sites for which to generate 'news reference' citations, the org name, and prefix to strip
newssites = [
- ( 'abcnews.go.com', 'ABC News', 'ABC News: ' ),
-    ( 'books.guardian.co.uk', 'The Guardian', 'Guardian Unlimited : The Guardian : ' ),
- ( 'edition.cnn.com', 'CNN', 'CNN.com - ' ),
- ( 'news.bbc.co.uk', 'BBC', 'BBC NEWS : ' ),
-    ( 'news.scotsman.com', 'The Scotsman', 'Scotsman.com News - ' ),
- ( 'nyobserver.com', 'New York Observer', '' ),
-    ( 'observer.guardian.co.uk', 'The Guardian', 'The Observer : ' ),
-    ( 'politics.guardian.co.uk', 'The Guardian', 'Guardian Unlimited Politics : ' ),
-    ( 'seattletimes.nwsource.com', 'The Seattle Times', 'The Seattle Times: ' ),
- ( 'service.spiegel.de', 'Der Spiegel', '' ),
-    ( 'thescotsman.scotsman.com', 'The Scotsman', 'The Scotsman - ' ),
-    ( 'today.reuters.com', 'Reuters', 'Latest News and Financial Information : ' ),
-    ( 'today.reuters.co.uk', 'Reuters', 'Latest News and Financial Information : ' ),
- ( 'www.boston.com', 'The Boston Globe', 'Boston.com / ' ),
- ( 'www.cbsnews.com', 'CBS News', 'CBS News : ' ),
- ( 'www.cnn.com', 'CNN', 'CNN.com - ' ),
- ( 'www.cnsnews.com', 'Cybercast News Service', '' ),
- ( 'www.csmonitor.com', 'Christian Science Monitor', '' ),
- ( 'www.dallasnews.com', 'The Dallas Morning News', '' ),
- ( 'www.forbes.com', 'Forbes', '' ),
- ( 'www.foxnews.com', 'Fox News Channel', 'FOXNews.com - ' ),
- ( 'www.gnn.com', 'Government News Network', 'GNN - ' ),
-    ( 'www.guardian.co.uk', 'The Guardian', 'Guardian Unlimited : The Guardian : ' ),
- ( 'www.latimes.com', 'Los Angeles Times', '' ),
- ( 'www.msnbc.msn.com', 'MSNBC', '' ),
- ( 'www.nationalreview.com', 'National Review', '' ),
- ( 'www.nytimes.com', 'The New York Times', '' ),
- ( 'www.sfgate.com', 'San Francisco Chronicle', '' ),
- ( 'www.socialistworker.co.uk', 'Socialist Worker', '' ),
- ( 'www.spectator.org', 'The American Spectator', '' ),
-    ( 'www.telegraph.co.uk', 'The Daily Telegraph', 'Telegraph newspaper online - ' ),
- ( 'www.time.com', 'TIME', '' ),
-    ( 'www.timesonline.co.uk', 'The Times', 'World news from The Times and the Sunday Times - ' ),
- ( 'www.usatoday.com', 'USA Today', 'USATODAY.com - ' ),
- ( 'www.washingtonpost.com', 'The Washington Post', '' ),
- ( 'www.washtimes.com', 'The Washington Times', '' ),
- ( 'www.weeklystandard.com', 'The Weekly Standard', '' ),
- ( 'www.wired.com', 'Wired magazine', 'Wired News: ' ),
- ( 'wwwimage.cbsnews.com', 'CBS News', 'CBS News : ' ),
+ ('abcnews.go.com', 'ABC News', 'ABC News: '),
+ ('books.guardian.co.uk', 'The Guardian',
+ 'Guardian Unlimited : The Guardian : '),
+ ('edition.cnn.com', 'CNN', 'CNN.com - '),
+ ('news.bbc.co.uk', 'BBC', 'BBC NEWS : '),
+    ('news.scotsman.com', 'The Scotsman', 'Scotsman.com News - '),
+ ('nyobserver.com', 'New York Observer', ''),
+    ('observer.guardian.co.uk', 'The Guardian', 'The Observer : '),
+ ('politics.guardian.co.uk', 'The Guardian',
+ 'Guardian Unlimited Politics : '),
+    ('seattletimes.nwsource.com', 'The Seattle Times', 'The Seattle Times: '),
+ ('service.spiegel.de', 'Der Spiegel', ''),
+    ('thescotsman.scotsman.com', 'The Scotsman', 'The Scotsman - '),
+    ('today.reuters.com', 'Reuters', 'Latest News and Financial Information : '),
+ ('today.reuters.co.uk', 'Reuters',
+ 'Latest News and Financial Information : '),
+ ('www.boston.com', 'The Boston Globe', 'Boston.com / '),
+ ('www.cbsnews.com', 'CBS News', 'CBS News : '),
+ ('www.cnn.com', 'CNN', 'CNN.com - '),
+ ('www.cnsnews.com', 'Cybercast News Service', ''),
+ ('www.csmonitor.com', 'Christian Science Monitor', ''),
+ ('www.dallasnews.com', 'The Dallas Morning News', ''),
+ ('www.forbes.com', 'Forbes', ''),
+ ('www.foxnews.com', 'Fox News Channel', 'FOXNews.com - '),
+ ('www.gnn.com', 'Government News Network', 'GNN - '),
+ ('www.guardian.co.uk', 'The Guardian',
+ 'Guardian Unlimited : The Guardian : '),
+ ('www.latimes.com', 'Los Angeles Times', ''),
+ ('www.msnbc.msn.com', 'MSNBC', ''),
+ ('www.nationalreview.com', 'National Review', ''),
+ ('www.nytimes.com', 'The New York Times', ''),
+ ('www.sfgate.com', 'San Francisco Chronicle', ''),
+ ('www.socialistworker.co.uk', 'Socialist Worker', ''),
+ ('www.spectator.org', 'The American Spectator', ''),
+ ('www.telegraph.co.uk', 'The Daily Telegraph',
+ 'Telegraph newspaper online - '),
+ ('www.time.com', 'TIME', ''),
+ ('www.timesonline.co.uk', 'The Times',
+ 'World news from The Times and the Sunday Times - '),
+ ('www.usatoday.com', 'USA Today', 'USATODAY.com - '),
+ ('www.washingtonpost.com', 'The Washington Post', ''),
+ ('www.washtimes.com', 'The Washington Times', ''),
+ ('www.weeklystandard.com', 'The Weekly Standard', ''),
+ ('www.wired.com', 'Wired magazine', 'Wired News: '),
+ ('wwwimage.cbsnews.com', 'CBS News', 'CBS News : '),
]
+
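Each entry above is consumed later as (sitename, newscompany, stripprefix) when a 'news reference' citation is built; a small self-contained sketch of that lookup (the URL, page title, and the way refname is derived are invented for illustration):

# Illustrative only: match a link's host against a newssites-style entry and
# strip the site's boilerplate prefix from the fetched title.
newssites_sample = [('news.bbc.co.uk', 'BBC', 'BBC NEWS : ')]
refname = 'news.bbc.co.uk/2/hi/science/nature/123.stm'   # link with its scheme removed
extlink_linktext = 'BBC NEWS : Example headline'
for (sitename, newscompany, stripprefix) in newssites_sample:
    if refname.startswith(sitename):
        if stripprefix and extlink_linktext.startswith(stripprefix):
            extlink_linktext = extlink_linktext[len(stripprefix):]
        print newscompany, '|', extlink_linktext   # BBC | Example headline
        break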
class ReplacePageGenerator:
- """
- Generator which will yield Pages for pages that might contain text to
+ """ Generator which will yield Pages for pages that might contain text
to
replace. These pages might be retrieved from a local SQL dump file or a
text file, or as a list of pages entered by the user.
@@ -205,7 +215,9 @@
will be used when source is 'sqldump'.
* pagenames - a list of pages which will be used when source is
'userinput'.
+
"""
+
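A rough usage sketch, mirroring how main() further down builds and wraps this generator; the replacement pair and page name are placeholders, and the module's own imports are assumed:

# Hypothetical call: yield one user-supplied page through the preloading wrapper.
gen = ReplacePageGenerator('userinput', {u'old text': u'new text'}, [],
                           False, -1, None, None, None, [u'Sandbox'])
preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=20)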
def __init__(self, source, replacements, exceptions, regex = False, namespace = -1,
textfilename = None, sqlfilename = None, categoryname = None, pagenames = None):
self.source = source
self.replacements = replacements
@@ -218,8 +230,7 @@
self.pagenames = pagenames
def read_pages_from_sql_dump(self):
- """
- Generator which will yield Pages to pages that might contain text to
+ """ Generator which will yield Pages to pages that might contain
text to
replace. These pages will be retrieved from a local sql dump file
(cur table).
@@ -229,12 +240,13 @@
are values
* exceptions - a list of strings; pages which contain one of these
won't be changed.
- * regex - if the entries of replacements and exceptions should
- be interpreted as regular expressions
+ * regex - if the entries of replacements and exceptions
+ should be interpreted as regular expressions
+
"""
- mysite = wikipedia.getSite()
+ mysite = pywikibot.getSite()
import sqldump
- dump = sqldump.SQLdump(self.sqlfilename, wikipedia.getSite().encoding())
+ dump = sqldump.SQLdump(self.sqlfilename, pywikibot.getSite().encoding())
for entry in dump.entries():
skip_page = False
if self.namespace != -1 and self.namespace != entry.namespace:
@@ -255,11 +267,11 @@
if self.regex:
old = re.compile(old)
if old.search(entry.text):
- yield wikipedia.Page(mysite, entry.full_title())
+ yield pywikibot.Page(mysite, entry.full_title())
break
else:
if old in entry.text:
- yield wikipedia.Page(mysite, entry.full_title())
+ yield pywikibot.Page(mysite, entry.full_title())
break
def read_pages_from_category(self):
@@ -270,9 +282,10 @@
Arguments:
* textfilename - the textfile's path, either absolute or relative
+
"""
import catlib
- category = catlib.Category(wikipedia.getSite(), self.categoryname)
+ category = catlib.Category(pywikibot.getSite(), self.categoryname)
for page in category.articles(recurse = False):
yield page
@@ -284,6 +297,7 @@
Arguments:
* textfilename - the textfile's path, either absolute or relative
+
"""
f = open(self.textfilename, 'r')
# regular expression which will find [[wiki links]]
@@ -294,7 +308,7 @@
# TODO: use findall() instead.
m=R.match(line)
if m:
- yield wikipedia.Page(wikipedia.getSite(), m.group(1))
+ yield pywikibot.Page(pywikibot.getSite(), m.group(1))
f.close()
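The method above pulls page titles out of [[wiki link]] markup line by line; a stand-alone sketch of the same idea, using an illustrative regex rather than the script's own R:

# Illustrative only: extract page titles from lines such as "* [[Some article]]".
import re

R = re.compile(r'.*\[\[([^\]]*)\]\].*')   # stand-in for the regex built in the method above
for line in [u'* [[Some article]]', u'no link here', u'[[Another page]]']:
    m = R.match(line)
    if m:
        print m.group(1)   # Some article / Another page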
def read_pages_from_wiki_page(self):
@@ -305,9 +319,10 @@
Arguments:
* pagetitle - the title of a page on the home wiki
+
'''
- listpage = wikipedia.Page(wikipedia.getSite(), self.pagetitle)
- list = wikipedia.get(listpage)
+ listpage = pywikibot.Page(pywikibot.getSite(), self.pagetitle)
+ list = pywikibot.get(listpage)
# TODO - UNFINISHED
# TODO: Make MediaWiki's search feature available.
@@ -326,7 +341,7 @@
yield pl
elif self.source == 'userinput':
for pagename in self.pagenames:
- yield wikipedia.Page(wikipedia.getSite(), pagename)
+ yield pywikibot.Page(pywikibot.getSite(), pagename)
class ReplaceRobot:
def __init__(self, generator, replacements, refsequence, references,
@@ -375,36 +390,40 @@
new_text = new_text.replace(old, new)
# Find name of Notes section.
- refsectionname = self.doFindRefSection( new_text )
+ refsectionname = self.doFindRefSection(new_text)
# Get list of all sections which may contain citations.
- refsectionlist = self.doFindAllCitationSections( new_text, refsectionname )
+ refsectionlist = self.doFindAllCitationSections(new_text,
+ refsectionname)
# Read existing Notes section contents into references list
- wikipedia.output( u"Reading existing Notes section" )
+ pywikibot.output(u"Reading existing Notes section")
self.doReadReferencesSection( new_text, refsectionname )
while self.references and self.references[len(self.references)-1] == u'\n':
del self.references[len(self.references)-1] # delete trailing empty lines
# Convert any external links to footnote references
- wikipedia.output( u"Converting external links" )
- new_text = self.doConvertExternalLinks( new_text )
+ pywikibot.output(u"Converting external links" )
+ new_text = self.doConvertExternalLinks(new_text)
# Accumulate ordered list of all references
- wikipedia.output( u"Collecting references" )
+ pywikibot.output(u"Collecting references")
(duplicatefound, self.refusage) = self.doBuildSequenceListOfReferences( new_text )
# Rewrite references, including dealing with duplicates.
- wikipedia.output( u"Rewriting references" )
- new_text = self.doRewriteReferences( new_text, self.refusage, refsectionname )
+ pywikibot.output(u"Rewriting references")
+ new_text = self.doRewriteReferences(new_text, self.refusage,
+ refsectionname)
# Reorder Notes to match sequence of ordered list
- wikipedia.output( u"Collating references" )
- self.references = self.doReorderReferences( self.references, self.refusage)
+ pywikibot.output(u"Collating references")
+ self.references = self.doReorderReferences(self.references,
+ self.refusage)
# Rebuild Notes section
- wikipedia.output( u"Rebuilding References section" )
- new_text = self.doUpdateReferencesSection( new_text, self.refusage, refsectionname )
+ pywikibot.output(u"Rebuilding References section" )
+ new_text = self.doUpdateReferencesSection(new_text, self.refusage,
+ refsectionname)
return new_text
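For orientation, the hunk above only re-wraps calls; the order of operations in doReplacements is unchanged. A condensed restatement (not the real body):

def doReplacements_outline(self, new_text):
    # Condensed view of the steps shown above; each call is a ReplaceRobot method.
    refsectionname = self.doFindRefSection(new_text)               # locate the Notes section
    refsectionlist = self.doFindAllCitationSections(new_text, refsectionname)
    self.doReadReferencesSection(new_text, refsectionname)         # load existing notes
    new_text = self.doConvertExternalLinks(new_text)               # external links -> {{ref}}
    (duplicatefound, self.refusage) = self.doBuildSequenceListOfReferences(new_text)
    new_text = self.doRewriteReferences(new_text, self.refusage, refsectionname)
    self.references = self.doReorderReferences(self.references, self.refusage)
    return self.doUpdateReferencesSection(new_text, self.refusage, refsectionname)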
def doConvertExternalLinks(self, original_text):
+ """ Returns the text which is generated by converting external
links to
+ References. Adds References to reference list.
+
"""
- Returns the text which is generated by converting external links to References.
- Adds References to reference list.
- """
new_text = '' # Default is no text
skipsection = False
for text_line in original_text.splitlines(True): # Scan all text line by line
@@ -422,7 +441,7 @@
# TODO: recognize {{inline}} invisible footnotes when something can be done with them
#
# Ignore lines within comments
- if not text_line.startswith( u'<!--' ):
+ if not text_line.startswith( u'<!--'):
# Fix erroneous external links in double brackets
Rextlink = re.compile(r'(?i)\[\[(?P<linkname>http://[^\]]+?)\]\]')
# TODO: compiling the regex each time might be inefficient
@@ -485,20 +504,17 @@
m = re.search( r'==+(?P<sectionname>[^=]+)==', text_line )
if m: # if in a section, remember section name
sectionname = m.group('sectionname').strip()
- wikipedia.output( u'Section: %s' % sectionname )
+ pywikibot.output( u'Section: %s' % sectionname )
else: # else not a section name so look for reference
n = re.search( r'(i?){{(note|ibid)[|]', text_line )
if n: # if reference found
refsectionname = sectionname # found reference section
- wikipedia.output( u'Ref section: %s' % refsectionname )
+ pywikibot.output( u'Ref section: %s' % refsectionname )
break # stop looking
return refsectionname
def doFindAllCitationSections(self, original_text, refsectionname):
-
- """
- Returns list of sections which may contain citations.
- """
+ """ Returns list of sections which may contain citations.
"""
refsectionlist = [ ( refsectionname) ]
sectionname = ''
for text_line in original_text.splitlines(True): # Scan all text line by line
@@ -523,13 +539,13 @@
if m: # if in a section, check if should skip this section
if refsectionname != '': # if a certain section name has been identified
m_section = m.group('sectionname')
- wikipedia.output( u'Looking for "%s": "%s"' % (refsectionname,unicode(m_section)) )
+ pywikibot.output( u'Looking for "%s": "%s"' % (refsectionname,unicode(m_section)) )
if unicode(m_section.strip()) == unicode(refsectionname):
- wikipedia.output( u'Found Ref section.' )
+ pywikibot.output( u'Found Ref section.')
skipsection = True # skipsection left True so no further links converted
else: # else grab all possible sections
if m.group('sectionname').lower().strip() in referencesectionnames:
- wikipedia.output( 'RefSection found by default names: %s' % m.group('sectionname') )
+ pywikibot.output('RefSection found by default names: %s' % m.group('sectionname') )
skipsection = True # skipsection left True so no further links converted
if skipsection:
new_text = new_text + text_line # skip section, so retain text.
@@ -543,11 +559,11 @@
m = Rtext_line.search( text_line )
alphabet26 = u'abcdefghijklmnopqrstuvwxyz'
while m: # if found a reference
- if m.group('reftype').lower() in ( 'ref', 'ref_num', 'ref_label' ): # confirm ref
+ if m.group('reftype').lower() in ('ref', 'ref_num', 'ref_label'): # confirm ref
refkey = m.group('refname').strip()
if refkey != '':
if refkey in refusage:
- # wikipedia.output( u'refusage[%s] = %s' % (refkey,refusage[refkey]) )
+ # pywikibot.output( u'refusage[%s] = %s' % (refkey,refusage[refkey]) )
if refusage[refkey][2] == 0: # if first use of reference
text_line=text_line[:m.start(0)] + '{{ref|%s}}' % (refkey) + text_line[m.end(0):]
refusage[refkey][2] += 1 # count use of reference
@@ -574,60 +590,71 @@
urlfile = None
urlheaders = None
if len(extlink_linkname) > 5:
- socket.setdefaulttimeout( 20 ) # timeout in seconds
- wikipedia.get_throttle() # throttle down to Wikipedia rate
+ socket.setdefaulttimeout(20) # timeout in seconds
+ pywikibot.get_throttle() # throttle down to Wikipedia rate
# Obey robots.txt restrictions
rp = robotparser.RobotFileParser()
rp.set_url( extlink_linkname )
try:
rp.read() # read robots.txt
except (IOError, socket.timeout):
- wikipedia.output( u'Error accessing URL: %s' % unicode(extlink_linkname) )
+ pywikibot.output(u'Error accessing URL: %s'
+ % unicode(extlink_linkname))
else:
urlobj = None
if not rp.can_fetch( "*", extlink_linkname ):
- wikipedia.output( u'Robot prohibited: %s' % unicode(extlink_linkname) )
+ pywikibot.output(u'Robot prohibited: %s'
+ % unicode(extlink_linkname))
else: # else access allowed
try:
if have_httpcache:
- cache = HTTPCache( extlink_linkname )
+ cache = HTTPCache(extlink_linkname)
urlfile = cache.filename() # filename of cached date
urlheaders = cache.info()
else:
- (urlfile, urlheaders) = urllib.urlretrieve( extlink_linkname )
+ (urlfile, urlheaders) = urllib.urlretrieve(extlink_linkname)
except IOError:
- wikipedia.output( u'Error accessing URL. %s' % unicode(extlink_linkname) )
+ pywikibot.output(u'Error accessing URL. %s'
+ % unicode(extlink_linkname))
except (socket.herror, socket.gaierror), (err, msg):
- wikipedia.output( u'Error %i accessing URL, %s. %s' % (err, unicode(msg), unicode(extlink_linkname)) )
+ pywikibot.output(u'Error %i accessing URL, %s. %s'
+ % (err, unicode(msg),
+ unicode(extlink_linkname)))
except socket.timeout, msg:
- wikipedia.output( u'Error accessing URL, %s. %s' % (unicode(msg), unicode(extlink_linkname)) )
+ pywikibot.output(u'Error accessing URL, %s. %s'
+ % (unicode(msg),
+ unicode(extlink_linkname)))
except: # Ignore other errors
pass
if urlfile != None:
urlobj = open( urlfile )
if extlink_linkname.lower().endswith('.pdf'):
# If file has a PDF suffix
- wikipedia.output( u'PDF file.' )
+ pywikibot.output( u'PDF file.')
try:
pdfinfo_out = subprocess.Popen([r"pdfinfo","/dev/stdin"], stdin=urlobj, stdout=subprocess.PIPE, shell=False).communicate()[0]
for aline in pdfinfo_out.splitlines():
if aline.lower().startswith('title'):
urltitle = aline.split(None)[1:]
urltitle = ' '.join(urltitle)
- if urltitle != '': wikipedia.output(u'title: ' +urltitle )
+ if urltitle:
+ pywikibot.output(u'title: %s'
+ % urltitle)
else:
if aline.lower().startswith('author'):
urlauthor = aline.split(None)[1:]
urlauthor = ' '.join(urlauthor)
- if urlauthor != '': wikipedia.output(u'author: ' +urlauthor )
+ if urlauthor:
+ pywikibot.output(u'author: %s'
+ % urlauthor )
except ValueError:
- wikipedia.output( u'pdfinfo value error.' )
+ pywikibot.output( u'pdfinfo value error.')
except OSError:
- wikipedia.output( u'pdfinfo OS error.' )
+ pywikibot.output( u'pdfinfo OS error.')
except: # Ignore errors
- wikipedia.output( u'PDF processing error.' )
+ pywikibot.output( u'PDF processing error.')
pass
- wikipedia.output( u'PDF done.' )
+ pywikibot.output( u'PDF done.')
if urlobj:
urlobj.close()
else:
@@ -643,14 +670,16 @@
except:
urltitle = u' ' # error, no title
urltitle = u' '.join(urltitle.split()) # merge whitespace
- wikipedia.output( u'::::Title: %s' % urltitle )
+ pywikibot.output( u'::::Title: %s' % urltitle )
break # found a title so stop looking
else:
if maxalines < 1:
- wikipedia.output( u'No title in URL. %s' % unicode(extlink_linkname) )
+ pywikibot.output(
+ u'No title in URL. %s'
+ % unicode(extlink_linkname) )
else:
if urlobj != None:
- wikipedia.output( u'::+URL: ' + extlink_linkname )
+ pywikibot.output( u'::+URL: ' + extlink_linkname )
# urlinfo = urlobj.info()
aline = urlobj.read()
full_page = ''
@@ -664,7 +693,7 @@
try:
urltitle = unicode(titleRE.group('HTMLtitle'), 'utf-8')
urltitle = u' '.join(urltitle.split()) # merge whitespace
- wikipedia.output( u'::::Title: %s' % urltitle )
+ pywikibot.output( u'::::Title: %s' % urltitle )
except:
aline = urlobj.read()
continue
@@ -676,7 +705,7 @@
aline = urlobj.read()
else:
aline = urlobj.read()
- if urltitle != '': wikipedia.output( u'title: ' + urltitle )
+ if urltitle != '': pywikibot.output( u'title: ' + urltitle )
# Try a more advanced search
##from nltk.parser.probabilistic import *
##from nltk.tokenizer import *
@@ -698,17 +727,17 @@
#for tok in train_tokens: britaggerrules.train(tok, max_rules=200, min_score=2)
# brittaggerrul = britaggerrules.train(train_tokens, max_rules=200, min_score=2)
#britaggerrul = ()
- #britagger = BrillTagger(initial_tagger=unitagger, rules=britaggerrul, SUBTOKENS='WORDS' )
+ #britagger = BrillTagger(initial_tagger=unitagger, rules=britaggerrul, SUBTOKENS='WORDS')
# Training completed
# Examine text
##text_token = Token(TEXT=full_page)
##WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(text_token)
#unitagger.tag(text_token)
#britagger.tag(text_token)
- ### wikipedia.output( unicode(text_token) )
+ ### pywikibot.output( unicode(text_token) )
else:
- wikipedia.output( u'No data retrieved.' )
- socket.setdefaulttimeout( 200 ) # timeout in seconds
+ pywikibot.output( u'No data retrieved.')
+ socket.setdefaulttimeout(200)
urltitle = urltitle.replace(u'|',u':')
return urltitle.strip()
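doGetTitleFromURL ultimately boils down to fetching the page and extracting the HTML <title>; a much simplified sketch with none of the robots.txt, caching, or PDF handling (the helper name is made up):

# Simplified stand-in for the title lookup above.
import re, socket, urllib2

def get_html_title(url):
    socket.setdefaulttimeout(20)          # short timeout, as in the method above
    try:
        html = urllib2.urlopen(url).read()
    except IOError:
        return u''
    m = re.search(r'(?is)<title>(.*?)</title>', html)
    if not m:
        return u''
    title = unicode(m.group(1), 'utf-8', 'replace')
    return u' '.join(title.split()).replace(u'|', u':')   # merge whitespace, drop pipes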
@@ -731,11 +760,11 @@
new_text = u''
now = date.today()
if extlink_linktext == None or len(extlink_linktext.strip()) < 20:
- wikipedia.output( u'Fetching URL: %s' % unicode(extlink_linkname) )
+ pywikibot.output( u'Fetching URL: %s' % unicode(extlink_linkname) )
urltitle = self.doGetTitleFromURL( extlink_linkname ) # try to get title from URL
if urltitle == None or urltitle == '':
- urltitle = extlink_linkname # Assume linkname for title
- wikipedia.output( u'Title is: %s' % urltitle )
+ urltitle = extlink_linkname
+ pywikibot.output( u'Title is: %s' % urltitle )
extlink_linktext = urltitle
for newref in self.references: # scan through all references
if extlink_linkname in newref: # if undescribed linkname same as a previous entry
@@ -750,7 +779,7 @@
for (sitename, newscompany, stripprefix) in newssites:
if refname.startswith( sitename ):
# If there is a prefix to strip from the title
- if stripprefix and extlink_linktext.startswith( stripprefix ):
+ if stripprefix and extlink_linktext.startswith(stripprefix):
extlink_linktext = extlink_linktext[len(stripprefix):]
new_text = u'{{news reference | title=%s | url=%s | urldate=%s | org=%s }}' % ( extlink_linktext, extlink_linkname, now.isoformat(), newscompany ) + '\n'
break
@@ -764,12 +793,14 @@
a format suitable for the Notes section.
"""
# TODO: look up DOI info and create full reference
- urltitle = self.doGetTitleFromURL( 'http://dx.doi.org/' + doi_linktext ) # try to get title from URL
+ urltitle = self.doGetTitleFromURL('http://dx.doi.org/' + doi_linktext ) # try to get title from URL
refname = 'refbot%d' % refsequence
if urltitle:
- new_text = '# {{note|%s}} %s {{doi|%s}}' % (refname, urltitle, doi_linktext) + '\n'
+ new_text = '# {{note|%s}} %s {{doi|%s}}\n' \
+ % (refname, urltitle, doi_linktext)
else:
- new_text = '# {{note|%s}} {{doi|%s}}' % (refname, doi_linktext) + '\n'
+ new_text = '# {{note|%s}} {{doi|%s}}\n' \
+ % (refname, doi_linktext)
return (refname, new_text)
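The two branches above differ only in whether a title could be fetched for the DOI; the generated wikitext looks like this (refname and DOI are invented):

# Example of the note lines built above, with made-up values.
refname = 'refbot12'
doi_linktext = '10.1000/xyz123'
urltitle = u'Some article title'
print '# {{note|%s}} %s {{doi|%s}}' % (refname, urltitle, doi_linktext)
# -> # {{note|refbot12}} Some article title {{doi|10.1000/xyz123}}
print '# {{note|%s}} {{doi|%s}}' % (refname, doi_linktext)
# -> # {{note|refbot12}} {{doi|10.1000/xyz123}}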
def doBuildSequenceListOfReferences(self, original_text):
@@ -777,14 +808,14 @@
Returns a list with all found references and sequence numbers.
"""
duplicatefound = False
- refusage = {} # Nothing found yet
+ refusage = {}
# Data structure: refusage[reference_key] = [ sequence_in_document, count, count_during_dup_handling ]
for text_line in original_text.splitlines(True): # Scan all text line by line
# Check for various references
Rtext_line = re.compile(r'(?i){{(?P<reftype>ref|ref_num|ref_label)\|(?P<refname>[^}|]+?)}}')
m = Rtext_line.search( text_line )
while m: # if found a reference
- if m.group('reftype').lower() in ( 'ref', 'ref_num', 'ref_label' ): # confirm ref
+ if m.group('reftype').lower() in ('ref', 'ref_num', 'ref_label'): # confirm ref
refkey = m.group('refname').strip()
if refkey != '':
if refkey in refusage:
@@ -793,7 +824,7 @@
else:
refusage[refkey] = [len(refusage),0,0] # remember this reference
m = Rtext_line.search( text_line, m.end() )
- wikipedia.output( u'Number of refs: %d' % (len(refusage)) )
+ pywikibot.output( u'Number of refs: %d' % (len(refusage)) )
return (duplicatefound, refusage)
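The refusage mapping built here drives the later rewrite and reordering steps; a self-contained sketch of the same bookkeeping with invented sample text (the duplicate-handling detail is paraphrased, not copied from the script):

# Sketch: refusage[refname] = [sequence_of_first_use, duplicate_count, rewrite_counter]
import re

text = u"A.{{ref|alpha}} B.{{ref|beta}} C.{{ref|alpha}}"
refusage, duplicatefound = {}, False
pattern = re.compile(r'(?i){{(?P<reftype>ref|ref_num|ref_label)\|(?P<refname>[^}|]+?)}}')
for line in text.splitlines(True):
    m = pattern.search(line)
    while m:
        refkey = m.group('refname').strip()
        if refkey:
            if refkey in refusage:
                refusage[refkey][1] += 1    # assumed: count repeat uses as duplicates
                duplicatefound = True
            else:
                refusage[refkey] = [len(refusage), 0, 0]
        m = pattern.search(line, m.end())
print refusage   # alpha -> [0, 1, 0], beta -> [1, 0, 0]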
def doReadReferencesSection(self, original_text, refsectionname):
@@ -803,7 +834,7 @@
Contents of all Notes sections will be read.
"""
# TODO: support subsections within Notes
- new_text = '' # Default is no text
+ new_text = ''
intargetsection = False
for text_line in original_text.splitlines(True): # Scan all text line by line
# Check for target section
@@ -811,19 +842,20 @@
if m: # if in a section, check if Notes section
if refsectionname != '': # if a certain section name has been identified
m_section = m.group('sectionname')
- wikipedia.output( u'Looking for "%s": "%s"' % (refsectionname,m_section) )
+ pywikibot.output(u'Looking for "%s": "%s"'
+ % (refsectionname,m_section) )
if unicode(m_section.strip()) == unicode(refsectionname):
- wikipedia.output( u'Read Ref section.' )
- intargetsection = True # flag as being in section
+ pywikibot.output(u'Read Ref section.')
+ intargetsection = True
new_text = new_text + text_line
else:
- intargetsection = False # flag as not being in section
+ intargetsection = False
else: # else grab all possible sections
if m.group('sectionname').lower().strip() in referencesectionnames:
- intargetsection = True # flag as being in section
+ intargetsection = True
new_text = new_text + text_line
else:
- intargetsection = False # flag as not being in section
+ intargetsection = False
else:
if intargetsection: # if inside target section, remember this reference line
if text_line.strip() != '':
@@ -837,8 +869,8 @@
if intargetsection: # if still inside target section
# Convert any # wiki list to *; will be converted later if a reference
if text_line[0] == '#':
- text_line = '*' + text_line[1:] # replace # with * wiki
- self.references.append( text_line.rstrip() + u'\n' ) # Append line to references
+ text_line = '*' + text_line[1:]
+ self.references.append(text_line.rstrip() + u'\n')
new_text = new_text + text_line.rstrip() + u'\n'
return new_text
@@ -891,7 +923,7 @@
Returns the text which is generated by rebuilding the Notes section.
Rewrite Notes section from references list.
"""
- new_text = '' # Default is no text
+ new_text = ''
intargetsection = False
for text_line in original_text.splitlines(True): # Scan all text line by line
# Check for target section
@@ -899,9 +931,9 @@
if m: # if in a section, check if Notes section
if refsectionname != '': # if a certain section name has been identified
m_section = m.group('sectionname')
- wikipedia.output( u'Looking for "%s": "%s"' % (refsectionname,m_section) )
+ pywikibot.output( u'Looking for "%s": "%s"' % (refsectionname,m_section) )
if unicode(m_section.strip()) == unicode(refsectionname):
- wikipedia.output( u'Updating Ref section.' )
+ pywikibot.output( u'Updating Ref section.')
intargetsection = True # flag as being in section
else:
intargetsection = False # flag as not being in section
@@ -933,7 +965,7 @@
if not intargetsection: # if not in Notes section, remember line
new_text = new_text + text_line # append new line to new text
# If references list not emptied, there was no Notes section found
- if self.references != []: # empty references
+ if self.references != []:
# New Notes section needs to be created at bottom.
text_line_counter = 0 # current line
last_text_line_counter_value = 0 # number of last line of possible text
@@ -978,26 +1010,29 @@
# Load the page's text from the wiki
original_text = pl.get()
if pl.editRestriction:
- wikipedia.output(u'Skipping locked page %s' % pl.title())
+ pywikibot.output(u'Skipping locked page %s' % pl.title())
continue
- except wikipedia.NoPage:
- wikipedia.output(u'Page %s not found' % pl.title())
+ except pywikibot.NoPage:
+ pywikibot.output(u'Page %s not found' % pl.title())
continue
- except wikipedia.IsRedirectPage:
+ except pywikibot.IsRedirectPage:
continue
match = self.checkExceptions(original_text)
# skip all pages that contain certain texts
if match:
- wikipedia.output(u'Skipping %s because it contains %s' % (pl.title(), match))
+ pywikibot.output(u'Skipping %s because it contains %s'
+ % (pl.title(), match))
else:
new_text = self.doReplacements(original_text)
if new_text == original_text:
- wikipedia.output('No changes were necessary in %s' % pl.title())
+ pywikibot.output('No changes were necessary in %s'
+ % pl.title())
else:
- wikipedia.output(u'>>> %s <<<' % pl.title())
- wikipedia.showDiff(original_text, new_text)
+ pywikibot.output(u'>>> %s <<<' % pl.title())
+ pywikibot.showDiff(original_text, new_text)
if not self.acceptall:
- choice = wikipedia.input(u'Do you want to accept these changes? [y|n|a(ll)]')
+ choice = pywikibot.input(
+ u'Do you want to accept these changes? [y|n|a(ll)]')
if choice in ['a', 'A']:
self.acceptall = True
if self.acceptall or choice in ['y', 'Y']:
@@ -1034,7 +1069,7 @@
# default to -1 which means all namespaces will be processed
namespace = -1
# Load default summary message.
- editSummary = wikipedia.translate(wikipedia.getSite(), msg)
+ editSummary = pywikibot.translate(pywikibot.getSite(), msg)
# List of references in Notes section
references = []
# Notes sequence number
@@ -1043,31 +1078,33 @@
refusage = {}
# Read commandline parameters.
- for arg in wikipedia.handleArgs():
+ for arg in pywikibot.handleArgs():
if arg == '-regex':
regex = True
elif arg.startswith('-file'):
if len(arg) == 5:
- textfilename = wikipedia.input(u'Please enter the filename:')
+ textfilename = pywikibot.input(u'Please enter the filename:')
else:
textfilename = arg[6:]
source = 'textfile'
elif arg.startswith('-cat'):
if len(arg) == 4:
- categoryname = wikipedia.input(u'Please enter the category name:')
+ categoryname = pywikibot.input(
+ u'Please enter the category name:')
else:
categoryname = arg[5:]
source = 'category'
elif arg.startswith('-sql'):
if len(arg) == 4:
- sqlfilename = wikipedia.input(u'Please enter the SQL dump\'s filename:')
+ sqlfilename = pywikibot.input(
+ u'Please enter the SQL dump\'s filename:')
else:
sqlfilename = arg[5:]
source = 'sqldump'
elif arg.startswith('-page'):
if len(arg) == 5:
pagenames.append(
- wikipedia.input(u'Which page do you want to change?'))
+ pywikibot.input(u'Which page do you want to change?'))
else:
pagenames.append(arg[6:])
source = 'userinput'
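The option handling above relies on a fixed-length flag followed by ':' and a value; a tiny sketch of that slicing for the -page case (the title is a placeholder):

# How '-page:<title>' style arguments are unpacked above.
arg = u'-page:Sandbox'
if arg.startswith('-page'):
    if len(arg) == 5:
        pass                     # bare '-page': the code above prompts the user instead
    else:
        pagename = arg[6:]       # everything after '-page:' -> u'Sandbox'
        print pagename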
@@ -1085,16 +1122,19 @@
if source == None or len(commandline_replacements) not in [0, 2]:
# syntax error, show help text from the top of this file
- wikipedia.output(__doc__, 'utf-8')
+ pywikibot.output(__doc__, 'utf-8')
return
if (len(commandline_replacements) == 2):
replacements[commandline_replacements[0]] = commandline_replacements[1]
- editSummary = wikipedia.translate(wikipedia.getSite(), msg ) % ' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')'
+ editSummary = pywikibot.translate(pywikibot.getSite(), msg)
+ % ' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')'
else:
change = ''
- default_summary_message = wikipedia.translate(wikipedia.getSite(), msg) % change
- wikipedia.output(u'The summary message will default to: %s' % default_summary_message)
- summary_message = wikipedia.input(u'Press Enter to use this default message, or enter a description of the changes your bot will make:')
+ default_summary_message = pywikibot.translate(pywikibot.getSite(), msg) % change
+ pywikibot.output(u'The summary message will default to: %s'
+ % default_summary_message)
+ summary_message = pywikibot.input(
+ u'Press Enter to use this default message, or enter a description of the changes your bot will make:')
if summary_message == '':
summary_message = default_summary_message
editSummary = summary_message
@@ -1103,18 +1143,20 @@
try:
fix = fixes['ALTREFS']
except KeyError:
- wikipedia.output(u'Available predefined fixes are: %s' % fixes.keys())
+ pywikibot.output(u'Available predefined fixes are: %s'
+ % fixes.keys())
return
if 'regex' in fix:
regex = fix['regex']
if 'msg' in fix:
- editSummary = wikipedia.translate(wikipedia.getSite(), fix['msg'])
+ editSummary = pywikibot.translate(pywikibot.getSite(), fix['msg'])
if 'exceptions' in fix:
exceptions = fix['exceptions']
replacements = fix['replacements']
- gen = ReplacePageGenerator(source, replacements, exceptions, regex, namespace,
- textfilename, sqlfilename, categoryname, pagenames)
+ gen = ReplacePageGenerator(source, replacements, exceptions, regex,
+ namespace, textfilename, sqlfilename,
+ categoryname, pagenames)
preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
bot = ReplaceRobot(preloadingGen, replacements, refsequence, references,
refusage, exceptions, regex, acceptall, editSummary)
@@ -1125,4 +1167,4 @@
try:
main()
finally:
- wikipedia.stopme()
+ pywikibot.stopme()