Revision: 8700
Author: xqt
Date: 2010-11-06 02:18:01 +0000 (Sat, 06 Nov 2010)
Log Message:
-----------
Saves the titles to disk (patch bug #3093682 submitted by Bináris). Thanks!
-- Introduced two new command line arguments, -save and -savenew, and an updated
main() to process them
-- Updated ReplaceRobot class with two new functions, an edit counter, and a new
parameter
-- Extended help (description of new feature and one more example)
-- Some code wrappings to fit in 80 characters I found because I know xqt likes them :)
Modified Paths:
--------------
trunk/pywikipedia/replace.py
Modified: trunk/pywikipedia/replace.py
===================================================================
--- trunk/pywikipedia/replace.py 2010-11-06 00:48:58 UTC (rev 8699)
+++ trunk/pywikipedia/replace.py 2010-11-06 02:18:01 UTC (rev 8700)
@@ -32,6 +32,17 @@
before the one specified (may also be given as
-xmlstart:Article).
+-save Saves the titles of the articles to a file instead of
+ modifying the articles. This way you may collect titles to
+ work on in automatic mode, and process them later with
+ -file. Opens the file for append, if exists.
+ If you insert the contents of the file into a wikipage, it
+ will appear as a numbered list, and may be used with -links.
+ Argument may also be given as "-save:filename".
+
+-savenew Just like -save, except that overwrites the existing file.
+ Argument may also be given as "-savenew:filename".
+
-addcat:cat_name Adds "cat_name" category to every altered page.
-excepttitle:XYZ Skip pages with titles that contain XYZ. If the -regex
@@ -109,21 +120,30 @@
python replace.py -page:John_Doe -fix:isbn
+Let's suppose, you want to change "color" to "colour" manually, but gathering
+the articles is too slow, so you want to save the list while you are sleeping.
+You have Windows, so "python" is not necessary. Use this:
+
+ replace.py -xml -save:color.txt color colour -always
+You may use color.txt later with -file or -links, if you upload it to the wiki.
+
This command will change 'referer' to 'referrer', but not in pages which
talk about HTTP, where the typo has become part of the standard:
python replace.py referer referrer -file:typos.txt -excepttext:HTTP
+
+Please type "replace.py | more" if you can't read the top of the help.
"""
from __future__ import generators
#
-# (C) Daniel Herding & the Pywikipedia team, 2004-2009
+# (C) Daniel Herding & the Pywikipedia team, 2004-2010
#
__version__='$Id$'
#
# Distributed under the terms of the MIT license.
#
-import sys, re, time
+import sys, re, time, codecs
import wikipedia as pywikibot
import pagegenerators
import editarticle
@@ -263,7 +283,7 @@
"""
def __init__(self, generator, replacements, exceptions={},
acceptall=False, allowoverlap=False, recursive=False,
- addedCat=None, sleep=None, editSummary=''):
+ addedCat=None, sleep=None, editSummary='', articles=None):
"""
Arguments:
* generator - A generator that yields Page objects.
@@ -278,6 +298,8 @@
replaced.
* addedCat - If set to a value, add this category to every page
touched.
+ * articles - An open file to save the page titles. If None,
+ we work on our wikisite immediately (default).
Structure of the exceptions dictionary:
This dictionary can have these keys:
@@ -310,6 +332,11 @@
self.sleep = sleep
# Some function to set default editSummary should probably be added
self.editSummary = editSummary
+ self.articles = articles
+
+ #An edit counter to split the file by 100 titles if -save or -savenew
+ #is on, and to display the number of edited articles otherwise.
+ self.editcounter = 0
def isTitleExcepted(self, title):
"""
@@ -354,6 +381,31 @@
allowoverlap=self.allowoverlap)
return new_text
+    def writeEditCounter(self):
+        """Report the number of saved titles or changed pages."""
+        # Singular wording only for exactly one item; any other count is plural.
+        if self.editcounter == 1:
+            verb = ' was'
+        else:
+            verb = 's were'
+        if self.articles:
+            # A title file is set: titles were collected instead of edited.
+            pywikibot.output(u'%d title%s saved.' % (self.editcounter, verb))
+        else:
+            pywikibot.output(u'%d page%s changed.' % (self.editcounter, verb))
+
+    def splitLine(self):
+        """Return a separator line after every 100th title, otherwise ''.
+
+        The separator is an HTML comment, so the saved list can be inserted
+        into a wikipage unchanged; it only makes the file more readable.
+        """
+        if self.editcounter % 100:
+            return ''
+        # The counter is a multiple of 100: mark the boundary of the batch.
+        return (u'<!-- ***** %dth title is above this line. ***** -->\n'
+                % self.editcounter)
+
def run(self):
"""
Starts the robot.
@@ -408,7 +460,8 @@
break
choice = pywikibot.inputChoice(
u'Do you want to accept these changes?',
- ['Yes', 'No', 'Edit', 'open in
Browser', 'All', "Quit"],
+ ['Yes', 'No', 'Edit', 'open in
Browser', 'All',
+ 'Quit'],
['y', 'N', 'e', 'b',
'a', 'q'], 'N')
if choice == 'e':
editor = editarticle.TextEditor()
@@ -427,30 +480,57 @@
new_text = original_text
continue
if choice == 'q':
+ self.writeEditCounter()
return
if choice == 'a':
self.acceptall = True
if choice == 'y':
- page.put_async(new_text, self.editSummary)
+ if not self.articles:
+ #Primary behaviour: working on wiki
+ page.put_async(new_text, self.editSummary)
+ self.editcounter += 1
+ #Bug: this increments even if put_async fails
+ #This is separately in two clauses of if for
+ #future purposes to get feedback from put_async
+ else:
+ #Save the title for later processing instead of editing
+ self.editcounter += 1
+ self.articles.write(u'#%s\n%s'
+ % (page.title(asLink=True),
+ self.splitLine()))
+ self.articles.flush() # For the peace of our soul :-)
# choice must be 'N'
break
if self.acceptall and new_text != original_text:
- try:
- page.put(new_text, self.editSummary)
- except pywikibot.EditConflict:
- pywikibot.output(u'Skipping %s because of edit conflict'
- % (page.title(),))
- except pywikibot.SpamfilterError, e:
- pywikibot.output(
- u'Cannot change %s because of blacklist entry %s'
- % (page.title(), e.url))
- except pywikibot.PageNotSaved, error:
- pywikibot.output(u'Error putting page: %s'
- % (error.args,))
- except pywikibot.LockedPage:
- pywikibot.output(u'Skipping %s (locked page)'
- % (page.title(),))
+ if not self.articles:
+ #Primary behaviour: working on wiki
+ try:
+ page.put(new_text, self.editSummary)
+ self.editcounter += 1 #increment only on success
+ except pywikibot.EditConflict:
+ pywikibot.output(u'Skipping %s because of edit conflict'
+ % (page.title(),))
+ except pywikibot.SpamfilterError, e:
+ pywikibot.output(
+ u'Cannot change %s because of blacklist entry %s'
+ % (page.title(), e.url))
+ except pywikibot.PageNotSaved, error:
+ pywikibot.output(u'Error putting page: %s'
+ % (error.args,))
+ except pywikibot.LockedPage:
+ pywikibot.output(u'Skipping %s (locked page)'
+ % (page.title(),))
+ else:
+ #Save the title for later processing instead of editing
+ self.editcounter += 1
+ self.articles.write(u'#%s\n%s'
+ % (page.title(asLink=True),
+ self.splitLine()))
+ self.articles.flush()
+ #Finally:
+ self.writeEditCounter()
+
def prepareRegexForMySQL(pattern):
pattern = pattern.replace('\s', '[:space:]')
pattern = pattern.replace('\d', '[:digit:]')
@@ -517,6 +597,11 @@
# Between a regex and another (using -fix) sleep some time (not to waste
# too much CPU
sleep = None
+ # Do not save the page titles, rather work on wiki
+ titlefile = None
+ filename = None
+ # If we save, primary behaviour is append rather than new file
+ append = True
# Read commandline parameters.
for arg in pywikibot.handleArgs(*args):
@@ -539,9 +624,22 @@
elif arg.startswith('-page'):
if len(arg) == 5:
PageTitles.append(pywikibot.input(
- u'Which page do you want to change?'))
+ u'Which page do you want to change?'))
else:
PageTitles.append(arg[6:])
+        elif arg.startswith('-savenew'):
+            append = False  # -savenew overwrites an existing file
+            if len(arg) == 8:
+                filename = pywikibot.input(
+u'Please enter the filename to save the titles \n(will be deleted if exists):')
+            else:
+                filename = arg[9:]
+        elif arg.startswith('-save'):
+            # Plain -save keeps the default append mode.
+            if len(arg) == 5:
+                filename = pywikibot.input(
+                    u'Please enter the filename to save the titles:')
+            else:
+                filename = arg[6:]
elif arg.startswith('-excepttitle:'):
exceptions['title'].append(arg[13:])
elif arg.startswith('-requiretitle:'):
@@ -585,8 +683,9 @@
replacements.append((commandline_replacements[0],
commandline_replacements[1]))
if not summary_commandline:
- editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % (' (-'
+ commandline_replacements[0] + ' +'
- + commandline_replacements[1] + ')')
+ editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % (
+ ' (-%s +%s)' % (commandline_replacements[0],
+ commandline_replacements[1]))
elif (len(commandline_replacements) > 1):
if (fix is None):
for i in xrange (0, len(commandline_replacements), 2):
@@ -598,7 +697,8 @@
for i in range(0, len(commandline_replacements), 2)]
replacementsDescription = '(%s)' % ', '.join(
[('-' + pair[0] + ' +' + pair[1]) for pair in
pairs])
- editSummary = pywikibot.translate(pywikibot.getSite(), msg ) %
replacementsDescription
+ editSummary = pywikibot.translate(pywikibot.getSite(), msg ) \
+ % replacementsDescription
else:
raise pywikibot.Error(
'Specifying -fix with replacements is undefined')
@@ -710,10 +810,30 @@
preloadingGen = pagegenerators.PreloadingGenerator(gen,
pageNumber=20, lookahead=100)
else:
- preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=maxquerysize)
+ preloadingGen = pagegenerators.PreloadingGenerator(gen,
+ pageNumber=maxquerysize)
+
+ #Finally we open the file for page titles or set article to None
+ if filename:
+ try:
+ #This opens in strict error mode, that means bot will stop
+ #on encoding errors with ValueError.
+ #See http://docs.python.org/library/codecs.html#codecs.open
+ titlefile = codecs.open(filename, encoding='utf-8',
+ mode=(lambda x: x and 'a' or
'w')(append))
+ except IOError:
+ pywikibot.output("%s cannot be opened for writing." %
+ filename)
+ return
bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall,
- allowoverlap, recursive, add_cat, sleep, editSummary)
- bot.run()
+ allowoverlap, recursive, add_cat, sleep, editSummary,
+ titlefile)
+ try:
+ bot.run()
+ finally:
+ if titlefile:
+ #Just for the spirit of programming (it was flushed)
+ titlefile.close()
if __name__ == "__main__":