Revision: 6740
Author: nicdumz
Date: 2009-04-27 15:41:56 +0000 (Mon, 27 Apr 2009)
Log Message:
-----------
#2782025 : [sorting of possibilities]
Falling back to lists for the values of PageTree.tree to keep
order. Adding a lengthy comment explaining the reason for this
implementation choice: yes, using dicts would be faster, but no,
we can't use them.
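
For illustration, a minimal sketch of the trade-off (plain strings stand in for the Site and Page objects here, a hypothetical simplification; on the Python 2 interpreters this code targets, dict iteration order is arbitrary):

    site = 'de'
    found = ['Berlin', 'Bern', 'Wien']      # discovery order: closest hops first

    tree_as_dict = {site: {}}
    for page in found:
        tree_as_dict[site][page] = True     # O(1) remove() later, but the
    # arbitrary dict iteration order loses the "closest to originPage
    # first" ordering: list(tree_as_dict[site]) may come out shuffled.

    tree_as_list = {site: []}
    for page in found:
        tree_as_list[site].append(page)     # O(n) remove(), but order kept:
    assert tree_as_list[site] == ['Berlin', 'Bern', 'Wien']
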
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-04-27 15:21:01 UTC (rev 6739)
+++ trunk/pywikipedia/interwiki.py 2009-04-27 15:41:56 UTC (rev 6740)
@@ -508,6 +508,26 @@
Allows filtering efficiently by Site.
"""
def __init__(self):
+ # self.tree :
+ # Dictionary:
+ # keys: Site
+ # values: list of pages
+ # All pages found on a given Site are kept in
+ # self.tree[site]
+
+ # While using dict values would be faster for
+ # the remove() operation,
+ # keeping list values is important, because
+ # the order in which the pages were found matters:
+ # the earlier a page is found, the closer it is to the
+ # Subject.originPage. Chances are that pages found within
+ # two interwiki hops of the originPage are more related
+ # to the original topic than pages found later on, after
+ # 3, 4, 5 or more interwiki hops.
+
+ # Keeping this order is hence important for displaying an
+ # ordered list of pages to the user when they are asked to
+ # resolve conflicts.
self.tree = {}
self.size = 0
@@ -527,15 +547,15 @@
def add(self, page):
site = page.site()
if not site in self.tree:
- self.tree[site] = {}
- self.tree[site][page] = True
+ self.tree[site] = []
+ self.tree[site].append(page)
self.size += 1
def remove(self, page):
try:
- del self.tree[page.site()][page]
+ self.tree[page.site()].remove(page)
self.size -= 1
- except KeyError:
+ except (KeyError, ValueError):
pass
def removeSite(self, site):
@@ -556,8 +576,8 @@
yield site, len(d)
def __iter__(self):
- for site, d in self.tree.iteritems():
- for page in d:
+ for site, plist in self.tree.iteritems():
+ for page in plist:
yield page
class Subject(object):
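
As a usage illustration of the PageTree interface above — a hedged sketch, since the real class is fed wikipedia.Page objects rather than this hypothetical stand-in:

    class FakePage(object):
        # Hypothetical stand-in exposing the one method PageTree calls.
        def __init__(self, site, title):
            self._site, self.title = site, title
        def site(self):
            return self._site

    tree = PageTree()
    tree.add(FakePage('en', 'Coffee'))
    tree.add(FakePage('en', 'Tea'))
    tree.add(FakePage('fr', 'Cafe'))

    assert len(tree) == 3
    # filter() yields pages in discovery order, per the comment above:
    assert [p.title for p in tree.filter('en')] == ['Coffee', 'Tea']
    assert dict(tree.siteCounts()) == {'en': 2, 'fr': 1}
    tree.removeSite('en')
    assert len(tree) == 1
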
Revision: 6734
Author: nicdumz
Date: 2009-04-27 12:22:14 +0000 (Mon, 27 Apr 2009)
Log Message:
-----------
Merging changes from trunk
Modified Paths:
--------------
branches/rewrite/scripts/interwiki.py
Property Changed:
----------------
branches/rewrite/scripts/interwiki.py
Modified: branches/rewrite/scripts/interwiki.py
===================================================================
--- branches/rewrite/scripts/interwiki.py 2009-04-27 12:17:21 UTC (rev 6733)
+++ branches/rewrite/scripts/interwiki.py 2009-04-27 12:22:14 UTC (rev 6734)
@@ -295,11 +295,11 @@
"""
seq2 = copy.copy(seq)
if key:
- if cmp == None:
+ if cmp is None:
cmp = __builtins__.cmp
seq2.sort(lambda x,y: cmp(key(x), key(y)))
else:
- if cmp == None:
+ if cmp is None:
seq2.sort()
else:
seq2.sort(cmp)
@@ -502,6 +502,64 @@
nobackonly = False
hintsareright = False
+class PageTree(object):
+ """
+ Structure to manipulate a set of pages.
+ Allows filtering efficiently by Site.
+ """
+ def __init__(self):
+ self.tree = {}
+ self.size = 0
+
+ def filter(self, site):
+ """
+ Iterates over pages that are in Site site
+ """
+ try:
+ for page in self.tree[site]:
+ yield page
+ except KeyError:
+ pass
+
+ def __len__(self):
+ return self.size
+
+ def add(self, page):
+ site = page.site()
+ if not site in self.tree:
+ self.tree[site] = {}
+ self.tree[site][page] = True
+ self.size += 1
+
+ def remove(self, page):
+ try:
+ del self.tree[page.site()][page]
+ self.size -= 1
+ except KeyError:
+ pass
+
+ def removeSite(self, site):
+ """
+ Removes all pages from Site site
+ """
+ try:
+ self.size -= len(self.tree[site])
+ del self.tree[site]
+ except KeyError:
+ pass
+
+ def siteCounts(self):
+ """
+ Yields (Site, number of pages in site) pairs
+ """
+ for site, d in self.tree.iteritems():
+ yield site, len(d)
+
+ def __iter__(self):
+ for site, d in self.tree.iteritems():
+ for page in d:
+ yield page
+
class Subject(object):
"""
Class to follow the progress of a single 'subject' (i.e. a page with
@@ -515,10 +573,12 @@
self.originPage = originPage
# todo is a list of all pages that still need to be analyzed.
# Mark the origin page as todo.
- self.todo = [originPage]
+ self.todo = PageTree()
+ self.todo.add(originPage)
+
# done is a list of all pages that have been analyzed and that
# are known to belong to this subject.
- self.done = []
+ self.done = PageTree()
# foundIn is a dictionary where pages are keys and lists of
# pages are values. It stores where we found each page.
# As we haven't yet found a page that links to the origin page, we
@@ -526,7 +586,7 @@
self.foundIn = {self.originPage:[]}
# This is a list of all pages that are currently scheduled for
# download.
- self.pending = []
+ self.pending = PageTree()
if globalvar.hintsareright:
# This is a set of sites that we got hits to
self.hintedsites = set()
@@ -544,8 +604,8 @@
first one will be returned.
Otherwise, None will be returned.
"""
- for page in self.done + self.pending:
- if page.site() == site:
+ for tree in [self.done, self.pending]:
+ for page in tree.filter(site):
if page.exists() and page.isDisambig():
return page
return None
@@ -557,8 +617,8 @@
first one will be returned.
Otherwise, None will be returned.
"""
- for page in self.done + self.pending:
- if page.site() == site:
+ for tree in [self.done, self.pending]:
+ for page in tree.filter(site):
if page.exists() and not page.isDisambig() and not page.isRedirectPage():
return page
return None
@@ -570,8 +630,8 @@
have been found, the first one will be returned.
Otherwise, None will be returned.
"""
- for page in self.done + self.pending + self.todo:
- if page.site() == site:
+ for tree in [self.done, self.pending, self.todo]:
+ for page in tree.filter(site):
if page.namespace() == self.originPage.namespace():
if page.exists() and not page.isRedirectPage():
return page
@@ -590,23 +650,19 @@
pages = titletranslate.translate(self.originPage, hints = hints, auto = globalvar.auto, removebrackets = globalvar.hintnobracket)
for page in pages:
- self.todo.append(page)
+ self.todo.add(page)
self.foundIn[page] = [None]
if keephintedsites:
self.hintedsites.add(page.site)
- def openSites(self, allowdoubles = False):
- """Return a list of sites for all things we still need to do"""
- distinctSites = {}
+ def openSites(self):
+ """
+ Iterator. Yields (site, count) pairs:
+ * site is a site where we still have work to do
+ * count is the number of pages in that site that need work
+ """
+ return self.todo.siteCounts()
- for page in self.todo:
- site = page.site()
- if allowdoubles:
- distinctSites[page] = site
- else:
- distinctSites[site] = site
- return distinctSites.values()
-
def willWorkOn(self, site):
"""
By calling this method, you 'promise' this instance that you will
@@ -615,24 +671,25 @@
"""
# Bug-check: Isn't there any work still in progress? We can't work on
# different sites at a time!
- if self.pending != []:
+ if len(self.pending) > 0:
raise 'BUG: Can\'t start to work on %s; still working on %s' % (site, self.pending)
# Prepare a list of suitable pages
- for page in self.todo:
- if page.site() == site:
- self.pending.append(page)
- for page in self.pending:
- self.todo.remove(page)
+ result = []
+ for page in self.todo.filter(site):
+ self.pending.add(page)
+ result.append(page)
+
+ self.todo.removeSite(site)
# If there are any, return them. Otherwise, nothing is in progress.
- return self.pending
+ return result
def makeForcedStop(self,counter):
"""
Ends work on the page before the normal end.
"""
- for page in self.todo:
- counter.minus(page.site())
- self.todo = []
+ for site, count in self.todo.siteCounts():
+ counter.minus(site, count)
+ self.todo = PageTree()
self.forcedStop = True
def addIfNew(self, page, counter, linkingPage):
@@ -658,10 +715,15 @@
return False
else:
self.foundIn[page] = [linkingPage]
- self.todo.append(page)
+ self.todo.add(page)
counter.plus(page.site())
return True
+ def skipPage(self, page, target, counter):
+ return self.isIgnored(target) or \
+ self.namespaceMismatch(page, target, counter) or \
+ self.wiktionaryMismatch(target)
+
def namespaceMismatch(self, linkingPage, linkedPage, counter):
"""
Checks whether or not the given page has another namespace
@@ -745,14 +807,16 @@
else:
choice = 'y'
if self.originPage.isDisambig() and not page.isDisambig():
- if self.getFoundDisambig(page.site()):
- wikipedia.output(u"NOTE: Ignoring non-disambiguation page %s for %s because disambiguation page %s has already been found." % (page.aslink(True), self.originPage.aslink(True), self.getFoundDisambig(page.site()).aslink(True)))
+ disambig = self.getFoundDisambig(page.site())
+ if disambig:
+ wikipedia.output(u"NOTE: Ignoring non-disambiguation page %s for %s because disambiguation page %s has already been found." % (page.aslink(True), self.originPage.aslink(True), disambig.aslink(True)))
return (True, None)
else:
choice = wikipedia.inputChoice('WARNING: %s is a disambiguation page, but %s doesn\'t seem to be one. Follow it anyway?' % (self.originPage.aslink(True), page.aslink(True)), ['Yes', 'No', 'Add an alternative', 'Give up'], ['y', 'n', 'a', 'g'])
elif not self.originPage.isDisambig() and page.isDisambig():
- if self.getFoundNonDisambig(page.site()):
- wikipedia.output(u"NOTE: Ignoring disambiguation page %s for %s because non-disambiguation page %s has already been found." % (page.aslink(True), self.originPage.aslink(True), self.getFoundNonDisambig(page.site()).aslink(True)))
+ nondisambig = self.getFoundNonDisambig(page.site())
+ if nondisambig:
+ wikipedia.output(u"NOTE: Ignoring disambiguation page %s for %s because non-disambiguation page %s has already been found." % (page.aslink(True), self.originPage.aslink(True), nondisambig.aslink(True)))
return (True, None)
else:
choice = wikipedia.inputChoice('WARNING: %s doesn\'t seem to be a disambiguation page, but %s is one. Follow it anyway?' % (self.originPage.aslink(True), page.aslink(True)), ['Yes', 'No', 'Add an alternative', 'Give up'], ['y', 'n', 'a', 'g'])
@@ -822,124 +886,143 @@
# Loop over all the pages that should have been taken care of
for page in self.pending:
# Mark the page as done
- self.done.append(page)
+ self.done.add(page)
# make sure that none of the linked items is an auto item
if globalvar.skipauto:
dictName, year = page.autoFormat()
- if dictName != None:
+ if dictName is not None:
wikipedia.output(u'WARNING: %s:%s relates to %s:%s, which is an auto entry %s(%s)' % (self.originPage.site().language(), self.originPage.title(), page.site().language(),page.title(),dictName,year))
# Register this fact at the todo-counter.
counter.minus(page.site())
# Now check whether any interwiki links should be added to the
# todo list.
- if page.section() and not page.isRedirectPage():
- # We have been referred to a part of a page, not the whole page. Do not follow references.
- pass
- else:
+
+
+ if not page.exists():
+ wikipedia.output(u"NOTE: %s does not exist" % page.aslink(True))
+ if page == self.originPage:
+ # The page we are working on is the page that does not exist.
+ # No use in doing any work on it in that case.
+ for site, count in self.todo.siteCounts():
+ counter.minus(site, count)
+ self.todo = PageTree()
+ # In some rare cases we may already have checked some 'automatic' links
+ self.done = PageTree()
+ continue
+
+ elif page.isRedirectPage():
+ redirectTargetPage = page.getRedirectTarget()
+ wikipedia.output(u"NOTE: %s is redirect to %s" % (page.aslink(True), redirectTargetPage.aslink(True)))
+ if page == self.originPage:
+ if globalvar.initialredirect:
+ self.originPage = redirectTargetPage
+ self.todo.add(redirectTargetPage)
+ counter.plus(redirectTargetPage.site)
+ else:
+ # This is a redirect page to the origin. We don't need to
+ # follow the redirection.
+ # In this case we can also stop all hints!
+ for site, count in self.todo.siteCounts():
+ counter.minus(site, count)
+ self.todo = PageTree()
+ elif not globalvar.followredirect:
+ wikipedia.output(u"NOTE: not following redirects.")
+ elif page.site().family == redirectTargetPage.site().family \
+ and not self.skipPage(page, redirectTargetPage, counter):
+ if self.addIfNew(redirectTargetPage, counter, page):
+ if config.interwiki_shownew:
+ wikipedia.output(u"%s: %s gives new redirect %s" % (self.originPage.aslink(), page.aslink(True), redirectTargetPage.aslink(True)))
+
+ continue
+
+ elif page.section():
+ continue
+
+
+ # Page exists, isn't a redirect, and is a plain link (no section)
+
+ try:
+ iw = page.interwiki()
+ except wikipedia.NoSuchSite:
+ wikipedia.output(u"NOTE: site %s does not exist" % page.site())
+ continue
+
+ (skip, alternativePage) = self.disambigMismatch(page, counter)
+ if skip:
+ wikipedia.output(u"NOTE: ignoring %s and its interwiki links" % page.aslink(True))
+ self.done.remove(page)
+ iw = ()
+ if alternativePage:
+ # add the page that was entered by the user
+ self.addIfNew(alternativePage, counter, None)
+
+ duplicate = None
+ for p in self.done.filter(page.site()):
+ if p != page and p.exists() and not p.isRedirectPage():
+ duplicate = p
+ break
+
+ if self.originPage == page:
+ self.untranslated = (len(iw) == 0)
+ if globalvar.untranslatedonly:
+ # Ignore the interwiki links.
+ iw = ()
+
+ elif globalvar.autonomous and duplicate:
+
+ wikipedia.output(u"Stopping work on %s because duplicate pages"\
+ " %s and %s are found" % (self.originPage.aslink(),
+ duplicate.aslink(True),
+ page.aslink(True)))
+ self.makeForcedStop(counter)
try:
- iw = page.interwiki()
- except wikipedia.IsRedirectPage, arg:
- redirectTargetPage = wikipedia.Page(page.site(), arg.args[0])
- wikipedia.output(u"NOTE: %s is redirect to %s" % (page.aslink(True), redirectTargetPage.aslink(True)))
- if page == self.originPage:
- if globalvar.initialredirect:
- self.originPage = redirectTargetPage
- self.pending.append(redirectTargetPage)
- counter.plus(redirectTargetPage.site)
- else:
- # This is a redirect page to the origin. We don't need to
- # follow the redirection.
- # In this case we can also stop all hints!
- for page2 in self.todo:
- counter.minus(page2.site())
- self.todo = []
- elif not globalvar.followredirect:
- wikipedia.output(u"NOTE: not following redirects.")
- else:
- if not (self.isIgnored(redirectTargetPage) or self.namespaceMismatch(page, redirectTargetPage, counter) or self.wiktionaryMismatch(redirectTargetPage) or (page.site().family != redirectTargetPage.site().family)):
- if self.addIfNew(redirectTargetPage, counter, page):
+ f = codecs.open(
+ wikipedia.config.datafilepath('autonomous_problems.dat'),
+ 'a', 'utf-8')
+ f.write("* %s {Found more than one link for %s}" % (self.originPage.aslink(True), page.site()))
+ if config.interwiki_graph and config.interwiki_graph_url:
+ filename = interwiki_graph.getFilename(self.originPage, extension = config.interwiki_graph_formats[0])
+ f.write(" [%s%s graph]" % (config.interwiki_graph_url, filename))
+ f.write("\n")
+ f.close()
+ # FIXME: What errors are we catching here?
+ # except: should be avoided!!
+ except:
+ #raise
wikipedia.output(u'File autonomous_problems.dat open or corrupted! Try again with -restore.')
+ sys.exit()
+ iw = ()
+ elif page.isEmpty() and not page.isCategory():
+ wikipedia.output(u"NOTE: %s is empty; ignoring it and its interwiki links" % page.aslink(True))
+ # Ignore the interwiki links
+ self.done.remove(page)
+ iw = ()
+
+ for linkedPage in iw:
+ if globalvar.hintsareright:
+ if linkedPage.site in self.hintedsites:
+ wikipedia.output(u"NOTE: %s: %s extra interwiki on hinted site ignored %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True)))
+ break
+ if not self.skipPage(page, linkedPage, counter):
+ if globalvar.followinterwiki or page == self.originPage:
+ if self.addIfNew(linkedPage, counter, page):
+ # It is new. Also verify whether it is the second on the
+ # same site
+ lpsite=linkedPage.site()
+ for prevPage in self.foundIn:
+ if prevPage != linkedPage and prevPage.site() == lpsite:
+ # Still, this could be "no problem" as either may be a
+ # redirect to the other. No way to find out quickly!
+ wikipedia.output(u"NOTE: %s: %s gives duplicate interwiki on same site %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True)))
+ break
+ else:
if config.interwiki_shownew:
- wikipedia.output(u"%s: %s gives new redirect %s" % (self.originPage.aslink(), page.aslink(True), redirectTargetPage.aslink(True)))
- except wikipedia.NoPage:
- wikipedia.output(u"NOTE: %s does not exist" % page.aslink(True))
- if page == self.originPage:
- # The page we are working on is the page that does not exist.
- # No use in doing any work on it in that case.
- for page2 in self.todo:
- counter.minus(page2.site())
- self.todo = []
- self.done = [] # In some rare cases it might be we already did check some 'automatic' links
- pass
- except wikipedia.NoSuchSite:
- wikipedia.output(u"NOTE: site %s does not exist" % page.site())
- #except wikipedia.SectionError:
- # wikipedia.output(u"NOTE: section %s does not exist" % page.aslink())
- else:
- (skip, alternativePage) = self.disambigMismatch(page, counter)
- if skip:
- wikipedia.output(u"NOTE: ignoring %s and its interwiki links" % page.aslink(True))
- if page in self.done: #XXX: Ugly bugfix - the following line has reportedly thrown "ValueError: list.remove(x): x not in list"
- self.done.remove(page)
- iw = ()
- if alternativePage:
- # add the page that was entered by the user
- self.addIfNew(alternativePage, counter, None)
+ wikipedia.output(u"%s: %s gives new interwiki %s"% (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True)))
- if self.originPage == page:
- self.untranslated = (len(iw) == 0)
- if globalvar.untranslatedonly:
- # Ignore the interwiki links.
- iw = ()
- elif globalvar.autonomous and page.site() in [p.site() for p in self.done if p != page and p.exists() and not p.isRedirectPage()]:
- otherpage = [p for p in self.done if p.site() == page.site() and p != page and p.exists() and not p.isRedirectPage()][0]
- wikipedia.output(u"Stopping work on %s because duplicate pages %s and %s are found"%(self.originPage.aslink(),otherpage.aslink(True),page.aslink(True)))
- self.makeForcedStop(counter)
- try:
- f = codecs.open(
- wikipedia.config.datafilepath('autonomous_problems.dat'),
- 'a', 'utf-8')
- f.write("* %s {Found more than one link for %s}" % (self.originPage.aslink(True), page.site()))
- if config.interwiki_graph and config.interwiki_graph_url:
- filename = interwiki_graph.getFilename(self.originPage, extension = config.interwiki_graph_formats[0])
- f.write(" [%s%s graph]" % (config.interwiki_graph_url, filename))
- f.write("\n")
- f.close()
- except:
- #raise
- wikipedia.output(u'File autonomous_problem.dat open or corrupted! Try again with -restore.')
- sys.exit()
- iw = ()
- elif page.isEmpty() and not page.isCategory():
- wikipedia.output(u"NOTE: %s is empty; ignoring it and its interwiki links" % page.aslink(True))
- # Ignore the interwiki links
- if page in self.done: #XXX: Ugly bugfix - the following line has reportedly thrown "ValueError: list.remove(x): x not in list"
- self.done.remove(page)
- iw = ()
- for linkedPage in iw:
- if globalvar.hintsareright:
- if linkedPage.site in self.hintedsites:
- wikipedia.output(u"NOTE: %s: %s extra interwiki on hinted site ignored %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True)))
- break
- if not (self.isIgnored(linkedPage) or self.namespaceMismatch(page, linkedPage, counter) or self.wiktionaryMismatch(linkedPage)):
- if globalvar.followinterwiki or page == self.originPage:
- if self.addIfNew(linkedPage, counter, page):
- # It is new. Also verify whether it is the second on the
- # same site
- lpsite=linkedPage.site()
- for prevPage in self.foundIn.keys():
- if prevPage != linkedPage and prevPage.site() == lpsite:
- # Still, this could be "no problem" as either may be a
- # redirect to the other. No way to find out quickly!
- wikipedia.output(u"NOTE: %s: %s gives duplicate interwiki on same site %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True)))
- break
- else:
- if config.interwiki_shownew:
- wikipedia.output(u"%s: %s gives new interwiki %s"% (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True)))
-
# These pages are no longer 'in progress'
- self.pending = []
+ self.pending = PageTree()
# Check whether we need hints and the user offered to give them
if self.untranslated and not self.hintsAsked:
self.reportInterwikilessPage(page)
@@ -972,92 +1055,94 @@
# Each value will be a list of pages.
new = {}
for page in self.done:
- site = page.site()
- if site == self.originPage.site() and page.exists() and not page.isRedirectPage():
- if page != self.originPage:
- self.problem("Found link to %s" % page.aslink(True) )
- self.whereReport(page)
- errorCount += 1
- elif page.exists() and not page.isRedirectPage():
- if site in new:
- new[site].append(page)
+ if page.exists() and not page.isRedirectPage():
+ site = page.site()
+ if site == self.originPage.site():
+ if page != self.originPage:
+ self.problem("Found link to %s" % page.aslink(True) )
+ self.whereReport(page)
+ errorCount += 1
else:
- new[site] = [page]
+ if site in new:
+ new[site].append(page)
+ else:
+ new[site] = [page]
# See if new{} contains any problematic values
result = {}
for site, pages in new.iteritems():
if len(pages) > 1:
errorCount += 1
self.problem("Found more than one link for %s" % site)
- # If there are any errors, we need to go through all
- # items manually.
- if errorCount > 0 or globalvar.select:
- if config.interwiki_graph:
- graphDrawer = interwiki_graph.GraphDrawer(self)
- graphDrawer.createGraph()
+ if not errorCount and not globalvar.select:
+ # no errors, so all lists have only one item
+ for site, pages in new.iteritems():
+ result[site] = pages[0]
+ return result
- # We don't need to continue with the rest if we're in autonomous
- # mode.
- if globalvar.autonomous:
- return None
+ # There are errors, or the user asked to select manually.
+ if config.interwiki_graph:
+ graphDrawer = interwiki_graph.GraphDrawer(self)
+ graphDrawer.createGraph()
- # First loop over the ones that have more solutions
- for site, pages in new.iteritems():
- if len(pages) > 1:
- wikipedia.output(u"=" * 30)
- wikipedia.output(u"Links to %s" % site)
- i = 0
- for page2 in pages:
- i += 1
- wikipedia.output(u" (%d) Found link to %s in:" % (i, page2.aslink(True)))
- self.whereReport(page2, indent = 8)
- while True:
- answer = wikipedia.input(u"Which variant should be used [number, (n)one, (g)ive up] :")
- if answer:
- if answer == 'g':
- return None
- elif answer == 'n':
- # None acceptable
- break
- elif answer.isdigit():
- answer = int(answer)
- try:
- result[site] = pages[answer - 1]
- except IndexError:
- # user input is out of range
- pass
- else:
- break
- # Loop over the ones that have one solution, so are in principle
- # not a problem.
- acceptall = False
- for site, pages in new.iteritems():
- if len(pages) == 1:
- if not acceptall:
- wikipedia.output(u"=" * 30)
- page2 = pages[0]
- wikipedia.output(u"Found link to %s in:" % page2.aslink(True))
- self.whereReport(page2, indent = 4)
- while True:
- if acceptall:
- answer = 'a'
- else:
- answer = wikipedia.inputChoice(u'What should be done?', ['accept', 'reject', 'give up', 'accept all'], ['a', 'r', 'g', 'l'], 'a')
- if answer == 'l': # accept all
- acceptall = True
- answer = 'a'
- if answer == 'a': # accept this one
- result[site] = pages[0]
- break
- elif answer == 'g': # give up
+ # We don't need to continue with the rest if we're in autonomous
+ # mode.
+ if globalvar.autonomous:
+ return None
+
+ # First loop over the ones that have more solutions
+ for site, pages in new.iteritems():
+ if len(pages) > 1:
+ wikipedia.output(u"=" * 30)
+ wikipedia.output(u"Links to %s" % site)
+ i = 0
+ for page2 in pages:
+ i += 1
+ wikipedia.output(u" (%d) Found link to %s in:" % (i, page2.aslink(True)))
+ self.whereReport(page2, indent = 8)
+ while True:
+ answer = wikipedia.input(u"Which variant should be used [number, (n)one, (g)ive up] :")
+ if answer:
+ if answer == 'g':
return None
- elif answer == 'r': # reject
+ elif answer == 'n':
# None acceptable
break
- else: # errorCount <= 0, hence there are no lists longer than one.
- for site, pages in new.iteritems():
- result[site] = pages[0]
+ elif answer.isdigit():
+ answer = int(answer)
+ try:
+ result[site] = pages[answer - 1]
+ except IndexError:
+ # user input is out of range
+ pass
+ else:
+ break
+ # Loop over the ones that have one solution, so are in principle
+ # not a problem.
+ acceptall = False
+ for site, pages in new.iteritems():
+ if len(pages) == 1:
+ if not acceptall:
+ wikipedia.output(u"=" * 30)
+ page2 = pages[0]
+ wikipedia.output(u"Found link to %s in:" % page2.aslink(True))
+ self.whereReport(page2, indent = 4)
+ while True:
+ if acceptall:
+ answer = 'a'
+ else:
+ answer = wikipedia.inputChoice(u'What should be done?', ['accept', 'reject', 'give up', 'accept all'], ['a', 'r', 'g', 'l'], 'a')
+ if answer == 'l': # accept all
+ acceptall = True
+ answer = 'a'
+ if answer == 'a': # accept this one
+ result[site] = pages[0]
+ break
+ elif answer == 'g': # give up
+ return None
+ elif answer == 'r': # reject
+ # None acceptable
+ break
return result
def finish(self, bot = None):
@@ -1086,7 +1171,7 @@
wikipedia.output(u"======Post-processing %s======" % self.originPage.aslink(True))
# Assemble list of accepted interwiki links
new = self.assemble()
- if new == None: # User said give up or autonomous with problem
+ if new is None: # User said give up or autonomous with problem
wikipedia.output(u"======Aborted processing %s======" % self.originPage.aslink(True))
return
@@ -1104,7 +1189,15 @@
lclSite = self.originPage.site()
lclSiteDone = False
frgnSiteDone = False
- for siteCode in lclSite.family.languages_by_size + [s for s in lclSite.family.langs.keys() if (not s in lclSite.family.languages_by_size and not s in lclSite.family.obsolete)]:
+
+ # XXX Do we really need to make a union here?
+ # We should have sorted(languages_by_size) == sorted(langs) ?!
+ langBySize = set(lclSite.family.languages_by_size)
+ allLangs = set(lclSite.family.langs)
+
+ langToCheck = (langBySize | allLangs).difference(lclSite.family.obsolete)
+
+ for siteCode in langToCheck:
site = wikipedia.getSite(code = siteCode)
if (not lclSiteDone and site == lclSite) or (not frgnSiteDone and site != lclSite and site in new):
if site == lclSite:
@@ -1128,7 +1221,7 @@
wikipedia.output(u"BUG>>> %s no longer exists?" % new[site].aslink(True))
continue
mods, adding, removing, modifying = compareLanguages(old, new, insite = lclSite)
- if (len(removing) > 0 and not globalvar.autonomous) or (len(modifying) > 0 and self.problemfound) or len(old.keys()) == 0 or (globalvar.needlimit and len(adding) + len(modifying) >= globalvar.needlimit +1):
+ if (len(removing) > 0 and not globalvar.autonomous) or (len(modifying) > 0 and self.problemfound) or len(old) == 0 or (globalvar.needlimit and len(adding) + len(modifying) >= globalvar.needlimit +1):
try:
if self.replaceLinks(new[site], new, bot):
updatedSites.append(site)
@@ -1186,6 +1279,8 @@
# clone original newPages dictionary, so that we can modify it to the local page's needs
new = dict(newPages)
+ interwikis = page.interwiki()
+
# remove interwiki links to ignore
for iw in re.finditer('<!-- *\[\[(.*?:.*?)\]\] *-->', pagetext):
try:
@@ -1195,7 +1290,8 @@
try:
if (new[ignorepage.site()] == ignorepage) and (ignorepage.site() != page.site()):
- if (ignorepage not in page.interwiki()):
+
+ if (ignorepage not in interwikis):
wikipedia.output(u"Ignoring link to %(to)s for %(from)s" % {'to': ignorepage.aslink(), 'from': page.aslink()})
new.pop(ignorepage.site())
else:
@@ -1207,7 +1303,7 @@
pltmp = new[page.site()]
if pltmp != page:
s = "None"
- if pltmp != None: s = pltmp.aslink(True)
+ if pltmp is not None: s = pltmp.aslink(True)
wikipedia.output(u"BUG>>> %s is not in the list of new links! Found %s." % (page.aslink(True), s))
raise SaveError
@@ -1217,7 +1313,7 @@
# Put interwiki links into a map
old={}
try:
- for page2 in page.interwiki():
+ for page2 in interwikis:
old[page2.site()] = page2
except wikipedia.NoPage:
wikipedia.output(u"BUG>>> %s no longer exists?" % page.aslink(True))
@@ -1228,104 +1324,107 @@
# When running in autonomous mode without -force switch, make sure we don't remove any items, but allow addition of the new ones
if globalvar.autonomous and not globalvar.force and len(removing) > 0:
- for rmPage in removing:
- if rmPage.site() != page.site(): # Sometimes sites have an erroneous link to itself as an interwiki
+ for rmsite in removing:
+ if rmsite != page.site(): # Sometimes sites have an erroneous link to itself as an interwiki
+ rmPage = old[rmsite]
##########
# temporary hard-coded special case to get rid of thousands of broken links to the Lombard Wikipedia,
# where useless bot-created articles were mass-deleted. See for example:
# http://meta.wikimedia.org/wiki/Proposals_for_closing_projects/Closure_of_Lo…
- if rmPage.site() == wikipedia.getSite('lmo', 'wikipedia'):
+ if rmsite == wikipedia.getSite('lmo', 'wikipedia'):
wikipedia.output('Found bad link to %s. As many lmo pages were deleted, it is assumed that it can be safely removed.' % rmPage.aslink())
else:
##########
- new[rmPage.site()] = old[rmPage.site()]
+ new[rmsite] = old[rmsite]
wikipedia.output(u"WARNING: %s is either deleted or has a mismatching disambiguation state." % rmPage.aslink(True))
# Re-Check what needs to get done
mods, adding, removing, modifying = compareLanguages(old, new, insite = page.site())
if not mods:
wikipedia.output(u'No changes needed' )
+ return False
+
+ wikipedia.output(u"Changes to be made: %s" % mods)
+ oldtext = page.get()
+ newtext = wikipedia.replaceLanguageLinks(oldtext, new, site = page.site())
+ if newtext == oldtext:
+ return False
+ if globalvar.debug:
+ wikipedia.showDiff(oldtext, newtext)
+
+ # wikipedia.output(u"NOTE: Replace %s" % page.aslink())
+ # Determine whether we need permission to submit
+ ask = False
+ if removing and removing != [page.site()]: # Allow for special case of a self-pointing interwiki link
+ self.problem('Found incorrect link to %s in %s'% (",".join([x.lang for x in removing]), page.aslink(True)), createneed = False)
+ ask = True
+ if globalvar.force:
+ ask = False
+ if globalvar.confirm:
+ ask = True
+ # If we need to ask, do so
+ if ask:
+ if globalvar.autonomous:
+ # If we cannot ask, deny permission
+ answer = 'n'
+ else:
+ answer = wikipedia.inputChoice(u'Submit?', ['Yes', 'No', 'Give up'], ['y', 'n', 'g'])
else:
- if mods:
- wikipedia.output(u"Changes to be made: %s" % mods)
- oldtext = page.get()
- newtext = wikipedia.replaceLanguageLinks(oldtext, new, site = page.site())
- if globalvar.debug:
- wikipedia.showDiff(oldtext, newtext)
- if newtext != oldtext:
- # wikipedia.output(u"NOTE: Replace %s" % page.aslink())
- # Determine whether we need permission to submit
- ask = False
- if removing and removing != [page]: # Allow for special case of a self-pointing interwiki link
- self.problem('Found incorrect link to %s in %s'% (",".join([x.site().lang for x in removing]), page.aslink(True)), createneed = False)
- ask = True
- if globalvar.force:
- ask = False
- if globalvar.confirm:
- ask = True
- # If we need to ask, do so
- if ask:
- if globalvar.autonomous:
- # If we cannot ask, deny permission
- answer = 'n'
- else:
- answer = wikipedia.inputChoice(u'Submit?', ['Yes', 'No', 'Give up'], ['y', 'n', 'g'])
+ # If we do not need to ask, allow
+ answer = 'y'
+ # If we got permission to submit, do so
+ if answer == 'y':
+ # Check whether we will have to wait for wikipedia. If so, make
+ # another get-query first.
+ if bot:
+ while wikipedia.get_throttle.waittime() + 2.0 < wikipedia.put_throttle.waittime():
+ wikipedia.output(u"NOTE: Performing a recursive query first to save time....")
+ qdone = bot.oneQuery()
+ if not qdone:
+ # Nothing more to do
+ break
+ wikipedia.output(u"NOTE: Updating live wiki...")
+ timeout=60
+ while 1:
+ try:
+ status, reason, data = page.put(newtext, comment = wikipedia.translate(page.site().lang, msg)[0] + mods)
+ except wikipedia.LockedPage:
+ wikipedia.output(u'Page %s is locked. Skipping.' % (page.title(),))
+ raise SaveError
+ except wikipedia.EditConflict:
+ wikipedia.output(u'ERROR putting page: An edit conflict occurred. Giving up.')
+ raise SaveError
+ except (wikipedia.SpamfilterError), error:
+ wikipedia.output(u'ERROR putting page: %s blacklisted by spamfilter. Giving up.' % (error.url,))
+ raise SaveError
+ except (wikipedia.PageNotSaved), error:
+ wikipedia.output(u'ERROR putting page: %s' % (error.args,))
+ raise SaveError
+ except (socket.error, IOError), error:
+ if timeout>3600:
+ raise
+ wikipedia.output(u'ERROR putting page: %s' % (error.args,))
+ wikipedia.output(u'Sleeping %i seconds before trying again.' % (timeout,))
+ timeout *= 2
+ time.sleep(timeout)
+ except wikipedia.ServerError:
+ if timeout>3600:
+ raise
+ wikipedia.output(u'ERROR putting page: ServerError.')
+ wikipedia.output(u'Sleeping %i seconds before trying again.' % (timeout,))
+ timeout *= 2
+ time.sleep(timeout)
else:
- # If we do not need to ask, allow
- answer = 'y'
- # If we got permission to submit, do so
- if answer == 'y':
- # Check whether we will have to wait for wikipedia. If so, make
- # another get-query first.
- if bot:
- while wikipedia.get_throttle.waittime() + 2.0 < wikipedia.put_throttle.waittime():
- wikipedia.output(u"NOTE: Performing a recursive query first to save time....")
- qdone = bot.oneQuery()
- if not qdone:
- # Nothing more to do
- break
- wikipedia.output(u"NOTE: Updating live wiki...")
- timeout=60
- while 1:
- try:
- status, reason, data = page.put(newtext, comment = wikipedia.translate(page.site().lang, msg)[0] + mods)
- except wikipedia.LockedPage:
- wikipedia.output(u'Page %s is locked. Skipping.' % (page.title(),))
- raise SaveError
- except wikipedia.EditConflict:
- wikipedia.output(u'ERROR putting page: An edit conflict occurred. Giving up.')
- raise SaveError
- except (wikipedia.SpamfilterError), error:
- wikipedia.output(u'ERROR putting page: %s blacklisted by spamfilter. Giving up.' % (error.url,))
- raise SaveError
- except (wikipedia.PageNotSaved), error:
- wikipedia.output(u'ERROR putting page: %s' % (error.args,))
- raise SaveError
- except (socket.error, IOError), error:
- if timeout>3600:
- raise
- wikipedia.output(u'ERROR putting page: %s' % (error.args,))
- wikipedia.output(u'Sleeping %i seconds before trying again.' % (timeout,))
- timeout *= 2
- time.sleep(timeout)
- except wikipedia.ServerError:
- if timeout>3600:
- raise
- wikipedia.output(u'ERROR putting page: ServerError.')
- wikipedia.output(u'Sleeping %i seconds before trying again.' % (timeout,))
- timeout *= 2
- time.sleep(timeout)
- else:
- break
- if str(status) == '302':
- return True
- else:
- wikipedia.output(u'%s %s' % (status, reason))
- elif answer == 'g':
- raise GiveUpOnPage
- else:
- raise LinkMustBeRemoved('Found incorrect link to %s in %s'% (",".join([x.site().lang for x in removing]), page.aslink(True)))
- return False
+ break
+ if str(status) == '302':
+ return True
+ else:
+ wikipedia.output(u'%s %s' % (status, reason))
+ return False
+ elif answer == 'g':
+ raise GiveUpOnPage
+ else:
+ raise LinkMustBeRemoved('Found incorrect link to %s in %s'% (",".join([x.lang for x in removing]), page.aslink(True)))
def reportBacklinks(self, new, updatedSites):
"""
@@ -1336,11 +1435,12 @@
"""
# use sets because searching an element is faster than in lists
- expectedPages = set(new.values())
- expectedSites = set([page.site() for page in expectedPages])
+ expectedPages = set(new.itervalues())
+ expectedSites = set(new)
try:
- for site, page in new.iteritems():
- if site not in updatedSites and not page.section():
+ for site in expectedSites - set(updatedSites):
+ page = new[site]
+ if not page.section():
try:
linkedPages = set(page.interwiki())
except wikipedia.NoPage:
@@ -1351,8 +1451,8 @@
linkedPagesDict = {}
for linkedPage in linkedPages:
linkedPagesDict[linkedPage.site()] = linkedPage
- for expectedPage in expectedPages:
- if expectedPage != page and expectedPage not in linkedPages:
+ for expectedPage in expectedPages - linkedPages:
+ if expectedPage != page:
try:
linkedPage = linkedPagesDict[expectedPage.site()]
wikipedia.output(u"WARNING: %s: %s does not link to %s but to %s" % (page.site().family.name, page.aslink(True), expectedPage.aslink(True), linkedPage.aslink(True)))
@@ -1388,9 +1488,9 @@
"""Add a single subject to the list"""
subj = Subject(page, hints = hints)
self.subjects.append(subj)
- for site in subj.openSites(allowdoubles = True):
+ for site, count in subj.openSites():
# Keep correct counters
- self.plus(site)
+ self.plus(site, count)
def setPageGenerator(self, pageGenerator, number = None, until = None):
"""Add a generator of subjects. Once the list of subjects gets
@@ -1427,7 +1527,7 @@
continue
if globalvar.skipauto:
dictName, year = page.autoFormat()
- if dictName != None:
+ if dictName is not None:
wikipedia.output(u'Skipping: %s is an auto entry %s(%s)' % (page.title(),dictName,year))
continue
if globalvar.bracketonly:
@@ -1461,12 +1561,12 @@
maxlang = None
if not self.firstSubject():
return None
- oc = self.firstSubject().openSites()
+ oc = dict(self.firstSubject().openSites())
if not oc:
# The first subject is done. This might be a recursive call made because we
# have to wait before submitting another modification to go live. Select
# any language from counts.
- oc = self.counts.keys()
+ oc = self.counts
if wikipedia.getSite() in oc:
return wikipedia.getSite()
for lang in oc:
@@ -1516,7 +1616,7 @@
"""
# First find the best language to work on
site = self.selectQuerySite()
- if site == None:
+ if site is None:
wikipedia.output(u"NOTE: Nothing left to do")
return False
# Now assemble a reasonable list of pages to get
@@ -1559,16 +1659,16 @@
"""Check whether there is still more work to do"""
return len(self) == 0 and self.pageGenerator is None
- def plus(self, site):
+ def plus(self, site, count=1):
"""This is a routine that the Subject class expects in a counter"""
try:
- self.counts[site] += 1
+ self.counts[site] += count
except KeyError:
- self.counts[site] = 1
+ self.counts[site] = count
- def minus(self, site):
+ def minus(self, site, count=1):
"""This is a routine that the Subject class expects in a counter"""
- self.counts[site] -= 1
+ self.counts[site] -= count
def run(self):
"""Start the process until finished"""
@@ -1579,37 +1679,32 @@
return len(self.subjects)
def compareLanguages(old, new, insite):
- removing = []
- adding = []
- modifying = []
- for site in old.keys():
- if site not in new:
- removing.append(old[site])
- elif old[site] != new[site]:
- modifying.append(new[site])
- for site2 in new.keys():
- if site2 not in old:
- adding.append(new[site2])
- mods = ""
+ oldiw = set(old)
+ newiw = set(new)
+
# sort by language code
- adding.sort()
- modifying.sort()
- removing.sort()
+ adding = sorted(newiw - oldiw)
+ removing = sorted(oldiw - newiw)
+ modifying = sorted(site for site in oldiw & newiw if old[site] != new[site])
+ mods = ""
+
if len(adding) + len(removing) + len(modifying) <= 3:
# Use an extended format for the string linking to all added pages.
- fmt = lambda page: page.aslink(forceInterwiki=True)
+ fmt = lambda d, site: d[site].aslink(forceInterwiki=True)
else:
# Use short format, just the language code
- fmt = lambda page: page.site().lang
+ fmt = lambda d, site: site.lang
+ _, add, rem, mod = wikipedia.translate(insite.lang, msg)
+
if adding:
- mods += " %s: %s" % (wikipedia.translate(insite.lang, msg)[1], ", ".join([fmt(x) for x in adding]))
+ mods += " %s: %s" % (add, ", ".join([fmt(new, x) for x in adding]))
if removing:
- mods += " %s: %s" % (wikipedia.translate(insite.lang, msg)[2], ", ".join([fmt(x) for x in removing]))
+ mods += " %s: %s" % (rem, ", ".join([fmt(old, x) for x in removing]))
if modifying:
- mods += " %s: %s" % (wikipedia.translate(insite.lang, msg)[3], ", ".join([fmt(x) for x in modifying]))
+ mods += " %s: %s" % (mod, ", ".join([fmt(new, x) for x in modifying]))
return mods, adding, removing, modifying
def readWarnfile(filename, bot):
@@ -1617,11 +1712,10 @@
reader = warnfile.WarnfileReader(filename)
# we won't use removeHints
(hints, removeHints) = reader.getHints()
- pages = hints.keys()
- for page in pages:
+ for page, pagelist in hints.iteritems():
# The WarnfileReader gives us a list of pagelinks, but titletranslate.py expects a list of strings, so we convert it back.
# TODO: This is a quite ugly hack, in the future we should maybe make titletranslate expect a list of pagelinks.
- hintStrings = ['%s:%s' % (hintedPage.site().language(), hintedPage.title()) for hintedPage in hints[page]]
+ hintStrings = ['%s:%s' % (hintedPage.site().language(), hintedPage.title()) for hintedPage in pagelist]
bot.add(page, hints = hintStrings)
#===========
@@ -1800,7 +1894,7 @@
except:
wikipedia.output(u'Missing main page name')
- if newPages != None:
+ if newPages is not None:
if len(namespaces) == 0:
ns = 0
elif len(namespaces) == 1:
Property changes on: branches/rewrite/scripts/interwiki.py
___________________________________________________________________
Modified: svn:mergeinfo
-
+ /trunk/pywikipedia/interwiki.py:6668-6733
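
The compareLanguages() rewrite in this revision replaces three explicit loops with set algebra over the interwiki mappings. A self-contained sketch of the idea, with language codes standing in for Site objects and titles for Pages (both hypothetical simplifications):

    old = {'de': 'Berlin', 'fr': 'Berlin', 'nl': 'Berlijn'}
    new = {'de': 'Berlin', 'fr': 'Berlin (ville)', 'es': 'Berlin'}

    oldiw = set(old)                     # set of the keys
    newiw = set(new)

    adding    = sorted(newiw - oldiw)    # links only in the new mapping
    removing  = sorted(oldiw - newiw)    # links that disappeared
    modifying = sorted(s for s in oldiw & newiw if old[s] != new[s])

    assert adding == ['es']
    assert removing == ['nl']
    assert modifying == ['fr']
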
Revision: 6733
Author: nicdumz
Date: 2009-04-27 12:17:21 +0000 (Mon, 27 Apr 2009)
Log Message:
-----------
On a SaxError, retry once before giving up.
Modified Paths:
--------------
trunk/pywikipedia/pagegenerators.py
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2009-04-27 09:09:36 UTC (rev 6732)
+++ trunk/pywikipedia/pagegenerators.py 2009-04-27 12:17:21 UTC (rev 6733)
@@ -798,7 +798,7 @@
traceback.print_exc()
wikipedia.output(unicode(e))
- def preload(self, page_list):
+ def preload(self, page_list, retry=False):
try:
while len(page_list) > 0:
# It might be that the pages are on different sites,
@@ -816,6 +816,9 @@
# Can happen if the pages list is empty. Don't care.
pass
except wikipedia.SaxError:
+ if not retry:
+ # Retry once.
+ self.preload(page_list, retry=True)
# Ignore this error, and get the pages the traditional way later.
pass
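
The retry guard above is a common pattern: thread a boolean through the recursive call so a transient failure is retried exactly once. A generic, self-contained sketch (SaxLikeError and flaky_fetch are hypothetical stand-ins for wikipedia.SaxError and the real preloading work):

    import random

    class SaxLikeError(Exception):
        # Hypothetical stand-in for wikipedia.SaxError.
        pass

    def flaky_fetch(pages):
        # Hypothetical worker that fails transiently about half the time.
        if random.random() < 0.5:
            raise SaxLikeError()
        return pages

    def preload(pages, retry=False):
        try:
            return flaky_fetch(pages)
        except SaxLikeError:
            if not retry:
                # Retry exactly once; the flag prevents unbounded recursion.
                return preload(pages, retry=True)
            # Second failure: give up; callers fall back to fetching
            # the pages one at a time later.
            return None
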