http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9873
Revision: 9873
Author: binbot
Date: 2012-02-07 20:40:12 +0000 (Tue, 07 Feb 2012)
Log Message:
-----------
Roll back to the previous revision, sorry about that (capitalize() lowercases every letter other than the first)
Modified Paths:
--------------
trunk/pywikipedia/solve_disambiguation.py
Modified: trunk/pywikipedia/solve_disambiguation.py
===================================================================
--- trunk/pywikipedia/solve_disambiguation.py 2012-02-07 13:19:24 UTC (rev 9872)
+++ trunk/pywikipedia/solve_disambiguation.py 2012-02-07 20:40:12 UTC (rev 9873)
@@ -73,7 +73,7 @@
# (C) Daniel Herding, 2004
# (C) Andre Engels, 2003-2004
# (C) WikiWichtel, 2004
-# (C) Pywikipedia team, 2003-2012
+# (C) Pywikipedia team, 2003-2009
#
__version__='$Id$'
#
@@ -351,6 +351,9 @@
},
}
+def firstcap(string):
+    """Return string with only its first character uppercased.
+
+    Unlike str.capitalize(), the remaining characters are left untouched,
+    so 'jim Smith' becomes 'Jim Smith' rather than 'Jim smith'. An empty
+    string is returned unchanged.
+    """
+    # Slicing instead of indexing keeps an empty string from raising
+    # IndexError.
+    return string[:1].upper() + string[1:]
+
def correctcap(link, text):
# If text links to a page with title link uncapitalized, uncapitalize link,
# otherwise capitalize it
@@ -361,6 +364,21 @@
else:
return linkupper
+def firstlinks(page):
+    """Return the first link target of every line beginning with '*'.
+
+    When a disambiguation page is full of unnecessary links, this may be
+    useful to sort out the relevant links. E.g. from the line
+    *[[Jim Smith (smith)|Jim Smith]] ([[1832]]-[[1932]]) [[English]] [[smith]]
+    it returns only 'Jim Smith (smith)'.
+
+    page must provide get(), whose result is split into lines.
+    No check for page existence is made here; it has already been done.
+    """
+    # match() anchors at the start of the line, so only lines starting with
+    # '*' qualify; the group captures the text between the first '[[' and
+    # the following '|' or ']]'.
+    reg = re.compile(r'\*.*?\[\[(.*?)(\||\]\])')
+    matches = (reg.match(line) for line in page.get().splitlines())
+    return [found.group(1) for found in matches if found]
+
class ReferringPageGeneratorWithIgnore:
def __init__(self, disambPage, primary=False, minimum = 0):
self.disambPage = disambPage
@@ -520,6 +538,20 @@
# note that the definition of 'letter' varies from language to language.
self.linkR = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail + ')')
+ def firstize(self, page, links):
+     """Keep only the links that are the first link of a '*' line on page.
+
+     links is filtered in place and is also returned. Each element must
+     provide title(); it is kept only when that title equals one of the
+     titles found by firstlinks(page) after uppercasing the first letter
+     of the latter.
+     """
+     # Uppercase only the first letter: str.capitalize() would also
+     # lowercase the rest ('Jim Smith' -> 'Jim smith'), so multi-word
+     # titles could never match l.title().
+     titles = set(t[:1].upper() + t[1:] for t in firstlinks(page))
+     # Slice assignment keeps the caller's list object, as the original
+     # in-place remove() did, without the quadratic cost.
+     links[:] = [l for l in links if l.title() in titles]
+     return links
+
def treat(self, refPage, disambPage):
"""
Parameters:
@@ -789,8 +821,8 @@
# check if we can create a link with trailing characters
# instead of a pipelink
elif len(new_page_title) <= len(link_text) \
- and link_text[:len(new_page_title)].capitalize() \
- == new_page_title.capitalize() \
+ and firstcap(link_text[:len(new_page_title)]) \
+ == firstcap(new_page_title) \
and re.sub(self.trailR, '',
link_text[len(new_page_title):]) == '' \
and not section:
@@ -871,6 +903,7 @@
primary_topic_format[self.mylang]
% disambPage.title())
links = disambPage2.linkedPages()
+ links = self.firstize(disambPage2, links)
links = [correctcap(l, disambPage2.get())
for l in links]
except pywikibot.NoPage: