Revision: 7070
Author: alexsh
Date: 2009-07-15 21:02:49 +0000 (Wed, 15 Jul 2009)
Log Message:
-----------
site().linksearch: make detection of JSON data pages more efficient
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-07-15 20:43:32 UTC (rev 7069)
+++ trunk/pywikipedia/wikipedia.py 2009-07-15 21:02:49 UTC (rev 7070)
@@ -5829,13 +5829,8 @@
if data['query']['exturlusage'] == []:
break
- if data.has_key(u'query-continue'):
- params['euoffset'] = data[u'query-continue'][u'exturlusage'][u'euoffset']
- else:
- keepGo = False
- data = data['query']['exturlusage']
- for pages in data:
+ for pages in data['query']['exturlusage']:
if not siteurl in pages['title']:
# the links themselves have similar form
if pages['title'] in cache:
@@ -5843,6 +5838,10 @@
else:
cache.append(pages['title'])
yield Page(self, pages['title'])
+ if data.has_key(u'query-continue'):
+ params['euoffset'] = data[u'query-continue'][u'exturlusage'][u'euoffset']
+ else:
+ break
else:
output(u'Querying [[Special:Linksearch]]...')
for url in urlsToRetrieve:
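
For reference, a minimal sketch of the pagination pattern this revision settles on (fetch_json is a hypothetical stand-in for query.GetData, not pywikipedia code): yield the current batch first, then follow 'query-continue', and break when the API returns no further euoffset.

    def iter_exturlusage(fetch_json, url, limit=500):
        params = {
            'action': 'query',
            'list': 'exturlusage',
            'eulimit': limit,
            'euquery': url,
        }
        while True:
            data = fetch_json(params)
            if not data['query']['exturlusage']:
                break
            for page in data['query']['exturlusage']:
                yield page['title']
            # Advance the offset only after the batch has been yielded; the
            # pre-fix code read 'query-continue' before processing results.
            if 'query-continue' in data:
                params['euoffset'] = data['query-continue']['exturlusage']['euoffset']
            else:
                break
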
Revision: 7068
Author: alexsh
Date: 2009-07-15 20:23:26 +0000 (Wed, 15 Jul 2009)
Log Message:
-----------
site().linksearch: add API method (took 2 hours to debug... Orz)
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-07-15 19:25:32 UTC (rev 7067)
+++ trunk/pywikipedia/wikipedia.py 2009-07-15 20:23:26 UTC (rev 7068)
@@ -5808,44 +5808,74 @@
def linksearch(self, siteurl, limit=500):
"""Yield Pages from results of Special:Linksearch for 'siteurl'."""
- output(u'Querying [[Special:Linksearch]]...')
cache = []
R = re.compile('title ?=\"([^<>]*?)\">[^<>]*</a></li>')
urlsToRetrieve = [siteurl]
if not siteurl.startswith('*.'):
urlsToRetrieve.append('*.' + siteurl)
- for url in urlsToRetrieve:
- offset = 0
- while True:
- path = self.linksearch_address(url, limit=limit, offset=offset)
- get_throttle()
- html = self.getUrl(path)
- #restricting the HTML source :
- #when in the source, this div marks the beginning of the input
- loc = html.find('<div class="mw-spcontent">')
- if loc > -1:
- html = html[loc:]
- #when in the source, marks the end of the linklist
- loc = html.find('<div class="printfooter">')
- if loc > -1:
- html = html[:loc]
+ if config.use_api:
+ output(u'Querying API...')
+ for url in urlsToRetrieve:
+ params = {
+ 'action': 'query',
+ 'list' : 'exturlusage',
+ 'eulimit': limit,
+ 'euquery': url,
+ }
+ keepGo = True
+ while keepGo:
+ data = query.GetData(params, useAPI = True)
+ if data['query']['exturlusage'] == []:
+ break
+
+ if data.has_key(u'query-continue'):
+ params['euoffset'] = data[u'query-continue'][u'exturlusage'][u'euoffset']
+ else:
+ keepGo = False
- #our regex fetches internal page links and the link they contain
- links = R.findall(html)
- if not links:
- #no more page to be fetched for that link
- break
- for title in links:
- if not siteurl in title:
- # the links themselves have similar form
- if title in cache:
- continue
- else:
- cache.append(title)
- yield Page(self, title)
- offset += limit
+ data = data['query']['exturlusage']
+ for pages in data:
+ if not siteurl in pages['title']:
+ # the links themselves have similar form
+ if pages['title'] in cache:
+ continue
+ else:
+ cache.append(pages['title'])
+ yield Page(self, pages['title'])
+ else:
+ output(u'Querying [[Special:Linksearch]]...')
+ for url in urlsToRetrieve:
+ offset = 0
+ while True:
+ path = self.linksearch_address(url, limit=limit, offset=offset)
+ get_throttle()
+ html = self.getUrl(path)
+ #restricting the HTML source :
+ #when in the source, this div marks the beginning of the input
+ loc = html.find('<div class="mw-spcontent">')
+ if loc > -1:
+ html = html[loc:]
+ #when in the source, marks the end of the linklist
+ loc = html.find('<div class="printfooter">')
+ if loc > -1:
+ html = html[:loc]
+ #our regex fetches internal page links and the link they contain
+ links = R.findall(html)
+ if not links:
+ #no more page to be fetched for that link
+ break
+ for title in links:
+ if not siteurl in title:
+ # the links themselves have similar form
+ if title in cache:
+ continue
+ else:
+ cache.append(title)
+ yield Page(self, title)
+ offset += limit
+
def __repr__(self):
return self.family.name+":"+self.lang
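
For context, a self-contained sketch of the list=exturlusage query the new API branch issues through query.GetData. The requests dependency, the function name, and the endpoint URL are illustrative assumptions rather than pywikipedia code, and the 'query-continue'/euoffset continuation matches the 2009-era API shown in the patch (modern MediaWiki returns a 'continue' block instead).

    import requests

    def linksearch_api(api_url, siteurl, limit=500):
        """Yield titles of pages linking to siteurl, via list=exturlusage."""
        seen = set()
        params = {
            'action': 'query',
            'list': 'exturlusage',
            'euquery': siteurl,
            'eulimit': limit,
            'format': 'json',
        }
        while True:
            data = requests.get(api_url, params=params).json()
            for entry in data['query']['exturlusage']:
                title = entry['title']
                # Skip titles that merely repeat the searched URL and
                # deduplicate, mirroring the cache list in the patch.
                if siteurl not in title and title not in seen:
                    seen.add(title)
                    yield title
            if 'query-continue' in data:
                params['euoffset'] = data['query-continue']['exturlusage']['euoffset']
            else:
                break

    # Example usage (endpoint URL is an assumption):
    # for title in linksearch_api('https://en.wikipedia.org/w/api.php', 'example.org'):
    #     print(title)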