Revision: 7697
Author: alexsh
Date: 2009-11-26 09:33:45 +0000 (Thu, 26 Nov 2009)
Log Message:
-----------
separate getAll path for huge retrievals (only used when the page count passed to getall exceeds config.special_page_limit)
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-11-26 09:29:40 UTC (rev 7696)
+++ trunk/pywikipedia/wikipedia.py 2009-11-26 09:33:45 UTC (rev 7697)
@@ -3848,7 +3848,25 @@
# TODO: why isn't this a Site method?
pages = list(pages) # if pages is an iterator, we need to make it a list
output(u'Getting %d pages from %s...' % (len(pages), site))
- _GetAll(site, pages, throttle, force).run()
+ limit = config.special_page_limit / 4 # default is 500/4, but It might have good point for server.
+
+ if len(pages) > limit:
+ # separate export pages for bulk-retrieve
+
+ for pagg in range(0, len(pages), limit):
+ if pagg == range(0, len(pages), limit)[-1]: #latest retrieve
+ k = pages[pagg:]
+ output(u'Getting pages %d - %d of %d...' % (pagg + 1, len(pages), len(pages)))
+ _GetAll(site, k, throttle, force).run()
+ pages[pagg:] = k
+ else:
+ k = pages[pagg:pagg + limit]
+ output(u'Getting pages %d - %d of %d...' % (pagg + 1, pagg + limit, len(pages)))
+ _GetAll(site, k, throttle, force).run()
+ pages[pagg:pagg + limit] = k
+ get_throttle(requestsize = len(pages) / 10) # one time to retrieve is 7.7 sec.
+ else:
+ _GetAll(site, pages, throttle, force).run()
# Library functions