Revision: 8003
Author: xqt
Date: 2010-03-13 16:14:53 +0000 (Sat, 13 Mar 2010)
Log Message:
-----------
Bugfix for HTTPError 400 when loading a lot of user pages at once
Modified Paths:
--------------
trunk/pywikipedia/userlib.py
trunk/pywikipedia/welcome.py
Modified: trunk/pywikipedia/userlib.py
===================================================================
--- trunk/pywikipedia/userlib.py 2010-03-13 13:55:12 UTC (rev 8002)
+++ trunk/pywikipedia/userlib.py 2010-03-13 16:14:53 UTC (rev 8003)
@@ -553,16 +553,16 @@
users = list(users) # if pages is an iterator, we need to make it a list
if len(users) > 1: wikipedia.output(u'Getting %d users data from %s...' % (len(users), site))
- if len(users) > 500:
- for urg in range(0, len(users), 500):
- if urg == range(0, len(users), 500)[-1]: #latest
+ if len(users) > 250: # max load prevents HTTPError 400
+ for urg in range(0, len(users), 250):
+ if urg == range(0, len(users), 250)[-1]: #latest
k = users[urg:]
_GetAllUI(site, k, throttle, force).run()
users[urg:] = k
else:
- k = users[urg:urg + 500]
+ k = users[urg:urg + 250]
_GetAllUI(site, k, throttle, force).run()
- users[urg:urg + 500] = k
+ users[urg:urg + 250] = k
else:
_GetAllUI(site, users, throttle, force).run()
Modified: trunk/pywikipedia/welcome.py
===================================================================
--- trunk/pywikipedia/welcome.py 2010-03-13 13:55:12 UTC (rev 8002)
+++ trunk/pywikipedia/welcome.py 2010-03-13 16:14:53 UTC (rev 8003)
@@ -673,13 +673,8 @@
# if self._checkQueue:
# for nm in self._checkQueue:
# yield userlib.User(self.site, nm)
- try:
- if config.use_api and self.site.versionnumber() >= 13:
- x = self.site.api_address()
- del x
- else:
- raise NotImplementedError
- except NotImplementedError:
+
+ if not self.site.has_api() or self.site.versionnumber() < 13:
for x in self._parseNewUserLogOld():
yield x
return
@@ -813,7 +808,7 @@
def run(self):
while True:
welcomed_count = 0
- if globalvar.quick and config.use_api:
+ if globalvar.quick and self.site.has_api():
us = [x for x in self.parseNewUserLog()]
showStatus()
try:
Revision: 8001
Author: xqt
Date: 2010-03-13 13:26:43 +0000 (Sat, 13 Mar 2010)
Log Message:
-----------
Is there any reason not to use the API for _GetAll()?
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-03-13 12:27:53 UTC (rev 8000)
+++ trunk/pywikipedia/wikipedia.py 2010-03-13 13:26:43 UTC (rev 8001)
@@ -3664,11 +3664,8 @@
def run(self):
if self.pages:
- doAPI = None
- # API Implemented Check
- # doAPI = self.site.has_api()
- if doAPI:
+ if self.site.has_api():
while True:
try:
data = self.getDataApi()
@@ -3691,7 +3688,6 @@
self._norm = dict([(x['from'],x['to']) for x in data['query']['normalized']])
for vals in data['query']['pages'].values():
self.oneDoneApi(vals)
-
else:
while True:
try:
@@ -3737,6 +3733,7 @@
except PageNotFound:
return
# All of the ones that have not been found apparently do not exist
+
for pl in self.pages:
if not hasattr(pl,'_contents') and not hasattr(pl,'_getexception'):
pl._getexception = NoPage
Revision: 7999
Author: xqt
Date: 2010-03-13 12:21:37 +0000 (Sat, 13 Mar 2010)
Log Message:
-----------
use site.has_api() method
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-03-13 11:03:04 UTC (rev 7998)
+++ trunk/pywikipedia/wikipedia.py 2010-03-13 12:21:37 UTC (rev 7999)
@@ -656,13 +656,7 @@
This method returns the raw wiki text as a unicode string.
"""
- try:
- if config.use_api and self.site().versionnumber() > 11:
- x = self.site().api_address()
- del x
- else:
- raise NotImplementedError
- except NotImplementedError:
+ if not self.site().has_api() or self.site().versionnumber() < 12:
return self._getEditPageOld(get_redirect, throttle, sysop, oldid, change_edit_time)
params = {
@@ -1209,13 +1203,7 @@
* redirectsOnly - if True, only returns redirects to self.
"""
- try:
- if config.use_api and self.site().versionnumber() > 9:
- d = self.site().apipath()
- del d
- else:
- raise NotImplementedError
- except NotImplementedError:
+ if not self.site().has_api():
for s in self.getReferencesOld(follow_redirects, withTemplateInclusion, onlyTemplateInclusion, redirectsOnly):
yield s
return
@@ -1543,7 +1531,7 @@
if not force:
if not self.botMayEdit(username):
raise LockedPage(u'Not allowed to edit %s because of a restricting template' % self.aslink())
- elif config.use_api and self.namespace() in [2,3] and ( '.css' in self.title() or '.js' in self.title()):
+ elif self.site().has_api() and self.namespace() in [2,3] and ( '.css' in self.title() or '.js' in self.title()):
# API enable: if title is .css or .js in ns2,3 , it need permission `editusercssjs`
sysop = self._getActionUser(action = 'editusercssjs', restriction = self.editRestriction, sysop=True)
@@ -1584,7 +1572,7 @@
newPage = not self.exists()
# if posting to an Esperanto wiki, we must e.g. write Bordeauxx instead
# of Bordeaux
- if self.site().lang == 'eo' and not config.use_api:
+ if self.site().lang == 'eo' and not self.site().has_api():
newtext = encodeEsperantoX(newtext)
comment = encodeEsperantoX(comment)
@@ -1613,13 +1601,8 @@
Don't use this directly, use put() instead.
"""
- try:
- if config.use_api and self.site().versionnumber() >= 13:
- apitest = self.site().api_address()
- del apitest
- else:
- raise NotImplementedError #No enable api or version not support
- except NotImplementedError:
+ if not self.site().has_api() or self.site().versionnumber() < 13:
+ # api not enabled or version not supported
return self._putPageOld(text, comment, watchArticle, minorEdit,
newPage, token, newToken, sysop, captcha, botflag, maxTries)
@@ -2501,13 +2484,7 @@
"""Load history informations by API query.
Internal use for self.getVersionHistory(), don't use this function directly.
"""
- try:
- if config.use_api and self.site().versionnumber() >= 8:
- x = self.site().api_address()
- del x
- else:
- raise NotImplementedError
- except NotImplementedError:
+ if not self.site().has_api() or self.site().versionnumber() < 8:
return self._getVersionHistoryOld(reExist, getAll, skipFirst, reverseOrder, revCount)
dataQ = []
thisHistoryDone = False
@@ -2664,13 +2641,7 @@
to move and delete if not directly requested.
* fixredirects has no effect in MW < 1.13"""
- try:
- if config.use_api and self.site().versionnumber() >= 12:
- x = self.site().api_address()
- del x
- else:
- raise NotImplementedError
- except NotImplementedError:
+ if not self.site().has_api() or self.site().versionnumber() < 12:
return self._moveOld(newtitle, reason, movetalkpage, sysop,
throttle, deleteAndMove, safe, fixredirects, leaveRedirect)
# Login
@@ -2895,13 +2866,8 @@
token = self.site().getToken(self, sysop = True)
reason = reason.encode(self.site().encoding())
- try:
- d = self.site().api_address()
- del d
- except NotImplementedError:
- config.use_api = False
- if config.use_api and self.site().versionnumber() >= 12:
+ if self.site().has_api() and self.site().versionnumber() >= 12:
#API Mode
params = {
'action': 'delete',
@@ -2973,7 +2939,7 @@
self._deletedRevs = {}
- if config.use_api and self.site().versionnumber() >= 12:
+ if self.site().has_api() and self.site().versionnumber() >= 12:
params = {
'action': 'query',
'list': 'deletedrevs',
@@ -3090,7 +3056,7 @@
if throttle:
put_throttle()
- if config.use_api and self.site().versionnumber() >= 12:
+ if self.site().has_api() and self.site().versionnumber() >= 12:
params = {
'action': 'undelete',
'title': self.title(),
@@ -3178,15 +3144,10 @@
answer = 'y'
self.site()._noProtectPrompt = True
if answer == 'y':
- try:
- if config.use_api and self.site().versionnumber() >= 12:
- x = self.site().api_address()
- del x
- else:
- raise NotImplementedError
- except NotImplementedError:
- return self._oldProtect( editcreate, move, unprotect, reason, editcreate_duration,
- move_duration, cascading, prompt, throttle)
+ if not self.site().has_api() or self.site().versionnumber() < 12:
+ return self._oldProtect(editcreate, move, unprotect, reason,
+ editcreate_duration, move_duration,
+ cascading, prompt, throttle)
token = self.site().getToken(self, sysop = True)
@@ -3637,16 +3598,11 @@
return u'{| border="1"\n! date/time || username || resolution || size || edit summary\n|----\n' + u'\n|----\n'.join(lines) + '\n|}'
def usingPages(self):
- try:
- if config.use_api and self.site().versionnumber() >= 11:
- x = self.site().api_address()
- del x
- else:
- raise NotImplementedError
- except NotImplementedError:
+ if not self.site().has_api() or self.site().versionnumber() < 11:
for a in self._usingPagesOld():
yield a
return
+
params = {
'action': 'query',
'list': 'imageusage',
@@ -3709,14 +3665,8 @@
def run(self):
if self.pages:
doAPI = None
- #if config.use_api:
- # # API Implemented Check
- # try:
- # doAPI = True
- # d = self.site.api_address()
- # del d
- # except NotImplementedError:
- # doAPI = False
+ # API Implemented Check
+ # doAPI = self.site.has_api()
if doAPI:
while True:
@@ -4074,10 +4024,9 @@
"""
# TODO: why isn't this a Site method?
pages = list(pages) # if pages is an iterator, we need to make it a list
- output(u'Getting %d pages from %s' % (len(pages), site), newline = False)
- #if config.use_api:
- # output(u' via API...')
- #else:
+ output(u'Getting %d pages from %s' % (len(pages), site), newline=False)
+ if site.has_api():
+ output(u' via API', newline=False)
output(u'...')
limit = config.special_page_limit / 4 # default is 500/4, but It might have good point for server.
@@ -4600,6 +4549,7 @@
return 1
else:
return 0
+
def username(self, sysop = False):
return self._userName[self._userIndex(sysop = sysop)]
@@ -4682,7 +4632,6 @@
else:
self._load(sysop = sysop)
index = self._userIndex(sysop)
- ##output('%s' % self._rights[index]) #for debug use
return right in self._rights[index]
def server_time(self):
@@ -5645,7 +5594,6 @@
index = self._userIndex(sysop)
if self._userData[index] and not force:
return
-
if verbose:
output(u'Getting information for site %s' % self)
@@ -6224,7 +6172,7 @@
break
def randompage(self, redirect = False):
- if config.use_api and self.versionnumber() >= 12:
+ if self.has_api() and self.versionnumber() >= 12:
params = {
'action': 'query',
'list': 'random',
@@ -6273,10 +6221,8 @@
page = Page(self, start)
namespace = page.namespace()
start = page.titleWithoutNamespace()
- try:
- api_url = self.api_address()
- del api_url
- except NotImplementedError:
+
+ if not self.has_api():
for page in self._allpagesOld(start, namespace, includeredirects, throttle):
yield page
return
Revision: 7998
Author: xqt
Date: 2010-03-13 11:03:04 +0000 (Sat, 13 Mar 2010)
Log Message:
-----------
Activate the throttle class from the library. This increases the put_throttle wait time when dealing with the same default site.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/pywikibot/__init__.py
trunk/pywikipedia/pywikibot/throttle.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2010-03-13 10:04:49 UTC (rev 7997)
+++ trunk/pywikipedia/config.py 2010-03-13 11:03:04 UTC (rev 7998)
@@ -47,7 +47,7 @@
account_global = False
# Solve captchas in the webbrowser. Setting this to False will result in the
-# exception CaptchaError be thrown if a captcha is encountered.
+# exception CaptchaError being thrown if a captcha is encountered.
solve_captcha = True
# Some sites will require password identication to access the HTML pages at
@@ -64,7 +64,9 @@
# 2. You must use the hostname of the site, not its family/language pair
authenticate = {}
+#
# Security Connection for Wikimedia Projects
+#
SSL_connection = False
# password_file = ".passwd"
@@ -130,7 +132,7 @@
# Currently only works if interface 'terminal' is set.
transliterate = True
-# Should the system bell be rung if the bot expects user input?
+# Should the system bell ring if the bot expects user input?
ring_bell = False
# Colorization can be used to markup important text parts of the output.
@@ -155,7 +157,7 @@
# The command for the editor you want to use. If set to None, a simple Tkinter
# editor will be used.
# On Windows systems, this script tries to determine the default text editor.
-if __sys.platform=='win32':
+if __sys.platform == 'win32':
try:
import _winreg
_key1 = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, 'Software\Microsoft\Windows\CurrentVersion\Explorer\FileExts\.txt\OpenWithProgids')
@@ -163,11 +165,12 @@
_key2 = _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '%s\shell\open\command' % _progID)
_cmd = _winreg.QueryValueEx(_key2, None)[0]
editor = _cmd.replace('%1', '')
- # Notepad is even worse than our Tkinter editor. Nobody has
- # deserved to use it.
+ # Notepad is even worse than our Tkinter editor.
+ # Nobody has deserved to use it.
if editor.lower().endswith('notepad.exe'):
editor = None
except:
+ # XXX what are we catching here?
#raise
editor = None
else:
@@ -267,19 +270,21 @@
# but never more than 'maxthrottle' seconds. However - if you are running
# more than one bot in parallel the times are lengthened.
minthrottle = 1
-maxthrottle = 10
+maxthrottle = 60
-# Slow down the robot such that it never makes a second change within
+# Slow down the robot such that it never makes a second page edit within
# 'put_throttle' seconds.
put_throttle = 10
+
# Sometimes you want to know when a delay is inserted. If a delay is larger
# than 'noisysleep' seconds, it is logged on the screen.
noisysleep = 3.0
# Defer bot edits during periods of database server lag. For details, see
# http://www.mediawiki.org/wiki/Maxlag_parameter
-# You can set this variable to a number of seconds, or to None to disable
-# this behavior.
+# You can set this variable to a number of seconds, or to None (or 0) to
+# disable this behavior. Higher values are more aggressive in seeking
+# access to the wiki.
# It is recommended that you do not change this parameter unless you know
# what you are doing and have a good reason for it!
maxlag = 5
@@ -465,25 +470,25 @@
# ============================
# System-level and User-level changes.
# Store current variables and their types.
-_glv={}
+_glv = {}
_glv.update(globals())
-_gl=_glv.keys()
-_tp={}
+_gl = _glv.keys()
+_tp = {}
for _key in _gl:
- if _key[0]!='_':
- _tp[_key]=type(globals()[_key])
+ if _key[0] != '_':
+ _tp[_key] = type(globals()[_key])
# Get the user files
-_thislevel=0
-_fns=[os.path.join(_base_dir, "user-config.py")]
+_thislevel = 0
+_fns = [os.path.join(_base_dir, "user-config.py")]
for _filename in _fns:
_thislevel += 1
if os.path.exists(_filename):
- _filestatus=os.stat(_filename)
- _filemode=_filestatus[0]
- _fileuid=_filestatus[4]
- if (__sys.platform=='win32' or _fileuid==os.getuid() or _fileuid==0):
- if __sys.platform=='win32' or _filemode&002==0:
+ _filestatus = os.stat(_filename)
+ _filemode = _filestatus[0]
+ _fileuid = _filestatus[4]
+ if __sys.platform == 'win32' or _fileuid in [os.getuid(), 0]:
+ if __sys.platform == 'win32' or _filemode & 002 == 0:
execfile(_filename)
else:
print "WARNING: Skipped '%s': writeable by others."%_filename
@@ -507,13 +512,13 @@
print "WARNING: Type of '%s' changed"%_key
print " Was: ",ot
print " Now: ",nt
- del nt,ot
+ del nt, ot
else:
print "WARNING: Configuration variable %r is defined but unknown. Misspelled?" %_key
# Fix up default console_encoding
if console_encoding is None:
- if __sys.platform=='win32':
+ if __sys.platform == 'win32':
console_encoding = 'cp850'
else:
console_encoding = 'iso-8859-1'
@@ -562,23 +567,22 @@
#
# When called as main program, list all configuration variables
#
-if __name__=="__main__":
+if __name__ == "__main__":
import types
- _all=1
+ _all = 1
for _arg in __sys.argv[1:]:
- if _arg=="modified":
- _all=0
+ if _arg == "modified":
+ _all = 0
else:
print "Unknown arg %s ignored"%_arg
- _k=globals().keys()
+ _k = globals().keys()
_k.sort()
for _name in _k:
- if _name[0]!='_':
+ if _name[0] != '_':
if not type(globals()[_name]) in [types.FunctionType, types.ModuleType]:
- if _all or _glv[_name]!=globals()[_name]:
- print _name,"=",repr(globals()[_name])
+ if _all or _glv[_name] != globals()[_name]:
+ print _name, "=", repr(globals()[_name])
-
# cleanup all locally-defined variables
for __var in globals().keys():
Modified: trunk/pywikipedia/pywikibot/__init__.py
===================================================================
--- trunk/pywikipedia/pywikibot/__init__.py 2010-03-13 10:04:49 UTC (rev 7997)
+++ trunk/pywikipedia/pywikibot/__init__.py 2010-03-13 11:03:04 UTC (rev 7998)
@@ -13,6 +13,7 @@
from exceptions import *
from textlib import *
+from throttle import *
import wikipedia
Modified: trunk/pywikipedia/pywikibot/throttle.py
===================================================================
--- trunk/pywikipedia/pywikibot/throttle.py 2010-03-13 10:04:49 UTC (rev 7997)
+++ trunk/pywikipedia/pywikibot/throttle.py 2010-03-13 11:03:04 UTC (rev 7998)
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
"""
-Mechanics to slow down wiki page download rate.
+Mechanics to slow down wiki read and/or write rate.
"""
#
# (C) Pywikipedia bot team, 2008
@@ -34,7 +34,7 @@
"""
def __init__(self, mindelay=None, maxdelay=None, writedelay=None,
- multiplydelay=True, verbosedelay=False):
+ multiplydelay=True, verbosedelay=False, write=False):
self.lock = threading.RLock()
self.mysite = None
self.ctrlfilename = config.datafilepath('pywikibot', 'throttle.ctrl')
@@ -62,6 +62,7 @@
if self.multiplydelay:
self.checkMultiplicity()
self.setDelay()
+ self.write = write
def checkMultiplicity(self):
"""Count running processes for site and set process_multiplicity."""
@@ -225,10 +226,15 @@
Parameter requestsize is the number of Pages to be read/written;
multiply delay time by an appropriate factor.
+
+ Because this seizes the throttle lock, it will prevent any other
+ thread from writing to the same site the script started with
+ until the wait expires.
+
"""
self.lock.acquire()
try:
- wait = self.waittime(write=write)
+ wait = self.waittime(write=write or self.write)
# Calculate the multiplicity of the next delay based on how
# big the request is that is being posted now.
# We want to add "one delay" for each factor of two in the
@@ -236,13 +242,15 @@
# the delay time for the server.
self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
# Announce the delay if it exceeds a preset limit
- if wait > config.noisysleep or pywikibot.verbose:
- pywikibot.output(u"Sleeping for %(wait).1f seconds, %(now)s"
- % {'wait': wait,
- 'now' : time.strftime("%Y-%m-%d %H:%M:%S",
- time.localtime())
- } )
- time.sleep(wait)
+ if wait > 0:
+ if wait > config.noisysleep or pywikibot.verbose:
+ pywikibot.output(
+ u"Sleeping for %(wait).1f seconds, %(now)s"
+ % {'wait': wait,
+ 'now' : time.strftime("%Y-%m-%d %H:%M:%S",
+ time.localtime())
+ } )
+ time.sleep(wait)
if write:
self.last_write = time.time()
else:
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-03-13 10:04:49 UTC (rev 7997)
+++ trunk/pywikipedia/wikipedia.py 2010-03-13 11:03:04 UTC (rev 7998)
@@ -4118,176 +4118,6 @@
# Default User-agent
setUserAgent('PythonWikipediaBot/1.0')
-# Mechanics to slow down page download rate.
-class Throttle(object):
- """For internal use only - control rate of access to wiki server
-
- Calling this object blocks the calling thread until at least 'delay'
- seconds have passed since the previous call.
-
- The framework initiates two Throttle objects: get_throttle to control
- the rate of read access, and put_throttle to control the rate of write
- access.
-
- """
- def __init__(self, mindelay=config.minthrottle,
- maxdelay=config.maxthrottle,
- multiplydelay=True):
- self.lock = threading.RLock()
- self.mindelay = mindelay
- self.maxdelay = maxdelay
- self.now = 0
- self.pid = False # If self.pid remains False, we're not checking for multiple processes
- self.next_multiplicity = 1.0
- self.checkdelay = 240 # Check the file with processes again after this many seconds
- self.dropdelay = 360 # Drop processes from the list that have not made a check in this many seconds
- self.releasepid = 1200 # Free the process id
- self.lastwait = 0.0
- self.delay = 0
- self.multiplydelay = multiplydelay
- if self.multiplydelay:
- self.checkMultiplicity()
- self.setDelay(mindelay)
-
- def logfn(self):
- return config.datafilepath('pywikibot', 'throttle.ctrl')
-
- def checkMultiplicity(self):
- self.lock.acquire()
- try:
- processes = {}
- my_pid = 1
- count = 1
- try:
- f = open(self.logfn(), 'r')
- except IOError:
- if not self.pid:
- pass
- else:
- raise
- else:
- now = time.time()
- for line in f.readlines():
- try:
- line = line.split(' ')
- pid = int(line[0])
- ptime = int(line[1].split('.')[0])
- if now - ptime <= self.releasepid:
- if now - ptime <= self.dropdelay and pid != self.pid:
- count += 1
- processes[pid] = ptime
- if pid >= my_pid:
- my_pid = pid+1
- except (IndexError,ValueError):
- pass # Sometimes the file gets corrupted - ignore that line
-
- if not self.pid:
- self.pid = my_pid
- self.checktime = time.time()
- processes[self.pid] = self.checktime
- try:
- f = open(self.logfn(), 'w')
- for p in processes:
- f.write(str(p)+' '+str(processes[p])+'\n')
- except IOError:
- pass
- f.close()
- self.process_multiplicity = count
- if verbose:
- output(u"Checked for running processes. %s processes currently running, including the current process." % count)
- finally:
- self.lock.release()
-
- def setDelay(self, delay = config.minthrottle, absolute = False):
- self.lock.acquire()
- try:
- if absolute:
- self.maxdelay = delay
- self.mindelay = delay
- self.delay = delay
- # Don't count the time we already waited as part of our waiting time :-0
- self.now = time.time()
- finally:
- self.lock.release()
-
- def getDelay(self):
- thisdelay = self.delay
- if self.multiplydelay: # If self.pid, we're checking for multiple processes
- if time.time() > self.checktime + self.checkdelay:
- self.checkMultiplicity()
- if thisdelay < (self.mindelay * self.next_multiplicity):
- thisdelay = self.mindelay * self.next_multiplicity
- elif thisdelay > self.maxdelay:
- thisdelay = self.maxdelay
- thisdelay *= self.process_multiplicity
- return thisdelay
-
- def waittime(self):
- """Calculate the time in seconds we will have to wait if a query
- would be made right now"""
- # Take the previous requestsize in account calculating the desired
- # delay this time
- thisdelay = self.getDelay()
- now = time.time()
- ago = now - self.now
- if ago < thisdelay:
- delta = thisdelay - ago
- return delta
- else:
- return 0.0
-
- def drop(self):
- """Remove me from the list of running bots processes."""
- self.checktime = 0
- processes = {}
- try:
- f = open(self.logfn(), 'r')
- except IOError:
- return
- else:
- now = time.time()
- for line in f.readlines():
- try:
- line = line.split(' ')
- pid = int(line[0])
- ptime = int(line[1].split('.')[0])
- if now - ptime <= self.releasepid and pid != self.pid:
- processes[pid] = ptime
- except (IndexError,ValueError):
- pass # Sometimes the file gets corrupted - ignore that line
- try:
- f = open(self.logfn(), 'w')
- for p in processes:
- f.write(str(p)+' '+str(processes[p])+'\n')
- except IOError:
- pass
- f.close()
-
- def __call__(self, requestsize=1):
- """
- Block the calling program if the throttle time has not expired.
-
- Parameter requestsize is the number of Pages to be read/written;
- multiply delay time by an appropriate factor.
- """
- self.lock.acquire()
- try:
- waittime = self.waittime()
- # Calculate the multiplicity of the next delay based on how
- # big the request is that is being posted now.
- # We want to add "one delay" for each factor of two in the
- # size of the request. Getting 64 pages at once allows 6 times
- # the delay time for the server.
- self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
- # Announce the delay if it exceeds a preset limit
- if waittime > config.noisysleep:
- output(u"Sleeping for %.1f seconds, %s" % (waittime, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
- time.sleep(waittime)
- self.now = time.time()
- finally:
- self.lock.release()
-
-# end of category specific code
def url2link(percentname, insite, site):
"""Convert urlname of a wiki page into interwiki link format.
@@ -7378,9 +7208,11 @@
elif arg.startswith('-lang:'):
default_code = arg[6:]
elif arg.startswith('-putthrottle:'):
- put_throttle.setDelay(int(arg[13:]), absolute = True)
+ config.put_throttle = int(arg[len("-putthrottle:") : ])
+ put_throttle.setDelay()
elif arg.startswith('-pt:'):
- put_throttle.setDelay(int(arg[4:]), absolute = True)
+ config.put_throttle = int(arg[len("-pt:") : ])
+ put_throttle.setDelay()
elif arg == '-log':
setLogfileStatus(True)
elif arg.startswith('-log:'):
@@ -7768,8 +7600,8 @@
f.close()
output( u'ERROR: %s caused error %s. Dump %s created.' % (name,error,filename) )
-get_throttle = Throttle(config.minthrottle,config.maxthrottle)
-put_throttle = Throttle(config.put_throttle,config.put_throttle,multiplydelay=False)
+get_throttle = Throttle()
+put_throttle = Throttle(write=True)
def decompress_gzip(data):
# Use cStringIO if available
Revision: 7996
Author: xqt
Date: 2010-03-13 08:57:12 +0000 (Sat, 13 Mar 2010)
Log Message:
-----------
Changing the throttle class for the trunk version. This increases reading from the server depending on the actual script site (not the actual site object).
Modified Paths:
--------------
trunk/pywikipedia/pywikibot/throttle.py
Modified: trunk/pywikipedia/pywikibot/throttle.py
===================================================================
--- trunk/pywikipedia/pywikibot/throttle.py 2010-03-13 08:44:03 UTC (rev 7995)
+++ trunk/pywikipedia/pywikibot/throttle.py 2010-03-13 08:57:12 UTC (rev 7996)
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
"""
-Mechanics to slow down wiki read and/or write rate.
+Mechanics to slow down wiki page download rate.
"""
#
# (C) Pywikipedia bot team, 2008
@@ -9,16 +9,13 @@
#
__version__ = '$Id$'
-import pywikibot
-from pywikibot import config
+import wikipedia as pywikibot
+import config
-import logging
import math
import threading
import time
-logger = logging.getLogger("pywiki.wiki.throttle")
-
pid = False # global process identifier
# when the first Throttle is instantiated, it will set this
# variable to a positive integer, which will apply to all
@@ -31,15 +28,16 @@
Calling this object blocks the calling thread until at least 'delay'
seconds have passed since the previous call.
- Each Site initiates one Throttle object (site.throttle) to control the
- rate of access.
+ The framework initiates two Throttle objects: get_throttle to control
+ the rate of read access, and put_throttle to control the rate of write
+ access.
"""
- def __init__(self, site, mindelay=None, maxdelay=None, writedelay=None,
+ def __init__(self, mindelay=None, maxdelay=None, writedelay=None,
multiplydelay=True, verbosedelay=False):
self.lock = threading.RLock()
- self.mysite = str(site)
- self.ctrlfilename = config.datafilepath('throttle.ctrl')
+ self.mysite = None
+ self.ctrlfilename = config.datafilepath('pywikibot', 'throttle.ctrl')
self.mindelay = mindelay
if self.mindelay is None:
self.mindelay = config.minthrottle
@@ -52,8 +50,8 @@
self.last_read = 0
self.last_write = 0
self.next_multiplicity = 1.0
- self.checkdelay = 300 # Check logfile again after this many seconds
- self.dropdelay = 600 # Ignore processes that have not made
+ self.checkdelay = 120 # Check logfile again after this many seconds
+ self.dropdelay = 360 # Ignore processes that have not made
# a check in this many seconds
self.releasepid = 1200 # Free the process id after this many seconds
self.lastwait = 0.0
@@ -63,15 +61,15 @@
self.multiplydelay = multiplydelay
if self.multiplydelay:
self.checkMultiplicity()
- self.setDelays()
+ self.setDelay()
def checkMultiplicity(self):
"""Count running processes for site and set process_multiplicity."""
global pid
self.lock.acquire()
- mysite = self.mysite
- pywikibot.output(u"Checking multiplicity: pid = %(pid)s" % globals(),
- level=pywikibot.DEBUG)
+ mysite = self.mysite = str(pywikibot.getSite())
+ if pywikibot.verbose:
+ pywikibot.output(u"Checking multiplicity: pid = %(pid)s" % globals())
try:
processes = []
my_pid = pid or 1 # start at 1 if global pid not yet set
@@ -124,14 +122,14 @@
pass
f.close()
self.process_multiplicity = count
- if self.verbosedelay:
+ if self.verbosedelay or pywikibot.verbose:
pywikibot.output(
u"Found %(count)s %(mysite)s processes running, including this one."
% locals())
finally:
self.lock.release()
- def setDelays(self, delay=None, writedelay=None, absolute=False):
+ def setDelay(self, delay=None, writedelay=None, absolute=False):
"""Set the nominal delays in seconds. Defaults to config values."""
self.lock.acquire()
try:
@@ -158,12 +156,11 @@
account of how much time has elapsed since the last access.
"""
- global pid
if write:
thisdelay = self.writedelay
else:
thisdelay = self.delay
- if pid and self.multiplydelay: # We're checking for multiple processes
+ if self.multiplydelay: # We're checking for multiple processes
if time.time() > self.checktime + self.checkdelay:
self.checkMultiplicity()
if thisdelay < (self.mindelay * self.next_multiplicity):
@@ -228,10 +225,6 @@
Parameter requestsize is the number of Pages to be read/written;
multiply delay time by an appropriate factor.
-
- Because this seizes the throttle lock, it will prevent any other
- thread from writing to the same site until the wait expires.
-
"""
self.lock.acquire()
try:
@@ -243,7 +236,7 @@
# the delay time for the server.
self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
# Announce the delay if it exceeds a preset limit
- if wait > config.noisysleep:
+ if wait > config.noisysleep or pywikibot.verbose:
pywikibot.output(u"Sleeping for %(wait).1f seconds, %(now)s"
% {'wait': wait,
'now' : time.strftime("%Y-%m-%d %H:%M:%S",
Revision: 7995
Author: xqt
Date: 2010-03-13 08:44:03 +0000 (Sat, 13 Mar 2010)
Log Message:
-----------
copy throttle class from rewrite branch
Added Paths:
-----------
trunk/pywikipedia/pywikibot/throttle.py
Copied: trunk/pywikipedia/pywikibot/throttle.py (from rev 7990, branches/rewrite/pywikibot/throttle.py)
===================================================================
--- trunk/pywikipedia/pywikibot/throttle.py (rev 0)
+++ trunk/pywikipedia/pywikibot/throttle.py 2010-03-13 08:44:03 UTC (rev 7995)
@@ -0,0 +1,285 @@
+# -*- coding: utf-8 -*-
+"""
+Mechanics to slow down wiki read and/or write rate.
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import pywikibot
+from pywikibot import config
+
+import logging
+import math
+import threading
+import time
+
+logger = logging.getLogger("pywiki.wiki.throttle")
+
+pid = False # global process identifier
+ # when the first Throttle is instantiated, it will set this
+ # variable to a positive integer, which will apply to all
+ # throttle objects created by this process.
+
+
+class Throttle(object):
+ """Control rate of access to wiki server
+
+ Calling this object blocks the calling thread until at least 'delay'
+ seconds have passed since the previous call.
+
+ Each Site instantiates one Throttle object (site.throttle) to control the
+ rate of access.
+
+ """
+ def __init__(self, site, mindelay=None, maxdelay=None, writedelay=None,
+ multiplydelay=True, verbosedelay=False):
+ self.lock = threading.RLock()
+ self.mysite = str(site)
+ self.ctrlfilename = config.datafilepath('throttle.ctrl')
+ self.mindelay = mindelay
+ if self.mindelay is None:
+ self.mindelay = config.minthrottle
+ self.maxdelay = maxdelay
+ if self.maxdelay is None:
+ self.maxdelay = config.maxthrottle
+ self.writedelay = writedelay
+ if self.writedelay is None:
+ self.writedelay = config.put_throttle
+ self.last_read = 0
+ self.last_write = 0
+ self.next_multiplicity = 1.0
+ self.checkdelay = 300 # Check logfile again after this many seconds
+ self.dropdelay = 600 # Ignore processes that have not made
+ # a check in this many seconds
+ self.releasepid = 1200 # Free the process id after this many seconds
+ self.lastwait = 0.0
+ self.delay = 0
+ self.checktime = 0
+ self.verbosedelay = verbosedelay
+ self.multiplydelay = multiplydelay
+ if self.multiplydelay:
+ self.checkMultiplicity()
+ self.setDelays()
+
+ def checkMultiplicity(self):
+ """Count running processes for site and set process_multiplicity."""
+ global pid
+ self.lock.acquire()
+ mysite = self.mysite
+ pywikibot.output(u"Checking multiplicity: pid = %(pid)s" % globals(),
+ level=pywikibot.DEBUG)
+ try:
+ processes = []
+ my_pid = pid or 1 # start at 1 if global pid not yet set
+ count = 1
+ # open throttle.ctrl
+ try:
+ f = open(self.ctrlfilename, 'r')
+ except IOError:
+ if not pid:
+ pass
+ else:
+ raise
+ else:
+ now = time.time()
+ for line in f.readlines():
+ # parse line; format is "pid timestamp site"
+ try:
+ line = line.split(' ')
+ this_pid = int(line[0])
+ ptime = int(line[1].split('.')[0])
+ this_site = line[2].rstrip()
+ except (IndexError, ValueError):
+ continue # Sometimes the file gets corrupted
+ # ignore that line
+ if now - ptime > self.releasepid:
+ continue # process has expired, drop from file
+ if now - ptime <= self.dropdelay \
+ and this_site == mysite \
+ and this_pid != pid:
+ count += 1
+ if this_site != self.mysite or this_pid != pid:
+ processes.append({'pid': this_pid,
+ 'time': ptime,
+ 'site': this_site})
+ if not pid and this_pid >= my_pid:
+ my_pid = this_pid+1 # next unused process id
+
+ if not pid:
+ pid = my_pid
+ self.checktime = time.time()
+ processes.append({'pid': pid,
+ 'time': self.checktime,
+ 'site': mysite})
+ processes.sort(key=lambda p:(p['pid'], p['site']))
+ try:
+ f = open(self.ctrlfilename, 'w')
+ for p in processes:
+ f.write("%(pid)s %(time)s %(site)s\n" % p)
+ except IOError:
+ pass
+ f.close()
+ self.process_multiplicity = count
+ if self.verbosedelay:
+ pywikibot.output(
+ u"Found %(count)s %(mysite)s processes running, including this one."
+ % locals())
+ finally:
+ self.lock.release()
+
+ def setDelays(self, delay=None, writedelay=None, absolute=False):
+ """Set the nominal delays in seconds. Defaults to config values."""
+ self.lock.acquire()
+ try:
+ maxdelay = self.maxdelay
+ if delay is None:
+ delay = self.mindelay
+ if writedelay is None:
+ writedelay = config.put_throttle
+ if absolute:
+ self.maxdelay = delay
+ self.mindelay = delay
+ self.delay = delay
+ self.writedelay = min(max(self.mindelay, writedelay),
+ self.maxdelay)
+ # Start the delay count now, not at the next check
+ self.last_read = self.last_write = time.time()
+ finally:
+ self.lock.release()
+
+ def getDelay(self, write=False):
+ """Return the actual delay, accounting for multiple processes.
+
+ This value is the maximum wait between reads/writes, not taking
+ account of how much time has elapsed since the last access.
+
+ """
+ global pid
+ if write:
+ thisdelay = self.writedelay
+ else:
+ thisdelay = self.delay
+ if pid and self.multiplydelay: # We're checking for multiple processes
+ if time.time() > self.checktime + self.checkdelay:
+ self.checkMultiplicity()
+ if thisdelay < (self.mindelay * self.next_multiplicity):
+ thisdelay = self.mindelay * self.next_multiplicity
+ elif thisdelay > self.maxdelay:
+ thisdelay = self.maxdelay
+ thisdelay *= self.process_multiplicity
+ return thisdelay
+
+ def waittime(self, write=False):
+ """Return waiting time in seconds if a query would be made right now"""
+ # Take the previous requestsize into account when calculating the
+ # desired delay this time
+ thisdelay = self.getDelay(write=write)
+ now = time.time()
+ if write:
+ ago = now - self.last_write
+ else:
+ ago = now - self.last_read
+ if ago < thisdelay:
+ delta = thisdelay - ago
+ return delta
+ else:
+ return 0.0
+
+ def drop(self):
+ """Remove me from the list of running bot processes."""
+ # drop all throttles with this process's pid, regardless of site
+ self.checktime = 0
+ processes = []
+ try:
+ f = open(self.ctrlfilename, 'r')
+ except IOError:
+ return
+ else:
+ now = time.time()
+ for line in f.readlines():
+ try:
+ line = line.split(' ')
+ this_pid = int(line[0])
+ ptime = int(line[1].split('.')[0])
+ this_site = line[2].rstrip()
+ except (IndexError,ValueError):
+ continue # Sometimes the file gets corrupted
+ # ignore that line
+ if now - ptime <= self.releasepid \
+ and this_pid != pid:
+ processes.append({'pid': this_pid,
+ 'time': ptime,
+ 'site': this_site})
+ processes.sort(key=lambda p:p['pid'])
+ try:
+ f = open(self.ctrlfilename, 'w')
+ for p in processes:
+ f.write("%(pid)s %(time)s %(site)s\n" % p)
+ except IOError:
+ pass
+ f.close()
+
+ def __call__(self, requestsize=1, write=False):
+ """Block the calling program if the throttle time has not expired.
+
+ Parameter requestsize is the number of Pages to be read/written;
+ multiply delay time by an appropriate factor.
+
+ Because this seizes the throttle lock, it will prevent any other
+ thread from writing to the same site until the wait expires.
+
+ """
+ self.lock.acquire()
+ try:
+ wait = self.waittime(write=write)
+ # Calculate the multiplicity of the next delay based on how
+ # big the request is that is being posted now.
+ # We want to add "one delay" for each factor of two in the
+ # size of the request. Getting 64 pages at once allows 6 times
+ # the delay time for the server.
+ self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
+ # Announce the delay if it exceeds a preset limit
+ if wait > config.noisysleep:
+ pywikibot.output(u"Sleeping for %(wait).1f seconds, %(now)s"
+ % {'wait': wait,
+ 'now' : time.strftime("%Y-%m-%d %H:%M:%S",
+ time.localtime())
+ } )
+ time.sleep(wait)
+ if write:
+ self.last_write = time.time()
+ else:
+ self.last_read = time.time()
+ finally:
+ self.lock.release()
+
+ def lag(self, lagtime):
+ """Seize the throttle lock due to server lag.
+
+ This will prevent any thread from accessing this site.
+
+ """
+ started = time.time()
+ self.lock.acquire()
+ try:
+ # start at 1/2 the current server lag time
+ # wait at least 5 seconds but not more than 120 seconds
+ delay = min(max(5, lagtime//2), 120)
+ # account for any time we waited while acquiring the lock
+ wait = delay - (time.time() - started)
+ if wait > 0:
+ if wait > config.noisysleep:
+ pywikibot.output(
+ u"Sleeping for %(wait).1f seconds, %(now)s"
+ % {'wait': wait,
+ 'now': time.strftime("%Y-%m-%d %H:%M:%S",
+ time.localtime())
+ } )
+ time.sleep(wait)
+ finally:
+ self.lock.release()
+