Mpaa has submitted this change and it was merged.
Change subject: [FEAT] Chunked uploads
......................................................................
[FEAT] Chunked uploads
This allows chunked uploads by setting the parameter 'chunk_size'
to a value between 0 and the file size (both exclusive). It will
also only work if the version is 1.20 or newer.
The upload.py script supports this mode via the '-chunked'
parameter.
This also adds the capability to run the API request without
throttling so that it doesn't have to wait after each request.
See:
https://www.mediawiki.org/wiki/API:Upload#Chunked_uploading
Change-Id: I80b2bba9e63832173d5b697db1f4ea419ca1122f
---
M pywikibot/data/api.py
M pywikibot/site.py
M scripts/upload.py
3 files changed, 156 insertions(+), 36 deletions(-)
Approvals:
Mpaa: Looks good to me, approved
diff --git a/pywikibot/data/api.py b/pywikibot/data/api.py
index 8132a27..11df17b 100644
--- a/pywikibot/data/api.py
+++ b/pywikibot/data/api.py
@@ -129,6 +129,8 @@
@param site: The Site to which the request will be submitted. If not
supplied, uses the user's configured default Site.
@param mime: If true, send in "multipart/form-data" format (default False)
+ @param mime_params: A dictionary of parameters which should only be
+ transferred via mime mode. If not None, sets mime to True.
@param max_retries: (optional) Maximum number of times to retry after
errors, defaults to 25
@param retry_wait: (optional) Minimum time to wait after an error,
@@ -143,7 +145,15 @@
self.site = kwargs.pop("site")
except KeyError:
self.site = pywikibot.Site()
- self.mime = kwargs.pop("mime", False)
+ if 'mime_params' in kwargs:
+ self.mime_params = kwargs.pop('mime_params')
+ # mime may not be different from mime_params
+ if 'mime' in kwargs and kwargs.pop('mime') != self.mime:
+ raise ValueError('If mime_params is set, mime may not differ '
+ 'from it.')
+ else:
+ self.mime = kwargs.pop('mime', False)
+ self.throttle = kwargs.pop('throttle', False)
self.max_retries = kwargs.pop("max_retries",
pywikibot.config.max_retries)
self.retry_wait = kwargs.pop("retry_wait",
pywikibot.config.retry_wait)
self.params = {}
@@ -210,6 +220,23 @@
def iteritems(self):
return iter(self.params.items())
+ @property
+ def mime(self):
+ """Return whether mime parameters are defined."""
+ return self.mime_params is not None
+
+ @mime.setter
+ def mime(self, value):
+ """
+ Change whether mime parameter should be defined.
+
+ This will clear the mime parameters.
+ """
+ try:
+ self.mime_params = dict(value)
+ except TypeError:
+ self.mime_params = {} if value else None
+
def http_params(self):
"""Return the parameters formatted for inclusion in an HTTP
request.
@@ -218,7 +245,9 @@
unicode (may be |-separated list)
str in site encoding (may be |-separated list)
"""
-
+ if self.mime_params and set(self.params.keys()) &
set(self.mime_params.keys()):
+ raise ValueError('The mime_params and params may not share the '
+ 'same keys.')
for key in self.params:
if isinstance(self.params[key], bytes):
self.params[key] = self.params[key].decode(self.site.encoding())
@@ -296,6 +325,23 @@
message = None
return message == ERR_MSG
+ @staticmethod
+ def _generate_MIME_part(key, content, keytype, headers):
+ if not keytype:
+ try:
+ content.encode("ascii")
+ keytype = ("text", "plain")
+ except UnicodeError:
+ keytype = ("application", "octet-stream")
+ submsg = MIMENonMultipart(*keytype)
+ content_headers = {'name': key}
+ if headers:
+ content_headers.update(headers)
+ submsg.add_header("Content-disposition", "form-data",
+ **content_headers)
+ submsg.set_payload(content)
+ return submsg
+
def submit(self):
"""Submit a query and parse the response.
@@ -308,7 +354,10 @@
simulate = self._simulate(action)
if simulate:
return simulate
- self.site.throttle(write=self.write)
+ if self.throttle:
+ self.site.throttle(write=self.write)
+ else:
+ pywikibot.log("Action '{0}' is submitted not
throttled.".format(action))
uri = self.site.scriptpath() + "/api.php"
ssl = False
if self.site.family.name in config.available_ssl_project:
@@ -328,22 +377,15 @@
filetype = mimetypes.guess_type(local_filename)[0] \
or 'application/octet-stream'
file_content = file(local_filename, "rb").read()
- submsg = MIMENonMultipart(*filetype.split("/"))
- submsg.add_header("Content-disposition",
- "form-data", name=key,
- filename=local_filename)
- submsg.set_payload(file_content)
+ submsg = Request._generate_MIME_part(
+ key, file_content, filetype.split('/'),
+ {'filename': local_filename})
else:
- try:
- self.params[key].encode("ascii")
- keytype = ("text", "plain")
- except UnicodeError:
- keytype = ("application",
"octet-stream")
- submsg = MIMENonMultipart(*keytype)
- submsg.add_header("Content-disposition",
"form-data",
- name=key)
- submsg.set_payload(self.params[key])
+ submsg = Request._generate_MIME_part(
+ key, self.params[key], None, None)
container.attach(submsg)
+ for key, value in self.mime_params.items():
+ container.attach(Request._generate_MIME_part(key, *value))
# strip the headers to get the HTTP message body
body = container.as_string()
marker = "\n\n" # separates headers from body
diff --git a/pywikibot/site.py b/pywikibot/site.py
index fb8202d..21e0989 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -3858,7 +3858,8 @@
@deprecate_arg('imagepage', 'filepage')
def upload(self, filepage, source_filename=None, source_url=None,
- comment=None, text=None, watch=False, ignore_warnings=False):
+ comment=None, text=None, watch=False, ignore_warnings=False,
+ chunk_size=0):
"""Upload a file to the wiki.
Either source_filename or source_url, but not both, must be provided.
@@ -3875,7 +3876,11 @@
@param watch: If true, add filepage to the bot user's watchlist
@param ignore_warnings: if true, ignore API warnings and force
upload (for example, to overwrite an existing file); default False
-
+ @param chunk_size: The chunk size in bytes for chunked uploading (see
+
U{https://www.mediawiki.org/wiki/API:Upload#Chunked_uploading}). It
+ will only upload in chunks, if the version number is 1.20 or higher
+ and the chunk size is positive but lower than the file size.
+ @type chunk_size: int
"""
upload_warnings = {
# map API warning codes to user error messages
@@ -3909,18 +3914,51 @@
if not text:
text = comment
token = self.token(filepage, "edit")
+ result = None
if source_filename:
# upload local file
# make sure file actually exists
if not os.path.isfile(source_filename):
raise ValueError("File '%s' does not exist."
% source_filename)
- # TODO: if file size exceeds some threshold (to be determined),
- # upload by chunks (--> os.path.getsize(source_filename))
+ additional_parameters = {}
+ throttle = True
+ filesize = os.path.getsize(source_filename)
+ if (chunk_size > 0 and chunk_size < filesize and
+ LV(self.version()) >= LV('1.20')):
+ offset = 0
+ file_key = None
+ with open(source_filename, 'rb') as f:
+ while True:
+ f.seek(offset)
+ chunk = f.read(chunk_size)
+ req = api.Request(site=self, action='upload',
token=token,
+ stash='1', offset=offset,
filesize=filesize,
+ filename=filepage.title(withNamespace=False),
+ mime_params={}, throttle=throttle)
+ req.mime_params['chunk'] = (chunk, None,
{'filename': req.params['filename']})
+ if file_key:
+ req['filekey'] = file_key
+ # TODO: Proper error and warning handling
+ data = req.submit()['upload']
+ if 'warnings' in data:
+ result = data
+ break
+ file_key = data['filekey']
+ throttle = False
+ new_offset = int(data['offset'])
+ if offset + len(chunk) != new_offset:
+ pywikibot.warning('Unexpected offset.')
+ offset = new_offset
+ if data['result'] != 'Continue': # finished
+ additional_parameters['filekey'] = file_key
+ break
+ else:
+ additional_parameters = {'file': source_filename, 'mime':
True}
req = api.Request(site=self, action="upload", token=token,
filename=filepage.title(withNamespace=False),
- file=source_filename, comment=comment,
- text=text, mime=True)
+ comment=comment, text=text, throttle=throttle,
+ **additional_parameters)
else:
# upload by URL
if "upload_by_url" not in self.userinfo["rights"]:
@@ -3930,16 +3968,17 @@
req = api.Request(site=self, action="upload", token=token,
filename=filepage.title(withNamespace=False),
url=source_url, comment=comment, text=text)
- if watch:
- req["watch"] = ""
- if ignore_warnings:
- req["ignorewarnings"] = ""
- try:
- result = req.submit()
- except api.APIError:
- # TODO: catch and process foreseeable errors
- raise
- result = result["upload"]
+ if not result:
+ if watch:
+ req["watch"] = ""
+ if ignore_warnings:
+ req["ignorewarnings"] = ""
+ try:
+ result = req.submit()
+ except api.APIError:
+ # TODO: catch and process foreseeable errors
+ raise
+ result = result["upload"]
pywikibot.debug(result, _logger)
if "warnings" in result:
warning = list(result["warnings"].keys())[0]
diff --git a/scripts/upload.py b/scripts/upload.py
index 8a9a44d..39f9df5 100755
--- a/scripts/upload.py
+++ b/scripts/upload.py
@@ -11,6 +11,15 @@
is given
-abortonwarn: Abort upload on the specified warning type. If no warning type
is specified abort on all warnings.
+ -chunked: Upload the file in chunks (more overhead, but restartable). If
+ no value is specified the chunk size is 1 MiB. The value must
+ be a number which can be followed by a suffix. The units are:
+ No suffix: Bytes
+ 'k': Kilobytes (1000 B)
+ 'M': Megabytes (1000000 B)
+ 'Ki': Kibibytes (1024 B)
+ 'Mi': Mebibytes (1024x1024 B)
+ The suffixes are case insensitive.
If any other arguments are given, the first is the URL or filename to upload,
and the rest is a proposed description to go with the upload. If none of these
@@ -34,6 +43,8 @@
import urllib
import urlparse
import tempfile
+import re
+import math
import pywikibot
import pywikibot.data.api
from pywikibot import config
@@ -43,7 +54,7 @@
def __init__(self, url, urlEncoding=None, description=u'',
useFilename=None, keepFilename=False,
verifyDescription=True, ignoreWarning=False,
- targetSite=None, uploadByUrl=False, aborts=[]):
+ targetSite=None, uploadByUrl=False, aborts=[], chunk_size=0):
"""
@param ignoreWarning: Set this to True if you want to upload even if
another file would be overwritten or another mistake would be
@@ -58,6 +69,7 @@
self.verifyDescription = verifyDescription
self.ignoreWarning = ignoreWarning
self.aborts = aborts
+ self.chunk_size = chunk_size
if config.upload_to_commons:
self.targetSite = targetSite or pywikibot.Site('commons',
'commons')
@@ -224,7 +236,8 @@
else:
temp = self.url
site.upload(imagepage, source_filename=temp,
- ignore_warnings=self.ignoreWarning)
+ ignore_warnings=self.ignoreWarning,
+ chunk_size=self.chunk_size)
except pywikibot.data.api.UploadWarning as warn:
pywikibot.output(u"We got a warning message:
{0}".format(warn.message))
@@ -266,6 +279,8 @@
useFilename = None
verifyDescription = True
aborts = set()
+ chunk_size = 0
+ chunk_size_regex = re.compile(r'^-chunked(?::(\d+(?:\.\d+)?)[
\t]*(k|ki|m|mi)?b?)?$', re.I)
# process all global bot args
# returns a list of non-global args, i.e. args for upload.py
@@ -282,6 +297,30 @@
aborts.add(arg[len('-abortonwarn:'):])
else:
aborts = True
+ elif arg.startswith('-chunked'):
+ match = chunk_size_regex.match(arg)
+ if match:
+ if match.group(1): # number was in there
+ base = float(match.group(1))
+ if match.group(2): # suffix too
+ suffix = match.group(2).lower()
+ if suffix == "k":
+ suffix = 1000
+ elif suffix == "m":
+ suffix = 1000000
+ elif suffix == "ki":
+ suffix = 1 << 10
+ elif suffix == "mi":
+ suffix = 1 << 20
+ else:
+ pass # huh?
+ else:
+ suffix = 1
+ chunk_size = math.trunc(base * suffix)
+ else:
+ chunk_size = 1 << 20 # default to 1 MiB
+ else:
+ pywikibot.error('Chunk size parameter is not valid.')
elif url == u'':
url = arg
else:
@@ -290,7 +329,7 @@
bot = UploadRobot(url, description=description, useFilename=useFilename,
keepFilename=keepFilename,
verifyDescription=verifyDescription,
- aborts=aborts)
+ aborts=aborts, chunk_size=chunk_size)
bot.run()
if __name__ == "__main__":
--
To view, visit
https://gerrit.wikimedia.org/r/156030
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I80b2bba9e63832173d5b697db1f4ea419ca1122f
Gerrit-PatchSet: 4
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>