jenkins-bot has submitted this change and it was merged.
Change subject: Normalise data_ingestion script
......................................................................
Normalise data_ingestion script
Photo subclasses FilePage
DataIngestionBot subclasses Bot
The previously commented-out parts of data_ingestion are now
integrated into the script.
Bug: T70611
Bug: T75624
Change-Id: I69bf929cf92bc5cb89c801c9a6da83640595626b
---
M scripts/data_ingestion.py
M setup.py
M tests/data_ingestion_tests.py
M tests/script_tests.py
M tox.ini
5 files changed, 153 insertions(+), 182 deletions(-)
Approvals:
XZise: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/data_ingestion.py b/scripts/data_ingestion.py
index 72e22f5..1e44ece 100755
--- a/scripts/data_ingestion.py
+++ b/scripts/data_ingestion.py
@@ -1,6 +1,10 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
-"""A generic bot to do data ingestion (batch uploading) to
Commons."""
+"""
+A generic bot to do data ingestion (batch uploading).
+
+usage: data_ingestion.py -csvdir:local_dir/ -page:config_page
+"""
#
# (C) Pywikibot team, 2013
#
@@ -9,16 +13,25 @@
__version__ = '$Id$'
#
-import posixpath
-import hashlib
import base64
-import sys
+import codecs
+import hashlib
import io
+import os
+import sys
+
+import posixpath
+
+if sys.version_info[0] > 2:
+ import csv
+else:
+ import unicodecsv as csv
import pywikibot
-# TODO: nosetests3 fails on 'import <other_script>', which is used by many
-# of our scripts, but only data_ingestion is directly imported (not via pwb).
-#
https://github.com/nose-devs/nose/issues/839
+
+from pywikibot import pagegenerators
+from pywikibot.tools import deprecated, deprecated_args
+
from scripts import upload
if sys.version_info[0] > 2:
@@ -29,20 +42,23 @@
from urllib import urlopen
-class Photo(object):
+class Photo(pywikibot.FilePage):
- """
- Represents a Photo (or other file), with metadata, to upload to Commons.
+ """Represents a Photo (or other file), with metadata, to be
uploaded."""
- The constructor takes two parameters: URL (string) and metadata (dict with
- str:str key:value pairs) that can be referred to from the title & template
- generation.
+ def __init__(self, URL, metadata, site=None):
+ """
+ Constructor.
+ @param URL: URL of photo
+ @type URL: str
+ @param metadata: metadata about the photo that can be referred to
+ from the title & template
+ @type metadata: dict
+ @param site: target site
+ @type site: APISite
- """
-
- def __init__(self, URL, metadata):
- """Constructor."""
+ """
self.URL = URL
self.metadata = metadata
self.metadata["_url"] = URL
@@ -52,6 +68,13 @@
if ext == filename:
self.metadata["_ext"] = ext = None
self.contents = None
+
+ if not site:
+ site = pywikibot.Site(u'commons', u'commons')
+
+ # default title
+ super(Photo, self).__init__(site,
+ self.getTitle('%(_filename)s.%(_ext)s'))
def downloadPhoto(self):
"""
@@ -64,8 +87,8 @@
self.contents = io.BytesIO(imageFile)
return self.contents
- def findDuplicateImages(self,
- site=pywikibot.Site(u'commons',
u'commons')):
+ @deprecated_args(site=None)
+ def findDuplicateImages(self):
"""
Find duplicates of the photo.
@@ -76,17 +99,23 @@
"""
hashObject = hashlib.sha1()
hashObject.update(self.downloadPhoto().getvalue())
- return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
+ return list(
+ page.title(withNamespace=False) for page in
+ self.site.allimages(sha1=base64.b16encode(hashObject.digest())))
def getTitle(self, fmt):
"""
Populate format string with %(name)s entries using metadata.
+
+ Note: this does not clean the title, so it may be unusable as
+ a MediaWiki page title, and cause an API exception when used.
@param fmt: format string
@type fmt: unicode
@return: formatted string
@rtype: unicode
"""
+ # FIXME: normalise the title so it is usable as a MediaWiki title.
return fmt % self.metadata
def getDescription(self, template, extraparams={}):
@@ -105,31 +134,35 @@
return description
def _safeTemplateValue(self, value):
+ """Replace pipe (|) with {{!}}."""
return value.replace("|", "{{!}}")
-def CSVReader(fileobj, urlcolumn, *args, **kwargs):
+def CSVReader(fileobj, urlcolumn, site=None, *args, **kwargs):
"""CSV reader."""
- import csv
reader = csv.DictReader(fileobj, *args, **kwargs)
-
for line in reader:
- yield Photo(line[urlcolumn], line)
+ yield Photo(line[urlcolumn], line, site=site)
-class DataIngestionBot:
+class DataIngestionBot(pywikibot.Bot):
"""Data ingestion bot."""
def __init__(self, reader, titlefmt, pagefmt,
site=pywikibot.Site(u'commons', u'commons')):
+ """Constructor."""
+ super(DataIngestionBot, self).__init__(generator=reader)
self.reader = reader
self.titlefmt = titlefmt
self.pagefmt = pagefmt
- self.site = site
- def _doUpload(self, photo):
- duplicates = photo.findDuplicateImages(self.site)
+ if site:
+ self.site = site
+
+ def treat(self, photo):
+ """Process each page."""
+ duplicates = photo.findDuplicateImages()
if duplicates:
pywikibot.output(u"Skipping duplicate of %r" % duplicates)
return duplicates[0]
@@ -149,178 +182,98 @@
return title
+ @deprecated("treat()")
def doSingle(self):
- return self._doUpload(next(self.reader))
+ """Process one page."""
+ return self.treat(next(self.reader))
- def run(self):
- for photo in self.reader:
- self._doUpload(photo)
-
-if __name__ == "__main__":
- reader = CSVReader(open('tests/data/csv_ingestion.csv'), 'url')
- bot = DataIngestionBot(
- reader,
- "%(name)s - %(set)s.%(_ext)s",
":user:valhallasw/test_template",
- pywikibot.Site('test', 'test'))
- bot.run()
-
-'''
-class DataIngestionBot:
- def __init__(self, configurationPage):
+ @classmethod
+ def parseConfigurationPage(cls, configurationPage):
"""
+ Parse a Page which contains the configuration.
+ @param configurationPage: page with configuration
+ @type configurationPage: L{pywikibot.Page}
"""
- self.site = configurationPage.site()
- self.configuration = self.parseConfigurationPage(configurationPage)
-
- def parseConfigurationPage(self, configurationPage):
- """
- Expects a pywikibot.page object "configurationPage" which contains the
configuration
- """
- configuration = {}
+ configuration = {}
# Set a bunch of defaults
- configuration['csvDialect']=u'excel'
- configuration['csvDelimiter']=';'
- configuration['csvEncoding']=u'Windows-1252' #FIXME: Encoding
hell
+ configuration['csvDialect'] = u'excel'
+ configuration['csvDelimiter'] = ';'
+ configuration['csvEncoding'] = u'Windows-1252' # FIXME: Encoding
hell
templates = configurationPage.templatesWithParams()
for (template, params) in templates:
- if template == u'Data ingestion':
+ if template.title(withNamespace=False) == u'Data ingestion':
for param in params:
(field, sep, value) = param.partition(u'=')
# Remove leading or trailing spaces
field = field.strip()
value = value.strip()
+ if not value:
+ value = None
configuration[field] = value
- print(configuration)
+
return configuration
- def downloadPhoto(self, photoUrl=''):
- """
- Download the photo and store it in a io.BytesIO object.
+def main(*args):
+ """
+ Process command line arguments and invoke bot.
- TODO: Add exception handling
- """
- imageFile = urlopen(photoUrl).read()
- return io.BytesIO(imageFile)
+ If args is an empty list, sys.argv is used.
- def findDuplicateImages(self, photo=None, site=pywikibot.Site(u'commons',
u'commons')):
- """
- Takes the photo, calculates the SHA1 hash and asks the MediaWiki api for a list
of duplicates.
-
- TODO: Add exception handling, fix site thing
- """
- hashObject = hashlib.sha1()
- hashObject.update(photo.getvalue())
- return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
-
- def getTitle(self, metadata):
- """
- Build a title.
- Have titleFormat to indicate how the title would look.
- We need to be able to strip off stuff if it's too long.
configuration.get('maxTitleLength')
- """
-
- #FIXME: Make this configurable.
- title = self.configuration.get('titleFormat') % metadata
-
- description = metadata.get(u'dc:title')
- identifier = metadata.get(u'dc:identifier')
-
- if len(description) > 120:
- description = description[0 : 120]
-
- title = u'%s - %s.jpg' % (description, identifier)
-
- return flickrripper.cleanUpTitle(title)
-
- def cleanDate(self, field):
- """
- A function to do date clean up.
- """
- # Empty, make it really empty
- if field == u'-':
- return u''
- # TODO: Circa
- # TODO: Period
-
- return field
-
- def cleanEmptyField(self, field):
- return field
-
- def procesFile(self, metadata):
- # FIXME: Do some metadata enrichment
- #metadata = getEuropeanaMetadata(metadata)
-
- fileLocation = metadata.get(self.configuration.get('sourceFileField'))
-
- photo = self.downloadPhoto(fileLocation)
- duplicates = self.findDuplicateImages(photo)
-
- # We don't want to upload dupes
- if duplicates:
- pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
- # The file is at Commons so return True
- return True
-
- # FIXME: Do some checking to see if the title already exists
-
- title = self.getTitle(metadata)
- description = self.getDescription(metadata)
-
-
- pywikibot.output(u'Preparing upload for %s.' % title)
- pywikibot.output(description)
-
- bot = upload.UploadRobot(url=fileLocation, description=description,
useFilename=title, keepFilename=True, verifyDescription=False, targetSite = self.site)
- bot.run()
-
- def processCSV(self):
- database = {}
-
- reader = csv.DictReader(open(self.configuration.get('csvFile'),
"rb"), dialect=self.configuration.get('csvDialect'),
delimiter=self.configuration.csvDelimiter)
- # FIXME : Encoding problems
https://docs.python.org/2/library/csv.html#csv-examples
- for row in reader:
- self.metadataCSV(row)
- self.processFile(metadata)
-
- def run(self):
- """
- Do crap
- """
- if not self.configuration.get('sourceFormat'):
- pywikibot.output(u'The field "sourceFormat" is not set')
- return False
-
- if self.configuration.get('sourceFormat') == u'csv':
- self.processCSV()
- else:
- pywikibot.output(u'%s is not a supported source format')
-
-def main():
- generator = None;
-
+ @param args: command line arguments
+ @type args: list of unicode
+ """
# Process global args and prepare generator args parser
- local_args = pywikibot.handleArgs()
+ local_args = pywikibot.handle_args(args)
genFactory = pagegenerators.GeneratorFactory()
+ csv_dir = None
for arg in local_args:
- genFactory.handleArg(arg)
+ if arg.startswith('-csvdir:'):
+ csv_dir = arg[8:]
+ else:
+ genFactory.handleArg(arg)
- generator = genFactory.getCombinedGenerator()
- if not generator:
- return False
+ config_generator = genFactory.getCombinedGenerator()
- for page in generator:
- bot = DataIngestionBot(page)
- bot.run()
+ if not config_generator or not csv_dir:
+ pywikibot.showHelp()
+ return
+
+ for config_page in config_generator:
+ try:
+ config_page.get()
+ except pywikibot.NoPage:
+ pywikibot.error('%s does not exist' % config_page)
+ continue
+
+ configuration = DataIngestionBot.parseConfigurationPage(config_page)
+
+ filename = os.path.join(csv_dir, configuration['csvFile'])
+ try:
+
+ f = codecs.open(filename, 'r', configuration['csvEncoding'])
+ except (IOError, OSError) as e:
+ pywikibot.error('%s could not be opened: %s' % (filename, e))
+ continue
+
+ try:
+ files = CSVReader(f, urlcolumn='url',
+ site=config_page.site,
+ dialect=configuration['csvDialect'],
+ delimiter=str(configuration['csvDelimiter']))
+
+ bot = DataIngestionBot(files,
+ configuration['titleFormat'],
+ configuration['formattingTemplate'],
+ site=None)
+
+ bot.run()
+ finally:
+ f.close()
if __name__ == "__main__":
- try:
- main()
- finally:
- print("All done!")
-'''
+ main()
diff --git a/setup.py b/setup.py
index 408c054..649d0b8 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,8 @@
test_deps.extend(extra_deps['rcstream'])
if sys.version_info[0] == 2:
- extra_deps['wikistats-csv'] = ['unicodecsv']
+ # csv is used by wikistats and script data_ingestion
+ extra_deps['csv'] = ['unicodecsv']
script_deps = {
'script_wui.py': ['irc', 'lunatic-python',
'crontab'],
diff --git a/tests/data_ingestion_tests.py b/tests/data_ingestion_tests.py
index b62a999..156343e 100644
--- a/tests/data_ingestion_tests.py
+++ b/tests/data_ingestion_tests.py
@@ -7,7 +7,7 @@
import os
from tests import _data_dir
from tests import _images_dir
-from tests.aspects import unittest, TestCase
+from tests.aspects import unittest, TestCase, ScriptMainTestCase
from scripts import data_ingestion
@@ -33,8 +33,8 @@
'author': 'KDE artists |
Silstor',
'license': 'LGPL',
'set': 'Crystal SVG icon
set',
- 'name': 'Sound icon'}
- )
+ 'name': 'Sound icon'},
+ site=self.get_site('commons'))
def test_downloadPhoto(self):
"""Test download from
http://upload.wikimedia.org/."""
@@ -66,12 +66,14 @@
"""Test CSVReader class."""
- net = False
+ family = 'commons'
+ code = 'commons'
def setUp(self):
super(TestCSVReader, self).setUp()
with open(os.path.join(_data_dir, 'csv_ingestion.csv')) as fileobj:
- self.iterator = data_ingestion.CSVReader(fileobj, 'url')
+ self.iterator = data_ingestion.CSVReader(fileobj, 'url',
+ site=self.get_site())
self.obj = next(self.iterator)
def test_PhotoURL(self):
@@ -93,5 +95,19 @@
}}""") # noqa
+class TestDataIngestionBot(ScriptMainTestCase):
+
+ """Test TestDataIngestionBot class."""
+
+ family = 'commons'
+ code = 'commons'
+
+ def test_existing_file(self):
+ """Test uploading a file that already exists."""
+ data_ingestion.main(
+ '-family:test', '-lang:test', '-csvdir:tests/data',
+ '-page:User:John_Vandenberg/data_ingestion_test_template')
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/script_tests.py b/tests/script_tests.py
index 9a9c88f..ce04236 100644
--- a/tests/script_tests.py
+++ b/tests/script_tests.py
@@ -28,6 +28,7 @@
'script_wui': ['crontab', 'lua'],
# Note: package 'lunatic-python' provides module 'lua'
+ 'data_ingestion': ['unicodecsv'],
'flickrripper': ['flickrapi'],
'match_images': ['PIL.ImageTk'],
'states_redirect': ['pycountry'],
@@ -302,7 +303,6 @@
test_name = 'test_' + script_name + '_help'
dct[test_name] = test_execution(script_name, ['-help'])
if script_name in ['version',
- 'data_ingestion', # bug 68611
'script_wui', # Failing on travis-ci
] + failed_dep_script_list:
dct[test_name] = unittest.expectedFailure(dct[test_name])
@@ -325,7 +325,6 @@
no_args_expected_results)
if script_name in ['catall', # stdout user interaction
'checkimages', # bug 68613
- 'data_ingestion', # bug 68611
'flickrripper', # Requires a flickr api key
'lonelypages', # uses exit code 1
'script_wui', # Error on any user except
DrTrigonBot
diff --git a/tox.ini b/tox.ini
index ae60d99..ee980f4 100644
--- a/tox.ini
+++ b/tox.ini
@@ -68,6 +68,7 @@
scripts/clean_sandbox.py \
scripts/commonscat.py \
scripts/coordinate_import.py \
+ scripts/data_ingestion.py \
scripts/delete.py \
scripts/flickrripper.py \
scripts/harvest_template.py \
@@ -115,6 +116,7 @@
deps =
nose
doctest-ignore-unicode
+ unicodecsv
[testenv:nose34]
basepython = python3
--
To view, visit
https://gerrit.wikimedia.org/r/185666
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I69bf929cf92bc5cb89c801c9a6da83640595626b
Gerrit-PatchSet: 15
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Hashar <hashar(a)free.fr>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Multichill <maarten(a)mdammers.nl>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>