jenkins-bot has submitted this change and it was merged.
Change subject: Normalise data_ingestion script
......................................................................
Normalise data_ingestion script
Photo subclasses FilePage
DataIngestionBot subclasses Bot
The previously commented-out parts of data_ingestion are now
integrated into the script.
Bug: T70611
Bug: T75624
Change-Id: I69bf929cf92bc5cb89c801c9a6da83640595626b
---
M scripts/data_ingestion.py
M setup.py
M tests/data_ingestion_tests.py
M tests/script_tests.py
M tox.ini
5 files changed, 153 insertions(+), 182 deletions(-)
Approvals:
XZise: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/data_ingestion.py b/scripts/data_ingestion.py
index 72e22f5..1e44ece 100755
--- a/scripts/data_ingestion.py
+++ b/scripts/data_ingestion.py
@@ -1,6 +1,10 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
-"""A generic bot to do data ingestion (batch uploading) to
Commons."""
+"""
+A generic bot to do data ingestion (batch uploading).
+
+usage: data_ingestion.py -csvdir:local_dir/ -page:config_page
+"""
#
# (C) Pywikibot team, 2013
#
@@ -9,16 +13,25 @@
__version__ = '$Id$'
#
-import posixpath
-import hashlib
import base64
-import sys
+import codecs
+import hashlib
import io
+import os
+import sys
+
+import posixpath
+
+if sys.version_info[0] > 2:
+ import csv
+else:
+ import unicodecsv as csv
import pywikibot
-# TODO: nosetests3 fails on 'import <other_script>', which is used by many
-# of our scripts, but only data_ingestion is directly imported (not via pwb).
-#
https://github.com/nose-devs/nose/issues/839
+
+from pywikibot import pagegenerators
+from pywikibot.tools import deprecated, deprecated_args
+
from scripts import upload
if sys.version_info[0] > 2:
@@ -29,20 +42,23 @@
from urllib import urlopen
-class Photo(object):
+class Photo(pywikibot.FilePage):
- """
- Represents a Photo (or other file), with metadata, to upload to Commons.
+ """Represents a Photo (or other file), with metadata, to be
uploaded."""
- The constructor takes two parameters: URL (string) and metadata (dict with
- str:str key:value pairs) that can be referred to from the title & template
- generation.
+ def __init__(self, URL, metadata, site=None):
+ """
+ Constructor.
+ @param URL: URL of photo
+ @type URL: str
+ @param metadata: metadata about the photo that can be referred to
+ from the title & template
+ @type metadata: dict
+ @param site: target site
+ @type site: APISite
- """
-
- def __init__(self, URL, metadata):
- """Constructor."""
+ """
self.URL = URL
self.metadata = metadata
self.metadata["_url"] = URL
@@ -52,6 +68,13 @@
if ext == filename:
self.metadata["_ext"] = ext = None
self.contents = None
+
+ if not site:
+ site = pywikibot.Site(u'commons', u'commons')
+
+ # default title
+ super(Photo, self).__init__(site,
+ self.getTitle('%(_filename)s.%(_ext)s'))
def downloadPhoto(self):
"""
@@ -64,8 +87,8 @@
self.contents = io.BytesIO(imageFile)
return self.contents
- def findDuplicateImages(self,
- site=pywikibot.Site(u'commons',
u'commons')):
+ @deprecated_args(site=None)
+ def findDuplicateImages(self):
"""
Find duplicates of the photo.
@@ -76,17 +99,23 @@
"""
hashObject = hashlib.sha1()
hashObject.update(self.downloadPhoto().getvalue())
- return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
+ return list(
+ page.title(withNamespace=False) for page in
+ self.site.allimages(sha1=base64.b16encode(hashObject.digest())))
def getTitle(self, fmt):
"""
Populate format string with %(name)s entries using metadata.
+
+ Note: this does not clean the title, so it may be unusable as
+ a MediaWiki page title, and cause an API exception when used.
@param fmt: format string
@type fmt: unicode
@return: formatted string
@rtype: unicode
"""
+ # FIXME: normalise the title so it is usable as a MediaWiki title.
return fmt % self.metadata
def getDescription(self, template, extraparams={}):
@@ -105,31 +134,35 @@
return description
def _safeTemplateValue(self, value):
+ """Replace pipe (|) with {{!}}."""
return value.replace("|", "{{!}}")
-def CSVReader(fileobj, urlcolumn, *args, **kwargs):
+def CSVReader(fileobj, urlcolumn, site=None, *args, **kwargs):
"""CSV reader."""
- import csv
reader = csv.DictReader(fileobj, *args, **kwargs)
-
for line in reader:
- yield Photo(line[urlcolumn], line)
+ yield Photo(line[urlcolumn], line, site=site)
-class DataIngestionBot:
+class DataIngestionBot(pywikibot.Bot):
"""Data ingestion bot."""
def __init__(self, reader, titlefmt, pagefmt,
site=pywikibot.Site(u'commons', u'commons')):
+ """Constructor."""
+ super(DataIngestionBot, self).__init__(generator=reader)
self.reader = reader
self.titlefmt = titlefmt
self.pagefmt = pagefmt
- self.site = site
- def _doUpload(self, photo):
- duplicates = photo.findDuplicateImages(self.site)
+ if site:
+ self.site = site
+
+ def treat(self, photo):
+ """Process each page."""
+ duplicates = photo.findDuplicateImages()
if duplicates:
pywikibot.output(u"Skipping duplicate of %r" % duplicates)
return duplicates[0]
@@ -149,178 +182,98 @@
return title
+ @deprecated("treat()")
def doSingle(self):
- return self._doUpload(next(self.reader))
+ """Process one page."""
+ return self.treat(next(self.reader))
- def run(self):
- for photo in self.reader:
- self._doUpload(photo)
-
-if __name__ == "__main__":
- reader = CSVReader(open('tests/data/csv_ingestion.csv'), 'url')
- bot = DataIngestionBot(
- reader,
- "%(name)s - %(set)s.%(_ext)s",
":user:valhallasw/test_template",
- pywikibot.Site('test', 'test'))
- bot.run()
-
-'''
-class DataIngestionBot:
- def __init__(self, configurationPage):
+ @classmethod
+ def parseConfigurationPage(cls, configurationPage):
"""
+ Parse a Page which contains the configuration.
+ @param configurationPage: page with configuration
+ @type configurationPage: L{pywikibot.Page}
"""
- self.site = configurationPage.site()
- self.configuration = self.parseConfigurationPage(configurationPage)
-
- def parseConfigurationPage(self, configurationPage):
- """
- Expects a pywikibot.page object "configurationPage" which contains the
configuration
- """
- configuration = {}
+ configuration = {}
# Set a bunch of defaults
- configuration['csvDialect']=u'excel'
- configuration['csvDelimiter']=';'
- configuration['csvEncoding']=u'Windows-1252' #FIXME: Encoding
hell
+ configuration['csvDialect'] = u'excel'
+ configuration['csvDelimiter'] = ';'
+ configuration['csvEncoding'] = u'Windows-1252' # FIXME: Encoding
hell
templates = configurationPage.templatesWithParams()
for (template, params) in templates:
- if template == u'Data ingestion':
+ if template.title(withNamespace=False) == u'Data ingestion':
for param in params:
(field, sep, value) = param.partition(u'=')
# Remove leading or trailing spaces
field = field.strip()
value = value.strip()
+ if not value:
+ value = None
configuration[field] = value
- print(configuration)
+
return configuration
- def downloadPhoto(self, photoUrl=''):
- """
- Download the photo and store it in a io.BytesIO object.
+def main(*args):
+ """
+ Process command line arguments and invoke bot.
- TODO: Add exception handling
- """
- imageFile = urlopen(photoUrl).read()
- return io.BytesIO(imageFile)
+ If args is an empty list, sys.argv is used.
- def findDuplicateImages(self, photo=None, site=pywikibot.Site(u'commons',
u'commons')):
- """
- Takes the photo, calculates the SHA1 hash and asks the MediaWiki api for a list
of duplicates.
-
- TODO: Add exception handling, fix site thing
- """
- hashObject = hashlib.sha1()
- hashObject.update(photo.getvalue())
- return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
-
- def getTitle(self, metadata):
- """
- Build a title.
- Have titleFormat to indicate how the title would look.
- We need to be able to strip off stuff if it's too long.
configuration.get('maxTitleLength')
- """
-
- #FIXME: Make this configurable.
- title = self.configuration.get('titleFormat') % metadata
-
- description = metadata.get(u'dc:title')
- identifier = metadata.get(u'dc:identifier')
-
- if len(description) > 120:
- description = description[0 : 120]
-
- title = u'%s - %s.jpg' % (description, identifier)
-
- return flickrripper.cleanUpTitle(title)
-
- def cleanDate(self, field):
- """
- A function to do date clean up.
- """
- # Empty, make it really empty
- if field == u'-':
- return u''
- # TODO: Circa
- # TODO: Period
-
- return field
-
- def cleanEmptyField(self, field):
- return field
-
- def procesFile(self, metadata):
- # FIXME: Do some metadata enrichment
- #metadata = getEuropeanaMetadata(metadata)
-
- fileLocation = metadata.get(self.configuration.get('sourceFileField'))
-
- photo = self.downloadPhoto(fileLocation)
- duplicates = self.findDuplicateImages(photo)
-
- # We don't want to upload dupes
- if duplicates:
- pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
- # The file is at Commons so return True
- return True
-
- # FIXME: Do some checking to see if the title already exists
-
- title = self.getTitle(metadata)
- description = self.getDescription(metadata)
-
-
- pywikibot.output(u'Preparing upload for %s.' % title)
- pywikibot.output(description)
-
- bot = upload.UploadRobot(url=fileLocation, description=description,
useFilename=title, keepFilename=True, verifyDescription=False, targetSite = self.site)
- bot.run()
-
- def processCSV(self):
- database = {}
-
- reader = csv.DictReader(open(self.configuration.get('csvFile'),
"rb"), dialect=self.configuration.get('csvDialect'),
delimiter=self.configuration.csvDelimiter)
- # FIXME : Encoding problems
https://docs.python.org/2/library/csv.html#csv-examples
- for row in reader:
- self.metadataCSV(row)
- self.processFile(metadata)
-
- def run(self):
- """
- Do crap
- """
- if not self.configuration.get('sourceFormat'):
- pywikibot.output(u'The field "sourceFormat" is not set')
- return False
-
- if self.configuration.get('sourceFormat') == u'csv':
- self.processCSV()
- else:
- pywikibot.output(u'%s is not a supported source format')
-
-def main():
- generator = None;
-
+ @param args: command line arguments
+ @type args: list of unicode
+ """
# Process global args and prepare generator args parser
- local_args = pywikibot.handleArgs()
+ local_args = pywikibot.handle_args(args)
genFactory = pagegenerators.GeneratorFactory()
+ csv_dir = None
for arg in local_args:
- genFactory.handleArg(arg)
+ if arg.startswith('-csvdir:'):
+ csv_dir = arg[8:]
+ else:
+ genFactory.handleArg(arg)
- generator = genFactory.getCombinedGenerator()
- if not generator:
- return False
+ config_generator = genFactory.getCombinedGenerator()
- for page in generator:
- bot = DataIngestionBot(page)
- bot.run()
+ if not config_generator or not csv_dir:
+ pywikibot.showHelp()
+ return
+
+ for config_page in config_generator:
+ try:
+ config_page.get()
+ except pywikibot.NoPage:
+ pywikibot.error('%s does not exist' % config_page)
+ continue
+
+ configuration = DataIngestionBot.parseConfigurationPage(config_page)
+
+ filename = os.path.join(csv_dir, configuration['csvFile'])
+ try:
+
+ f = codecs.open(filename, 'r', configuration['csvEncoding'])
+ except (IOError, OSError) as e:
+ pywikibot.error('%s could not be opened: %s' % (filename, e))
+ continue
+
+ try:
+ files = CSVReader(f, urlcolumn='url',
+ site=config_page.site,
+ dialect=configuration['csvDialect'],
+ delimiter=str(configuration['csvDelimiter']))
+
+ bot = DataIngestionBot(files,
+ configuration['titleFormat'],
+ configuration['formattingTemplate'],
+ site=None)
+
+ bot.run()
+ finally:
+ f.close()
if __name__ == "__main__":
- try:
- main()
- finally:
- print("All done!")
-'''
+ main()
diff --git a/setup.py b/setup.py
index 408c054..649d0b8 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,8 @@
test_deps.extend(extra_deps['rcstream'])
if sys.version_info[0] == 2:
- extra_deps['wikistats-csv'] = ['unicodecsv']
+ # csv is used by wikistats and script data_ingestion
+ extra_deps['csv'] = ['unicodecsv']
script_deps = {
'script_wui.py': ['irc', 'lunatic-python',
'crontab'],
diff --git a/tests/data_ingestion_tests.py b/tests/data_ingestion_tests.py
index b62a999..156343e 100644
--- a/tests/data_ingestion_tests.py
+++ b/tests/data_ingestion_tests.py
@@ -7,7 +7,7 @@
import os
from tests import _data_dir
from tests import _images_dir
-from tests.aspects import unittest, TestCase
+from tests.aspects import unittest, TestCase, ScriptMainTestCase
from scripts import data_ingestion
@@ -33,8 +33,8 @@
'author': 'KDE artists |
Silstor',
'license': 'LGPL',
'set': 'Crystal SVG icon
set',
- 'name': 'Sound icon'}
- )
+ 'name': 'Sound icon'},
+ site=self.get_site('commons'))
def test_downloadPhoto(self):
"""Test download from
http://upload.wikimedia.org/."""
@@ -66,12 +66,14 @@
"""Test CSVReader class."""
- net = False
+ family = 'commons'
+ code = 'commons'
def setUp(self):
super(TestCSVReader, self).setUp()
with open(os.path.join(_data_dir, 'csv_ingestion.csv')) as fileobj:
- self.iterator = data_ingestion.CSVReader(fileobj, 'url')
+ self.iterator = data_ingestion.CSVReader(fileobj, 'url',
+ site=self.get_site())
self.obj = next(self.iterator)
def test_PhotoURL(self):
@@ -93,5 +95,19 @@
}}""") # noqa
+class TestDataIngestionBot(ScriptMainTestCase):
+
+ """Test TestDataIngestionBot class."""
+
+ family = 'commons'
+ code = 'commons'
+
+ def test_existing_file(self):
+ """Test uploading a file that already exists."""
+ data_ingestion.main(
+ '-family:test', '-lang:test', '-csvdir:tests/data',
+ '-page:User:John_Vandenberg/data_ingestion_test_template')
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/script_tests.py b/tests/script_tests.py
index 9a9c88f..ce04236 100644
--- a/tests/script_tests.py
+++ b/tests/script_tests.py
@@ -28,6 +28,7 @@
'script_wui': ['crontab', 'lua'],
# Note: package 'lunatic-python' provides module 'lua'
+ 'data_ingestion': ['unicodecsv'],
'flickrripper': ['flickrapi'],
'match_images': ['PIL.ImageTk'],
'states_redirect': ['pycountry'],
@@ -302,7 +303,6 @@
test_name = 'test_' + script_name + '_help'
dct[test_name] = test_execution(script_name, ['-help'])
if script_name in ['version',
- 'data_ingestion', # bug 68611
'script_wui', # Failing on travis-ci
] + failed_dep_script_list:
dct[test_name] = unittest.expectedFailure(dct[test_name])
@@ -325,7 +325,6 @@
no_args_expected_results)
if script_name in ['catall', # stdout user interaction
'checkimages', # bug 68613
- 'data_ingestion', # bug 68611
'flickrripper', # Requires a flickr api key
'lonelypages', # uses exit code 1
'script_wui', # Error on any user except
DrTrigonBot
diff --git a/tox.ini b/tox.ini
index ae60d99..ee980f4 100644
--- a/tox.ini
+++ b/tox.ini
@@ -68,6 +68,7 @@
scripts/clean_sandbox.py \
scripts/commonscat.py \
scripts/coordinate_import.py \
+ scripts/data_ingestion.py \
scripts/delete.py \
scripts/flickrripper.py \
scripts/harvest_template.py \
@@ -115,6 +116,7 @@
deps =
nose
doctest-ignore-unicode
+ unicodecsv
[testenv:nose34]
basepython = python3
--
To view, visit
https://gerrit.wikimedia.org/r/185666
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I69bf929cf92bc5cb89c801c9a6da83640595626b
Gerrit-PatchSet: 15
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Hashar <hashar(a)free.fr>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Multichill <maarten(a)mdammers.nl>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>