http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10840
Revision: 10840
Author: multichill
Date: 2012-12-29 14:21:28 +0000 (Sat, 29 Dec 2012)
Log Message:
-----------
Add JSON support. This is used for the RCE batch upload
Modified Paths:
--------------
trunk/pywikipedia/data_ingestion.py
Modified: trunk/pywikipedia/data_ingestion.py
===================================================================
--- trunk/pywikipedia/data_ingestion.py 2012-12-29 13:16:16 UTC (rev 10839)
+++ trunk/pywikipedia/data_ingestion.py 2012-12-29 14:21:28 UTC (rev 10840)
@@ -4,11 +4,11 @@
A generic bot to do data ingestion (batch uploading) to Commons
'''
-import pywikibot
+import pywikibot, upload
import posixpath, urlparse
import urllib
import hashlib, base64
-import StringIO
+import StringIO, json
class Photo(object):
'''
@@ -64,7 +64,7 @@
params = {}
params.update(self.metadata)
params.update(extraparams)
- description = u'{{%s\n' % template
+ description = u'{{subst:%s|subst=subst:\n' % template
for key in sorted(params.keys()):
value = params[key]
if not key.startswith("_"):
@@ -83,6 +83,80 @@
for line in reader:
yield Photo(line[urlcolumn], line)
+
def JSONReader(baseurl, start=0, end=100, JSONBase=None, metadataFunction=None,
               fileurl=u'fileurl'):
    '''
    Loop over a batch of JSON pages and yield a Photo object for each one.

    baseurl          -- url template containing one %-style integer
                        placeholder; it is fetched once for every i in
                        range(start, end). If falsy, nothing is yielded.
    start, end       -- half-open range of page numbers to fetch
    JSONBase         -- optional list of steps used to rebase each JSON object
                        (see JSONRebase) so the uninteresting wrapper fields
                        are skipped
    metadataFunction -- optional callable applied to the flattened metadata
                        for modification and checking; return None to skip
                        the record
    fileurl          -- metadata key expected to contain the file url
                        (default u'fileurl')

    Yields Photo objects built from the flattened metadata.
    '''
    if baseurl:
        for i in range(start, end):
            # FIXME: no recursion/paging support yet, just a flat numeric range
            JSONPage = urllib.urlopen(baseurl % (i,))
            JSONData = json.load(JSONPage)
            JSONPage.close()

            # Rebase so we only keep the subtree we are interested in
            if JSONBase:
                JSONData = JSONRebase(JSONData, JSONBase)

            if JSONData:
                # Rebasing worked: flatten the tree into a metadata dict
                metadata = JSONTree({}, [], JSONData)

                # If a metadataFunction is set, apply it
                if metadataFunction:
                    metadata = metadataFunction(metadata)

                # The metadataFunction returns None when something was wrong;
                # only yield a photo when we still have metadata
                if metadata:
                    yield Photo(metadata.get(fileurl), metadata)
+
def JSONRebase(JSONData, JSONBase):
    '''
    Move the base of the JSON object to the part you're interested in.

    JSONBase is a list of steps (dict keys and/or list indices) used to crawl
    the tree. If one of the steps is not found, return None.
    '''
    for step in JSONBase:
        if JSONData:
            # Bug fix: the original tested type(JSONBase) -- the list of
            # steps, which is always a list -- so the dict branch was dead
            # and a missing dict key raised KeyError instead of yielding
            # None as documented. Dispatch on the data, not on the steps.
            if type(JSONData) == dict:
                JSONData = JSONData.get(step)
            elif type(JSONData) == list:
                # Bounds/type check (resolves the old FIXME): an invalid or
                # out-of-range index means "step not found", i.e. None.
                if isinstance(step, int) and -len(JSONData) <= step < len(JSONData):
                    JSONData = JSONData[step]
                else:
                    JSONData = None
            else:
                # Scalar leaf with steps remaining: the step cannot be found.
                JSONData = None

    return JSONData
+
+
def JSONTree(metadata, fieldlist, record):
    '''
    Recursively flatten a parsed JSON tree into a flat metadata dict.

    metadata: dict accumulating the end result; mutated and returned
    fieldlist: list of dict keys on the path from the root down to this
               record; joined with u'_' to build the flat key
    record: the JSON (sub)tree to work on

    NOTE(review): only unicode leaves are stored -- ints, floats, bools and
    None are silently dropped (Python 2's json decodes every JSON string to
    unicode, so presumably that is the only leaf type expected here).
    List items share their parent's fieldlist, so sibling values collide:
    the first occurrence wins, exactly one duplicate is kept under
    key + u'_2', and any further duplicates are silently discarded.
    '''
    if type(record) == list:
        # Lists do not contribute to the key path; recurse into each item.
        for r in record:
            metadata = JSONTree(metadata, fieldlist, r)
    elif type(record) == dict:
        # Dicts extend the key path with each of their keys.
        for k,v in record.items():
            metadata = JSONTree(metadata, fieldlist + [k], v)
    elif type(record) == unicode:
        # Leaf value: store it under the joined path, e.g. u'foo_bar'.
        key = u'_'.join(fieldlist)
        if not key in metadata:
            metadata[key] = record
        else:
            # Key collision: keep at most one extra value under '<key>_2';
            # a third occurrence of the same path is dropped.
            newkey = key + u'_2'
            if not newkey in metadata:
                metadata[newkey] = record

    return metadata
+
class DataIngestionBot:
def __init__(self, reader, titlefmt, pagefmt,
site=pywikibot.getSite(u'commons', u'commons')):
self.reader = reader
@@ -93,7 +167,6 @@
def _doUpload(self, photo):
duplicates = photo.findDuplicateImages(self.site)
if duplicates:
- pywikibot.output(u"Skipping duplicate of %r" % (duplicates, ))
return duplicates[0]
title = photo.getTitle(self.titlefmt)
@@ -104,6 +177,7 @@
useFilename = title,
keepFilename = True,
verifyDescription = False,
+ ignoreWarning=True,
targetSite = self.site)
bot._contents = photo.downloadPhoto().getvalue()
bot._retrieved = True