jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/360575 )
Change subject: proofreadpage.py: add support for OCR feature
......................................................................
proofreadpage.py: add support for OCR feature
ProofreadPage:
- allow to perform OCR via
https://tools.wmflabs.org/phetools tool
wikisourcetext.py:
- make use of the new feature to allow OCR.
Bug: T159655
Change-Id: Id9fdb63da32e205bb898f184402c95fedcc176f4
---
M pywikibot/proofreadpage.py
M scripts/wikisourcetext.py
M tests/proofreadpage_tests.py
3 files changed, 345 insertions(+), 28 deletions(-)
Approvals:
jenkins-bot: Verified
Xqt: Looks good to me, approved
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 60546c4..13dbce3 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
"""
-Objects representing objects used with ProofreadPage Extension.
+Objects used with ProofreadPage Extension.
The extension is supported by MW 1.21+.
@@ -10,9 +10,15 @@
* FullHeader
* IndexPage(Page)
+
+OCR support of page scans via:
+-
https://tools.wmflabs.org/phetools/hocr_cgi.py
+-
https://tools.wmflabs.org/phetools/ocr.php
+inspired by
https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
+
"""
#
-# (C) Pywikibot team, 2015-2016
+# (C) Pywikibot team, 2015-2017
#
# Distributed under the terms of the MIT license.
#
@@ -30,7 +36,10 @@
import pywikibot
+from pywikibot.comms import http
from pywikibot.data.api import Request
+
+_logger = 'proofreadpage'
class FullHeader(object):
@@ -97,6 +106,15 @@
p_open = re.compile(r'<noinclude>')
p_close = re.compile(r'(</div>|\n\n\n)?</noinclude>')
+ # phe-tools ocr utility
+ HOCR_CMD = ('https://tools.wmflabs.org/phetools/hocr_cgi.py?'
+ 'cmd=hocr&book={book}&lang={lang}&user={user}')
+
+ OCR_CMD = ('https://tools.wmflabs.org/phetools/ocr.php?'
+ 'cmd=ocr&url={url_image}&lang={lang}&user={user}')
+
+ MULTI_PAGE_EXT = ['djvu', 'pdf']
+
def __init__(self, source, title=''):
"""Instantiate a ProofreadPage object.
@@ -116,12 +134,55 @@
% (self.site.proofread_levels.keys(),
self.PROOFREAD_LEVELS))
+ self._base, self._base_ext, self._num = self._parse_title()
+ self._multi_page = self._base_ext in self.MULTI_PAGE_EXT
+
@property
def _fmt(self):
if self._full_header._has_div:
return self._FMT % '</div>'
else:
return self._FMT % ''
+
+ def _parse_title(self):
+ """Get ProofreadPage base title, base extension and page number.
+
+ Base title is the part of title before the last '/', if any,
+ or the whole title if no '/' is present.
+
+ Extension is the extension of the base title.
+
+ Page number is the part of title after the last '/', if any,
+ or None if no '/' is present.
+
+ E.g. for title 'Page:Popular Science Monthly Volume 1.djvu/12':
+ - base = 'Popular Science Monthly Volume 1.djvu'
+ - extension = 'djvu'
+ - number = 12
+
+ E.g. for title 'Page:Original Waltzing Matilda manuscript.jpg':
+ - base = 'Original Waltzing Matilda manuscript.jpg'
+ - extension = 'jpg'
+ - number = None
+
+ @return: (base, ext, num).
+ @rtype: tuple
+ """
+ left, sep, right = self.title(withNamespace=False).rpartition('/')
+ if sep:
+ base = left
+ num = int(right)
+ else:
+ base = right
+ num = None
+
+ left, sep, right = base.rpartition('.')
+ if sep:
+ ext = right
+ else:
+ ext = ''
+
+ return (base, ext, num)
@property
def index(self):
@@ -139,32 +200,33 @@
"""
if not hasattr(self, '_index'):
index_ns = self.site.proofread_index_ns
- what_links_here = [IndexPage(page) for
- page in self.getReferences(namespaces=index_ns)]
+ what_links_here = [IndexPage(page) for page in
+ set(self.getReferences(namespaces=index_ns))]
if not what_links_here:
self._index = (None, [])
elif len(what_links_here) == 1:
- self._index = (what_links_here[0], [])
+ self._index = (what_links_here.pop(), [])
else:
self._index = (None, what_links_here)
- # Try to infer names form page titles.
- base, sep, num = self.title(withNamespace=False).rpartition('/')
- if sep == '/':
+ # Try to infer names from page titles.
+ if self._num is not None:
for page in what_links_here:
- if page.title(withNamespace=False) == base:
+ if page.title(withNamespace=False) == self._base:
what_links_here.remove(page)
self._index = (page, what_links_here)
break
page, others = self._index
if others:
- pywikibot.warning('Page %s is linked to several Index pages: %s.'
- % (self, others))
+ pywikibot.warning('%s linked to several Index pages.' % self)
+ pywikibot.output('{0}{1!s}'.format(' ' * 9, [page] +
others))
+
if page:
- pywikibot.warning(' %s selected as Index.' % page)
- pywikibot.warning(' %s remaining.' % others)
- elif not page:
+ pywikibot.output('{0}Selected Index: {1}'.format(' ' * 9,
page))
+ pywikibot.output('{0}remaining: {1!s}'.format(' ' * 9,
others))
+
+ if not page:
pywikibot.warning('Page %s is not linked to any Index page.'
% self)
@@ -435,6 +497,143 @@
"""
return '/* {0.status} */ '.format(self)
+ @property
+ def url_image(self):
+ """Get the file url of the scan of ProofreadPage.
+
+ @return: file url of the scan ProofreadPage or None.
+ @rtype: str/unicode
+
+ @raises:
+ - Exception in case of http errors.
+ """
+ # wrong link fail with various possible Exceptions.
+ if not hasattr(self, '_url_image'):
+
+ if self.exists():
+ url = self.full_url()
+ else:
+ path = 'w/index.php?title={0}&action=edit&redlink=1'
+ url = self.site.base_url(path.format(self.title(asUrl=True)))
+
+ try:
+ response = http.fetch(url, charset='utf-8')
+ except Exception:
+ pywikibot.error('Error fetching HTML for %s.' % self)
+ raise
+
+ soup = BeautifulSoup(response.content, 'lxml')
+
+ try:
+ # None if nothing is found by .find()
+ self._url_image = soup.find(class_='prp-page-image')
+ self._url_image = self._url_image.find('img')
+ # if None raises TypeError.
+ self._url_image = self._url_image['src']
+ except TypeError:
+ raise ValueError('No prp-page-image src found for %s.' % self)
+ else:
+ self._url_image = 'https:' + self._url_image
+
+ return self._url_image
+
+ def _ocr_callback(self, cmd_uri, parser_func=None):
+ """OCR callback function.
+
+ @return: tuple (error, text [error description in case of error]).
+ """
+ def id(x):
+ return x
+
+ if not cmd_uri:
+ raise ValueError('Parameter cmd_uri is mandatory.')
+
+ if parser_func is None:
+ parser_func = id
+
+ if not callable(parser_func):
+ raise TypeError('Keyword parser_func must be callable.')
+
+ # wrong link fail with Exceptions
+ try:
+ response = http.fetch(cmd_uri, charset='utf-8')
+ except Exception as e:
+ pywikibot.error('Querying %s: %s' % (cmd_uri, e))
+ return (True, e)
+
+ data = json.loads(response.content)
+
+ assert 'error' in data, 'Error from phe-tools: %s' % data
+ assert data['error'] in [0, 1], 'Error from phe-tools: %s' %
data
+
+ error = bool(data['error'])
+ if error:
+ pywikibot.error('Querying %s: %s' % (cmd_uri, data['text']))
+ return (error, data['text'])
+ else:
+ return (error, parser_func(data['text']))
+
+ def _do_hocr(self):
+ """Do hocr using
//tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr."""
+ def parse_hocr_text(txt):
+ """Parse hocr text."""
+ soup = BeautifulSoup(txt, 'lxml')
+
+ res = []
+ for ocr_page in soup.find_all(class_='ocr_page'):
+ for area in soup.find_all(class_='ocr_carea'):
+ for par in area.find_all(class_='ocr_par'):
+ for line in par.find_all(class_='ocr_line'):
+ res.append(line.get_text())
+ res.append('\n')
+ return ''.join(res)
+
+ params = {'book': self.title(asUrl=True, withNamespace=False),
+ 'lang': self.site.lang,
+ 'user': self.site.user(),
+ }
+
+ cmd_uri = self.HOCR_CMD.format(**params)
+
+ return self._ocr_callback(cmd_uri, parser_func=parse_hocr_text)
+
+ def _do_ocr(self):
+ """Do ocr using
//tools.wmflabs.org/phetools/ocr.php?cmd=ocr."""
+ try:
+ url_image = self.url_image
+ except ValueError:
+ error_text = 'No prp-page-image src found for %s.' % self
+ pywikibot.error(error_text)
+ return (True, error_text)
+
+ params = {'url_image': url_image,
+ 'lang': self.site.lang,
+ 'user': self.site.user(),
+ }
+
+ cmd_uri = self.OCR_CMD.format(**params)
+
+ return self._ocr_callback(cmd_uri)
+
+ def ocr(self):
+ """Do OCR of ProofreadPage scan.
+
+ The text returned by this function shall be assigned to self.body,
+ otherwise the ProofreadPage format will not be maintained.
+
+ It is the user's responsibility to reset quality level accordingly.
+ """
+ if self._multi_page:
+ error, text = self._do_hocr()
+ if not error:
+ return text
+
+ error, text = self._do_ocr()
+ if not error:
+ return text
+ else:
+ raise ValueError('Not possible to perform HOCR/OCR on %s.' % self)
+
class PurgeRequest(Request):
@@ -604,11 +803,9 @@
if page in self._labels_from_page:
break
- # Divide page title in base title and page number.
- base_title, sep, page_number = title.rpartition('/')
# Sanity check if WS site use page convention name/number.
- if sep == '/':
- assert page_cnt == int(page_number), (
+ if page._num is not None:
+ assert page_cnt == int(page._num), (
'Page number %s not recognised as page %s.'
% (page_cnt, title))
diff --git a/scripts/wikisourcetext.py b/scripts/wikisourcetext.py
index 6ee8ae9..6ae1f92 100644
--- a/scripts/wikisourcetext.py
+++ b/scripts/wikisourcetext.py
@@ -4,12 +4,20 @@
This bot applies to wikisource sites to upload text.
-Text is uploaded to not-(yet)-existing pages in Page ns, for a specified Index.
+Text is uploaded to pages in Page ns, for a specified Index.
Text to be stored, if the page is not-existing, is preloaded from the file used
to create the Index page, making the upload feature independent from the format
of the file, as long as it is supported by the MW ProofreadPage extension.
+As an alternative, if '-ocr' option is selected,
https://tools.wmflabs.org/phetools
+OCR tool will be used to get text.
+In this case, also already existing pages with quality value 'Not Proofread'
+can be treated. '-force' will override existing page in this case.
+
The following parameters are supported:
+
+# TODO: update params + handle quality level
+
-index:... name of the index page
@@ -22,11 +30,20 @@
A -> just page A
-B -> pages 1 until B
+ -showdiff: show difference between current text and new text when
+ saving the page
+
+ -ocr: use
https://tools.wmflabs.org/phetools OCR tool to get text;
+ default is False, i.e. only not-(yet)-existing pages
+ in Page ns will be treated and text will be fetched via preload.
+
+ -force: overwrite existing pages;
+ default is False; valid only if '-ocr' is selected.
+
-summary: custom edit summary.
Use quotes if edit summary contains spaces.
-always don't bother asking to confirm any of the changes.
-
"""
#
# (C) Pywikibot team, 2016-2017
@@ -44,8 +61,7 @@
from pywikibot import i18n
from pywikibot.bot import SingleSiteBot
-from pywikibot.proofreadpage import IndexPage
-from pywikibot.tools import issue_deprecation_warning
+from pywikibot.proofreadpage import IndexPage, ProofreadPage
class UploadTextBot(SingleSiteBot):
@@ -67,6 +83,9 @@
@type generator: generator
"""
self.availableOptions.update({
+ 'showdiff': False,
+ 'force': False,
+ 'ocr': False,
'summary': 'Bot: uploading text'
})
super(UploadTextBot, self).__init__(**kwargs)
@@ -79,15 +98,31 @@
self.site, 'djvutext-creating')
def treat(self, page):
- """Process one page."""
- old_text = ''
- new_text = page.text
+ """Process one ProofreadPage page.
+
+ @param page: page to be treated.
+ @type page: ProofreadPage
+ @raises: pywikibot.Error
+ """
+ if not isinstance(page, ProofreadPage):
+ raise pywikibot.Error('Page %s must be a ProofreadPage object.'
+ % page)
summary = self.getOption('summary')
+
if page.exists():
+ old_text = page.text
+ else:
+ old_text = ''
+
+ if self.getOption('ocr'):
+ page.body = page.ocr()
+
+ if (page.exists() and
+ not (self.getOption('ocr') and
self.getOption('force'))):
pywikibot.output('Page %s already exists, not adding!' % page)
else:
- self.userPut(page, old_text, new_text, summary=summary,
+ self.userPut(page, old_text, page.text, summary=summary,
show_diff=self.getOption('showdiff'))
@@ -113,11 +148,13 @@
elif arg == '-pages':
pages = value
elif arg == '-showdiff':
- issue_deprecation_warning('The usage of -showdiff option', None, 0)
+ options['showdiff'] = True
elif arg == '-summary':
options['summary'] = value
+ elif arg == '-ocr':
+ options['ocr'] = True
elif arg == '-force':
- issue_deprecation_warning('The usage of -force option', None, 0)
+ options['force'] = True
elif arg == '-always':
options['always'] = True
else:
@@ -126,6 +163,11 @@
# index is mandatory.
if not index:
pywikibot.bot.suggest_help(missing_parameters=['-index'])
+ return False
+
+ # '-force' can be used with '-ocr' only.
+ if 'force' in options and 'ocr' not in options:
+ pywikibot.error("'-force' can be used with '-ocr' option
only.")
return False
site = pywikibot.Site()
@@ -151,6 +193,7 @@
end = int(end) if end else index.num_pages
pages[interval] = (start, end)
+ # gen yields ProofreadPage objects.
gen_list = []
for start, end in sorted(pages):
gen = index.page_gen(start=start, end=end,
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 49b6e75..0e17ae1 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -73,6 +73,62 @@
self._test_page_text()
+class TestProofreadPageParseTitle(TestCase):
+
+ """Test ProofreadPage._parse_title() function."""
+
+ cached = True
+
+ # Use sites to run parametrized tests.
+ sites = {
+ '1': {
+ 'family': 'wikisource', 'code': 'en',
+ 'title': 'Page:Test.djvu/12',
+ 'tuple': ('Test.djvu', 'djvu', 12),
+ },
+ '2': {
+ 'family': 'wikisource', 'code': 'en',
+ 'title': 'Page:Test djvu/12',
+ 'tuple': ('Test djvu', '', 12),
+ },
+ '3': {
+ 'family': 'wikisource', 'code': 'en',
+ 'title': 'Page:Test.jpg/12',
+ 'tuple': ('Test.jpg', 'jpg', 12),
+ },
+ '4': {
+ 'family': 'wikisource', 'code': 'en',
+ 'title': 'Page:Test jpg/12',
+ 'tuple': ('Test jpg', '', 12),
+ },
+ '5': {
+ 'family': 'wikisource', 'code': 'en',
+ 'title': 'Page:Test.jpg',
+ 'tuple': ('Test.jpg', 'jpg', None),
+ },
+ '6': {
+ 'family': 'wikisource', 'code': 'en',
+ 'title': 'Page:Test jpg',
+ 'tuple': ('Test jpg', '', None),
+ },
+ }
+
+ @classmethod
+ def setUpClass(cls):
+ """Prepare get_page dataset for tests."""
+ super(TestProofreadPageParseTitle, cls).setUpClass()
+
+ def test_parse_title(self, key):
+ """Test ProofreadPage_parse_title() function."""
+ data = self.sites[key]
+ title = data['title']
+ base, base_ext, num = data['tuple']
+ page = ProofreadPage(self.site, title)
+ self.assertEqual(page._base, base)
+ self.assertEqual(page._base_ext, base_ext)
+ self.assertEqual(page._num, num)
+
+
class TestProofreadPageValidSite(TestCase):
"""Test ProofreadPage class."""
@@ -89,6 +145,15 @@
'user': 'T. Mazzei',
'header': u"{{rh|2|''THE POPULAR SCIENCE
MONTHLY.''}}",
'footer': u'\n{{smallrefs}}',
+ 'url_image':
('https://upload.wikimedia.org/wikipedia/commons/thumb/a/ac/'
+ 'Popular_Science_Monthly_Volume_1.djvu/'
+
'page12-1024px-Popular_Science_Monthly_Volume_1.djvu.jpg'),
+ }
+
+ valid_redlink = {
+ 'title': 'Page:Pywikibot test page 3.jpg',
+ 'url_image': ('https://upload.wikimedia.org/wikisource/en/3/37/'
+ 'Pywikibot_test_page_3.jpg'),
}
existing_invalid = {
@@ -218,6 +283,18 @@
page_text = page._page_to_json()
self.assertEqual(json.loads(page_text), json.loads(loaded_text))
+ def test_url_image(self):
+ """Test fetching of url image of the scan of
ProofreadPage."""
+ page = ProofreadPage(self.site, self.valid['title'])
+ self.assertEqual(page.url_image, self.valid['url_image'])
+
+ page = ProofreadPage(self.site, self.valid_redlink['title'])
+ self.assertEqual(page.url_image, self.valid_redlink['url_image'])
+
+ page = ProofreadPage(self.site, self.existing_unlinked['title'])
+ # test Exception in property.
+ self.assertRaises(ValueError, getattr, page, 'url_image')
+
class TestPageQuality(TestCase):
--
To view, visit
https://gerrit.wikimedia.org/r/360575
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Id9fdb63da32e205bb898f184402c95fedcc176f4
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Magul <tomasz.magulski(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>