jenkins-bot has submitted this change and it was merged.
Change subject: [FEAT] chars: Generic module for char classes
......................................................................
[FEAT] chars: Generic module for char classes
Add the pywikibot.tools.chars module which handles currently only invisible
characters. This is now used by replace (instead of a script specific
implementation) and the PatchManager class uses this module too to replace
invisible characters with placeholders.
Change-Id: I79c84f6aa5d980e5481e6b441dcd590f00f1a320
---
M pywikibot/diff.py
A pywikibot/tools/chars.py
M scripts/replace.py
A tests/tools_chars_tests.py
4 files changed, 162 insertions(+), 12 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/diff.py b/pywikibot/diff.py
index 5acc741..08f939e 100644
--- a/pywikibot/diff.py
+++ b/pywikibot/diff.py
@@ -26,6 +26,8 @@
BeautifulSoup = False
import pywikibot
+from pywikibot.tools import chars
+
from pywikibot.backports import format_range_unified # introduced in 2.7.2
from pywikibot.tools import deprecated_args
@@ -221,7 +223,8 @@
"""
@deprecated_args(n='context')
- def __init__(self, text_a, text_b, context=0, by_letter=False):
+ def __init__(self, text_a, text_b, context=0, by_letter=False,
+ replace_invisible=False):
"""Constructor.
@param text_a: base text
@@ -233,6 +236,9 @@
@param by_letter: if text_a and text_b are single lines, comparison can be done
letter by letter.
@type by_letter: bool
+ @param replace_invisible: Replace invisible characters like U+200e with
+ the charnumber in brackets (e.g. <200e>).
+ @type replace_invisible: bool
"""
if '\n' in text_a or '\n' in text_b:
self.a = text_a.splitlines(1)
@@ -265,6 +271,7 @@
self.blocks = self.get_blocks()
self.context = context
self._super_hunks = self._generate_super_hunks()
+ self._replace_invisible = replace_invisible
def get_blocks(self):
"""Return list with blocks of indexes which compose a and, where
applicable, b.
@@ -352,7 +359,10 @@
output += extend_context(previous_hunk.a_rng[1], hunk.a_rng[0])
previous_hunk = hunk
output += hunk.diff_text
- return output + extend_context(hunks[-1].a_rng[1], context_range[0][1])
+ output += extend_context(hunks[-1].a_rng[1], context_range[0][1])
+ if self._replace_invisible:
+ output = chars.replace_invisible(output)
+ return output
def review_hunks(self):
"""Review hunks."""
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
new file mode 100644
index 0000000..9a29e24
--- /dev/null
+++ b/pywikibot/tools/chars.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+"""Character based helper functions (not wiki-dependent)."""
+#
+# (C) Pywikibot team, 2015
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import unicode_literals
+
+__version__ = '$Id$'
+
+import sys
+
+from pywikibot.tools import LazyRegex
+
+
+if sys.version_info[0] > 2:
+ unicode = str
+
+
+# All characters in the Cf category in a static list. When testing each Unicode
+# codepoint it takes longer especially when working with UCS2. The lists also
+# differ between Python versions which can be avoided by this static list.
+_category_cf = frozenset([
+    '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603',
+    '\U00000604', '\U0000061c', '\U000006dd', '\U0000070f', '\U0000180e',
+    '\U0000200b', '\U0000200c', '\U0000200d', '\U0000200e', '\U0000200f',
+    '\U0000202a', '\U0000202b', '\U0000202c', '\U0000202d', '\U0000202e',
+    '\U00002060', '\U00002061', '\U00002062', '\U00002063', '\U00002064',
+    '\U00002066', '\U00002067', '\U00002068', '\U00002069', '\U0000206a',
+    '\U0000206b', '\U0000206c', '\U0000206d', '\U0000206e', '\U0000206f',
+    '\U0000feff', '\U0000fff9', '\U0000fffa', '\U0000fffb', '\U000110bd',
+    '\U0001d173', '\U0001d174', '\U0001d175', '\U0001d176', '\U0001d177',
+    '\U0001d178', '\U0001d179', '\U0001d17a', '\U000e0001', '\U000e0020',
+    '\U000e0021', '\U000e0022', '\U000e0023', '\U000e0024', '\U000e0025',
+    '\U000e0026', '\U000e0027', '\U000e0028', '\U000e0029', '\U000e002a',
+    '\U000e002b', '\U000e002c', '\U000e002d', '\U000e002e', '\U000e002f',
+    '\U000e0030', '\U000e0031', '\U000e0032', '\U000e0033', '\U000e0034',
+    '\U000e0035', '\U000e0036', '\U000e0037', '\U000e0038', '\U000e0039',
+    '\U000e003a', '\U000e003b', '\U000e003c', '\U000e003d', '\U000e003e',
+    '\U000e003f', '\U000e0040', '\U000e0041', '\U000e0042', '\U000e0043',
+    '\U000e0044', '\U000e0045', '\U000e0046', '\U000e0047', '\U000e0048',
+    '\U000e0049', '\U000e004a', '\U000e004b', '\U000e004c', '\U000e004d',
+    '\U000e004e', '\U000e004f', '\U000e0050', '\U000e0051', '\U000e0052',
+    '\U000e0053', '\U000e0054', '\U000e0055', '\U000e0056', '\U000e0057',
+    '\U000e0058', '\U000e0059', '\U000e005a', '\U000e005b', '\U000e005c',
+    '\U000e005d', '\U000e005e', '\U000e005f', '\U000e0060', '\U000e0061',
+    '\U000e0062', '\U000e0063', '\U000e0064', '\U000e0065', '\U000e0066',
+    '\U000e0067', '\U000e0068', '\U000e0069', '\U000e006a', '\U000e006b',
+    '\U000e006c', '\U000e006d', '\U000e006e', '\U000e006f', '\U000e0070',
+    '\U000e0071', '\U000e0072', '\U000e0073', '\U000e0074', '\U000e0075',
+    '\U000e0076', '\U000e0077', '\U000e0078', '\U000e0079', '\U000e007a',
+    '\U000e007b', '\U000e007c', '\U000e007d', '\U000e007e', '\U000e007f',
+])
+# This is a set of all invisible characters
+# At the moment we've only added the characters from the Cf category
+_invisible_chars = frozenset(_category_cf)
+
+# TODO: Is that complex and a lazy regex justified?
+invisible_regex = LazyRegex()
+invisible_regex.raw = '[' + ''.join(_invisible_chars) + ']'
+invisible_regex.flags = 0
+
+
+def contains_invisible(text):
+    """Return True if the text contains any of the invisible characters."""
+    return any(char in _invisible_chars for char in text)
+
+
+def replace_invisible(text):
+    """Replace invisible characters by '<codepoint>'."""
+    def replace(match):
+        match = match.group()
+        if sys.maxunicode < 0x10ffff and len(match) == 2:
+            mask = (1 << 10) - 1
+            assert(ord(match[0]) & ~mask == 0xd800)
+            assert(ord(match[1]) & ~mask == 0xdc00)
+            codepoint = (ord(match[0]) & mask) << 10 | (ord(match[1]) & mask)
+        else:
+            codepoint = ord(match)
+        return '<{0:x}>'.format(codepoint)
+    return invisible_regex.sub(replace, text)
diff --git a/scripts/replace.py b/scripts/replace.py
index 7cb7764..7c19f3c 100755
--- a/scripts/replace.py
+++ b/scripts/replace.py
@@ -137,7 +137,6 @@
import re
import time
import sys
-import unicodedata
import pywikibot
from pywikibot import i18n, textlib, pagegenerators, Bot
@@ -145,6 +144,8 @@
# Imports predefined replacements tasks from fixes.py
from pywikibot import fixes
+
+from pywikibot.tools import chars
if sys.version_info[0] > 2:
basestring = (str, )
@@ -667,11 +668,6 @@
return pattern
-def contains_format_characters(text):
-    """Return True when there are format characters (e.g. U+200E) in text."""
-    return any(unicodedata.category(char) == 'Cf' for char in text)
-
-
def main(*args):
"""
Process command line arguments and invoke bot.
@@ -881,12 +877,14 @@
set_summary)
for replacement in fix['replacements']:
summary = None if len(replacement) < 3 else replacement[2]
-        if contains_format_characters(replacement[0]):
+        if chars.contains_invisible(replacement[0]):
             pywikibot.warning('The old string "{0}" contains formatting '
-                              'characters like U+200E'.format(replacement[0]))
+                              'characters like U+200E'.format(
+                                  chars.replace_invisible(replacement[0])))
-        if contains_format_characters(replacement[1]):
+        if chars.contains_invisible(replacement[1]):
             pywikibot.warning('The new string "{0}" contains formatting '
-                              'characters like U+200E'.format(replacement[1]))
+                              'characters like U+200E'.format(
+                                  chars.replace_invisible(replacement[1])))
replacements.append(ReplacementListEntry(
old=replacement[0],
new=replacement[1],
diff --git a/tests/tools_chars_tests.py b/tests/tools_chars_tests.py
new file mode 100644
index 0000000..e7c45d1
--- /dev/null
+++ b/tests/tools_chars_tests.py
@@ -0,0 +1,60 @@
+#!/usr/bin/python
+"""Test tools.chars package."""
+# -*- coding: utf-8 -*-
+#
+# (C) Pywikibot team, 2015
+#
+# Distributed under the terms of the MIT license.
+from __future__ import unicode_literals
+
+__version__ = '$Id$'
+
+import sys
+import unicodedata
+
+from pywikibot.tools import chars
+
+from tests.aspects import unittest, TestCase
+
+
+class CharsTestCase(TestCase):
+
+ """General test case testing the module."""
+
+ net = False
+
+    def test_replace(self):
+        """Test replace_invisible."""
+        self.assertEqual(chars.replace_invisible('Hello world!'), 'Hello world!')
+        self.assertEqual(chars.replace_invisible('\u200eRTL\u200f'), '<200e>RTL<200f>')
+
+ def test_contains(self):
+ """Test contains_invisible."""
+ self.assertFalse(chars.contains_invisible('Hello world!'))
+ self.assertTrue(chars.contains_invisible('\u200eRTL\u200f'))
+
+    def test_category_cf(self):
+        """Test that all characters in _category_cf are actually in Cf."""
+ invalid = {}
+ for char in chars._category_cf:
+ cat = unicodedata.category(char)
+ if cat != 'Cf':
+ invalid[char] = cat
+ if sys.version_info[0] == 2:
+ # These weren't defined in Unicode 5.2 (which is what Py2 is using)
+ self.assertEqual(invalid.pop('\u0604'), 'Cn')
+ self.assertEqual(invalid.pop('\u061c'), 'Cn')
+ self.assertEqual(invalid.pop('\u2066'), 'Cn')
+ self.assertEqual(invalid.pop('\u2067'), 'Cn')
+ self.assertEqual(invalid.pop('\u2068'), 'Cn')
+ self.assertEqual(invalid.pop('\u2069'), 'Cn')
+ # This category has changed between Unicode 6 and 7 to Cf
+ self.assertEqual(invalid.pop('\u180e'), 'Zs')
+ self.assertCountEqual(invalid.items(), [])
+
+
+if __name__ == '__main__':
+ try:
+ unittest.main()
+ except SystemExit:
+ pass
--
To view, visit
https://gerrit.wikimedia.org/r/213332
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I79c84f6aa5d980e5481e6b441dcd590f00f1a320
Gerrit-PatchSet: 4
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>