jenkins-bot has submitted this change and it was merged.
Change subject: [FEAT] chars: Generic module for char classes
......................................................................
[FEAT] chars: Generic module for char classes
Add the pywikibot.tools.chars module which handles currently only invisible
characters. This is now used by replace (instead of a script specific
implementation) and the PatchManager class uses this module too to replace
invisible characters with placeholders.
Change-Id: I79c84f6aa5d980e5481e6b441dcd590f00f1a320
---
M pywikibot/diff.py
A pywikibot/tools/chars.py
M scripts/replace.py
A tests/tools_chars_tests.py
4 files changed, 162 insertions(+), 12 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/diff.py b/pywikibot/diff.py
index 5acc741..08f939e 100644
--- a/pywikibot/diff.py
+++ b/pywikibot/diff.py
@@ -26,6 +26,8 @@
BeautifulSoup = False
import pywikibot
+from pywikibot.tools import chars
+
from pywikibot.backports import format_range_unified # introduced in 2.7.2
from pywikibot.tools import deprecated_args
@@ -221,7 +223,8 @@
"""
@deprecated_args(n='context')
- def __init__(self, text_a, text_b, context=0, by_letter=False):
+ def __init__(self, text_a, text_b, context=0, by_letter=False,
+ replace_invisible=False):
"""Constructor.
@param text_a: base text
@@ -233,6 +236,9 @@
@param by_letter: if text_a and text_b are single lines, comparison can be done
letter by letter.
@type by_letter: bool
+ @param replace_invisible: Replace invisible characters like U+200e with
+ the charnumber in brackets (e.g. <200e>).
+ @type replace_invisible: bool
"""
if '\n' in text_a or '\n' in text_b:
self.a = text_a.splitlines(1)
@@ -265,6 +271,7 @@
self.blocks = self.get_blocks()
self.context = context
self._super_hunks = self._generate_super_hunks()
+ self._replace_invisible = replace_invisible
def get_blocks(self):
"""Return list with blocks of indexes which compose a and, where
applicable, b.
@@ -352,7 +359,10 @@
output += extend_context(previous_hunk.a_rng[1], hunk.a_rng[0])
previous_hunk = hunk
output += hunk.diff_text
- return output + extend_context(hunks[-1].a_rng[1], context_range[0][1])
+ output += extend_context(hunks[-1].a_rng[1], context_range[0][1])
+ if self._replace_invisible:
+ output = chars.replace_invisible(output)
+ return output
def review_hunks(self):
"""Review hunks."""
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
new file mode 100644
index 0000000..9a29e24
--- /dev/null
+++ b/pywikibot/tools/chars.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+"""Character based helper functions (not wiki-dependent)."""
+#
+# (C) Pywikibot team, 2015
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import unicode_literals
+
+__version__ = '$Id$'
+
+import sys
+
+from pywikibot.tools import LazyRegex
+
+
+if sys.version_info[0] > 2:
+ unicode = str
+
+
+# All characters in the Cf category in a static list. When testing each Unicode
+# codepoint it takes longer especially when working with UCS2. The lists also
+# differ between Python versions which can be avoided by this static list.
+_category_cf = frozenset([
+    '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603',
+    '\U00000604', '\U0000061c', '\U000006dd', '\U0000070f', '\U0000180e',
+    '\U0000200b', '\U0000200c', '\U0000200d', '\U0000200e', '\U0000200f',
+    '\U0000202a', '\U0000202b', '\U0000202c', '\U0000202d', '\U0000202e',
+    '\U00002060', '\U00002061', '\U00002062', '\U00002063', '\U00002064',
+    '\U00002066', '\U00002067', '\U00002068', '\U00002069', '\U0000206a',
+    '\U0000206b', '\U0000206c', '\U0000206d', '\U0000206e', '\U0000206f',
+    '\U0000feff', '\U0000fff9', '\U0000fffa', '\U0000fffb', '\U000110bd',
+    '\U0001d173', '\U0001d174', '\U0001d175', '\U0001d176', '\U0001d177',
+    '\U0001d178', '\U0001d179', '\U0001d17a', '\U000e0001', '\U000e0020',
+    '\U000e0021', '\U000e0022', '\U000e0023', '\U000e0024', '\U000e0025',
+    '\U000e0026', '\U000e0027', '\U000e0028', '\U000e0029', '\U000e002a',
+    '\U000e002b', '\U000e002c', '\U000e002d', '\U000e002e', '\U000e002f',
+    '\U000e0030', '\U000e0031', '\U000e0032', '\U000e0033', '\U000e0034',
+    '\U000e0035', '\U000e0036', '\U000e0037', '\U000e0038', '\U000e0039',
+    '\U000e003a', '\U000e003b', '\U000e003c', '\U000e003d', '\U000e003e',
+    '\U000e003f', '\U000e0040', '\U000e0041', '\U000e0042', '\U000e0043',
+    '\U000e0044', '\U000e0045', '\U000e0046', '\U000e0047', '\U000e0048',
+    '\U000e0049', '\U000e004a', '\U000e004b', '\U000e004c', '\U000e004d',
+    '\U000e004e', '\U000e004f', '\U000e0050', '\U000e0051', '\U000e0052',
+    '\U000e0053', '\U000e0054', '\U000e0055', '\U000e0056', '\U000e0057',
+    '\U000e0058', '\U000e0059', '\U000e005a', '\U000e005b', '\U000e005c',
+    '\U000e005d', '\U000e005e', '\U000e005f', '\U000e0060', '\U000e0061',
+    '\U000e0062', '\U000e0063', '\U000e0064', '\U000e0065', '\U000e0066',
+    '\U000e0067', '\U000e0068', '\U000e0069', '\U000e006a', '\U000e006b',
+    '\U000e006c', '\U000e006d', '\U000e006e', '\U000e006f', '\U000e0070',
+    '\U000e0071', '\U000e0072', '\U000e0073', '\U000e0074', '\U000e0075',
+    '\U000e0076', '\U000e0077', '\U000e0078', '\U000e0079', '\U000e007a',
+    '\U000e007b', '\U000e007c', '\U000e007d', '\U000e007e', '\U000e007f',
+])
+# This is a set of all invisible characters
+# At the moment we've only added the characters from the Cf category
+_invisible_chars = frozenset(_category_cf)
+
+# TODO: Is that complex and a lazy regex justified?
+invisible_regex = LazyRegex()
+invisible_regex.raw = '[' + ''.join(_invisible_chars) + ']'
+invisible_regex.flags = 0
+
+
+def contains_invisible(text):
+    """Return True if the text contains any of the invisible characters."""
+    return any(char in _invisible_chars for char in text)
+
+
+def replace_invisible(text):
+    """Replace invisible characters by '<codepoint>'."""
+    def replace(match):
+        match = match.group()
+        if sys.maxunicode < 0x10ffff and len(match) == 2:
+            mask = (1 << 10) - 1
+            assert(ord(match[0]) & ~mask == 0xd800)
+            assert(ord(match[1]) & ~mask == 0xdc00)
+            codepoint = (ord(match[0]) & mask) << 10 | (ord(match[1]) & mask)
+        else:
+            codepoint = ord(match)
+        return '<{0:x}>'.format(codepoint)
+    return invisible_regex.sub(replace, text)
diff --git a/scripts/replace.py b/scripts/replace.py
index 7cb7764..7c19f3c 100755
--- a/scripts/replace.py
+++ b/scripts/replace.py
@@ -137,7 +137,6 @@
import re
import time
import sys
-import unicodedata
import pywikibot
from pywikibot import i18n, textlib, pagegenerators, Bot
@@ -145,6 +144,8 @@
# Imports predefined replacements tasks from fixes.py
from pywikibot import fixes
+
+from pywikibot.tools import chars
if sys.version_info[0] > 2:
basestring = (str, )
@@ -667,11 +668,6 @@
return pattern
-def contains_format_characters(text):
-    """Return True when there are format characters (e.g. U+200E) in text."""
-    return any(unicodedata.category(char) == 'Cf' for char in text)
-
-
def main(*args):
"""
Process command line arguments and invoke bot.
@@ -881,12 +877,14 @@
set_summary)
for replacement in fix['replacements']:
summary = None if len(replacement) < 3 else replacement[2]
-        if contains_format_characters(replacement[0]):
+        if chars.contains_invisible(replacement[0]):
             pywikibot.warning('The old string "{0}" contains formatting '
-                              'characters like U+200E'.format(replacement[0]))
+                              'characters like U+200E'.format(
+                                  chars.replace_invisible(replacement[0])))
-        if contains_format_characters(replacement[1]):
+        if chars.contains_invisible(replacement[1]):
             pywikibot.warning('The new string "{0}" contains formatting '
-                              'characters like U+200E'.format(replacement[1]))
+                              'characters like U+200E'.format(
+                                  chars.replace_invisible(replacement[1])))
replacements.append(ReplacementListEntry(
old=replacement[0],
new=replacement[1],
diff --git a/tests/tools_chars_tests.py b/tests/tools_chars_tests.py
new file mode 100644
index 0000000..e7c45d1
--- /dev/null
+++ b/tests/tools_chars_tests.py
@@ -0,0 +1,60 @@
+#!/usr/bin/python
+"""Test tools.chars package."""
+# -*- coding: utf-8 -*-
+#
+# (C) Pywikibot team, 2015
+#
+# Distributed under the terms of the MIT license.
+from __future__ import unicode_literals
+
+__version__ = '$Id$'
+
+import sys
+import unicodedata
+
+from pywikibot.tools import chars
+
+from tests.aspects import unittest, TestCase
+
+
+class CharsTestCase(TestCase):
+
+ """General test case testing the module."""
+
+ net = False
+
+    def test_replace(self):
+        """Test replace_invisible."""
+        self.assertEqual(chars.replace_invisible('Hello world!'), 'Hello world!')
+        self.assertEqual(chars.replace_invisible('\u200eRTL\u200f'), '<200e>RTL<200f>')
+
+ def test_contains(self):
+ """Test contains_invisible."""
+ self.assertFalse(chars.contains_invisible('Hello world!'))
+ self.assertTrue(chars.contains_invisible('\u200eRTL\u200f'))
+
+    def test_category_cf(self):
+        """Test that all characters in _category_cf are actually in Cf."""
+ invalid = {}
+ for char in chars._category_cf:
+ cat = unicodedata.category(char)
+ if cat != 'Cf':
+ invalid[char] = cat
+ if sys.version_info[0] == 2:
+ # These weren't defined in Unicode 5.2 (which is what Py2 is using)
+ self.assertEqual(invalid.pop('\u0604'), 'Cn')
+ self.assertEqual(invalid.pop('\u061c'), 'Cn')
+ self.assertEqual(invalid.pop('\u2066'), 'Cn')
+ self.assertEqual(invalid.pop('\u2067'), 'Cn')
+ self.assertEqual(invalid.pop('\u2068'), 'Cn')
+ self.assertEqual(invalid.pop('\u2069'), 'Cn')
+ # This category has changed between Unicode 6 and 7 to Cf
+ self.assertEqual(invalid.pop('\u180e'), 'Zs')
+ self.assertCountEqual(invalid.items(), [])
+
+
+if __name__ == '__main__':
+ try:
+ unittest.main()
+ except SystemExit:
+ pass
--
To view, visit
https://gerrit.wikimedia.org/r/213332
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I79c84f6aa5d980e5481e6b441dcd590f00f1a320
Gerrit-PatchSet: 4
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>