http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9911
Revision: 9911
Author: drtrigon
Date: 2012-02-19 14:24:41 +0000 (Sun, 19 Feb 2012)
Log Message:
-----------
Adding capabilities of DrTrigonBot 'textlib' script; 'removeHTMLParts'
(this is a follow-up or bug fix for r9902 also)
Modified Paths:
--------------
trunk/pywikipedia/pywikibot/textlib.py
Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py 2012-02-19 12:49:35 UTC (rev 9910)
+++ trunk/pywikipedia/pywikibot/textlib.py 2012-02-19 14:24:41 UTC (rev 9911)
@@ -16,6 +16,7 @@
import wikipedia as pywikibot
import re
+from HTMLParser import HTMLParser
def unescape(s):
"""Replace escaped HTML-special characters by their
originals"""
@@ -219,6 +220,40 @@
return toRemoveR.sub('', text)
def removeHTMLParts(text, keeptags=None):
    """
    Return text without portions where HTML markup is disabled.

    Parts that can/will be removed are --
    * HTML and all wiki tags

    The exact set of parts which should NOT be removed can be passed as the
    'keeptags' parameter, which defaults to ['tt', 'nowiki', 'small', 'sup'].
    """
    # TODO: try to merge with 'removeDisabledParts()' above into one generic
    # function.
    # Approach based on the stdlib HTMLParser; thanks to
    # http://www.hellboundhackers.org/articles/841-using-python-39;s-htmlparser-class.html
    if keeptags is None:
        # Default built here, not as a mutable default argument, so the
        # list cannot be shared (and accidentally mutated) across calls.
        keeptags = ['tt', 'nowiki', 'small', 'sup']
    parser = _GetDataHTML()
    parser.keeptags = keeptags
    parser.feed(text)
    parser.close()
    return parser.textdata
+# thanks to http://docs.python.org/library/htmlparser.html
+class _GetDataHTML(HTMLParser):
+ textdata = u''
+ keeptags = []
+
+ def handle_data(self, data):
+ self.textdata += data
+
+ def handle_starttag(self, tag, attrs):
+ if tag in self.keeptags: self.textdata += u"<%s>" % tag
+
+ def handle_endtag(self, tag):
+ if tag in self.keeptags: self.textdata += u"</%s>" % tag
+
+
def isDisabled(text, index, tags = ['*']):
"""
Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.