http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9911
Revision: 9911
Author: drtrigon
Date: 2012-02-19 14:24:41 +0000 (Sun, 19 Feb 2012)
Log Message:
-----------
Adding capabilities of DrTrigonBot 'textlib' script; 'removeHTMLParts'
(this is a follow-up or bug fix for r9902 also)
Modified Paths:
--------------
trunk/pywikipedia/pywikibot/textlib.py
Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py 2012-02-19 12:49:35 UTC (rev 9910)
+++ trunk/pywikipedia/pywikibot/textlib.py 2012-02-19 14:24:41 UTC (rev 9911)
@@ -16,6 +16,7 @@
import wikipedia as pywikibot
import re
+from HTMLParser import HTMLParser
def unescape(s):
"""Replace escaped HTML-special characters by their
originals"""
@@ -219,6 +220,40 @@
return toRemoveR.sub('', text)
def removeHTMLParts(text, keeptags=None):
    """
    Return text without portions where HTML markup is disabled.

    Parts that can/will be removed are --
    * HTML and all wiki tags

    The exact set of parts which should NOT be removed can be passed as the
    'keeptags' parameter, which defaults to ['tt', 'nowiki', 'small', 'sup'].
    """
    # TODO: try to merge with 'removeDisabledParts()' above into one generic
    # function.
    # Approach based on the stdlib HTMLParser; thanks to
    # http://www.hellboundhackers.org/articles/841-using-python-39;s-htmlparser-class.html
    if keeptags is None:
        # Default built here, not as a mutable default argument, so the
        # list cannot be shared (and accidentally mutated) across calls.
        keeptags = ['tt', 'nowiki', 'small', 'sup']
    parser = _GetDataHTML()
    parser.keeptags = keeptags
    parser.feed(text)
    parser.close()
    return parser.textdata
+# thanks to http://docs.python.org/library/htmlparser.html
+class _GetDataHTML(HTMLParser):
+ textdata = u''
+ keeptags = []
+
+ def handle_data(self, data):
+ self.textdata += data
+
+ def handle_starttag(self, tag, attrs):
+ if tag in self.keeptags: self.textdata += u"<%s>" % tag
+
+ def handle_endtag(self, tag):
+ if tag in self.keeptags: self.textdata += u"</%s>" % tag
+
+
def isDisabled(text, index, tags = ['*']):
"""
Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.