法1:
def xmlEntityDecode(capture1=None, capture2=None, capture3=None):
    """Return the character denoted by one captured XML entity.

    The arguments are checked in order and the first one that is not
    None decides how the code point is obtained:

    capture1 -- string of decimal digits, parsed with base 10.
    capture2 -- string of hexadecimal digits, parsed with base 16.
    capture3 -- entity name, looked up among the five names
                'quot', 'amp', 'apos', 'lt' and 'gt'.

    Returns a one-character (Unicode) string.

    Raises KeyError if capture1 and capture2 are both None and capture3
    is not one of the five known names (this includes capture3 being
    None).  Raises ValueError if the digit string cannot be parsed or
    the resulting code point is rejected by the character constructor.
    """
    # Code points of the named entities this function understands.
    named_code_points = {
        'quot': 34,
        'amp': 38,
        'apos': 39,
        'lt': 60,
        'gt': 62,
    }

    if capture1 is not None:
        char_code = int(capture1, 10)
    elif capture2 is not None:
        char_code = int(capture2, 16)
    else:
        char_code = named_code_points[capture3]

    # unichr only exists on Python 2; on Python 3 the builtin chr already
    # returns a Unicode character, so fall back to it there.
    try:
        to_char = unichr  # noqa: F821 -- Python 2 builtin
    except NameError:
        to_char = chr
    return to_char(char_code)
import re
# Demo: find the first entity in the sample text and print the character
# it stands for.  The sample must actually contain an entity, otherwise
# the search below finds nothing and the demo prints nothing.
subject = '&#931;'
# Group 1: decimal digits ("&#NNN;"), group 2: hexadecimal digits
# ("&#xHHH;"), group 3: entity name ("&name;").  Groups belonging to the
# alternatives that did not match are None.
match = re.search("&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([0-9a-zA-Z]+));", subject)
if match:
    capture1, capture2, capture3 = match.groups()
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(xmlEntityDecode(capture1=capture1, capture2=capture2, capture3=capture3))
法2:
import re
##
# Removes HTML markup from a text string.
#
# Tags are deleted.  Numeric character references (decimal "&#NNN;" and
# hexadecimal "&#xHHH;" / "&#XHHH;") and named entities found in the
# standard library's entity table are replaced by the character they
# denote.  Any reference that cannot be decoded is left in the text
# unchanged.
#
# @param text The HTML source.
# @return The plain text. If the HTML source contains non-ASCII
#     entities or character references, this is a Unicode string.
def strip_html(text):
    # Resolve the Python 2 / Python 3 spellings independently of each
    # other: the character constructor first, then the entity table.
    try:
        to_char = unichr  # noqa: F821 -- Python 2 builtin
    except NameError:
        to_char = chr
    try:
        from html.entities import name2codepoint  # Python 3
    except ImportError:
        from htmlentitydefs import name2codepoint  # Python 2

    def fixup(m):
        token = m.group(0)
        if token[:1] == "<":
            return ""  # ignore tags
        if token[:2] == "&#":
            # Numeric character reference; an "x"/"X" after "&#" selects
            # hexadecimal digits.
            if token[2:3] in ("x", "X"):
                digits, base = token[3:-1], 16
            else:
                digits, base = token[2:-1], 10
            try:
                return to_char(int(digits, base))
            except (ValueError, OverflowError):
                # Unparsable digits or a code point the character
                # constructor rejects: keep the reference as written.
                return token
        # Named entity: look the name (without "&" and ";") up in the
        # name -> code point table.
        code_point = name2codepoint.get(token[1:-1])
        if code_point is None:
            return token  # leave as is
        return to_char(code_point)

    # Raw string so that "\w" reaches the regex engine untouched.
    return re.sub(r"(?s)<[^>]*>|&#?\w+;", fixup, text)
附: