Python 正则表达式decode xml entities

最新推荐文章于 2023-04-27 18:45:48 发布

guaguastd

最新推荐文章于 2023-04-27 18:45:48 发布

阅读量605

点赞数

分类专栏： # PYTHON 文章标签：正则表达式

PYTHON 专栏收录该内容

307 篇文章 5 订阅

订阅专栏

方法1：用正则表达式的捕获组解码 XML 实体

def xmlEntityDecode(capture1=None, capture2=None, capture3=None):
    """Decode a single XML entity given the pieces captured from it.

    The three arguments correspond to the three mutually exclusive forms an
    entity can take; the first one that is not ``None`` wins.

    Args:
        capture1: Decimal digits of a numeric character reference
            (``'931'`` for ``&#931;``).
        capture2: Hexadecimal digits of a numeric character reference
            (``'3A3'`` for ``&#x3A3;``).
        capture3: Name of a predefined XML entity: ``quot``, ``amp``,
            ``apos``, ``lt`` or ``gt``.

    Returns:
        The decoded character as a text (unicode) string of length one.

    Raises:
        KeyError: If neither numeric capture is given and ``capture3`` is
            not one of the five predefined names (this includes ``None``).
        ValueError: If a numeric capture is not a valid number in its base
            or lies outside the valid code point range.
    """
    # Python 2 spells the code-point-to-text builtin ``unichr``; Python 3
    # merged it into ``chr``. Resolve whichever one exists.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # Numeric references carry the code point directly.
    if capture1 is not None:
        return to_char(int(capture1, 10))
    if capture2 is not None:
        return to_char(int(capture2, 16))

    # Only the five entities predefined by XML are supported by name.
    name_to_code = {
        'quot': 34,
        'amp': 38,
        'apos': 39,
        'lt': 60,
        'gt': 62,
    }
    if capture3 not in name_to_code:
        # Keep the original exception type, but say what went wrong.
        raise KeyError("unknown XML entity name: %r" % (capture3,))
    return to_char(name_to_code[capture3])

import re
# Sample input: the decimal character reference for GREEK CAPITAL LETTER
# SIGMA (U+03A3).
# NOTE(review): the literal here had presumably been rendered into the bare
# character when the page was extracted; with no entity reference in the
# subject the pattern can never match -- confirm the intended sample.
subject = '&#931;'

# Group 1: decimal digits, group 2: hex digits, group 3: entity name.
match = re.search("&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([0-9a-zA-Z]+));", subject)
if match:
    capture1, capture2, capture3 = match.groups()
    # Decode only when something matched; otherwise the captures would be
    # unbound and the call would fail with NameError.
    print(xmlEntityDecode(capture1=capture1, capture2=capture2, capture3=capture3))

方法2：去除 HTML 标签并解码 HTML 实体

import re

##
# Removes HTML markup from a text string.
#
# @param text The HTML source.
# @return The plain text.  If the HTML source contains non-ASCII
#     entities or character references, this is a Unicode string.

def strip_html(text):
    """Remove HTML tags from *text* and decode its character references.

    Tags (``<...>``) are deleted. Numeric references (``&#931;``,
    ``&#x3A3;``) and named HTML entities (``&amp;``, ``&nbsp;``) are
    replaced by the character they denote. Anything that cannot be decoded
    (unknown name, malformed or out-of-range number) is left as is.

    Args:
        text: The HTML source.

    Returns:
        The plain text with tags removed and references decoded.
    """
    # ``name2codepoint`` lives in ``htmlentitydefs`` on Python 2 and in
    # ``html.entities`` on Python 3; it maps entity names to code points.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint

    # Python 2 calls the code-point-to-text builtin ``unichr``.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def fixup(m):
        token = m.group(0)
        if token[:1] == "<":
            return ""  # ignore tags
        if token[:2] == "&#":
            try:
                # Accept both ``&#x`` and ``&#X`` as the hex prefix.
                if token[:3].lower() == "&#x":
                    return to_char(int(token[3:-1], 16))
                return to_char(int(token[2:-1]))
            except (ValueError, OverflowError):
                # Malformed digits or a code point out of range (very
                # large values raise OverflowError): keep the reference.
                pass
        elif token[:1] == "&":
            codepoint = name2codepoint.get(token[1:-1])
            if codepoint is not None:
                return to_char(codepoint)
        return token  # leave as is

    return re.sub(r"(?s)<[^>]*>|&#?\w+;", fixup, text)

附:

html entities