Python 使用正则表达式解码 XML 实体(decode XML entities)

方法 1:

def xmlEntityDecode(capture1=None, capture2=None, capture3=None):
    """Convert the captured part of one XML entity into its character.

    The three arguments correspond to the three alternative capture groups
    of an entity-matching regular expression; the first one that is not
    ``None`` decides how the entity is decoded.

    Args:
        capture1: Decimal digits of a numeric reference (``&#931;`` -> "931").
        capture2: Hex digits of a numeric reference (``&#x3A3;`` -> "3A3").
        capture3: Name of one of the five predefined XML entities
            (``quot``, ``amp``, ``apos``, ``lt``, ``gt``).

    Returns:
        The single character the entity stands for.

    Raises:
        KeyError: if ``capture3`` is used and is not a predefined XML entity
            name (this includes the case where all three captures are None).
        ValueError: if the digits cannot be parsed or the resulting code
            point is outside the range accepted by ``chr``/``unichr``.
    """
    # Code points of the five entities predefined by XML.
    nameDict = {
        'quot': 34,
        'amp': 38,
        'apos': 39,
        'lt': 60,
        'gt': 62,
    }

    # ``unichr`` only exists on Python 2; on Python 3 ``chr`` already
    # returns a Unicode character.
    try:
        toChar = unichr
    except NameError:
        toChar = chr

    if capture1 is not None:
        charCode = int(capture1, 10)
    elif capture2 is not None:
        charCode = int(capture2, 16)
    else:
        charCode = nameDict[capture3]

    return toChar(charCode)

import re
# Demo: find the first XML entity in ``subject`` and print its decoded form.
# NOTE(review): the published literal was the already-decoded character 'Σ',
# which contains no entity, so the search could never match and the print
# below failed with NameError. Presumably the page rendering unescaped
# '&#931;' -- confirm against the original post.
subject = '&#931;'

# Group 1: decimal reference, group 2: hex reference, group 3: named entity.
match = re.search(r"&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([0-9a-zA-Z]+));", subject)
if match:
    capture1, capture2, capture3 = match.groups()
    # Decode only when something matched; the captures are undefined otherwise.
    print(xmlEntityDecode(capture1=capture1, capture2=capture2, capture3=capture3))
else:
    print("no XML entity found in subject")

方法 2:

import re

##
# Removes HTML markup from a text string.
#
# @param text The HTML source.
# @return The plain text.  If the HTML source contains non-ASCII
#     entities or character references, this is a Unicode string.

def strip_html(text):
    """Remove HTML tags from *text* and decode its character references.

    Everything of the form ``<...>`` is dropped.  Numeric references
    (``&#931;``, ``&#x3A3;``, ``&#X3A3;``) and named HTML entities
    (``&amp;``, ``&Sigma;``) are replaced by the character they denote.
    References that cannot be decoded -- unknown names, malformed digits,
    code points out of range -- are left in the output unchanged.

    Args:
        text: The HTML source.

    Returns:
        The plain text with tags removed and references decoded.
    """
    # Resolve the Python 2 / Python 3 spellings of the entity table and of
    # the "code point -> character" builtin, so the function runs on both.
    try:
        from html.entities import name2codepoint  # Python 3
    except ImportError:
        from htmlentitydefs import name2codepoint  # Python 2
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    def fixup(m):
        text = m.group(0)
        if text[:1] == "<":
            return ""  # ignore tags
        try:
            if text[:2] == "&#":
                # Numeric reference; HTML accepts both "x" and "X" for hex.
                if text[2:3] in ("x", "X"):
                    return to_char(int(text[3:-1], 16))
                return to_char(int(text[2:-1]))
            # Named entity: look the name (between "&" and ";") up.
            codepoint = name2codepoint.get(text[1:-1])
            if codepoint is not None:
                return to_char(codepoint)
        except (ValueError, OverflowError):
            # Bad digits or a code point the platform cannot represent.
            pass
        return text  # leave as is

    return re.sub(r"(?s)<[^>]*>|&#?\w+;", fixup, text)

附:

html entities

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值