python文本清洗

繁体转简体

全角符号转半角符号

拼音去声调转文字

过滤html编码、网址、其他编码等

import re
import unicodedata
from src.langconv import Converter

def zh_to(text, flag=''):
    """Run ``text`` through ``Converter`` using the rule selected by ``flag``.

    ``flag='zh2cn'`` selects the ``'zh-hans'`` rule and ``flag='zh2tw'`` the
    ``'zh-hant'`` rule (presumably Simplified and Traditional Chinese
    respectively; the conversion itself is done by ``Converter``).
    Any other flag returns ``text`` unchanged.
    """
    if flag not in ('zh2cn', 'zh2tw'):
        # Unknown or empty flag: nothing to convert.
        return text
    rule = 'zh-hans' if flag == 'zh2cn' else 'zh-hant'
    converter = Converter(rule)
    return converter.convert(text)

def is_number(uchar):
    """Return True if ``uchar`` is a half-width (ASCII) digit, else False."""
    return u'\u0030' <= uchar <= u'\u0039'

def is_Qnumber(uchar):
    """Return True if ``uchar`` is a full-width digit (U+FF10..U+FF19), else False."""
    return u'\uff10' <= uchar <= u'\uff19'

def is_alphabet(uchar):
    """Return True if ``uchar`` is a half-width (ASCII) English letter, else False."""
    is_upper = u'\u0041' <= uchar <= u'\u005a'
    is_lower = u'\u0061' <= uchar <= u'\u007a'
    return is_upper or is_lower

def is_Qalphabet(uchar):
    """Return True if ``uchar`` is a full-width English letter, else False."""
    is_upper = u'\uff21' <= uchar <= u'\uff3a'
    is_lower = u'\uff41' <= uchar <= u'\uff5a'
    return is_upper or is_lower

def Q2B(char):
    """Convert a single full-width character to its half-width counterpart.

    The ideographic space U+3000 becomes an ASCII space, code points
    U+FF01..U+FF5E are shifted down by 0xFEE0, and every other character is
    returned unchanged.
    """
    code_point = ord(char)
    if code_point == 0x3000:
        # Full-width space has no fixed offset to the ASCII space.
        return chr(0x0020)
    if 0xFF01 <= code_point <= 0xFF5E:
        # Full-width forms sit exactly 0xFEE0 above their ASCII equivalents.
        return chr(code_point - 0xFEE0)
    return char

def fullwidth_to_halfwidth(text):
    """Return ``text`` with every character passed through ``Q2B``."""
    return "".join(map(Q2B, text))
def stringpartQ2B(text):
    """Convert only full-width digits and letters in ``text`` to half-width.

    Characters for which neither ``is_Qnumber`` nor ``is_Qalphabet`` is true
    are copied through untouched.
    """
    pieces = []
    for uchar in text:
        if is_Qnumber(uchar) or is_Qalphabet(uchar):
            uchar = Q2B(uchar)
        pieces.append(uchar)
    return "".join(pieces)
def remove_diacritics(input_str):
    """Strip combining marks (diacritics) from ``input_str``.

    The string is first NFKD-normalised, which splits accented characters
    into a base character plus combining marks; every character with a
    non-zero combining class is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
    return ''.join(base_chars)


def replaceCharEntity(html_str):
    """Decode or strip HTML character entities, then normalise the text.

    Steps, in order:

    1. Replace the literal ``'25o;n'`` with ``'250;n'`` (presumably repairing
       a known corrupted numeric entity in the input data).
    2. Decode decimal entities such as ``&#232;``. Values that are not valid
       code points (too large, or lone surrogates that cannot be encoded)
       are removed instead of raising.
    3. Resolve ``&name;`` entities: names listed in ``CHAR_ENTITIES`` are
       replaced by their character, all others are deleted. The text is
       re-scanned from the start after every replacement, so an entity
       produced by a previous replacement (e.g. ``&amp;lt;``) is resolved too.
    4. Delete hexadecimal entities such as ``&#x767E;``.
    5. Remove diacritics via ``remove_diacritics`` and convert the result
       with ``zh_to(..., flag='zh2cn')``.

    Args:
        html_str: Text that may contain HTML character entities.

    Returns:
        The cleaned string.
    """
    html_str = html_str.replace('25o;n', '250;n')

    # Entities that are restored to a real character. The numeric keys catch
    # malformed entities written without the '#', e.g. "&160;".
    CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                     'lt': '<', '60': '<',
                     'gt': '>', '62': '>',
                     'amp': '&', '38': '&',
                     'quot': '"', '34': '"', }

    def _decode_decimal(match):
        """Return the character for a decimal entity, or '' if invalid."""
        try:
            code_point = int(match.group(1))
            if 0xD800 <= code_point <= 0xDFFF:
                # A lone surrogate cannot be encoded (e.g. when printed).
                return ''
            return chr(code_point)
        except (ValueError, OverflowError):
            # chr() rejects values above 0x10FFFF; int() may reject absurdly
            # long digit runs. Treat both like an unknown entity: drop it.
            return ''

    # Decimal entities (e.g. &#232;)
    html_str = re.sub(r'&#(\d+);', _decode_decimal, html_str)

    # Named entities (e.g. &amp;). Unknown names are deleted.
    re_char_entity = re.compile(r'&(?P<name>\w+);')
    sz = re_char_entity.search(html_str)
    while sz:
        replacement = CHAR_ENTITIES.get(sz.group('name'), '')
        # Splice the leftmost match out by position; this is what a
        # count-limited sub() would do, without re-matching the pattern.
        html_str = html_str[:sz.start()] + replacement + html_str[sz.end():]
        sz = re_char_entity.search(html_str)

    # Hexadecimal entities (e.g. &#x767E;) are deleted, not decoded.
    html_str = re.sub(r'&#x[\da-fA-F]+;', '', html_str)

    # Strip diacritics, then convert with the 'zh2cn' rule.
    html_str = remove_diacritics(html_str)
    html_str = zh_to(html_str, flag='zh2cn')
    return html_str
# Emoji / pictograph code-point ranges removed from the text.
_EMOJI_RE = re.compile('[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u2B55]+')

# Elements whose whole content (not just the tags) is discarded.
_CDATA_RE = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
_SCRIPT_RE = re.compile(r'<\s*script[^>]*>[\S\s]*?<\s*/\s*script\s*>', re.I)
_STYLE_RE = re.compile(r'<\s*style[^>]*>[\S\s]*?<\s*/\s*style\s*>', re.I)
_COMMENT_RE = re.compile(r'<!--[^>]*-->')

# Tags removed by name before URL filtering.
_HTML_TAGS = ('div', 'ul', 'li', 'ol', 'p', 'span', 'form', 'br',
              'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
              'hr', 'input',
              'title', 'table', 'tbody', 'a',
              'i', 'strong', 'b', 'big', 'small', 'u', 's', 'strike',
              'img', 'center', 'dl', 'dt', 'font', 'em',
              'code', 'pre', 'link', 'meta', 'iframe', 'ins')
_TAG_PATTERNS = tuple(
    (re.compile(f'<{tag}[^<>]*[/]?>'), re.compile(f'</{tag}>'))
    for tag in _HTML_TAGS
)

# Any remaining <...> construct.
_ANY_TAG_RE = re.compile(r'<[^>]+>', re.S)

# Code points 0-9, 11-32 and 127 are deleted; newline (10) is the only
# character below 33 that is kept by this table.
_INVISIBLE_CHARS = dict.fromkeys([*range(0, 10), *range(11, 33), 127])

# Bare (non-tag) URLs.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

# Romanised fragments and the character each one is replaced with.
_PINYIN_TO_HANZI = {
    "chaoyue": "超越",
    "cao": "操",
    "jiao": "交",
    "jia": "交",
    "se": "色",
    "men": "门",
    "1u": "露",
    "1uan": "乱",
    "yao": "药",
    "dong": "洞",
    "jing": "精",
    "xing": "性",
    "2uan": "乱",
    "mi": "迷",
    "hun": "混",
    "mao": "毛",
    "chuang": "床",
    "chao": "潮",
    "huo": "惑",
    "nong": "弄",
    "yu": "玉",
    "xiao": "小",
    "ji": "激",
    "yin": "阴",
    "dang": "荡",
    "niao": "尿",
    "bo": "波",
    "she": "射",
    "lu": "露",
    "mo": "摸",
    "fu": "府",
    "luo": "裸",
    "dian": "点",
    "cha": "插",
    "rou": "肉",
}
# Longest keys come first in the alternation so that a short key can never
# shadow a longer key that starts with it ("1u" vs "1uan", "lu" vs "luo").
_PINYIN_RE = re.compile(
    '|'.join(sorted(map(re.escape, _PINYIN_TO_HANZI), key=len, reverse=True)))


def process_html_encoding(line):
    """Clean one piece of scraped text and return the result.

    The pipeline, in order:

    1. Convert full-width characters to half-width and drop ``</p>``.
    2. Decode/strip character entities with ``replaceCharEntity``.
    3. Collapse every whitespace run to a single space.
    4. Remove emoji and pictographs.
    5. Remove CDATA markers, ``<script>``/``<style>`` elements including
       their contents, and HTML comments.
    6. Remove the tags listed in ``_HTML_TAGS``.
    7. Delete the characters in ``_INVISIBLE_CHARS``.
    8. Remove bare URLs, then any remaining ``<...>`` tag.
    9. Replace the romanised fragments in ``_PINYIN_TO_HANZI`` in a single
       left-to-right pass, preferring the longest key at each position.

    Args:
        line: The raw text to clean.

    Returns:
        The cleaned string.
    """
    # Q2B already maps full-width digits and letters, so the separate
    # digits/letters-only pass would be a no-op after this call.
    line = fullwidth_to_halfwidth(line)
    line = line.replace('</p>', '')

    line = replaceCharEntity(line)
    line = " ".join(line.split())

    # Filter emoji.
    line = _EMOJI_RE.sub('', line)

    # Drop script/style bodies and comments while their tags still exist.
    # This has to precede every tag-stripping step: once the tags are gone
    # these patterns can no longer match and the bodies would stay in the
    # text. Note that the 's' entry below matches '<script ...>' as well.
    line = _CDATA_RE.sub('', line)
    line = _SCRIPT_RE.sub('', line)
    line = _STYLE_RE.sub('', line)
    line = _COMMENT_RE.sub('', line)

    # Filter known HTML tags, one tag name at a time.
    for open_tag, close_tag in _TAG_PATTERNS:
        line = open_tag.sub('', line)
        line = close_tag.sub('', line)

    # Filter invisible characters.
    # NOTE(review): the table includes code point 32, so ordinary spaces
    # are deleted here as well - confirm that this is intended.
    line = line.translate(_INVISIBLE_CHARS)

    # Filter URLs that are not nested inside an HTML tag. Every character
    # the pattern can match is ASCII, so the matches are removed directly.
    line = _URL_RE.sub('', line)

    # Filter whatever tags are left.
    line = _ANY_TAG_RE.sub('', line)

    # Turn romanised fragments back into characters.
    return _PINYIN_RE.sub(lambda m: _PINYIN_TO_HANZI[m.group()], line)

# Sample input: text with toned pinyin fragments mixed into Chinese.
text = """ 来那远远chāoyuè他们在他的意念cào纵之下异的丝线飘dàng出来。这些程的you睦,叹息一声那战场zhongyāng,的建筑之中ziyou生存。两边的GoldenYellowImmovableMountain在障碍流动
"""

# Clean the sample and show the result.
line = process_html_encoding(text)
print(line)
# Run the cleaned text through the 'zh2cn' conversion once more and show it.
# The cleaned text is held in `line`; there is no `processed_text` variable.
line = zh_to(line, flag='zh2cn')
print(line)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值