繁体转简体
全角符号转半角符号
拼音去声调转文字
过滤html编码、网址、其他编码等
import re
import unicodedata
from src.langconv import Converter
def zh_to(text, flag=''):
    """Convert Chinese text between the simplified and traditional scripts.

    Args:
        text: The string to convert.
        flag: ``'zh2cn'`` selects the ``'zh-hans'`` conversion rule and
            ``'zh2tw'`` selects ``'zh-hant'``. Any other value disables
            conversion.

    Returns:
        The output of ``Converter(rule).convert(text)`` for a recognised
        flag, otherwise ``text`` unchanged.
    """
    if flag == 'zh2cn':
        return Converter('zh-hans').convert(text)
    if flag == 'zh2tw':
        return Converter('zh-hant').convert(text)
    # Unrecognised flag: hand the input back untouched.
    return text
def is_number(uchar):
    """Return True if ``uchar`` lies in the half-width digit range '0'..'9'."""
    return u'\u0030' <= uchar <= u'\u0039'
def is_Qnumber(uchar):
    """Return True if ``uchar`` lies in the full-width digit range U+FF10..U+FF19."""
    return u'\uff10' <= uchar <= u'\uff19'
def is_alphabet(uchar):
    """Return True if ``uchar`` lies in the half-width Latin letter ranges A-Z or a-z."""
    is_upper = u'\u0041' <= uchar <= u'\u005a'
    is_lower = u'\u0061' <= uchar <= u'\u007a'
    return is_upper or is_lower
def is_Qalphabet(uchar):
    """Return True if ``uchar`` lies in the full-width Latin letter ranges.

    The ranges are U+FF21..U+FF3A (upper case) and U+FF41..U+FF5A (lower case).
    """
    is_upper = u'\uff21' <= uchar <= u'\uff3a'
    is_lower = u'\uff41' <= uchar <= u'\uff5a'
    return is_upper or is_lower
def Q2B(char):
    """Convert a single full-width character to its half-width counterpart.

    Args:
        char: A one-character string (it is passed to ``ord``).

    Returns:
        A half-width space for U+3000, the character shifted down by 0xFEE0
        for U+FF01..U+FF5E, and ``char`` itself for anything else.
    """
    codepoint = ord(char)
    # Full-width forms other than the space sit a fixed offset above ASCII.
    if 0xFF01 <= codepoint <= 0xFF5E:
        return chr(codepoint - 0xFEE0)
    # The ideographic (full-width) space has its own code point.
    if codepoint == 0x3000:
        return chr(0x0020)
    return char
def fullwidth_to_halfwidth(text):
    """Return ``text`` with every character passed through ``Q2B``."""
    converted = []
    for ch in text:
        converted.append(Q2B(ch))
    return "".join(converted)
def stringpartQ2B(text):
    """Convert only full-width digits and letters in ``text`` to half-width.

    Characters for which neither ``is_Qnumber`` nor ``is_Qalphabet`` is true
    are copied through unchanged.
    """
    pieces = []
    for uchar in text:
        if is_Qnumber(uchar) or is_Qalphabet(uchar):
            pieces.append(Q2B(uchar))
        else:
            pieces.append(uchar)
    return "".join(pieces)
def remove_diacritics(input_str):
    """Strip combining marks (e.g. pinyin tone marks) from a string.

    The input is decomposed with Unicode NFKD normalisation and every
    character with a non-zero combining class is dropped. The result stays in
    decomposed (NFKD) form; it is not recomposed.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept = []
    for ch in decomposed:
        if unicodedata.combining(ch):
            # Combining mark: this is the diacritic itself, so skip it.
            continue
        kept.append(ch)
    return ''.join(kept)
def replaceCharEntity(html_str):
    """Decode or strip HTML character entities, then normalise the text.

    Processing order:
      1. The literal sequence ``'25o;n'`` is rewritten to ``'250;n'``
         (presumably repairing a decimal entity typed with the letter "o"
         instead of a zero -- confirm with the data source).
      2. Decimal entities (``&#NNN;``) are replaced by the character they
         encode. Values that are not valid code points, and surrogate code
         points, are removed instead of raising.
      3. Named entities (``&name;``) listed in the table below are restored;
         every other ``&name;`` sequence is deleted. Replacement restarts from
         the beginning after each substitution, so an entity produced by an
         earlier substitution (e.g. ``&amp;lt;``) is decoded again.
      4. Hexadecimal entities (``&#xHH;``) are deleted.
      5. Combining marks are stripped with ``remove_diacritics`` and the text
         is passed through ``zh_to(..., flag='zh2cn')``.

    Args:
        html_str: The text to clean.

    Returns:
        The cleaned string.
    """
    html_str = html_str.replace('25o;n', '250;n')

    # Entities that are restored; the numeric keys cover forms such as
    # "&160;" that lack the "#".
    char_entities = {'nbsp': ' ', '160': ' ',
                     'lt': '<', '60': '<',
                     'gt': '>', '62': '>',
                     'amp': '&', '38': '&',
                     'quot': '"', '34': '"', }

    def _decode_decimal(match):
        # chr() raises for values above 0x10FFFF and int() can reject very
        # long digit runs; malformed input must not abort the whole cleanup.
        try:
            code = int(match.group(1))
            if 0xD800 <= code <= 0xDFFF:
                # A lone surrogate is not encodable text; drop it.
                return ''
            return chr(code)
        except (ValueError, OverflowError):
            return ''

    html_str = re.sub(r'&#(\d+);', _decode_decimal, html_str)

    # Entity names are ASCII. Without re.ASCII, "\w" also matches CJK
    # characters and ordinary text such as "&<hanzi>;" would be deleted.
    re_char_entity = re.compile(r'&(?P<name>\w+);', re.ASCII)
    found = re_char_entity.search(html_str)
    while found:
        replacement = char_entities.get(found.group('name'), '')
        # Replace only the leftmost match, then rescan from the start because
        # the substitution may have formed a new entity.
        html_str = html_str[:found.start()] + replacement + html_str[found.end():]
        found = re_char_entity.search(html_str)

    # Hexadecimal entities are discarded rather than decoded.
    html_str = re.sub(r'&#x[\da-fA-F]+;', '', html_str)

    html_str = remove_diacritics(html_str)
    html_str = zh_to(html_str, flag='zh2cn')
    return html_str
# Tag names whose opening and closing tags are stripped one by one before the
# generic tag stripper runs.
_HTML_TAGS = (
    'div', 'ul', 'li', 'ol', 'p', 'span', 'form', 'br',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'input',
    'title', 'table', 'tbody', 'a',
    'i', 'strong', 'b', 'big', 'small', 'u', 's', 'strike',
    'img', 'center', 'dl', 'dt', 'font', 'em',
    'code', 'pre', 'link', 'meta', 'iframe', 'ins',
)

# Emoji / symbol ranges removed from the text.
_EMOJI_RE = re.compile(
    u'['
    u'\U0001F300-\U0001F64F'
    u'\U0001F680-\U0001F6FF'
    u'\u2600-\u2B55]+'
)

# Code points deleted as "invisible": 0-9, 11-32 and 127. Code point 32 is
# the ordinary space, so spaces are removed from the text as well.
_INVISIBLE_CHARS = dict.fromkeys(
    list(range(0, 10)) + list(range(11, 33)) + [127]
)

_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Blocks whose *content* must go, not just the surrounding tags.
_CDATA_RE = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
_SCRIPT_RE = re.compile(r'<\s*script[^>]*>[\S\s]*?<\s*/\s*script\s*>', re.I)
_STYLE_RE = re.compile(r'<\s*style[^>]*>[\S\s]*?<\s*/\s*style\s*>', re.I)

# Anything that still looks like a tag (this also covers comments and
# closing tags that contain no '>').
_ANY_TAG_RE = re.compile(r'<[^>]+>', re.S)

# Pinyin / look-alike spellings and the character each one stands for.
# Applied in this order; a longer key must precede any shorter key it
# contains ("1uan" before "1u", "luo" before "lu") or it could never match.
_PINYIN_TO_HANZI = (
    ("chaoyue", "超越"),
    ("cao", "操"),
    ("jiao", "交"),
    ("jia", "交"),
    ("se", "色"),
    ("men", "门"),
    ("1uan", "乱"),
    ("1u", "露"),
    ("yao", "药"),
    ("dong", "洞"),
    ("jing", "精"),
    ("xing", "性"),
    ("2uan", "乱"),
    ("mi", "迷"),
    ("hun", "混"),
    ("mao", "毛"),
    ("chuang", "床"),
    ("chao", "潮"),
    ("huo", "惑"),
    ("nong", "弄"),
    ("yu", "玉"),
    ("xiao", "小"),
    ("ji", "激"),
    ("yin", "阴"),
    ("dang", "荡"),
    ("niao", "尿"),
    ("bo", "波"),
    ("she", "射"),
    ("luo", "裸"),
    ("lu", "露"),
    ("mo", "摸"),
    ("fu", "府"),
    ("dian", "点"),
    ("cha", "插"),
    ("rou", "肉"),
)


def _remove_urls(text):
    """Delete every bare http(s) URL found in ``text``."""
    # NOTE(review): filter_cn is not defined in the visible part of this
    # file -- confirm it exists, otherwise any input containing a URL raises
    # NameError here.
    targets = [filter_cn(url) for url in _URL_RE.findall(text)]
    for target in targets:
        text = text.replace(target, '')
    return text


def _replace_pinyin(text):
    """Substitute each known pinyin spelling with its Chinese character."""
    for spelling, hanzi in _PINYIN_TO_HANZI:
        text = text.replace(spelling, hanzi)
    return text


def process_html_encoding(line):
    """Normalise one line of text and strip markup, URLs and noise from it.

    Pipeline, in order:
      1. Full-width characters are converted to half-width.
      2. Literal ``</p>`` is removed and character entities are decoded or
         stripped via ``replaceCharEntity``.
      3. Runs of whitespace are collapsed to a single space.
      4. Emoji / symbol characters are removed.
      5. CDATA, ``<script>`` and ``<style>`` blocks are removed together with
         their content.
      6. The tags listed in ``_HTML_TAGS`` are removed.
      7. Control characters, DEL and spaces are removed.
      8. Bare URLs are removed.
      9. Any remaining ``<...>`` tag is removed.
      10. Known pinyin spellings are replaced by Chinese characters.

    Args:
        line: The raw text to clean.

    Returns:
        The cleaned string.
    """
    # A second pass restricted to full-width digits and letters would be a
    # no-op: this call already converts all of U+FF01..U+FF5E.
    line = fullwidth_to_halfwidth(line)
    line = line.replace('</p>', '')
    line = replaceCharEntity(line)
    line = " ".join(line.split())

    line = _EMOJI_RE.sub('', line)

    # Script/style/CDATA removal must run before any tag stripping: once the
    # opening tag is gone (the 's' pattern below also matches "<script ...>"
    # and "<style ...>") these patterns can no longer find the block and its
    # content would be left in the output.
    line = _CDATA_RE.sub('', line)
    line = _SCRIPT_RE.sub('', line)
    line = _STYLE_RE.sub('', line)

    for tag in _HTML_TAGS:
        line = re.sub(f'<{tag}[^<>]*[/]?>', '', line)
        line = re.sub(f'</{tag}>', '', line)

    text = line.translate(_INVISIBLE_CHARS)
    text = _remove_urls(text)
    text = _ANY_TAG_RE.sub('', text)

    return _replace_pinyin(text)
# Demo: run a sample string through the cleaning pipeline and print the result.
text = """ 来那远远chāoyuè他们在他的意念cào纵之下异的丝线飘dàng出来。这些程的you睦,叹息一声那战场zhongyāng,的建筑之中ziyou生存。两边的GoldenYellowImmovableMountain在障碍流动
"""
line = process_html_encoding(text)
print(line)
# Convert the processed line; the previous code passed an undefined name
# (processed_text) here, which raised NameError.
line = zh_to(line, flag='zh2cn')
print(line)