"""
Description:正则清洗HTML数据
Author:
Prompt: code in python3 env
"""
"""
re.I 使匹配对大小写不敏感
re.L 做本地化识别(locale-aware)匹配
re.M 多行匹配,影响^(开头)和$(结尾)
re.S 匹配包含换行在内的所有字符
re.U 根据Unicode字符集解析字符,这个标志影响 \w, \W, \b, \B
re.X 该标志通过给予你更灵活的格式以便你将正则表达式写得更易于理解
"""
import re
# Ordered (compiled pattern, replacement) rules applied by filter_tags().
# Order matters: constructs whose *content* must disappear (CDATA, script,
# style, comments) are removed before the generic tag rule, which only drops
# the markup itself and keeps the text between tags.
_HTML_CLEANUP_RULES = tuple(
    (re.compile(pattern, flags), replacement)
    for pattern, flags, replacement in (
        # Document type declaration.
        (r'<!DOCTYPE[^>]*>', re.I, ''),
        # CDATA sections, including a leading "//" used to hide them in scripts.
        (r'(?://\s*)?<!\[CDATA\[.*?\]\]>', re.S, ''),
        # Script and style elements together with their bodies. ".*?" is used
        # instead of "[^<]*" so bodies that contain "<" are still removed.
        (r'<\s*script\b[^>]*>.*?<\s*/\s*script\s*>', re.I | re.S, ''),
        (r'<\s*style\b[^>]*>.*?<\s*/\s*style\s*>', re.I | re.S, ''),
        # Comments.
        (r'<!--.*?-->', re.S, ''),
        # Line breaks become whitespace so the neighbouring words stay apart.
        (r'<br\s*/?>', re.I, '\n'),
        # Any remaining opening or closing tag.
        (r'</?\w+[^>]*>', 0, ''),
        # Character references (&name; &#123; &#x1F;). Deliberately strict so a
        # stray "&" followed later by ";" does not swallow ordinary text.
        (r'&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);', re.I, ''),
        # Links to .html pages. Non-greedy and whitespace-bounded so one match
        # never spans from the first URL to the last ".html" in the text.
        (r'https?://\S+?\.html', re.I, ' '),
        # Finally squeeze every whitespace run (\t \n \r \f \v, space) to one space.
        (r'\s+', 0, ' '),
    )
)


def filter_tags(htmlstr):
    """Strip HTML markup from a string and return the remaining text.

    The input is first collapsed onto a single line, then each rule in
    ``_HTML_CLEANUP_RULES`` is applied in order: DOCTYPE, CDATA sections,
    script/style elements, comments, ``<br>``, remaining tags, character
    references, ``.html`` hyperlinks and, last, whitespace runs.

    This is a regex-based cleaner, not an HTML parser: text such as
    ``a<b and c>d`` that merely looks like a tag is removed as well.

    Args:
        htmlstr: HTML source as a ``str``.

    Returns:
        The cleaned text. Whitespace runs are reduced to single spaces; a
        single leading or trailing space may remain where markup was removed.
    """
    # Collapsing whitespace up front also trims both ends of the input.
    res = ' '.join(htmlstr.split())
    for pattern, replacement in _HTML_CLEANUP_RULES:
        res = pattern.sub(replacement, res)
    return res
def read_file(read_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        read_path: Path of the file to read.

    Returns:
        The file's text as a single ``str``.

    Errors raised by ``open`` or by decoding (for example ``OSError`` or
    ``UnicodeDecodeError``) are not caught here and propagate to the caller.
    """
    with open(read_path, 'r', encoding='utf-8') as f:
        return f.read()
if __name__ == '__main__':
    # Demo run: clean the sample page and write the result next to it.
    str_doc = read_file(r'../data/html/re.html')
    res = filter_tags(str_doc)
    with open(r'../data/html/test.html', 'w', encoding='utf-8') as f:
        f.write(res)
    # Only signals that the run finished without raising; per the author's
    # note, the output file was previewed in a separate editor.
    print('No Exception')
# 这是我的笔记 (These are my notes.)