from html import parser
from html import unescape

from lxml.html.clean import Cleaner
# Keep only the src and href attributes on tags.
safe_attrs = ['src', 'href']
# Tags that are kept in the cleaned output.
allow_tags = [
    "div", "p", "img", "video",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "br", "a", "blockquote",
]
# NOTE(review): lxml's Cleaner is documented to require
# remove_unknown_tags=False whenever allow_tags is given -- confirm against
# the installed lxml version.
cleaner = Cleaner(
    safe_attrs=safe_attrs,
    allow_tags=allow_tags,
    remove_unknown_tags=False,
)
# `content` is not defined in this snippet; the surrounding code must supply
# the raw HTML before this line runs.
clean_content = cleaner.clean_html(content)
# URLs inside the kept tags come back entity-escaped, so decode them.
# html.unescape is the documented API; html.parser only exposes `unescape`
# as an incidental import and it is not part of that module's public interface.
# NOTE(review): unescape() decodes every character reference in the string,
# not just those inside src/href values, so entity-escaped text such as
# "&lt;script&gt;" becomes literal markup again. Confirm this is intended
# before treating clean_content as sanitized HTML.
clean_content = unescape(clean_content)
处理html文本,保留指定标签、属性
最新推荐文章于 2024-08-03 17:40:19 发布