1、使用 xpath 清理不必要的标签元素,以及无内容的标签
from lxml import etree
def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    """
    Strip unwanted elements and content-free elements from an HTML string.

    Two passes run over the parsed document:

    1. every element matched by one of the removal XPaths is deleted;
    2. every remaining element that has no non-whitespace text and contains
       no image/table element is deleted (this applies to all tags, not
       only ``<p>``).

    Text that directly follows a deleted element (its ``tail``) is kept and
    re-attached to the preceding sibling or to the parent.

    :param text: html_content
    :param xpath_dict: extra removal XPaths as ``{label: xpath}``; may be
        ``None`` or empty. The dict is not modified. A label equal to one of
        the built-in ``_remove_N`` labels is overridden by the built-in rule.
    :return: string type html_content (the pretty-printed parsed tree)
    """
    # Work on a copy so the built-in rules never leak into the caller's dict.
    remove_by_xpath = dict(xpath_dict) if xpath_dict else {}
    # Always removed; only extreme cases would want to keep these.
    remove_by_xpath.update({
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style',
    })

    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)
    if selector is None:
        # NOTE(review): lxml appears to return None instead of a tree for
        # input that contains no markup at all - confirm. Nothing to clean.
        return ''

    def _discard(node, label: str) -> None:
        """Log *node* and remove it from its parent, keeping its tail text."""
        parent = node.getparent()
        if parent is None:
            # The root element has no parent and cannot be removed.
            return
        dumped = etree.tostring(node, encoding='utf-8', pretty_print=True,
                                with_tail=False).decode()
        logger.debug(f"{label} : {dumped}")
        # parent.remove() would also drop the text that follows the element,
        # so move that text to where it will still be serialized.
        if node.tail:
            previous = node.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + node.tail
            else:
                parent.text = (parent.text or '') + node.tail
        parent.remove(node)

    # Pass 1: delete every element matched by a removal XPath.
    for xpath in remove_by_xpath.values():
        for bad in selector.xpath(xpath):
            _discard(bad, "clean article content")

    # Elements that count as content even though they carry no text.
    skip_tip = " or ".join(
        f"name()='{tag}'"
        for tag in ('img', 'tr', 'th', 'tbody', 'thead', 'table')
    )
    candidates_xpath = f"//*[not({skip_tip})]"
    keeps_media_xpath = f".//*[{skip_tip}]"

    # Pass 2: delete elements with neither text nor image/table descendants.
    for element in selector.xpath(candidates_xpath):
        if element.xpath(keeps_media_xpath):
            continue
        if element.xpath('string(.)').strip():
            continue
        _discard(element, "clean p tag")

    return etree.tostring(selector, encoding='utf-8',
                          pretty_print=True).decode()
2、使用 pyquery 清理标签属性,并返回处理后的源码和纯净文本
#!/usr/bin/env python
# -*-coding:utf-8-*-
from pyquery import PyQue