# The argument passed to parse_html() is res.content.
import requests
# Fetch the page. `url` must already be defined by the surrounding code.
# The timeout (seconds) keeps the call from blocking forever when the
# server never answers; requests applies no timeout by default.
res = requests.get(url, timeout=10)
# Raw response bytes; this is the value handed to parse_html().
res_content = res.content
# Extract all text from the web page
from lxml import etree
from lxml import html as html_parse
from lxml.html.clean import Cleaner
import re
def parse_html(res_content):
    """Return the text content of an HTML document as a single string.

    The markup is first sanitised with ``lxml.html.clean.Cleaner`` so that
    script and style content does not leak into the result, then every text
    node under ``/html/body`` is collected, stripped, and joined with single
    spaces.

    Args:
        res_content: HTML markup, typically ``requests.Response.content``
            (bytes); a ``str`` is accepted as well.

    Returns:
        The extracted text with carriage returns, newlines, tabs and any
        remaining ``<...>`` sequences removed. An empty string is returned
        when the input is empty, whitespace-only or ``None``.
    """
    # lxml refuses to parse an empty document, so short-circuit here instead
    # of letting the parser raise on blank input.
    if not res_content or not res_content.strip():
        return ''

    # javascript=True *removes* scripting content (it does not execute it);
    # style=True removes <style> elements.
    cleaner = Cleaner(javascript=True, style=True)

    tree = html_parse.fromstring(res_content)
    html_ = html_parse.tostring(cleaner.clean_html(tree)).decode()

    # Re-parse the cleaned markup with etree.HTML so that an <html>/<body>
    # wrapper exists even when the input was only a fragment; the absolute
    # XPath below relies on it.
    ele = etree.HTML(html_)
    if ele is None:
        # Nothing parseable survived cleaning.
        return ''

    text = ele.xpath('/html/body//text()')

    # Drop whitespace-only nodes, merge the rest into one string, then strip
    # control whitespace and anything that still looks like a tag.
    pieces = [node.strip() for node in text if node.strip()]
    return re.sub('\r|\n|\t|<.*?>', '', ' '.join(pieces))