from lxml import etree
from html import unescape
"""
lxml 可以接收 bytes 与 str 类型的数据
"""
# Demo script: parse a local HTML page with lxml and pull data out via XPath.
#
# Reads "find_demo.html" from the current working directory, prints the parsed
# tree, and then prints several XPath query results.

# Read the whole page as text; the context manager guarantees the file handle
# is closed (the previous bare open(...).read() never closed it).
with open("find_demo.html", "r", encoding="utf-8") as html_file:
    temp_file = html_file.read()

# An Element object is required before any XPath query can be run.
# Note: etree.HTML() automatically completes missing tags, so the serialized
# tree may differ from the raw file content.
html = etree.HTML(temp_file)
print(html)

# Serialize the tree back to a string. The serialized text carries HTML
# entities for the Chinese characters, so unescape() is applied to turn them
# back into readable text before printing.
html2 = etree.tostring(html).decode()
print("已处理html实体编码的str:================================\n{}".format(unescape(html2)))

# Text of the "like" counters: <span> children of <a class="good"> inside a <div>.
# Alternatives worth knowing:
#   - match a class token instead of the exact attribute value:
#       //div/a[contains(@class,'good')]/span/text()
#   - union of two node sets, then step into each result:
#       (//a[@class='recmd-left multi'] | //a[@class='recmd-left video'])/img/@src
ret1 = html.xpath("//div/a[@class='good']/span/text()")
print(ret1)

# The href attribute of the same anchors.
ret2 = html.xpath("//div/a[@class='good']/@href")
print(ret2)
print("=" * 100)

# Grouping: when several fields are needed from one tag, select the elements
# first and then query each element relatively, collecting a list of dicts.
ret3 = html.xpath("//div[@class='fen']/a")
print(ret3)

result = []
for i in ret3:
    obj = {}
    # Relative queries against the current <a> element. xpath() always returns
    # a list, which is empty when the attribute/text is missing, so fall back
    # to None instead of indexing blindly.
    t1 = i.xpath("@href")
    t2 = i.xpath("@title")
    t3 = i.xpath("text()")
    obj["href"] = t1[0] if t1 else None
    obj["title"] = t2[0] if t2 else None
    obj["content"] = t3[0] if t3 else None
    result.append(obj)
print(result)
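# Source article: "Python-lxml-xpath库的用法" (usage of the Python lxml XPath library).
# Page note: "最新推荐文章于 2022-08-17 10:05:42 发布" (latest recommended article published 2022-08-17 10:05:42).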