Python-lxml-xpath库的用法

最新推荐文章于 2022-08-17 10:05:42 发布

WRY_

最新推荐文章于 2022-08-17 10:05:42 发布

阅读量301

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/qq_36871430/article/details/109511440

版权

python 专栏收录该内容

40 篇文章 1 订阅

订阅专栏

from lxml import etree
from html import unescape

"""
lxml 可以接收 bytes 与 str 类型的数据
"""

# Read the sample page inside a context manager so the file handle is closed
# as soon as the content has been read, instead of being left open.
with open("find_demo.html", "r", encoding="utf-8") as source_file:
    temp_file = source_file.read()

# An Element object is needed before any XPath query can be run.
# NOTE: building it with etree.HTML() automatically completes missing tags,
# so the resulting tree may contain more markup than the input file.
html = etree.HTML(temp_file)
print(html)

# Serialise the tree back to a str. The serialised markup contains HTML
# entities (e.g. for the Chinese text), so unescape() is applied before
# printing to turn them back into readable characters.
html2 = etree.tostring(html).decode()
print("已处理html实体编码的str:================================\n{}".format(unescape(html2)))

# Run both queries against <a class="good"> elements that are direct children
# of a <div>, then print the results in order.
#
# ret1: text of the <span> inside each such link (the "like" data).
#   Alternative predicate: //div/a[contains(@class,'good')]/span/text()
#   Several paths can be combined with the union operator, e.g.
#   (//a[@class='recmd-left multi'] | //a[@class='recmd-left video'])/img/@src
# ret2: the href attribute of the same links.
ret1 = html.xpath("//div/a[@class='good']/span/text()")
ret2 = html.xpath("//div/a[@class='good']/@href")

for query_result in (ret1, ret2):
    print(query_result)

# Visual separator before the next section of output.
print("=" * 100)

# Grouping: when several values are needed from the same tag, select the
# elements first and then build one dict per element, collected in a list.
ret3 = html.xpath("//div[@class='fen']/a")
print(ret3)

# Output key -> XPath evaluated relative to each <a> element, so "@href"
# reads that element's own attribute rather than searching the whole tree.
FIELD_XPATHS = (
    ("href", "@href"),
    ("title", "@title"),
    ("content", "text()"),
)


def _first_or_none(matches):
    """Return the first XPath match, or None when nothing matched."""
    return matches[0] if matches else None


# One record per matched element; a field with no match is stored as None.
result = [
    {key: _first_or_none(anchor.xpath(expr)) for key, expr in FIELD_XPATHS}
    for anchor in ret3
]

print(result)