Python爬虫入门:XPath

本文介绍了在Python爬虫中使用XPath时,如何通过索引选取特定位置的节点,例如第二个或最后一个匹配的节点。这种方法对于处理网页解析和数据提取非常有用。
摘要由CSDN通过智能技术生成
import requests
import csv
import os
from lxml import etree


def getHtml(name, page, timeout=10):
    """Download one JD search-result page and return its HTML text.

    Args:
        name: Search keyword, sent as the ``keyword`` query parameter.
        page: Page number, sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The decoded response body, or ``None`` when the request failed
        (the error is printed).
    """
    url = "https://search.jd.com/Search?"
    params = {"keyword": name, "enc": "utf-8", "page": page}
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
    try:
        res = requests.get(url=url, params=params, headers=headers,
                           timeout=timeout)
    except requests.RequestException as e:
        # Network-level failure (DNS, connection, timeout, ...): report it and
        # signal the failure to the caller explicitly.
        print(e)
        return None

    # Let requests guess the charset from the body instead of trusting the
    # response headers.
    res.encoding = res.apparent_encoding
    return res.text


def getdiv(html):
    """Parse a search-result page and return its product container nodes.

    Args:
        html: Page markup as a string. May be ``None`` or empty, e.g. when
            the download failed.

    Returns:
        A list of the elements matching ``//div[@class='gl-i-wrap']``; an
        empty list when there is nothing to parse.
    """
    if not html:
        # Nothing was downloaded; the parser would raise on None / "".
        return []
    tree = etree.HTML(html)  # parse the page into an element tree
    if tree is None:
        # Defensive: treat "no document could be built" as "no products".
        return []
    return tree.xpath("//div[@class='gl-i-wrap']")


def _extract_item(node):
    """Return the CSV row for one product node, using "" for missing fields.

    Also echoes the raw title fragments and the review-link elements to
    stdout, as the crawl progress output.
    """
    # Start from a fully populated row so a product that lacks a field can
    # neither raise nor inherit the previous product's value.
    row = {"商品名称": "", "描述": "", "价格": "", "店铺": "", "图片链接": ""}

    titles = node.xpath('div[@class="p-name p-name-type-2"]//em/text()')
    print(titles)
    if titles:
        # The last text fragment is used. The text before the first space is
        # the name, the rest is the description; a title without any space is
        # kept whole as the name.
        name, _, description = titles[-1].partition(" ")
        row["商品名称"] = name
        row["描述"] = description

    prices = node.xpath('div[@class="p-price"]//i/text()')
    shops = node.xpath('div[@class="p-shop"]//a/text()')
    pairs = list(zip(prices, shops))
    if pairs:
        # Prices and shops are paired positionally; the last pair is used.
        row["价格"], row["店铺"] = pairs[-1]

    print("评论数:", node.xpath('div[@class="p-commit"]//a'), "条评价")

    imgs = node.xpath('div[@class="p-img"]//img[@source-data-lazy-img]')
    if imgs:
        row["图片链接"] = imgs[-1].get("source-data-lazy-img")
    return row


def getTitle(divs):
    """Extract every product in *divs* and append one row each to ``jd.csv``.

    Args:
        divs: Product container nodes; each must support ``xpath()`` with
            paths relative to the node.

    The header row is written only when ``jd.csv`` is empty. No file is
    created when *divs* is empty.
    """
    if not divs:
        return

    title = ["商品名称", "描述", "价格", "店铺", "图片链接"]
    # Open the file once for the whole batch instead of once per product.
    with open("jd.csv", "a", encoding="utf8", newline="") as f:
        writer = csv.DictWriter(f, title)
        if os.path.getsize('jd.csv') == 0:
            writer.writeheader()
        for node in divs:
            print("-------------------------------------------------------------")
            writer.writerow(_extract_item(node))


if __name__ == "__main__":
    # Crawl search-result pages 1..99 for the keyword and hand each page's
    # product nodes to getTitle.
    for page in range(1, 100):
        html = getHtml("宠物", str(page))
        if not html:
            # getHtml yields None when the request failed; skip this page
            # instead of letting the parser raise and abort the whole crawl.
            continue
        getTitle(getdiv(html))

 

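| 运算符 | 描述 | 实例 | 返回值 |
| --- | --- | --- | --- |
| or | 或 | price=9.80 or price=9.70 | 如果 price 是 9.80,则返回 true;如果 price 是 9.50,则返回 false |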

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值