import csv

import requests
from lxml import etree

# Browser-like UA so the site serves the normal category page.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
    )
}

# Best-seller listing for Dangdang category 01.05.06, sorted by sales.
# Bug fix: the original f-string had a leading space before "http://"
# (requests happens to strip it, but it was incorrect).
URL_TEMPLATE = (
    "http://category.dangdang.com/pg{page}"
    "-cp01.05.06.00.00.00-srsort_sale_amt_desc.html"
)

# CSV column order; keys must match the dicts built in parse_page().
FIELDNAMES = ["书名", "价格", "作者", "出版社", "评论", "介绍"]


def _first_text(node, xpath, default="NULL"):
    """Return the first stripped text match of *xpath* under *node*.

    Falls back to *default* when the xpath matches nothing — some listings
    are missing author/press/intro nodes, and the CSV columns must stay
    aligned regardless.
    """
    texts = node.xpath(xpath)
    return texts[0].strip() if texts else default


def parse_page(html_text):
    """Parse one category page's HTML and return a list of per-book dicts.

    Each dict has the keys in FIELDNAMES mapped to plain strings.
    (Bug fix: the original stored the raw one-element xpath result lists,
    which serialized as "['...']" in the CSV.)
    """
    tree = etree.HTML(html_text)
    # Absolute path to the <li> node of every book on the page.
    items = tree.xpath(
        '/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li'
    )
    books = []
    for li in items:
        books.append({
            "书名": _first_text(li, "./p[1]/a/text()"),
            "价格": _first_text(li, "./p[3]/span[1]/text()"),
            # Original used "空" (not "NULL") as the missing-author marker;
            # preserved for output compatibility.
            "作者": _first_text(li, "./p[5]/span[1]/a/text()", default="空"),
            "出版社": _first_text(li, "./p[5]/span[3]/a/text()"),
            "评论": _first_text(li, "./p[2]/a/text()"),
            "介绍": _first_text(li, "./p[2]/text()"),
        })
    return books


def main():
    """Crawl pages 1-99 of the category list and save all books to CSV."""
    book_list = []
    for page in range(1, 100):
        url = URL_TEMPLATE.format(page=page)
        response = requests.get(url=url, headers=HEADERS)
        book_list.extend(parse_page(response.text))
        print(f"第{page}页爬取完成!")

    # Bug fix: use a context manager so the file is flushed and closed even
    # if writing raises (the original left the handle open forever).
    with open("当当文集.csv", "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(book_list)


if __name__ == "__main__":
    main()
爬虫[当当(6000条数据)XPATH]
于 2024-08-02 00:08:26 首次发布