import requests
from lxml import etree
import csv
# Collect one dict per book across all result pages.
book_list = []

# Request headers are loop-invariant, so build them once instead of per page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
}


def _first_or(nodes, fallback):
    """Return the xpath text result list, or [fallback] when it is empty.

    Keeps the original one-element-list cell shape so the CSV output
    format is unchanged.
    """
    return nodes if nodes else [fallback]


# Crawl best-seller result pages 1..99 of category 01.05.06.
for i in range(1, 100):
    # Bug fix: the original URL literal began with a stray space.
    url = f"http://category.dangdang.com/pg{i}-cp01.05.06.00.00.00-srsort_sale_amt_desc.html"
    response = requests.get(url=url, headers=headers)
    # NOTE(review): no HTTP status check — a non-200 page silently yields
    # zero <li> nodes below; confirm whether that best-effort is intended.
    page = etree.HTML(response.text)
    # One <li> node per book in the result list.
    books = page.xpath('/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li')
    for node in books:
        record = {
            "书名": node.xpath("./p[1]/a/text()"),
            "价格": node.xpath("./p[3]/span[1]/text()"),
            # The original fallback strings differ per field ("空" vs
            # "NULL"); preserved as-is since they end up in the CSV data.
            "作者": _first_or(node.xpath("./p[5]/span[1]/a/text()"), "空"),
            "出版社": _first_or(node.xpath("./p[5]/span[3]/a/text()"), "NULL"),
            "评论": node.xpath("./p[2]/a/text()"),
            "介绍": _first_or(node.xpath("./p[2]/text()"), "NULL"),
        }
        book_list.append(record)
    print(f"第{i}页爬取完成!")
# Persist the collected records as CSV.
# Bug fix: the original opened the file and never closed it, so the last
# buffered rows could be lost; "with" guarantees flush + close, even on error.
with open("当当文集.csv", "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile, fieldnames=["书名", "价格", "作者", "出版社", "评论", "介绍"]
    )
    writer.writeheader()
    writer.writerows(book_list)
