import csv

import requests
from lxml import etree

# Browser-like UA so the site serves the normal category page.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
    )
}

# Best-seller listing for Dangdang category 01.05.06, sorted by sales.
# Bug fix: the original f-string had a leading space before "http://"
# (requests happens to strip it, but it was incorrect).
URL_TEMPLATE = (
    "http://category.dangdang.com/pg{page}"
    "-cp01.05.06.00.00.00-srsort_sale_amt_desc.html"
)

# CSV column order; keys must match the dicts built in parse_page().
FIELDNAMES = ["书名", "价格", "作者", "出版社", "评论", "介绍"]


def _first_text(node, xpath, default="NULL"):
    """Return the first stripped text match of *xpath* under *node*.

    Falls back to *default* when the xpath matches nothing — some listings
    are missing author/press/intro nodes, and the CSV columns must stay
    aligned regardless.
    """
    texts = node.xpath(xpath)
    return texts[0].strip() if texts else default


def parse_page(html_text):
    """Parse one category page's HTML and return a list of per-book dicts.

    Each dict has the keys in FIELDNAMES mapped to plain strings.
    (Bug fix: the original stored the raw one-element xpath result lists,
    which serialized as "['...']" in the CSV.)
    """
    tree = etree.HTML(html_text)
    # Absolute path to the <li> node of every book on the page.
    items = tree.xpath(
        '/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li'
    )
    books = []
    for li in items:
        books.append({
            "书名": _first_text(li, "./p[1]/a/text()"),
            "价格": _first_text(li, "./p[3]/span[1]/text()"),
            # Original used "空" (not "NULL") as the missing-author marker;
            # preserved for output compatibility.
            "作者": _first_text(li, "./p[5]/span[1]/a/text()", default="空"),
            "出版社": _first_text(li, "./p[5]/span[3]/a/text()"),
            "评论": _first_text(li, "./p[2]/a/text()"),
            "介绍": _first_text(li, "./p[2]/text()"),
        })
    return books


def main():
    """Crawl pages 1-99 of the category list and save all books to CSV."""
    book_list = []
    for page in range(1, 100):
        url = URL_TEMPLATE.format(page=page)
        response = requests.get(url=url, headers=HEADERS)
        book_list.extend(parse_page(response.text))
        print(f"第{page}页爬取完成!")

    # Bug fix: use a context manager so the file is flushed and closed even
    # if writing raises (the original left the handle open forever).
    with open("当当文集.csv", "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(book_list)


if __name__ == "__main__":
    main()
爬虫[当当(6000条数据)XPATH]
于 2024-08-02 00:08:26 首次发布