使用 Selenium 和 BeautifulSoup 爬取豆瓣阅读的书籍信息(封面地址、书名、作者、简介、书籍类型、字数、原价、折扣价),并将这些信息保存到 CSV 文件中。
# -*- coding: UTF-8 -*-
import csv
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
class DoubanSpider(object):
    """Scrape book listings from read.douban.com and store them in a CSV file.

    The spider drives a Chrome browser through the category listing, takes a
    snapshot of each page's HTML, extracts one row per book and appends the
    rows to ``csv_path``.

    Args:
        base_url: Listing page the browser opens first.
        csv_path: Destination CSV file (truncated at the start of ``run``).
        page_delay: Seconds to pause before reading a page and again after
            clicking "next page", giving the browser time to load.
    """

    # Column titles, in the same order as the fields built by _parse_book.
    CSV_HEADER = ["封面地址", "书名", "作者", "简介", "书籍类型", "字数", "原价", "折扣价"]

    # (strategy, value) locator for the "next page" button. "class name" is
    # the string value of selenium's By.CLASS_NAME; it is spelled out so no
    # extra import is needed. The generic find_elements(by, value) call
    # replaces find_element_by_class_name, which newer Selenium releases
    # no longer provide.
    NEXT_PAGE_LOCATOR = ("class name", "page-next")

    _WHITESPACE_RE = re.compile(r"\s")

    def __init__(self, base_url="https://read.douban.com/category/?kind=1",
                 csv_path="./豆瓣书籍信息.csv", page_delay=3.0):
        self.base_url = base_url
        self.csv_path = csv_path
        self.page_delay = page_delay
        # Updated by parse_html: False once a page without a next button is seen.
        self.has_next_page = True

    def get_driver(self):
        """Start Chrome, open ``base_url`` and return the driver.

        The browser is closed again if the initial navigation fails, so a
        failed start does not leave a stray Chrome process behind.
        """
        driver = webdriver.Chrome()
        try:
            # Configure the wait before any element lookup so it applies to it.
            driver.implicitly_wait(5)
            driver.get(self.base_url)
        except Exception:
            driver.quit()
            raise
        return driver

    def parse_html(self, driver):
        """Return the current page's HTML, then advance to the next page.

        When the page has no "next page" button the HTML is still returned
        and ``self.has_next_page`` is set to False instead of raising.

        Args:
            driver: Selenium WebDriver already showing a listing page.

        Returns:
            The page source captured before navigating away.
        """
        time.sleep(self.page_delay)
        html_code = driver.page_source

        # find_elements returns an empty list (no exception) when nothing matches.
        next_buttons = driver.find_elements(*self.NEXT_PAGE_LOCATOR)
        self.has_next_page = bool(next_buttons)
        if next_buttons:
            next_buttons[0].click()
            time.sleep(self.page_delay)
        return html_code

    def catch_book_info(self, html_content):
        """Extract every book on a listing page.

        Args:
            html_content: HTML source of one listing page.

        Returns:
            A list with one entry per book; each entry is a list of eight
            strings ordered like ``CSV_HEADER``. Fields that cannot be found
            in the markup are empty strings, so rows always stay aligned.
        """
        soup = BeautifulSoup(html_content, "lxml")
        return [
            self._parse_book(li_tag)
            for li_tag in soup.select("div.section-works>ul>li")
        ]

    @staticmethod
    def _first_text(node, selector):
        """Return the text of the first match of *selector* under *node*, or ""."""
        matches = node.select(selector)
        return matches[0].get_text() if matches else ""

    def _parse_book(self, li_tag):
        """Build the eight-field row for a single ``<li>`` book entry."""
        covers = li_tag.select("a.pic>img")
        img_url = (covers[0].get("src") or "") if covers else ""

        titles = li_tag.select("div.info>h4>a")
        book_name = (titles[0].get("title") or "") if titles else ""

        # Commas in the author field are turned into spaces, as before.
        author = self._first_text(li_tag, "div.author>a>span>span").replace(",", " ")

        # Strip all whitespace (including line breaks) from the blurb. Commas
        # are left alone: the CSV writer quotes fields that contain them.
        introduce = self._WHITESPACE_RE.sub(
            "", self._first_text(li_tag, "div.intro>span>span")
        )

        # Genre/category links, joined with "-".
        extra_boxes = li_tag.select("div.extra-info")
        type_links = extra_boxes[0].find_all("a") if extra_boxes else []
        type_info = "-".join(link.get_text() for link in type_links)

        # The word count is read from the third span of the sticky-info box.
        sticky_spans = li_tag.select("div.extra-info>div.sticky-info>span")
        if len(sticky_spans) > 2:
            word_num = sticky_spans[2].get_text().replace(",", "")
        else:
            word_num = ""

        before_sale, now_sale = self._parse_prices(li_tag)
        return [
            img_url,
            book_name,
            author,
            introduce,
            type_info,
            word_num,
            before_sale,
            now_sale,
        ]

    def _parse_prices(self, li_tag):
        """Return ``(original_price, discount_price)`` as strings.

        The original price is the struck-through ``<s>`` text when present,
        otherwise the price tag text without the currency sign. The discount
        price is "0" when the entry shows no separate discounted value.
        """
        action_children = li_tag.select("div.actions>div.actions-left>*")
        if not action_children:
            return "", "0"
        price_box = action_children[0]

        struck_through = price_box.select("s")
        if struck_through:
            before_sale = struck_through[0].get_text()
        else:
            before_sale = self._first_text(
                price_box, "span.sale>span.price-tag"
            ).replace("¥", "")

        discount_parts = price_box.select("span.sale>span.price-tag>span")
        now_sale = discount_parts[1].get_text() if len(discount_parts) > 1 else "0"
        return before_sale, now_sale

    def save_book_infos_by_csv(self, datas):
        """Append rows to the CSV file.

        Args:
            datas: Iterable of rows, each an iterable of field values.
        """
        # newline="" lets the csv module control line endings; the writer
        # quotes any field containing a comma, quote or line break.
        with open(self.csv_path, "a", encoding="utf-8", newline="") as csv_file:
            csv.writer(csv_file, lineterminator="\n").writerows(datas)

    def run(self):
        """Ask how many pages to scrape, then scrape them and save the rows.

        Raises:
            ValueError: If the entered page count is not an integer. This is
                raised before the CSV file or the browser is touched.
        """
        num = input("请输入要爬取豆瓣书籍的页数:")
        page_total = int(num)

        # Start a fresh file containing only the header row.
        with open(self.csv_path, "w", encoding="utf-8", newline="") as csv_file:
            csv.writer(csv_file, lineterminator="\n").writerow(self.CSV_HEADER)

        douban_driver = self.get_driver()
        try:
            for page_number in range(1, page_total + 1):
                html_content = self.parse_html(douban_driver)
                book_data = self.catch_book_info(html_content)
                self.save_book_infos_by_csv(book_data)
                print("正在爬取豆瓣网页的第%d页的数据...." % page_number)
                if not self.has_next_page:
                    # Last listing page reached; nothing further to visit.
                    break
        finally:
            # Always release the browser, even when scraping fails midway.
            douban_driver.quit()
        print("全部豆瓣书籍网页数据保存成功!!!!!")
def main():
    """Script entry point: create a DoubanSpider and start the interactive scrape."""
    DoubanSpider().run()


if __name__ == "__main__":
    main()
运行结果:
不足之处请指出。