Selenium+BeautifulSoup爬取豆瓣阅读书籍

最新推荐文章于 2023-04-07 21:37:10 发布

CJ.Williams

最新推荐文章于 2023-04-07 21:37:10 发布

阅读量384

点赞数 3

分类专栏： Python 文章标签： selenium 爬虫 python

本文链接：https://blog.csdn.net/weixin_45179605/article/details/120512074

版权

Python 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

使用 Selenium 驱动浏览器翻页，并用 BeautifulSoup 解析页面源码，爬取豆瓣阅读书籍的封面地址、书名、作者、简介、书籍类型、字数、原价、折扣价等信息，并将这些信息保存到 csv 文件中。

# -*- coding: UTF-8 -*-

import csv
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By


class DoubanSpider(object):
    """Scrape book listings from Douban Read and store them in a CSV file.

    The spider drives a Chrome browser with Selenium to page through the
    category listing, parses each page's HTML with BeautifulSoup and
    appends one CSV row per book.
    """

    # Output file and its header row; the column order matches the rows
    # produced by ``catch_book_info``.
    CSV_PATH = "./豆瓣书籍信息.csv"
    CSV_HEADER = ["封面地址", "书名", "作者", "简介", "书籍类型", "字数", "原价", "折扣价"]

    def __init__(self):
        self.base_url = "https://read.douban.com/category/?kind=1"

    def get_driver(self):
        """Start Chrome, open the listing page and return the driver."""
        driver = webdriver.Chrome()
        driver.get(self.base_url)
        return driver

    def parse_html(self, driver):
        """Return the current page's HTML, then advance to the next page.

        Args:
            driver: A Selenium WebDriver positioned on a listing page.

        Returns:
            The page source captured *before* the "next page" click.
        """
        # Fixed pauses give the dynamically rendered list time to load.
        time.sleep(3)
        html_code = driver.page_source
        time.sleep(2)
        # Set the implicit wait before the lookup so it applies to it.
        driver.implicitly_wait(5)
        # ``find_element(By...)`` works on Selenium 3 and 4; the old
        # ``find_element_by_class_name`` helper was removed in Selenium 4.3.
        # NOTE(review): if the last page has no "page-next" element this
        # raises and that page's HTML is lost -- confirm against the site.
        next_page = driver.find_element(By.CLASS_NAME, "page-next")
        next_page.click()
        time.sleep(3)
        return html_code

    def catch_book_info(self, html_content):
        """Extract one record per book from a listing page.

        Args:
            html_content: HTML source of a listing page.

        Returns:
            A list of rows. Each row is a list of eight values in the
            order of ``CSV_HEADER``: cover URL, title, author, introduction,
            categories joined by "-", word count, original price and
            discounted price ("0" when no discounted price is found).

        Raises:
            IndexError: If a list item lacks one of the mandatory elements
                (title, author, introduction, extra info, price).
        """
        soup = BeautifulSoup(html_content, "lxml")
        book_info_list = []
        for li_tag_element in soup.select("div.section-works>ul>li"):
            # Cover URL. Always emit a value so that a book without a cover
            # does not shift every following column to the left.
            cover_tags = li_tag_element.select("a.pic>img")
            img_url = (cover_tags[0].get("src") or "") if cover_tags else ""

            # Title (empty string when the attribute is missing).
            title_tag = li_tag_element.select("div.info>h4>a")[0]
            book_name = title_tag.get("title") or ""

            # Author, with ASCII commas replaced by spaces.
            author_tag = li_tag_element.select("div.author>a>span>span")[0]
            author = author_tag.get_text().replace(",", " ")

            # Introduction: drop all whitespace (including line breaks) and
            # turn ASCII commas into full-width ones.
            intro_tag = li_tag_element.select("div.intro>span>span")[0]
            introduce = re.sub(r"\s", "", intro_tag.get_text()).replace(",", "，")

            # Categories: text of every link inside the extra-info box.
            type_links = li_tag_element.select("div.extra-info")[0].find_all("a")
            type_info = "-".join(link.get_text() for link in type_links)

            # Word count: third span of the sticky-info box, without
            # thousands separators.
            info_spans = li_tag_element.select("div.extra-info>div.sticky-info>span")
            word_num = info_spans[2].get_text().replace(",", "")

            # Prices live in the first child of the left action area.
            price_box = li_tag_element.select("div.actions>div.actions-left>*")[0]
            struck_prices = price_box.select("s")
            if struck_prices:
                # A struck-through price is the original price.
                before_sale = struck_prices[0].get_text()
            else:
                price_tag = price_box.select("span.sale>span.price-tag")[0]
                before_sale = price_tag.get_text().replace("￥", "")
            price_parts = price_box.select("span.sale>span.price-tag>span")
            now_sale = price_parts[1].get_text() if len(price_parts) > 1 else 0

            book_info_list.append([
                img_url,
                book_name,
                author,
                introduce,
                type_info,
                word_num,
                str(before_sale),
                str(now_sale),
            ])
        return book_info_list

    def _write_rows(self, rows, mode):
        """Write ``rows`` to the CSV file opened with the given ``mode``."""
        # ``csv.writer`` quotes fields containing commas, quotes or line
        # breaks, so a title such as "A, B" can no longer break the columns.
        # ``newline=""`` is what the csv module expects for file objects;
        # "\n" keeps the line ending the file was previously written with.
        with open(self.CSV_PATH, mode, encoding="utf-8", newline="") as csv_file:
            csv.writer(csv_file, lineterminator="\n").writerows(rows)

    def save_book_infos_by_csv(self, datas):
        """Append the given book rows to the CSV file.

        Args:
            datas: Iterable of rows as returned by ``catch_book_info``.
        """
        self._write_rows(datas, "a")

    def run(self):
        """Ask for a page count, then scrape and save that many pages."""
        # Start from a fresh file that contains only the header row.
        self._write_rows([self.CSV_HEADER], "w")
        douban_driver = self.get_driver()
        try:
            num = input("请输入要爬取豆瓣书籍的页数：")
            for index in range(1, int(num) + 1):
                html_content = self.parse_html(douban_driver)
                book_data = self.catch_book_info(html_content)
                self.save_book_infos_by_csv(book_data)
                print("正在爬取豆瓣网页的第%d页的数据...." % index)
            print("全部豆瓣书籍网页数据保存成功!!!!!")
        finally:
            # Close the browser even when scraping fails half-way.
            douban_driver.quit()


def main():
    """Create a spider and run the interactive crawl."""
    DoubanSpider().run()


# Start crawling only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

运行结果：