Python crawler case study: scraping Douban book information and saving it

  • Prerequisites
    • Basic use of the requests library (a minimal sketch follows this list)
    • Basic use of the BeautifulSoup library
    • The re library and simple regular expressions
    • The tqdm (progress bar) library
    • Creating a DataFrame with pandas and saving it as a CSV file
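
Before the full crawler, here is a minimal sketch of the requests + BeautifulSoup pattern everything below is built on. It reuses the tag URL and User-Agent from the full code; the printed h1 text is only an illustration of what a tag page returns.

import requests
from bs4 import BeautifulSoup

# Minimal request/parse sketch (same tag URL and User-Agent as the full code below).
# Douban tends to reject requests that do not carry a browser-like User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0"
}
resp = requests.get("https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T",
                    headers=headers)
soup = BeautifulSoup(resp.text, "html.parser")
# The <h1> of a tag page carries the category name, e.g. "豆瓣图书标签: 小说"
print(soup.find("h1").get_text(strip=True))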

The full code is below; the comments are fairly detailed.

from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from tqdm import tqdm

# Starting page: the "小说" (fiction) tag, first page of results
url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T"

class douban_crawler():
    send_headers = {
        "Host": "book.douban.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Connection": "close"
    }

    def __init__(self, url, pages):
        """
        :param url: the initial page to crawl; it determines the book category
        :param pages: number of result pages to crawl; Douban lists 20 books
                      per page, so this controls how much data is collected
        """
        self.url = url
        self.pages = [20*i for i in range(pages)]
        self.book_class = ""
        self.book_names = []
        self.book_nations = []
        self.book_writers = []
        self.book_scores = []
        self.book_comments = []
        self.book_sites = []
        self.book_pages = []

    def generate_urls(self):
        idx_urls = []
        # Regular expression that captures everything up to and including the "?"
        page_key = re.compile(r"(\S*\?)")
        # findall returns a list, so take the first (and only) match: the base part
        # of the URL shared by every result page
        page_main = page_key.findall(self.url)[0]
        # Douban shows 20 books per page and pages through results with the
        # "start" query parameter, so build one URL per requested page
        for i in self.pages:
            g_url = page_main + "start=" + str(i) + "&type=T"
            idx_urls.append(g_url)
        return idx_urls

    def open_url(self, url=None):
        # If no URL is given, open the initial page passed to the constructor
        if url is None:
            url = self.url
        # Send a GET request, take the text of the response, parse it with
        # BeautifulSoup and return the parsed document
        resp = requests.get(url, headers=self.send_headers)
        soup = BeautifulSoup(resp.text, "html.parser")
        return soup

    def get_main_info(self, url):
        """
        Collect the main information available on one tag listing page, without
        opening each book's own page. This includes: the category of the listing,
        the author's country, the book title, each book's detail URL, the author,
        the rating and the short description.
        :return: one list per piece of information
        """
        # Regular expressions for the category, country, author and description
        book_class_key = re.compile(r": (\D*)")
        book_nation_key = re.compile(r"\[(\D*?)\]")
        book_writer_key1 = re.compile(r"^(\D*?)/")
        book_writer_key2 = re.compile(r"](\D*)$")
        book_comment_key = re.compile(r"<p>(\S*)</p>")
        # Lists that hold the per-book information; the category is the same for
        # the whole page, so it does not need its own list
        book_names = []
        book_pages = []
        book_nations = []
        book_writers = []
        book_comments = []
        book_scores = []
        # To keep the method loosely coupled it handles a single page only;
        # begin_crawl() loops over all listing pages and calls it once per page.
        resp = requests.get(url, headers=self.send_headers)  # GET the listing page

        resp_text = resp.text  # text of the response

        soup = BeautifulSoup(resp_text, "html.parser")  # parse the HTML with BeautifulSoup
        # Category of the listing, taken from the <h1>, e.g. "豆瓣图书标签: 小说" -> "小说"
        book_class = soup.find("h1").get_text(strip=True)
        book_class = book_class_key.findall(book_class)
        # Book titles and detail-page URLs: actual book links carry a "title" attribute
        for a in soup.find_all("a"):
            try:
                # Title of the book
                res = a.get("title")
                # URL of the book's own page
                res_url = a.get("href")
                # Keep only <a> tags that have a title attribute
                if res is not None:
                    book_names.append(res)
                    book_pages.append(res_url)
            except:
                pass

        """
        获取书的作者和作者国籍,因为非中国籍的形式为[国家]作者,而中国籍作者在作者名前没有[]
        所以我们用两个正则表达式分别检索,但是少数作者即使不为中国籍,也没有加[],此类我把这类数据当作脏数据
        为了尽可能的修正这种数据带来的影响,设置判定条件为,没有[]且作者名小于五个字,为中国作者
        """
        for nation in soup.find_all("div", attrs={"class": "pub"}):
            nn = nation.get_text().strip()
            # print(nn)
            book_writer = book_writer_key1.findall(nn)[0]

            if ']' in book_writer:
                book_writers.append(book_writer_key2.findall(book_writer)[0].strip())
            else:
                book_writers.append(book_writer)

            try:
                bn = book_nation_key.findall(nn)
                if bn == [] and len(book_writer) < 5:  #中国籍作者的判定条件
                    book_nations.append("中")
                elif bn != []:
                    # print(bn)
                    book_nations.append(bn[0])
                else:
                    book_nations.append("日")
            except:
                book_nations.append("中")

        # Short description; books without one get a placeholder
        for comment in soup.find_all("div", attrs={"class": "info"}):
            if comment.find_all("p") == []:
                book_comments.append("无简介")
            else:
                book_comments.append(comment.find_all("p")[0].get_text())

        # Ratings (note: a book without a rating yet would shift this list out of
        # alignment with the others)
        for score in soup.find_all("span", attrs={"class": "rating_nums"}):
            book_scores.append(score.get_text())

        # The category applies to every book on the page, so repeat it once per book
        return book_names, book_pages, book_class*len(book_names), book_writers, book_nations, book_comments, book_scores

    def get_page_numbers(self, urls):
        """
        Collect data from each book's own page; currently only the page count.
        :param urls: list of book detail-page URLs produced by get_main_info
        :return: list of page counts, one per book
        """
        book_pagesnumber = []
        print("****Fetching page counts****")
        for url in tqdm(urls):
            rrr = requests.get(url, headers=self.send_headers)
            rtext = rrr.text
            in_soup = BeautifulSoup(rtext, 'html.parser')
            page_num = re.compile(r"页数: (\d*)").findall(in_soup.text)
            # Some books have no page-count information; record 0 for those
            if not page_num:
                book_pagesnumber.append(0)
            else:
                book_pagesnumber.extend(page_num)

        return book_pagesnumber

    def begin_crawl(self):
        """
        Entry point of the class: calling this single method runs the whole crawl.
        :return: all of the collected information lists
        """
        sum_book_names = []
        sum_book_urls = []
        sum_book_class = []
        sum_book_writers = []
        sum_book_nations = []
        sum_book_comments = []
        sum_book_scores = []
        sum_book_pages = []
        urls = self.generate_urls()  # URLs of all listing pages to crawl
        print("****Starting crawl****")
        for url in tqdm(urls):
            book_names, book_urls, book_class, book_writers, book_nations, book_comments, book_scores = self.get_main_info(url)
            book_pages = self.get_page_numbers(book_urls)

            sum_book_names.extend(book_names)
            sum_book_urls.extend(book_urls)
            sum_book_class.extend(book_class)
            sum_book_writers.extend(book_writers)
            sum_book_nations.extend(book_nations)
            sum_book_comments.extend(book_comments)
            sum_book_scores.extend(book_scores)
            sum_book_pages.extend(book_pages)

        return sum_book_names, sum_book_urls, sum_book_class, sum_book_writers, sum_book_nations, sum_book_comments, sum_book_scores, sum_book_pages

    def write2csv(self):
        """
        Write the crawl results to a CSV file.
        :return: None
        """
        name, url, book_class, writer, nation, comment, score, pages = self.begin_crawl()
        info_df = pd.DataFrame(columns=["name", "url", "class", "writer", "nation", "comment", "score", "pages"])
        info_df["name"] = name
        info_df["url"] = url
        info_df["class"] = book_class
        info_df["writer"] = writer
        info_df["nation"] = nation
        info_df["comment"] = comment
        info_df["score"] = score
        info_df["pages"] = pages

        # Name the file after the crawled category; keep the header row and drop
        # the meaningless integer index
        info_df.to_csv(f"{book_class[0]}.csv", index=False, encoding="utf_8_sig")


if __name__ == '__main__':
    # Crawl 5 listing pages (100 books) of the tag given by `url` and save them to CSV
    db_crawler = douban_crawler(url, 5)
    db_crawler.write2csv()
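
To see what generate_urls actually produces, here is a quick check that assumes the code above has been run (so douban_crawler and url are defined); the expected output in the comments follows from the URL regex and the 20-books-per-page step.

crawler = douban_crawler(url, 2)
for u in crawler.generate_urls():
    print(u)
# https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T
# https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T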

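The trickiest part of get_main_info is splitting the pub line into author and country. Below is a standalone sketch of that logic using made-up pub lines in the format the code expects (the sample strings are illustrative, not scraped data).

import re

book_nation_key = re.compile(r"\[(\D*?)\]")
book_writer_key1 = re.compile(r"^(\D*?)/")
book_writer_key2 = re.compile(r"](\D*)$")

for nn in ["[美] 卡勒德·胡赛尼 / 李继宏 / 上海人民出版社 / 29.00元",
           "余华 / 作家出版社 / 20.00元"]:
    writer = book_writer_key1.findall(nn)[0]                   # text before the first "/"
    if ']' in writer:
        writer = book_writer_key2.findall(writer)[0].strip()   # drop the "[国家]" prefix
    nation = book_nation_key.findall(nn)                       # ["美"], or [] for Chinese authors
    print(writer.strip(), nation)
# 卡勒德·胡赛尼 ['美']
# 余华 []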

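Once the crawl finishes, the result can be checked by loading the CSV back with pandas. With the example tag URL above the category should resolve to 小说, so the file should be 小说.csv; adjust the name if you crawl a different tag.

import pandas as pd

df = pd.read_csv("小说.csv", encoding="utf_8_sig")
print(df.shape)   # (number of books, 8)
print(df.head())  # columns: name, url, class, writer, nation, comment, score, pages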

Recommended resources:
How to use the requests library
How to use the BeautifulSoup library
About tqdm
Getting started with pandas
