Taking a novel site (qishu.de) as an example, the code is as follows:
# coding: utf-8
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup

# Current working directory; everything is downloaded beneath it
current_path = os.getcwd()
# Deduplication helper: collect the names of all folders already created
# under 小说爬虫 (defined for reference; the script below relies on
# download_history.txt instead)
def check_duplicated():
    book_path = os.path.join(current_path, '小说爬虫')
    # Accumulate every folder name found under book_path
    book_path_list = []
    for root, dirs, files in os.walk(book_path):
        # dirs lists the folder names directly under the current root
        book_path_list.extend(dirs)
    return book_path_list
# Crawler for qishu.de
def get_qishu_book():
    url = 'https://www.qishu.de/'
    # Fetch the home page
    result = requests.get(url)
    result.encoding = 'gbk'
    soup = BeautifulSoup(result.text, 'html.parser')
    # Download root directory
    download_path = os.path.join(current_path, '小说爬虫', '奇书网')
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # Load the download history; create the file if it does not exist yet
    history_path = os.path.join(current_path, '小说爬虫', 'download_history.txt')
    exist_book_list = []
    if os.path.isfile(history_path):
        with open(history_path, 'r', encoding='utf-8') as f:
            exist_book_list = [line.strip() for line in f.readlines()]
    else:
        with open(history_path, 'w', encoding='utf-8') as f:
            pass
    # Category links in the site header
    class_div = soup.find('div', class_="wrap header")
    class_tag = class_div.find_all('a', href=True, attrs={"target": "_blank"})
    for class_information in class_tag:
        class_name = class_information.get_text()
        class_path = os.path.join(download_path, class_name)
        if not os.path.exists(class_path):
            os.makedirs(class_path)
        # page.txt records crawl progress; resume from it if it exists
        page_file = os.path.join(class_path, 'page.txt')
        if os.path.isfile(page_file):
            with open(page_file, 'r') as f:
                page = int(f.readlines()[0])
        else:
            with open(page_file, 'w') as f:
                page = 1
        class_link = f"https://www.qishu.de{class_information['href']}"
        # Fetch the first listing page of this category
        response = requests.get(class_link)
        response.encoding = 'gbk'
        class_soup = BeautifulSoup(response.text, 'html.parser')
        # Total page count: take the digits from the "尾页" (last page) link
        page_div = class_soup.find('div', class_="tspage")
        page_tag = page_div.find_all('a', string='尾页')
        page_total = int(re.findall(r"\d+", page_tag[0]['href'])[0])
        print(f'Crawling category {class_name}, {page_total} pages in total...')
        while page <= page_total:
            print(f'Crawling page {page}...')
            # Page 1 uses the plain category URL; later pages use index_{page}.html
            if page == 1:
                class_link = f"https://www.qishu.de{class_information['href']}"
            else:
                class_link = f"https://www.qishu.de{class_information['href']}index_{page}.html"
            page += 1
            # Persist progress so an interrupted run resumes where it stopped
            with open(page_file, 'w') as f:
                f.write(str(page))
            # Collect all books on this page
            page_response = requests.get(class_link)
            page_response.encoding = 'gbk'
            page_soup = BeautifulSoup(page_response.text, 'html.parser')
            page_content_div = page_soup.find('div', class_="list")
            content_tag = page_content_div.find_all('a')
            for book_content in content_tag:
                # Only anchors wrapping a cover image are book entries
                img = book_content.find('img')
                if img is not None:
                    book_name = book_content.get_text()
                    if book_name not in exist_book_list:
                        book_link = f"https://www.qishu.de{book_content['href']}"
                        # Fetch the book's detail page
                        book_response = requests.get(book_link)
                        book_response.encoding = 'gbk'
                        book_soup = BeautifulSoup(book_response.text, 'html.parser')
                        book_content_div = book_soup.find('div', class_="showDown")
                        try:
                            # Prefer the txt download
                            book_tag = book_content_div.find_all('a', string='Txt格式下载')[0]
                            book_download_link = book_tag['href']
                            result_response = requests.get(book_download_link)
                            save_path = os.path.join(class_path, book_name + '.txt')
                            if result_response.status_code == 200:
                                with open(save_path, 'wb') as f:
                                    f.write(result_response.content)
                                # Record the download
                                with open(history_path, 'a', encoding='utf-8') as f:
                                    f.write(book_name + '\n')
                                # Random sleep to avoid hammering the server
                                time.sleep(random.uniform(1, 5))
                            elif result_response.status_code == 502:
                                # No txt available; fall back to the epub download
                                book_tag = book_content_div.find_all('a', string='Epub格式下载')[0]
                                book_download_link = book_tag['href']
                                result_response = requests.get(book_download_link)
                                save_path = os.path.join(class_path, book_name + '.epub')
                                if result_response.status_code == 200:
                                    with open(save_path, 'wb') as f:
                                        f.write(result_response.content)
                                    # Record the download
                                    with open(history_path, 'a', encoding='utf-8') as f:
                                        f.write(book_name + '\n')
                                    # Random sleep to avoid hammering the server
                                    time.sleep(random.uniform(1, 5))
                            else:
                                print(f'{book_name} download failed, code: {result_response.status_code}')
                                with open(os.path.join(class_path, 'except.txt'), 'a') as f:
                                    f.write(f'{book_name} download failed, code: {result_response.status_code}\n')
                        except Exception as e:
                            print(f'Caught exception: {e}, book: {book_name}, page: {page - 1}, category: {class_name}')
                            # Log the failure and move on to the next book
                            with open(os.path.join(class_path, 'except.txt'), 'a') as f:
                                f.write(f'Caught exception: {e}, book: {book_name}, page: {page - 1}, category: {class_name}\n')
while True:
    try:
        get_qishu_book()
    except Exception:
        # Connection dropped; sleep two minutes, then resume from page.txt
        time.sleep(120)
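
A possible hardening, sketched below under the assumption that the site accepts a generic browser User-Agent: instead of letting the outer loop restart the whole crawl on every transient error, requests can retry 5xx responses at the HTTP layer. make_session is a hypothetical helper, not part of the original script, and 502 is deliberately left out of the retry list because the crawler above treats it as the signal that no txt download exists.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session() -> requests.Session:
    # Hypothetical helper: retry GETs up to 3 times with exponential backoff
    # on 503/504; 502 is excluded because the crawler above interprets it
    # as "txt not offered, fall back to epub"
    retry = Retry(total=3, backoff_factor=2, status_forcelist=[503, 504])
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retry))
    # Some sites reject the default python-requests User-Agent
    session.headers.update({'User-Agent': 'Mozilla/5.0'})
    return session

Replacing each requests.get(...) call in get_qishu_book with session.get(...) would also reuse one TCP connection per host instead of opening a new one for every request.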