Crawling Novel Resources

Taking a certain novel site as an example, the code is as follows:

# coding: utf-8
import requests
from bs4 import BeautifulSoup
import os
import time
import re
import random

# Current working directory; everything is downloaded under it
current_path = os.getcwd()

# Deduplication helper: list every folder name under the download root
def check_duplicated():
    book_path = os.path.join(current_path, '小说爬虫')
    # Collect the names of all sub-folders found while walking the tree
    book_path_list = []
    for root, dirs, files in os.walk(book_path):
        # dirs holds the folder names directly under the current root
        book_path_list.extend(dirs)
    return book_path_list


# Crawler for the Qishu (奇书网) site
def get_qishu_book():
    url = 'https://www.qishu.de/'

    # Fetch the home page (the site is GBK-encoded)
    result = requests.get(url)
    result.encoding = 'gbk'
    result = result.text
    soup = BeautifulSoup(result, 'html.parser')

    # Build the download directory and create it if it does not exist yet
    download_path = os.path.join(current_path, '小说爬虫', '奇书网')
    if not os.path.exists(download_path):
        os.makedirs(download_path)

    # Load the download history; create an empty file and list on the first run
    history_path = os.path.join(current_path, '小说爬虫', 'download_history.txt')
    if os.path.isfile(history_path):
        with open(history_path, 'r', encoding='utf-8') as f:
            exist_book_list = [line.strip() for line in f.readlines()]
    else:
        exist_book_list = []
        with open(history_path, 'w', encoding='utf-8'):
            pass

    # Find the category links in the page header
    class_div = soup.find('div', class_="wrap header")
    class_tag = class_div.find_all('a', href=True, attrs={"target": "_blank"})
    for class_information in class_tag:
        class_name = class_information.get_text()
        class_path = os.path.join(download_path, class_name)
        if not os.path.exists(class_path):
            os.makedirs(class_path)
        # page.txt remembers how far this category has already been crawled
        page_file = os.path.join(class_path, 'page.txt')
        if os.path.isfile(page_file):
            with open(page_file, 'r') as f:
                page = int(f.readlines()[0])
        else:
            with open(page_file, 'w'):
                pass
            page = 1

        class_link = f"https://www.qishu.de{class_information['href']}"
        # Fetch the first listing page of this category
        response = requests.get(class_link)
        response.encoding = 'gbk'
        response = response.text
        class_soup = BeautifulSoup(response, 'html.parser')
        # Read the total page count from the "尾页" (last page) link
        page_div = class_soup.find('div', class_="tspage")
        page_tag = page_div.find_all('a', string='尾页')
        # The page number is the first run of digits in the link's href
        page_total = int(re.findall(r"\d+", page_tag[0]['href'])[0])
        print(f'Start crawling category {class_name}, total pages: {page_total}...')
        while page <= page_total:
            print(f'Crawling page {page}....')
            # The first listing page has no index suffix in its URL
            if page == 1:
                class_link = f"https://www.qishu.de{class_information['href']}"
            else:
                class_link = f"https://www.qishu.de{class_information['href']}index_{page}.html"
            page += 1
            # Persist the next page number so an interrupted crawl can resume
            with open(page_file, 'w') as f:
                f.write(str(page))
            # Fetch every book listed on this page
            page_response = requests.get(class_link)
            page_response.encoding = 'gbk'
            page_response = page_response.text
            page_soup = BeautifulSoup(page_response, 'html.parser')
            page_content_div = page_soup.find('div', class_="list")
            content_tag = page_content_div.find_all('a')
            for book_content in content_tag:
                # Only links that contain a cover image point to a book page
                img = book_content.find('img')
                if img is not None:
                    book_name = book_content.get_text()
                    if book_name not in exist_book_list:
                        book_link = f"https://www.qishu.de{book_content['href']}"
                        # Open the book's detail page to find the download links
                        book_response = requests.get(book_link)
                        book_response.encoding = 'gbk'
                        book_response = book_response.text
                        book_soup = BeautifulSoup(book_response, 'html.parser')
                        book_content_div = book_soup.find('div', class_="showDown")
                        try:
                            book_tag = book_content_div.find_all('a', string='Txt格式下载')[0]
                            book_download_link = book_tag['href']
                            result_response = requests.get(book_download_link)
                            save_path = os.path.join(class_path, book_name + '.txt')
                            if result_response.status_code == 200:
                                with open(save_path, 'wb') as f:
                                    f.write(result_response.content)
                                # Record the successful download
                                with open(history_path, 'a', encoding='utf-8') as f:
                                    f.write(book_name)
                                    f.write('\n')
                                # Sleep a random interval between downloads
                                sleep_time = random.uniform(1, 5)
                                time.sleep(sleep_time)
                            # No txt download available, fall back to the epub version
                            elif result_response.status_code == 502:
                                book_tag = book_content_div.find_all('a', string='Epub格式下载')[0]
                                book_download_link = book_tag['href']
                                result_response = requests.get(book_download_link)
                                save_path = os.path.join(class_path, book_name + '.epub')
                                if result_response.status_code == 200:
                                    with open(save_path, 'wb') as f:
                                        f.write(result_response.content)
                                    # Record the successful download
                                    with open(history_path, 'a', encoding='utf-8') as f:
                                        f.write(book_name)
                                        f.write('\n')
                                    # Sleep a random interval between downloads
                                    sleep_time = random.uniform(1, 5)
                                    time.sleep(sleep_time)
                            else:
                                print(f'Failed to download {book_name}, code: {result_response.status_code}')
                                with open(os.path.join(class_path, 'except.txt'), 'a') as f:
                                    f.write(f'Failed to download {book_name}, code: {result_response.status_code}')
                                    f.write('\n')

                        except Exception as e:
                            print(f"Caught exception: {e}, book: {book_name}, page: {page-1}, category: {class_name}")
                            # Log the failure for this category
                            with open(os.path.join(class_path, 'except.txt'), 'a') as f:
                                f.write(f"Caught exception: {e}, book: {book_name}, page: {page-1}, category: {class_name}")
                                f.write('\n')

while True:
    try:
        get_qishu_book()
    except Exception:
        # The connection was dropped; wait two minutes and try again
        time.sleep(120)
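
One note on the outer loop: it restarts the whole crawl and sleeps two minutes whenever anything goes wrong. A lighter-weight complement (a minimal sketch, not part of the original script) is to reuse a single requests.Session with a browser-like User-Agent and automatic retries with backoff, then call session.get(...) wherever the code above calls requests.get(...). The make_session helper and the numbers below are illustrative assumptions:

# A minimal sketch, assuming only requests and urllib3 (installed together with requests)
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    session = requests.Session()
    # A generic browser-like User-Agent (illustrative value)
    session.headers.update({'User-Agent': 'Mozilla/5.0'})
    # Retry transient failures with exponential backoff; 502 is left out on purpose,
    # since the crawler above uses a 502 on the txt link as the cue to try the epub.
    retry = Retry(total=3, backoff_factor=2, status_forcelist=[500, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    return session

# Usage:
# session = make_session()
# result = session.get('https://www.qishu.de/', timeout=30)

This keeps the two-minute restart loop as a last resort while letting individual requests recover on their own.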




