Python从多个网站爬虫下载小说

最新推荐文章于 2022-08-29 08:00:00 发布

置顶 Q tea

最新推荐文章于 2022-08-29 08:00:00 发布

阅读量533

点赞数 1

分类专栏： python pyqt Qt

本文链接：https://blog.csdn.net/scz653037148/article/details/103448337

版权

Qt 同时被 3 个专栏收录

9 篇文章 0 订阅

订阅专栏

python

3 篇文章 0 订阅

订阅专栏

pyqt

2 篇文章 0 订阅

订阅专栏

前言：

现在很多小说可以在线阅读，但总是有各种弹窗广告，而且不能直接下载 txt；资源不好找，下载下来还经常乱码一大堆……咳咳，不吐槽了。所以就想到直接用爬虫爬取小说内容。

过程：

首先看了几个网站的小说，发现不同网站的规则并不一样，想要一套规则用到老很难。试着从几个网站找了几个例子通过对比发现了几个主要的信息。

1：文字编码；有的网站用 GBK，有的网站用 UTF-8，要想爬取到的内容不乱码，就必须按对应的编码解码。

2：小说存储格式；如果单独一个网站它的格式都比较统一，但是不同网站架构都有一些细微的差别，或者变量名的不同。

那么我的爬取方式是解析大块的信息。其中会包含一些多余信息，我直接从前后切片截取去除了多余信息，而不是用数据清洗的方式。

3：下载方式；在线阅读时大多一次只能读取一章，那么怎样才能下载整本书呢？为了适应多个网站，这里用了一个笨办法：直接读取网页内“下一章”的链接，逐章抓取，直到找不到下一章为止。

实现：

根据规则可以过滤一些广告等，下载纯净的小说。nice

代码：

# -*- coding: utf-8 -*-

"""
Module implementing book_load.
"""
import sys
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
import requests
from bs4 import BeautifulSoup
import random

from PyQt5.QtCore import pyqtSlot
from PyQt5.QtWidgets import QMainWindow

from Ui_小说下载 import Ui_MainWindow


# Request headers sent with every page download: a desktop-browser
# User-Agent string is supplied instead of the requests default.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}





class book_load(QMainWindow, Ui_MainWindow):
    """
    Main window of the novel downloader.

    The scraping rules are read from the form widgets each time a button
    is pressed:

    * ``lineEdit``    -- text encoding used to decode the page
    * ``lineEdit_2``  -- prefix prepended to the "next chapter" href
    * ``lineEdit_3``  -- URL of the first chapter
    * ``lineEdit_4``  -- exact text of the "next chapter" link
    * ``lineEdit_5``  -- output field showing the chapter title
    * ``lineEdit_6``  -- ``id`` of the element holding the chapter body
    * ``spinBox`` / ``spinBox_2`` -- number of characters trimmed from the
      start / end of the chapter body
    * ``checkBox``    -- when checked, chapter titles are written to the file
    * ``textEdit``    -- output field showing the chapter body
    """

    # Seconds to wait for a server before giving up; without a timeout a
    # stalled connection would block the GUI indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, parent=None):
        """
        Constructor

        @param parent reference to the parent widget
        @type QWidget
        """
        super(book_load, self).__init__(parent)
        self.setupUi(self)

    def _fetch_soup(self, url):
        """
        Download ``url`` and parse it with the encoding from ``lineEdit``.

        @param url address of the page to download
        @return the parsed document (BeautifulSoup object)
        @raise requests.RequestException on any network or URL error
        """
        res = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Force the user-selected encoding before the body is decoded.
        res.encoding = self.lineEdit.text()
        return BeautifulSoup(res.text, 'html.parser')

    def _chapter_title(self, soup):
        """Return the text of the first ``<h1>`` element, or None if absent."""
        heading = soup.find('h1')
        return None if heading is None else heading.text

    def _chapter_body(self, soup):
        """
        Return the trimmed chapter text, or None if the content element
        (looked up by the id in ``lineEdit_6``) is not on the page.
        """
        container = soup.find(id=self.lineEdit_6.text())
        if container is None:
            return None
        # One padding character is appended so that the "-1" in the slice
        # removes only that padding when spinBox_2 is 0.
        padded = container.text + ' '
        return padded[self.spinBox.value():-1 - self.spinBox_2.value()]

    def _next_href(self, soup):
        """
        Return the ``href`` of the element wrapping the "next chapter" text
        (``lineEdit_4``), or None when the text or the attribute is missing.
        """
        label = soup.find(text=self.lineEdit_4.text())
        if label is None or label.parent is None:
            return None
        return label.parent.get('href')

    @pyqtSlot()
    def on_pushButton_clicked(self):
        """
        Preview a single chapter: fetch the start URL, show title and body
        in the form, and print the body and the next-chapter href.
        """
        # Clear the output fields.
        self.lineEdit_5.setText('')
        self.textEdit.setText('')

        try:
            soup = self._fetch_soup(self.lineEdit_3.text())
        except requests.RequestException as exc:
            print('请求失败:', exc)
            return

        # Chapter title. A missing element is reported instead of raising,
        # so the remaining rules can still be checked.
        title = self._chapter_title(soup)
        if title is None:
            print('未找到章节标题')
        else:
            self.lineEdit_5.setText(title)

        str_book = self._chapter_body(soup)
        if str_book is None:
            print('未找到正文内容')
        else:
            print(str_book)
            self.textEdit.setText(str_book)

        href = self._next_href(soup)
        if href is None:
            print('下一章的url错误')
        else:
            print(href)

    @pyqtSlot()
    def on_pushButton_2_clicked(self):
        """
        Download the whole book: ask for a target file, then follow the
        "next chapter" links from the start URL, appending every chapter
        to the file until a page no longer matches the rules.
        """
        fileName2, _selected_filter = QFileDialog.getSaveFileName(
            self,
            "文件保存",
            "",
            ";小说名 (*.txt)")

        print(fileName2)
        if not fileName2:
            # The dialog was cancelled; there is no file to write to.
            return

        # Clear the output fields.
        self.lineEdit_5.setText('')
        self.textEdit.setText('')

        url = self.lineEdit_3.text()
        visited = set()

        try:
            # "with" guarantees the file is closed on every exit path.
            with open(fileName2, 'w', encoding='utf-8') as file:
                while True:
                    # A "next" link pointing at an already downloaded page
                    # would otherwise loop forever.
                    if url in visited:
                        print('结束')
                        break
                    visited.add(url)

                    try:
                        soup = self._fetch_soup(url)
                    except requests.RequestException as exc:
                        print('请求失败:', exc)
                        break

                    title = self._chapter_title(soup)
                    if title is not None:
                        self.lineEdit_5.setText(title)
                    str_book = self._chapter_body(soup)
                    if title is None or str_book is None:
                        # Not a chapter page (e.g. the index after the last
                        # chapter): stop without writing a partial entry.
                        print('结束')
                        break

                    if self.checkBox.isChecked():
                        file.write(title)
                        file.write('\n')
                    self.textEdit.setText(str_book)
                    file.write(str_book)
                    file.write('\n')

                    href = self._next_href(soup)
                    if href is None:
                        print('结束')
                        break
                    if href.startswith(('http://', 'https://')):
                        # Already absolute: prepending the prefix would
                        # produce an invalid URL.
                        url = href
                    else:
                        url = self.lineEdit_2.text() + href
        except OSError as exc:
            # The file could not be opened or written.
            print('文件写入失败:', exc)
if __name__ == "__main__":
    # Script entry point: create the Qt application, show the downloader
    # window and hand control to the event loop; its return value becomes
    # the process exit status.
    qt_app = QtWidgets.QApplication(sys.argv)
    main_window = book_load()
    main_window.show()
    exit_status = qt_app.exec_()
    sys.exit(exit_status)