简易小说爬虫(带有GUI界面)

简易小说爬虫(带有GUI界面)

效果:

在这里插入图片描述
在这里插入图片描述

特点:
  • 可实现简单的小说搜索功能
  • 可选择下载目录
  • 下载进度可视化
  • 多线程下载
代码部分:

python部分:

import random
from threading import Thread
from urllib.parse import quote

from PyQt5.QtCore import QThread, pyqtSignal, QFile, Qt
from PyQt5.QtGui import QIcon, QPalette, QBrush, QPixmap
from PyQt5.QtWidgets import QGridLayout, QLabel, QLineEdit, QPushButton, QListWidget, QProgressBar, QMessageBox, \
    QApplication, QFileDialog, QWidget
from bs4 import BeautifulSoup
import requests
import win # 引入qrc资源文件,代码在后面
from lxml import etree
import sys


def dataGet(url):
    """Fetch the HTML source of *url* with a randomized User-Agent.

    Retries up to 4 times with a 4-second timeout per attempt.
    Returns the GBK-decoded response text, or None if every attempt fails.
    """

    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        # BUG FIX: this UA string was missing its closing parenthesis.
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    ]
    headers = {'User-Agent': random.choice(user_agent_list)}
    # Up to 4 attempts; each attempt aborts after a 4-second timeout.
    for _ in range(4):
        try:
            # BUG FIX: headers must be passed by keyword. The second
            # positional argument of requests.get is `params`, so the
            # original call never actually sent the User-Agent header.
            response = requests.get(url, headers=headers, timeout=4)
            response.encoding = 'gbk'  # target site serves GBK-encoded pages
            return response.text
        except requests.exceptions.RequestException:
            continue
    return None  # all retries exhausted; caller must handle None


def novelSearch(data):
    """Extract novel entries from the search-results page HTML.

    Each result row is an <li> whose spans s1..s7 hold category, title,
    latest chapter, author, update time and status; the title span also
    carries the novel's link.

    Returns [novelList, novelInfoList, linkList] where
      novelList     -- list of titles,
      novelInfoList -- list of [category, title, link, latest, author, time, status],
      linkList      -- list of links.
    """

    soup = BeautifulSoup(data, features='lxml')
    novelList = []
    novelInfoList = []
    linkList = []
    for li in soup.find_all('li'):
        html = etree.HTML(str(li))
        class_ = html.xpath('//span[@class="s1"]/text()')
        name = html.xpath('//span[@class="s2"]/a/text()')
        link = html.xpath('//span[@class="s2"]/a/@href')
        new = html.xpath('//span[@class="s3"]/a/text()')
        author = html.xpath('//span[@class="s4"]/text()')
        time = html.xpath('//span[@class="s5"]/text()')
        now = html.xpath('//span[@class="s7"]/text()')
        # ROBUSTNESS FIX: the original guard only checked class_/now/new
        # but then indexed [0] on every list, so a row missing its name,
        # link, author or time field raised IndexError. Require all
        # fields before indexing; incomplete rows are skipped instead.
        if all((class_, name, link, new, author, time, now)):
            novelList.append(name[0])
            novelInfoList.append([class_[0], name[0], link[0], new[0], author[0], time[0], now[0]])
            linkList.append(link[0])
    return [novelList, novelInfoList, linkList]


def chapterGet(data):
    """Parse a novel's table-of-contents page.

    Chapter entries live in <dl><dd><a> elements: the anchor text is the
    chapter title and its href is the chapter link.

    Returns a list of [title, link] pairs, in page order.
    """

    tree = etree.HTML(data)
    titles = tree.xpath('//dl/dd/a/text()')
    hrefs = tree.xpath('//dl/dd/a/@href')
    # Pair each title with its link; zip truncates to the shorter list,
    # matching the original loop's behavior.
    return [[title, href] for title, href in zip(titles, hrefs)]


def contentGet(data):
    """获取小说内容"""
    
    string = data.replace('<br />', '').replace('<br>', '')
    html = etree.HTML(string)
    title = html
  • 3
    点赞
  • 22
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值