简易小说爬虫(带有GUI界面)
效果:
特点:
- 可实现简单的小说搜索功能
- 可选择下载目录
- 下载进度可视化
- 多线程下载
代码部分:
python部分:
import random
from threading import Thread
from urllib.parse import quote
from PyQt5.QtCore import QThread, pyqtSignal, QFile, Qt
from PyQt5.QtGui import QIcon, QPalette, QBrush, QPixmap
from PyQt5.QtWidgets import QGridLayout, QLabel, QLineEdit, QPushButton, QListWidget, QProgressBar, QMessageBox, \
QApplication, QFileDialog, QWidget
from bs4 import BeautifulSoup
import requests
import win # 引入qrc资源文件,代码在后面
from lxml import etree
import sys
def dataGet(url):
    """Download a page and return its decoded HTML source.

    Retries up to 4 times on any network error, each attempt with a
    4-second timeout.

    :param url: absolute URL of the page to fetch.
    :return: page source as ``str`` (decoded as GBK — the target site
             serves GBK-encoded pages), or ``None`` if every attempt fails.
    """
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    ]
    # Rotate the User-Agent on every call to look less like a bot.
    headers = {'User-Agent': random.choice(user_agent_list)}
    for _ in range(4):  # 4 retries, 4-second timeout each
        try:
            # BUG FIX: headers must be passed as a keyword argument.
            # The original `requests.get(url, headers, timeout=4)` bound the
            # dict to requests' positional `params` parameter, so it was sent
            # as a query string and no custom User-Agent header was ever set.
            response = requests.get(url, headers=headers, timeout=4)
            response.encoding = 'gbk'
            return response.text
        except requests.exceptions.RequestException:
            continue  # transient network error: try again
    return None  # all attempts failed (original fell through implicitly)
def novelSearch(data):
    """Extract novel entries from a search-result page.

    Each result row is an ``<li>`` whose ``<span class="s1">``..``s7``
    cells hold the novel's fields.

    :param data: HTML source of the search-result page.
    :return: ``[novelList, novelInfoList, linkList]`` where
             ``novelList``     - novel titles,
             ``novelInfoList`` - per novel: [category, name, link,
                                 latest chapter, author, update time, status],
             ``linkList``      - detail-page URLs (same order as novelList).
    """
    soup = BeautifulSoup(data, features='lxml')
    novelList = []
    novelInfoList = []
    linkList = []
    for li in soup.find_all('li'):
        # Re-parse the fragment with lxml so XPath can be used on it.
        html = etree.HTML(str(li))
        class_ = html.xpath('//span[@class="s1"]/text()')
        name = html.xpath('//span[@class="s2"]/a/text()')
        link = html.xpath('//span[@class="s2"]/a/@href')
        new = html.xpath('//span[@class="s3"]/a/text()')
        author = html.xpath('//span[@class="s4"]/text()')
        time = html.xpath('//span[@class="s5"]/text()')
        now = html.xpath('//span[@class="s7"]/text()')
        # ROBUSTNESS FIX: the original guarded only class_/now/new but then
        # indexed name[0], link[0], author[0] and time[0] unconditionally,
        # raising IndexError on malformed rows. Require every field instead.
        if all((class_, name, link, new, author, time, now)):
            novelList.append(name[0])
            novelInfoList.append([class_[0], name[0], link[0], new[0],
                                  author[0], time[0], now[0]])
            linkList.append(link[0])
    return [novelList, novelInfoList, linkList]
def chapterGet(data):
    """Parse a novel's table-of-contents page into chapter entries.

    :param data: HTML source of the chapter-index page.
    :return: list of ``[chapter_title, chapter_href]`` pairs, in page order.
    """
    tree = etree.HTML(data)
    # Titles and hrefs come from parallel XPath queries over the same
    # <dl><dd><a> nodes, so zipping them pairs each title with its link.
    titles = tree.xpath('//dl/dd/a/text()')
    hrefs = tree.xpath('//dl/dd/a/@href')
    return [[title, href] for title, href in zip(titles, hrefs)]
def contentGet(data):
"""获取小说内容"""
string = data.replace('<br />', '').replace('<br>', '')
html = etree.HTML(string)
title = html