python 小说爬虫+ui+多线程

最新推荐文章于 2024-08-21 21:05:18 发布

weixin_42229626

最新推荐文章于 2024-08-21 21:05:18 发布

阅读量1.2k

点赞数 1

分类专栏： python学习笔记文章标签： python 爬虫 PyQt5

本文链接：https://blog.csdn.net/weixin_42229626/article/details/81984616

版权

python学习笔记专栏收录该内容

3 篇文章 1 订阅

订阅专栏

爬虫主要是bs4 beautifulsoup库和 urllib 库
ui 是 pyqt5库

如果希望通过ui实时反馈爬取进度,必须用多线程,否则ui会假死

先说爬虫
主要是通过biqukan这个网站爬取

逻辑是:
1.先获得小说章节列表,形成一个只有key,没有value的字典.以及一个各章节网址的列表
2.通过每个章节循环获得章节内容,更新到字典中去
3.把字典输出到txt.

获取章节列表以及字典代码如下:

def getlist(url,xsname):
    """Fetch a novel's index page and collect its main-text chapter links.

    Args:
        url: Address of the novel's index (table of contents) page.
        xsname: Novel title; used to build the "《title》正文卷" marker.
            Only entries that appear after this marker are collected.

    Returns:
        A tuple ``(k, txt)``. ``k`` is the list of chapter ``href`` values in
        page order; ``txt`` maps each chapter title to an empty list that is
        meant to be replaced by the chapter text later. Both are empty when
        the chapter list could not be located.
    """
    print('开始读取章节列表')
    k = []
    txt = {}
    head = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.biqukan.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'If-None-Match': "1518171634"
            }
    req = request.Request(url, headers=head)
    # Close the connection as soon as the page body has been read.
    with request.urlopen(req) as response:
        page = response.read().decode('gbk', 'ignore')
    soup = BeautifulSoup(page, 'lxml')
    txtlist = soup.find('div', attrs={'class': 'listmain'})

    # An unexpected page layout (no listmain div / no <dl>) is reported as a
    # read failure below instead of raising AttributeError on None.
    if txtlist is not None and txtlist.dl is not None:
        entries = txtlist.dl.children
    else:
        entries = []

    marker = "《" + xsname + "》正文卷"
    begin_flag = False
    for child in entries:
        if child.string == marker:
            begin_flag = True
        # Bare text nodes (newlines and other whitespace) have no ``a``
        # attribute; getattr skips them without raising.
        link = getattr(child, 'a', None)
        if begin_flag and link is not None:
            k.append(link.get('href'))
            txt[str(link.getText())] = []

    if k:
        print('章节数读取完毕! 共有: %s 章' % len(k))
    else:
        print('章节数读取失败')
    return k, txt

然后获取各章内容如下:

def gettxt(k,txt):
    """Download one chapter page and store its cleaned text in ``txt``.

    Args:
        k: Chapter path as found in the index page; it is appended to
            ``http://www.biqukan.com`` to form the chapter address.
        txt: Dictionary of chapter title -> chapter text; updated in place.

    Returns:
        The same ``txt`` dictionary. The entry for this chapter is written
        only when both a title and a non-empty body were extracted;
        otherwise a failure line is printed and ``txt`` is left untouched.
    """
    head = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.biqukan.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'If-None-Match': "1518171634"
            }
    html = 'http://www.biqukan.com' + k
    req = request.Request(html, headers=head)
    # Close the connection as soon as the page body has been read.
    with request.urlopen(req) as response:
        page = response.read().decode('gbk', 'ignore')
    soup = BeautifulSoup(page, 'lxml')

    body = soup.find('div', attrs={'id': 'content', 'class': 'showtxt'})
    text1 = str(body.getText()) if body is not None else ''
    text1 = text1.replace('\xa0', ' ')
    # Strip the site's boilerplate footer and the page's own address.
    text1 = text1.replace('请记住本书首发域名：www.biqukan.com。笔趣阁手机版阅读网址：m.biqukan.com', '')
    text1 = text1.replace(html, '')
    # Collapse every whitespace run into a single line break.
    text1 = '\n'.join(text1.split())

    header = soup.find('div', attrs={'class': 'content'})
    heading = header.h1 if header is not None else None
    textname = heading.string if heading is not None else None

    # Plain truthiness: the previous ``list(x) != 0`` comparison was always
    # true, so failed extractions were reported as successes.
    if text1 and textname:
        txt[str(textname)] = text1
        print('[爬取成功] %-5s' % str(textname))
    else:
        # Fall back to the chapter path when no title could be read.
        print('[爬取失败] %-5s' % (str(textname) if textname else k))
    return txt

输出txt

def putouttxt(txt,xsname):
    """Append every chapter in ``txt`` to ``<xsname>.txt`` next to this script.

    Args:
        txt: Mapping of chapter title -> chapter text, written in iteration
            order as title, blank line, text, blank line.
        xsname: Novel name; used as the output file's base name.

    The file is opened in append mode with UTF-8 encoding, so repeated calls
    add to an existing file rather than overwriting it.
    """
    folder = os.path.dirname(os.path.realpath(__file__))
    # os.path.join picks the right separator on every platform; the literal
    # backslash used before only produced a valid path on Windows.
    target = os.path.join(folder, xsname + '.txt')
    # ``with`` guarantees the handle is closed even if a write fails.
    with open(target, 'a', encoding='utf-8') as outputfile:
        for key, content in txt.items():
            outputfile.write(str(key) + '\n\n')
            outputfile.write(str(content) + '\n\n')

主程序

def main():
    """Console entry point: prompt for a novel, scrape it and write the TXT.

    Reads the novel name and index-page address from standard input, fetches
    the chapter list with ``getlist``, downloads each chapter with ``gettxt``
    and finally writes everything out with ``putouttxt``. Progress banners
    are printed along the way.
    """
    divider = '------------------------------------------------------------------'

    print(divider)
    print('!!只能从www.biqukan.com爬取小说!!')
    print(divider)

    print('请在biqukan找到想看的小说,并在下方输入小说名称,例如:天道图书馆')
    xsname = str(input('小说名称:'))
    print('请在下方输入小说主页(目录页),例如:http://www.biqukan.com/17_17957/')
    url = str(input('小说主页:'))

    for banner in (divider, '爬虫开始干活了!', divider):
        print(banner)

    chapter_links, txt = getlist(url, xsname)

    print(divider)
    print('开始爬取各章节内容')
    for link in chapter_links:
        txt = gettxt(link, txt)
    print('各章节内容爬取完毕')

    print(divider)
    print('开始生成TXT文件')
    putouttxt(txt, xsname)
    print('TXT文件生成完毕')

    print(divider)
    print('工作结束,好好看书吧!')

    # Wait for one more line of input before returning (presumably so the
    # console window does not close immediately); the value is not used.
    str(input('       '))

接下来说ui.

上面的其实已经可以用了,只是输入界面是在命令提示符里,用搜狗输入法不太方便,所以想着弄个ui。

这里要用到pyqt5这个库,其中包含界面的,包含多线程的.都可以直接用.

主体窗口代码如下:

class Main(QtWidgets.QWidget):
    """Main window: novel name/URL inputs, a progress bar and a log pane.

    Clicking the button starts a ``MyThread`` worker; the worker's signals
    are routed to ``update_text`` (log lines) and ``timerEvent`` (progress).
    """

    def __init__(self, parent=None):
        super(Main, self).__init__(parent)
        self.setui()

    def setui(self):
        """Create the widgets, place them in a grid and wire up the button."""
        title = QtWidgets.QLabel('小说名称')
        url = QtWidgets.QLabel('小说主页')
        output = QtWidgets.QLabel('信息输出窗口')
        time1 = QtWidgets.QLabel('进度')
        self.timebar = QtWidgets.QProgressBar()

        self.titleEdit = QtWidgets.QLineEdit()
        self.urlEdit = QtWidgets.QLineEdit()
        self.outputinfo = QtWidgets.QTextBrowser()

        qb = QtWidgets.QPushButton('gogogo')
        qb.resize(qb.sizeHint())

        grid = QtWidgets.QGridLayout()
        grid.setSpacing(15)

        # Row 1: novel name, row 2: index URL; the button spans both rows.
        grid.addWidget(title, 1, 0)
        grid.addWidget(self.titleEdit, 1, 1)

        grid.addWidget(url, 2, 0, )
        grid.addWidget(self.urlEdit, 2, 1)

        # Row 3: progress bar across the remaining columns.
        grid.addWidget(time1, 3, 0)
        grid.addWidget(self.timebar, 3, 1, 1, 3)

        # Rows 4-8: log output pane.
        grid.addWidget(output, 4, 0)
        grid.addWidget(self.outputinfo, 4, 1, 5, 3)
        grid.addWidget(qb, 1, 3, 2, 1)

        self.setLayout(grid)

        self.setGeometry(300, 300, 720, 540)
        self.setWindowTitle('对对牌小说读取器')

        qb.clicked.connect(self.start_threads)

    def start_threads(self):
        """Start a worker thread for the novel currently typed into the form."""
        xsname = str(self.titleEdit.text())
        url = str(self.urlEdit.text())
        thread = MyThread(self)
        thread.setup(xsname,url)
        thread.trigger.connect(self.update_text)
        thread.i_trigger.connect(self.timerEvent)
        # The worker is parented to this window, so without this it would
        # stay allocated after finishing; release it once it is done.
        thread.finished.connect(thread.deleteLater)
        thread.start()

    def update_text(self, msg):
        """Append one status line from the worker to the log pane."""
        self.outputinfo.append(msg)

    def timerEvent(self,i):
        """Show the worker's progress value on the progress bar.

        NOTE(review): this name shadows ``QObject.timerEvent``; it is kept
        because it is the slot name callers connect to.
        """
        # The progress signal carries a float, but QProgressBar.setValue
        # takes an int; newer Python/PyQt5 combinations raise TypeError
        # instead of converting implicitly.
        self.timebar.setValue(int(i))

第一个是主体窗口各种设置

第二个是开启爬虫的线程.不能干扰ui的主线程,否则会阻塞导致ui假死

后面两个是主线程和爬虫线程的信号通道

然后是爬虫线程:

class MyThread(QtCore.QThread):
    """Worker thread that scrapes one novel and reports back via signals.

    Signals:
        trigger(str): one human-readable status line.
        i_trigger(float): progress value, emitted as a percentage (0-100)
            after each chapter.
    """
    trigger = QtCore.pyqtSignal(str)
    i_trigger = QtCore.pyqtSignal(float)

    def __init__(self, parent=None):
        super(MyThread, self).__init__(parent)
        # Defaults so run() does not hit AttributeError if setup() was
        # never called.
        self.xsname = ''
        self.url = ''

    def setup(self, xsname, url):
        """Store the novel name and index-page URL to use in ``run``."""
        self.xsname = xsname
        self.url = url

    def run(self):
        """Scrape the configured novel, emitting status and progress signals.

        Follows the same steps as the console workflow: read the chapter
        list, fetch each chapter, then write the TXT file.
        """
        try:
            self.trigger.emit('开始读取章节列表')
            k, txt = getlist(self.url, self.xsname)
            if not k:
                self.trigger.emit('章节数读取失败')
                return
            total = len(k)
            self.trigger.emit('章节数读取完毕! 共有: %s 章' % total)

            self.trigger.emit('开始爬取各章节内容')
            for done, link in enumerate(k, start=1):
                txt = gettxt(link, txt)
                self.trigger.emit('[%d/%d] %s' % (done, total, link))
                self.i_trigger.emit(done * 100.0 / total)
            self.trigger.emit('各章节内容爬取完毕')

            self.trigger.emit('开始生成TXT文件')
            putouttxt(txt, self.xsname)
            self.trigger.emit('TXT文件生成完毕')
        except Exception as err:
            # Thread boundary: report the failure to the UI rather than
            # letting an unhandled exception escape the worker thread.
            self.trigger.emit('爬取出错: %s' % err)