【python】爬虫-爬取剑网三贴吧818

最新推荐文章于 2024-08-22 08:16:01 发布

Fzuim

最新推荐文章于 2024-08-22 08:16:01 发布

阅读量994

点赞数 3

分类专栏：Python开发；文章标签：爬虫

本文链接：https://blog.csdn.net/fzuim/article/details/86629166

版权

Python开发专栏收录该内容

6 篇文章 0 订阅

订阅专栏

# -*- coding: utf-8 -*-
from urllib import request
from urllib import error
import re, os, threading

'''author fzuim'''

class BDTB:
    """Interactive console scraper for Tieba threads whose title contains "818".

    The forum list page is fetched with ``urllib``, thread entries are
    extracted with a regular expression, and matching threads are printed
    in batches each time the user presses Enter.

    Attributes:
        m_szTbUrl: Base URL of the forum list page; ``&pn=<offset>`` is
            appended to it for every request.
        m_i818Count: Number of matching threads found since the counter was
            last reset by ``start``.
    """

    # Captures, in order: thread id, author name, reply count, thread title.
    _THREAD_RULE = re.compile(
        '<li class=" j_thread_list clearfix".*?data-field=.*?id.*?:(.*?),'
        '.*?author_name.*?;:&quot;(.*?)&quot;,'
        '.*?reply_num.*?:(.*?),'
        '.*?<a rel="noreferrer.*?">(.*?)</a>',
        re.S,
    )
    # A literal backslash-u escape such as \u5f20 embedded in the page source.
    _UNICODE_ESCAPE = re.compile(r'\\u([0-9a-fA-F]{4})')

    THREAD_URL_PREFIX = r'https://tieba.baidu.com/p/'
    KEYWORD = '818'
    PAGE_STEP = 50            # amount the ``pn`` offset advances per page
    POSTS_PER_BATCH = 10      # matches to collect before pausing for input
    MAX_PAGES_PER_BATCH = 50  # upper bound on pages scanned per key press

    def __init__(self, v_szTbUrl):
        """Store the forum URL and reset the match counter."""
        self.m_szTbUrl = v_szTbUrl
        self.m_i818Count = 0

    @staticmethod
    def _DecodeAuthor(v_szRaw):
        """Turn ``\\uXXXX`` escapes in *v_szRaw* into characters.

        Only the escape sequences are rewritten, so author names that
        already contain literal non-ASCII text are left intact. Surrogate
        pairs are merged into a single character and unpaired surrogates
        become U+FFFD so the result can always be printed.
        """
        text = BDTB._UNICODE_ESCAPE.sub(
            lambda m: chr(int(m.group(1), 16)), v_szRaw)
        return text.encode('utf-16', 'surrogatepass').decode('utf-16', 'replace')

    def GetTbPage(self, v_iPageIndex):
        """Download one list page.

        Args:
            v_iPageIndex: Value for the ``pn`` query parameter.

        Returns:
            The page decoded as UTF-8 (undecodable bytes are replaced), or
            ``None`` when the request fails.
        """
        url = self.m_szTbUrl + '&pn=' + str(v_iPageIndex)
        try:
            # The context manager closes the connection even if read() fails.
            with request.urlopen(request.Request(url)) as res:
                return res.read().decode('utf-8', errors='replace')
        except (error.URLError, OSError) as e:
            # URLError carries ``reason``; a bare OSError is shown as-is.
            print(u'进入贴吧失败，原因：', getattr(e, 'reason', e))
            return None

    def GetTZList(self, v_iPageNum):
        """Return the matching threads found on one list page.

        Args:
            v_iPageNum: Page offset passed on to ``GetTbPage``.

        Returns:
            A list of ``[link, title, author, reply_count]`` lists (all
            strings) for threads whose title contains ``KEYWORD``, or
            ``None`` when the page could not be fetched. ``m_i818Count`` is
            increased by the number of entries returned.
        """
        pageCode = self.GetTbPage(v_iPageNum)
        if not pageCode:
            print(u'爬取帖子失败...')
            return None

        TzList = []
        for tzId, rawAuthor, replyNum, title in self._THREAD_RULE.findall(pageCode):
            if self.KEYWORD not in title:
                continue
            TzList.append([
                self.THREAD_URL_PREFIX + tzId.strip(),
                title.strip(),
                self._DecodeAuthor(rawAuthor).strip(),
                replyNum.strip(),
            ])
        self.m_i818Count += len(TzList)
        return TzList

    def loadPage(self, v_iPageNum):
        """Fetch one page and print its matching threads.

        Args:
            v_iPageNum: Page offset passed on to ``GetTZList``.

        Returns:
            ``True`` when the page was fetched (even with no matches),
            ``False`` when the fetch failed.
        """
        tzlist = self.GetTZList(v_iPageNum)
        if tzlist is None:
            # GetTZList has already reported the failure.
            return False
        for tz in tzlist:
            print(u'链接：%s\n标题：%s\t作者：%s\t回复数：%s\n' % (tz[0], tz[1], tz[2], tz[3]))
        return True

    def start(self):
        """Run the interactive loop: Enter prints the next batch, Q quits.

        Each batch scans consecutive pages until ``POSTS_PER_BATCH`` matches
        were printed. Scanning also stops when a page cannot be fetched (the
        same page is retried on the next Enter) or after
        ``MAX_PAGES_PER_BATCH`` pages, so the loop cannot spin forever.
        """
        print(u'准备爬取剑网三贴吧818帖子...回车爬取 Q退出')
        nowPage = 0
        myIndex = 0
        while True:
            try:
                MyInput = input()
            except EOFError:
                # stdin was closed; treat it like a quit request.
                break
            if MyInput in ('Q', 'q'):
                break

            pageFailed = False
            pagesScanned = 0
            while (self.m_i818Count < self.POSTS_PER_BATCH
                   and pagesScanned < self.MAX_PAGES_PER_BATCH):
                if not self.loadPage(nowPage):
                    pageFailed = True
                    break
                nowPage += self.PAGE_STEP
                pagesScanned += 1

            self.m_i818Count = 0
            if not pageFailed:
                myIndex += 1
                print(u"----------------------------------第%d页----------------------------------" % myIndex)
            print(u'回车继续获取下一页...')



if __name__ == "__main__":
    # Forum list URL for the JX3 (剑网3) board; the keyword is percent-encoded.
    tieba_url = r'https://tieba.baidu.com/f?kw=%E5%89%91%E7%BD%913&ie=utf-8'
    # Run the interactive loop on a worker thread and wait for it to finish.
    worker = threading.Thread(target=BDTB(tieba_url).start)
    worker.start()
    worker.join()

运行示例：
在这里插入图片描述