【Python】爬虫:爬取剑网三贴吧的818帖子

# -*- coding: utf-8 -*-
from urllib import request
from urllib import error
import re, os, threading

'''author fzuim'''

class BDTB:
    """Interactive console crawler for Baidu Tieba thread listings.

    Downloads listing pages of one forum, extracts every thread whose title
    contains ``818`` and prints its link, title, author and reply count.

    Attributes:
        m_szTbUrl: Base listing URL; ``&pn=<offset>`` is appended per request.
        m_i818Count: Number of matching threads found in the current batch.
    """

    # Seconds to wait on the network before a request is treated as failed.
    REQUEST_TIMEOUT = 10
    # Increment applied to the ``pn`` offset between consecutive pages
    # (presumably the number of threads per listing page -- confirm).
    PAGE_STEP = 50
    # Matching threads to collect before pausing for the next key press.
    BATCH_SIZE = 10

    # Compiled once for the class instead of on every GetTZList call.
    # Groups: thread id, author name, reply count, thread title.
    _THREAD_PATTERN = re.compile(
        '<li class=" j_thread_list clearfix".*?data-field=.*?id.*?:(.*?),'
        '.*?author_name.*?;:&quot;(.*?)&quot;,'
        '.*?reply_num.*?:(.*?),'
        '.*?<a rel="noreferrer.*?">(.*?)</a>',
        re.S,
    )

    def __init__(self, v_szTbUrl):
        """Remember the forum listing URL and reset the match counter."""
        self.m_szTbUrl = v_szTbUrl
        self.m_i818Count = 0

    def GetTbPage(self, v_iPageIndex):
        """Download one listing page.

        Args:
            v_iPageIndex: Value sent as the ``pn`` query parameter.

        Returns:
            The page body decoded as UTF-8 text, or ``None`` if the request
            failed (the reason is printed).
        """
        url = self.m_szTbUrl + '&pn=' + str(v_iPageIndex)
        try:
            # The context manager closes the response even if read() fails.
            with request.urlopen(request.Request(url),
                                 timeout=self.REQUEST_TIMEOUT) as res:
                raw = res.read()
        except OSError as e:
            # URLError and HTTPError derive from OSError, and so do the
            # socket timeouts / connection resets that read() can raise.
            print(u'进入贴吧失败,原因:', getattr(e, 'reason', e))
            return None
        # A stray invalid byte should not abort the whole crawl.
        return raw.decode('utf-8', errors='replace')

    def GetTZList(self, v_iPageNum):
        """Return the ``818`` threads found on one listing page.

        Args:
            v_iPageNum: ``pn`` offset of the page to fetch.

        Returns:
            A list of ``[link, title, author, reply_count]`` string lists
            (possibly empty), or ``None`` if the page could not be fetched.
            ``m_i818Count`` is increased by the number of entries returned.
        """
        pageCode = self.GetTbPage(v_iPageNum)
        if not pageCode:
            print(u'爬取帖子失败...')
            return None

        TzList = []
        for tzId, rawAuthor, replyNum, title in self._THREAD_PATTERN.findall(pageCode):
            if '818' not in title:
                continue
            tzLink = r'https://tieba.baidu.com/p/' + tzId
            # The author is captured as JSON-style ``\uXXXX`` escapes; turn
            # them back into characters.
            # NOTE: raw non-ASCII characters in the capture would be garbled
            # by this round trip; only escaped input decodes cleanly.
            author = rawAuthor.encode('utf-8').decode('unicode_escape')
            TzList.append([tzLink.strip(), title.strip(), author.strip(), replyNum.strip()])
            self.m_i818Count += 1
        return TzList

    def loadPage(self, v_iPageNum):
        """Fetch one listing page and print its ``818`` threads.

        Args:
            v_iPageNum: ``pn`` offset of the page to fetch.

        Returns:
            ``True`` if the page was fetched (even with no matching thread),
            ``False`` if the fetch failed and nothing was printed.
        """
        tzlist = self.GetTZList(v_iPageNum)
        if tzlist is None:
            # Fetch failed; iterating None here used to raise TypeError.
            return False
        for tz in tzlist:
            print(u'链接:%s\n标题:%s\t作者:%s\t回复数:%s\n' % (tz[0], tz[1], tz[2], tz[3]))
        return True

    def start(self):
        """Run the interactive loop: Enter fetches the next batch, Q quits.

        Each batch keeps loading consecutive listing pages until at least
        ``BATCH_SIZE`` matching threads were printed. If a page cannot be
        fetched the batch is suspended and the same page is retried on the
        next Enter, instead of crashing or spinning forever.
        """
        print(u'准备爬取剑网三贴吧818帖子...回车爬取 Q退出')
        nowPage = 0
        myIndex = 0
        while True:
            try:
                MyInput = input()
            except EOFError:
                # stdin was closed: nothing more can be typed, so stop.
                break
            if MyInput in ('Q', 'q'):
                break

            fetchOk = True
            while self.m_i818Count < self.BATCH_SIZE:
                if not self.loadPage(nowPage):
                    fetchOk = False
                    break
                nowPage += self.PAGE_STEP

            if not fetchOk:
                # Keep nowPage and the partial count so Enter resumes here.
                print(u'回车重试 Q退出')
                continue

            self.m_i818Count = 0
            myIndex += 1
            print(u"----------------------------------第%d页----------------------------------" % myIndex)
            print(u'回车继续获取下一页...')



if __name__ == "__main__":
    # Listing URL of the forum to crawl (the JX3 board on Baidu Tieba).
    forum_url = r'https://tieba.baidu.com/f?kw=%E5%89%91%E7%BD%913&ie=utf-8'
    # Run the interactive loop on a worker thread and wait for it to end.
    crawler_thread = threading.Thread(target=BDTB(forum_url).start)
    crawler_thread.start()
    crawler_thread.join()

运行示例:
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值