# -*- coding: utf-8 -*-
from urllib import request
from urllib import error
import re, os, threading
'''author fzuim'''
class BDTB:
    """Interactive crawler that lists Baidu Tieba threads whose title contains "818".

    The forum listing is fetched page by page (``&pn=<offset>`` is appended to
    the base URL), matching threads are extracted with a regular expression,
    and each match is printed as link / title / author / reply count.

    Attributes:
        m_szTbUrl: Base listing URL that the ``&pn=`` offset is appended to.
        m_i818Count: Matching threads found since the counter was last reset;
            ``GetTZList`` increments it and ``start`` resets it per batch.
    """

    # One match per thread <li>; groups are (thread id, author, reply count, title).
    _THREAD_PATTERN = re.compile(
        '<li class=" j_thread_list clearfix".*?data-field=.*?id.*?:(.*?),'
        '.*?author_name.*?;:"(.*?)",'
        '.*?reply_num.*?:(.*?),'
        '.*?<a rel="noreferrer.*?">(.*?)</a>',
        re.S,
    )
    # A literal backslash-u followed by four hex digits (JSON-style escape).
    _UNICODE_ESCAPE = re.compile(r'\\u([0-9a-fA-F]{4})')

    PAGE_STEP = 50            # increment of the ``pn`` offset between listing pages
    BATCH_SIZE = 10           # matching threads collected per key press
    MAX_PAGES_PER_BATCH = 30  # upper bound so a batch cannot scan forever
    REQUEST_TIMEOUT = 15      # seconds before a stalled request is abandoned

    def __init__(self, v_szTbUrl):
        """Remember the base listing URL and reset the match counter."""
        self.m_szTbUrl = v_szTbUrl
        self.m_i818Count = 0

    @staticmethod
    def _DecodeAuthor(v_szRaw):
        """Return *v_szRaw* with ``\\uXXXX`` escapes replaced by real characters.

        Only ``\\uXXXX`` sequences are rewritten, so text that is already
        plain (including non-ASCII characters) passes through unchanged.
        """
        text = BDTB._UNICODE_ESCAPE.sub(
            lambda m: chr(int(m.group(1), 16)), v_szRaw)
        # Escaped astral characters arrive as two surrogate halves; a UTF-16
        # round trip joins them into one code point, and an unpaired half is
        # replaced instead of making a later print() fail.
        return text.encode('utf-16', 'surrogatepass').decode('utf-16', 'replace')

    def GetTbPage(self, v_iPageIndex):
        """Download one listing page.

        Args:
            v_iPageIndex: Value sent as the ``pn`` query parameter.

        Returns:
            The page decoded as UTF-8 text, or ``None`` if the request failed.
        """
        url = self.m_szTbUrl + '&pn=' + str(v_iPageIndex)
        try:
            # The context manager closes the HTTP response even if read() fails.
            with request.urlopen(request.Request(url),
                                 timeout=self.REQUEST_TIMEOUT) as res:
                return res.read().decode('utf-8', errors='replace')
        except OSError as e:
            # URLError/HTTPError and socket timeouts all derive from OSError.
            print(u'进入贴吧失败,原因:', getattr(e, 'reason', e))
            return None

    def GetTZList(self, v_iPageNum):
        """Extract the "818" threads from one listing page.

        Args:
            v_iPageNum: Value sent as the ``pn`` query parameter.

        Returns:
            A list of ``[link, title, author, reply_count]`` string lists
            (possibly empty), or ``None`` if the page could not be fetched.
            ``m_i818Count`` grows by the number of entries returned.
        """
        pageCode = self.GetTbPage(v_iPageNum)
        if not pageCode:
            print(u'爬取帖子失败...')
            return None

        TzList = []
        for tzId, author, replyNum, title in self._THREAD_PATTERN.findall(pageCode):
            if '818' not in title:
                continue
            TzList.append([
                # Strip the id before joining so no whitespace ends up
                # inside the link.
                r'https://tieba.baidu.com/p/' + tzId.strip(),
                title.strip(),
                self._DecodeAuthor(author).strip(),
                replyNum.strip(),
            ])
        self.m_i818Count += len(TzList)
        return TzList

    def loadPage(self, v_iPageNum):
        """Fetch one listing page and print its "818" threads.

        Returns:
            ``True`` if the page was fetched (even with zero matches),
            ``False`` if the fetch failed.
        """
        tzlist = self.GetTZList(v_iPageNum)
        if tzlist is None:
            return False
        for tz in tzlist:
            print(u'链接:%s\n标题:%s\t作者:%s\t回复数:%s\n' % (tz[0], tz[1], tz[2], tz[3]))
        return True

    def start(self):
        """Run the interactive loop: Enter prints the next batch, Q quits.

        Each batch scans successive listing pages until at least
        ``BATCH_SIZE`` matches were printed, ``MAX_PAGES_PER_BATCH`` pages were
        scanned, or a fetch failed. After a failure the same offset is retried
        on the next Enter.
        """
        print(u'准备爬取剑网三贴吧818帖子...回车爬取 Q退出')
        nowPage = 0
        myIndex = 0
        while True:
            try:
                MyInput = input()
            except EOFError:
                # stdin was closed; treat it like a quit request.
                break
            if MyInput in ('Q', 'q'):
                break

            pagesScanned = 0
            fetchFailed = False
            while self.m_i818Count < self.BATCH_SIZE:
                if pagesScanned >= self.MAX_PAGES_PER_BATCH:
                    print(u'已连续爬取%d页,未找到足够的818帖子' % pagesScanned)
                    break
                if not self.loadPage(nowPage):
                    fetchFailed = True
                    break
                nowPage += self.PAGE_STEP
                pagesScanned += 1

            if fetchFailed:
                # Keep the partial count and the offset so the next Enter
                # resumes the same batch.
                print(u'回车重试 Q退出')
                continue

            self.m_i818Count = 0
            myIndex += 1
            print(u"----------------------------------第%d页----------------------------------" % myIndex)
            print(u'回车继续获取下一页...')
if __name__ == "__main__":
    # Target board: the Jian Wang 3 (JX3) Tieba forum listing.
    tieba_url = r'https://tieba.baidu.com/f?kw=%E5%89%91%E7%BD%913&ie=utf-8'
    crawler = BDTB(tieba_url)
    # Run the interactive loop on a worker thread and wait for it to finish.
    worker = threading.Thread(target=crawler.start)
    worker.start()
    worker.join()
# Example run: