python3 练手:爬百度贴吧
参考原文地址:
https://blog.csdn.net/pleasecallmewhy/article/details/8934726
https://cuiqingcai.com/993.html
贴吧:https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1&red_tag=0231085142
目标:获取帖子标题、总页数和帖子内容,并保存到文本文件。
实现:
#!/usr/bin/env python3
# -*- encoding:utf-8 -*-
import urllib
import urllib.request
import re
import time
#文本处理类
class Tools:
    """Strip the HTML markup found inside a scraped post body.

    The three patterns are compiled once as class attributes and shared by
    every instance.
    """

    # Matches a whole <img ...> tag; such tags are removed.
    removeimg = re.compile('<img.*?>')
    # Matches an opening <a href...> tag or a closing </a>; the tags are
    # removed while the text between them is kept.
    removea = re.compile('<a href.*?>|</a>')
    # Matches <br> as well as the self-closing spellings <br/> and <br />.
    replacebr = re.compile(r'<br\s*/?>')

    def replace(self, x):
        """Return *x* as plain text.

        Image tags and link tags are deleted, line-break tags become
        newline characters, and leading/trailing whitespace is stripped.

        Args:
            x: HTML fragment (str) to clean.

        Returns:
            The cleaned string.
        """
        x = self.removeimg.sub("", x)
        x = self.removea.sub("", x)
        x = self.replacebr.sub("\n", x)
        return x.strip()
#实现爬取类
class BDTB:
    """Fetch every page of one Tieba thread and save the posts to a text file.

    Typical use is ``BDTB().start()``. The individual steps (download,
    title / page-count / content extraction, writing) are exposed as
    separate methods.
    """

    # URL of the default thread; the page number is appended to this prefix.
    DEFAULT_URL_PREFIX = 'https://tieba.baidu.com/p/3138733512?see_lz=1&pn='

    # Patterns are compiled once at class level instead of on every call.
    _P_TITLE = re.compile(
        '<div.*?"core_title_bg j_core_title_bg">.*?<h3.*?>(.*?)</h3>', re.S)
    _P_PAGENUM = re.compile(
        '<li.*?"l_reply_num".*?>.*?<span.*?>.*?<span.*?"red">(.*?)</span>', re.S)
    _P_CONTENT = re.compile('<div.*?"post_content_.*?>(.*?)</div>', re.S)
    # Characters that are not allowed in file names on common platforms.
    _P_BAD_FILENAME = re.compile(r'[\\/:*?"<>|\r\n\t]')

    def __init__(self, url_prefix=None):
        """Set up request headers, output defaults and the text cleaner.

        Args:
            url_prefix: Thread URL ending right before the page number.
                Defaults to ``DEFAULT_URL_PREFIX`` when omitted.
        """
        self.headers = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
        # Fallback output name, used when no title could be extracted.
        self.filename = "bdtb.txt"
        self.contents = []
        self.tools = Tools()
        self.url_prefix = url_prefix if url_prefix else self.DEFAULT_URL_PREFIX

    def get_html(self, url):
        """Download *url* and return the body decoded as UTF-8.

        Returns:
            The page text, or ``None`` when the request fails with a
            ``URLError`` (the reason is printed).
        """
        myreq = urllib.request.Request(url, headers=self.headers)
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(myreq) as myresponse:
                return myresponse.read().decode('utf-8')
        except urllib.request.URLError as e:
            # e.reason may be an exception object rather than a str, so it
            # must be converted before concatenation.
            print("未连接,原因:" + str(getattr(e, 'reason', e)))
            return None

    # Extract the thread title.
    def get_title(self, html):
        """Return the thread title found in *html*, or ``None`` if absent.

        ``None`` is also returned when *html* itself is empty or ``None``.
        """
        if not html:
            return None
        found = self._P_TITLE.search(html)
        return found.group(1) if found else None

    # Extract the number of pages in the thread.
    def get_pagenum(self, html):
        """Return the page count captured from *html* as a string.

        Returns ``None`` when *html* is empty/``None`` or the pattern does
        not match.
        """
        if not html:
            return None
        found = self._P_PAGENUM.search(html)
        return found.group(1) if found else None

    # Collect the post bodies.
    def get_content(self, html):
        """Append the cleaned text of every post in *html* to ``self.contents``.

        Does nothing when *html* is empty or ``None``.
        """
        if not html:
            return
        for item in self._P_CONTENT.findall(html):
            self.contents.append("\n" + self.tools.replace(item) + "\n")

    # Write the collected posts to disk.
    def writedata(self, pagenum, title, contents):
        """Write *contents* to ``<title>.txt`` (or the default file name).

        Args:
            pagenum: Total page count, written in the header line.
            title: Thread title used to build the file name; characters that
                are invalid in file names are replaced with ``_``. When
                falsy, ``self.filename`` is left unchanged.
            contents: Iterable of post texts, written in order and numbered
                from 1.
        """
        if title:
            safe_title = self._P_BAD_FILENAME.sub('_', title).strip()
            if safe_title:
                self.filename = safe_title + '.txt'
        # An explicit encoding keeps the output independent of the platform
        # default, which may be unable to encode the scraped text.
        with open(self.filename, 'w', encoding='utf-8') as f:
            f.write("本贴共" + str(pagenum) + "页:\n")
            for floor, content in enumerate(contents, start=1):
                f.write("\n第" + str(floor) + "楼:" + 100 * "-" + "\n")
                f.write(content)

    def start(self):
        """Crawl all pages of the thread and write the result once.

        Returns early (after ``get_html`` has printed the reason) when the
        first page cannot be downloaded. Later pages that fail to download
        are skipped. If the page count cannot be parsed, only the first page
        is processed.
        """
        # Start from a clean list so repeated calls do not duplicate posts.
        self.contents = []
        html = self.get_html(self.url_prefix + str(1))
        if html is None:
            return
        title = self.get_title(html)
        raw_pagenum = self.get_pagenum(html)
        if raw_pagenum is not None and raw_pagenum.strip().isdigit():
            pagenum = int(raw_pagenum)
        else:
            pagenum = 1
        for pageindex in range(1, pagenum + 1):
            # Page 1 is already in hand; only fetch pages that exist.
            if pageindex > 1:
                html = self.get_html(self.url_prefix + str(pageindex))
            self.get_content(html)
        # Written once at the end instead of rewriting the file per page.
        self.writedata(pagenum, title, self.contents)
# Run the crawl only when this file is executed as a script, so importing it
# for its classes does not start network requests.
if __name__ == "__main__":
    spider = BDTB()
    # perf_counter is a monotonic clock meant for measuring elapsed time.
    starttime = time.perf_counter()
    spider.start()
    endtime = time.perf_counter()
    print("用时:%.2f seconds"%(endtime - starttime))