Original article: http://cuiqingcai.com/993.html
Goals of this post
1. Scrape any Baidu Tieba thread
2. Optionally fetch only the original poster's (see_lz) posts
3. Parse the scraped content and save it to a file
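For reference, getPage() in the code below builds each request URL by concatenating the thread's base URL, the see_lz switch, and the page number. A minimal sketch of that same concatenation, using the example thread from the bottom of the script:

baseUrl = 'http://tieba.baidu.com/p/3138733512'
url = baseUrl + '?see_lz=' + str(1) + '&pn=' + str(1)
# -> http://tieba.baidu.com/p/3138733512?see_lz=1&pn=1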
_______________________________________________________________________________________________________
from urllib import request, parse, error
import re


class BDTB:
    def __init__(self, baseUrl, seelz, floortag):
        '''Initialize the scraper.
        :param baseUrl: base URL of the thread
        :param seelz: "only view the original poster" switch (see_lz)
        :param floortag: whether to label each floor with its number
        '''
        self.baseUrl = baseUrl
        self.seelz = '?see_lz=' + str(seelz)
        self.tools = Tools()
        self.floortag = floortag
        # floor counter used when writing the file
        self.floor = 1
    def getPage(self, pageNum):
        '''Fetch one page of the thread.
        :param pageNum: page number
        :return: page source as a string, or None on failure
        '''
        try:
            url = self.baseUrl + self.seelz + '&pn=' + str(pageNum)
            req = request.Request(url)
            response = request.urlopen(req)
            text = response.read().decode('utf-8')
            return text
        except error.URLError as e:
            if hasattr(e, 'reason'):
                print('Failed to connect to Tieba, reason:', e.reason)
            return None
    def getTitle(self):
        '''Extract the thread title from the first page.
        :return: the title, or None if it cannot be found
        '''
        page = self.getPage(1)
        if page is None:
            return None
        patterns = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>')
        title = re.search(patterns, page)
        if title:
            return title.group(1)
        else:
            return None
    def getpageNum(self):
        '''Get the number of pages in the thread.
        :return: the page count, or None if it cannot be found
        '''
        page = self.getPage(1)
        if page is None:
            return None
        # the reply count and the page count both live in the l_reply_num <li>
        patterns = re.compile(r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>')
        pageNum = re.search(patterns, page)
        if pageNum:
            return int(pageNum.group(1))  # page count
        else:
            return None
    def getText(self, number):
        '''Get the content of every post on the given page.
        :param number: page number
        :return: list of post contents
        '''
        page = self.getPage(number)
        if page is None:
            return []
        articles = re.compile(r'<div id="post_content_\d+".*?>(.*?)</div>', re.S)
        texts = re.findall(articles, page)
        contents = []
        for text in texts:
            # strip HTML tags with the Tools class and pad with newlines
            new_text = '\n' + self.tools.replace(text) + '\n'
            contents.append(new_text)
        return contents
    def downloadAll(self, pageNum):
        '''Download the given number of pages of the thread.
        :param pageNum: total number of pages
        :return: None
        '''
        try:
            title = self.getTitle()
            print('This thread has ' + str(pageNum) + ' pages')
            for num in range(1, pageNum + 1):
                print('Writing page ' + str(num) + '...')
                self.downloadOnePage(title, num)
        except error.URLError as e:
            print('URL connection failed, reason:', e.reason)
            return None
    def downloadOnePage(self, title, num):
        '''Download one page of the thread, using the thread title as the file name.
        :param title: thread title
        :param num: page number
        :return: None
        '''
        try:
            floordefaultline = '\n' + '-' * 78 + '\n'
            contents = self.getText(num)
            filename = title + '.txt'
            with open(filename, 'a+', encoding='utf-8') as f:
                for text in contents:
                    floorwithnumber = '\n' + 'Floor ' + str(self.floor) + ' ' + '-' * 70 + '\n'
                    if self.floortag == 1:  # floortag == 1 means label each floor with its number
                        f.write(text)
                        f.write(floorwithnumber)
                    else:
                        f.write(text)
                        f.write(floordefaultline)
                    self.floor += 1
            print('Page ' + str(num) + ' saved.')
        except error.URLError as e:
            print('URL connection failed, reason:', e.reason)
            return None
        except OSError as e:
            print('File I/O error:', e)
            return None

class Tools:
    '''Regex helpers that strip Tieba post HTML down to plain text.'''
    # remove <img> tags and runs of 7 spaces
    removeImg = re.compile(r'<img.*?>| {7}')
    # remove hyperlink tags
    removeAddr = re.compile(r'<a.*?>|</a>')
    # turn line-breaking tags into \n
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # turn table cells <td> into \t
    replaceTD = re.compile(r'<td>')
    # turn paragraph openings into \n plus two spaces
    replacePara = re.compile(r'<p.*?>')
    # turn <br> (single or double) into \n
    replaceBR = re.compile(r'<br><br>|<br>')
    # strip any remaining tags
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n  ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        # strip() removes leading and trailing whitespace
        return x.strip()

if __name__ == "__main__":
    baseURL = 'http://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseURL, 1, 1)
    num = bdtb.getpageNum()
    if num:
        bdtb.downloadAll(num)
——————————————————————————————————————————————————————————
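If you want every reply rather than only the original poster's posts, and plain separators instead of floor numbers, flip the seelz and floortag arguments. A minimal usage sketch based on the constructor's docstring above:

bdtb = BDTB('http://tieba.baidu.com/p/3138733512', 0, 0)  # see_lz off, no floor numbers
num = bdtb.getpageNum()
if num:
    bdtb.downloadAll(num)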
Compared with the original article I tweaked a few details and added some functions. It is nothing polished; treat it as a reference.
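To get a feel for what Tools.replace() does to a raw post fragment, here is a minimal sketch; the HTML snippet is invented for illustration and is not copied from an actual Tieba page:

tools = Tools()
sample = '<a href="http://example.com">link</a>text<br><img src="pic.jpg"><p class="x">paragraph</p>'
print(tools.replace(sample))
# The <img> and <a> tags are removed, <br> becomes a newline,
# and the opening <p ...> tag becomes a newline plus two spaces.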