# Uses urllib2 and regular expressions (使用了urllib2和正则表达式)
#coding=utf-8
import urllib
import urllib2
import re
class Tool:
    """Turn a fragment of post HTML into plain text.

    Every class attribute is a compiled pattern for one clean-up step;
    ``replace`` applies the steps in a fixed order and strips the result.
    """

    # <img> tags and each run of seven consecutive spaces are dropped.
    # (The original pattern ended in a stray "|", i.e. an empty alternative
    # that matched at every position and only wasted work.)
    removeImg = re.compile(r'<img.*?>| {7}')
    # Hyperlink tags are dropped; the text between them is kept.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that open a row / open or close a div / close a paragraph
    # become a newline.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # A table cell start becomes a tab.
    replaceTD = re.compile(r'<td>')
    # A paragraph start becomes a newline followed by one space.
    replacePara = re.compile(r'<p.*?>')
    # One or two consecutive <br> tags become a single newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever tag is still left at the end is removed.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with its HTML tags removed or converted to whitespace.

        Args:
            x: HTML text to clean.

        Returns:
            The cleaned text with leading and trailing whitespace stripped.
        """
        # Order matters: the specific tag rules must run before the
        # catch-all ``removeExtraTag`` deletes every remaining tag.
        steps = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, substitute in steps:
            x = pattern.sub(substitute, x)
        return x.strip()
class BDTB:
    """Download a Baidu Tieba thread page by page and save its posts as text.

    Attributes set by ``__init__``:
        baseURL: thread URL without a query string.
        seeLZ: query fragment ``?see_lz=<value>`` sent with every request.
        tool: ``Tool`` instance used to strip HTML from post bodies.
        file: output file object; ``None`` while no file is open.
        floor: number written in front of the next post (starts at 1).
        defaultTitle: file name stem used when no thread title is available.
        floorTag: floor separator lines are written when this equals ``'1'``.
        user_agent / headers: User-Agent header sent with every request.
    """

    # Patterns used to pick the title, the page count and the post bodies
    # out of a thread page; compiled once instead of on every call.
    _TITLE_RE = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
    _PAGE_NUM_RE = re.compile(
        r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    _CONTENT_RE = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)
    # Characters that cannot appear in a file name on common file systems.
    _UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, baseURL, seeLZ, floorTag):
        """Store the thread address and the output options.

        Args:
            baseURL: thread URL without a query string.
            seeLZ: value of the ``see_lz`` query parameter.
            floorTag: ``'1'`` (or ``1``) to write a separator line with the
                floor number before each post.
        """
        self.baseURL = baseURL
        self.seeLZ = "?see_lz=" + str(seeLZ)
        self.tool = Tool()
        self.file = None
        self.floor = 1
        self.defaultTitle = u"百度贴吧"
        self.floorTag = floorTag
        self.user_agent = "Mozilla/4.0(compatible;MSIE 5.5;Windows NT)"
        self.headers = {'User-Agent': self.user_agent}

    @staticmethod
    def _describe(value):
        """Return ``value`` as text, falling back to ``repr`` if it cannot be decoded."""
        try:
            return u"%s" % (value,)
        except UnicodeError:
            return u"%r" % (value,)

    def getPage(self, pageNum):
        """Fetch page ``pageNum`` of the thread.

        Returns:
            The page decoded as UTF-8, or ``None`` when the request raised
            ``urllib2.URLError`` (the reason is printed when available).
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        request = urllib2.Request(url, headers=self.headers)
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if read/decode fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                print(u"连接百度贴吧失败,失败原因 " + self._describe(e.reason))
            return None

    def getTitle(self, page):
        """Return the stripped thread title found in ``page``, or ``None``."""
        found = self._TITLE_RE.search(page)
        return found.group(1).strip() if found else None

    def getPageNum(self, page):
        """Return the page count found in ``page`` as a string, or ``None``."""
        found = self._PAGE_NUM_RE.search(page)
        return found.group(1).strip() if found else None

    def getContent(self, page):
        """Return the cleaned post bodies found in ``page``.

        Each body is passed through ``self.tool.replace``, wrapped in a
        leading and trailing newline and encoded as UTF-8.
        """
        return [
            ("\n" + self.tool.replace(item) + "\n").encode('utf-8')
            for item in self._CONTENT_RE.findall(page)
        ]

    def setFileTitle(self, title):
        """Open ``<title>.txt`` for writing and store it in ``self.file``.

        ``self.defaultTitle`` is used when ``title`` is ``None`` or empty.
        Characters that are not allowed in file names are replaced by ``_``
        so that an unusual thread title cannot make ``open`` fail. A file
        opened by an earlier call is closed first.

        Raises:
            IOError: if the file cannot be opened.
        """
        if self.file is not None:
            self.file.close()
            self.file = None
        stem = title if title else self.defaultTitle
        stem = self._UNSAFE_FILENAME_RE.sub("_", stem)
        self.file = open(stem + ".txt", "w+")

    def writeData(self, contents):
        """Write every item of ``contents`` to ``self.file``.

        When ``floorTag`` is ``'1'`` (or ``1``) a separator line carrying the
        current floor number is written before each item. ``self.floor`` is
        incremented once per item either way.
        """
        with_floor = str(self.floorTag) == '1'
        for item in contents:
            if with_floor:
                floorLine = "\n" + str(self.floor) + "------------------------------------------------------------------\n"
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Download the whole thread and write it to a text file.

        Nothing is written (and no file is created) when the first page
        cannot be fetched or contains no page count. A later page that
        cannot be fetched is reported and skipped. The output file is always
        closed before returning.
        """
        indexPage = self.getPage(1)
        # getPage returns None on a failed request; the parsers need text.
        pageNum = self.getPageNum(indexPage) if indexPage is not None else None
        if pageNum is None:
            print("URL已失效,请重试")
            return
        title = self.getTitle(indexPage)
        try:
            # Opening the file is inside the try so that an IOError from
            # open() is reported like any other write failure.
            self.setFileTitle(title)
            print("该帖子共有" + str(pageNum) + "页")
            for i in range(1, int(pageNum) + 1):
                print("正在写入第" + str(i) + "页")
                # Page 1 was already downloaded above; do not fetch it twice.
                page = indexPage if i == 1 else self.getPage(i)
                if page is None:
                    print("第" + str(i) + "页获取失败,已跳过")
                    continue
                self.writeData(self.getContent(page))
        except IOError as e:
            # str(e) carries errno/strerror/filename; e.message is empty for
            # errno-style IOErrors.
            print("写入异常,原因" + str(e))
        finally:
            if self.file is not None:
                self.file.close()
                self.file = None
            print("写入任务完成")
# Interactive entry point: ask for the thread id and the two options, then
# download the thread. Guarded so that importing this module does not start
# prompting for input.
if __name__ == "__main__":
    print(u"请输入帖子代号")
    # The prompt shows the same prefix that is actually prepended to the id.
    # Answers are stripped so that stray whitespace neither ends up in the
    # URL nor defeats the floorTag == '1' comparison.
    baseURL = 'https://tieba.baidu.com/p/' + raw_input(u'https://tieba.baidu.com/p/').strip()
    seeLZ = raw_input("是否只获取楼主发言,是输入1,否输入0\n").strip()
    floorTag = raw_input("是否写入楼层信息,是输入1,否输入0\n").strip()
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()