#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import urllib
import urllib2


# Helper class that cleans up page markup tags
class Tool:
    """Strip HTML markup from a fragment of a post body.

    The patterns are applied in a fixed order by :meth:`replace`; the
    catch-all ``removeExtraTag`` runs last so the more specific substitutions
    get a chance to act first.

    NOTE(review): the tag text inside most of these patterns was missing from
    the file (only fragments such as ``'| {7}|'`` and ``'|'`` were left). They
    were reconstructed from the adjacent comments and the attribute names --
    confirm them against the markup you actually need to clean.

    NOTE(review): every substitution below inserts a single space. The
    original comments describe replacing line-break tags and paragraph
    openings, so newline / tab / indent escapes may have been lost as well --
    confirm the intended replacement text.
    """

    # <img> tags and runs of exactly seven spaces are dropped.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening and closing hyperlink tags are dropped (the link text is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a line or block.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cell openings.
    replaceTD = re.compile(r'<td>')
    # Paragraph openings.
    replacePara = re.compile(r'<p.*?>')
    # Double or single line-break tags; the double form is tried first so a
    # pair collapses into one replacement.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still left after the substitutions above.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with markup removed and surrounding whitespace stripped.

        :param x: text containing HTML markup.
        :returns: the cleaned text.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub(" ", x)
        x = self.replaceTD.sub(" ", x)
        x = self.replacePara.sub(" ", x)
        x = self.replaceBR.sub(" ", x)
        x = self.removeExtraTag.sub("", x)
        # strip() drops whatever padding the substitutions left at the edges.
        return x.strip()


class BDTB:
    """Download a Baidu Tieba thread and write its posts to a text file.

    NOTE(review): the three page-scraping patterns below were missing from
    the file (their HTML was stripped, leaving unterminated string literals).
    They are a best-effort reconstruction -- verify them against the current
    page markup before relying on the output.
    """

    # Captures the thread title.
    _titlePattern = re.compile(
        r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
    # Captures the page count that follows the reply count.
    _pageNumPattern = re.compile(
        r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    # Captures the body of each post.
    _contentPattern = re.compile(
        r'<div id="post_content_.*?>(.*?)</div>', re.S)
    # Characters replaced in file names: path separators and other
    # characters commonly rejected by file systems.
    _unsafeNameChars = re.compile(r'[\\/:*?"<>|\r\n]+')

    def __init__(self, baseUrl, seeLZ, floorTag):
        """Store the thread address and output options.

        :param baseUrl: thread URL without a query string.
        :param seeLZ: value sent as the ``see_lz`` query parameter.
        :param floorTag: ``'1'`` to write a separator line before each post.
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()
        # Output file object; opened by setFileTitle().
        self.file = None
        # Running post ("floor") number, starting at 1.
        self.floor = 1
        # File name used when the thread title cannot be extracted.
        self.defaultTitle = u"百度某某贴吧"
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch one page of the thread.

        :param pageNum: page number sent as the ``pn`` query parameter.
        :returns: the page decoded as UTF-8, or ``None`` when the request
            fails with ``urllib2.URLError``.
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            return response.read().decode('utf-8')
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                # %r keeps this safe when the reason holds non-ASCII bytes.
                print(u"连接百度贴吧失败,错误原因 %r" % (e.reason,))
            return None

    def _indexPage(self, page):
        """Return ``page``, fetching page 1 only when ``page`` is ``None``."""
        if page is None:
            return self.getPage(1)
        return page

    def getTitle(self, page):
        """Extract the thread title.

        :param page: HTML of the first page; fetched when ``None``.
        :returns: the stripped title, or ``None`` when the page is
            unavailable or the title is not found.
        """
        # Use the page the caller supplied instead of always re-downloading.
        page = self._indexPage(page)
        if page is None:
            return None
        found = self._titlePattern.search(page)
        return found.group(1).strip() if found else None

    def getPageNum(self, page):
        """Extract the number of pages in the thread.

        :param page: HTML of the first page; fetched when ``None``.
        :returns: the page count as stripped text, or ``None`` when the page
            is unavailable or the count is not found.
        """
        page = self._indexPage(page)
        if page is None:
            return None
        found = self._pageNumPattern.search(page)
        return found.group(1).strip() if found else None

    def getContent(self, page):
        """Extract the cleaned post bodies from ``page``.

        :param page: HTML of one thread page.
        :returns: a list of UTF-8 encoded strings, one per post, each padded
            with one space on both sides; empty when ``page`` is ``None``.
        """
        if page is None:
            # A failed download yields no posts rather than a TypeError.
            return []
        return [
            (" " + self.tool.replace(item) + " ").encode('utf-8')
            for item in self._contentPattern.findall(page)
        ]

    def _fileNameFor(self, title):
        """Build the output file name for ``title``.

        Falls back to ``defaultTitle`` when the title is ``None`` or blank,
        and replaces characters that are unsafe in a file name, because the
        title comes from a downloaded page.
        """
        name = title.strip() if title is not None else ""
        if not name:
            name = self.defaultTitle
        return self._unsafeNameChars.sub("_", name) + ".txt"

    def setFileTitle(self, title):
        """Open the output file named after ``title`` and keep it in ``file``.

        :param title: thread title, or ``None`` to use ``defaultTitle``.
        """
        self.file = open(self._fileNameFor(title), "w+")

    def writeData(self, contents):
        """Write each post to the output file and advance the floor counter.

        :param contents: encoded post bodies, as returned by getContent().
        """
        for item in contents:
            if self.floorTag == '1':
                floorline = " " + str(self.floor) + u"------------------------------------- "
                # Encoded so everything written to the file is bytes.
                self.file.write(floorline.encode('utf-8'))
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Download every page of the thread and save the posts to a file."""
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        # Check validity before creating the file so that an invalid URL
        # does not leave an empty file behind.
        if pageNum is None:
            print("URL已失效,请重试")
            return
        title = self.getTitle(indexPage)
        self.setFileTitle(title)
        try:
            print("该帖子共有" + str(pageNum) + "页")
            for i in range(1, int(pageNum) + 1):
                print("正在写入第" + str(i) + "页数据")
                page = self.getPage(i)
                contents = self.getContent(page)
                self.writeData(contents)
        except IOError as e:
            print("写入异常,原因" + str(e))
        else:
            # Only report success when nothing went wrong.
            print("Succeed~")
        finally:
            self.file.close()
# Interactive entry point: ask for the thread id and the two options, then
# download the thread. Input is stripped so stray whitespace cannot break
# the URL or the '1' comparison used for the floor-separator option.
print(u"请输入帖子代码")
baseURL = 'http://tieba.baidu.com/p/' + str(raw_input(u'http://tieba.baidu.com/p/')).strip()
seeLZ = raw_input("是否只看楼主,是输入1,否输入0\n").strip()
floorTag = raw_input("是否写入楼层信息,是输入1,否输入0\n").strip()
bdtb = BDTB(baseURL, seeLZ, floorTag)
bdtb.start()