#-*- coding: utf-8 -*-
"""Created on Fri Apr 15 11:47:02 2016
@author: wuhan"""
from __future__ import print_function

import io
import os
import re
import time
import urllib
import urllib2

# reload(sys)
# sys.setdefaultencoding("utf-8")
class Tool:
    """Reduce the HTML of a single post to plain text.

    NOTE(review): the HTML-tag parts of every pattern below were stripped
    from this file when it was extracted (only fragments such as ``| {12}``
    and ``|`` survived).  They are reconstructed from the attribute names
    and from the replacement strings used in replace() -- confirm them
    against the original script before relying on them.
    """

    # Inline images and runs of 12 spaces are dropped.
    removeImg = re.compile(r'<img.*?>| {12}')
    # Hyperlink tags are dropped; the link text itself is kept.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Block boundaries become a newline.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become a tab.
    replaceTD = re.compile(r'<td>')
    # Paragraph openers become a newline.
    replacePara = re.compile(r'<p.*?>')
    # One or two consecutive line breaks become a single newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever tag is still left is dropped.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with its markup removed and outer whitespace stripped.

        The substitutions run in a fixed order: images, links, block
        boundaries, line breaks, paragraphs, table cells, then any tag that
        is still left.
        """
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.replacePara, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()


class BDTB:
    """Download one Baidu Tieba thread: post text to a file, images to disk.

    NOTE(review): the four page-scraping patterns below were lost in
    extraction (their HTML was rendered away) and are reconstructed --
    confirm them against the original script and the live page markup.
    """

    # Thread title.
    titlePattern = re.compile(
        r'<h1 class="core_title_txt.*?>(.*?)</h1>', re.S)
    # Total number of pages (second number of the reply counter).
    pageNumPattern = re.compile(
        r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    # Body of every post on a page.
    contentPattern = re.compile(
        r'<div id="post_content_.*?>(.*?)</div>', re.S)
    # URL of every posted .jpg image on a page.
    picturePattern = re.compile(
        r'<img class="BDE_Image".*?src="(.+?\.jpg)"')

    # Directory under which the per-day image folders are created.  Kept
    # at the originally hard-coded location; override it on the class or
    # on an instance to download somewhere else.
    pictureRoot = 'E:\\Python\\ImageDownload'

    def __init__(self, baseUrl, seeLZ, floorTag):
        """Store the thread URL and the two user options.

        baseUrl  -- URL of the thread, without any query string.
        seeLZ    -- value placed in the ``see_lz`` query parameter.
        floorTag -- the string '1' makes writeData() emit a separator line
                    in front of every post.
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()
        self.file = None
        self.floor = 1
        self.defaultTitle = u'百度贴吧'
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch page ``pageNum`` and return it decoded from UTF-8.

        Returns None when the request fails with urllib2.URLError; the
        failure reason is printed when the exception carries one.
        """
        try:
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            return response.read().decode('utf-8')
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print(u'百度贴吧链接失败,错误原因 :', e.reason)
            return None

    @staticmethod
    def _searchFirst(pattern, page):
        """Return the stripped first group of ``pattern`` in ``page``, or None.

        A ``page`` of None (failed download) yields None instead of the
        TypeError that re.search() would raise.
        """
        if page is None:
            return None
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        return None

    def getTitle(self, page):
        """Return the thread title found in ``page``, or None."""
        return self._searchFirst(self.titlePattern, page)

    def getPageNum(self, page):
        """Return the page count found in ``page`` as a string, or None."""
        return self._searchFirst(self.pageNumPattern, page)

    def getContents(self, page):
        """Return the cleaned text of every post in ``page``.

        Each entry is wrapped in a leading and a trailing newline.  A
        ``page`` of None yields an empty list.

        NOTE(review): the body of this method was lost in extraction and
        is reconstructed from how start() and writeData() use the result.
        """
        if page is None:
            return []
        return ["\n" + self.tool.replace(item) + "\n"
                for item in re.findall(self.contentPattern, page)]

    def setFileTitle(self, title):
        """Open ``<title>.txt`` for writing, or the default name if title is None.

        The file is opened as UTF-8 text so that non-ASCII post content is
        encoded explicitly instead of through the interpreter's default
        codec.
        """
        name = self.defaultTitle if title is None else title
        self.file = io.open(name + ".txt", "w+", encoding="utf-8")

    def writeData(self, contents):
        """Append every item of ``contents`` to the open output file.

        When floorTag is '1' each item is preceded by a separator line
        carrying the running floor number.  The floor counter advances
        once per item whether or not the separator is written.
        """
        for item in contents:
            if self.floorTag == '1':
                floorLine = "\n" + str(self.floor) + u"-----------------------------------------------------------------------------------------------------------------------------------------\n"
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Download the whole thread: text into one file, images per page."""
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        title = self.getTitle(indexPage)
        # Check reachability before creating the output file, so that a
        # dead URL does not leave an empty file behind.
        if pageNum is None:
            print("URL已失效,请重试")
            return
        self.setFileTitle(title)
        try:
            print("该贴子共有" + str(pageNum) + "页")
            for i in range(1, int(pageNum) + 1):
                print("正在写入第" + str(i) + "页数据")
                page = self.getPage(i)
                if page is None:
                    # getPage() already reported the failure; skip the
                    # page rather than crash on a None document.
                    continue
                contents = self.getContents(page)
                self.writeData(contents)
                self.getPicture(page, i)
        except IOError as e:
            # str(e) rather than e.message: the latter is empty for
            # errno-style IOErrors.
            print("写入异常,原因" + str(e))
        finally:
            if self.file is not None:
                self.file.close()
                self.file = None
            print("写入任务完成")

    def getPicture(self, page, PageNum):
        """Download every image matched in ``page``.

        Files are stored in a folder named after today's date
        (year-month-day, not zero-padded) under pictureRoot and are named
        ``<PageNum>_<index>.jpg``.  The folder is created when missing.
        """
        # All URLs in the page that match the image pattern.
        imglist = re.findall(self.picturePattern, page)
        t = time.localtime(time.time())
        foldername = "%d-%d-%d" % (t.tm_year, t.tm_mon, t.tm_mday)
        # Local download directory.
        picpath = os.path.join(self.pictureRoot, foldername)
        if not os.path.exists(picpath):
            os.makedirs(picpath)
        for x, imgurl in enumerate(imglist):
            target = os.path.join(picpath, '%s_%s.jpg' % (PageNum, x))
            # Copy the remote file straight to disk.
            urllib.urlretrieve(imgurl, target)
# Interactive driver: ask for the thread id and the two options, then
# download the thread.
print(u"请输入帖子代号")
baseURL = 'http://tieba.baidu.com/p/' + str(raw_input(u'http://tieba.baidu.com/p/')).strip()
# The prompts are re-encoded to GBK before being shown.
# NOTE(review): presumably for a GBK (Chinese Windows) console -- confirm.
# Answers are stripped because floorTag is later compared to '1' exactly.
seeLZ = raw_input("是否只获取楼主发言,是输入1,否输入0\n".decode('utf-8').encode('gbk')).strip()
floorTag = raw_input("是否写入楼层信息,是输入1,否输入0\n".decode('utf-8').encode('gbk')).strip()
bdtb = BDTB(baseURL, seeLZ, floorTag)
bdtb.start()