网络爬虫入门——案例一:爬取百度贴吧帖子 (Python web-crawler tutorial: scraping a Baidu Tieba thread)

#-*- coding: utf-8 -*-

"""Created on Fri Apr 15 11:47:02 2016

@author: wuhan"""

import os
import re
import time
import urllib.error
import urllib.request

class Tool:
    """Convert the HTML markup found in Tieba post bodies to plain text.

    NOTE(review): the tag patterns below were destroyed when this file was
    scraped from a web page (the renderer ate the literal ``<...>`` text);
    they are reconstructed to match how ``replace`` uses them -- confirm
    against the original source.
    """

    # drop <img ...> tags and runs of 12 spaces (the '| {12}' alternative
    # survived the scrape verbatim)
    removeImg = re.compile('<img.*?>| {12}')
    # drop hyperlink tags but keep their inner text
    removeAddr = re.compile('<a.*?>|</a>')
    # block-level tags that become a newline
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # table cells become a tab
    replaceTD = re.compile('<td>')
    # paragraph openers become a newline
    replacePara = re.compile('<p.*?>')
    # explicit line breaks become a newline
    replaceBR = re.compile('<br><br>|<br>')
    # anything that still looks like a tag is removed
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Return *x* with Tieba HTML converted to plain, stripped text."""
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.replacePara, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()


class BDTB:
    """Crawl one Baidu Tieba thread: save its text and download its pictures."""

    def __init__(self, baseUrl, seeLZ, floorTag):
        self.baseURL = baseUrl
        # seeLZ == 1 -> request only the thread starter's posts
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()
        self.file = None          # output file handle, opened by setFileTitle
        self.floor = 1            # running floor counter used by writeData
        self.defaultTitle = u'百度贴吧'
        # floorTag == '1' -> prefix every post with a floor separator line
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch page *pageNum* of the thread; return decoded HTML or None."""
        try:
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib.request.Request(url)
            response = urllib.request.urlopen(request)
            return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            if hasattr(e, "reason"):
                print(u'百度贴吧链接失败,错误原因 :', e.reason)
            return None

    def getTitle(self, page):
        """Extract the thread title from *page*, or None if absent."""
        # NOTE(review): pattern reconstructed -- confirm against Tieba markup
        pattern = re.compile('<h1 class="core_title_txt.*?>(.*?)</h1>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        return None

    def getPageNum(self, page):
        """Extract the total page count from *page*, or None if absent.

        NOTE(review): this method was lost in the scraped copy but is called
        by ``start``; body reconstructed -- confirm against the original.
        """
        pattern = re.compile(
            '<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        return None

    def getContents(self, page):
        """Return the list of cleaned post bodies found in *page*.

        NOTE(review): the extraction pattern was lost in the scraped copy;
        reconstructed -- confirm against Tieba markup.
        """
        pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
        items = re.findall(pattern, page)
        return ["\n" + self.tool.replace(item) + "\n" for item in items]

    def setFileTitle(self, title):
        """Open the output .txt file, named after *title* or a default."""
        # explicit utf-8 keeps the original behavior of writing utf-8 text
        if title is not None:
            self.file = open(title + ".txt", "w+", encoding="utf-8")
        else:
            self.file = open(self.defaultTitle + ".txt", "w+", encoding="utf-8")

    def writeData(self, contents):
        """Write each post in *contents* to the file, with optional floor lines."""
        for item in contents:
            if self.floorTag == '1':
                floorLine = "\n" + str(self.floor) + u"-----------------------------------------------------------------------------------------------------------------------------------------\n"
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Drive the crawl: fetch every page, write its text, grab pictures."""
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        title = self.getTitle(indexPage)
        self.setFileTitle(title)
        if pageNum is None:
            print("URL已失效,请重试")
            return
        try:
            print("该贴子共有" + str(pageNum) + "页")
            for i in range(1, int(pageNum) + 1):
                print("正在写入第" + str(i) + "页数据")
                page = self.getPage(i)
                contents = self.getContents(page)
                self.writeData(contents)
                self.getPicture(page, i)
        except IOError as e:
            print("写入异常,原因" + str(e))
        finally:
            print("写入任务完成")

    def getPicture(self, page, PageNum):
        """Download every .jpg referenced in *page* into a dated folder."""
        # NOTE(review): pattern lost in the scraped copy; reconstructed to
        # capture jpg src attributes -- confirm against Tieba markup
        reg = r'src="(.+?\.jpg)" size'
        imgre = re.compile(reg)
        imglist = re.findall(imgre, page)  # all image URLs matching the pattern
        t = time.localtime(time.time())
        foldername = "%d-%d-%d" % (t.tm_year, t.tm_mon, t.tm_mday)
        picpath = 'E:\\Python\\ImageDownload\\%s' % foldername  # local download dir
        if not os.path.exists(picpath):  # create the folder on first use
            os.makedirs(picpath)
        for x, imgurl in enumerate(imglist):
            target = picpath + '\\%s_%s.jpg' % (PageNum, x)
            # fetch the remote image straight to disk
            urllib.request.urlretrieve(imgurl, target)

if __name__ == "__main__":
    # Interactive driver: ask for the thread id and the two crawl options,
    # then run the crawler. (The original py2 .decode/.encode console hack
    # is removed -- py3 input()/print() handle the console encoding.)
    print(u"请输入帖子代号")
    baseURL = 'http://tieba.baidu.com/p/' + str(input(u'http://tieba.baidu.com/p/'))
    seeLZ = input("是否只获取楼主发言,是输入1,否输入0\n")
    floorTag = input("是否写入楼层信息,是输入1,否输入0\n")
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值