python贴吧_Python爬虫学习之爬取百度贴吧资源

aec54bcbd94e53a95ffcbe7590174434.gif

爬取百度贴吧某帖子的各楼层的内容

案例源码

# coding=utf-8
"""Scrape the floors (posts) of one Baidu Tieba thread into a text file.

The script asks for a thread id, whether to keep only the original poster's
floors, and whether to write a numbered separator before each floor, then
saves the text of every floor to ``<thread title>.txt``.

Runs on Python 2.7 and Python 3. Fetching pages needs the third-party
``beautifulsoup4`` and ``html5lib`` packages.
"""
import contextlib
import re

try:  # Python 3
    from urllib.error import URLError
    from urllib.request import Request, urlopen
except ImportError:  # Python 2
    from urllib2 import Request, URLError, urlopen

try:
    _read_input = raw_input  # Python 2
except NameError:
    _read_input = input  # Python 3


class BDTB(object):
    """Downloader for a single Baidu Tieba thread.

    Attributes:
        baseurl: Thread URL without a query string.
        seeLZ: Query-string fragment ``?see_lz=<flag>``.
        file: Open output file while ``start`` is running, otherwise None.
        floor: 1-based number of the next floor to be written.
        floorTag: Floor separators are written when ``str(floorTag) == '1'``.
        defaultTitle: File name stem used when the thread title is unavailable.
    """

    def __init__(self, baseurl, seeLZ, floorTag):
        self.baseurl = baseurl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.file = None
        self.floor = 1
        self.floorTag = floorTag
        self.defaultTitle = u"百度贴吧"

    def _page_url(self, pagenum):
        """Return the URL of page ``pagenum`` of the thread."""
        return self.baseurl + self.seeLZ + '&pn=' + str(pagenum)

    @staticmethod
    def _safe_filename(title):
        """Replace characters that are path separators or illegal on Windows."""
        return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

    def _floor_line(self):
        """Return the UTF-8 encoded separator line for the current floor."""
        line = '\n' + \
            str(self.floor) + \
            u'---------------------------------------------\n'
        return line.encode('utf-8')

    def getpage(self, pagenum):
        """Fetch and parse one page of the thread.

        Returns:
            The parsed ``BeautifulSoup`` document, or None when the request
            fails with ``URLError`` (the reason is printed).
        """
        # Imported lazily so that importing this module does not require bs4
        # until a page is actually fetched.
        from bs4 import BeautifulSoup

        try:
            request = Request(self._page_url(pagenum))
            # closing() releases the connection on both Python 2 and 3.
            with contextlib.closing(urlopen(request)) as response:
                return BeautifulSoup(response.read(), "html5lib")
        except URLError as e:
            reason = getattr(e, 'reason', e)
            print(u"连接百度贴吧失败,错误原因" + u" %s" % (reason,))
            return None

    def getTitle(self):
        """Return the thread title from page 1, or None if it cannot be read."""
        page = self.getpage(1)
        if page is None or page.h3 is None:
            return None
        title = page.h3.get('title')
        if title is not None:
            print(title)
        return title

    def getPageNum(self):
        """Return the number of pages in the thread, or None if unavailable."""
        page = self.getpage(1)
        if page is None:
            return None
        counters = page.find_all(attrs={"class": "red"})
        try:
            # The second element with class "red" holds the page count.
            return int(counters[1].string)
        except (IndexError, TypeError, ValueError):
            return None

    def getcontent(self):
        """Return the text of every floor as a list of UTF-8 byte strings.

        Pages that fail to download are skipped; an empty list is returned
        when the page count cannot be determined.
        """
        total = self.getPageNum()
        if total is None:
            return []
        contents = []
        for pagenum in range(1, total + 1):
            page = self.getpage(pagenum)
            if page is None:
                continue
            # Each floor's body is wrapped in a <cc> element.
            for item in page.find_all('cc'):
                contents.append(item.get_text().encode('utf-8'))
        return contents

    def getFileTitle(self):
        """Open ``self.file`` for writing, named after the thread title.

        Falls back to ``defaultTitle`` when the title is missing or empty
        after sanitising. The file is opened in binary mode because floors
        are written as UTF-8 byte strings.
        """
        title = self.getTitle()
        name = self._safe_filename(title) if title is not None else u''
        if not name:
            name = self.defaultTitle
        self.file = open(name + ".txt", "wb+")

    def _write_contents(self, contents):
        """Write the given floors to ``self.file``, numbering them if enabled."""
        with_tags = str(self.floorTag) == '1'
        for item in contents:
            if with_tags:
                self.file.write(self._floor_line())
            self.file.write(item)
            self.floor += 1

    def writeData(self):
        """Download every floor and write it to the already opened file."""
        self._write_contents(self.getcontent())

    def start(self):
        """Run the whole download and report the outcome on stdout."""
        # Validate the thread before creating any file on disk.
        pagenum = self.getPageNum()
        if pagenum is None:
            print("URL已失效,请重试")
            return
        try:
            self.getFileTitle()
            print("该帖子共有" + str(pagenum) + "页")
            self.writeData()
        except IOError as e:
            print("写入异常,原因" + str(e))
        else:
            # Only claim success when nothing went wrong.
            print("写入成功")
        finally:
            if self.file is not None:
                self.file.close()
                self.file = None


def main():
    """Prompt for the thread id and options, then download the thread."""
    print(u"请输入帖子代号")
    baseurl = 'http://tieba.baidu.com/p/' + \
        str(_read_input(u'http://tieba.baidu.com/p/'))
    seeLZ = _read_input("是否只获取楼主发言,是输入1,否输入0\n")
    floorTag = _read_input("是否写入楼层信息,是输入1否输入0\n")
    bdtb = BDTB(baseurl, seeLZ, floorTag)
    bdtb.start()


if __name__ == "__main__":
    main()

PS:推荐一下我建的 Python 学习交流 QQ 群:937667509。群里有免费的视频教程、开发工具、电子书籍和项目源码分享。学习 Python Web、Python 爬虫、数据分析、大数据、人工智能等技术时遇到不懂的问题,可以加入群里一起交流学习,一起进步!

记得关注评论、转发、收藏哟

长按下面二维码关注我

bcf1f25ed27e3f323d0e81ee010e8146.png

微信公众号:python教程

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值