爬取百度贴吧某帖子的各楼层的内容
案例源码
# coding=utf-8
"""Crawl the per-floor content of a Baidu Tieba post and save it to a text file."""
import urllib.request
import urllib.error

from bs4 import BeautifulSoup


class BDTB:
    """Downloads a Tieba post page by page and writes each floor's text to a .txt file."""

    def __init__(self, baseurl, seeLZ, floorTag):
        self.baseurl = baseurl                 # e.g. 'http://tieba.baidu.com/p/<post id>'
        self.seeLZ = '?see_lz=' + str(seeLZ)   # '1' = show original poster's floors only
        self.file = None                       # output file handle, opened lazily by getFileTitle
        self.floor = 1                         # running floor counter used by writeData
        self.floorTag = floorTag               # '1' = write a numbered separator before each floor
        self.defaultTitle = u"百度贴吧"         # fallback output file name when the title is missing

    def getpage(self, pagenum):
        """Fetch page *pagenum* of the post.

        Returns a BeautifulSoup tree on success, or None when the request fails.
        """
        try:
            url = self.baseurl + self.seeLZ + '&pn=' + str(pagenum)
            request = urllib.request.Request(url)
            response = urllib.request.urlopen(request)
            return BeautifulSoup(response, "html5lib")
        except urllib.error.URLError as e:
            if hasattr(e, 'reason'):
                print(u"连接百度贴吧失败,错误原因", e.reason)
            return None

    def getTitle(self):
        """Return the post title from page 1, or None when the page is unavailable."""
        page = self.getpage(1)
        # Fix: the original dereferenced page.h3['title'] and crashed with
        # AttributeError/KeyError whenever the fetch failed or the tag was absent.
        if page is None or page.h3 is None:
            return None
        title = page.h3.get('title')
        print(title)
        return title

    def getPageNum(self):
        """Return the total number of pages of the post, or None when unknown."""
        page = self.getpage(1)
        if page is None:
            return None
        num = page.find_all(attrs={"class": "red"})
        # Fix: guard the index and the string conversion; the original crashed
        # here on a failed fetch, so its `pagenum == None` check never fired.
        if len(num) < 2 or num[1].string is None:
            return None
        return int(num[1].string)

    def getcontent(self):
        """Collect the text of every <cc> tag (one per floor) across all pages."""
        pagenum = self.getPageNum()
        if pagenum is None:
            return []
        contents = []
        for num in range(1, pagenum + 1):
            page = self.getpage(num)
            if page is None:  # skip pages that failed to download instead of crashing
                continue
            for item in page.find_all('cc'):
                # Text mode with utf-8 (see getFileTitle) replaces the old
                # manual .encode('utf-8') on every item.
                contents.append(item.get_text())
        return contents

    def getFileTitle(self):
        """Open the output file, named after the post title (or the default title)."""
        title = self.getTitle()
        if title is None:
            title = self.defaultTitle
        # NOTE(review): a title containing characters illegal in file names
        # (e.g. '/' or '?') would still fail here — same as the original.
        self.file = open(title + ".txt", "w+", encoding="utf-8")

    def writeData(self):
        """Write every floor to the output file, optionally with a separator line."""
        contents = self.getcontent()
        for item in contents:
            if self.floorTag == '1':
                floorLine = '\n' + str(self.floor) + \
                    u'---------------------------------------------\n'
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Entry point: validate the post exists, then download and write all floors."""
        # Fix: check the page count BEFORE opening the output file, so a dead
        # URL no longer leaves an empty .txt behind; also use `is None`.
        pagenum = self.getPageNum()
        if pagenum is None:
            print("URL已失效,请重试")
            return
        self.getFileTitle()
        try:
            print("该帖子共有" + str(pagenum) + "页")
            self.writeData()
        except IOError as e:
            # Fix: Python 3 exceptions have no .message attribute.
            print("写入异常,原因" + str(e))
        else:
            # Fix: the original printed "写入成功" in `finally`, claiming
            # success even after a write failure.
            print("写入成功")
        finally:
            # Fix: the original never closed the file handle (resource leak).
            if self.file is not None:
                self.file.close()


if __name__ == "__main__":
    # Guarded so importing this module does not block on stdin.
    print(u"请输入帖子代号")
    baseurl = 'http://tieba.baidu.com/p/' + \
        str(input(u'http://tieba.baidu.com/p/'))
    seeLZ = input("是否只获取楼主发言,是输入1,否输入0\n")
    floorTag = input("是否写入楼层信息,是输入1否输入0\n")
    bdtb = BDTB(baseurl, seeLZ, floorTag)
    bdtb.start()
ps:推荐一下我建的Python学习交流QQ群:937667509,群里有免费的视频教程、开发工具、电子书籍和项目源码分享。学习Python Web、Python爬虫、数据分析、大数据、人工智能等技术有不懂的,可以加入一起交流学习,一起进步!
记得关注、评论、转发、收藏哟。长按下面二维码关注我
微信公众号:python教程