# -*- coding:utf-8 -*-
# 抓取百度贴吧帖子
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import urllib2
import re
class Tool:
    """Strip the HTML markup found inside a post body and tidy the text."""

    # <img ...> tags are dropped entirely.
    removeImg = re.compile('<img.*?>')
    # Opening and closing anchor tags are dropped; the link text is kept.
    removeA = re.compile('<a.*?>|</a>')
    # One or more consecutive line-break tags (<br>, <br/>, <br />)
    # collapse into a single newline.
    removeBr = re.compile(r'(<br\s*/?>)+')
    # Runs of seven spaces (paragraph indentation, per the original note).
    removeSpace = re.compile(' {7}')

    def replace(self, pageContent):
        """Return ``pageContent`` with the markup above removed.

        Images and anchor tags are deleted, runs of line-break tags become
        one newline, seven-space runs are deleted, and leading/trailing
        whitespace is stripped from the final text.
        """
        result = self.removeImg.sub("", pageContent)
        result = self.removeA.sub("", result)
        result = self.removeBr.sub("\n", result)
        result = self.removeSpace.sub("", result)
        # strip() removes whatever whitespace is left at either end.
        return result.strip()
class BDTB:
    """Download the posts of one Baidu Tieba thread into a text file."""

    def __init__(self, baseurl, seeLZ, floorTag):
        """Store the thread settings.

        baseurl  -- thread URL without any query string.
        seeLZ    -- value sent as the ``see_lz`` query parameter.
        floorTag -- 1 (or '1') to write a separator line before each post.
        """
        self.baseURL = baseurl
        self.seeLZ = "?see_lz=" + str(seeLZ)
        # Helper that strips HTML tags from post bodies.
        self.tool = Tool()
        # Output file object; opened by setFileTile(), closed by start().
        self.file = None
        # Floor (post) counter, starting at 1.
        self.floor = 1
        # File name used when no title is supplied.
        self.defaultTitle = u"百度贴吧"
        self.floorTag = floorTag

    def getBasePage(self, pageNum):
        """Fetch page ``pageNum`` of the thread.

        Returns the response body decoded as UTF-8, or None when the
        request fails with ``urllib2.URLError``.
        """
        url = self.baseURL + self.seeLZ + "&pn=" + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection whether or not reading succeeded.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print(u"连接百度贴吧失败,错误原因 %s" % (e.reason,))
            return None

    def getPageTile(self, BasePage):
        """Return the thread title found in ``BasePage``, or None.

        None is returned both when ``BasePage`` is None and when no title
        heading is present.
        """
        if BasePage is None:
            return None
        pattern = re.compile('.*?<h3 class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, BasePage)
        if result:
            return result.group(1).strip()
        return None

    def getPageSum(self, BasePage):
        """Return the page count text found in ``BasePage``, or None.

        None is returned both when ``BasePage`` is None (the fetch failed)
        and when the expected markup is missing.
        """
        if BasePage is None:
            return None
        # NOTE(review): this captures the first <span> after the
        # l_reply_num item; confirm that span holds the page count and not
        # some other figure such as the reply count.
        pattern = re.compile('.*?<li class="l_reply_num".*?<span.*?>(.*?)</', re.S)
        result = re.search(pattern, BasePage)
        if result:
            return result.group(1).strip()
        return None

    def getPageContent(self, BasePage):
        """Return the cleaned post bodies of ``BasePage`` as a list.

        Each entry is the tag-stripped post text wrapped in a leading and
        trailing newline. An empty list is returned when ``BasePage`` is
        None.
        """
        if BasePage is None:
            return []
        pattern = re.compile('.*?class="d_post_content j_d_post_content ">(.*?)</div>', re.S)
        # The page text is already decoded, so no further decoding is done.
        return ['\n' + self.tool.replace(item) + '\n'
                for item in re.findall(pattern, BasePage)]

    def setFileTile(self, title):
        """Open ``<title>.txt`` for writing, or the default name if None."""
        name = title if title is not None else self.defaultTitle
        self.file = open(name + ".txt", "w+")

    def writeData(self, contents):
        """Write every post in ``contents`` to the open output file.

        When floorTag is 1 or '1', a separator line is written before each
        post. The floor counter is advanced once per post.
        """
        # Accept both the integer and the string form of the flag.
        withSeparator = str(self.floorTag) == '1'
        for item in contents:
            if withSeparator:
                floorLine = '\n' + '------------------------------------------' + '\n'
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Download every page of the thread and write it to the file.

        Prints a message and returns without creating a file when the
        first page cannot be fetched or carries no page count.
        """
        indexPage = self.getBasePage(1)
        pageSum = self.getPageSum(indexPage)
        if pageSum is None:
            print("URL已失效,请重试")
            return
        # NOTE(review): the title lookup was disabled in the original, so
        # output always goes to the default file name.
        self.setFileTile(None)
        try:
            print("该帖子共有" + str(pageSum) + "页")
            for i in range(1, int(pageSum) + 1):
                print("正在写入第" + str(i) + "页数据")
                page = self.getBasePage(i)
                contents = self.getPageContent(page)
                self.writeData(contents)
        except IOError as e:
            # A write failure ends the run.
            print("写入异常,原因" + str(e))
        finally:
            # Always close the file so buffered data reaches the disk.
            if self.file is not None:
                self.file.close()
                self.file = None
            print("写入任务完成")
# Thread to download.
baseURL = 'http://tieba.baidu.com/p/3138733512'
# The second argument is sent as the see_lz query parameter. The third is
# the floor-separator flag: writeData compares it against the string '1',
# so it is passed as a string to make sure separators are written.
bdtb = BDTB(baseURL, 1, '1')
bdtb.start()
# Supported (working) as of 2017/01/10.
# Python references: http://cuiqingcai.com/993.html and Liao Xuefeng's Python basics tutorial.