# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
#百度贴吧
class BDTB:
#初始化,传入基地址,是否只看楼主的参数
def __init__(self,baseUrl,seeLZ):
self.baseURL = baseUrl
self.seeLZ = '?see_lz='+str(seeLZ)
self.tool = Tool()
self.file = None
#传入页码,获取该页帖子的代码
def getPage(self,pageNum):
try:
url = self.baseURL+ self.seeLZ + '&pn=' + str(pageNum)
request = urllib2.Request(url)
response = urllib2.urlopen(request)
return response.read().decode('utf-8') #此处没写decode('utf-8') 导致result = re.search(pattern,page)处提示错误
except urllib2.URLError, e: #当网络连接失败时报错
if hasattr(e,"reason"):
print u"连接百度贴吧失败,错误原因",e.reason
return None
#获取帖子标题
def getTitle(self):
page = self.getPage(1)
pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>',re.S)
result = re.search(pattern,page)
print type(result)
if result:
print result.group(1) #测试输出
return result.group(1).strip()
else:
return None
#获取帖子一共有多少页
def getPageNum(self):
page = self.getPage(1)
pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>',re.S)
result = re.search(pattern,page)
if result:
print result.group(1) #测试输出
return result.group(1).strip()
else:
return None
def getContent(self,page):
#正则表达式极易弄错
pattern = re.compile('<div id="post_content_.*?>(.*?)</div>',re.S)
items = re.findall(pattern,page)
#使显示楼层,更舒服
contents=[]
for item in items:
content = "\n"+self.tool.replace(item)+"\n"
contents.append(content.encode('utf-8'))
return contents
#将数据写入文件中
def writeTofile(self,contents,file):
self.file = open("ss.txt","w")
count = 1
for content in contents:
#楼之间的分隔符
floorLine = "\n" + str(count)+ "floor: " + u"-----------------------------------------------------------------------------------------\n"
self.file.write(floorLine)
self.file.write(content)
count+=1
self.file.close()
#Tag-cleaning helper: getContent's regex alone would keep many useless tags,
#so this class post-processes each post body down to readable text.
class Tool:
    """Regex-based HTML cleaner used by BDTB.getContent.

    Each class attribute is a pre-compiled pattern; replace() applies them
    in a fixed order and returns the cleaned, stripped text.
    """
    # strip <img ...> tags and runs of 7 spaces
    # (BUG FIX: the original pattern ended with a stray '|', an empty
    # alternative matching at every position — removed; output is unchanged)
    removeImg = re.compile('<img.*?>| {7}')
    # strip hyperlink open/close tags, keeping the link text
    removeAddr = re.compile('<a.*?>|</a>')
    # line-breaking tags become '\n'
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # table cells <td> become '\t'
    replaceTD = re.compile('<td>')
    # paragraph openings become a newline plus indent
    replacePara = re.compile('<p.*?>')
    # <br> (doubled or single) becomes '\n'
    replaceBR = re.compile('<br><br>|<br>')
    # any remaining tag is dropped entirely
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Return x with tags removed/translated and edge whitespace stripped."""
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        # strip() drops leading/trailing whitespace introduced by the substitutions
        return x.strip()
# Script entry: crawl page 1 of the target thread (original-poster posts
# only) and dump the cleaned post bodies to ss.txt.
baseURL = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseURL, 1)  # 1 = only show the original poster's posts
firstPage = bdtb.getPage(1)
contents = bdtb.getContent(firstPage)
bdtb.writeTofile(contents, "ss.txt")
# (三)爬取百度帖子(完善)
# 最新推荐文章于 2020-06-16 09:24:57 发布