抓取百度贴吧python小爬虫（2015最新版）

最新推荐文章于 2024-05-03 18:15:12 发布

djd已经存在

最新推荐文章于 2024-05-03 18:15:12 发布

阅读量2.4k

点赞数 1

分类专栏：python网络爬虫　文章标签：正则表达式 爬虫 网络爬虫 python 源代码

本文链接：https://blog.csdn.net/djd1234567/article/details/45230147

版权

python网络爬虫专栏收录该内容

33 篇文章 0 订阅

订阅专栏

由于百度贴吧的编码格式和HTML代码变动，许多旧的爬虫已失效。本文介绍一个2015年更新的Python爬虫源代码，用于应对这些变化，实现有效抓取。

摘要由CSDN通过智能技术生成

网上好多抓取贴吧的小爬虫都失效了，原因是百度贴吧的编码格式变了，或者是 HTML 代码变了。像这种简单读取网页源代码的爬虫，只要网页源代码一改变，就得重新修改。

请诸位大牛指点。

# -*- coding:utf8 -*-
"""
程序就是读取网页的源代码，如果想获取相应的内容就找到其特定的格式，再利用正则表达式来获取。
"""
import string
import urllib2
import re

class html_Tool:
    """Reduce a fragment of post HTML to plain text.

    Every pattern below is replaced with the empty string, so tags and the
    whitespace matched by ``replacechar1`` are deleted outright rather than
    being converted into layout characters.  Character references listed in
    ``replaceSymbol`` are decoded afterwards.
    """

    # Tabs, newlines, plain spaces, and anchor / image tags.
    replacechar1 = re.compile("(\t|\n| |<a.*?>|<img.*?>)")
    # Any remaining tag.
    replacechar2 = re.compile("<.*?>")
    # Paragraph openers, line-break-like tags and table cells.  ReplaceChar
    # applies these after replacechar2, so they normally find nothing left
    # to match; they are kept so the class attributes remain available.
    replacechar3 = re.compile("<p.*?>")
    replacechar4 = re.compile("(<br/>|</p>|<tr>|<div>|</div>)")
    replacechar5 = re.compile("<td>")

    # HTML character references and the characters they stand for.  "&amp;"
    # is decoded last so that text such as "&amp;lt;" is decoded exactly once
    # (to "&lt;") instead of a second time to "<".
    replaceSymbol = [
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&nbsp;", " "),
        ("&amp;", "&"),
    ]

    def ReplaceChar(self,x):
        """Return *x* with markup removed and character references decoded.

        Args:
            x: HTML fragment to clean.

        Returns:
            The fragment with everything matched by ``replacechar1`` ..
            ``replacechar5`` deleted (in that order) and each reference in
            ``replaceSymbol`` replaced by its character.
        """
        patterns = (
            self.replacechar1,
            self.replacechar2,
            self.replacechar3,
            self.replacechar4,
            self.replacechar5,
        )
        for pattern in patterns:
            x = pattern.sub("", x)

        # Decoding happens after tag removal so that a decoded "<" or ">" is
        # never mistaken for markup.
        for entity, char in self.replaceSymbol:
            x = x.replace(entity, char)
        return x

class BaiduTieba_Spider:
    """Save the posts of one Baidu Tieba thread to a local text file.

    Pages are downloaded with ``urllib2`` and scraped with regular
    expressions.  Progress messages are written with single-argument
    ``print(...)``, which the Python 2 print statement treats exactly like
    the unparenthesised form.
    """

    def __init__(self,url):
        """Store the thread URL and prepare the result buffer.

        Args:
            url: Thread address without a query string; ``?see_lz=1`` is
                appended to it.
        """
        # NOTE(review): see_lz=1 is presumably Tieba's "thread starter only"
        # filter (the progress messages talk about the thread starter's
        # pages) -- confirm against the site.
        self.myUrl = url + '?see_lz=1'
        # Cleaned post texts, each terminated by a newline.
        self.datas = []
        self.myTool = html_Tool()
        print(u"已经启动百度贴吧爬虫，哇哈哈哈。")

    def _fetch(self,url):
        """Return the body of *url* decoded as UTF-8.

        The response object is closed even when reading or decoding fails.
        """
        response = urllib2.urlopen(url)
        try:
            return response.read().decode("utf8")
        finally:
            response.close()

    def baidu_tieba(self):
        """Run the whole crawl: inspect the first page, then download and save."""
        mypage = self._fetch(self.myUrl)
        PageNum = self.page_counter(mypage)
        title = self.find_title(mypage)
        print(u"文章名称："+title)
        self.save_data(self.myUrl,title,PageNum)

    def page_counter(self,mypage):
        """Return the number of pages announced by *mypage*.

        The count is the integer inside the first ``class="red">N</span>``
        element.  When no such element exists, 0 is returned.
        """
        myMatch = re.search(r'class="red">(\d+?)</span>',mypage,re.S)
        if not myMatch:
            print(u"爬虫报告：没看楼主的套路。。。")
            return 0

        PageNum = int(myMatch.group(1))
        print(u"爬虫报告：发现楼主有%d页原创内容" %PageNum)
        return PageNum

    def find_title(self,mypage):
        """Return the thread title, stripped of characters unsafe in file names.

        The title is the content of the ``<h1 class="core_title_txt...">``
        element.  When that element is missing, a placeholder title is used
        and a warning is printed.
        """
        myMatch = re.search(r'<h1 class="core_title_txt.*?>(.*?)</h1>',mypage,re.S)
        if myMatch:
            title = myMatch.group(1)
        else:
            title = u'初始化标题'
            print(u"爬虫报告：无法加载文章标题！")

        # The title becomes a file name, so drop \ / : * ? " < > | in one pass.
        return re.sub(r'[\\/:*?"<>|]', '', title)

    def save_data(self,url,title,PageNum):
        """Download every page, write the posts to ``<title>.txt``, then wait.

        Args:
            url: Thread URL that already carries a query string.
            title: File name stem for the output file.
            PageNum: Number of pages to download.
        """
        self.get_data(url,PageNum)

        # The with-block closes the file even if writing raises.
        with open(title+'.txt','w+') as f:
            f.writelines(self.datas)

        print(u"爬虫报告：文件已下载到本地并打包成txt格式文件")
        print(u"请输入任意键退出。。。")
        # Blocks until the user presses Enter.
        raw_input()

    def get_data(self,url,PageNum):
        """Fetch pages 1..PageNum of *url* and collect their posts in ``self.datas``."""
        # *url* already contains "?see_lz=1", hence "&" rather than "?".
        url = url + "&pn="

        for i in range(1,PageNum+1):
            print(u"爬虫报告：爬虫%d号正在加载中。。。"%i)
            self.deal_data(self._fetch(url+str(i)))

    def deal_data(self,mypage):
        """Extract every post body from *mypage* and append it to ``self.datas``.

        A post body is whatever follows an ``id="post_content...">`` opening
        up to the next ``</div>``.
        """
        myItems = re.findall('id="post_content.*?>(.*?)</div>',mypage,re.S)

        for item in myItems:
            # Encode to UTF-8 before cleaning so that the stored entries can
            # be written to the output file unchanged.
            data = self.myTool.ReplaceChar(item.replace("\n","").encode("utf8"))
            self.datas.append(data+'\n')


# Start-up banner.
print(u"""
 ---------------------
|   京东放养的爬虫      |
 ---------------------
 """)

# Only the trailing numeric thread id is typed in; the fixed URL prefix is
# shown as the input prompt and prepended to whatever the user enters.
print(u"请输入贴吧地址最后的数字")
bdurl = "http://tieba.baidu.com/p/" + raw_input(u"http://tieba.baidu.com/p/")

# The constructor appends the see_lz query flag to the URL; baidu_tieba()
# performs the download and writes the text file.
mySpider = BaiduTieba_Spider(bdurl)
mySpider.baidu_tieba()