BeautifulSoup简单爬取百度贴吧

最新推荐文章于 2023-10-28 13:38:56 发布

越过山丘宁宁宁

最新推荐文章于 2023-10-28 13:38:56 发布

阅读量1.7k

点赞数 1

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/ningyingqi/article/details/78759246

版权

python 专栏收录该内容

10 篇文章 1 订阅

订阅专栏

BeautifulSoup简单爬取百度贴吧()

一.分析百度贴吧网页信息
注意：本人使用的环境为python3.6+pycharm2017.2.4

我们以百度贴吧权利的游戏吧为例:http://tieba.baidu.com/f?ie=utf-8&kw=权利的游戏&fr=search
分析网页我们发现规律:

这里写图片描述

每换一页，pn增加50，这个规律在我们循环抓取每页信息的时候用得上。
好了我们现在来分析下我们需要获取的东西有哪些？

1.话题

2.作者id

3.最后回复者的id

4.最后回复时间

这里写图片描述

二.编写代码

1.日志部分

为了方便我们查看我们的操作记录我们写一个MyLog类打印日志信息

#! -*- encoding:utf-8 -*-\
"""
乱码问题 解决方式一:#! -*- encoding:utf-8 -*-
          方式二:u'哈哈哈'  字符串以unicode格式存储
"""
import logging
import getpass
import sys

class MyLog(object):
    """Thin logging wrapper that sends every record to a file and to the console.

    The underlying logger is named after the current OS user (as reported by
    ``getpass.getuser()``) and accepts records from ``DEBUG`` upwards.
    """

    def __init__(self, logFile=None):
        """Create the logger and attach a file handler and a stream handler.

        Args:
            logFile: Path of the log file. When omitted, the path is derived
                from the running script: ``sys.argv[0]`` without its last
                three characters, plus ``.log``.
        """
        self.user=getpass.getuser()
        self.logger=logging.getLogger(self.user)
        # Severity order, highest first: critical, error, warning, info, debug.
        self.logger.setLevel(logging.DEBUG)

        # Log file path. The default drops the last three characters of the
        # script name (normally ".py") and appends ".log".
        if logFile is None:
            logFile=sys.argv[0][0:-3]+'.log'
        self.logFile=logFile

        # Output format shared by both handlers.
        self.formatter=logging.Formatter('%(asctime) -12s %(levelname) -8s %(name) -10s %(message)-12s\r\n')

        # Handler writing to the log file.
        self.logHand=logging.FileHandler(self.logFile, encoding='utf8')
        self.logHand.setFormatter(self.formatter)
        self.logHand.setLevel(logging.DEBUG)

        # Handler writing to the console (StreamHandler's default stream).
        self.logHandSt=logging.StreamHandler()
        self.logHandSt.setFormatter(self.formatter)
        # The level must be set on the stream handler itself, not on the
        # file handler a second time.
        self.logHandSt.setLevel(logging.DEBUG)

        # Attach both handlers to the logger.
        self.logger.addHandler(self.logHand)
        self.logger.addHandler(self.logHandSt)

    # Convenience methods forwarding to the wrapped logger, one per level.
    def debug(self,msg):
        """Log ``msg`` at DEBUG level."""
        self.logger.debug(msg)

    def info(self,msg):
        """Log ``msg`` at INFO level."""
        self.logger.info(msg)

    def warn(self,msg):
        """Log ``msg`` at WARNING level."""
        # Logger.warn is a deprecated alias; call Logger.warning directly.
        self.logger.warning(msg)

    def error(self,msg):
        """Log ``msg`` at ERROR level."""
        self.logger.error(msg)

    def critical(self,msg):
        """Log ``msg`` at CRITICAL level."""
        self.logger.critical(msg)

if __name__=='__main__':
    # Smoke test: send one message through the wrapper at each level,
    # from the least to the most severe.
    mylog=MyLog()
    for emit,text in (
        (mylog.debug,u'debug测试'),
        (mylog.info,u'info测试'),
        (mylog.warn,u'warn测试'),
        (mylog.error,u'error测试'),
        (mylog.critical,u'critical测试'),
    ):
        emit(text)

2.获取url
一般爬虫爬取信息可分为三部分：1.获取url 2.爬取信息 3.信息保存
1）获取url
根据我们上面图片的分析开始编写代码:

self.pageSum=1#要爬取的页数
#单独定义一个保存目标网站url的类
# 根据pageSum拼装要爬取的地址   思考一 ：如何确定总共有多少页数？
    def getUrls(self,pageSum):
        """Build the URL of every listing page to crawl.

        Each page is addressed through the ``pn`` query parameter, which
        takes the values 0, 50, 100, ... (50 per page).

        Args:
            pageSum: Number of pages to generate URLs for.

        Returns:
            list: One URL string per page, derived from ``self.url``.
        """
        import re

        # Match the pn parameter wherever it sits in the query string, so a
        # URL whose last parameter is something else is not corrupted.
        pnPattern=re.compile(r'([?&]pn=)[^&#]*')
        urls=[]
        for page in range(pageSum):
            pn=str(page*50)
            if pnPattern.search(self.url):
                url=pnPattern.sub(lambda m: m.group(1)+pn,self.url,count=1)
            elif '?' not in self.url:
                url=self.url+'?pn='+pn
            elif self.url.endswith(('?','&')):
                url=self.url+'pn='+pn
            else:
                url=self.url+'&pn='+pn
            urls.append(url)
            self.log.debug('----待爬取的地址有:'+url)
        self.log.info(u'获取URL成功')
        return urls

2）爬虫部分
数据爬取我们利用BeautifulSoup配合lxml解析器的方式。同样我们定义一个方法spider。

    def spider(self,urls):
        """Download each listing page and extract one ``Item`` per thread.

        Args:
            urls: Iterable of listing-page URLs.

        Returns:
            list: ``Item`` objects in page order. Pages that could not be
            downloaded, and thread entries missing an expected element,
            are skipped and logged.
        """
        items=[]
        for url in urls:
            htmlContent=self.getResponseContent(url)
            if htmlContent is None:
                # getResponseContent returns None when the download failed;
                # there is nothing to parse, so move on to the next page.
                self.log.error(u'页面 %s 下载失败，跳过' % url)
                continue
            soup=BeautifulSoup(htmlContent,'lxml')
            tagsli=soup.find_all('li',attrs={'class':' j_thread_list clearfix'})
            for tag in tagsli:
                item=Item()
                try:
                    item.title=tag.find('a',attrs={'class':'j_th_tit '}).get_text().strip()
                    item.author=tag.find('div',attrs={'class':'threadlist_author pull_right'}).span['title']
                    item.lastreplypeople=tag.find('span',attrs={'class':'tb_icon_author_rely j_replyer'})['title']
                    item.lastreplytime=tag.find('span',attrs={'class':'threadlist_reply_date pull_right j_reply_data'}).getText().strip()
                except (AttributeError,TypeError,KeyError):
                    # find() returned None or the tag has no title attribute:
                    # drop this entry instead of aborting the whole crawl.
                    self.log.error(u'帖子结构不完整，跳过该条目')
                    continue

                items.append(item)
                self.log.info(u'获取标题为 -->%s--< 的帖子成功' % (item.title))
        return items

3）数据保存

 # 保存爬取的文件
    def pipelines(self,items,fileName=u'百度贴吧.txt'):
        """Write the scraped items to a UTF-8 text file, one line per item.

        Args:
            items: Iterable of objects exposing ``title``, ``author``,
                ``lastreplypeople`` and ``lastreplytime``.
            fileName: Path of the output file; any existing file is
                overwritten.
        """
        # newline='' disables newline translation so the explicit '\r\n'
        # below is written as-is on every platform.
        with open(fileName,'w',encoding='utf8',newline='') as fp:
            for item in items:
                fp.write('%s %s %s %s\t\t  \r\n' %(item.title,item.author,item.lastreplypeople,item.lastreplytime))
                self.log.info(u'标题为 -->%s<-- 的帖子保存成功' %(item.title))
4）打开网页：当然还需要另外一个方法来获取网页数据

 def getResponseContent(self,url):
        """Download ``url`` and return the raw response body.

        Args:
            url: Address to fetch; non-ASCII characters are percent-encoded
                before the request is made.

        Returns:
            bytes: The response body, or ``None`` when the request fails
            with ``URLError``.
        """
        # Percent-encode the non-ASCII characters of the address.
        url=quote(url,safe=string.printable)
        try:
            # The with-block closes the response once the body is read.
            with urllib.request.urlopen(url) as response:
                content=response.read()
        except error.URLError as e:
            self.log.error(u'python爬取%s 出错了' %url)
            # Record the failure reason in the log as well.
            self.log.error(u'错误原因: %s' % e)
            return None
        self.log.info(u'python爬取%s 成功' %url)
        return content

三.总结
爬取信息部分值得注意的是：分析网页、定位元素的时候一定要细心，多尝试。如何查看网页源代码应该不用说明了吧，哈哈。
最后附上源码：

#! -*- encoding:utf-8 -*-\

from MyLog import MyLog
import string
from urllib.parse import quote
from urllib import error
import urllib.request
from bs4 import BeautifulSoup
import  codecs


class Item(object):
    """Plain record for one scraped thread; every field defaults to ``None``."""

    # Filled in by the crawler: thread title, author, last replier and the
    # time of the last reply.
    title=author=lastreplypeople=lastreplytime=None

class GetBaiTieBa(object):
    """Crawl the thread listing of a Baidu Tieba forum and save it to a text file.

    Creating an instance runs the whole pipeline at once: build the page
    URLs, download and parse each page, then write the results to disk.
    """

    def __init__(self,url):
        """Run the crawl starting from ``url``.

        Args:
            url: Listing URL of the forum; its ``pn`` parameter is rewritten
                for each page.
        """
        self.url=url
        self.log=MyLog()
        self.pageSum=1  # number of pages to crawl
        # Build the addresses to crawl from pageSum.
        self.urls=self.getUrls(self.pageSum)
        # Crawl.
        self.items=self.spider(self.urls)
        # Save.
        self.pipelines(self.items)

    def getUrls(self,pageSum):
        """Build the URL of every listing page to crawl.

        Each page is addressed through the ``pn`` query parameter, which
        takes the values 0, 50, 100, ... (50 per page).

        Args:
            pageSum: Number of pages to generate URLs for.

        Returns:
            list: One URL string per page, derived from ``self.url``.
        """
        import re

        # Match the pn parameter wherever it sits in the query string, so a
        # URL whose last parameter is something else is not corrupted.
        pnPattern=re.compile(r'([?&]pn=)[^&#]*')
        urls=[]
        for page in range(pageSum):
            pn=str(page*50)
            if pnPattern.search(self.url):
                url=pnPattern.sub(lambda m: m.group(1)+pn,self.url,count=1)
            elif '?' not in self.url:
                url=self.url+'?pn='+pn
            elif self.url.endswith(('?','&')):
                url=self.url+'pn='+pn
            else:
                url=self.url+'&pn='+pn
            urls.append(url)
            self.log.debug('----待爬取的地址有:'+url)
        self.log.info(u'获取URL成功')
        return urls

    def spider(self,urls):
        """Download each listing page and extract one ``Item`` per thread.

        Args:
            urls: Iterable of listing-page URLs.

        Returns:
            list: ``Item`` objects in page order. Pages that could not be
            downloaded, and thread entries missing an expected element,
            are skipped and logged.
        """
        items=[]
        for url in urls:
            htmlContent=self.getResponseContent(url)
            if htmlContent is None:
                # The download failed; there is nothing to parse, so move
                # on to the next page.
                self.log.error(u'页面 %s 下载失败，跳过' % url)
                continue
            soup=BeautifulSoup(htmlContent,'lxml')
            tagsli=soup.find_all('li',attrs={'class':' j_thread_list clearfix'})
            for tag in tagsli:
                item=Item()
                try:
                    item.title=tag.find('a',attrs={'class':'j_th_tit '}).get_text().strip()
                    item.author=tag.find('div',attrs={'class':'threadlist_author pull_right'}).span['title']
                    item.lastreplypeople=tag.find('span',attrs={'class':'tb_icon_author_rely j_replyer'})['title']
                    item.lastreplytime=tag.find('span',attrs={'class':'threadlist_reply_date pull_right j_reply_data'}).getText().strip()
                except (AttributeError,TypeError,KeyError):
                    # find() returned None or the tag has no title attribute:
                    # drop this entry instead of aborting the whole crawl.
                    self.log.error(u'帖子结构不完整，跳过该条目')
                    continue

                items.append(item)
                self.log.info(u'获取标题为 -->%s--< 的帖子成功' % (item.title))
        return items

    def getResponseContent(self,url):
        """Download ``url`` and return the raw response body.

        Args:
            url: Address to fetch; non-ASCII characters are percent-encoded
                before the request is made.

        Returns:
            bytes: The response body, or ``None`` when the request fails
            with ``URLError``.
        """
        # Percent-encode the non-ASCII characters of the address.
        url=quote(url,safe=string.printable)
        try:
            # The with-block closes the response once the body is read.
            with urllib.request.urlopen(url) as response:
                content=response.read()
        except error.URLError as e:
            self.log.error(u'python爬取%s 出错了' %url)
            # Record the failure reason in the log as well.
            self.log.error(u'错误原因: %s' % e)
            return None
        self.log.info(u'python爬取%s 成功' %url)
        return content

    def pipelines(self,items,fileName=u'百度贴吧.txt'):
        """Write the scraped items to a UTF-8 text file, one line per item.

        Args:
            items: Iterable of objects exposing ``title``, ``author``,
                ``lastreplypeople`` and ``lastreplytime``.
            fileName: Path of the output file; any existing file is
                overwritten.
        """
        # newline='' disables newline translation so the explicit '\r\n'
        # below is written as-is on every platform.
        with open(fileName,'w',encoding='utf8',newline='') as fp:
            for item in items:
                fp.write('%s %s %s %s\t\t  \r\n' %(item.title,item.author,item.lastreplypeople,item.lastreplytime))
                self.log.info(u'标题为 -->%s<-- 的帖子保存成功' %(item.title))

if __name__=='__main__':
    # Script entry point: show the start address, then crawl it. Creating
    # the GetBaiTieBa instance runs the whole crawl.
    url=u'http://tieba.baidu.com/f?kw=权利的游戏&ie=utf-8&pn=50'
    print('url:', url)
    tb=GetBaiTieBa(url)

保存的text：
这里写图片描述

越过山丘宁宁宁

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
1
评论
BeautifulSoup简单爬取百度贴吧

BeautifulSoup简单爬取百度贴吧()一.分析百度贴吧网页信息注意：本人使用的环境为python3.6+pycharm2017.2.4我们以百度贴吧权利的游戏吧为例:http://tieba.baidu.com/f?ie=utf-8&kw=权利的游戏&fr=search 分析网页我们发现规律:每换一页pn增加50 这个在我们抓取每页信息循环页数的时候用得上。好了我们现在来分析下我们
复制链接

扫一扫