爬取百度贴吧精品贴，并将每个帖子的内容保存到以帖子标题命名的 Excel 表格中

最新推荐文章于 2021-11-22 18:55:55 发布

dayun555

最新推荐文章于 2021-11-22 18:55:55 发布

阅读量463

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/dayun555/article/details/79488480

版权

python 专栏收录该内容

74 篇文章 2 订阅

订阅专栏

# -*- coding:utf-8 -*-
import re
from urllib import request
# Import the project's own helper class; its source is given later in this post.
from tools import Tools
from fake_useragent import UserAgent
# Shared UserAgent instance; BDTBSpider reads agent.random for its User-Agent header.
agent = UserAgent()
import xlwt

"""
    https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0
    kw 贴吧名称
    tab 帖子类型
    pn 数据页码
    
    https://tieba.baidu.com/p/5328438222?pn=2
    
    /p/5381402933 帖子详情地址
    pn 页码
"""


class BDTBSpider(object):
    """Crawl the thread list of a Baidu Tieba forum and dump each thread to .xls.

    Typical use: call ``get_html`` with a thread-list URL, then ``parse_link``.
    For every thread found on the list page a workbook named after the thread
    title is written to the current directory, one row per post.

    Relies on module-level names ``agent``, ``xlwt`` and ``Tools`` being in
    scope where this class is defined.
    """

    # Captures (relative thread href, thread title) from a thread-list page.
    _LINK_PATTERN = re.compile(
        '<div class="threadlist_title pull_left j_th_tit.*?".*?<a rel="noreferrer".*?href="('
        '.*?)".*?title="(.*?)"', re.S)
    # Captures (nickname, badge title, badge level, content, post tail) per post.
    _DETAIL_PATTERN = re.compile(
        '<li class="d_name".*?>(.*?)</li>.*?'
        '<div class="d_badge_title ">(.*?)</div>.*?'
        '<div class="d_badge_lv">(.*?)</div>.*?'
        '<cc>(.*?)</cc>.*?'
        '<div class="post-tail-wrap"(.*?)</div>', re.S)
    # Captures the target of the first anchor in a small HTML window.
    _HREF_PATTERN = re.compile('<a href="(.*?)"')
    # Header row of every generated sheet, in column order.
    _COLUMN_TITLES = ('用户昵称', '用户头衔', '用户等级', '发表内容',
                      '客户端', '楼层', '发布日期')

    def __init__(self):
        self.url = 'https://tieba.baidu.com'
        self.html = ''
        self.headers = {'User-Agent': agent.random}
        # Next sheet row to write; row 0 is the header row.
        self.count = 1

    def get_html(self, url):
        """Download ``url`` and store the decoded page in ``self.html``.

        :param url: complete URL to fetch
        :return: None
        """
        req = request.Request(url, headers=self.headers)
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(req) as response:
            self.html = response.read().decode('utf-8', 'ignore')

    @classmethod
    def _find_next_link(cls, html, marker, window):
        """Return the href of the anchor found just before ``marker``, or None.

        Only the ``window`` characters preceding the marker are searched. None
        is returned when the marker is absent or no anchor is found there.
        """
        index = html.find(marker)
        if index == -1:
            # Without this guard the slice below would wrap around and scan
            # the tail of the document instead.
            return None
        match = cls._HREF_PATTERN.search(html[max(0, index - window):index])
        return match.group(1) if match else None

    @staticmethod
    def _safe_filename(title):
        """Replace characters that are not valid in file names with ``_``."""
        cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title)
        return cleaned or 'untitled'

    def parse_link(self):
        """Export every thread linked from the list page held in ``self.html``.

        After the threads of one list page are saved, the "next page" link of
        that same list page is followed, until no such link is found.

        :return: None
        """
        visited = set()
        while True:
            # Keep the list page: get_html() overwrites self.html with thread
            # pages inside the loop, but pagination must read the list page.
            list_html = self.html
            for href, title in self._LINK_PATTERN.findall(list_html):
                print("正在爬取{},请稍后.....".format(title))
                # Thread hrefs are site-relative, so prefix the site root.
                self.get_html(self.url + href)
                workbook = xlwt.Workbook(encoding='utf-8')
                sheet = workbook.add_sheet('data')
                for column, heading in enumerate(self._COLUMN_TITLES):
                    sheet.write(0, column, heading)
                self.count = 1
                self.parse_detail(sheet)
                workbook.save(self._safe_filename(title) + '.xls')

            next_href = self._find_next_link(
                list_html, 'class="next pagination-item', 80)
            if next_href is None:
                print('没有下一页')
                return
            # The captured href has no scheme, so one is prepended.
            link = 'http:' + next_href
            if link in visited:
                # A page pointing back to an already crawled page would loop forever.
                print('没有下一页')
                return
            visited.add(link)
            print(link)
            self.get_html(link)

    def parse_detail(self, sheet):
        """Write every post of the thread in ``self.html`` to ``sheet``.

        Rows are written starting at ``self.count``, which is advanced per
        post. Follows the thread's own "下一页" link page by page.

        :param sheet: worksheet object providing ``write(row, col, value)``
        :return: None
        """
        visited = set()
        while True:
            print('正在爬取下一页，请稍后')
            for info in self._DETAIL_PATTERN.findall(self.html):
                nickname = Tools.strip_char(info[0])
                content = Tools.strip_char(info[3])
                # (client, floor, date) extracted from the post tail markup.
                client, floor, date = Tools.get_client_floor_data(info[4])
                row = (nickname, info[1], info[2], content, client, floor, date)
                for column, value in enumerate(row):
                    sheet.write(self.count, column, value)
                self.count += 1

            next_link = self._find_next_link(self.html, '下一页', 50)
            if next_link is None:
                print('没有下一页')
                return
            next_url = self.url + next_link
            if next_url in visited:
                print('没有下一页')
                return
            visited.add(next_url)
            self.get_html(next_url)

工具类 Tools 所在的文件 tools.py：

# -*- coding:utf-8 -*-
import re
import sqlite3

class Tools(object):
    """Stateless helpers for cleaning scraped Tieba HTML fragments."""

    # Newlines, tabs, spaces and HTML tags are all removed by strip_char.
    _STRIP_PATTERN = re.compile('\n|\t| |<.*?>', re.S)
    # Post tail that names a client: captures (client, floor, date).
    _CLIENT_TAIL_PATTERN = re.compile(
        '<span class="tail-info".*?<a rel="noopener.*?>(.*?)</a>'
        '.*?class="tail-info">(.*?)</span.*?class="tail-info">(.*?)</span>')
    # Post tail without a client: captures (floor, date).
    _WEB_TAIL_PATTERN = re.compile(
        '<span class="tail-info">(.*?)</span.*?class="tail-info">(.*?)</span>')

    @classmethod
    def strip_char(cls, string):
        """Remove newlines, tabs, spaces and HTML tags from ``string``.

        :param string: text to clean
        :return: the cleaned text
        """
        return cls._STRIP_PATTERN.sub('', string)

    @classmethod
    def get_client_floor_data(cls, string):
        """Extract client, floor and publish date from a post tail fragment.

        :param string: markup containing the client / floor / date spans
        :return: tuple ``(client, floor, date)``. When the markup contains no
            '来自' the client is reported as '来自Web客户端'. Fields that
            cannot be located are returned as empty strings instead of
            raising, so one malformed post does not abort the whole crawl.
        """
        if '来自' in string:
            match = cls._CLIENT_TAIL_PATTERN.search(string)
            if match is None:
                # '来自' was present but not in the expected markup.
                return '', '', ''
            return '来自' + match.group(1), match.group(2), match.group(3)

        match = cls._WEB_TAIL_PATTERN.search(string)
        if match is None:
            return '来自Web客户端', '', ''
        return '来自Web客户端', match.group(1), match.group(2)

# Database manager class
class DBManger(object):
    """Class-level wrapper around one shared sqlite3 connection and cursor."""

    # Shared by every caller; populated by connect_db and reset by close_db.
    connect = None
    cursor = None

    @classmethod
    def connect_db(cls, db_path='qsbk.db'):
        """Open the database and create the shared cursor.

        :param db_path: database location passed to ``sqlite3.connect``;
            defaults to the original hard-coded 'qsbk.db'
        """
        cls.connect = sqlite3.connect(db_path)
        cls.cursor = cls.connect.cursor()

    @classmethod
    def close_db(cls):
        """Close the cursor and connection; a no-op for whichever is unset."""
        if cls.cursor is not None:
            cls.cursor.close()
            cls.cursor = None
        if cls.connect is not None:
            cls.connect.close()
            cls.connect = None

    @classmethod
    def insert_data(cls, dz_tuple):
        """Insert one row into the ``qsbk`` table and commit.

        :param dz_tuple: sequence whose first five items are
            ``(name, age, content, vote, comments)``
        """
        # Bound parameters instead of string formatting: values containing
        # quotes no longer break the statement or inject SQL.
        sql = 'INSERT INTO qsbk(name,age,content,vote,comments) VALUES (?,?,?,?,?)'
        values = (dz_tuple[0], dz_tuple[1], dz_tuple[2], dz_tuple[3], dz_tuple[4])
        cls.cursor.execute(sql, values)
        cls.connect.commit()


# Runs when this module is imported: opens the database connection so the
# class-level connect/cursor attributes are set before insert_data is called.
DBManger.connect_db()

运行结果展示：