1.代码中需要引入的包
import re
from urllib import request
from fake_useragent import UserAgent
agent = UserAgent()
from tool import Tools
import xlwt
2.解析url
爬取-----精品贴
https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0
kw 贴吧名称
tab 帖子类型
pn 数据页码
https://tieba.baidu.com/p/5328438222?pn=2
/p/5381402933 帖子详情地址
pn 页码
import re
from urllib import request
from fake_useragent import UserAgent
agent = UserAgent()
from tool import Tools
import xlwt
2.解析url
爬取-----精品贴
https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0
kw 贴吧名称
tab 帖子类型
pn 数据页码
https://tieba.baidu.com/p/5328438222?pn=2
/p/5381402933 帖子详情地址
pn 页码
3. 代码出现编码格式错误
改正方法如下:
get_html函数中把self.html = response.read().decode('utf-8')改为:
self.html = response.read().decode('utf-8', 'ignore')
5.获取下一页链接位置的简单方式（在 parse_link 中用 find 定位“下一页”按钮所在的 HTML）:
index = self.html.find('class="next pagination-item')
print(index)
完整代码:
# -*- coding:utf-8 -*-
import re
from urllib import request
from fake_useragent import UserAgent
agent = UserAgent()
from tool import Tools
import xlwt
'''
爬取-----精品贴
https://tieba.baidu.com/f?kw=nba&tab=good&cid=&pn=0
kw 贴吧名称
tab 帖子类型
pn 数据页码
https://tieba.baidu.com/p/5328438222?pn=2
/p/5381402933 帖子详情地址
pn 页码
'''
class BDTBSpider(object):
    """Crawler for the "good" (featured) threads of a Baidu Tieba forum.

    Fetches the featured-thread list of one forum, follows every thread,
    and writes each post's author / content / meta data into an ``.xls``
    workbook named after the thread title.
    """

    def __init__(self):
        # Base host used to build absolute thread URLs.
        self.url = 'https://tieba.baidu.com'
        # Raw HTML of the most recently fetched page.
        self.html = ''
        # Random User-Agent to reduce the chance of being blocked.
        self.headers = {'User-Agent': agent.random}
        # Next spreadsheet row to write; row 0 holds the header row.
        self.count = 1

    def get_html(self, url):
        """Download *url* and store its decoded HTML in ``self.html``."""
        req = request.Request(url, headers=self.headers)
        response = request.urlopen(req)
        # 'ignore' skips bytes that are not valid UTF-8 instead of raising.
        self.html = response.read().decode('utf-8', 'ignore')

    def parse_link(self):
        """Extract (href, title) pairs from the thread-list page and crawl each thread."""
        pattern = re.compile('<div class="threadlist_title.*?<a rel="noreferrer".*?href="(.*?)".*?title="(.*?)"', re.S)
        res = re.findall(pattern, self.html)
        for info in res:
            print("正在爬取{},请稍后.....".format(info[1]))
            # Build the absolute thread-detail URL.
            url = self.url + info[0]
            print('帖子链接:{}'.format(url))
            self.get_html(url)
            # One workbook per thread, with a header row.
            workbook = xlwt.Workbook(encoding='utf-8')
            sheet = workbook.add_sheet('data')
            headers = ('用户昵称', '用户头衔', '用户等级', '发表内容', '客户端', '楼层', '发布日期')
            for col, title in enumerate(headers):
                sheet.write(0, col, title)
            # Restart row numbering for every thread's workbook.
            self.count = 1
            self.parse_detail(sheet)
            # Save under the thread title.
            workbook.save(info[1] + '.xls')
        # NOTE(review): following the forum-level "next page" link is
        # intentionally disabled — only the first list page is crawled.
        # The locator below is kept as a debugging aid.
        index = self.html.find('class="next pagination-item')
        print(index)

    def parse_detail(self, sheet):
        """Parse one thread-detail page and append each post as a row of *sheet*.

        Recurses through the thread's own "next page" links until exhausted.
        """
        pattern = re.compile('<li class="d_name".*?>(.*?)</li>.*?class="d_badge_title ">(.*?)</div>.*?class="d_badge_lv">(.*?)</div>.*?<cc>(.*?)</cc>.*?<div class="post-tail-wrap">(.*?)</div>', re.S)
        res = re.findall(pattern, self.html)
        for info in res:
            nickname = Tools.strip_char(info[0])
            content = Tools.strip_char(info[3])
            # msg is a (client, floor, date) tuple extracted from the post tail.
            msg = Tools.get_client_floor_date(info[4])
            print(msg)
            row = (nickname, info[1], info[2], content) + tuple(msg)
            for col, value in enumerate(row):
                sheet.write(self.count, col, value)
            self.count += 1
        # The next-page anchor sits just before the literal text '下一页'.
        index = self.html.find('下一页')
        if index != -1:
            next_html = self.html[index-40:index]
            match = re.search(re.compile('<a href="(.*?)"'), next_html)
            if match is None:
                # Anchor text present but no href nearby — stop instead of crashing.
                print('没有下一页')
                return
            next_href = match.group(1)
            print(next_href)
            # e.g. https://tieba.baidu.com/p/5328438222?pn=2
            url = self.url + next_href
            self.get_html(url)
            self.parse_detail(sheet)
        else:
            print('没有下一页')

    def start(self, name=None):
        """Entry point: crawl the featured threads of forum *name*.

        Falls back to the module-level ``name`` global when called without an
        argument, so the original ``spider.start()`` call style keeps working.
        """
        from urllib.parse import quote
        if name is None:
            name = globals()['name']
        # Percent-encode the forum name so non-ASCII names form a valid URL.
        self.get_html('https://tieba.baidu.com/f?kw=%s&tab=good&cid=&pn=0' % quote(name))
        self.parse_link()
if __name__ == '__main__':
    # Build the spider first, then ask which forum to crawl and start it.
    bdtb = BDTBSpider()
    name = input('请输入贴吧名称:')
    bdtb.start()
引入的工具类的完整代码:
# -*- coding:utf-8 -*-
import re
import sqlite3
class Tools(object):
    """Static text-cleaning helpers shared by the scrapers."""

    # Matches newlines, tabs, spaces and any HTML tag; compiled once
    # instead of on every strip_char call.
    _STRIP_PATTERN = re.compile('\n|\t| |<.*?>', re.S)

    @classmethod
    def strip_char(cls, string):
        """Remove whitespace control characters and HTML tags from *string*.

        :param string: raw HTML fragment to clean
        :return: the cleaned plain text
        """
        return re.sub(cls._STRIP_PATTERN, '', string)

    @classmethod
    def get_client_floor_date(cls, string):
        """Split a Tieba post-tail HTML fragment into its three fields.

        :param string: HTML containing the client / floor / post-date spans
        :return: a (client, floor, date) tuple of strings
        """
        if '来自' in string:
            # Mobile posts: the client name sits inside an <a> tag,
            # followed by the floor span and the date span.
            pattern = re.compile('<span class="tail-info".*?<a rel="noopener.*?>(.*?)</a>.*?class="tail-info">(.*?)</span.*?class="tail-info">(.*?)</span>')
            res = re.search(pattern, string)
            client = '来自' + res.group(1)
            floor = res.group(2)
            date = res.group(3)
        else:
            # Web posts carry no client span; only floor and date remain.
            pattern = re.compile('<span class="tail-info">(.*?)</span.*?class="tail-info">(.*?)</span>')
            res = re.search(pattern, string)
            client = '来自Web客户端'
            floor = res.group(1)
            date = res.group(2)
        return client, floor, date
# 数据库管理
class DBManager(object):
    """Minimal sqlite3 helper holding one shared connection and cursor."""

    # Shared class-level handles; populated by connect_db().
    connect = None
    cursor = None

    @classmethod
    def connect_db(cls):
        """Open (or create) qsbk.db and cache the connection and cursor."""
        cls.connect = sqlite3.connect('qsbk.db')
        cls.cursor = cls.connect.cursor()

    @classmethod
    def close_db(cls):
        """Release the cursor and close the connection."""
        cls.cursor.close()
        cls.connect.close()

    @classmethod
    def insert_data(cls, dz_tuple):
        """Insert one (name, age, content, vote, comments) row and commit.

        Uses a parameterized query: the previous %-formatted SQL broke on
        quote characters in scraped text and was open to SQL injection.

        :param dz_tuple: sequence whose first five items are the row values
        """
        sql = "insert into qsbk(name,age,content,vote,comments) values(?, ?, ?, ?, ?)"
        cls.cursor.execute(sql, (dz_tuple[0], dz_tuple[1], dz_tuple[2], dz_tuple[3], dz_tuple[4]))
        cls.connect.commit()
if __name__ == '__main__':
    # Smoke test: verify that the sqlite database can be opened.
    DBManager.connect_db()
运行结果: