利用正则爬取百度贴吧；反爬虫措施

最新推荐文章于 2021-11-16 00:00:00 发布

一只迟到的程序猿狗狗

最新推荐文章于 2021-11-16 00:00:00 发布

阅读量2.4k

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/weixin_41580211/article/details/79089430

版权

例子
爬取 NBA 百度贴吧：

# coding: utf-8
# 引入请求包
import requests
# 引入正则表达式包
import re

# 1. URL of the thread to scrape (its first page).
url = 'https://tieba.baidu.com/p/5082744218'
# 2. Request the page and keep the response.
#    A timeout is passed because requests otherwise waits indefinitely on a
#    server that never answers.
response = requests.get(url, timeout=10)
# Stop here on an HTTP error status (4xx/5xx) rather than handing an error
# page to the regex searches further down.
response.raise_for_status()
# Raw response body; the regular expressions below are run directly on it.
html = response.content

# 3. Extract the page title and open the output file named after it.
# 3.1 Pattern capturing the text between <title> and </title>.
title_pat = re.compile(r'<title>(.*?)</title>')
# 3.2 Look for the first occurrence in the downloaded page.
rs = re.search(title_pat, html)
if rs is None:
    # re.search returns None when nothing matches; without this check the
    # failure would surface as an AttributeError on rs.group below.
    raise ValueError('could not find a <title> element in the page')
# 3.3 Build the file name from the title. Path separators are replaced so a
#     title containing one cannot make open() target a different directory.
file_name = rs.group(1).replace('/', '_').replace('\\', '_') + '.txt'
# The name is decoded from UTF-8 before being handed to open().
# NOTE(review): file_handle is not closed anywhere in the visible part of the
# script - confirm it is closed once the scraping loop has finished.
file_handle = open(file_name.decode('utf-8'),'w')

# 4. Determine how many pages the thread has.
# 4.1 Pattern capturing the text inside the <span class="red"> element that
#     follows the character '共' in the page.
pages_pat = re.compile(r'共<span class="red">(.*?)</span>')
# 4.2 Look for the first occurrence in the downloaded page.
rs = re.search(pages_pat, html)
if rs is None:
    # re.search returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on rs.group below.
    raise ValueError('could not find the total page count in the page')
# 4.3 Convert the captured text to an integer (int() raises ValueError if the
#     captured text is not a number).
total_page = int(rs.group(1))
# Parenthesised form: prints the same text under the Python 2 print statement.
print('该帖子共有%s页！' % total_page)
# for循环遍历总页码次
for x in range(1, total_page+1):
    print '正在爬取第%s页，请稍后.....'%x
    # 5.根据x的值，拼接完整的url地址
    url = 'https://tieba.baidu.com/p/5082744218?pn=%s'%x
    # 6.发起请求，获取该页的html源代码
    response = requests.get(url)
    html = response.content
    # 7.准备提取数据的正则,使用re.S 可以匹配到任何字符
    pattern = re.compile(r'<li class="d_name".*?<a data-.*?>(.*?)</a>.*?<div class="d_badge_title.*?>(.*?)</div>.*?d_badge_lv">(.*?)</div>.*?<cc>(.*?)</cc>.*?<span class="tail-info.*?<a.*?>(.*?)</a>.*?<spa.*?>(.*?)</spa.*?info">(.*?)</span>', re.S)
    # 8.使用findall()查找所有符合正则的字符
    rs = re.findall(pattern, html)

    # for 循环所有数据
    for detail in rs:
        # print detail
        # 1.取出用户名
        name =  detail[0]
        # 1.1 对用户名信息进行处理
        replace_img = re.compile(r'<img.*?>')
        # 1.2 替换为-
        name = re.sub(replace_img, '-', name)
        # 2.取出头衔
        rank = detail[1]
        # 3.取出等级
        level = detail[2]
        # 4.楼层内容
        content = detail[3]
        # 4.1 替换<br>标签为\n
        content = content.replace('<br>', '\n')
        # 4.2 剔除所有的标签
        strip_ele = re.compile(r'<.*?>')
        content = re.sub(strip_ele, '', content)
        # 4.3 去除空格
        content = content.strip()
        # print content
        # 5.取出客户端
        from_device = '来自' + detail[4]
        # 如果没有客户端，就设置为来自电脑端
        if 'img' in detail[4]:
            from_device = '来自PC电脑端'
        # 6.取出楼层
        floor_num = detail[5]
        if 'a' in floor_num:
            floor_num = <

最低0.47元/天解锁文章