爬取百度贴吧的指定贴吧的所有图片

项目分析

 

我们发现请求到的每页html的核心内容是被注释掉的,所以我们无法通过BeautifulSoup或者lxml的选择器直接获取,这里我们采用正则表达式进行匹配。通过分析我们发现每个帖子的链接都带有 rel="noopener" 和 class="j_th_tit ",那这样就好办了!而在每个帖子的详情页中,我们发现每个人发的图片都有一个叫BDE_Image的class属性,并且html的核心内容没有被注释,所以我们可以直接使用BeautifulSoup或者xpath获取。

链接分析
# 第一页
https://tieba.baidu.com/f?kw=%E5%A5%B3%E7%8E%8B%E7%9A%84%E6%97%A5%E5%B8%B8&ie=utf-8&pn=0
# 第二页
https://tieba.baidu.com/f?kw=%E5%A5%B3%E7%8E%8B%E7%9A%84%E6%97%A5%E5%B8%B8&ie=utf-8&pn=50

 我们发现kw就是我们搜索的贴吧名称,pn是分页的偏移量,并且每翻一页增量是50(第一页为0,第二页为50),通过这个分析,我们就可以拼接链接进行请求了

难点分析

如何获取每个帖子的详情链接

如何获取帖子详情页下面的图片

解决办法

获取每个帖子的详情链接
# Regex matching every thread anchor on the list page; group 1 is the href, group 2 the title
pattern = re.compile(r'<a rel="noopener" href="(.*?)" title="(.*?)" target="_blank" class="j_th_tit ">.*?</a>',
                         re.S)
获取帖子详情页的图片
# Find the images of a thread: every element carrying the BDE_Image class
        soup = BeautifulSoup(detail_html.text, 'lxml')
        imgs = soup.find_all(class_="BDE_Image")

获取每页的html内容

def get_page_info(url, kw, pn):
    """
    Fetch the HTML of one thread-list page of a Tieba forum.

    The login cookie is no longer embedded in the source: it is a personal
    session credential and must not be published. If the environment
    variable ``TIEBA_COOKIE`` is set, its value is sent as the ``Cookie``
    header; otherwise the request is sent without a cookie.

    :param url: URL of the list endpoint; kw/pn are added as query parameters
    :param kw: name of the forum (tieba) to open
    :param pn: paging offset sent as the ``pn`` query parameter
    :return: the response body as text, or None when the request fails or
             the status code is not 200
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
        'Host': 'tieba.baidu.com',
        'Referer': 'https://tieba.baidu.com/f?ie=utf-8&kw=%E8%83%96%E5%A5%B3%E5%AD%A9%E5%90%A7&fr=search'
    }
    # Collapse all whitespace runs: a header value must not contain line
    # breaks, which easily sneak in when a cookie is pasted from a browser.
    cookie = ' '.join(os.environ.get('TIEBA_COOKIE', '').split())
    if cookie:
        headers['Cookie'] = cookie
    params = {
        'kw': kw,
        'ie': 'utf-8',  # was 'utf - 8', which is not a valid encoding name
        'pn': pn
    }
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        res = requests.get(url=url, headers=headers, params=params, timeout=10)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    if res.status_code == 200:
        return res.text
    return None

解析每页的html并且获取每个帖子的图片

def parse_html(html, kw, pn):
    """
    Parse one thread-list page and download the images of every thread on it.

    Images are written to ``imgs/<kw>/`` relative to the current working
    directory; the directory is created on demand. A thread or image whose
    request fails is reported and skipped instead of aborting the crawl.

    The login cookie is not embedded in the source (it is a personal session
    credential). If the environment variable ``TIEBA_COOKIE`` is set, its
    value is sent as the ``Cookie`` header of the thread requests.

    :param html: HTML text of the list page; None or empty means nothing to do
    :param kw: forum name, also used as the sub-directory name under ``imgs``
    :param pn: paging offset of this page (only used for the progress output)
    :return: None
    """
    if not html:
        return
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
        'Host': 'tieba.baidu.com',
        'Referer': 'https://tieba.baidu.com/f?ie=utf-8&kw=%E8%83%96%E5%A5%B3%E5%AD%A9%E5%90%A7&fr=search'
    }
    # Collapse all whitespace runs: a header value must not contain line breaks.
    cookie = ' '.join(os.environ.get('TIEBA_COOKIE', '').split())
    if cookie:
        headers['Cookie'] = cookie
    # Regex matching every thread anchor: group 1 is the href, group 2 the title.
    pattern = re.compile(r'<a rel="noopener" href="(.*?)" title="(.*?)" target="_blank" class="j_th_tit ">.*?</a>',
                         re.S)
    # The first match is deliberately skipped, as before
    # (presumably a pinned thread -- TODO confirm).
    titles = pattern.findall(html)[1:]
    save_dir = os.path.join('imgs', kw)
    page_no = pn // 50 + 1  # the offset grows by 50 per page
    prefix_url = 'https://tieba.baidu.com'
    for href, raw_title in titles:
        # Sanitised thread title, used as the file-name stem.
        post_name = deal_special_words(raw_title)
        try:
            detail_html = requests.get(url=prefix_url + href, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        soup = BeautifulSoup(detail_html.text, 'lxml')
        index = 0  # per-thread counter appended to the file name
        for img in soup.find_all(class_="BDE_Image"):
            img_url = img.get('src')
            if not img_url:
                continue
            try:
                img_res = requests.get(url=img_url, timeout=10)
            except requests.exceptions.RequestException as e:
                print(e)
                continue
            if img_res.status_code != 200:
                # Do not save an error page under a .jpg name.
                continue
            path = f'{post_name}{index}.jpg'
            # Create the same directory that is written to; the old code
            # checked an unrelated absolute path and could not create parents.
            os.makedirs(save_dir, exist_ok=True)
            with open(os.path.join(save_dir, path), mode='wb') as f:
                f.write(img_res.content)
            index += 1
            print(f'{kw}第{page_no}页 {path} 下载完毕')

处理特殊字符

# Characters removed from a title (the distinct characters of the original
# replacement list) plus the newline, compiled once into a translation table.
_SPECIAL_CHARS = '\\/:*?"<>|.,;。#‘’!@“¥%…&()$^'
_SPECIAL_CHARS_TABLE = str.maketrans('', '', _SPECIAL_CHARS + '\n')


def deal_special_words(word):
    """
    Remove special characters from a piece of text.

    Every character listed in ``_SPECIAL_CHARS`` and every newline is
    deleted, then leading and trailing whitespace is stripped. The previous
    version called ``word.strip()`` but discarded the result, so surrounding
    whitespace was never actually removed.

    :param word: the text to clean
    :return: the cleaned text
    """
    return word.translate(_SPECIAL_CHARS_TABLE).strip()

完整代码示例

import os

import requests
import re

from bs4 import BeautifulSoup
import math


def get_page_info(url, kw, pn):
    """
    Fetch the HTML of one thread-list page of a Tieba forum.

    The login cookie is no longer embedded in the source: it is a personal
    session credential and must not be published. If the environment
    variable ``TIEBA_COOKIE`` is set, its value is sent as the ``Cookie``
    header; otherwise the request is sent without a cookie.

    :param url: URL of the list endpoint; kw/pn are added as query parameters
    :param kw: name of the forum (tieba) to open
    :param pn: paging offset sent as the ``pn`` query parameter
    :return: the response body as text, or None when the request fails or
             the status code is not 200
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
        'Host': 'tieba.baidu.com',
        'Referer': 'https://tieba.baidu.com/f?ie=utf-8&kw=%E8%83%96%E5%A5%B3%E5%AD%A9%E5%90%A7&fr=search'
    }
    # Collapse all whitespace runs: a header value must not contain line
    # breaks, which easily sneak in when a cookie is pasted from a browser.
    cookie = ' '.join(os.environ.get('TIEBA_COOKIE', '').split())
    if cookie:
        headers['Cookie'] = cookie
    params = {
        'kw': kw,
        'ie': 'utf-8',  # was 'utf - 8', which is not a valid encoding name
        'pn': pn
    }
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        res = requests.get(url=url, headers=headers, params=params, timeout=10)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    if res.status_code == 200:
        return res.text
    return None


# Characters removed from a title (the distinct characters of the original
# replacement list) plus the newline, compiled once into a translation table.
_SPECIAL_CHARS = '\\/:*?"<>|.,;。#‘’!@“¥%…&()$^'
_SPECIAL_CHARS_TABLE = str.maketrans('', '', _SPECIAL_CHARS + '\n')


def deal_special_words(word):
    """
    Remove special characters from a piece of text.

    Every character listed in ``_SPECIAL_CHARS`` and every newline is
    deleted, then leading and trailing whitespace is stripped. The previous
    version called ``word.strip()`` but discarded the result, so surrounding
    whitespace was never actually removed.

    :param word: the text to clean
    :return: the cleaned text
    """
    return word.translate(_SPECIAL_CHARS_TABLE).strip()


def parse_html(html, kw, pn):
    """
    Parse one thread-list page and download the images of every thread on it.

    Images are written to ``imgs/<kw>/`` relative to the current working
    directory; the directory is created on demand. A thread or image whose
    request fails is reported and skipped instead of aborting the crawl.

    The login cookie is not embedded in the source (it is a personal session
    credential). If the environment variable ``TIEBA_COOKIE`` is set, its
    value is sent as the ``Cookie`` header of the thread requests.

    :param html: HTML text of the list page; None or empty means nothing to do
    :param kw: forum name, also used as the sub-directory name under ``imgs``
    :param pn: paging offset of this page (only used for the progress output)
    :return: None
    """
    if not html:
        return
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
        'Host': 'tieba.baidu.com',
        'Referer': 'https://tieba.baidu.com/f?ie=utf-8&kw=%E8%83%96%E5%A5%B3%E5%AD%A9%E5%90%A7&fr=search'
    }
    # Collapse all whitespace runs: a header value must not contain line breaks.
    cookie = ' '.join(os.environ.get('TIEBA_COOKIE', '').split())
    if cookie:
        headers['Cookie'] = cookie
    # Regex matching every thread anchor: group 1 is the href, group 2 the title.
    pattern = re.compile(r'<a rel="noopener" href="(.*?)" title="(.*?)" target="_blank" class="j_th_tit ">.*?</a>',
                         re.S)
    # The first match is deliberately skipped, as before
    # (presumably a pinned thread -- TODO confirm).
    titles = pattern.findall(html)[1:]
    save_dir = os.path.join('imgs', kw)
    page_no = pn // 50 + 1  # the offset grows by 50 per page
    prefix_url = 'https://tieba.baidu.com'
    for href, raw_title in titles:
        # Sanitised thread title, used as the file-name stem.
        post_name = deal_special_words(raw_title)
        try:
            detail_html = requests.get(url=prefix_url + href, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        soup = BeautifulSoup(detail_html.text, 'lxml')
        index = 0  # per-thread counter appended to the file name
        for img in soup.find_all(class_="BDE_Image"):
            img_url = img.get('src')
            if not img_url:
                continue
            try:
                img_res = requests.get(url=img_url, timeout=10)
            except requests.exceptions.RequestException as e:
                print(e)
                continue
            if img_res.status_code != 200:
                # Do not save an error page under a .jpg name.
                continue
            path = f'{post_name}{index}.jpg'
            # Create the same directory that is written to; the old code
            # checked an unrelated absolute path and could not create parents.
            os.makedirs(save_dir, exist_ok=True)
            with open(os.path.join(save_dir, path), mode='wb') as f:
                f.write(img_res.content)
            index += 1
            print(f'{kw}第{page_no}页 {path} 下载完毕')


def main(kw, pn):
    """
    Crawl one list page of a forum: fetch it, then download its images.

    :param kw: name of the forum (tieba) to crawl
    :param pn: paging offset of the page to crawl
    :return: None
    """
    url = 'https://tieba.baidu.com/f'
    html = get_page_info(url=url, kw=kw, pn=pn)
    if html is None:
        # get_page_info returns None for a failed request or a non-200
        # status; there is nothing to parse in that case.
        print(f'{kw} pn={pn} 页面获取失败,已跳过')
        return
    parse_html(html=html, kw=kw, pn=pn)


if __name__ == '__main__':
    # Ask for the forum name and the number of pages, then crawl the pages
    # one after another; the paging offset advances by 50 per page.
    kw = input('请输入您想搜索的贴吧:')
    pns = int(input('您想爬取几页内容:'))
    for pn in (page * 50 for page in range(pns)):
        main(kw, pn)

效果展示

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值