项目分析
我们发现请求到的每页html的核心内容是被注释掉的,所以我们无法通过BeautifulSoup或者lxml的CSS选择器获取,这里我们采用正则表达式进行匹配。通过分析我们发现每个帖子的链接都带有 rel="noopener" 和 class="j_th_tit ",那这样就好办了!而在每个帖子的详情页中,我们发现每个人发的图片都有一个叫BDE_Image的class属性,并且html的核心内容没有被注释,所以我们可以直接使用BeautifulSoup或者XPath获取。
链接分析
# 第一页
https://tieba.baidu.com/f?kw=%E5%A5%B3%E7%8E%8B%E7%9A%84%E6%97%A5%E5%B8%B8&ie=utf-8&pn=0
# 第二页
https://tieba.baidu.com/f?kw=%E5%A5%B3%E7%8E%8B%E7%9A%84%E6%97%A5%E5%B8%B8&ie=utf-8&pn=50
我们发现kw就是我们搜索的贴吧名称,pn是分页的偏移量(每页第一条帖子的序号),并且每一页的增量是50。通过这个分析,我们就可以拼接链接进行请求了。
难点分析
如何获取每个帖子的详情链接
如何获取帖子详情页下面的图片
解决办法
获取每个帖子的详情链接
# 正则匹配所有的帖子的详情链接
pattern = re.compile(r'<a rel="noopener" href="(.*?)" title="(.*?)" target="_blank" class="j_th_tit ">.*?</a>',
re.S)
获取帖子详情页的图片
# 查找每个帖子的图片
soup = BeautifulSoup(detail_html.text, 'lxml')
imgs = soup.find_all(class_="BDE_Image")
获取每页的html内容
def get_page_info(url, kw, pn):
    """
    Fetch the HTML of one forum list page.

    The session cookie is no longer hard-coded: it is read from the optional
    ``TIEBA_COOKIE`` environment variable, so no account credentials live in
    the source. When the variable is unset the request is sent without a
    cookie.

    :param url: base URL of the list page
    :param kw: name of the forum (tieba) to search for
    :param pn: paging offset of the page (0, 50, 100, ...)
    :return: the page HTML as text, or None if the request failed or the
             status code was not 200
    """
    import os  # local import keeps this snippet self-contained

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
        'Host': 'tieba.baidu.com',
        'Referer': 'https://tieba.baidu.com/f?ie=utf-8&kw=%E8%83%96%E5%A5%B3%E5%AD%A9%E5%90%A7&fr=search',
    }
    # A header value must be a single line, so drop any line breaks that
    # sneak in when the cookie is pasted from the browser.
    cookie = ''.join(os.environ.get('TIEBA_COOKIE', '').splitlines()).strip()
    if cookie:
        headers['Cookie'] = cookie

    params = {
        'kw': kw,
        'ie': 'utf-8',  # was 'utf - 8' (with spaces), which is not a valid encoding name
        'pn': pn,
    }
    try:
        # A timeout prevents the crawler from hanging forever on a stalled connection.
        res = requests.get(url=url, headers=headers, params=params, timeout=10)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    if res.status_code == 200:
        return res.text
    return None
解析每页的html并且获取每个帖子的图片
def parse_html(html, kw, pn):
    """
    Parse one list page and download every image of every post on it.

    Images are written to ``imgs/<kw>/`` relative to the current working
    directory; the folder is created on demand. The session cookie for the
    detail-page requests is read from the optional ``TIEBA_COOKIE``
    environment variable instead of being hard-coded.

    :param html: HTML text of the list page (None or empty is ignored)
    :param kw: name of the forum (tieba) being crawled
    :param pn: paging offset of the page (0, 50, 100, ...)
    :return: None
    """
    if not html:
        # get_page_info returns None on failure; there is nothing to parse.
        return

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
        'Host': 'tieba.baidu.com',
        'Referer': 'https://tieba.baidu.com/f?ie=utf-8&kw=%E8%83%96%E5%A5%B3%E5%AD%A9%E5%90%A7&fr=search',
    }
    # A header value must be a single line, so drop any pasted line breaks.
    cookie = ''.join(os.environ.get('TIEBA_COOKIE', '').splitlines()).strip()
    if cookie:
        headers['Cookie'] = cookie

    # The list page body is wrapped in HTML comments, so the post links are
    # extracted with a regular expression instead of an HTML parser.
    pattern = re.compile(r'<a rel="noopener" href="(.*?)" title="(.*?)" target="_blank" class="j_th_tit ">.*?</a>',
                         re.S)
    # The first match is skipped, as in the original code
    # (presumably a pinned thread - TODO confirm).
    titles = pattern.findall(html)[1:]
    prifix_url = 'https://tieba.baidu.com'

    # Create the same relative folder the files are written to; the old code
    # checked/created a hard-coded absolute path that exists on one machine only.
    save_dir = f'imgs/{kw}'
    os.makedirs(save_dir, exist_ok=True)

    for href, raw_title in titles:
        title_url = prifix_url + href
        # Post title, sanitised so it can be used as a file name.
        post_name = deal_special_words(raw_title)
        try:
            detail_html = requests.get(url=title_url, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            # One unreachable post must not abort the whole page.
            print(e)
            continue

        soup = BeautifulSoup(detail_html.text, 'lxml')
        imgs = soup.find_all(class_="BDE_Image")
        index = 0  # image counter, restarts for every post
        for img in imgs:
            img_url = img.get('src')
            if not img_url:
                continue
            try:
                img_res = requests.get(url=img_url, timeout=10)
            except requests.exceptions.RequestException as e:
                print(e)
                continue
            if img_res.status_code != 200:
                # Do not save an error page under a .jpg name.
                continue
            path = post_name + str(index) + '.' + 'jpg'
            with open(f'{save_dir}/{path}', mode='wb') as f:
                f.write(img_res.content)
            index += 1
            print(f'{kw}第{pn // 50 + 1}页 {path} 下载完毕')
处理特殊字符
def deal_special_words(word):
    """
    Remove characters that are unsafe or unwanted in a file name.

    Every character listed in ``special_chars`` and every newline is deleted,
    then leading/trailing whitespace is trimmed.

    :param word: the text to clean
    :return: the cleaned text
    """
    # Raw string: the previous '\/' relied on an invalid escape sequence.
    special_chars = r'\/:*?"<>|.,;,。;#??"‘’!@“#¥%……&*()!@#$%^&*()'
    # One translate pass replaces the per-character replace() loop.
    removal_table = str.maketrans('', '', special_chars + '\n')
    # str.strip() returns a new string; the old code discarded that result.
    return word.translate(removal_table).strip()
完整代码示例
import os
import requests
import re
from bs4 import BeautifulSoup
import math
def get_page_info(url, kw, pn):
    """
    Fetch the HTML of one forum list page.

    The session cookie is no longer hard-coded: it is read from the optional
    ``TIEBA_COOKIE`` environment variable, so no account credentials live in
    the source. When the variable is unset the request is sent without a
    cookie.

    :param url: base URL of the list page
    :param kw: name of the forum (tieba) to search for
    :param pn: paging offset of the page (0, 50, 100, ...)
    :return: the page HTML as text, or None if the request failed or the
             status code was not 200
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
        'Host': 'tieba.baidu.com',
        'Referer': 'https://tieba.baidu.com/f?ie=utf-8&kw=%E8%83%96%E5%A5%B3%E5%AD%A9%E5%90%A7&fr=search',
    }
    # A header value must be a single line, so drop any line breaks that
    # sneak in when the cookie is pasted from the browser.
    cookie = ''.join(os.environ.get('TIEBA_COOKIE', '').splitlines()).strip()
    if cookie:
        headers['Cookie'] = cookie

    params = {
        'kw': kw,
        'ie': 'utf-8',  # was 'utf - 8' (with spaces), which is not a valid encoding name
        'pn': pn,
    }
    try:
        # A timeout prevents the crawler from hanging forever on a stalled connection.
        res = requests.get(url=url, headers=headers, params=params, timeout=10)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    if res.status_code == 200:
        return res.text
    return None
def deal_special_words(word):
    """
    Remove characters that are unsafe or unwanted in a file name.

    Every character listed in ``special_chars`` and every newline is deleted,
    then leading/trailing whitespace is trimmed.

    :param word: the text to clean
    :return: the cleaned text
    """
    # Raw string: the previous '\/' relied on an invalid escape sequence.
    special_chars = r'\/:*?"<>|.,;,。;#??"‘’!@“#¥%……&*()!@#$%^&*()'
    # One translate pass replaces the per-character replace() loop.
    removal_table = str.maketrans('', '', special_chars + '\n')
    # str.strip() returns a new string; the old code discarded that result.
    return word.translate(removal_table).strip()
def parse_html(html, kw, pn):
    """
    Parse one list page and download every image of every post on it.

    Images are written to ``imgs/<kw>/`` relative to the current working
    directory; the folder is created on demand. The session cookie for the
    detail-page requests is read from the optional ``TIEBA_COOKIE``
    environment variable instead of being hard-coded.

    :param html: HTML text of the list page (None or empty is ignored)
    :param kw: name of the forum (tieba) being crawled
    :param pn: paging offset of the page (0, 50, 100, ...)
    :return: None
    """
    if not html:
        # get_page_info returns None on failure; there is nothing to parse.
        return

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
        'Host': 'tieba.baidu.com',
        'Referer': 'https://tieba.baidu.com/f?ie=utf-8&kw=%E8%83%96%E5%A5%B3%E5%AD%A9%E5%90%A7&fr=search',
    }
    # A header value must be a single line, so drop any pasted line breaks.
    cookie = ''.join(os.environ.get('TIEBA_COOKIE', '').splitlines()).strip()
    if cookie:
        headers['Cookie'] = cookie

    # The list page body is wrapped in HTML comments, so the post links are
    # extracted with a regular expression instead of an HTML parser.
    pattern = re.compile(r'<a rel="noopener" href="(.*?)" title="(.*?)" target="_blank" class="j_th_tit ">.*?</a>',
                         re.S)
    # The first match is skipped, as in the original code
    # (presumably a pinned thread - TODO confirm).
    titles = pattern.findall(html)[1:]
    prifix_url = 'https://tieba.baidu.com'

    # Create the same relative folder the files are written to; the old code
    # checked/created a hard-coded absolute path that exists on one machine only.
    save_dir = f'imgs/{kw}'
    os.makedirs(save_dir, exist_ok=True)

    for href, raw_title in titles:
        title_url = prifix_url + href
        # Post title, sanitised so it can be used as a file name.
        post_name = deal_special_words(raw_title)
        try:
            detail_html = requests.get(url=title_url, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            # One unreachable post must not abort the whole page.
            print(e)
            continue

        soup = BeautifulSoup(detail_html.text, 'lxml')
        imgs = soup.find_all(class_="BDE_Image")
        index = 0  # image counter, restarts for every post
        for img in imgs:
            img_url = img.get('src')
            if not img_url:
                continue
            try:
                img_res = requests.get(url=img_url, timeout=10)
            except requests.exceptions.RequestException as e:
                print(e)
                continue
            if img_res.status_code != 200:
                # Do not save an error page under a .jpg name.
                continue
            path = post_name + str(index) + '.' + 'jpg'
            with open(f'{save_dir}/{path}', mode='wb') as f:
                f.write(img_res.content)
            index += 1
            print(f'{kw}第{pn // 50 + 1}页 {path} 下载完毕')
def main(kw, pn):
    """
    Crawl a single list page: fetch it, then download the images of its posts.

    :param kw: name of the forum (tieba) to crawl
    :param pn: paging offset of the page (0, 50, 100, ...)
    :return: None
    """
    url = 'https://tieba.baidu.com/f'
    html = get_page_info(url=url, kw=kw, pn=pn)
    if html is None:
        # get_page_info returns None on a request error or a non-200 status;
        # skip this page instead of handing None to the parser.
        print(f'{kw}第{pn // 50 + 1}页 获取失败,已跳过')
        return
    parse_html(html=html, kw=kw, pn=pn)
if __name__ == '__main__':
    # Ask which forum to crawl and how many list pages to walk through.
    kw = input('请输入您想搜索的贴吧:')
    pns = int(input('您想爬取几页内容:'))
    # Pages are addressed by offset: page k starts at pn = k * 50.
    for pn in range(0, pns * 50, 50):
        main(kw, pn)
效果展示