# 使用 requests 库爬取百度图片

## 安装

    pip install requests

## 爬虫流程

### 抓取首页图片(静态页面)
import re
import os
import requests
#1.目标数据是图片
#2.请求流程 1.先访问page页获取图片url2.对url发起请求,获取图片数据,3.存储
#一张图片
# url="https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy/it/u=2325464303,1389978966&fm=26&gp=0.jpg"
#
# res=requests.get(url)
# #res 包含 猫咪图片数据
# print(res.content)#二进制数据
# with open("猫咪.jpg","wb") as f:
# f.write(res.content)
#图片规律
#"thumbURL":"https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy/it/u=2535535235,1109729418&fm=26&gp=0.jpg"
#正则
#访问page页面 网页源代码 ---匹配 urls
url="https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&word=%E7%8C%AB%E5%92%AA"
# headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"}
headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"Cookie": "winWH=%5E6_1366x657; BDIMGISLOGIN=0; BDqhfp=%E7%8C%AB%E5%92%AA%26%260-10-1undefined%26%260%26%261; BAIDUID=31BE3CEB3DDB7ADCA3C987A69863BD4A:FG=1; PSTM=1585467854; BIDUPSID=6B73B5EB3CF18DDDF94A54DB137A0C70; H_WISE_SIDS=139912_143435_142019_144427_141875_141748_143789_144420_142780_144483_136862_144489_131246_141261_144741_138883_141942_127969_140066_143999_140593_143057_141808_140351_141008_143470_144727_143923_144376_131423_144289_142207_143704_143519_107318_138595_139910_144306_143478_142427_140368_138662_142505_141910_144238_142113_143859_136751_140843_110085; BDUSS=UR4a0I0UTR-QmpvflZJdlB4bnduUUR3UGx-ekhlblloSUpsSzZHT3Y1VUdOVTlmRVFBQUFBJCQAAAAAAAAAAAEAAADGU~YYtv63rMrC0rUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaoJ18GqCdfO; BDUSS_BFESS=UR4a0I0UTR-QmpvflZJdlB4bnduUUR3UGx-ekhlblloSUpsSzZHT3Y1VUdOVTlmRVFBQUFBJCQAAAAAAAAAAAEAAADGU~YYtv63rMrC0rUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaoJ18GqCdfO; __yjs_duid=1_aebcd34bae6eb31144a93ee9ce01016e1611491945901; indexPageSugList=%5B%22%E7%99%BE%E5%BA%A6%E5%9B%BE%E7%89%87%22%2C%22%E7%8C%AB%E5%92%AA%22%2C%22%E6%88%91%E8%A6%81%E5%AD%A6%E4%B9%A0%22%2C%22%E5%9B%BE%E7%89%87%22%2C%22%E8%AE%BE%E8%AE%A1%22%2C%22tornado%22%2C%22%E7%8B%97%E5%AD%90%22%2C%22%E4%BA%91%E6%B2%83%E5%AE%A2%E4%B8%8A%E7%9A%84%E6%8A%95%E6%A0%87%E6%98%AF%E6%80%8E%E4%B9%88%E5%9B%9E%E4%BA%8B%2C%E6%98%AF%E9%9C%80%E8%A6%81%E5%86%99%E5%A5%BD%E4%BB%A3%E7%A0%81%E5%86%8D%E6%8A%95%E6%A0%87%E4%B9%88%22%2C%22%E7%BE%8E%E5%9B%BE%E7%A7%80%E7%A7%80%E5%8E%BB%E9%99%A4%E5%9B%BE%E7%89%87%E6%B0%B4%E5%8D%B0%22%5D; BAIDUID_BFESS=59B0BC4A359EF2EC96697A13EDBA3229:FG=1; H_PS_PSSID=33256_33273_33595_33392_33460_26350_22157; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; PSINO=7; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BA_HECTOR=akahag2ga00g2h2hrv1g39gds0r; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; firstShowTip=1; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; 
userFrom=null; ab_sr=1.0.0_OTM2Nzg1MDY3YzUxYmJlZDNjZTI2ZjY0Yjc0MjQ4NTIwNzg5ODc1MjEwNjBhNTdjOGY1MmJjNWU5NzM3YTEzMmYwNGVlODA1MTkzYmRiZDAwNmM4YTgyMGNmYjQ0NjVl; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm",
"Host": "image.baidu.com",
"sec-ch-ua": '"e";v="88", ";Not A Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",}
res=requests.get(url,headers=headers)
#添加请求头
# print(res.request.headers)
# print(res.text)#字符串数据
#提取数据 猫咪图片url
urls=re.findall('"thumbURL":"(.*?)"', res.text)
print(urls)
if not os.path.exists("猫咪"):
os.mkdir("猫咪")
#url发起请求,获取图片数据
for index,img_url in enumerate(urls):
print(index)
if "\\" in img_url:
img_url=img_url.replace("\\","")
res=requests.get(img_url)
#res 包含 猫咪图片数据
# print(res.content)#二进制数据
filename="猫咪"+"/"+"cat"+str(index)+".jpg"
with open(filename,"wb") as f:
f.write(res.content)
### 抓取多页图片(动态页面)

要点是找出各 page 页 URL 之间的规律(只有 `pn` 偏移量等少数参数在变化)。
import requests
import re
import os
#page 页的规律
page2="https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7495559878915684143&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%8C%AB%E5%92%AA&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=0&ic=0&hd=0&latest=0©right=0&word=%E7%8C%AB%E5%92%AA&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=90&rn=30&gsm=5a&1614171744194="
page3="https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7495559878915684143&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%8C%AB%E5%92%AA&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=0&ic=0&hd=0&latest=0©right=0&word=%E7%8C%AB%E5%92%AA&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=120&rn=30&gsm=78&1614171746052="
page4="https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7495559878915684143&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%8C%AB%E5%92%AA&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=0&ic=0&hd=0&latest=0©right=0&word=%E7%8C%AB%E5%92%AA&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=150&rn=30&gsm=96&1614171872254="
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"}
page_url="https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7495559878915684143&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%8C%AB%E5%92%AA&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=0&ic=0&hd=0&latest=0©right=0&word=%E7%8C%AB%E5%92%AA&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=90&rn=30&gsm=5a&1614171744194="
# res=requests.get(page_url,headers=headers)
#
# urls=re.findall('"thumbURL":"(.*?)"', res.text)
# print(urls)
# print(len(urls))
def get_img(img_urls,dirname):
"""
功能:获取百度图片,存储到文件夹
参数:
img_urls:图片url列表
dirname:图片存储文件夹
"""
# 对图片url发起请求
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
"Referer": "https://image.baidu.com"}
for index, img_url in enumerate(img_urls):
print(index)
if "\\" in img_url:
img_url = img_url.replace("\\", "")
res = requests.get(img_url, headers=headers)
# res 包含 猫咪图片数据
# print(res.content)#二进制数据
filename = dirname + "/" + "cat" + str(index) + ".jpg"
with open(filename, "wb") as f:
f.write(res.content)
#获取多页 图片
#1构造page页
for i in range(1,5):
page_url = "https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7495559878915684143&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%8C%AB%E5%92%AA&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=0&ic=0&hd=0&latest=0©right=0&word=%E7%8C%AB%E5%92%AA&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&" \
"pn={}&rn=30&gsm=5a&1614171744194="
num=str(i*30)
page_url=page_url.format(num)
res = requests.get(page_url, headers=headers)
img_urls=re.findall('"thumbURL":"(.*?)"', res.text)#提取图片url
dirname="猫咪"+str(i)
if not os.path.exists(dirname):
os.mkdir(dirname)
#对图片url发起请求
get_img(img_urls,dirname)
总结:

下载一页:
- 1. 确定 page_url
- 2. 从 page_url 的源代码里获取图片 url 列表
- 3. 对图片 url 列表中的每个 url 发起请求,获取图片数据
- 4. 存储

下载多页:
- 1. 按规律构造各个 page 页的 url
- 2. 从 page_url 的源代码里获取图片 url 列表
- 3. 对图片 url 列表中的每个 url 发起请求,获取图片数据
- 4. 存储