一、准备工作
用python来实现对图片网站的爬取并保存,以大熊猫图片为例,搜索可得到下图所示
二、代码实现
这次的爬取主要用了如下的第三方库
简单构思可以分为三个小部分
1.获取网页内容
2.提取图片
3.保存图片至相应位置
下面来看第一部分:获取网页内容
def get_html(url, headers, params):
    """Fetch a page and return its body decoded as UTF-8 text.

    Prints a warning when the HTTP status is not 200 but still returns
    the (possibly error) body, matching the original behavior so
    callers are unaffected.
    """
    response = requests.get(url, headers=headers, params=params)
    response.encoding = "utf-8"
    if response.status_code != 200:
        # Both branches of the original returned response.text anyway;
        # keep that contract and only warn on a non-200 status.
        print("网站源码获取错误")
    return response.text
第二部分提取图片
来看代码
# Extract the image source addresses from the page source.
def parse_pic_url(html):
    """Return every thumbURL value embedded in *html*, in order."""
    pattern = re.compile('thumbURL":"(.*?)"', re.S)
    return pattern.findall(html)
# Fetch one image and return its binary payload.
def get_pic_content(url):
    """Return the raw bytes of a GET request to *url*."""
    resp = requests.get(url)
    return resp.content
下面就是第三部分:保存图片
# Create the folder that will hold the downloaded images.
def create_fold(fold_name):
    """Create directory *fold_name*; report if it already exists.

    The original used a bare ``except``, which misreported permission
    errors and invalid paths as "folder exists". Only FileExistsError
    is expected here; anything else now propagates.
    """
    try:
        os.mkdir(fold_name)
    except FileExistsError:
        print("文件夹已存在")
# Save one image's bytes as <fold_name>/<pic_name>.jpg.
def save_pic(content, pic_name, fold_name="大熊猫"):
    """Write *content* to ``fold_name/pic_name.jpg``.

    *fold_name* defaults to the originally hard-coded folder, so
    existing callers are unchanged. The explicit close() in the
    original was redundant — the ``with`` block already closes.
    """
    path = os.path.join(fold_name, str(pic_name) + ".jpg")
    with open(path, "wb") as f:
        f.write(content)
下面是完整的代码
import os
import requests
import re
def get_html(url, headers, params):
    """Fetch a page and return its body decoded as UTF-8 text.

    Prints a warning when the HTTP status is not 200 but still returns
    the (possibly error) body, matching the original behavior so
    callers are unaffected.
    """
    response = requests.get(url, headers=headers, params=params)
    response.encoding = "utf-8"
    if response.status_code != 200:
        # Both branches of the original returned response.text anyway;
        # keep that contract and only warn on a non-200 status.
        print("网站源码获取错误")
    return response.text
# Extract the image source addresses from the page source.
def parse_pic_url(html):
    """Return every thumbURL value embedded in *html*, in order."""
    pattern = re.compile('thumbURL":"(.*?)"', re.S)
    return pattern.findall(html)
# Fetch one image and return its binary payload.
def get_pic_content(url):
    """Return the raw bytes of a GET request to *url*."""
    resp = requests.get(url)
    return resp.content
# Create the folder that will hold the downloaded images.
def create_fold(fold_name):
    """Create directory *fold_name*; report if it already exists.

    The original used a bare ``except``, which misreported permission
    errors and invalid paths as "folder exists". Only FileExistsError
    is expected here; anything else now propagates.
    """
    try:
        os.mkdir(fold_name)
    except FileExistsError:
        print("文件夹已存在")
# Save one image's bytes as <fold_name>/<pic_name>.jpg.
def save_pic(content, pic_name, fold_name="大熊猫"):
    """Write *content* to ``fold_name/pic_name.jpg``.

    *fold_name* defaults to the originally hard-coded folder, so
    existing callers are unchanged. The explicit close() in the
    original was redundant — the ``with`` block already closes.
    """
    path = os.path.join(fold_name, str(pic_name) + ".jpg")
    with open(path, "wb") as f:
        f.write(content)
def main():
    """Crawl 10 pages (30 images each) of Baidu image-search results
    for "大熊猫" and save each thumbnail as a numbered .jpg file.
    """
    # The original main() never created the target folder, so the very
    # first save_pic() crashed on a clean machine — create it up front.
    create_fold("大熊猫")

    # Hoisted out of the loop: neither the URL nor the headers depend
    # on the page index. NOTE: the original URL contained "©right=" —
    # "&copy" had been mangled into "©" by HTML-entity decoding; the
    # real "&copyright=" parameter is restored here.
    url = "https://image.baidu.com/search/acjson?tn=resultjson_com&logid=10636785272990048342&ipn=rj&ct=201326592&is=&fp=result&fr=&word=%E5%A4%A7%E7%86%8A%E7%8C%AB&queryWord=%E5%A4%A7%E7%86%8A%E7%8C%AB&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1&expermode=&nojc=&isAsync=&pn=30&rn=30&gsm=1e&1695863772997="
    headers = {
        "Accept": "text/plain, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": 'BAIDUID=E9B7B77E6EFAEA9AE94B261906052872:FG=1; BIDUPSID=E9B7B77E6EFAEA9AE94B261906052872; PSTM=1695258294; BAIDUID_BFESS=E9B7B77E6EFAEA9AE94B261906052872:FG=1; BDUSS=hPdE9iMEdZdFoyUnAyaEJESFRKZy0xcmRXemRnazJrZTN3UmRsQmFnd1ZKVE5sRVFBQUFBJCQAAAAAAQAAAAEAAAA2tPJCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABWYC2UVmAtlTW; BDUSS_BFESS=hPdE9iMEdZdFoyUnAyaEJESFRKZy0xcmRXemRnazJrZTN3UmRsQmFnd1ZKVE5sRVFBQUFBJCQAAAAAAQAAAAEAAAA2tPJCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABWYC2UVmAtlTW; indexPageSugList=%5B%22csdn%22%5D; RT="z=1&dm=baidu.com&si=1a697eb9-7afe-4cf7-ad56-7d660c33216d&ss=lmyje565&sl=2&tt=5so&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=7r4&ul=cbp4&hd=cbqt"; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm',
        "Host": "image.baidu.com",
        "Referer": "https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&dyTabStr=MTEsMCwxLDYsMyw1LDQsMiw4LDcsOQ%3D%3D&word=%E5%A4%A7%E7%86%8A%E7%8C%AB",
        "Sec-Ch-Ua": '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43",
        "X-Requested-With": "XMLHttpRequest",
    }

    pic_name = 0
    for i in range(10):
        params = {
            "logid": "9081819235309454042",
            "ipn": "rj",
            "ct": "201326592",
            "fp": "result",
            "word": "大熊猫",
            "queryWord": "大熊猫",
            "cl": "2",
            "lm": "-1",
            "ie": "utf-8",
            "oe": "utf-8",
            "nc": "1",
            # Page offset: 30, 60, ... 300. (i+1) is already an int,
            # so the original int() wrapper was redundant.
            "pn": str((i + 1) * 30),
            "rn": "30",
            "gsm": "5a",
        }
        # The original fetched every page twice (a first call whose
        # result was discarded, then the real one) — fetch once.
        html = get_html(url, headers, params)
        for item in parse_pic_url(html):
            pic_content = get_pic_content(item)
            # Dropped the original print(pic_content): it dumped raw
            # image bytes to the console on every download.
            save_pic(pic_content, pic_name)
            pic_name += 1
            print("正在保存第" + str(pic_name) + "张图片")


if __name__ == '__main__':
    main()
最后的运行截图
本期教程到此结束,喜欢的点个赞吧。