在代码运行过程中,经常会遇到程序出错后整个程序停止运行的问题。特别是爬取多张图片或多个视频的时候,程序可能在其中某一个环节突然出错,导致整个爬取过程中断。所以我们需要采用异常处理的办法,对出错的部分进行处理并跳过,让程序继续运行。
# 异常处理
import os
while True:
    # Read the name outside the try block so that Ctrl-C / end-of-input
    # still terminates the loop instead of being reported as a folder error.
    file_name = input("输入要创建文件夹的名字:")
    # Guard only the call that can fail, so one bad name does not end the program.
    try:
        os.mkdir(file_name)
    except FileExistsError:
        # The folder is already there: nothing to do.
        print("文件夹已存在,无需再次处理")
    except OSError as err:
        # Any other failure (invalid name, missing parent, permissions, ...)
        # is reported as what it is rather than as "already exists".
        print("文件夹创建失败: " + str(err))
这就是异常处理的基本格式,我们来尝试运行一下。
这里我们可以看见,第一次代码正常运行,第二次出现了异常处理所显示的语句。
接下来,我们把它应用到图片爬取项目之中。
# 导入相应库
import requests
import re
import os
# 获取网络源代码
def get_html(url, headers, params, timeout=10):
    """Fetch a page and return its body as text.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.
        params: Query-string parameters sent with the request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the whole crawl.

    Returns:
        The response body decoded as UTF-8 when the status code is 200,
        otherwise ``None`` (after printing a notice).
    """
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    # Force the encoding used to decode the response body.
    response.encoding = "utf-8"
    # Only a 200 response is treated as a usable page.
    if response.status_code == 200:
        return response.text
    print("网址源码获取错误")
    return None
# 解析提取图片的源地址
def pares_pic_url(html):
    """Extract the picture addresses from the response text.

    Args:
        html: Response text to scan; may be ``None`` or empty when the
            page could not be fetched.

    Returns:
        A list with every string found between ``thumbURL":"`` and the
        next double quote, in order of appearance. Empty when ``html`` is
        ``None``/empty or contains no match.
    """
    # A failed fetch yields no text; report "no pictures" instead of
    # raising TypeError from re.findall.
    if not html:
        return []
    return re.findall('thumbURL":"(.*?)"', html, re.S)
# 获取图片二进制源码
def get_pic_content(url, timeout=10):
    """Download a picture and return its raw bytes.

    Args:
        url: Address of the picture.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the whole crawl.

    Returns:
        The response body as ``bytes``.
    """
    response = requests.get(url, timeout=timeout)
    return response.content
# 保存图片
def save_pic(fold_name, content, pic_name):
    """Write picture bytes to ``<fold_name>/<pic_name>.jpg``.

    Args:
        fold_name: Existing directory that receives the file.
        content: Raw bytes of the picture.
        pic_name: File name without extension; converted with ``str``.
    """
    file_path = os.path.join(fold_name, str(pic_name) + ".jpg")
    # The with-statement closes the file; no explicit close() is needed.
    with open(file_path, "wb") as f:
        f.write(content)
# 定义一个文件夹保存
def create_fold(fold_name):
    """Create the directory ``fold_name`` without raising on failure.

    Prints a notice when the directory already exists, and a different
    notice (including the error) when creation fails for any other
    reason, so the two situations are not confused.

    Args:
        fold_name: Path of the directory to create.
    """
    try:
        os.mkdir(fold_name)
    except FileExistsError:
        print("文件夹已存在")
    except OSError as err:
        # e.g. missing parent directory, invalid name, no permission.
        print("文件夹创建失败: " + str(err))
# 定义一个main函数调用get_html函数
def main():
    """Interactive entry point of the picture crawler.

    Asks for a keyword (also used as the folder name) and a page count,
    creates the folder, then for every page fetches the result list,
    extracts the picture addresses and saves each picture under a running
    number. A failure on one page is reported and the next page is tried;
    any other failure is reported once and ends the run.
    """
    try:
        # The entered text is used both as folder name and as search keyword.
        fold_name = input("请输入图片名:")
        # Number of result pages to fetch.
        page_num = input("请输入你要抓取的页数:")
        create_fold(fold_name)
        # Running counter used as the file name of each saved picture.
        pic_name = 0
        # Endpoint only: every query parameter is supplied through `params`,
        # so the entered keyword and page offset are not duplicated by a
        # hard-coded query string.
        url = "https://image.baidu.com/search/acjson"
        # Request headers are the same for every page, so build them once.
        # NOTE(review): the Cookie value is a captured browser session;
        # replace it with your own and avoid publishing real credentials.
        headers = {
            "Accept": "text/plain, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Cookie": 'BAIDUID=E9B7B77E6EFAEA9AE94B261906052872:FG=1; BIDUPSID=E9B7B77E6EFAEA9AE94B261906052872; PSTM=1695258294; BAIDUID_BFESS=E9B7B77E6EFAEA9AE94B261906052872:FG=1; BDUSS=hPdE9iMEdZdFoyUnAyaEJESFRKZy0xcmRXemRnazJrZTN3UmRsQmFnd1ZKVE5sRVFBQUFBJCQAAAAAAQAAAAEAAAA2tPJCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABWYC2UVmAtlTW; BDUSS_BFESS=hPdE9iMEdZdFoyUnAyaEJESFRKZy0xcmRXemRnazJrZTN3UmRsQmFnd1ZKVE5sRVFBQUFBJCQAAAAAAQAAAAEAAAA2tPJCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABWYC2UVmAtlTW; indexPageSugList=%5B%22csdn%22%5D; RT="z=1&dm=baidu.com&si=1a697eb9-7afe-4cf7-ad56-7d660c33216d&ss=lmyje565&sl=2&tt=5so&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=7r4&ul=cbp4&hd=cbqt"; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm',
            "Host": "image.baidu.com",
            "Referer": "https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&dyTabStr=MTEsMCwxLDYsMyw1LDQsMiw4LDcsOQ%3D%3D&word=%E5%A4%A7%E7%86%8A%E7%8C%AB",
            "Sec-Ch-Ua": '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43",
            "X-Requested-With": "XMLHttpRequest",
        }
        # One iteration per requested page.
        for i in range(int(page_num)):
            # Handle errors per page so one bad page does not stop the crawl.
            try:
                params = {
                    "tn": "resultjson_com",
                    "logid": "10657561438670404501",
                    "ipn": "rj",
                    "ct": "201326592",
                    "fp": "result",
                    "word": fold_name,
                    "queryWord": fold_name,
                    "cl": "2",
                    "lm": "-1",
                    "ie": "utf-8",
                    "oe": "utf-8",
                    "nc": "1",
                    # Page index i is sent as pn = 30, 60, 90, ...
                    # (presumably the result offset, with rn the page size;
                    # confirm against the service).
                    "pn": str((i + 1) * 30),
                    "rn": "30",
                    "gsm": "1e",
                }
                html = get_html(url, headers, params)
                result = pares_pic_url(html)
                # Download and store every picture address found on this page.
                for item in result:
                    pic_content = get_pic_content(item)
                    save_pic(fold_name, pic_content, pic_name)
                    pic_name += 1
                    print("正在保存第 " + str(pic_name))
            except Exception as err:
                # `except Exception` lets Ctrl-C still interrupt the crawl.
                print("抓取第 " + str(i) + " 页错误: " + repr(err))
    except Exception as err:
        # Failures outside the page loop (e.g. a non-numeric page count).
        print("数据获取异常: " + repr(err))
# 执行main函数
# Run the crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
紧接着,我们来运行它。
可以看到,项目正常运行,并未因报错而中断,只是在出现异常的地方给出了提示。