针对源代码进行了以下修改:
1. 增加了报错兼容
2. 增加了进度打印
3. 增加了线程池并行
经过测试,超过30页会触发反爬
import requests
import os
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
def download_image(image_url, header, save_dir, n, timeout=10):
    """Download one image and save it as a zero-padded ``<n>.jpg`` file.

    Args:
        image_url: URL of the image to fetch.
        header: HTTP headers sent with the request.
        save_dir: Directory the image is written into.
        n: Sequence number used to build the file name (``f'{n:06d}.jpg'``).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        ``(n, None)`` on success, or ``(n, message)`` describing the failure.
        Network, HTTP-status and file-system errors are reported through the
        return value instead of being raised.
    """
    try:
        response = requests.get(url=image_url, headers=header, timeout=timeout)
        # Reject 4xx/5xx replies so an error page is never saved as a ".jpg".
        response.raise_for_status()
        with open(os.path.join(save_dir, f'{n:06d}.jpg'), 'wb') as fp:
            fp.write(response.content)
    except (requests.RequestException, OSError) as e:
        # OSError covers failures while creating or writing the output file.
        return n, f"Error downloading image {image_url}: {e}"
    return n, None
def _build_search_params(keyword, pn):
    """Return the query parameters for the result page starting at offset *pn*."""
    return {
        'tn': 'resultjson_com',
        'logid': '7603311155072595725',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'cg': '',
        'pn': pn,
        'rn': '30',
        'gsm': '1e',
        '1618827096642': '',
    }


def get_images_from_baidu(keyword, page_num, save_dir, timeout=10):
    """Download thumbnail images for *keyword* from Baidu image search.

    Result pages are requested in steps of 30 results. Every ``thumbURL``
    found in a page's response is downloaded on a thread pool and saved in
    ``save_dir`` under a sequential, zero-padded file name.

    Args:
        keyword: Search term sent as both ``queryWord`` and ``word``.
        page_num: Number of result pages to request.
        save_dir: Output directory; created if it does not exist.
        timeout: Seconds to wait for each search-page response.

    Returns:
        None. Request and download failures are printed and skipped.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    url = 'https://image.baidu.com/search/acjson?'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)

    n = 0  # sequence number of the first image on the current page
    for pn in tqdm(range(0, 30 * page_num, 30), desc='Downloading Pages'):
        try:
            response = requests.get(
                url=url,
                headers=header,
                params=_build_search_params(keyword, pn),
                timeout=timeout,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error during request: {e}")
            continue

        response.encoding = 'utf-8'
        image_url_list = re.findall(r'"thumbURL":"(.*?)",', response.text, re.S)
        if not image_url_list:
            # Nothing to download on this page; skip creating a thread pool.
            continue

        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {
                executor.submit(download_image, image_url, header, save_dir, n + i): image_url
                for i, image_url in enumerate(image_url_list)
            }
            for future in tqdm(as_completed(futures),
                               desc=f'Downloading Images from Page {pn // 30 + 1}',
                               total=len(futures)):
                try:
                    _, error = future.result()
                except Exception as e:
                    # One unexpected failure must not abort the whole crawl.
                    error = f"Error downloading image {futures[future]}: {e}"
                if error:
                    print(error)

        # Advance by the whole page. Deriving the counter from whichever
        # download happened to finish last could move it backwards and let
        # the next page overwrite files that were already saved.
        n += len(image_url_list)
if __name__ == "__main__":
    # Script entry point: fetch 30 result pages for the keyword below.
    keyword = '趴'
    page_num = 30
    # os.path.join yields '.\\图片\\<keyword>' on Windows and a real nested
    # path elsewhere; a hard-coded backslash path would become one oddly
    # named directory on POSIX systems.
    save_dir = os.path.join('.', '图片', keyword)
    get_images_from_baidu(keyword, page_num, save_dir)