# Built by inspecting an image-search site's request packets in Chrome DevTools,
# with reference to existing sample code online. Runs on Python 3.7 (VS Code).
# Multithreaded downloader for the images on the search-result pages, with
# keyboard abort (ESC) and ThreadPoolExecutor pool support.
# For learning and reference only.
import os
import os.path as osp
import re
import sys
import threading as th
import urllib
import urllib.parse
from time import sleep

import requests
# --- Search configuration ---------------------------------------------------
keyword="独库公路"  # search term (kept verbatim; it is sent to the site as-is)
dbgPageNum=0 # pages to fetch; 0 = derive the count from the site's response, capped at 30 pages
bigPic="3" #image size filter: 3 = large, 9 = extra large
HD='' #set to '1' to request HD-only results
per_page_num=30 #images requested per result page ('rn' parameter)
page_num=30 #initial result offset ('pn' parameter); rewritten per page in the main loop
bdImgUrl='https://image.baidu.com/search/acjson'  # image-search AJAX endpoint
SAVE_ROOT_PATH = './image'  # downloads are saved under this directory
threadSleepTime=0 #per-download delay in seconds (simple throttle for workers)
# Request headers mimicking Chrome's XHR so the endpoint returns JSON.
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
'Host': 'image.baidu.com',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin', 'X-Requested-With': 'XMLHttpRequest',
'Accept': 'text/plain, */*; q=0.01',
}
# Query parameters for the acjson endpoint, captured from a real browser
# request. 'pn' (offset) is rewritten for each page in the download loop.
params = {
"tn": "resulttagjson",
"logid": "9084146609974904936",
"ie": "utf-8",
"fr": "",
"word": keyword,  # the search term
"ipn": "r",
"fm": "index",
"pos": "history",
"queryWord": keyword,  # duplicated search term, as sent by the browser
"cl": "2",
"lm": "-1",
"oe": "utf-8",
"adpicid": "",
"st": "",
"z": bigPic,  # image size filter
"ic": "",
"hd": HD,  # HD-only flag
"latest":"",
"copyright":"",
"s": "",
"se": "",
"tab": "",
"width": "0",
"height": "0",
"face": "",
"istype": "",
"qc": "",
"nc": "1",
"expermode":"",
"nojc": "",
"isAsync": "true",
"pn": f'{page_num}',  # result offset; updated per page in the main loop
"rn": f'{per_page_num}',  # results per page
"gsm": "1e",
"1663432422144":""  # presumably a cache-busting timestamp key from the captured request — TODO confirm
}
str_table = {
'_z2C$q': ':',
'_z&e3B': '.',
'AzdH3F': '/'
}
char_table = {
"w": "a",
"k": "b",
"v": "c",
"1": "d",
"j": "e",
"u": "f",
"2": "g",
"i": "h",
"t": "i",
"3": "j",
"h": "k",
"s": "l",
"4": "m",
"g": "n",
"5": "o",
"r": "p",
"q": "q",
"6": "r",
"f": "s",
"p": "t",
"7": "u",
"e": "v",
"o": "w",
"8": "1",
"d": "2",
"n": "3",
"9": "4",
"c": "5",
"m": "6",
"0": "7",
"b": "8",
"l": "9",
"a": "0",
}
char_table = {ord(key): ord(value) for key, value in char_table.items()}
def decodeBDUrl(strUrl):
for key, value in str_table.items():
strUrl = strUrl.replace(key, value)
strUrl=strUrl.translate(char_table)
return strUrl
def uncodeImgUrls2(strUrl):
    """Percent-decode an image URL.

    If the string contains a `src=...&` query fragment, decode just that
    value; otherwise decode the whole string.
    """
    m = re.search(r'src=(.*?)&', strUrl)
    target = m.group(1) if m else strUrl
    return urllib.parse.unquote(target)
def validateTitle(title):
    """Sanitize a page title for use in a filename.

    Characters illegal in Windows/Unix filenames (/ \\ : * ? " < > |) are
    replaced with underscores.
    """
    rstr = r"[\/\\\:\*\?\"\<\>\|]"
    return re.sub(rstr, "_", title)
def getBDImgSearchJson(bdimgUrl,searchParams,reqHeaders):
    """GET the image-search endpoint and return the parsed JSON.

    Fixes: the original ignored its own parameters and read the module
    globals `bdImgUrl`/`headers` instead; `rq.raise_for_status` was missing
    its call parentheses (a no-op); and a bare `except:` swallowed
    everything including KeyboardInterrupt.

    Returns the decoded JSON dict on HTTP 200, otherwise None.
    """
    try:
        rq = requests.get(bdimgUrl, searchParams, headers=reqHeaders, timeout=6.25)
        if rq.status_code == 200:
            rq.encoding = rq.apparent_encoding  # fix mojibake before .json()
            return rq.json()
        return None  # non-200: treat as "no data", matching original behavior
    except (requests.RequestException, ValueError):
        # RequestException: network/timeout failures; ValueError: non-JSON body
        return None
def initPageNum(pages=0):
    """Resolve how many result pages to fetch.

    A non-zero `pages` is an explicit override and is returned directly —
    the original still issued the probe request and returned -1 if it
    failed, even when the override made the probe irrelevant. With
    pages == 0 the count is derived from the site's reported `listNum`,
    clamped to at most 30 pages. Returns -1 when the probe or response
    parsing fails.
    """
    from math import ceil
    if pages != 0:
        return pages  # explicit override: skip the network probe entirely
    try:
        bdjs = getBDImgSearchJson(bdImgUrl, params, headers)
        lstnum = float(bdjs['listNum'])  # total hit count reported by the site
        p = ceil(lstnum / per_page_num)
        return p if 0 < p <= 30 else 30  # clamp to the 30-page limit
    except (TypeError, KeyError, ValueError):
        # TypeError: bdjs is None; KeyError/ValueError: malformed response
        return -1
getPageNum=initPageNum(dbgPageNum) # resolve the total number of pages to fetch (0 = ask the site)
assert getPageNum!=-1 # NOTE(review): assert is stripped under `python -O`; an explicit raise would be safer
threadLock=th.Lock()  # guards lstFailUrl across download worker threads
lstFailUrl=[]  # (index, url) pairs whose download failed
def appendFailUrl(idx,url):
    """Thread-safely record a failed download as an (index, url) pair.

    Fix: use the lock as a context manager — the original acquire/release
    pair would leave the lock held forever if append() ever raised.
    """
    with threadLock:
        lstFailUrl.append((idx,url))
def getPicFromObjURL(pDir,idx,lstItem):
    """Download one image described by a search-result item.

    Decodes the obfuscated 'objURL', fetches it, and saves it under `pDir`
    as `{idx}_{title}.jpeg`. Files smaller than 5 KiB are treated as
    failures (likely placeholder/error images) and removed. Failed URLs are
    recorded via appendFailUrl. Returns True on success, False otherwise.

    Fixes: the file handle is now closed deterministically via `with`;
    an undersized file is deleted instead of being left on disk while still
    counted as a failure; the bare `except:` is narrowed.
    """
    sleep(threadSleepTime)  # optional throttle between downloads
    imgUrl = uncodeImgUrls2(decodeBDUrl(lstItem['objURL']))
    print(f'{idx}->{imgUrl}\n',end='')
    fName = validateTitle(lstItem['fromPageTitleEnc'])
    save_filepath = f'{pDir}/{idx}_{fName}.jpeg'
    try:
        rq = requests.get(imgUrl, timeout=6.3)
        if rq.status_code != 200:
            print('Error Status:%d,%s'%(rq.status_code,imgUrl))
            appendFailUrl(idx, imgUrl)
            return False
        with open(save_filepath, 'wb') as f:
            f.write(rq.content)
        if osp.getsize(save_filepath) < 5 * 1024:
            os.remove(save_filepath)  # drop the too-small placeholder image
            appendFailUrl(idx, imgUrl)
            return False
        return True
    except (requests.RequestException, OSError):
        appendFailUrl(idx, imgUrl)
        return False
from pynput import keyboard
isEnd = False  # set True by the listener when ESC is pressed; polled by the page loop
# Key-press callback: try/except is needed because special (function) keys have no .char
def keyboard_on_press(key):
    """pynput on_press callback.

    Logs every key press; special keys raise AttributeError on `.char`,
    hence the try/except. ESC sets the global stop flag and returns False,
    which ends the listener.
    """
    global isEnd
    try:
        ch = key.char
        print('字母键{0} press'.format(ch))
    except AttributeError:
        print('特殊键{0} press'.format(key))
    if key != keyboard.Key.esc:
        return None
    isEnd = True
    return False
keyListenr=keyboard.Listener(on_press=keyboard_on_press)
keyListenr.daemon=1  # daemon thread: never blocks interpreter exit
keyListenr.start()
maxThreadsNum=2  # number of worker threads in the download pool
picIdx=0  # running index used to number saved files
picDir=f'{SAVE_ROOT_PATH}/{keyword}/'
os.makedirs(osp.dirname(picDir), exist_ok=True) # create the per-keyword output directory
import concurrent.futures
tpe=concurrent.futures.ThreadPoolExecutor(max_workers=maxThreadsNum)
# Page loop: fetch each result page's JSON and hand every image item to the
# thread pool. Fixes: ESC now breaks out of the loop instead of idling
# through the remaining iterations; the dead try/except is removed
# (getBDImgSearchJson already catches its own errors and returns None);
# a missing 'data' key no longer raises KeyError.
for i in range(getPageNum):
    if isEnd:  # ESC pressed: stop scheduling new pages
        break
    params['pn'] = per_page_num * (i + 1)  # advance the result offset
    bdjs = getBDImgSearchJson(bdImgUrl, params, headers)  # None on any failure
    if bdjs is None:
        continue
    # The last entry of 'data' is an empty placeholder object, so drop it.
    for ul in bdjs.get('data', [])[:-1]:
        picIdx += 1
        tpe.submit(getPicFromObjURL, picDir, picIdx, ul)
tpe.shutdown(wait=True)  # block until all queued downloads have finished
keyListenr.stop()  # tear down the keyboard listener
print('===' * 20)
print(f"pageNum:{getPageNum}")
print("Process %d Picture over!" % (picIdx))  # fixed typo: "Procces" -> "Process"
print(f'Error url count :{len(lstFailUrl)}')