直接复制粘贴就可以了,不需要更改任何参数,谢谢
第一个代码:
import os
import urllib
import urllib.parse

import requests
class Spider_baidu_image():
    """Download thumbnails from Baidu image search for a user-supplied keyword.

    On construction the spider prompts for a keyword and a page count
    (30 thumbnails per page). Calling the instance fetches the search
    result pages and saves every thumbnail into ./<keyword>/<n>.jpg.
    """

    def __init__(self):
        # JSON endpoint behind Baidu image search's infinite scroll.
        self.url = 'http://image.baidu.com/search/acjson?'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
        # Thumbnail downloads need a Referer header or Baidu rejects the
        # request as hot-linking. NOTE: the original post had "&copyright"
        # mangled into "©right" by HTML-entity decoding; restored here.
        self.headers_image = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
            'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1557124645631_R&pv=&ic=&nc=1&z=&hd=1&latest=0&copyright=0&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word=%E8%83%A1%E6%AD%8C'}
        self.keyword = input("请输入搜索图片关键字:")
        self.paginator = int(input("请输入搜索页数,每页30张图片:"))

    def get_param(self):
        """Build the query string for each result page.

        :return: list of URL query strings, one per requested page.
        """
        keyword = urllib.parse.quote(self.keyword)
        params = []
        for page in range(1, self.paginator + 1):
            # "&copyright" restored here as well (was "©right" in the post).
            params.append(
                'tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=1&latest=0&copyright=0&word={}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&cg=star&pn={}&rn=30&gsm=78&1557125391211='.format(
                    keyword, keyword, 30 * page))
        return params

    def get_urls(self, params):
        """Join the endpoint with each query string.

        :param params: query strings produced by :meth:`get_param`.
        :return: list of full request URLs.
        """
        return [self.url + param for param in params]

    def get_image_url(self, urls):
        """Collect every thumbnail URL from the JSON result pages.

        :param urls: full request URLs produced by :meth:`get_urls`.
        :return: list of non-empty thumbnail URLs.
        """
        image_url = []
        for url in urls:
            json_data = requests.get(url, headers=self.headers).json()
            # 'data' may be missing/None on a bad response; don't crash.
            for item in json_data.get('data') or []:
                # Entries can be empty dicts or lack 'thumbURL'; skip those
                # instead of appending None (which would break the download).
                if item and item.get('thumbURL'):
                    image_url.append(item.get('thumbURL'))
        return image_url

    def get_image(self, image_url):
        """Save every thumbnail into a folder named after the keyword.

        :param image_url: list of thumbnail URLs.
        """
        file_name = os.path.join(os.getcwd(), self.keyword)
        # exist_ok keeps reruns from crashing on an existing folder.
        os.makedirs(file_name, exist_ok=True)
        for index, url in enumerate(image_url, start=1):
            # os.path.join instead of a hard-coded '\\' so the path also
            # works on non-Windows systems.
            path = os.path.join(file_name, '{}.jpg'.format(index))
            with open(path, 'wb') as f:
                f.write(requests.get(url, headers=self.headers_image).content)
            if index % 30 == 0:
                # // keeps the page number an int (original printed e.g. 1.0).
                print('{}第{}页下载完成'.format(self.keyword, index // 30))

    def __call__(self, *args, **kwargs):
        params = self.get_param()
        urls = self.get_urls(params)
        image_url = self.get_image_url(urls)
        self.get_image(image_url)
if __name__ == '__main__':
    # Construct the crawler (prompts for keyword and page count),
    # then run the full download via __call__.
    image_spider = Spider_baidu_image()
    image_spider()
效果图:
第二个代码:我这个代码默认搜索:“黑烟”, “白烟”, “青烟”, “蓝烟”, “黄烟”, “黄色火苗”, “蓝色火苗”, “红色火苗”,可以自行进行更改设置。不需要改动代码,只需要改动keyword就可以
"""黑烟"""
import requests, json, os
from jsonpath import jsonpath
# Baidu image-search JSON endpoint and a desktop browser User-Agent.
url = "https://image.baidu.com/search/acjson?"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}
# Keywords to crawl; edit this list to change what gets downloaded.
keyword = ["黑烟", "白烟", "青烟", "蓝烟", "黄烟", "黄色火苗", "蓝色火苗", "红色火苗"]
for word in keyword:
    print(word + "......")
    if not os.path.exists(f"./{word}"):
        os.mkdir(f"./{word}")
    # Running file counter for this keyword. It must live OUTSIDE the page
    # loop: the original reset it each page, so every page overwrote
    # 1.jpg..30.jpg and only the last page's images survived.
    n = 0
    for i in range(1, 101):  # 30 * 100 = 3000 images requested per keyword
        params = {
            'tn': 'resultjson_com',
            'logid': '10502599365288196568',
            'ipn': 'rj',
            'ct': '201326592',
            'is': '',
            'fp': 'result',
            'fr': '',
            'word': word,
            'queryWord': word,
            'cl': '2',
            'lm': '-1',
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': '-1',
            'z': '',
            'ic': '0',
            'hd': '',
            'latest': '',
            'copyright': '',
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': '0',
            'istype': '2',
            'qc': '',
            'nc': '1',
            'expermode': '',
            'nojc': '',
            'isAsync': '',
            'pn': 30 * i,  # result offset: page i starts at item 30*i
            'rn': '30',
            'gsm': '',
        }
        res_text = requests.get(url, headers=headers, params=params).text
        res_dic = json.loads(res_text)
        # jsonpath returns False (not []) when nothing matches — guard it
        # so an empty page does not raise TypeError on iteration.
        hoverurls = jsonpath(res_dic, '$..hoverURL') or []
        for img_url in hoverurls:
            if not img_url:
                continue  # some entries carry an empty hoverURL
            n += 1
            res_content = requests.get(img_url, headers=headers).content
            filename = f"./{word}/" + str(n) + '.jpg'
            with open(filename, 'wb') as f:
                f.write(res_content)
            print(str(n) + '.jpg')
第三个代码:第三个代码虽然最少,但是我觉得是这三个当中最好的。爬取的效果。图片自动会生成的当前目录文件下
import requests, json, re, time, os
def get_asjson(page, gsm, word):
    """Fetch one page of Baidu image-search results.

    :param page: 1-based page number (30 results per page).
    :param gsm: pagination token returned by the previous call
        ("1e" is the conventional seed for the first request).
    :param word: search keyword.
    :return: tuple ``(next_gsm, hover_urls)`` — the token for the next
        request and the list of thumbnail (hoverURL) strings on this page.
    """
    # NOTE: the original post had "&copyright" mangled into "©right" by
    # HTML-entity decoding in both the URL and the Referer; restored here.
    url = f"https://image.baidu.com/search/acjson?tn=resultjson_com&logid=9123806616981181340&ipn=rj&ct=201326592&is=&fp=result&fr=&word={word}&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={str(30 * int(page))}&rn=30&gsm={gsm}&{str(int(time.time() * 1000))}="
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1637758492843_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&dyTabStr=MCwzLDYsMiw0LDEsNSw4LDcsOQ%3D%3D&ie=utf-8&sid=&word=hello',
        # NOTE(review): session-specific cookie copied from a browser —
        # it will expire; replace with a fresh one if requests start failing.
        'Cookie': 'BDqhfp=hello%26%26-10-1undefined%26%2628989%26%2635; BAIDUID=0C2336F5F3D356371C46DF079632E0C8:FG=1; BAIDUID_BFESS=0C2336F5F3D356371C46DF079632E0C8:FG=1; BIDUPSID=0C2336F5F3D356371C46DF079632E0C8; __yjs_duid=1_32693704d239fea9266064fc8a3d25631637737833661; PSTM=1637737880; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; userFrom=null; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; delPer=0; PSINO=6; __yjs_st=2_ZGU4ODA5ZTdmNzczMzgxNzRiZWZhNTdkODVkY2E5MzQ3NzM3Nzc2MzZlNjYzZmRiMWVjOTlmNWQzZDA3NWY1MzM2M2NkNjNmMjMzZWVlYzQxNGQ2ODIzYjlkNTdhYTUyZjdhNWQwNjQxZWE1YTI0MWZiNzQ1NTE0N2NlNTgwNjZjODlkNWVlZWI2ZDBkNjUzNmNiZDE3NzUyYTA4ZjkxYjI1NzNhODBjOGZhZTBmMzZkY2IwOWJmNjMxNjEzNmUxYjQxZmZhM2M1ODUzYTFkNTM4NTE5MzZjZjRkODliMTE1MmRmMDY1MjI4OGJiM2I3ZGMzMDdiNjI4MWE3NDgxZV83XzQyODU3N2M0; H_PS_PSSID=35295_34446_35104_31254_35237_35049_34584_34505_35245_34578_34872_26350_35210_35145_22160; indexPageSugList=%5B%22hello%22%2C%22bello%22%2C%22hello%20%22%5D; cleanHistoryStatus=0; ab_sr=1.0.1_MTJmNTIwNGNlNmI5NDg2YmZiZTI1OTM1MGZhNTJhZTZlMzVmODE2NmEwZjg5MjNlZWZjZWY1YTY3ZjQ2Yzc2MWZiNGRlODY2ZDJjOGE3N2RhMzg2NjcxZjEzY2ZiMDQ4ODNjYzgyZTZlNWM2NGQ4YjlhMzBlMWE1ZjU0ZTY2NzAxYmM0ZGRkOTM0MGI3NzUwOWZjODY2ODE5NmU1N2E1Yw=='
    }
    # The original appended a junk "1111" literal to the response text;
    # it served no purpose and has been dropped.
    response = requests.get(url=url, headers=headers).text
    gsm_matches = re.findall('"gsm":"(.*?)",', response)
    # Fall back to the caller's token when the page carries no gsm field,
    # instead of crashing with IndexError on findall(...)[0].
    next_gsm = gsm_matches[0] if gsm_matches else gsm
    data = re.findall('"hoverURL":"(.*?)",', response)
    return next_gsm, data
def save_img(imgurl_list, img_os):
    """Download each image URL into the directory prefix *img_os*.

    :param imgurl_list: iterable of image URLs (empty strings are skipped).
    :param img_os: directory prefix (must end with a path separator).
    Failed downloads are reported and skipped rather than aborting the run.
    """
    for img_url in imgurl_list:
        if not img_url:
            continue  # hoverURL entries can be empty strings
        try:
            content = requests.get(url=img_url).content
        except Exception:  # was a bare except: — at least let Ctrl-C through
            print("no")
            continue
        # Naming scheme kept from the original: characters 28..35 of the URL.
        # NOTE(review): different URLs can slice to the same name and
        # overwrite each other — consider an incrementing counter instead.
        img_name = img_url[28:36]
        with open(img_os + img_name + ".jpg", "wb") as file:
            file.write(content)
        print(img_url + " OK !!!")
if __name__ == "__main__":
    gsm = "1e"    # token expected by the very first request
    word = "蓝烟"  # change this to the keyword you want to crawl
    img_os = word + "_img\\"
    # makedirs with exist_ok so a rerun does not crash on the existing
    # folder (the original os.mkdir raised FileExistsError).
    os.makedirs(img_os, exist_ok=True)
    # Pages are fetched two at a time because each response carries the
    # gsm token required by the next request. The original wrapped the
    # second fetch in a pointless `while True: ... break` that executed
    # exactly once; it is flattened here.
    for page in range(1, 102, 2):
        gsm, urls = get_asjson(page=page, gsm=gsm, word=word)
        save_img(urls, img_os)
        gsm, urls = get_asjson(page=page + 1, gsm=gsm, word=word)
        save_img(urls, img_os)
以上三个方法均不需要对代码进行更改即可获取百度图片上的图片信息。