Batch-scraping image data from Baidu Images with Python

You can copy and paste these scripts directly; no parameters need to be changed. Thanks!
First script:

import requests
import os
import urllib
 
class Spider_baidu_image():
    def __init__(self):
        self.url = 'http://image.baidu.com/search/acjson?'
        # Keep the User-Agent on one line: the original backslash continuation
        # inside the string literal leaked the indentation into the header value.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
        self.headers_image = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
            'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1557124645631_R&pv=&ic=&nc=1&z=&hd=1&latest=0&copyright=0&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word=%E8%83%A1%E6%AD%8C'}
        self.keyword = input("Enter the image search keyword: ")
        self.paginator = int(input("Enter the number of pages to fetch (30 images per page): "))
    def get_param(self):
        """
        Build the query string for each results page and return them as a list.
        :return: list of query strings
        """
        keyword = urllib.parse.quote(self.keyword)
        params = []
        for i in range(1, self.paginator + 1):
            # pn is the offset of the first result; use 30 * (i - 1) so that
            # page 1 starts at offset 0 instead of skipping the first 30 images
            params.append('tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=1&latest=0&copyright=0&word={}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&cg=star&pn={}&rn=30&gsm=78&1557125391211='.format(keyword, keyword, 30 * (i - 1)))
        return params
 
    def get_urls(self, params):
        """
        Join the base URL with each query string and return the full request URLs.
        :return: list of URLs
        """
        urls = []
        for i in params:
            urls.append(self.url + i)
        return urls
    def get_image_url(self, urls):
        """
        Request each results page and collect the thumbnail URL of every image.
        :return: list of image URLs
        """
        image_url = []
        for url in urls:
            json_data = requests.get(url, headers=self.headers).json()
            json_data = json_data.get('data') or []  # 'data' can be missing on bad pages
            for i in json_data:
                if i:  # the last element of 'data' is an empty dict
                    image_url.append(i.get('thumbURL'))
        return image_url
    def get_image(self, image_url):
        """
        Create a folder named after the keyword in the current directory,
        then save every image into it.
        :param image_url: list of image URLs
        :return:
        """
        cwd = os.getcwd()
        file_name = os.path.join(cwd, self.keyword)
        if not os.path.exists(file_name):
            os.mkdir(file_name)
        for index, url in enumerate(image_url, start=1):
            with open(os.path.join(file_name, '{}.jpg'.format(index)), 'wb') as f:
                f.write(requests.get(url, headers=self.headers_image).content)
            if index % 30 == 0:
                print('{}: page {} downloaded'.format(self.keyword, index // 30))
    def __call__(self, *args, **kwargs):
        params = self.get_param()
        urls = self.get_urls(params)
        image_url = self.get_image_url(urls)
        self.get_image(image_url)
 
if __name__ == '__main__':
    spider = Spider_baidu_image()
    spider()
 

Result screenshot: [image omitted]
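
One caveat worth knowing before running the script: the acjson endpoint occasionally returns text that is not strictly valid JSON (unescaped \' sequences are the usual culprit), which makes the .json() call raise. Below is a minimal sketch of a more tolerant fetch; fetch_json is a hypothetical helper name, not part of the script above.

import json
import requests

def fetch_json(url, headers):
    # Fetch one acjson page and parse it defensively: Baidu sometimes emits
    # \' escape sequences that strict JSON parsing rejects, so strip them
    # before decoding, and return an empty dict for pages that still fail.
    text = requests.get(url, headers=headers, timeout=10).text
    text = text.replace(r"\'", "'")
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return {}

You could swap this in for the requests.get(url, headers=self.headers).json() call in get_image_url and then treat an empty dict as a page to skip.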

Second script: by default this one searches a fixed list of keywords: "黑烟", "白烟", "青烟", "蓝烟", "黄烟", "黄色火苗", "蓝色火苗", "红色火苗" (black, white, gray-blue, blue and yellow smoke, plus yellow, blue and red flames). You can change the set freely: only the keyword list needs editing, the rest of the code stays as-is.

"""黑烟"""

import requests, json, os
from jsonpath import jsonpath


url = "https://image.baidu.com/search/acjson?"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}

keyword = ["黑烟", "白烟", "青烟", "蓝烟", "黄烟", "黄色火苗", "蓝色火苗", "红色火苗"]
for word in keyword:
    print(word + "......")
    if not os.path.exists(f"./{word}"):
        os.mkdir(f"./{word}")

    # Keep the counter outside the page loop so that later pages do not
    # overwrite 1.jpg ... 30.jpg written by earlier ones.
    n = 0
    for i in range(1, 101):  # 30 * 100 = up to 3000 images per keyword
        params = {
            'tn': 'resultjson_com',
            'logid': '10502599365288196568',
            'ipn': 'rj',
            'ct': '201326592',
            'is': '',
            'fp': 'result',
            'fr': '',
            'word': word,
            'queryWord': word,
            'cl': '2',
            'lm': '-1',
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': '-1',
            'z': '',
            'ic': '0',
            'hd': '',
            'latest': '',
            'copyright': '',
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': '0',
            'istype': '2',
            'qc': '',
            'nc': '1',
            'expermode': '',
            'nojc': '',
            'isAsync': '',
            'pn': 30 * (i - 1),  # offset of the first result; page 1 starts at 0
            'rn': '30',
            'gsm': '',
        }
        res_text = requests.get(url, headers=headers, params=params).text
        res_dic = json.loads(res_text)
        hoverurls = jsonpath(res_dic, '$..hoverURL')  # all image URLs on this page
        if not hoverurls:  # jsonpath returns False when nothing matches
            continue

        for img_url in hoverurls:
            n += 1
            res_content = requests.get(img_url, headers=headers).content
            filename = f"./{word}/" + str(n) + '.jpg'
            with open(filename, 'wb') as f:
                f.write(res_content)

            print(str(n) + '.jpg')
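
At up to 3000 downloads per keyword this loop hits Baidu fairly hard, and the same hoverURL can show up on more than one page. Below is an optional sketch of a politer download helper; the name polite_get and the 0.5-second delay are my own choices, not part of the script above.

import time
import requests

seen = set()  # hoverURLs already downloaded in this run

def polite_get(img_url, headers, delay=0.5):
    # Skip URLs we have already fetched and pause briefly between requests
    # to reduce the chance of being throttled; returns None for duplicates.
    if img_url in seen:
        return None
    seen.add(img_url)
    time.sleep(delay)
    return requests.get(img_url, headers=headers, timeout=10).content

Inside the inner loop you would call polite_get(img_url, headers) and only write the file when it returns bytes.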


Third script: although it is the shortest of the three, I think it is the best one. The downloaded images are saved automatically into a folder under the current directory.

import requests, re, time, os

def get_asjson(page, gsm, word):
    # Fetch one acjson results page; return the gsm token for the next page
    # together with the image URLs found on this one.
    url = f"https://image.baidu.com/search/acjson?tn=resultjson_com&logid=9123806616981181340&ipn=rj&ct=201326592&is=&fp=result&fr=&word={word}&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={str(30 * int(page))}&rn=30&gsm={gsm}&{str(int(time.time() * 1000))}="
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1637758492843_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&dyTabStr=MCwzLDYsMiw0LDEsNSw4LDcsOQ%3D%3D&ie=utf-8&sid=&word=hello',
        'Cookie': 'BDqhfp=hello%26%26-10-1undefined%26%2628989%26%2635; BAIDUID=0C2336F5F3D356371C46DF079632E0C8:FG=1; BAIDUID_BFESS=0C2336F5F3D356371C46DF079632E0C8:FG=1; BIDUPSID=0C2336F5F3D356371C46DF079632E0C8; __yjs_duid=1_32693704d239fea9266064fc8a3d25631637737833661; PSTM=1637737880; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; userFrom=null; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; delPer=0; PSINO=6; __yjs_st=2_ZGU4ODA5ZTdmNzczMzgxNzRiZWZhNTdkODVkY2E5MzQ3NzM3Nzc2MzZlNjYzZmRiMWVjOTlmNWQzZDA3NWY1MzM2M2NkNjNmMjMzZWVlYzQxNGQ2ODIzYjlkNTdhYTUyZjdhNWQwNjQxZWE1YTI0MWZiNzQ1NTE0N2NlNTgwNjZjODlkNWVlZWI2ZDBkNjUzNmNiZDE3NzUyYTA4ZjkxYjI1NzNhODBjOGZhZTBmMzZkY2IwOWJmNjMxNjEzNmUxYjQxZmZhM2M1ODUzYTFkNTM4NTE5MzZjZjRkODliMTE1MmRmMDY1MjI4OGJiM2I3ZGMzMDdiNjI4MWE3NDgxZV83XzQyODU3N2M0; H_PS_PSSID=35295_34446_35104_31254_35237_35049_34584_34505_35245_34578_34872_26350_35210_35145_22160; indexPageSugList=%5B%22hello%22%2C%22bello%22%2C%22hello%20%22%5D; cleanHistoryStatus=0; ab_sr=1.0.1_MTJmNTIwNGNlNmI5NDg2YmZiZTI1OTM1MGZhNTJhZTZlMzVmODE2NmEwZjg5MjNlZWZjZWY1YTY3ZjQ2Yzc2MWZiNGRlODY2ZDJjOGE3N2RhMzg2NjcxZjEzY2ZiMDQ4ODNjYzgyZTZlNWM2NGQ4YjlhMzBlMWE1ZjU0ZTY2NzAxYmM0ZGRkOTM0MGI3NzUwOWZjODY2ODE5NmU1N2E1Yw=='
    }
    response = requests.get(url=url, headers=headers).text
    gsm_list = re.findall('"gsm":"(.*?)",', response)
    gsm = gsm_list[0] if gsm_list else gsm  # keep the old token if none is found
    data = re.findall('"hoverURL":"(.*?)",', response)
    return gsm, data

def save_img(imgurl_list, img_os):
    for i in imgurl_list:
        try:
            response = requests.get(url=i).content
        except requests.RequestException:
            print("download failed: " + i)
        else:
            # Name the file from a fixed slice of the URL; this relies on
            # Baidu's URL layout and can occasionally collide.
            img_name = i[28:36]
            with open(img_os + img_name + ".jpg", "wb") as file:
                file.write(response)
            print(i + " OK !!!")

if __name__ == "__main__":
    gsm = "1e"
    word = "蓝烟"  # change this to the keyword you want to scrape
    img_os = word + "_img\\"
    if not os.path.exists(img_os):
        os.mkdir(img_os)
    # Pages must be fetched in order: each response carries the gsm token
    # that the next page's request needs.
    for page in range(1, 103):
        gsm, data = get_asjson(page=page, gsm=gsm, word=word)
        save_img(data, img_os)
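
Since the sequential image downloads are the slow part, a thread pool can speed each page up considerably. A minimal sketch, reusing the same slice-based naming as save_img above; save_one and save_img_parallel are hypothetical names, not part of the script.

from concurrent.futures import ThreadPoolExecutor
import requests

def save_one(img_url, img_os):
    # Download a single image; mirrors the body of save_img for one URL.
    try:
        content = requests.get(url=img_url, timeout=10).content
    except requests.RequestException:
        print("download failed: " + img_url)
        return
    with open(img_os + img_url[28:36] + ".jpg", "wb") as file:
        file.write(content)

def save_img_parallel(imgurl_list, img_os, workers=8):
    # Fan the per-page downloads out over a small thread pool.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for u in imgurl_list:
            pool.submit(save_one, u, img_os)

Swapping save_img(data, img_os) for save_img_parallel(data, img_os) in the main loop is enough; the page requests themselves still have to stay sequential because of the gsm handshake.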

All three of the approaches above fetch image data from Baidu Images without requiring any changes to the code itself.
