python3-爬虫：08 爬取百度图片

最新推荐文章于 2023-05-15 11:05:18 发布

今天我牙疼

最新推荐文章于 2023-05-15 11:05:18 发布

阅读量837

点赞数

分类专栏：爬虫-Python3 文章标签： python 爬虫

本文链接：https://blog.csdn.net/qq_40924514/article/details/110232800

版权

爬虫-Python3 专栏收录该内容

15 篇文章 0 订阅

订阅专栏

爬取流程

1. 根据链接获取 JSON 信息
2. 根据返回的 JSON 信息提取出图片的 objURL
3. 对提取出的 objURL 进行解码转换
4. 根据 objURL 下载图片（默认 30 张，如有需要可自行更改）

from urllib.parse import urlencode
from urllib.request import urlretrieve
import requests
import re
import os


def get_json(keyword='火影', pn=30):
    """Fetch one page of Baidu image-search results as parsed JSON.

    Args:
        keyword: Search term. It is sent as both the ``queryWord`` and
            ``word`` query parameters and embedded in the ``Referer``
            header. Defaults to the term the script originally hard-coded.
        pn: Value of the ``pn`` query parameter. The original author's note
            says to change it in multiples of 30.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails or the
        response body is not valid JSON.
    """
    baseUrl = 'https://image.baidu.com/search/index?'
    headers = {
        'Host': 'image.baidu.com',
        # The Referer carries the same keyword, percent-encoded.
        'Referer': baseUrl + urlencode({'tn': 'baiduimage', 'word': keyword}),
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    params = {
        'tn': 'resultjson_com',
        'logid': '10981796017892247737',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '-1',
        'z': '',
        'ic': '0',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'pn': pn,
        'rn': '30',
        'gsm': '3c',
        # NOTE(review): looks like a captured timestamp used as a
        # cache-buster key -- confirm whether it needs refreshing.
        '1606372009928': '',
    }

    url = baseUrl + urlencode(params)
    print(url)
    try:
        # A timeout keeps the script from hanging forever on a stalled
        # connection.
        response = requests.get(
            url, headers=headers, allow_redirects=False, timeout=10)
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that is not JSON (for example a redirect response,
        # since redirects are not followed).
        print(e.args)
        return None

def get_objUrl(pageInfo):
    """Collect the ``objURL`` value of every entry under ``pageInfo['data']``.

    Each collected URL is also printed. Entries without a truthy ``objURL``
    are skipped.

    Args:
        pageInfo: Parsed search-result mapping, or ``None`` when fetching
            the JSON failed.

    Returns:
        A list of ``objURL`` strings in their original order. The list is
        empty (and a failure message is printed) when ``pageInfo`` is
        missing or has no ``data``.
    """
    data = pageInfo.get('data') if pageInfo else None
    if not data:
        print('json获取失败')
        # Return an empty list rather than None so callers can iterate the
        # result unconditionally.
        return []

    urls = []
    for info in data:
        url = info.get('objURL') if info else None
        if url:
            urls.append(url)
            print(url)
    return urls

# Multi-character tokens that stand in for URL punctuation; they are
# expanded before the per-character substitution runs.
_IMG_TOKENS = (
    ('_z2C$q', ':'),
    ('_z&e3B', '.'),
    ('AzdH3F', '/'),
)

# Single-character substitution table; any character without an entry is
# left unchanged by str.translate.
_IMG_CHAR_TABLE = str.maketrans({
    'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f',
    '2': 'g', 'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l',
    '4': 'm', 'g': 'n', '5': 'o', 'r': 'p', 'q': 'q', '6': 'r',
    'f': 's', 'p': 't', '7': 'u', 'e': 'v', 'o': 'w', '8': '1',
    'd': '2', 'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7',
    'b': '8', 'l': '9', 'a': '0',
})


def img_decode(url):
    """Decode an obfuscated ``objURL`` into a plain URL.

    Args:
        url: The encoded URL string, an already-plain URL, or ``None``.

    Returns:
        ``url`` unchanged when it is ``None`` or already starts with
        ``http``; otherwise the decoded string.
    """
    # Test the prefix rather than mere containment: an encoded URL can
    # contain the letters "http" (the encoding of "kiit") and must still be
    # decoded.
    if url is None or url.startswith('http'):
        return url

    for token, plain in _IMG_TOKENS:
        url = url.replace(token, plain)
    return url.translate(_IMG_CHAR_TABLE)

def download_img(url, count):
    """Download ``url`` and save it as ``img<count>.jpg`` in the current directory.

    Nothing is written when the server answers with a status other than
    200. Request and file errors are reported on stdout instead of being
    raised, so one bad URL does not abort a batch.

    Args:
        url: Address of the image to fetch.
        count: Number used to build the output file name.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return
        # Write the body already fetched instead of downloading the image a
        # second time.
        with open('img{}.jpg'.format(count), 'wb') as img_file:
            img_file.write(response.content)
    except (requests.RequestException, OSError) as e:
        # RequestException also covers malformed or missing URLs and
        # timeouts, not just connection failures.
        print('保存失败！ Error: ', e.args)

def main(urllist, count=0):
    """Decode and download every URL in ``urllist``.

    Args:
        urllist: Iterable of (possibly encoded) image URLs. ``None`` is
            treated as an empty list.
        count: Number given to the first downloaded file; each following
            file gets the next integer.
    """
    # Iterate the parameter itself; the original loop read a module-level
    # name and only worked when run as a script.
    for index, url in enumerate(urllist or [], start=count):
        download_img(img_decode(url), index)

# Script entry point: fetch the search-result JSON, pull out the objURL
# entries, then hand the list to main() for decoding and download.
if __name__ == '__main__':
	pageInfo = get_json()
	urlList = get_objUrl(pageInfo)
	main(urlList)

今天我牙疼

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
2
评论
python3-爬虫：08 爬取百度图片

爬取流程：根据链接获取json信息；根据返回的json信息提取出图片的objURL；对提取出的objURL解码转换；根据objURL下载图片（30张，如有需要可自行更改）。 from urllib.parse import urlencode / from urllib.request import urlretrieve / import requests / import re / import os / def get_json(): """根据链接获取json信息""" baseUrl = 'https:/…
复制链接

扫一扫