百度(baidu)、bing、sogo、360关键字 - 图片批量下载
2022-07-27
SHTL 博客:https://www.shtlls.ltd
说明:
注意一: 通过脚本下载的图片,仍会存在保存之后无法显示的情况,因为有的图片接口能访问,但是没有资源,只有一些提示资源不存在的文字说明,那么就自然无法通过图片的形式进行展示了。
注意二: 下载到本地的图片虽说都来自同一搜索引擎,却不一定是同一数据源:可能来自原平台(比如:搜狐新闻、小红书、微博等其他平台),也可能来自搜索引擎缓存之后的副本
百度除外:百度搜索目前查看到的几个 url 都是一致的
原平台 : 优点 => 数据质量更好、更清晰 缺点 => 容易出现原图url无法请求
缓存后的搜索引擎 : 优点 => 下载较稳定 缺点 => 画质较差
当前代码中下载的仍是原平台数据,确保质量!
- 百度(baidu)
# -*- encoding: utf-8 -*-
import requests
from lxml import etree
import os

# Number of result pages to fetch; Baidu's image API returns 30 thumbnails per request.
page = input('请输入要爬取多少页:')
page = int(page) + 1

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

# Starting number for the output file names (saved as <n>.jpg).
n = 1213
# 'pn' is the index of the first image of each request; Baidu loads 30 images per scroll.
pn = 1

# Make sure the output directory exists before writing any file.
os.makedirs('./images', exist_ok=True)

for m in range(1, page):
    url = 'https://image.baidu.com/search/acjson'
    param = {
        'tn': 'resultjson_com',
        'logid': '8846269338939606587',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': '真人玩手机侧面',
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '-1',
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': '真人玩手机侧面',
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'cg': '',
        'pn': pn,   # index of the first image of this page
        'rn': '30',
        'gsm': '1e',
    }
    page_text = requests.get(url=url, headers=header, params=param)
    page_text.encoding = 'utf-8'
    page_text = page_text.json()
    # The API appends a trailing entry without image data (the original code
    # removed it with `del info_list[-1]`, which crashes on an empty list);
    # keep only entries that actually carry a thumbnail URL instead.
    info_list = page_text.get('data', [])
    img_url_list = [i['thumbURL'] for i in info_list if 'thumbURL' in i]
    for img_url in img_url_list:
        try:
            img_data = requests.get(url=img_url, headers=header).content
        except Exception:
            # One unreachable thumbnail must not abort the whole crawl.
            print('本张图片获取异常,跳过... url: %s' % img_url)
            continue
        save_path = './images/' + str(n) + '.jpg'  # destination file
        with open(save_path, 'wb') as fp:
            fp.write(img_data)
        n = n + 1
    pn += 30
- bing
import os
import re
import socket
import time
# BUG FIX: `import urllib` alone does not bind the `request`/`parse`
# submodules in Python 3; import them explicitly.
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup

# Fail a stuck connection after at most 10s instead of hanging forever on a
# dead link.
socket.setdefaulttimeout(10)

header = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
}
# Bing's asynchronous image-list endpoint; format slots: {0}=keyword,
# {1}=index of the first thumbnail, {2}=page size, {3}=SFX page counter.
url = "https://cn.bing.com/images/async?q={0}&first={1}&count={2}&scenario=ImageBasicHover&datsrc=N_I&layout=ColumnBased&mmasync=1&dgState=c*9_y*2226s2180s2072s2043s2292s2295s2079s2203s2094_i*71_w*198&IG=0D6AD6CBAF43430EA716510A4754C951&SFX={3}&iid=images.5599"
def getImage(url, count, path):
    """Save the full-size image at *url* into *path*; GIFs are skipped.

    Returns the updated number of images saved so far (unchanged when the
    image is a GIF or the download fails).
    """
    # Skip animated GIFs. Strip any query string first so that
    # "foo.gif?x=1" is also recognised as a GIF (the original check missed it).
    if url.split('?')[0].split('.')[-1].lower() == 'gif':
        return count
    try:
        # Explicit timeout: requests does not honor socket.setdefaulttimeout.
        img_data = requests.get(url=url, timeout=10).content
        img_path = path + str(count + 1) + '.jpg'
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
    except Exception:
        # Back off briefly and skip; one bad URL must not stop the crawl.
        time.sleep(1)
        print("\033[0;31m本张图片获取异常,跳过...\turl: ", url, "\033[0m")
    else:
        print("图片+1,成功保存%4d张图\turl: %s" % (count + 1, url))
        count += 1
    return count
def findImgUrlFromHtml(html, rule, url, key, first, loadNum, sfx, count, path):
    """Extract original-image URLs from one thumbnail listing page and download them.

    Returns the updated total count of saved images.
    NOTE(review): reads the module-level `countNum` as the global download
    target — confirm it is defined before this is called.
    """
    soup = BeautifulSoup(html, "lxml")
    link_list = soup.find_all("a", class_="iusc")
    for link in link_list:
        # Stop once the requested number of images has been reached.
        if count == countNum:
            return count
        match = re.search(rule, str(link))
        if match is None:
            # Anchor without an embedded "murl" entry — nothing to download.
            continue
        # The match looks like '"murl":"http...'. Remove the HTML-escaped
        # "amp;" fragments (the original code documented this but never did
        # it) and drop the 8-character '"murl":"' prefix to get the bare URL.
        img_url = match.group(0).replace('amp;', '')[8:]
        # Download the full-resolution image.
        count = getImage(img_url, count, path)
    # Page finished; the caller loads the next page.
    return count
def getStartHtml(url, key, first, loadNum, sfx):
    """Request one thumbnail listing page and return the open HTTP response."""
    # BUG FIX: `import urllib` alone does not bind the `request` submodule in
    # Python 3; import it explicitly so this works regardless of import order.
    import urllib.request
    page = urllib.request.Request(url.format(key, first, loadNum, sfx),
                                  headers=header)
    html = urllib.request.urlopen(page)
    return html
if __name__ == '__main__':
    name = "二哈"              # search keyword
    path = './images/bing/'    # output directory
    countNum = 61              # total number of images to download
    # BUG FIX: `import urllib` alone does not bind the `parse` submodule in
    # Python 3; import it explicitly before using urllib.parse.quote.
    import urllib.parse
    key = urllib.parse.quote(name)
    first = 1                  # index of the first thumbnail of the next page
    loadNum = 35               # thumbnails requested per page
    sfx = 1                    # Bing's page counter
    count = 0                  # images saved so far
    # Matches the '"murl":"http...' JSON fragment holding the original image URL.
    rule = re.compile(r"\"murl\"\:\"http\S[^\"]+")
    if not os.path.exists(path):
        os.makedirs(path)
    while count < countNum:
        try:
            html = getStartHtml(url, key, first, loadNum, sfx)
            count = findImgUrlFromHtml(html, rule, url, key, first, loadNum, sfx,
                                       count, path)
            first = count + 1
            sfx += 1
        except Exception as e:
            # Keep crawling: a single failed page must not kill the whole run.
            print("\033[0;31m出现异常[不会影响主程序]:\t", e, "\033[0m")
- sogo
# -*- coding:utf-8 -*-
import requests
import socket
# Set a global connection timeout so a dead link fails after at most 10s
# instead of blocking the crawl indefinitely.
socket.setdefaulttimeout(10)
def sougou_pic_url(num, keyword, start, xml_len):
    """Download up to *num* non-GIF images for *keyword* from Sogou image search.

    num     -- total number of images to save
    keyword -- search term
    start   -- result offset of the first request; for a fixed keyword the
               result order is stable, so a later run can resume after
               previously downloaded results without duplicates
    xml_len -- page size of each API request
    """
    import os
    # Numeric part of the next output file name.
    name_num = 1
    # Output directory for the downloaded files.
    path = './images/sougo/'
    # Set once `num` images have been saved.
    over_flag = False
    # Running count of saved images.
    img_num = 1
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    url = 'https://pic.sogou.com/napi/pc/searchList'
    # Make sure the output directory exists before writing any image.
    os.makedirs(path, exist_ok=True)
    while True:
        if over_flag:
            return
        param = {
            'model': 1,
            'start': start,
            'xml_len': xml_len,
            'query': keyword
        }
        data = requests.get(url=url, headers=headers, params=param)
        data.encoding = 'utf-8'
        data = data.json()
        # (renamed from `list`, which shadowed the builtin)
        items = data['data']['items']
        for i in items:
            img_url = i['picUrl']
            # Skip GIFs so every saved file really is a still .jpg image.
            if img_url.split('.')[-1] == 'gif':
                continue
            try:
                img_data = requests.get(url=img_url).content
                img_path = path + str(name_num) + '.jpg'  # destination file
                with open(img_path, 'wb') as fp:
                    fp.write(img_data)
                print("当前数量: %4d , 下载成功 +1 , 下载url: %s " % (img_num, img_url))
                if num == img_num:
                    over_flag = True
                    break
                img_num += 1
                name_num += 1
            except Exception as e:
                print("\033[0;31m出现异常[不会影响主程序]:\t", e, "\033[0m")
        # BUG FIX: advance the offset; the original never did, so every outer
        # iteration re-requested and re-downloaded the very same result page.
        start += xml_len
if __name__ == '__main__':
    # Search term to crawl.
    keyword = '二哈'
    # How many images to download in total.
    num = 101
    # Offset of the first result. For a fixed keyword the result ordering is
    # stable, so after downloading e.g. the first 1000 results, a later run
    # can continue from this offset without duplicating them.
    start = 200
    # Results requested per page (defaults to 100 for now).
    xml_len = 100
    sougou_pic_url(num=num, keyword=keyword, start=start, xml_len=xml_len)
- 360
# -*- coding:utf-8 -*-
import requests
import socket
# Set a global connection timeout so a dead link fails after at most 10s
# instead of blocking the crawl indefinitely.
socket.setdefaulttimeout(10)
def pic_url(keyword, num, pn, path):
    """Download up to *num* non-GIF images for *keyword* from 360 image search.

    keyword -- search term (BUG FIX: previously ignored; the query was
               hard-coded to '二哈' regardless of this argument)
    num     -- total number of images to save
    pn      -- offset of the first image; 360 loads 60 images per scroll
    path    -- output directory, e.g. './images/360s/'
    """
    import os
    # Numeric part of the next output file name.
    name_num = 1
    # Set once `num` images have been saved.
    over_flag = False
    # Running count of saved images.
    img_num = 1
    # Images requested per page.
    page_num = 60
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    url = 'https://image.so.com/j'
    # Make sure the output directory exists before writing any image.
    os.makedirs(path, exist_ok=True)
    while True:
        param = {
            'callback': '',
            'q': keyword,        # BUG FIX: use the argument, not a hard-coded term
            'qtag': '',
            'pd': 1,
            'pn': pn,
            'correct': keyword,  # BUG FIX: keep consistent with 'q'
            'adstar': 0,
            'tab': 'all',
            # 'sid': '765dd007f64baa7510814ffd5acdf6a1',
            'ras': 6,
            'cn': 0,
            'gn': 0,
            'kn': 18,
            'crn': 0,
            'bxn': 20,
            'cuben': 0,
            'pornn': 0,
            'manun': 45,
            'src': 'srp',
            'sn': 113,
            'ps': 160,
            # Page size.
            'pc': page_num,
            # '_': 1658827127443
        }
        if over_flag:
            return
        data = requests.get(url=url, headers=headers, params=param)
        data.encoding = 'utf-8'
        data = data.json()
        # (renamed from `list`, which shadowed the builtin)
        items = data['list']
        for i in items:
            img_url = i['img']
            # Skip GIFs so every saved file really is a still .jpg image.
            if img_url.split('.')[-1] == 'gif':
                continue
            try:
                img_data = requests.get(url=img_url).content
                img_path = path + str(name_num) + '.jpg'  # destination file
                with open(img_path, 'wb') as fp:
                    fp.write(img_data)
                print("当前数量: %4d , 下载成功 +1 , 下载url: %s " % (img_num, img_url))
                if num == img_num:
                    over_flag = True
                    break
                img_num += 1
                name_num += 1
            except Exception as e:
                print("\033[0;31m出现异常[不会影响主程序]:\t", e, "\033[0m")
        # Move on to the next page.
        pn += page_num
if __name__ == '__main__':
    # Output directory for the downloaded files.
    path = './images/360s/'
    # Search term to crawl.
    keyword = '二哈'
    # How many images to download in total.
    num = 101
    # 'pn' is the offset of the first image; 360 shows 60 images per scroll.
    pn = 1
    pic_url(keyword=keyword, num=num, pn=pn, path=path)