# ========== 百度 (Baidu image search scraper) ==========
import requests
import os
import re
import datetime
import hashlib
# Global registry of MD5 digests of every image saved so far; used to skip
# duplicate downloads across pages and across runs (seeded in __main__).
md5_list = []
def get_images_from_baidu(keyword, save_dir):
    """Download images for *keyword* from Baidu image search into *save_dir*.

    Pages through the ``acjson`` endpoint 30 results at a time, extracts the
    thumbnail URLs with a regex, deduplicates by URL (per run) and by MD5 of
    the image bytes (module-level ``md5_list``), and writes each image with a
    timestamp-based file name.

    Relies on module-level globals: ``md5_list`` (MD5 dedup registry) and
    ``total`` (target number of images, set in ``__main__``).
    """
    # Pretend to be a regular browser; Baidu rejects the default UA.
    # (Find a real UA via F12 -> Network -> Ctrl+R -> any request -> headers.)
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    # Endpoint queried page by page.
    url = 'https://image.baidu.com/search/acjson?'
    n = 0             # images saved this run
    total_urls = []   # URL-level dedup for this run
    global md5_list
    pag = 0
    while True:
        pn = pag * 30  # result offset: 30 items per page
        pag += 1
        # Request parameters for the acjson endpoint.
        param = {'tn': 'resultjson_com',
                 # 'logid': '7603311155072595725',
                 'ipn': 'rj',
                 'ct': 201326592,
                 'is': '',
                 'fp': 'result',
                 'queryWord': keyword,
                 'cl': 2,
                 'lm': -1,
                 'ie': 'utf-8',
                 'oe': 'utf-8',
                 'adpicid': '',
                 'st': -1,
                 'z': '',
                 'ic': '',
                 'hd': '',
                 'latest': '',
                 'copyright': '',
                 'word': keyword,
                 's': '',
                 'se': '',
                 'tab': '',
                 'width': '',
                 'height': '',
                 'face': 0,
                 'istype': 2,
                 'qc': '',
                 'nc': '1',
                 'fr': '',
                 'expermode': '',
                 'force': '',
                 'cg': '',      # undocumented but required
                 'pn': pn,      # offset: 30-60-90...
                 'rn': '30',    # 30 results per page
                 'gsm': '1e',
                 '1618827096642': ''
                 }
        request = requests.get(url=url, headers=header, params=param)
        if request.status_code != 200:
            print("请求失败")
            continue
        request.encoding = 'utf-8'
        html = request.text
        # Extract thumbnail URLs with a regex instead of parsing the JSON.
        image_url_list = list(set(re.findall('"thumbURL":"(.*?)",', html, re.S)))
        # BUG FIX: stop once Baidu stops returning results; the original
        # version looped forever requesting empty pages.
        if not image_url_list:
            print("没有更多结果,退出")
            return
        for image_url in image_url_list:
            if image_url in total_urls:
                continue
            # BUG FIX: record the URL immediately so that MD5-duplicate
            # images are not re-downloaded when the URL reappears.
            total_urls.append(image_url)
            image_data = requests.get(url=image_url, headers=header).content
            md5 = hashlib.md5(image_data).hexdigest()
            if md5 in md5_list:
                print("重复数据,跳过")
                continue
            md5_list.append(md5)
            # Timestamp (with microseconds) makes a unique file name.
            now_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
            img_name = now_time + '.jpg'
            with open(os.path.join(save_dir, img_name), 'wb') as fp:
                fp.write(image_data)
            n += 1
            print(f"已获取:{n}/{total}")
            # BUG FIX: `>=` so exactly `total` images are fetched
            # (the original `>` fetched total + 1).
            if n >= total:
                print("数量达到,退出")
                return
if __name__ == '__main__':
    keyword = '猫'
    save_dir = os.path.join(r"D:\data\gecaoji\pachong\baidu", keyword)
    os.makedirs(save_dir, exist_ok=True)
    # Seed the duplicate filter with MD5s of images already on disk so a
    # re-run does not save the same pictures again.
    for existing_name in os.listdir(save_dir):
        existing_path = os.path.join(save_dir, existing_name)
        with open(existing_path, 'rb') as f:
            md5_list.append(hashlib.md5(f.read()).hexdigest())
    total = 3000  # target number of images, read as a global by the crawler
    get_images_from_baidu(keyword, save_dir)
    print('Get images finished.')
# ========== Bing 搜索 (Bing image search scraper) ==========
"""功能:通过爬虫快速获取图片"""
import os
import urllib
import requests
import re
from bs4 import BeautifulSoup
import datetime
import hashlib
# Browser-like User-Agent; Bing serves different markup to unknown clients.
header = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
}
# Target URL template: {0}=query, {1}=result offset, {2}=page size, {3}=SFX page index.
url = "https://cn.bing.com/images/async?q={0}&first={1}&count={2}&scenario=ImageBasicHover&datsrc=N_I&layout=ColumnBased&mmasync=1&dgState=c*9_y*2226s2180s2072s2043s2292s2295s2079s2203s2094_i*71_w*198&IG=0D6AD6CBAF43430EA716510A4754C951&SFX={3}&iid=images.5599"
proxy_host = "127.0.0.1:7890"
# Proxy configuration — assumes a local forward proxy on port 7890; confirm.
proxy_config = {
    'http': f'http://{proxy_host}',
    'https': f'http://{proxy_host}',
}
# URLs already handled this run (URL-level dedup).
urls = []
# MD5 digests of saved images (byte-level dedup, seeded from disk in __main__).
md5_list = []
def getImage(url):
    """Download one full-resolution image *url* and save it under ``save_path``.

    Skips images whose MD5 is already in the global ``md5_list`` (duplicate
    detection across runs).  Increments the global ``count`` on success.

    Relies on module-level globals: ``save_path``, ``count``, ``countNum``,
    ``proxy_config``, ``md5_list``.
    """
    global count
    global md5_list
    try:
        # Timestamp (with microseconds) makes a unique file name.
        now_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
        img_name = now_time + '.jpg'
        # BUG FIX: add a timeout so one dead host cannot hang the crawl forever.
        response = requests.get(url, proxies=proxy_config, timeout=15)
        # BUG FIX: explicitly bail out on non-200 instead of silently falling
        # through to the end of the function.
        if response.status_code != 200:
            return
        img_data = response.content
        img_md5 = hashlib.md5(img_data).hexdigest()
        if img_md5 in md5_list:
            print("重复数据,跳过")
            return
        md5_list.append(img_md5)
        with open(os.path.join(save_path, img_name), 'wb') as f:
            f.write(img_data)
        count += 1
        print(f"已获取:{count}/{countNum}")
    except Exception as e:
        # Best effort: one bad image must not stop the crawl, but do report
        # what went wrong (the original hid the exception entirely).
        print(f"本张图片获取异常,跳过... ({e})")
def findImgUrlFromHtml(html, rule):
    '''Extract original-image URLs from a thumbnail listing page and fetch each.

    Every ``<a class="iusc">`` anchor embeds a ``"murl":"<url>"`` fragment;
    *rule* matches it, the prefix is stripped, and the bare URL is recorded
    in the global ``urls`` list and handed to ``getImage``.
    '''
    global urls
    soup = BeautifulSoup(html, "lxml")
    for anchor in soup.find_all("a", class_="iusc"):
        match = re.search(rule, str(anchor))
        if match is None:
            continue
        # Drop the leading '"murl":"' (8 characters) to get the bare URL.
        found_url = match.group(0)[8:]
        urls.append(found_url)
        # Fetch the full-resolution image right away.
        getImage(found_url)
def getStartHtml(url, key, first, loadNum, sfx):
    '''Request one page of the thumbnail listing and return the open response.'''
    page_url = url.format(key, first, loadNum, sfx)
    request = urllib.request.Request(page_url, headers=header)
    return urllib.request.urlopen(request)
if __name__ == '__main__':
    name = "hedgehog"  # search keyword
    save_path = os.path.join(r'D:\data\gecaoji\pachong\bing', name)
    os.makedirs(save_path, exist_ok=True)
    # Seed the duplicate filter with MD5s of images already downloaded.
    exit_files = os.listdir(save_path)
    for exit_file in exit_files:
        exit_file_path = os.path.join(save_path, exit_file)
        with open(exit_file_path, 'rb') as f:
            img_data = f.read()
        md5 = hashlib.md5(img_data).hexdigest()
        md5_list.append(md5)
    countNum = 3000                     # target number of images
    key = urllib.parse.quote(name)      # URL-encode the keyword
    first = 1                           # result offset, advanced per page
    loadNum = 35                        # results per listing page
    sfx = 1
    count = 0                           # saved-image counter (bumped in getImage)
    rule = re.compile(r"\"murl\"\:\"http\S[^\"]+")
    # NOTE: the original re-checked os.path.exists(save_path) and called
    # makedirs again here — dead code, removed (makedirs above is exist_ok).
    while count < countNum:
        html = getStartHtml(url, key, first, loadNum, sfx)
        findImgUrlFromHtml(html, rule)
        first += loadNum
# ========== 谷歌 (Google image search scraper) ==========
# date:2020.5.25
# author:pmy
# aim:爬取google图片
# 问题在于,不能保证所爬为所见
import selenium.webdriver.common.by
from selenium import webdriver
import os
import requests
import base64
import datetime
import hashlib
# Change `keyword` to alter the search term (the save directory follows it).
keyword = '刺猬'
save_path = r'D:\data\gecaoji\pachong\google'
picpath = os.path.join(save_path, keyword)
os.makedirs(picpath, exist_ok=True)
# url = 'https://www.google.com.hk/search?q=' + keyword + '&source=lnms&tbm=isch'
url = "https://www.google.com.hk/search?q=" + keyword + "&tbm=isch&ved=2ahUKEwiqks_KsfyCAxWaSGwGHUsPDT8Q2-cCegQIABAA&oq=hedgehog&gs_lcp=CgNpbWcQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQ6BAgjECdQzApYzApgtRBoAHAAeACAAfwBiAGQA5IBBTAuMS4xmAEAoAEBqgELZ3dzLXdpei1pbWfAAQE&sclient=img&ei=fjxxZarbKJqRseMPy560-AM&bih=931&biw=1920&hl=zh-CN"
num = 600   # target number of images to download
end = False
proxy_host = "127.0.0.1:7890"
# Proxy configuration — assumes a local forward proxy on port 7890; confirm.
proxy_config = {
    'http': f'http://{proxy_host}',
    'https': f'http://{proxy_host}',
}
count = 0  # running number of saved images
md5_list = []
# Seed the duplicate filter with MD5s of images already on disk.
exit_img_files = os.listdir(picpath)
for exit_img_file in exit_img_files:
    exit_file_Path = os.path.join(picpath, exit_img_file)
    with open(exit_file_Path, "rb") as f:
        img_data = f.read()
    md5 = hashlib.md5(img_data).hexdigest()
    md5_list.append(md5)
class Crawler_google_images:
    """Scrape Google Images with Selenium by scrolling the results page.

    Caveat from the original author: what gets crawled is what the browser
    window renders, so the harvest may not match exactly what a user sees.

    Relies on module-level globals: ``url``, ``num``, ``picpath``, ``count``,
    ``md5_list``, ``proxy_config``.
    """

    def __init__(self):
        self.url = url

    def init_browser(self):
        """Start Chrome, open the search URL, and maximize the window."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-infobars")
        browser = webdriver.Chrome(options=chrome_options)
        browser.get(self.url)
        # Maximize so as many thumbnails as possible are rendered at once.
        browser.maximize_window()
        return browser

    def download_images(self, browser, num=100):
        """Scroll the results page, open each thumbnail, and save large images.

        Returns once the global ``count`` reaches *num*.
        """
        if not os.path.exists(picpath):
            os.makedirs(picpath)
        pos = 0
        seen_srcs = []  # src values already handled this run (URL-level dedup)
        while count < num:
            # Scroll down to trigger lazy-loading of more thumbnails.
            js = 'var q=document.documentElement.scrollTop=' + str(pos)
            pos += 500
            browser.execute_script(js)
            # Thumbnail anchors; clicking opens the large-image panel.
            img_elements = browser.find_elements(
                selenium.webdriver.common.by.By.XPATH,
                value='//a[@class="FRuiCf islib nfEiy"]')
            for img_element in img_elements:
                # Stale/obscured elements raise on click — skip them.
                try:
                    img_element.click()
                except Exception:
                    continue
                # Several <img> nodes match; filter per-src below.
                candidates = browser.find_elements(
                    selenium.webdriver.common.by.By.XPATH,
                    value='//img[@class="rg_i Q4LuWd"]')
                for candidate in candidates:
                    try:
                        src = candidate.get_attribute('src')
                    except Exception:
                        continue
                    if src is None:
                        continue
                    # Keep only inline data URIs and real https URLs;
                    # everything else is noise.
                    if src.startswith('data:image') or src.startswith('https://'):
                        if src not in seen_srcs:
                            seen_srcs.append(src)
                            try:
                                self.save_img(src, picpath)
                            except Exception:
                                print(f"get {src} failed")
                            if count >= num:
                                print(f"数量达到,结束搜索")
                                return
                    else:
                        print(src)

    def save_img(self, img_src, picpath):
        """Obtain the bytes of *img_src* and write them into *picpath*.

        Handles both inline ``data:image`` base64 URIs and https URLs.
        Images whose MD5 is already in ``md5_list`` are skipped.
        Increments the global ``count`` on success.
        """
        global md5_list
        global count
        # DRY FIX: the original duplicated the MD5-check/write logic verbatim
        # in both branches; only the byte acquisition differs.
        if img_src.startswith("data:image"):
            # Inline thumbnail: the base64 payload follows the first comma.
            img_data = base64.b64decode(img_src.split(',')[-1])
        else:
            r = requests.get(img_src, proxies=proxy_config)
            if r.status_code != 200:
                return
            img_data = r.content
        md5 = hashlib.md5(img_data).hexdigest()
        if md5 in md5_list:
            print("重复数据,跳过")
            return
        md5_list.append(md5)
        # Timestamp (with microseconds) makes a unique file name.
        now_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
        filename = os.path.join(picpath, now_time + '.jpg')
        with open(filename, 'wb') as f:
            f.write(img_data)
        count += 1
        print(f"已获取:{count}/{num}")

    def run(self):
        """Launch the browser, download up to ``num`` images, then close."""
        # BUG FIX: removed the redundant self.__init__() call — the instance
        # was already initialized by the constructor.
        browser = self.init_browser()
        self.download_images(browser, num)  # target count from module global
        browser.close()
        print("############爬取完成")
if __name__ == '__main__':
    # Build the crawler and run the full scrape end to end.
    crawler = Crawler_google_images()
    crawler.run()