1.参考文章
目标检测-爬虫-利用百度识图的方法来批量的爬取图片生产数据集
需求说明:根据上述代码进行了改进,原代码需要给定图片网页链接访问。因此,增加一个使用搜索好的页面直接下载的功能,实际会更方便一些。
搜索好后,需要确认一下搜到的图片是否满足要求,因为图搜图,相似图片不一定符合要求,可以减少数据收集好后筛选工作。
实现自动迭代:将搜索好的URL作为参数传入 search_similar_image 函数,并设置迭代停止条件即可。
2.项目准备
1. chromedriver安装
2. selenium == 3.141.0;urllib3 == 1.14.0。版本过高可能导致报错。
Python中无法使用Selenium,显示ValueError: Timeout value connect was ……, but it must be an int, float or None
3.代码实现
"""
Author: paradoxjun
time: 2024年4月24日17:34:46
desc: 爬取百度识别图片
ref: orangezs
"""
import os
import re
import cv2
import time
import requests
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Page-layout parameters (CSS selectors) that may need updating whenever
# Baidu changes its page source. Keys map a logical role to the selector
# used by search_similar_images().
modify_param = {
    "input_box": "input[placeholder='拖拽图片到此处或粘贴图片网址']",  # the "drag image here or paste image URL" text box
    "search_key": "#app > div > div.page-banner > div.page-search > div > span.graph-d20-search-btn",  # the "search by image" button
    "is_success": "div[class='graph-similar-list']",  # probe element: present only when the search returned results
    "all_image_area": "div[class='general-waterfall']",  # container holding all result images (expansion of the element above)
}
def search_similar_images(browser, image_url=None, max_page=10):
    """
    Collect similar-image URLs from Baidu reverse image search, in one of two ways:
    1. pass a seed image URL and let the script fill in the search box itself, or
    2. pass the URL of an already-performed search-result page and scrape it directly.

    :param browser: a started selenium webdriver instance (selenium 3.x API)
    :param image_url: seed image URL, or the URL of a search-result page
    :param max_page: number of times to scroll the result page to trigger dynamic loading
    :return: list of image URLs found on the page (empty list when the search fails)
    """
    print("start find similar image of {}".format(image_url))
    # Try to load the result page up to 3 times; give up afterwards.
    try_num = 0
    search_failed = True
    while search_failed and try_num < 3:
        try:
            # If no URL was given, or it is the Baidu image-search home page,
            # we must type the seed image URL into the search box ourselves.
            # NOTE(review): with image_url=None this path send_keys(None),
            # which selenium rejects — the retry loop then gives up; a real
            # seed URL is effectively required on this branch.
            if image_url is None or image_url == home_page:
                # Open the Baidu image-search home page.
                browser.get(home_page)
                # Text box: "drag an image here or paste an image URL".
                # This selector may break whenever Baidu updates the page.
                url_upload_textbox = browser.find_element_by_css_selector(modify_param["input_box"])
                url_upload_textbox.send_keys(image_url)
                # "Search by image" button; also subject to page-layout changes.
                search_image_button = browser.find_element_by_css_selector(modify_param["search_key"])
                search_image_button.click()
                # Wait for the search results to render.
                time.sleep(5)
            else:
                browser.get(image_url)
            # Probe for the result container; raises if the search did not succeed.
            browser.find_element_by_css_selector(modify_param["is_success"])
            print("Search similar images successfully.")
            # Reaching here means the result page loaded correctly.
            search_failed = False
        except Exception as e:
            print(f"ERROR: when request baidu image search. The error is: {e}")
        finally:
            try_num += 1
    if search_failed:
        print("give up current image")
        return []
    # Scroll the page max_page times so the waterfall layout loads more images.
    print("Dynamic loading web page...")
    for download_page in range(1, max_page + 1):
        # Scroll to the bottom to trigger loading of the next batch.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page 1 second to load (fixed stale comment that said 10 s).
        time.sleep(1)
        # Single-line progress echo.
        prompt = "Now get page: "
        print("\b" * (len(prompt) + 6), prompt, f"{download_page:04d}", end="", flush=True)
    print()
    # Parse all image URLs out of the result area (a waterfall of columns).
    graph_similar = browser.find_element_by_css_selector(modify_param["all_image_area"])
    columns = []
    for i in range(1, 10):
        # Per-column selector; this part of the layout has been stable so far.
        column_selector = f'div > div:nth-child({i})'
        try:
            columns.append(graph_similar.find_element_by_css_selector(column_selector))
        except Exception:
            # Column i does not exist, so exactly i-1 columns were found.
            print(f"Find {i - 1} column images.")
            break
    # Collect every image URL, column by column.
    url_list = []
    for column in columns:
        for img_box in column.find_elements_by_tag_name('a'):
            img_url = img_box.find_element_by_tag_name('img').get_attribute('src')
            # Skip lazily-loaded images whose src attribute is not populated yet
            # (bugfix: previously None could end up in the list and crash download).
            if img_url:
                url_list.append(img_url)
    # Fixed output typo: was "totally fing".
    print("totally find {} images.".format(len(url_list)))
    return url_list
def download_search_images(url_list, save_root, save_dir):
    """
    Download every image in url_list and save it as JPEG under save_root/save_dir.

    :param url_list: list of image URLs to download
    :param save_root: root directory for saved images (created if missing)
    :param save_dir: sub-directory of save_root to save into (created if missing)
    :return: None
    """
    print("start downloading...")
    # Number of images saved during this run.
    has_saved = 0
    # Create root and sub-directory in one call (bugfix: the previous two-step
    # os.mkdir failed when save_root itself had missing parent directories).
    save_path = os.path.join(save_root, save_dir)
    os.makedirs(save_path, exist_ok=True)
    for img_url in url_list:
        try:
            response = requests.get(img_url, timeout=5)
        except Exception as e:
            print(f"ERROR: download img timeout. Error: {e}")
            # Bugfix: without this `continue`, a failed request fell through and
            # re-used the previous iteration's response (NameError on the first one).
            continue
        try:
            # Decode the raw bytes with OpenCV.
            img_data_np = np.frombuffer(response.content, dtype='uint8')
            img = cv2.imdecode(img_data_np, cv2.IMREAD_UNCHANGED)
            if img is None:
                # Not decodable image data (e.g. an HTML error page) — skip it.
                print(f"ERROR: cannot decode image data from {img_url}")
                continue
            # Build the file name from the two numbers after "u=" in the URL;
            # the format group after "f=" is matched but unused, since everything
            # is re-encoded as .jpg. Example URL:
            # http://mms0.baidu.com/it/u=1708366022,3113282384&fm=253&app=138&f=JPEG?w=500&h=375
            match = re.search(r'u=(\d+),(\d+)&.*?f=([A-Za-z]+)', img_url)
            if match is None:
                # Bugfix: previously a non-matching URL raised AttributeError
                # that was silently reported as "corruption".
                print(f"ERROR: cannot parse image id from url: {img_url}")
                continue
            img_name = f"{match.group(1)}_{match.group(2)}.jpg"
            save_img = os.path.join(save_path, img_name)
            if not os.path.exists(save_img):
                # Save as maximum-quality JPEG.
                cv2.imwrite(save_img, img, [int(cv2.IMWRITE_JPEG_QUALITY), 100])
                print(f"Save the {has_saved:05d} image: {save_img}")
                has_saved += 1
            else:
                print(f"Image has exist: {img_name}")
        except Exception as e:
            # Keep going on any other per-image failure, but report the cause.
            print(f"ERROR: download img corruption. Error: {e}")
if __name__ == '__main__':
    # TODO: set parameters
    # Path to the chromedriver executable.
    chrome_driver_path = r'F:\Work_HW\chromedriver\chromedriver.exe'
    # Baidu reverse-image-search home page.
    # NOTE: search_similar_images() reads this name as a module global, so it
    # must be defined before that function is called (true when run as a script).
    home_page = 'https://graph.baidu.com/pcpage/index?tpl_from=pc'
    # Root directory where downloaded images are stored.
    save_root = r'./Dataset/2024-4-24'
    # Run Chrome headless so no browser window is shown.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # `executable_path` is the selenium 3.x API (version pinned in the notes above);
    # selenium 4 removed this keyword.
    browser = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
    browser.set_page_load_timeout(30)
    # URL of an already-performed Baidu image-search result page.
    image_url = 'https://graph.baidu.com/pcpage/similar?carousel=503&entrance=GENERAL&extUiData%5BisLogoShow%5D=1&image=http%3A%2F%2Fmms0.baidu.com%2Fit%2Fu%3D1098828969,2616143084%26fm%3D253%26app%3D138%26f%3DJPEG%3Fw%3D667%26h%3D500&index=0&inspire=general_pc&next=2&originSign=12690ff83641d32aaad9b01713864878&page=1&render_type=carousel&session_id=9381424309576681640&shituToken=e28738&sign=12690ff83641d32aaad9b01713864878&srcp=crs_pc_similar&tn=pc&tpl_from=pc'
    # Run the search, then download every image URL it found.
    url_list = search_similar_images(browser, image_url=image_url, max_page=10)
    download_search_images(url_list, save_root=save_root, save_dir='money_detector')