一、前提准备
1.下载selenium、requests库
pip install requests
pip install selenium
2.在自己目录下创建好dataset目录,并在其下再创建两个文件夹,分别命名为Image、HrefJson(否则需要自己改动里面的代码)
注意:使用selenium前需知道自己浏览器对应的版本驱动!!!(我使用的浏览器为谷歌浏览器,若不是谷歌,需自行修改代码)
二、代码
#导入资源
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time,re,json,os,threading,requests,numpy as np
"""
全局静态池
"""
#书法网站url name:{url:'',}
ConfigMap = {
'百度图片':{
'Search':{
'Connection_Url':'https://image.baidu.com/', #百度引擎的URL
'Kw':'书法画作品', #所要查询的内容
'Search_InputID':'kw', #查询的搜索框ID
'BtnClass':'s_newBtn', #查询按钮ClassName
'Count':2, #动态下滑次数
'Destion_Class':'//img[@class="main_img img-hover"]' #爬取的图片资源类名
},
'Thread':{
'ThreadID': 1,
'ThreadName':'task1',
},
'SaveFileName':'BaiduPhotoData',
'SRCList':[]
},
'必应图片':{
'Search':{
'Connection_Url':'https://cn.bing.com/images/search?q=书法画&first=1',
'Kw':'作品',
'Search_InputID':'sb_form_q',
'BtnClass':'b_searchboxSubmit',
'Count':2,
'Destion_Class':'//img[@class="mimg vimgld"]' #必应的书法画作品
},
'Thread': {
'ThreadID': 2,
'ThreadName': 'task2',
},
'SaveFileName': 'BiYingPhotoData',
'SRCList': []
}
}
def SearchPhotoMSGGet(threadname,Key,Connection_Url,Kw,Search_InputID,BtnClass,Count,Destion_Class):
    """Open a search engine, submit the keyword, scroll to load more results,
    and collect image src URLs into ConfigMap[Key]['SRCList'].

    threadname     -- label prefixed to progress messages
    Key            -- top-level key into the global ConfigMap
    Connection_Url -- search-engine landing page URL
    Kw             -- keyword typed into the search box
    Search_InputID -- DOM id of the search input element
    BtnClass       -- class name of the search submit button
    Count          -- number of scroll-to-bottom actions
    Destion_Class  -- XPath selecting the result <img> elements
    """
    web = LoadChormeWeb()
    web.get(Connection_Url)
    # Selenium 4 removed find_element_by_id / find_element_by_class_name;
    # use the By-based API (By is already imported and used below).
    search_input = web.find_element(By.ID, Search_InputID)
    search_input.send_keys(Kw)
    time.sleep(2)
    btn = web.find_element(By.CLASS_NAME, BtnClass)
    btn.click()
    time.sleep(2)
    # Scroll to the page bottom Count times so lazily-loaded images appear.
    # (The original used range(1, Count), scrolling only Count-1 times.)
    scroll_js = 'window.scrollTo(0,document.body.scrollHeight)'
    for _ in range(Count):
        web.execute_script(scroll_js)
        time.sleep(2)
    # Collect every candidate image src and keep only usable http(s) links.
    ImgList = web.find_elements(By.XPATH, Destion_Class)
    for Img in ImgList:
        src_value = Img.get_attribute('src')
        # NOTE(review): the original called AddressFormatHeader(), which is not
        # defined anywhere in this file (NameError at runtime). Filter out
        # None / data: URIs with a plain scheme check instead.
        if str(src_value).startswith('http'):
            ConfigMap[Key]['SRCList'].append(src_value)
    print(threadname+':'+'-'*15+"图片资源网址获取成功!"+'-'*15)
    web.quit()
    time.sleep(10)
# Save the collected image URLs to a JSON file under dataset/HrefJson/
def SavePhotoURLFile(threadname,filename,SRCList):
    """Persist SRCList as dataset/HrefJson/<filename>.json.

    threadname -- label prefixed to progress messages
    filename   -- output file stem (no extension)
    SRCList    -- list of image URLs collected by SearchPhotoMSGGet
    """
    # Document skeleton the URLs are stored under.
    SrcSaveMap = {
        'author': '柒七',
        'description': '书法画数据集',
        'keyword': '书法画作品',
        'data': []
    }
    # enumerate(..., 1): the original zip(range(1, len(SRCList)), SRCList)
    # silently dropped the last URL.
    for i, src in enumerate(SRCList, 1):
        SrcSaveMap['data'].append({'index': i, 'href': src})
    json_path = os.path.abspath('dataset/HrefJson/{}.json'.format(filename))
    # Missing/empty file: write the document as-is.
    if CheckIfFileEmpty(json_path):
        Write(SrcSaveMap, json_path, 'w')
    # Existing file: merge in only unseen hrefs and write the result back.
    # (The original indexed old_data['data']['href'] — a TypeError on a list —
    # and never saved the merged data.)
    else:
        old_data = Read(json_path, 'r')
        known = {entry.get('href') for entry in old_data.get('data', [])}
        for entry in SrcSaveMap['data']:
            if entry['href'] not in known:
                old_data['data'].append(entry)
                known.add(entry['href'])
        Write(old_data, json_path, 'w')
    print(threadname+':'+'-'*15,"图片资源文件已保存",'-'*15)
    time.sleep(10)
# Download every image referenced by the JSON files under folder_path
def DownloadPhoto(image_path,folder_path):
    """Download all images listed in the href JSON files.

    image_path  -- format string with one '{}' placeholder for a running index
    folder_path -- directory containing the JSON files written by SavePhotoURLFile
    """
    ImageIndex = 1
    for root,dirs,files in os.walk(folder_path):
        for file in files:
            # os.path.join(root, ...) instead of a hard-coded '\\' so the path
            # works outside Windows and inside subdirectories found by os.walk.
            data = Read(os.path.join(root, file), 'r')
            print('\n', '-' * 15, "开始下载!", '-' * 15)
            for Src in data['data']:
                # Best effort: a dead link or timeout skips one image
                # instead of aborting the whole run.
                try:
                    response = requests.get(Src['href'], timeout=30)
                    response.raise_for_status()
                except requests.RequestException as err:
                    print(f"图片 {Src['href']} 下载失败: {err}")
                    continue
                # 'with' closes the file; no explicit close() needed.
                with open(image_path.format(ImageIndex),'wb') as f:
                    f.write(response.content)
                print(f"图片 {Src['href']} 下载完成\n,保存为 {image_path.format(ImageIndex)}")
                ImageIndex += 1
                time.sleep(1)
"""
工具
"""
# Launch a Chrome browser session
def LoadChormeWeb():
    """Start a visible Chrome WebDriver with a 10-second implicit wait.

    NOTE(review): the chromedriver path is machine-specific — adjust it (or
    put chromedriver on PATH) before running on another machine.
    """
    options = Options()
    # The original added "--headless" and then set options.headless = False,
    # which removed the flag again — net effect is a visible browser, so the
    # contradictory pair is dropped here.
    # executable_path= is deprecated in Selenium 4 (use Service instead);
    # kept for compatibility with the driver setup this script was written for.
    web = Chrome(executable_path="D:/DevelopProgramming/002D-Tool/Anacode3/chromedriver.exe", options=options)
    web.implicitly_wait(10)
    return web
def CheckIfFileEmpty(filename):
    """Return True when filename is missing, zero bytes, or contains an
    empty JSON container; False when it already holds data."""
    if not os.path.isfile(filename) or os.stat(filename).st_size == 0:
        return True
    # Non-empty file on disk: treat an empty JSON container as "empty" too.
    # (Replaces the original's if/else returning literal True/False.)
    return len(Read(filename, 'r')) == 0
# Write data to a file as JSON
def Write(Data,Filename,mode):
    """Serialize Data to Filename as pretty-printed JSON (4-space indent).

    mode -- open() mode, e.g. 'w' to overwrite.
    """
    # 'with' already closes the file; the original's extra f.close() was redundant.
    with open(Filename, mode) as f:
        json.dump(Data, f, indent=4)
# Read JSON data from a file
def Read(Filename,mode):
    """Load and return the JSON document stored in Filename.

    mode -- open() mode, e.g. 'r'.
    """
    # 'with' already closes the file; the original's extra f.close() was redundant.
    with open(Filename, mode) as f:
        return json.load(f)
"""
线程池
"""
def asyn_method():
    """Run one crawl-then-save worker thread per configured site and wait
    for both to finish."""
    def make_worker(key):
        # Bind the config key into a closure so each thread crawls its own site.
        cfg = ConfigMap[key]
        label = cfg['Thread']['ThreadName']
        def worker():
            print(label + ':', '-' * 15, '开始工作!', '-' * 15)
            SearchPhotoMSGGet(label, key, **cfg['Search'])
            print(label + ':', '-' * 15, key + '爬取完成!', '-' * 15)
            SavePhotoURLFile(label, cfg['SaveFileName'], cfg['SRCList'])
        return worker

    # Build one daemon thread per site from the shared configuration.
    workers = [
        threading.Thread(target=make_worker(key), daemon=True,
                         name=ConfigMap[key]['Thread']['ThreadName'])
        for key in ('百度图片', '必应图片')
    ]
    try:
        for th in workers:
            th.start()
    except Exception as e:
        print("线程异常:", str(e))
    finally:
        # Block until every worker has finished before returning.
        for th in workers:
            th.join()
if __name__ == '__main__':
    # Phase 1: crawl both sites concurrently and save the URL lists as JSON.
    asyn_method()
    # Phase 2: download every image referenced by the saved JSON files.
    DownloadPhoto(os.path.abspath('dataset/Image/Painting{}.jpg'),os.path.abspath('dataset/HrefJson/'))
今天的内容就到这里,谢谢大家!!!!(注:可以继续深化下去,比如说对图片的处理)