基于 Selenium 爬取百度和必应图片

一、前提准备

        1.下载selenium、requests库

pip install requests
pip install selenium

        2.在自己目录下创建好dataset目录,并在其下再创建两个文件夹,分别命名为Image、HrefJson (否则需要自己改动里面的代码)

 注意:使用selenium前需知道自己浏览器对应的版本驱动!!!(我使用的浏览器为谷歌浏览器,若不是谷歌,需自行修改代码)

 二、代码

#导入资源
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time,re,json,os,threading,requests,numpy as np

"""
    全局静态池
"""
# Per-engine crawl configuration, keyed by engine display name.
# Each entry bundles the search parameters ('Search'), the thread identity
# used for logging ('Thread'), the JSON output file stem ('SaveFileName'),
# and a list filled at runtime with collected image URLs ('SRCList').
ConfigMap = {
    '百度图片':{
        'Search':{
            'Connection_Url':'https://image.baidu.com/', # Baidu image-search landing URL
            'Kw':'书法画作品',   # search keyword (calligraphy-painting works)
            'Search_InputID':'kw', # id attribute of the search input box
            'BtnClass':'s_newBtn',  # class name of the search button
            'Count':2,  # number of scroll-to-bottom passes (triggers lazy loading)
            'Destion_Class':'//img[@class="main_img img-hover"]'   # XPath of result <img> elements
        },
        'Thread':{
            'ThreadID': 1,
            'ThreadName':'task1',
        },
        'SaveFileName':'BaiduPhotoData',
        'SRCList':[]
    },
    '必应图片':{
        'Search':{
            'Connection_Url':'https://cn.bing.com/images/search?q=书法画&first=1', # Bing image-search URL (query pre-filled)
            'Kw':'作品',  # extra keyword typed into the search box
            'Search_InputID':'sb_form_q', # id attribute of Bing's search input box
            'BtnClass':'b_searchboxSubmit', # class name of Bing's search button
            'Count':2,  # number of scroll-to-bottom passes
            'Destion_Class':'//img[@class="mimg vimgld"]' # XPath of Bing result <img> elements
        },
        'Thread': {
            'ThreadID': 2,
            'ThreadName': 'task2',
        },
        'SaveFileName': 'BiYingPhotoData',
        'SRCList': []
    }

}
def SearchPhotoMSGGet(threadname,Key,Connection_Url,Kw,Search_InputID,BtnClass,Count,Destion_Class):
    """Search one image engine and collect image URLs into ConfigMap[Key]['SRCList'].

    threadname     -- label used as a print prefix
    Key            -- ConfigMap key ('百度图片' / '必应图片') whose SRCList receives results
    Connection_Url -- engine landing URL
    Kw             -- keyword typed into the search box
    Search_InputID -- id attribute of the search input
    BtnClass       -- class name of the search/submit button
    Count          -- how many scroll-to-bottom passes to run (lazy loading)
    Destion_Class  -- XPath selecting the result <img> elements
    """
    web = LoadChormeWeb()
    web.get(Connection_Url)
    # Selenium 4 removed find_element_by_id / find_element_by_class_name;
    # the By-based API below works on both Selenium 3.141+ and 4.
    search_input = web.find_element(By.ID, Search_InputID)
    search_input.send_keys(Kw)
    time.sleep(2)
    btn = web.find_element(By.CLASS_NAME, BtnClass)
    btn.click()
    time.sleep(2)

    # Scroll Count times; the original range(1, Count) scrolled one time too few.
    scroll_js = 'window.scrollTo(0,document.body.scrollHeight)'
    for _ in range(Count):
        web.execute_script(scroll_js)
        time.sleep(2)

    # Collect the src of every matched <img>.
    ImgList = web.find_elements(By.XPATH, Destion_Class)
    for Img in ImgList:
        src_value = Img.get_attribute('src')
        # The original called AddressFormatHeader(), which is not defined
        # anywhere in this file (NameError at runtime).  Filter inline:
        # keep only real http(s) URLs, dropping None and data: URIs.
        if src_value and str(src_value).startswith(('http://', 'https://')):
            ConfigMap[Key]['SRCList'].append(src_value)
    print(threadname+':'+'-'*15+"图片资源网址获取成功!"+'-'*15)
    web.quit()
    time.sleep(10)
#把图片资源网址保存成文件
def SavePhotoURLFile(threadname,filename,SRCList):
    """Persist collected image URLs to dataset/HrefJson/<filename>.json.

    Creates the file when missing/empty; otherwise merges, appending only
    URLs that are not already stored.

    threadname -- label used as a print prefix
    filename   -- output file stem (no extension)
    SRCList    -- list of image URL strings to save
    """
    SrcSaveMap = {
        'author': '柒七',
        'description': '书法画数据集',
        'keyword': '书法画作品',
        'data': []
    }
    # enumerate keeps every URL; the original zip(range(1, len(SRCList)), ...)
    # silently dropped the last element.
    for i, src in enumerate(SRCList, 1):
        SrcSaveMap['data'].append({'index': i, 'href': src})

    path = os.path.abspath('dataset/HrefJson/{}.json'.format(filename))
    if CheckIfFileEmpty(path):
        # Missing or empty file: write the fresh payload.
        Write(SrcSaveMap, path, 'w')
    else:
        # Merge with the existing file.  The original indexed the 'data'
        # list with a string key (TypeError), tested membership against the
        # top-level dict, and never wrote the merged result back.
        old_data = Read(path, 'r')
        known = {entry.get('href') for entry in old_data.get('data', [])}
        for entry in SrcSaveMap['data']:
            if entry['href'] not in known:
                old_data['data'].append(entry)
                known.add(entry['href'])
        Write(old_data, path, 'w')
    print(threadname+':'+'-'*15,"图片资源文件已保存",'-'*15)
    time.sleep(10)
# 下载图片
def DownloadPhoto(image_path,folder_path):
    """Download every image referenced by the JSON files under folder_path.

    image_path  -- format string with one '{}' slot for a running index,
                   e.g. '.../Painting{}.jpg'
    folder_path -- directory whose JSON files (written by SavePhotoURLFile)
                   each contain {'data': [{'index': .., 'href': ..}, ...]}
    """
    ImageIndex = 1
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # os.path.join(root, ...) is portable and correct for subfolders;
            # the original hard-coded '\\' and ignored the walked `root`.
            data = Read(os.path.join(root, file), 'r')
            print('\n', '-' * 15, "开始下载!", '-' * 15)
            for Src in data['data']:
                target = image_path.format(ImageIndex)
                try:
                    response = requests.get(Src['href'], timeout=15)
                    response.raise_for_status()
                except requests.RequestException as err:
                    # Skip a broken link instead of aborting the whole run.
                    print(f"图片 {Src['href']} 下载失败: {err}")
                    continue
                with open(target, 'wb') as f:
                    f.write(response.content)
                print(f"图片 {Src['href']} 下载完成\n,保存为 {target}")
                ImageIndex += 1
                # Throttle requests a little to be polite to the image hosts.
                time.sleep(1)

"""
    工具
"""
# 打开浏览器
def LoadChormeWeb():
    """Create and return a Chrome WebDriver with a 10 s implicit wait.

    NOTE(review): the original added "--headless" and then set
    options.headless = False, which removed the flag again, so the browser
    actually ran with a visible window.  The contradictory pair is dropped
    and the actual (visible-window) behavior is kept; uncomment the line
    below to run headless.
    """
    options = Options()
    # options.add_argument("--headless")
    # executable_path is deprecated in Selenium 4 (use Service instead);
    # kept here for compatibility with the Selenium 3 style of this script.
    web = Chrome(executable_path="D:/DevelopProgramming/002D-Tool/Anacode3/chromedriver.exe", options=options)
    web.implicitly_wait(10)
    return web
def CheckIfFileEmpty(filename):
    """Return True when `filename` is missing, zero bytes, or holds an
    empty JSON value; False when it contains non-empty JSON data."""
    exists_with_bytes = os.path.isfile(filename) and os.stat(filename).st_size > 0
    if not exists_with_bytes:
        return True
    return len(Read(filename, 'r')) == 0
# 文件的写入
def Write(Data,Filename,mode):
    with open(Filename, mode) as f:
        json.dump(Data, f,indent=4)
    f.close() #释放资源
#文件的数据读取
def Read(Filename,mode):
    with open(Filename, mode) as f:
        data = json.load(f)
    f.close()
    return data

"""
    线程池
"""
def asyn_method():
    """Run one crawl-and-save task per search engine on its own thread and
    block until both finish."""
    def _crawl_task(key):
        # One engine's pipeline: fetch image URLs, then persist them to JSON.
        cfg = ConfigMap[key]
        tname = cfg['Thread']['ThreadName']
        print(tname + ':', '-' * 15, '开始工作!', '-' * 15)
        SearchPhotoMSGGet(tname, key, **cfg['Search'])
        print(tname + ':', '-' * 15, key + '爬取完成!', '-' * 15)
        SavePhotoURLFile(tname, cfg['SaveFileName'], cfg['SRCList'])

    # The two original Task1/Task2 bodies were identical except for the
    # ConfigMap key, so one parameterized task replaces both.
    tasks = [
        threading.Thread(target=_crawl_task, args=(key,), daemon=True,
                         name=ConfigMap[key]['Thread']['ThreadName'])
        for key in ('百度图片', '必应图片')
    ]
    try:
        for th in tasks:
            th.start()
    except Exception as e:
        print("线程异常:", str(e))
    finally:
        # Join only threads that actually started: Thread.join() on a
        # never-started thread raises RuntimeError, which would mask the
        # original start() failure (the original joined unconditionally).
        for th in tasks:
            if th.ident is not None:
                th.join()
if __name__ == '__main__':
    # Entry point: crawl image URLs from both engines on worker threads,
    # then download every collected image into dataset/Image/.
    asyn_method()
    DownloadPhoto(os.path.abspath('dataset/Image/Painting{}.jpg'),os.path.abspath('dataset/HrefJson/'))
    

今天的内容就到这里,谢谢大家!!!!(注:可以继续深化下去,比如说对图片的处理)

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值