从百度上批量爬取指定关键字图片的 Python 程序

        最初的脚本是从网上复制的,原始出处已经记不清了,日后找到会补上,以感谢最初的分享者。不过原脚本一次只能下载单个关键字的图片,不支持批量下载,而且程序本身也有一些小问题。我在原脚本的基础上做了不少修改和整合,最终代码汇总如下:

DownImFromWebsite.py

import re
import requests
from urllib import error
from bs4 import BeautifulSoup
import os, shutil

#DataMainDir='挖掘机图片集合'
#num = 0
#numPicture = 0
#ImDir = ''
# Module-level accumulators shared by the functions below.
# List: Find() appends one list of matched image URLs per result page.
List = []
# Urls: dowmloadPicture() appends each image URL it fetched;
# DownImFromNet() writes the collected URLs to a text file.
Urls = []


def Find(url):
    """Count the image URLs found across the paged search results.

    Result pages are requested at offsets 0, 60, 120, ... (below 20000) by
    appending the offset to ``url``. Scanning stops at the first page that
    contains no ``"objURL"`` entry. Each page's URL list is also appended to
    the module-level ``List``.

    Args:
        url: Search URL prefix; the page offset is appended to it.

    Returns:
        Total number of image URLs matched on all scanned pages.
    """
    global List
    print('正在检测图片总数,请稍等.....')
    t = 0
    s = 0
    while t < 20000:
        Url = url + str(t)
        try:
            Result = requests.get(Url, timeout=7)
        except requests.RequestException:
            # Skip a page that could not be fetched, but let Ctrl-C and
            # SystemExit propagate instead of being swallowed.
            t = t + 60
            continue
        # Extract the image URLs embedded in the page source.
        pic_url = re.findall('"objURL":"(.*?)",', Result.text, re.S)
        if not pic_url:
            break
        s += len(pic_url)
        List.append(pic_url)
        t = t + 60
    return s


def recommend(numPicture, url):
    """Collect the link texts inside the ``div#topRS`` element of a page.

    Args:
        numPicture: Unused; kept so existing callers keep working.
        url: Page to fetch.

    Returns:
        A list with the text of every ``<a>`` inside ``div#topRS``. The list
        is empty when the request fails or the element is missing, so the
        result can always be iterated.
    """
    try:
        # requests raises its own exception hierarchy (never
        # urllib.error.HTTPError); a timeout prevents an indefinite hang.
        html = requests.get(url, timeout=7)
    except requests.RequestException:
        return []
    html.encoding = 'utf-8'
    bsObj = BeautifulSoup(html.text, 'html.parser')
    div = bsObj.find('div', id='topRS')
    if div is None:
        return []
    return [a.get_text() for a in div.find_all('a')]


def dowmloadPicture(ImDir, numPicture, num, html, keyword):
    """Download the images referenced by one search result page.

    Every ``"objURL"`` entry in ``html`` is fetched and written into
    ``ImDir``. The file name is the last path component of the URL with any
    query string removed. Each fetched URL is appended to the module-level
    ``Urls`` list.

    Args:
        ImDir: Directory the image files are written to.
        numPicture: Upper bound on the running download count.
        num: Number of images downloaded before this call.
        html: Source text of the result page.
        keyword: Search keyword, used only in the progress message.

    Returns:
        The updated running count of downloaded images. It never exceeds
        ``numPicture`` unless ``num`` was already above it on entry.
    """
    # Extract the image URLs embedded in the page source.
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    print('找到关键词:' + keyword + '的图片,即将开始下载图片...')
    for each in pic_url:
        # Stop before fetching once the cap is reached, and keep the counter
        # intact (returning 0 here would reset the caller's running total).
        if num >= numPicture:
            break
        print('正在下载第' + str(num + 1) + '张图片,图片地址:' + str(each))
        try:
            pic = requests.get(each, timeout=7)
        except Exception:
            # Best effort: scraped URLs can be malformed in many ways, so any
            # ordinary error skips the image; Ctrl-C still propagates.
            print('错误,当前图片无法下载')
            continue
        Urls.append(each)
        ImName = os.path.split(each)[1].split('?')[0]
        string = os.path.join(ImDir, ImName)
        try:
            with open(string, 'wb') as fp:
                fp.write(pic.content)
        except (OSError, ValueError):
            # Unusable file name (empty, illegal characters, ...): skip it.
            continue
        num += 1

    return num

def DownImFromNet(DataMainDir, url, KeyWord):
    """Download all images found for one keyword and record their URLs.

    Steps:
      1. Count the available images with ``Find`` and fetch the related
         suggestions with ``recommend``.
      2. Create ``<DataMainDir>/<KeyWord>/ImSet``, or empty it when it
         already exists.
      3. Walk the result pages in steps of 60 and hand each page to
         ``dowmloadPicture``.
      4. Write the URLs fetched during this call to
         ``<DataMainDir>/<KeyWord>/<KeyWord>.txt``, one per line.

    Args:
        DataMainDir: Root directory of the data set.
        url: Search URL prefix; the page offset is appended to it.
        KeyWord: Search keyword; also used as the sub-directory name.

    Returns:
        The number of images reported by ``Find``.
    """
    tot = Find(url)
    numPicture = tot
    # recommend() may yield None on failure; normalise so it can be iterated.
    Recommend = recommend(numPicture, url) or []
    print('经过检测%s类图片共有%d张' % (KeyWord, tot))

    ImDir = os.path.join(DataMainDir, KeyWord, 'ImSet')
    if not os.path.exists(ImDir):
        os.makedirs(ImDir)
    else:
        # Start from an empty folder so files of an earlier run do not mix in.
        for ImName in os.listdir(ImDir):
            os.remove(os.path.join(ImDir, ImName))

    # Urls is shared across calls; remember where this keyword's entries
    # start so the per-keyword file does not repeat earlier keywords' URLs.
    first_new_url = len(Urls)
    t = 0
    num = 0
    while t < numPicture:
        page_url = url + str(t)
        try:
            result = requests.get(page_url, timeout=10)
        except requests.RequestException:
            # requests never raises urllib.error.HTTPError, so catch its own
            # base exception and move on to the next page.
            print('网络错误,请调整网络后重试')
        else:
            print(page_url)
            num = dowmloadPicture(ImDir, numPicture, num, result.text, KeyWord)
        t = t + 60
    print('当前搜索结束,感谢使用')
    print('猜你喜欢')
    for suggestion in Recommend:
        print(suggestion)

    MainDir, _ = os.path.split(ImDir)
    FileName = '{}.txt'.format(KeyWord)
    UrlsFilePath = os.path.join(MainDir, FileName)
    # Explicit encoding: URLs may contain non-ASCII characters.
    with open(UrlsFilePath, 'w', encoding='utf-8') as FId:
        FId.writelines(Url + '\n' for Url in Urls[first_new_url:])

    return numPicture

BatchDownImFromWebsite.py

import os,shutil,cv2

from DownImFromWebsite import DownImFromNet
# Root folder of the downloaded data set, relative to the working directory.
# Built with os.path.join so the separator is right on every platform
# (on Windows this is still '.\\图片集合').
DataMainDir = os.path.join('.', '图片集合')
# Create the folder up front; listing a missing folder would raise
# FileNotFoundError on the very first run.
os.makedirs(DataMainDir, exist_ok=True)
# Sorted names of the sub-directories already present under DataMainDir.
ExistDirNames = sorted(
    FileName for FileName in os.listdir(DataMainDir)
    if os.path.isdir(os.path.join(DataMainDir, FileName))
)
def CopyAllSubDirIm2OneBigDir():
    """Gather the images of every keyword folder into one directory.

    For each entry ``<DataMainDir>/<name>/ImSet``:
      * files ending in ``.jpg`` are copied to ``<DataMainDir>/AllImSet``;
      * other files are read with ``cv2.imread``; readable ones are saved as
        ``.jpg`` into ``AllImSet``, unreadable ones are copied to
        ``<DataMainDir>/AllNoImSet``.

    A file that cannot be copied or written is reported and skipped.
    Finally the number of source files and the number of files in
    ``AllImSet`` are printed.
    """
    KeyWords = os.listdir(DataMainDir)
    AllImDir = os.path.join(DataMainDir, 'AllImSet')
    os.makedirs(AllImDir, 0o777, True)
    AllNoImDir = os.path.join(DataMainDir, 'AllNoImSet')
    os.makedirs(AllNoImDir, 0o777, True)
    OriImNum = 0
    for KeyWord in KeyWords:
        ImDir = os.path.join(DataMainDir, KeyWord, 'ImSet')
        if not os.path.isdir(ImDir):
            continue
        ImNames = os.listdir(ImDir)
        OriImNum += len(ImNames)
        for ImName in ImNames:
            ImPath = os.path.join(ImDir, ImName)
            try:
                if ImName.endswith('.jpg'):
                    shutil.copy(ImPath, AllImDir)
                    print('{}--->{}'.format(ImPath, AllImDir))
                    continue

                # cv2.imread reports an unreadable file by returning None;
                # a decoding error raised by OpenCV is treated the same way.
                # NOTE(review): imread may fail on non-ASCII paths on
                # Windows - confirm on the target platform.
                try:
                    Im = cv2.imread(ImPath)
                except cv2.error:
                    Im = None

                # Compare against None explicitly: the truth value of a
                # multi-element numpy array is ambiguous and raises ValueError.
                if Im is None:
                    shutil.copy(ImPath, AllNoImDir)
                    print('{}--->{}'.format(ImPath, AllNoImDir))
                else:
                    PureImName, _ = os.path.splitext(ImName)
                    NewImPath = os.path.join(AllImDir, PureImName + '.jpg')
                    cv2.imwrite(NewImPath, Im)
                    print('{}--->{}'.format(ImPath, NewImPath))
            except (OSError, cv2.error) as exc:
                # Best effort: report the file and carry on with the rest.
                print('skipped {}: {}'.format(ImPath, exc))
                continue

    FinalAllImNum = len(os.listdir(AllImDir))
    print('OriImNum={},FinalAllImNum={}'.format(OriImNum, FinalAllImNum))



if __name__ == '__main__':  # script entry point
    # Fill in the keywords to search for. A single keyword is often not
    # enough (e.g. for cars), so list related keywords too to collect more
    # images.
    KeyWords = ()
    # De-duplicate the keywords; sort them so runs are reproducible.
    KeyWords = sorted(set(KeyWords))
    DownUrlImNumFile = os.path.join(DataMainDir, 'DownUrlImNumFile.txt')
    # Explicit encoding: the logged URLs contain the (non-ASCII) keywords.
    with open(DownUrlImNumFile, 'w', encoding='utf-8') as FId:
        for KeyWord in KeyWords:
            # 'word=' takes the keyword directly; the former 'word==' put a
            # stray '=' in front of every search term.
            url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + KeyWord + '&pn='
            numPicture = DownImFromNet(DataMainDir, url, KeyWord)
            LineStr = '{}  ImNum={}\n'.format(url, numPicture)
            FId.write(LineStr)

    CopyAllSubDirIm2OneBigDir()

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值