Python多进程加速大量图片数据集读取

训练机器视觉神经网络前 多进程加速数据集读取

多进程读取图片并压缩.py

import readImgMultiProcessing, os, random, gc, time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imutils import paths
import numpy as np
# Fixed seed so that shuffling and the train/val/test split are reproducible.
seed = 300
random.seed(seed)
# Number of worker processes used when reading images in parallel.
coreNum = 23
# NOTE(review): the next three path literals were truncated in the original
# source (`r`, `r'`, `r`), which made the file unparseable. The values below
# are PLACEHOLDERS only -- replace them with the real locations before running.
imgPath = r'images'                        # root directory of the image dataset
dataPath = r'babyFaceTrainVal.data'        # pickle cache: (X_train, X_val, y_train, y_val)
dataTestPath = r'babyFaceTest.data'        # pickle cache: (X_test, Y_test)
dataEncoderPath = r'babyFaceLabelEncoder.data'
config = {"epochs": 10, "batch_size": 128, 'useIDG':True, "imageResize": (600, 600), "lr": 1e-3}
# Make sure the scratch directory exists.
if not os.path.exists(r'tmp'):
    os.makedirs(r'tmp')
if __name__ == '__main__':
    # Use the pickled caches only when ALL of them are present; the label
    # encoder file is loaded below as well, so it has to be checked too.
    cacheFiles = (dataPath, dataTestPath, dataEncoderPath)
    if all(os.path.exists(p) for p in cacheFiles):
        (X_train, X_val, y_train, y_val)    = readImgMultiProcessing.readFile(dataPath)
        (X_test, Y_test)                    = readImgMultiProcessing.readFile(dataTestPath)
        class_le                            = readImgMultiProcessing.readFile(dataEncoderPath)
    else:
        tst = time.time()
        classNames = ['睡', '醒']
        class_le = LabelEncoder()
        class_le.fit(classNames)
        # Collect every image path under the dataset root.
        imagePaths = sorted(paths.list_images(imgPath))
        # Keep only images whose parent directory name is one of the class labels.
        wanted = set(classNames)
        faceImagePaths = [p for p in imagePaths if p.split(os.path.sep)[-2] in wanted]
        # Shuffle (seeded at module level).
        random.shuffle(faceImagePaths)
        # Uncomment to read only a subset when memory is insufficient:
        # faceImagePaths = faceImagePaths[:234]
        # Split into train / validation / test. Note: the PATHS are split here,
        # the pixel data is read afterwards.
        trainImgPaths, testImgPaths = train_test_split(faceImagePaths, test_size=0.2, random_state=seed)
        trainImgPaths, valImgPaths  = train_test_split(trainImgPaths,  test_size=0.2, random_state=seed)
        print('数据列表划分完成')
        # Read the images in parallel.
        X_train     = readImgMultiProcessing.readImgMultiProcessing(trainImgPaths, coreNum, config)
        print('训练集读入完成')
        X_val       = readImgMultiProcessing.readImgMultiProcessing(valImgPaths,   coreNum, config)
        print('验证集读入完成')
        X_test      = readImgMultiProcessing.readImgMultiProcessing(testImgPaths,  coreNum, config)
        print('测试集读入完成')
        # The worker function (getData) already divides the pixels by 255.0, so
        # dividing again here would squash the data into [0, 1/255]. Only make
        # sure the dtype is float64 (np.float was removed in NumPy 1.24).
        X_train     = np.asarray(X_train,   dtype=np.float64)
        print('训练集处理完成')
        X_val       = np.asarray(X_val,     dtype=np.float64)
        print('验证集处理完成')
        X_test      = np.asarray(X_test,    dtype=np.float64)
        print('测试集处理完成')
        # Labels are taken from each image's parent directory name.
        y_train = class_le.transform([x.split(os.path.sep)[-2] for x in trainImgPaths])
        y_val   = class_le.transform([x.split(os.path.sep)[-2] for x in valImgPaths])
        Y_test  = class_le.transform([x.split(os.path.sep)[-2] for x in testImgPaths])
        # Persist everything so the next run can take the cached branch above.
        readImgMultiProcessing.toFile(class_le, dataEncoderPath)
        readImgMultiProcessing.toFile((X_train, X_val, y_train, y_val), dataPath)
        readImgMultiProcessing.toFile((X_test, Y_test), dataTestPath)
        print('数据集压缩成功,数据保存完毕')
        print(len(trainImgPaths), X_train.shape, len(y_train))
        print(len(valImgPaths), X_val.shape, len(y_val))
        print(len(testImgPaths), X_test.shape, len(Y_test))
        print('用时', time.time()-tst) # original author's note: 934s

readImgMultiProcessing.py

from multiprocessing import Process,Queue,Pool,Pipe,Manager
import os,time,random
from imutils import paths
import numpy as np
from PIL import Image
import pickle
import cv2
def toFile(data,path):
    """Pickle ``data`` into the file at ``path``, replacing any existing file.

    Protocol 4 is used; per the original author's note this is what allows
    files larger than 4 GB to be written.
    """
    with open(path, mode='wb') as sink:
        pickler = pickle.Pickler(sink, protocol=4)
        pickler.dump(data)
def readFile(path):
    """Load and return the object pickled in the file at ``path``.

    The file is closed even when unpickling fails.

    SECURITY: ``pickle.load`` can execute arbitrary code contained in the
    file. Only call this on files you created yourself (e.g. with ``toFile``).
    """
    with open(path, 'rb') as f:
        return pickle.load(f)
def resize_img_keep_ratio(img_name, target_size):
    '''
    Load an image, shrink/enlarge it to fit inside ``target_size`` while
    keeping its aspect ratio, then pad it with black borders so that the
    result is exactly ``target_size``.

    1. Compute the scale factor from the most constrained side and resize
       with that single factor.
    2. Compute the padding needed on each of the four sides and pad.

    Args:
        img_name: path of the image file.
        target_size: ``(height, width)`` of the output.

    Returns:
        An RGB array of shape ``(target_size[0], target_size[1], 3)``.

    Raises:
        Whatever PIL raises when the file cannot be opened or decoded; the
        offending path is printed first so the bad file can be located.
    '''
    try:
        # PIL is used (rather than cv2.imread) so that non-ASCII paths and
        # PNG files load correctly. convert('RGB') normalises every mode
        # (grayscale, palette, RGBA, CMYK, ...) to three channels, so all
        # outputs have the same shape and can be stacked by the caller.
        with Image.open(img_name) as im:
            im_array = np.array(im.convert('RGB'))
    except Exception as e:
        # Report which file failed, then propagate: continuing without
        # im_array would only fail later with an unrelated UnboundLocalError.
        print(img_name, e)
        raise
    old_size = im_array.shape[0:2]
    ratio = min(float(target) / old for target, old in zip(target_size, old_size))
    # Clamp to 1 pixel so extremely elongated images do not resize to 0.
    new_size = tuple(max(1, int(side * ratio)) for side in old_size)
    # cv2.resize takes (width, height); bicubic interpolation as in the original.
    img = cv2.resize(im_array,(new_size[1], new_size[0]),interpolation=cv2.INTER_CUBIC)
    pad_w = target_size[1] - new_size[1]
    pad_h = target_size[0] - new_size[0]
    # Split the padding as evenly as possible; the odd pixel goes bottom/right.
    top,bottom = pad_h//2, pad_h-(pad_h//2)
    left,right = pad_w//2, pad_w -(pad_w//2)
    # Pad with a constant black border.
    return cv2.copyMakeBorder(img,top,bottom,left,right,cv2.BORDER_CONSTANT,None,(0,0,0))
def getData(num, paths, return_dict, config):
    """Worker body: load, resize and normalise one chunk of images.

    Args:
        num: key under which the result is stored in ``return_dict``.
        paths: image paths handled by this worker.
        return_dict: mapping that receives the result (a shared manager dict
            when called from ``readImgMultiProcessing``).
        config: settings dict; ``config["imageResize"]`` gives the output size
            passed to ``resize_img_keep_ratio``.

    The stored value is a float64 array with pixel values divided by 255.0.
    """
    target_size = (config["imageResize"][0], config["imageResize"][1])
    # np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
    Data = np.array(
        [resize_img_keep_ratio(img_path, target_size) for img_path in paths],
        dtype=np.float64,
    )
    Data /= 255.0
    return_dict[num] = Data
    
    
def readImgMultiProcessing(imagePaths, coreNum, config):
    """Read and preprocess images using up to ``coreNum`` worker processes.

    The path list is cut into consecutive chunks, each chunk is handled by
    ``getData`` in its own process, and the per-chunk arrays are joined back
    together in the original path order.

    Args:
        imagePaths: list of image file paths.
        coreNum: maximum number of worker processes (must be >= 1).
        config: settings dict forwarded unchanged to ``getData``.

    Returns:
        One array holding the results for all images in ``imagePaths`` order;
        an empty array when ``imagePaths`` is empty.

    Raises:
        ValueError: if ``coreNum`` is smaller than 1.
        RuntimeError: if any worker process exits with a non-zero code.
    """
    if coreNum < 1:
        raise ValueError('coreNum must be >= 1, got %r' % (coreNum,))
    # Same chunk length as before: floor(len / coreNum) + 1, which always
    # covers the whole list with at most coreNum chunks.
    lenPerSt = len(imagePaths) // coreNum + 1
    # Only non-empty chunks are produced, so no idle processes are spawned.
    chunks = [imagePaths[start:start + lenPerSt]
              for start in range(0, len(imagePaths), lenPerSt)]
    if not chunks:
        return np.asarray([])
    # The manager owns a helper process; the with-block shuts it down.
    with Manager() as manager:
        return_dict = manager.dict()
        jobs = []
        for i, chunk in enumerate(chunks):
            p = Process(target=getData, args=(str(i), chunk, return_dict, config))
            jobs.append(p)
            p.start()
        for proc in jobs:
            proc.join()
        # A crashed worker never stores its result; report that explicitly
        # instead of failing below with a bare KeyError.
        failed = [str(i) for i, proc in enumerate(jobs) if proc.exitcode != 0]
        if failed:
            raise RuntimeError('image reader worker(s) failed: ' + ', '.join(failed))
        # Results must be fetched while the manager is still alive.
        parts = [np.asarray(return_dict[str(i)]) for i in range(len(jobs))]
    # Single concatenation instead of growing the array chunk by chunk.
    return np.concatenate(parts)


# if __name__ == '__main__':
#     imgPath = r'E:\新的数据集'
#     imagePaths = sorted(list(paths.list_images(imgPath)))[:100]
#     config = {"epochs": 10, "batch_size": 128, 'useIDG':True, # False True
#             "imageResize": (600, 600), "lr": 1e-3}
#     coreNum = 10
#     data = readImgMultiProcessing(imagePaths, coreNum, config)
#     print(data.shape)
  • 1
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值