在使用keras的model.fit(x, y)训练模型时,需要一次性加载全部训练数据。如果数据较小(如mnist数据集),加载全部数据没什么问题;但当自己的数据集很大时(如10G),全部加载可能会导致内存溢出,或者仅仅加载数据就需要花费很长的等待时间,这样用Keras做实验非常不方便。当然,keras也提供了另外的训练接口model.fit_generator,该方法的其中一个参数generator可以使用keras的ImageDataGenerator类。这个类封装得很完善,特别适用于多分类模型读取数据集,但有个缺点:文件的目录格式是固定的。具体的使用可以参考keras的文档,或者参考这个博客。但是,如果自己的数据存储格式和keras要求的格式不一致,就会有很多麻烦。当然你也可以把数据调整为keras要求的格式,不过由于数据集比较大,调整格式比较麻烦,那有没有其他的办法呢?
答案是有的。比如创建自己的generator。如下就是我的创建过程:
dataset.py (本次例子是一个二分类问题)
from PIL import Image
import os
import numpy as np
import keras
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
import glob
from keras.datasets import cifar100
# Eagerly load the whole dataset into memory.  For a large dataset this is
# slow and may exhaust RAM -- prefer the generator-based loading further below.
def load_data(seq_names=('BOSSbase_1.01', './data/bossbase_wow040')):
    """Read every image under each directory in *seq_names* into memory.

    Images from the first directory are labelled 0 (cover) and images from
    every other directory are labelled 1 (stego).  The default is a tuple
    rather than a list to avoid the mutable-default-argument pitfall.

    Returns:
        (train_x, train_y) as produced by change_data(): numpy arrays with
        cover/stego samples interleaved pairwise so each batch stays balanced.
    """
    root_path = os.getcwd()
    train_images = []
    labels = []
    cover_dir = seq_names[0]
    for seq_name in seq_names:
        # Sort so file i of the cover dir pairs with file i of the stego dir.
        frames = sorted(os.listdir(os.path.join(root_path, seq_name)))
        for frame in frames:
            img = Image.open(os.path.join(root_path, seq_name, frame))
            train_images.append(np.array(img, dtype=np.float32))
            labels.append(0 if seq_name == cover_dir else 1)
    # Stack into (N, H, W) then add the trailing channel axis -> (N, H, W, 1).
    train_images = np.expand_dims(np.array(train_images), axis=-1)
    return change_data(train_images, np.array(labels))
# Ensure every batch contains the same number of cover and stego samples by
# interleaving the two halves of the dataset.
def change_data(tran_images, labels):
    """Reorder (images, labels) as [a0, b0, a1, b1, ...].

    The first half of the input is treated as cover samples and the second
    half as stego samples; the output alternates them pairwise.  With an odd
    number of samples the final unpaired element is dropped (same as the
    original pairing logic).

    Returns:
        (train_x, train_y) numpy arrays with samples interleaved.
    """
    half = tran_images.shape[0] // 2
    covers, stegos = tran_images[:half], tran_images[half:]
    cover_labels, stego_labels = labels[:half], labels[half:]
    train_x, train_y = [], []
    for i in range(half):
        train_x.extend((covers[i], stegos[i]))
        train_y.extend((cover_labels[i], stego_labels[i]))
    return np.array(train_x), np.array(train_y)
'''
以下是创建自己的generator的具体代码
'''
# Shuffle two arrays in place with the identical permutation, keeping the
# pairs (a[i], b[i]) aligned after the shuffle.
def shuffle_two_array(a, b):
    """Apply the same random permutation to *a* and *b* (shuffled in place)."""
    saved_state = np.random.get_state()
    np.random.shuffle(b)
    # Rewind the RNG so the second shuffle draws the same permutation.
    np.random.set_state(saved_state)
    np.random.shuffle(a)
    return a, b
# Build the (path, label) lists for the training or validation split, with
# cover and stego file paths interleaved so samples alternate cover/stego.
def load_game_data(cover_dir='./BOSSbase_1.01', stego_dir='bossbase_hill040',
                   train_set=True, val_size=0.1):
    """Split cover/stego file paths into train/validation sets.

    Parameters:
        cover_dir, stego_dir: directories holding cover and stego images;
            both are sorted so cover_files[i] pairs with stego_files[i].
        train_set: return the training split when True, validation otherwise.
        val_size: fraction held out for validation (default keeps the
            original 9:1 split; new keyword is backward-compatible).

    Returns:
        (image_paths, labels) as numpy arrays; label 0 = cover, 1 = stego.
    """
    cover_files = sorted(glob.glob(os.path.join(cover_dir, '*')))
    stego_files = sorted(glob.glob(os.path.join(stego_dir, '*')))
    # Splitting both lists in a single call with a fixed random_state keeps
    # each cover paired with its stego counterpart across the split.
    train_cover, valid_cover, train_stego, valid_stego = \
        train_test_split(cover_files, stego_files, test_size=val_size, random_state=0)

    def _interleave(covers, stegos):
        # Alternate cover (label 0) and stego (label 1) samples.
        images, labels = [], []
        for cover, stego in zip(covers, stegos):
            images.append(cover)
            labels.append(0)
            images.append(stego)
            labels.append(1)
        return np.array(images), np.array(labels)

    if train_set:
        return _interleave(train_cover, train_stego)
    return _interleave(valid_cover, valid_stego)
# Read one image with the Keras helpers, keeping a single channel.
def load_batch_image(img_path, target_size=(512, 512)):
    """Load *img_path*, resize to *target_size*, return a (H, W, 1) array.

    Only the first (R) channel is kept, since the steganalysis inputs here
    are effectively grayscale.
    """
    # BUG FIX: target_size must be passed by keyword.  Passed positionally it
    # binds to load_img's second parameter (grayscale/color_mode), so the
    # requested resize was silently ignored.
    img = load_img(img_path, target_size=target_size)
    img = img_to_array(img)[:, :, 0]  # keep only the R channel
    return np.expand_dims(img, axis=-1)  # restore the trailing channel axis
# Infinite batch generator for model.fit_generator: yields (images, one-hot
# labels) batches forever, reshuffling each batch's internal order per pass.
def my_dataset_generator(batch_size,
                         cover_dir='/home/chenbaoying/py_project/aletheia/BOSSbase_1.01',
                         stego_dir='/home/chenbaoying/py_project/aletheia/data/bossbase_wow040',
                         train_set=True):
    """Yield (X, Y) batches endlessly; train_set picks the train/valid split.

    Images are loaded lazily per batch via load_batch_image, so only one
    batch of pixel data is ever held in memory at a time.
    """
    X_samples, Y_samples = load_game_data(cover_dir=cover_dir, stego_dir=stego_dir,
                                          train_set=train_set)
    batch_num = len(X_samples) // batch_size
    # Drop the tail so the arrays split evenly into batch_num equal batches.
    max_len = batch_num * batch_size
    X_batches = np.split(np.array(X_samples[:max_len]), batch_num)
    Y_batches = np.split(np.array(Y_samples[:max_len]), batch_num)
    while True:  # fit_generator expects an endless stream of batches
        # A plain range loop replaces the original's redundant manual
        # `i % n` counter and the unused loop variable `b`.
        for i in range(batch_num):
            # Shuffle within the batch so cover/stego order varies per epoch.
            X_batches[i], Y_batches[i] = shuffle_two_array(X_batches[i], Y_batches[i])
            X = np.array(list(map(load_batch_image, X_batches[i])))
            yield X, keras.utils.to_categorical(np.array(Y_batches[i]))
# Example of Keras' built-in ImageDataGenerator flow (requires the fixed
# directory layout Keras expects: one sub-directory per class).
def DataGenerator():
    """Return (training_set, test_set) directory iterators.

    Augmentation could be enabled by passing arguments such as rescale,
    shear_range or horizontal_flip to ImageDataGenerator.
    """
    root_path = '/home'
    flow_kwargs = dict(target_size=(1024, 1024),
                       batch_size=4,
                       class_mode='categorical',
                       color_mode='grayscale')
    training_set = ImageDataGenerator().flow_from_directory(
        os.path.join(root_path, 'images', 'train'), **flow_kwargs)
    test_set = ImageDataGenerator().flow_from_directory(
        os.path.join(root_path, 'images', 'my_valid'), **flow_kwargs)
    return training_set, test_set
if __name__ == '__main__':
    # Smoke-test the custom generator: pull a single batch and print shapes.
    generator = my_dataset_generator(batch_size=8, train_set=True)
    batch_x, batch_y = next(generator)
    print(batch_x.shape)
    print(batch_y.shape)
下面是我的模型训练代码 model_test.py
from keras_preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam, SGD, Adamax, Adadelta
from keras.callbacks import TensorBoard, ModelCheckpoint
import keras
import os
from network_keras import ye_Net # 导入自己的模型
from dataset import load_data, my_dataset_generator # 导入dataset里面的my_dataset_generator
from keras.utils import multi_gpu_model
# ---- GPU selection (must run before any CUDA context is created) ----
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

# Build the network and replicate it across both visible GPUs.
model = ye_Net(input_shape=(512, 512, 1))  # ye_Net is the custom deep model
model = multi_gpu_model(model, gpus=2)

model_path = './model_weight/ye_Net--10--0.3865.hdf5'  # pretrained weight path
try:
    model.load_weights(model_path)
    print("...Previous weight data...")
except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still
    # propagate; any load failure just means training starts from scratch.
    print("...New weight data...")

train = True
if train:
    adadelta = Adadelta(lr=0.4, decay=0.0005)
    model.compile(optimizer=adadelta, loss='categorical_crossentropy', metrics=['accuracy'])
    tensorboard = TensorBoard(log_dir='./logs', write_graph=True)
    checkpoint = ModelCheckpoint(filepath='./model_weight/ye_Net_hill020--{epoch:02d}--{val_loss:.4f}.hdf5',
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 period=5)
    batch_size = 8
    # The dataset totals 20000 images, split 9:1 in dataset.py.  Keep these
    # counts in sync with your own split sizes.
    num_val = 2000
    num_train = 18000
    history = model.fit_generator(
        my_dataset_generator(batch_size=batch_size, stego_dir='./data/bossbase_hill020'),
        steps_per_epoch=max(1, num_train // batch_size),
        validation_data=my_dataset_generator(batch_size=batch_size),
        validation_steps=max(1, num_val // batch_size),
        epochs=200,
        initial_epoch=0,
        # BUG FIX: the tensorboard/checkpoint callbacks were constructed but
        # never wired in, so no checkpoints were ever written.
        callbacks=[tensorboard, checkpoint],
        shuffle=True)