How can I use Keras with datasets that don't fit in memory?
You can do batch training using `model.train_on_batch(X, y)` and `model.test_on_batch(X, y)`. See the models documentation.
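For instance, a minimal batch-training loop might look like the following sketch; the model and the `load_batch` helper are placeholders for illustration, not part of the FAQ:

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

# A tiny stand-in model, just to make the loop runnable.
model = Sequential([Dense(10, input_dim=784, activation='softmax')])
model.compile(optimizer='adam', loss='categorical_crossentropy')

def load_batch(batch_index, batch_size=32):
    # Hypothetical helper: in practice, read one batch from disk here
    # (e.g. np.load on a pre-sharded file) instead of generating random data.
    X = np.random.rand(batch_size, 784)
    y = np.eye(10)[np.random.randint(0, 10, size=batch_size)]
    return X, y

for epoch in range(5):
    for batch_index in range(100):         # 100 batches per epoch
        X, y = load_batch(batch_index)
        loss = model.train_on_batch(X, y)  # one gradient update on this batch
```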
Alternatively, you can write a generator that yields batches of training data and use the method `model.fit_generator(data_generator, steps_per_epoch, epochs)`.
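A minimal generator sketch, assuming a text file with one sample per line and a hypothetical `parse_line` helper; note that the Keras 1 API used by the two examples below names these arguments `samples_per_epoch` and `nb_epoch` instead:

```python
import numpy as np

def data_generator(path, batch_size=32):
    # Loop forever; fit_generator ends an epoch after steps_per_epoch batches.
    while True:
        with open(path) as f:
            inputs, targets = [], []
            for line in f:
                x, y = parse_line(line)  # hypothetical: turn a line into arrays
                inputs.append(x)
                targets.append(y)
                if len(inputs) == batch_size:
                    yield np.array(inputs), np.array(targets)
                    inputs, targets = [], []

model.fit_generator(data_generator('train.txt'),
                    steps_per_epoch=1000, epochs=10)
```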
You can see batch training in action in our CIFAR10 example.
Code 1: Image classification
```python
import codecs
import os

import cv2
import numpy as np
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from visual_callbacks import AccLossPlotter

plotter = AccLossPlotter(graphs=['acc', 'loss'], save_graph=True)


class LossHistory(Callback):
    """Record the training loss after every batch."""

    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))


# Data augmentation: small shifts, shears and zooms; pixel values rescaled to [0, 1].
datagen = ImageDataGenerator(
    rotation_range=0,
    width_shift_range=0.1,
    height_shift_range=0.1,
    rescale=1./255,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=False,
    fill_mode='nearest')

train_generator = datagen.flow_from_directory(
    r'chars_rec\train',        # target directory; one sub-folder per class
    target_size=(32, 32),      # all images will be resized to 32x32
    batch_size=32,
    shuffle=True,
    class_mode='categorical',  # one-hot labels for categorical_crossentropy
    color_mode='grayscale')

print(train_generator.nb_class)
class_count = train_generator.nb_class

# Persist the class-name -> index mapping (written as class_indices.txt.npy).
np.save('class_indices.txt', train_generator.class_indices)
# To restore it later:
# class_indices = np.load('class_indices.txt.npy').tolist()
# value_indices = {v: k for k, v in class_indices.items()}

validation_generator = datagen.flow_from_directory(
    r'chars_rec\valication',   # target directory (name as in the original script)
    target_size=(32, 32),
    batch_size=32,
    class_mode='categorical',
    color_mode='grayscale')

model = Sequential()
model.add(Conv2D(32, 3, 3, input_shape=(32, 32, 1), border_mode='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(32, 3, 3, border_mode='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, 3, 3, border_mode='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())  # converts the 3D feature maps to 1D feature vectors
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(class_count))
model.add(Activation('softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Save the weights whenever the validation loss improves, keeping only the best model.
checkpointer = ModelCheckpoint(filepath="chars_rec.hdf5", verbose=1, save_best_only=True)
history = LossHistory()

# Resume from an earlier checkpoint if one exists.
if os.path.exists('chars_rec.hdf5'):
    model = load_model('chars_rec.hdf5')

model.fit_generator(
    train_generator,
    samples_per_epoch=9150,   # Keras 2: steps_per_epoch=9150 // batch_size
    nb_epoch=500,             # Keras 2: epochs
    validation_data=validation_generator,
    nb_val_samples=1062,      # Keras 2: validation_steps
    callbacks=[checkpointer, history, plotter])

model.save('chars_rec_end.hdf5')
```
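Once training has written chars_rec.hdf5 and class_indices.txt.npy, the checkpoint can be used for prediction. A minimal sketch, assuming a hypothetical test image path:

```python
import numpy as np
from keras.models import load_model
from keras.preprocessing.image import load_img, img_to_array

model = load_model('chars_rec.hdf5')
# Restore the class mapping (newer NumPy versions need allow_pickle=True here).
class_indices = np.load('class_indices.txt.npy').tolist()
index_to_class = {v: k for k, v in class_indices.items()}

# Preprocess one image the same way the generators did: grayscale, 32x32, rescaled.
img = load_img(r'chars_rec\test\sample.png', grayscale=True, target_size=(32, 32))
x = img_to_array(img) / 255.0
x = np.expand_dims(x, axis=0)  # shape (1, 32, 32, 1)

probs = model.predict(x)[0]
print(index_to_class[int(np.argmax(probs))])
```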
Code 2: Text indexing (character-level sequence tagging)
```python
import os
import numpy as np
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

# max_len and class_label_count are module-level constants defined elsewhere in
# the original script (sequence length and number of tag classes).

def getXY_gen(batch_size=32):
    """Yield (inputs, one-hot targets) batches from the tagging corpus."""
    f = open(".\\train_470000_0427.utf8", 'r', encoding='utf-8')
    lines = f.readlines()
    f.close()
    X = []
    Y = []
    for i in range(len(lines)):
        line = lines[i].strip()
        x = []
        y = []
        y_temp = []
        # Each space-separated item has the form "token=label".
        for j, string in enumerate(line.split(' ')):
            if string.find('=') < 0:
                continue
            token, label = string.rsplit('=', 1)
            label_num = int(label) + 1
            if len(token) > 1 and label_num < 100:
                # Multi-character token (BMES-style scheme): the first character
                # gets the Begin variant of the label, the last the End variant
                # (+2), and inner characters the Middle variant (+1).
                for k, s in enumerate(token):
                    x.append(ord(s))
                    lab = [0] * class_label_count
                    if k == 0:
                        lab[label_num - 1] = 1
                        y_temp.append(label_num - 1)
                    elif k == len(token) - 1:
                        lab[label_num - 1 + 2] = 1
                        y_temp.append(label_num - 1 + 2)
                    else:
                        lab[label_num - 1 + 1] = 1
                        y_temp.append(label_num - 1 + 1)
                    y.append(lab)
            elif len(token) == 1:
                # Single-character token gets the Single variant (+3).
                x.append(ord(token[0]))
                lab = [0] * class_label_count
                lab[label_num - 1 + 3] = 1
                y.append(lab)
                y_temp.append(label_num - 1 + 3)
        # Pad short sequences with character id 0 and the padding class (index 87).
        if len(x) < max_len:
            for a in range(0, max_len - len(x)):
                x.append(0)
                lab = [0] * class_label_count
                lab[87] = 1
                y.append(lab)
        X.append(x[0:max_len])
        Y.append(y[0:max_len])
        if len(X) == batch_size:
            x1 = X[0:batch_size]
            y1 = Y[0:batch_size]
            X = []
            Y = []
            yield np.array(x1), np.array(y1)


sequence = Input(shape=(max_len,), dtype='int32')
embedded = Embedding(65536, 128, input_length=max_len, mask_zero=True,
                     trainable=False)(sequence)
blstm = Bidirectional(LSTM(64, return_sequences=True, dropout_U=0.5, dropout_W=0.5),
                      merge_mode='sum')(embedded)
output = TimeDistributed(Dense(class_label_count, activation='softmax'))(blstm)
model = Model(input=sequence, output=output)

# Resume from an earlier checkpoint if one exists.
if os.path.exists('bilstm_0510.hdf5'):
    model = load_model('bilstm_0510.hdf5')
    model.layers[1].trainable = False  # keep the Embedding layer frozen

# Note: this optimizer object is created but never used below; passing the string
# 'adam' to compile() uses Adam with default settings instead of lr=0.01.
adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

# Other built-in losses include: mse, mae, mape, msle, binary_crossentropy,
# sparse_categorical_crossentropy, kullback_leibler_divergence, cosine_proximity.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Save the weights whenever the validation loss improves, keeping only the best
# model. Note: save_best_only monitors val_loss, so the checkpoint only fires
# when validation data is supplied to training.
checkpointer = ModelCheckpoint(filepath="bilstm_0510.hdf5", verbose=0, save_best_only=True)
history = LossHistory()  # the LossHistory callback defined in Code 1

# In-memory alternative:
# model.fit(np.array(x_train), np.array(y_train).reshape((-1, max_len, class_label_count)),
#           batch_size=32, nb_epoch=500, validation_data=(x_test, y_test),
#           callbacks=[checkpointer, history, plotter], verbose=1)

model.fit_generator(getXY_gen(batch_size=32),
                    samples_per_epoch=32 * 100,  # Keras 2: steps_per_epoch=100
                    nb_epoch=10,                 # Keras 2: epochs
                    verbose=1,
                    callbacks=[checkpointer, history])
```
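Finally, a hedged sketch of tagging one string with the trained model; the input text is a placeholder, and the printed values are the raw per-character class indices produced by the encoding in getXY_gen:

```python
import numpy as np
from keras.models import load_model

model = load_model('bilstm_0510.hdf5')

text = 'some input string'               # placeholder input
x = [ord(ch) for ch in text][:max_len]
x += [0] * (max_len - len(x))            # pad with 0; the Embedding masks it

probs = model.predict(np.array([x]))[0]  # shape (max_len, class_label_count)
tags = probs.argmax(axis=-1)

for ch, tag in zip(text, tags):
    print(ch, int(tag))                  # per-character class index
```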