原理
- 在深度学习中,为了防止过拟合,通常会切分出验证集用于提前停止训练;但验证集本身没有参与训练,造成数据浪费
- 本文尝试在训练集上提前停止训练后,再将验证集加入训练,并用多组验证集比例参数进行实验
Python极简算法
图像识别示例
from keras.datasets import cifar10 # 小图像10分类
from keras.utils import to_categorical # 独热码
from keras.models import Sequential # 顺序模型
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
# Hyper-parameters
verbose = 2 # verbosity level for training output (2 = one line per epoch)
batch_size = 256 # mini-batch size
patience = 1 # epochs without improvement before early stopping
# Stop training when validation accuracy ('val_acc') stops improving.
callbacks = [EarlyStopping('val_acc', patience=patience)]
# Load CIFAR-10: (train images, train labels), (test images, test labels).
(x1, y1), (x2, y2) = cifar10.load_data()
x1, x2 = x1 / 255, x2 / 255 # scale pixel values into [0, 1]
y1 = to_categorical(y1, 10) # one-hot encode the 10 class labels
y2 = to_categorical(y2, 10)
def experiment(validation_size=.1):
    """Train a CNN with early stopping, then fold the validation split
    back into training and evaluate again.

    Args:
        validation_size: fraction of the training data held out for
            validation (and for the second training pass).

    Prints the test-set [loss, acc] both before and after the extra
    training pass on the validation data.
    """
    # Hold out part of the training data as a validation set.
    x11, x12, y11, y12 = train_test_split(x1, y1, test_size=validation_size)
    # Build a small VGG-style CNN: two conv blocks, then a dense head.
    model = Sequential()
    for filters in (32, 64):
        model.add(Conv2D(filters, (3, 3), padding='same', activation='relu'))
        model.add(Conv2D(filters, (3, 3), padding='same', activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(10, activation='softmax'))
    model.compile('adam', 'categorical_crossentropy', ['acc'])  # compile
    # Train with early stopping on the validation split
    # (positional args: batch_size, epochs, verbose, callbacks).
    h = model.fit(x11, y11, batch_size, 999, verbose, callbacks,
                  validation_data=(x12, y12))
    print(validation_size, model.evaluate(x2, y2, batch_size, verbose))
    # Fold the validation split back in for the epochs that actually
    # improved (epochs run minus the patience overshoot).
    # FIX: clamp to at least 1 epoch — if early stopping fires
    # immediately, len(history) - patience is 0 and the original code
    # silently trained for 0 epochs, defeating the whole point.
    extra_epochs = max(1, len(h.history['acc']) - patience)
    model.fit(x12, y12, batch_size, extra_epochs, verbose)
    print(model.evaluate(x2, y2, batch_size, verbose))
# Run the experiment for validation-set ratios 10% through 90%.
for ratio_tenths in range(1, 10):
    experiment(ratio_tenths / 10)
文本分类示例
from gensim.models import Word2Vec
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, GRU
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from jieba import cut
from data10 import load_xy # https://download.csdn.net/download/Yellow_python/12862983
maxlen = 200 # padded sequence length
batch_size = 256 # mini-batch size
verbose = 0 # verbosity level for training output (0 = silent)
callbacks = [EarlyStopping('val_acc', patience=1)] # stop after 1 epoch without val_acc improvement
C = ['science', 'car', 'finance', 'sports', 'military', 'medicine', 'entertainment', 'politics', 'education', 'fashion']
num_classes = len(C)
# Load corpus: all raw texts x, plus pre-split train/test texts and labels.
x, (x1, x2, y1, y2) = load_xy()
y1 = to_categorical([C.index(i) for i in y1], num_classes) # label names -> one-hot
y2 = to_categorical([C.index(i) for i in y2], num_classes)
# Train skip-gram (sg=1) word vectors on the jieba-segmented corpus.
# NOTE(review): wv.index2word is the pre-4.0 gensim attribute
# (renamed index_to_key in 4.0) — confirm gensim < 4 is pinned.
wv = Word2Vec([list(cut(t)) for t in x], window=7, sg=1).wv
vectors = wv.vectors
w2i = {w: i for i, w in enumerate(wv.index2word)} # word -> row index in vectors
# Convert each text into a sequence of word vectors (OOV words dropped),
# pre-padded/truncated to maxlen.
x1 = pad_sequences([[vectors[w2i[w]] for w in cut(t) if w in w2i] for t in x1], maxlen, dtype='float')
x2 = pad_sequences([[vectors[w2i[w]] for w in cut(t) if w in w2i] for t in x2], maxlen, dtype='float')
def experiment(validation_size=.1):
    """Train a GRU text classifier with early stopping, then fold the
    validation split back into training and evaluate again.

    Args:
        validation_size: fraction of the training data held out for
            validation (and for the second training pass).

    Prints the test-set [loss, acc] both before and after the extra
    training pass on the validation data.
    """
    # Hold out part of the training data as a validation set
    # (fixed random_state so runs with different ratios are comparable).
    x11, x12, y11, y12 = train_test_split(x1, y1, test_size=validation_size, random_state=7)
    # Build the model: a single GRU layer feeding a softmax classifier.
    model = Sequential()
    model.add(GRU(64))
    model.add(Dense(units=num_classes, activation='softmax'))
    model.compile('adam', 'categorical_crossentropy', ['acc'])
    # Train with early stopping on the validation split
    # (positional args: batch_size, epochs, verbose, callbacks).
    history = model.fit(x11, y11, batch_size, 99, verbose, callbacks,
                        validation_data=(x12, y12))
    # Evaluate on the untouched test set.
    print(model.evaluate(x2, y2, batch_size, verbose), validation_size, len(history.history['acc']))
    # Fold the validation split back in for the epochs that improved.
    # FIX: clamp to at least 1 epoch — if early stopping fires after the
    # first epoch, len(history) - 1 is 0 and the original code silently
    # trained for 0 epochs, defeating the whole point.
    extra_epochs = max(1, len(history.history['acc']) - 1)
    model.fit(x12, y12, batch_size, extra_epochs, verbose)
    print(model.evaluate(x2, y2, batch_size, verbose))
# Run the experiment for validation-set ratios 10% through 90%.
for ratio_tenths in range(1, 10):
    experiment(ratio_tenths / 10)
结果及结论
附1组结果
结论
图像识别结果有提升,虽然非常不稳定但可控;建议选择10%作为验证集,并在提前停止后将验证集加入训练至少1轮
文本分类结果有明显提升,不稳定但可控
无论哪种分类,训练集提前停止后,再用验证集训练至少一轮,都能有效提升模型准确率