文章目录
一、过拟合与欠拟合简述
欠拟合问题,根本的原因是模型复杂度过低,导致拟合的函数无法满足训练集,误差较大。
碰到欠拟合问题可以尝试将模型复杂度增加一下,例如:堆叠更多的层数,增加每一层的单元数量,如果有所改善,说明需要增加模型的复杂度
过拟合问题,根本原因是模型复杂度过高,导致拟合函数接近完美拟合训练集,误差很小。
但是由于使用的模型复杂度大于合理的模型复杂度,过拟合训练集,会使得模型在训练集上效果特别好,但是在测试集上效果变差。
模型的次方越高,模型的表达能力越强。
reduce overfitting 手段
-
More Data—更多的数据
对于图片来讲,可以通过增加噪声、数据增强(旋转、裁剪、光照、翻转)等来增加训练数据量
-
降低模型复杂度 (一般不这么做)
shallow——衡量数据量与网络规模,使用较浅的网络;regularization——正则化:正则化网络能够对训练数据中的常见数据构造出相对简单的模型,并且对训练数据中各种各样的噪声有良好的抵抗能力,提升了模型的泛化能力。可以理解为一种能使噪声数据不会过多影响网络输出的方法
-
Dropout—增加鲁棒性
-
data augmentation—数据增强
-
Early Stopping—使用验证集做早停
二、降低过拟合—交叉验证
1.train_val划分
# train_val
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from tensorflow.keras import datasets, layers, optimizers, Sequential, metrics
def preprocess(x, y):
    """Convert one raw MNIST sample into model-ready tensors.

    Works on a single image/label pair, not on a batch.

    Args:
        x: One image; cast to float32, divided by 255 and flattened to a
            vector of 28 * 28 elements.
        y: The label of that image; cast to int32 and one-hot encoded
            with depth 10.

    Returns:
        The ``(features, one_hot_label)`` pair.
    """
    pixels = tf.reshape(tf.cast(x, dtype=tf.float32) / 255., [28 * 28])
    label = tf.one_hot(tf.cast(y, dtype=tf.int32), depth=10)
    return pixels, label
# Mini-batch size shared by the training and validation pipelines.
batchsz = 128

# The second split returned by load_data() is used as the validation set here.
(x, y), (x_val, y_val) = datasets.mnist.load_data()
print('datasets:', x.shape, y.shape, x.min(), x.max())

# Training pipeline: per-sample preprocessing, shuffle, then batch.
db = tf.data.Dataset.from_tensor_slices((x, y))
db = db.map(preprocess).shuffle(60000).batch(batchsz)

# Validation pipeline: same preprocessing, no shuffling.
ds_val = tf.data.Dataset.from_tensor_slices((x_val, y_val))
ds_val = ds_val.map(preprocess).batch(batchsz)

# Peek at one batch to check the shapes produced by the pipeline.
sample = next(iter(db))
print(sample[0].shape, sample[1].shape)

# Fully connected classifier; the last layer emits raw logits (no softmax).
network = Sequential([
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(10),
])
network.build(input_shape=(None, 28 * 28))
network.summary()

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
network.compile(
    optimizer=optimizers.Adam(learning_rate=0.01),
    loss=tf.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# validation_steps=2 limits each end-of-epoch validation pass to 2 batches.
network.fit(db, epochs=5, validation_data=ds_val, validation_steps=2)

# Full pass over the validation set.
network.evaluate(ds_val)

# Compare predictions with ground truth on one validation batch.
sample = next(iter(ds_val))
x = sample[0]
y = sample[1]  # one-hot
pred = network.predict(x)  # [b, 10]
# Convert one-hot labels and logits back to class indices.
y = tf.argmax(y, axis=1)
pred = tf.argmax(pred, axis=1)
print(pred)
print(y)
2.train_val_test划分
利用train set来训练模型参数,利用val set来选择在哪个时间上停止掉,选择哪个时间戳上的参数,利用test set来做一个测试。
不能利用 test set 来挑选参数(参数只能依据 val set 来挑选),否则会造成数据污染,即相当于利用了测试集的先验知识来挑选参数。
# train_val_test
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from tensorflow.keras import datasets, layers, optimizers, Sequential, metrics
def preprocess(x, y):
    """Turn a single raw MNIST image/label pair into training tensors.

    The input is one sample, not a batch.

    Args:
        x: One image; converted to float32, divided by 255 and reshaped
            to a flat vector of 28 * 28 values.
        y: Its label; converted to int32 and one-hot encoded (depth 10).

    Returns:
        Tuple of the flattened image and the one-hot label.
    """
    scaled = tf.cast(x, dtype=tf.float32) / 255.
    flat = tf.reshape(scaled, [28 * 28])
    one_hot = tf.one_hot(tf.cast(y, dtype=tf.int32), depth=10)
    return flat, one_hot
# Mini-batch size shared by all three pipelines.
batchsz = 128

# The second split returned by load_data() is kept aside as the test set.
(x, y), (x_test, y_test) = datasets.mnist.load_data()
print('datasets:', x.shape, y.shape, x.min(), x.max())

# Carve a validation set out of the 60000 training samples using a random
# permutation of the indices: first 50000 -> train, last 10000 -> val.
# The data could also be split directly, without shuffling:
#   x_train, x_val = tf.split(x, num_or_size_splits=[50000, 10000])
#   y_train, y_val = tf.split(y, num_or_size_splits=[50000, 10000])
idx = tf.range(60000)
idx = tf.random.shuffle(idx)
x_train, y_train = tf.gather(x, idx[:50000]), tf.gather(y, idx[:50000])
x_val, y_val = tf.gather(x, idx[-10000:]), tf.gather(y, idx[-10000:])
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

db_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
db_train = db_train.map(preprocess).shuffle(50000).batch(batchsz)

# Validation data is only evaluated, so it is not shuffled.
db_val = tf.data.Dataset.from_tensor_slices((x_val, y_val))
db_val = db_val.map(preprocess).batch(batchsz)

db_test = tf.data.Dataset.from_tensor_slices((x_test, y_test))
db_test = db_test.map(preprocess).batch(batchsz)

# Peek at one batch to check the shapes produced by the pipeline.
sample = next(iter(db_train))
print(sample[0].shape, sample[1].shape)

# Fully connected classifier; the last layer emits raw logits (no softmax).
network = Sequential([
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(10),
])
network.build(input_shape=(None, 28 * 28))
network.summary()

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
network.compile(
    optimizer=optimizers.Adam(learning_rate=0.01),
    loss=tf.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# validation_freq=2 runs validation on every second epoch.
network.fit(db_train, epochs=6, validation_data=db_val, validation_freq=2)

# The test set is touched only once, after training is finished.
print('Test performance:')
network.evaluate(db_test)

# Compare predictions with ground truth on one test batch.
sample = next(iter(db_test))
x = sample[0]
y = sample[1]  # one-hot
pred = network.predict(x)  # [b, 10]
# Convert one-hot labels and logits back to class indices.
y = tf.argmax(y, axis=1)
pred = tf.argmax(pred, axis=1)
print(pred)
print(y)
3.K—fold 交叉验证
既防止了死记硬背,又将数据充分利用了起来。
from sklearn.model_selection import KFold

# Print the train/validation index arrays of each of the 5 folds.
# random_state is omitted: it has no effect when shuffle=False, and recent
# scikit-learn versions raise ValueError if it is set in that case.
for train_idx, val_idx in KFold(n_splits=5, shuffle=False).split(x):
    print(train_idx)
    print(val_idx)
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, layers, optimizers, Sequential, metrics
from sklearn.model_selection import KFold
import numpy as np
def preprocess(x, y):
    """Prepare one MNIST sample (image and label) for the network.

    Expects a single image, not a batch.

    Args:
        x: One image; cast to float32, divided by 255 and flattened into
            a vector of 28 * 28 values.
        y: The matching label; cast to int32 and expanded to a one-hot
            vector of depth 10.

    Returns:
        ``(flattened_image, one_hot_label)``.
    """
    image = tf.cast(x, dtype=tf.float32) / 255.
    image = tf.reshape(image, [28 * 28])
    target = tf.one_hot(tf.cast(y, dtype=tf.int32), depth=10)
    return image, target
# Mini-batch size shared by all pipelines.
batchsz = 128

# The second split returned by load_data() is kept aside as the test set.
(x, y), (x_test, y_test) = datasets.mnist.load_data()
print('datasets:', x.shape, y.shape, x.min(), x.max())

db_test = tf.data.Dataset.from_tensor_slices((x_test, y_test))
db_test = db_test.map(preprocess).batch(batchsz)

# Test-set accuracy of the model trained on each fold.
accuracy = []
kfold = KFold(n_splits=5, random_state=0, shuffle=True)
for train_idx, val_idx in kfold.split(x, y):
    # Size the shuffle buffer from the fold itself so the whole training
    # subset is shuffled, whatever the fold size is.
    db_train = tf.data.Dataset.from_tensor_slices((x[train_idx, ...], y[train_idx, ...]))
    db_train = db_train.map(preprocess).shuffle(len(train_idx)).batch(batchsz)

    # Validation data is only evaluated, so it is not shuffled.
    db_val = tf.data.Dataset.from_tensor_slices((x[val_idx, ...], y[val_idx, ...]))
    db_val = db_val.map(preprocess).batch(batchsz)

    # A fresh network per fold, so no weights carry over between folds.
    network = Sequential([
        layers.Dense(256, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(10),
    ])
    network.build(input_shape=(None, 28 * 28))

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    network.compile(
        optimizer=optimizers.Adam(learning_rate=0.01),
        loss=tf.losses.CategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    # Early stopping on validation accuracy.
    # NOTE(review): with patience=5 and epochs=6 this can hardly shorten
    # training; raise epochs or lower patience to make it effective.
    early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)
    network.fit(db_train, epochs=6, validation_data=db_val, validation_freq=1,
                verbose=1, callbacks=[early_stop])

    # Evaluate once and reuse the [loss, accuracy] result instead of
    # re-running the full test pass for every use.
    result = network.evaluate(db_test)
    print('model evaluation', result)
    accuracy.append(result[1])

# Mean of the per-fold accuracies measured on the test set.
print('交叉验证的平均准确率为', np.mean(accuracy))
datasets: (60000, 28, 28) (60000,) 0 255
Epoch 1/6
375/375 [==============================] - 1s 3ms/step - loss: 0.3016 - accuracy: 0.9095 - val_loss: 0.1987 - val_accuracy: 0.9384
Epoch 2/6
375/375 [==============================] - 1s 3ms/step - loss: 0.1428 - accuracy: 0.9600 - val_loss: 0.1331 - val_accuracy: 0.9638
Epoch 3/6
375/375 [==============================] - 1s 3ms/step - loss: 0.1172 - accuracy: 0.9673 - val_loss: 0.1329 - val_accuracy: 0.9635
Epoch 4/6
375/375 [==============================] - 1s 3ms/step - loss: 0.0994 - accuracy: 0.9725 - val_loss: 0.1307 - val_accuracy: 0.9647
Epoch 5/6
375/375 [==============================] - 1s 3ms/step - loss: 0.0859 - accuracy: 0.9764 - val_loss: 0.1159 - val_accuracy: 0.9705
Epoch 6/6
375/375 [==============================] - 1s 3ms/</