防止过拟合的方法
- 增加更多的数据 (代价大)
- 降低模型的复杂度 减小网络层数 正则化
- Dropout 丢弃
- 数据增强
- 提前结束
Regularization 正则化 (weight Decay)权值衰减
- L1-regularization
- L2-regularization
# Binary classifier: two 16-unit ReLU hidden layers whose kernels carry an
# L2 penalty (factor 0.001), followed by a single sigmoid output unit.
l2_model = keras.models.Sequential([
    keras.layers.Dense(
        16,
        activation=tf.nn.relu,
        kernel_regularizer=keras.regularizers.l2(0.001),
        input_shape=(NUM_WORDS,),
    ),
    keras.layers.Dense(
        16,
        activation=tf.nn.relu,
        kernel_regularizer=keras.regularizers.l2(0.001),
    ),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])
动量与学习率
- momentum 动量(惯性)
- learning rate decay 学习率衰减
动量
添加动量
有动能
# Configuring momentum in code. The three assignments below are alternatives:
# each one rebinds `optimizer`, so if all are executed only the last remains.

# SGD with an explicit momentum term.
optimizer = SGD(
    learning_rate=0.02,
    momentum=0.9,
)
# RMSprop, also given an explicit momentum term.
optimizer = RMSprop(
    learning_rate=0.02,
    momentum=0.9,
)
# Adam is configured here through beta_1 / beta_2 rather than `momentum`.
optimizer = Adam(
    learning_rate=0.02,
    beta_1=0.9,
    beta_2=0.999,
)
学习率 调整
- 过小
- 刚好
- 太大
学习率自动衰减
学习率根据 函数 随着训练过程而衰减
# Linear learning-rate decay: start at 0.2 and shrink a little every epoch.
optimizer = SGD(learning_rate=0.2)
for epoch in range(100):
    # compute the loss
    # update the optimizer: the rate falls linearly from 0.2 (epoch 0)
    # to 0.002 (epoch 99). The attribute must be spelled `learning_rate`;
    # a misspelled name would only create an unused attribute and the
    # optimizer would keep its initial rate.
    optimizer.learning_rate = 0.2 * (100 - epoch) / 100
    # update the weights
提前停止与丢弃
Early stopping 提前停止
- 使用验证集 选择参数
- 监测 验证集 正确率结果
- 在验证集正确率最高时停止 (正确率连续下滑达到设定次数后停止)
![提前停止](./assets/提前停止.jpg)
dropout 部分丢弃
- 学习少一点 学的更好 鲁棒性更好
- 每个连接以 [0, 1] 之间的某个概率被丢弃
# Five-layer MLP; a Dropout layer with rate 0.5 follows each of the two
# widest hidden layers. The last layer has 10 units and no activation.
network = Sequential([
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(10),
])
训练与测试的分开
# The `training` flag of the model call selects the mode of the forward pass.
for step, (x, y) in enumerate(db):
    with tf.GradientTape() as tape:
        # flatten every sample to a 28*28 vector
        x = tf.reshape(x, (-1, 28 * 28))
        # forward pass in training mode
        out = network(x, training=True)
    # test: forward pass in non-training mode
    out = network(x, training=False)
Stochastic 梯度下降
Stochastic loss的计算
不再对全部数据求 loss 的均值 => 每次从全部数据中随机取一个 batch,在该 batch 上求均值
使用 Dropout,动量,正则化实例
import tensorflow as tf
from tensorflow.keras import datasets, layers, optimizers, Sequential
# List the physical GPU devices.
gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
for gpu in gpus:
    # Turn on memory growth for this device.
    tf.config.experimental.set_memory_growth(gpu, enable=True)
# Pre-processing
def pre_process(x, y):
    """Cast `x` to float32 and divide by 255; cast `y` to int32.

    Returns the converted ``(x, y)`` pair.
    """
    return tf.cast(x, dtype=tf.float32) / 255., tf.cast(y, dtype=tf.int32)
batch_size = 128

# Load MNIST as (train, validation) pairs and print the training set's
# shapes and value range.
(x, y), (x_val, y_val) = datasets.mnist.load_data()
print('数据集:', x.shape, y.shape, x.min(), x.max())

# Training pipeline: pre-process, shuffle, batch, then repeat 10 times.
db = tf.data.Dataset.from_tensor_slices((x, y))
db = db.map(pre_process)
db = db.shuffle(60000)
db = db.batch(batch_size)
db = db.repeat(10)

# Validation pipeline: pre-process and batch only.
ds_val = tf.data.Dataset.from_tensor_slices((x_val, y_val))
ds_val = ds_val.map(pre_process)
ds_val = ds_val.batch(batch_size)
# Assemble the network, with Dropout after the two widest hidden layers.
network = Sequential([
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),  # 0.5 rate to drop
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),  # 0.5 rate to drop
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(10),
])

# Build the network for flattened 28*28 inputs and print its summary.
network.build(input_shape=(None, 28 * 28))
network.summary()
# Set up the optimizer (Adam, which brings its own momentum-style terms
# beta_1 / beta_2). Use `learning_rate=`: the short `lr=` alias is deprecated
# and is not accepted by current Keras optimizers.
optimizer = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999)
def _validation_accuracy(model, dataset, training):
    """Return the fraction of samples in `dataset` that `model` gets right.

    `dataset` yields ``(images, labels)`` batches; `training` is forwarded to
    the model call (True keeps dropout switched on, False switches it off).
    """
    total, total_correct = 0., 0
    for x_batch, y_batch in dataset:
        x_batch = tf.reshape(x_batch, (-1, 28 * 28))
        out = model(x_batch, training=training)
        pred = tf.cast(tf.argmax(out, axis=1), dtype=tf.int32)
        correct = tf.equal(pred, y_batch)
        total_correct += tf.reduce_sum(tf.cast(correct, dtype=tf.int32)).numpy()
        total += x_batch.shape[0]
    return total_correct / total


for step, (x, y) in enumerate(db):
    with tf.GradientTape() as tape:
        # Forward pass
        x = tf.reshape(x, (-1, 28 * 28))
        out = network(x, training=True)
        y_one_hot = tf.one_hot(y, depth=10)
        # Loss
        loss = tf.reduce_mean(
            tf.losses.categorical_crossentropy(y_one_hot, out, from_logits=True))
        # Regularization term: sum of the L2 losses of all trainable variables
        loss_regularization = tf.reduce_sum(
            tf.stack([tf.nn.l2_loss(p) for p in network.trainable_variables]))
        # Add the weighted regularization term to the loss
        loss = loss + 0.0001 * loss_regularization
    # Compute gradients and update the parameters
    grads = tape.gradient(loss, network.trainable_variables)
    optimizer.apply_gradients(zip(grads, network.trainable_variables))
    if step % 100 == 0:
        print(step, 'loss:', float(loss), 'loss_regularization:', float(loss_regularization))
    # Evaluate on the validation set
    if step % 500 == 0:
        # The evaluation runs in a helper with its own loop variables, so
        # `step` below is still the training step and not the index of the
        # last validation batch.
        # Training mode: dropout stays on
        print(step, '验证集正确率(训练模式):',
              _validation_accuracy(network, ds_val, training=True))
        # Non-training mode: dropout is off
        print(step, '验证集正确率(测试模式):',
              _validation_accuracy(network, ds_val, training=False))