20.应用机器学习的一些建议
1.导入包
import numpy as np
%matplotlib widget
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import relu,linear
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
from public_tests_a1 import *
tf.keras.backend.set_floatx('float64')
from assigment_utils import *
tf.autograph.set_verbosity(0)
2. 评估学习算法(以线性回归为例)
在部署模型之前,如何在新数据上测试它的性能?
答案分为两部分:
- 将原始数据集拆分为“训练”集和“测试”集:
  - 使用训练数据拟合模型参数;
  - 使用测试数据评估模型在新数据上的表现。
- 开发一个误差函数来评估你的模型。
2.1 分离数据集
把数据集分成测试集和训练集
# Build a small synthetic dataset with the course utility (see assigment_utils).
X, y, x_ideal, y_ideal = gen_data(18, 2, 0.7)
print("X.shape", X.shape, "y.shape", y.shape)

# Hold out one third of the examples as a test set; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)
print("X_train.shape", X_train.shape, "y_train.shape", y_train.shape)
print("X_test.shape", X_test.shape, "y_test.shape", y_test.shape)
X.shape (18,) y.shape (18,)
X_train.shape (12,) y_train.shape (12,)
X_test.shape (6,) y_test.shape (6,)
可视化数据集
# Visualize the split: ideal (noise-free) curve plus train/test scatter points.
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
ax.plot(x_ideal, y_ideal, "--", color="orangered", label="y_ideal", lw=1)
ax.scatter(X_train, y_train, color="red", label="train")
ax.scatter(X_test, y_test, color=dlc["dlblue"], label="test")
ax.set_title("Training, Test", fontsize=14)
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.legend(loc='upper left')
plt.show()
2.2 误差计算
当评估线性回归模型时,对预测值与目标值之差的平方取平均值。
$$J_\text{test}(\mathbf{w},b) = \frac{1}{2m_\text{test}}\sum_{i=0}^{m_\text{test}-1} \left( f_{\mathbf{w},b}\left(\mathbf{x}^{(i)}_\text{test}\right) - y^{(i)}_\text{test} \right)^2 \tag{1}$$
# UNQ_C1
# GRADED CELL: eval_mse
def eval_mse(y, yhat):
    """
    Calculate the mean squared error on a data set.

    Args:
      y    : (ndarray Shape (m,) or (m,1)) target value of each example
      yhat : (ndarray Shape (m,) or (m,1)) predicted value of each example
    Returns:
      err : (scalar) 1/(2m) * sum((yhat - y)^2)
    """
    # Flatten so (m,1) column vectors behave like (m,) vectors; the previous
    # element-wise loop returned a shape-(1,) array for 2-D input even though
    # the docstring promises a scalar.
    y = np.asarray(y).reshape(-1)
    yhat = np.asarray(yhat).reshape(-1)
    m = len(y)
    ### START CODE HERE ###
    # Vectorized squared-error sum; float() guarantees a Python scalar.
    # Empty input still raises ZeroDivisionError, matching the loop version.
    err = float(np.sum((yhat - y) ** 2) / (2 * m))
    ### END CODE HERE ###
    return err
# Example call: quick sanity check of eval_mse on two hand-made vectors.
y_hat = np.array([2.4, 4.2])
y_tmp = np.array([2.3, 4.1])
# NOTE(review): arguments are passed as (yhat, y) relative to the signature
# eval_mse(y, yhat); harmless here because squared error is symmetric.
print(eval_mse(y_hat, y_tmp))
2.3 比较模型在训练集和测试集上的表现
让我们建立一个高次多项式模型来最小化训练误差。这将使用“sklearn”中的linear_regression函数。如果您想查看详细信息,代码位于导入的实用程序文件中。以下步骤为:
- 创建并拟合模型。(“fit拟合”是训练或运行梯度下降的另一个名称)。
- 计算训练数据上的误差。
- 计算测试数据的误差。
# Fit a degree-10 polynomial regression model (lin_model is the course's
# sklearn wrapper from assigment_utils).
degree = 10
lmodel = lin_model(degree)
lmodel.fit(X_train, y_train)

# Error on the data the model was fit to ...
yhat = lmodel.predict(X_train)
err_train = lmodel.mse(y_train, yhat)

# ... versus error on held-out data it has never seen.
yhat = lmodel.predict(X_test)
err_test = lmodel.mse(y_test, yhat)

print(f"training err {err_train:0.2f}, test err {err_test:0.2f}")
#训练集上的计算误差显著小于测试集的计算误差。
training err 58.01, test err 171215.01
为什么会发生这种情况?原因:1)过拟合(overfitting),2)方差高(high variance),3)“泛化”差(generalizes poorly)。
def plt_train_test(X_train, y_train, X_test, y_test, x, y_pred, x_ideal, y_ideal, degree):
    """Plot train/test scatter, the model's prediction curve, and the ideal curve.

    Args:
      X_train, y_train : training examples (red scatter)
      X_test, y_test   : test examples (blue scatter)
      x, y_pred        : dense grid and model predictions over it (prediction line)
      x_ideal, y_ideal : noise-free curve the data was generated from
      degree           : polynomial degree, used only in the legend label
    """
    fig, ax = plt.subplots(1,1, figsize=(4,4))
    # Hide the interactive figure chrome (assumes the %matplotlib widget
    # backend from the import cell — TODO confirm).
    fig.canvas.toolbar_visible = False
    fig.canvas.header_visible = False
    fig.canvas.footer_visible = False
    ax.set_title("Poor Performance on Test Data",fontsize = 12)
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.scatter(X_train, y_train, color = "red", label="train")
    ax.scatter(X_test, y_test, color = dlc["dlblue"], label="test")
    # Freeze the axis limits at the data's extent so the high-degree
    # polynomial's extreme values outside the data range don't rescale the view.
    ax.set_xlim(ax.get_xlim())
    ax.set_ylim(ax.get_ylim())
    ax.plot(x, y_pred, lw=0.5, label=f"predicted, degree={degree}")
    ax.plot(x_ideal, y_ideal, "--", color = "orangered", label="y_ideal", lw=1)
    ax.legend(loc='upper left')
    plt.tight_layout()
    plt.show()
# plot predictions over data range
# Dense 100-point grid spanning [0, max(X)] for a smooth prediction curve.
x = np.linspace(0,int(X.max()),100) # predict values for plot
# Reshape predictions into a column vector before plotting.
y_pred = lmodel.predict(x).reshape(-1,1)
plt_train_test(X_train, y_train, X_test, y_test, x, y_pred, x_ideal, y_ideal, degree)
测试集错误表明此模型在新数据上无法正常工作。如果您使用测试错误来指导模型的改进,那么模型将在测试数据上表现良好……但测试数据旨在表示新数据。
您还需要另一组数据交叉验证集测试新的数据性能,来指导模型的改进
下表所示的训练、交叉验证和测试集的分布是典型的分布,但可以根据可用数据的数量而变化。
数据 | 占总数的百分比 | 说明 |
---|---|---|
training | 60 | 用于在训练或拟合中调整模型参数 $w$ 和 $b$ 的数据 |
交叉验证cross-validation | 20 | 用于调整其他模型参数的数据,如多项式度、正则化或神经网络结构 |
test | 20 | 调整后用于测试模型的数据,以衡量新数据的性能 |
让我们在下面生成三个数据集。我们将再次使用 sklearn 中的 train_test_split,但将调用它两次以获得三个分割。
# Larger synthetic dataset for the three-way split (course utility).
X, y, x_ideal, y_ideal = gen_data(40, 5, 0.7)
print("X.shape", X.shape, "y.shape", y.shape)

# First split keeps 60% for training; the second split halves the remaining
# 40% into cross-validation and test sets (20% / 20% of the total).
X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.40, random_state=1)
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, test_size=0.50, random_state=1)

print("X_train.shape", X_train.shape, "y_train.shape", y_train.shape)
print("X_cv.shape", X_cv.shape, "y_cv.shape", y_cv.shape)
print("X_test.shape", X_test.shape, "y_test.shape", y_test.shape)
X.shape (40,) y.shape (40,)
X_train.shape (24,) y_train.shape (24,)
X_cv.shape (8,) y_cv.shape (8,)
X_test.shape (8,) y_test.shape (8,)
3.Bias and Variance
显然,多项式模型的阶数太高。你如何选择一个好的值?事实证明,如图所示,训练和交叉验证性能可以提供指导。通过尝试一系列degree values(多项式的次数值),可以评估训练和交叉验证性能。当程度变得太大时,交叉验证性能将开始相对于训练性能下降。让我们在我们的例子中尝试一下。
3.1 可视化数据集
把训练集和交叉验证集可视化
fig, ax = plt.subplots(1,1,figsize=(4,4))
ax.plot(x_ideal, y_ideal, "--", color = "orangered", label="y_ideal", lw=1)
ax.set_tit