1、导入数据并可视化
data = sio.loadmat(文件路径)
X = data["X"] # (12,1)
y = data["y"] # (12,1)
Xval = data["Xval"]
yval = data["yval"]
Xtest = data["Xtest"]
ytest = data["ytest"]
# 可视化训练集
plt.scatter(X, y, marker="o", c="b")
plt.xlabel("Change in water level (x)")
plt.ylabel("Water flowing out of the dam (y)")
plt.title("linear regression")
2、正则化线性回归
def cost(theta, X, y, lamda):
part1 = np.mean(pow(X.dot(theta) - y.ravel(), 2))/2
part2 = np.sum(np.delete(theta*theta, 0, axis=0))*lamda/(2*X.shape[0])
return part1 + part2
def gradient(theta, X, y, lamda):
m = X.shape[0]
part1 = (X.dot(theta) - y.ravel()).dot(X)/m
part2 = (lamda/m) * theta
return part1 + part2
拟合线性回归
# 线性拟合
theta = np.ones((2,))
X = np.insert(X, 0, 1, axis=1)
print(cost(theta, X, y, 1)) # 303.9931922202643
res = opt.minimize(fun=cost, x0=theta, args=(X, y, 0), method="TNC", jac=gradient)
print(res.x) # [13.08790398 0.36777923]
plt.plot([i for i in range(-50, 40)], [res.x[0] + res.x[1]*i for i in range(-50, 40)], c="r")
plt.show()
3、学习曲线
(使用训练集的子集来拟合模型;计算训练代价和验证集代价时,没有用正则化
# 画出学习曲线
Xval = np.insert(Xval, 0, 1, axis=1)
error_train = []
error_validation = []
for i in range(X.shape[0]):
subX = X[:i+1]
suby = y[:i+1]
res = opt.minimize(fun=cost, x0=theta, args=(subX, suby, 0), method="TNC", jac=gradient)
t = res.x
error_train.append(cost(t, subX, suby, 0))
error_validation.append((cost(t, Xval, yval, 0)))
plt.plot([i for i in range(X.shape[0])], error_train, label="training set error")
plt.plot([i for i in range(X.shape[0])], error_validation, label="cross validation set error")
plt.legend(loc="upper right")
plt.xlabel("m(numbers of training set)")
plt.title("learning curves")
plt.show() # 欠拟合
4、多项式拟合
线性回归对于现有数据过于简单,会欠拟合,我们需要多添加一些特征。
# 添加多次项、添加特征
def poly_features(X, power_max):
_X = X.reshape(-1, 1)
res = np.ones((X.shape[0], 1))
for i in range(power_max):
res = np.concatenate((res, np.power(_X, i+1)), axis=1)
return res[..., 1:]
def normalize_features(X, means, stds):
return (X - means) / stds
# 多项式拟合
power_max = 6
lamda = 0
features = poly_features(data["X"], power_max)
means = np.mean(features, axis=0)
stds = np.std(features, axis=0, ddof=1)
normalize_feature = normalize_features(features, means, stds)
normalize_X = np.insert(normalize_feature, 0, 1, axis=1)
res = opt.minimize(fun=cost, x0=np.ones((power_max+1)), args=(normalize_X, y, lamda), method="TNC", jac=gradient)
plt.scatter(data['X'], y, marker='x', c='r')
plt.xlabel("Change in water level (x)")
plt.ylabel("Water flowing out of the dam (y)")
plt.title("polynomial(8) regression")
X = np.linspace(-100, 100, 50)
normalized_X = normalize_features(poly_features(X, power_max), means, stds)
normalized_X = np.insert(normalized_X, 0, 1, axis=1)
plt.plot(X, normalized_X.dot(res.x))
plt.show()
5、学习曲线
# 训练集变化对应的学习曲线
error_train = []
error_validation = []
# 直接利用全部训练集的归一化参数,直接将训练集和验证集数据全部归一化,以后直接在里面取即可
train_features = poly_features(data["X"], power_max)
train_normalized_features = normalize_features(train_features, means, stds)
train_normalized_X = np.insert(train_normalized_features, 0, 1, axis=1)
val_features = poly_features(data["Xval"], power_max)
val_normalized_features = normalize_features(val_features, means, stds)
val_normalized_X = np.insert(val_normalized_features, 0, 1, axis=1)
yval = data["yval"]
for i in range(1, train_normalized_X.shape[0]):
subX = train_normalized_X[:i+1]
suby = y[:i+1]
res = opt.minimize(fun=cost, x0=np.ones((power_max+1,)), args=(subX, suby, 1), method="TNC", jac=gradient)
t = res.x
error_train.append(cost(t, subX, suby, 0))
error_validation.append((cost(t, val_normalized_X, yval, 0)))
plt.plot([i for i in range(1, train_normalized_X.shape[0])], error_train, label="training set error")
plt.plot([i for i in range(1, train_normalized_X.shape[0])], error_validation, label="cross validation set error")
plt.legend(loc="upper right")
plt.xlabel("m(numbers of training set)")
plt.title("learning curves")
plt.show()
6、选取不同lambda值时的学习曲线
# lamda变化对应的学习曲线
ls = [0, 0.01, 0.02, 0.04, 0.08, 0.15, 0.32, 0.64, 1.28, 2.56, 3, 5.12, 10]
error_train = []
error_validation = []
for l in ls:
res = opt.minimize(fun=cost, x0=np.ones((power_max + 1,)), args=(train_normalized_X, y, l),
method="TNC", jac=gradient)
error_train.append(cost(res.x, train_normalized_X, y, 0))
error_validation.append(cost(res.x, val_normalized_X, yval, 0))
plt.plot([i for i in ls], error_train, label="training set error")
plt.plot([i for i in ls], error_validation, label="cross validation set error")
plt.legend(loc="upper right")
plt.xlabel("lambda")
plt.ylabel("error")
plt.title("Selecting λ using a cross validation set")
plt.show()
7、计算测试集的损失值
# 计算测试集的损失值
test_features = poly_features(Xtest, power_max)
test_normalized_features = normalize_features(test_features, means, stds)
test_normalized_X = np.insert(test_normalized_features, 0, 1, axis=1)
res = opt.minimize(fun=cost, x0=np.ones((power_max + 1,)), args=(train_normalized_X, y, 3),
method="TNC", jac=gradient)
print(cost(res.x, test_normalized_X, ytest, 0)) # 9.72989416504514
8、随机选取数据
# 从数据集中随机取出n组
def randomly_select(data, n):
res = np.array(data)
m = data.shape[0]
for i in range(m - n):
index = np.random.randint(0, res.shape[0] - 1)
res = np.delete(res, index, axis=0)
return res
# 随机选择数据
error_train = []
error_validation = []
print(train_normalized_X.shape, val_normalized_X.shape) # (12, 7) (21, 7)
for i in range(X.shape[0]):
Xy = randomly_select(np.concatenate((train_normalized_X, y), axis=1), i+1)
subtrainX = Xy[..., :-1]
subtrainy = Xy[..., -1]
res = opt.minimize(fun=cost, x0=np.ones((power_max + 1,)), args=(subtrainX, subtrainy, 0.01), method="TNC",
jac=gradient)
t = res.x
error_train.append(cost(t, subtrainX, subtrainy, 0.01))
Xy = randomly_select(np.concatenate((val_normalized_X, yval), axis=1), i + 1)
subvalX = Xy[..., :-1]
subvaly = Xy[..., -1]
error_validation.append(cost(t, subvalX, subvaly, 0.01))
plt.plot([i for i in range(X.shape[0])], error_train, label="training set error")
plt.plot([i for i in range(X.shape[0])], error_validation, label="cross validation set error")
plt.legend(loc="upper right")
plt.xlabel("m(numbers of training set)")
plt.title("learning curves(randomly select)")
plt.show()