线性回归
# an example
import numpy as np
import matplotlib.pyplot as plt
# Synthetic linear data: y = 4 + 3x plus Gaussian noise.
X = 2 * np.random.rand(100, 1)
y = 3 * X + 4 + np.random.randn(100, 1)
plt.plot(X, y, 'k.')
plt.show()

# Normal equation: theta = (X_b^T X_b)^-1 X_b^T y.
X_b = np.hstack((np.ones((100, 1)), X))  # prepend the bias column x0 = 1
gram_inverse = np.linalg.inv(X_b.T @ X_b)
theta = gram_inverse @ X_b.T @ y
theta
# array([[3.91711324],
#        [2.934259  ]])
使用sklearn进行线性回归:
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and training can be chained.
lin_reg = LinearRegression().fit(X, y)
lin_reg.intercept_, lin_reg.coef_
# (array([3.91711324]), array([[2.934259]]))
# Scikit-Learn keeps the bias term (intercept_) separate from the feature weights (coef_).
梯度下降
1、批量梯度下降
计算梯度下降的每一步时,都是基于完整的训练集X
eta = 0.1            # learning rate
n_iterations = 1000  # number of gradient steps
m = 100              # number of rows in X_b

theta = np.random.randn(2, 1)  # random initialization
for iteration in range(n_iterations):
    # Gradient of the MSE cost over the whole training set.
    residuals = X_b @ theta - y
    gradients = (2 / m) * (X_b.T @ residuals)
    theta = theta - eta * gradients
theta
# array([[3.91711324],
#        [2.934259  ]])
2、随机梯度下降
每一步在训练集中随机选择一个实例,并基于该单个实例计算梯度
当成本函数非常不规则时, 随机梯度下降可以帮助算法跳出局部最小值。相比批量梯度下降,随机梯度下降对寻找全局最小值更有优势。
随机性的好处在于可以逃离局部最优,但是缺点是永远定位不出最小值
解决方法:逐渐降低学习率——模拟退火
n_epochs = 50
# Learning schedule hyperparameters.
t0 = 5
t1 = 50


def learning_schedule(t):
    """Return the learning rate for step ``t``, computed as ``t0 / (t + t1)``."""
    denominator = t + t1
    return t0 / denominator
theta = np.random.randn(2, 1)  # random initialization

for epoch in range(n_epochs):
    for i in range(m):
        # Draw one training instance at random (slices keep the 2-D shape).
        random_index = np.random.randint(m)
        stop_index = random_index + 1
        xi = X_b[random_index:stop_index]
        yi = y[random_index:stop_index]
        # Gradient estimated from that single instance.
        gradients = 2 * (xi.T @ (xi @ theta - yi))
        # The step index grows across epochs, so the learning rate keeps shrinking.
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
theta
# array([[3.97165339],
#        [2.94092433]])
使用随机梯度下降时, 训练实例必须保持独立同分布(IID),以确保平均而言将参数拉向全局最优值。确保这一点的一个简单方法是在训练过程中对实例进行随机混洗
使用带有Scikit-Learn的随机梯度下降执行线性回归
from sklearn.linear_model import SGDRegressor
# Hyperparameters:
#   max_iter=1000 -> run at most 1000 epochs
#   tol=1e-3      -> stop early once the loss improves by less than 0.001
#   eta0=0.1      -> initial learning rate
#   penalty=None  -> no regularization
# ravel() flattens the column vector y into the 1-D target array passed to fit().
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1).fit(X, y.ravel())
sgd_reg.intercept_, sgd_reg.coef_
# (array([4.37064466]), array([2.85664733]))
3、小批量梯度下降
每一步中,既不是根据完整的训练集(如批量梯度下降),也不是仅基于一个实例(如随机梯度下降)来计算梯度,而是基于一小部分随机选取的实例(称为小批量)来计算梯度
多项式回归
多项式回归:可以用线性模型拟合非线性模型。 将每个特征的幂次方添加为一个新特征, 然后在此拓展训练集上训练一个线性模型
# an example
import numpy as np
import matplotlib.pyplot as plt
# Noisy quadratic data: y = 0.5 x^2 + x + 2 + Gaussian noise, with x drawn from [-3, 3).
m = 100
X = 6 * np.random.rand(m, 1) - 3
noise = np.random.randn(m, 1)
y = 0.5 * X**2 + X + 2 + noise
plt.plot(X, y, 'k.')
plt.show()
from sklearn.preprocessing import PolynomialFeatures
# Add the square of the feature as a second column (no constant bias column).
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit(X).transform(X)
X[0], X_poly[0]
# (array([-2.57177719]), array([-2.57177719, 6.61403794]))
from sklearn.linear_model import LinearRegression
# Plain linear regression trained on the expanded (x, x^2) features.
lin_reg = LinearRegression().fit(X_poly, y)
lin_reg.intercept_, lin_reg.coef_
# (array([2.13509276]), array([[1.00195518, 0.50394306]]))
# Estimated model: y = 0.503 * x^2 + 1.002 * x + 2.135
学习曲线
绘制模型在训练集和验证集上关于训练集大小(或训练迭代)的性能函数
# Regenerate 100 noisy quadratic samples for the learning-curve experiments.
X = 6 * np.random.rand(100, 1) - 3
noise = np.random.randn(100, 1)
y = 0.5 * X**2 + X + 2 + noise
# learning curve
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def plot_learning_curves(model, X, y):
    """Plot training and validation RMSE as the training subset grows.

    The data is split 80/20 into training and validation sets. For every
    subset size ``m`` from 1 up to the full training split, ``model`` is
    refit on the first ``m`` training instances and its mean squared error
    is recorded on that subset and on the validation set. The square roots
    of both error series are drawn on the current matplotlib axes.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            refit in place for every subset size.
        X: Feature matrix.
        y: Target values aligned with ``X``.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    train_errors, val_errors = [], []
    # The upper bound is inclusive so the last point uses the whole training split.
    for m in range(1, len(X_train) + 1):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.plot(np.sqrt(train_errors), 'r-+', linewidth=2, label='train')
    plt.plot(np.sqrt(val_errors), 'b-', linewidth=3, label='val')
以一个普通回归模型的学习曲线为例:
from sklearn.linear_model import LinearRegression
# Learning curves of a plain linear model on the quadratic data.
lin_reg = LinearRegression()
plot_learning_curves(model=lin_reg, X=X, y=y)
相同数据上10阶多项式模型的学习曲线:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
# Degree-10 polynomial expansion followed by linear regression.
polynomial_steps = [
    ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
    ("lin_reg", LinearRegression()),
]
polynomial_regression = Pipeline(polynomial_steps)
plot_learning_curves(model=polynomial_regression, X=X, y=y)
模型的泛化误差可以表示成三个不同的误差之和:
偏差: 产生原因在于错误的假设, 比如假设数据是线性的,结果是二次的。高偏差模型最可能欠拟合训练数据
方差: 由于模型对训练数据的细微变化过于敏感。具有许多自由度的模型(如高阶多项式模型)可能具有较高的方差,可能过拟合训练数据
不可避免的误差: 由数据本身的噪声所致。减少这部分误差的唯一办法就是清理数据
正则化训练模型
1、岭回归(Tikhonov正则化)
使用Scikit-Learn和闭式解执行岭回归:
# ridge regression
from sklearn.linear_model import Ridge
# Ridge regression with regularization strength alpha=0.1.
ridge_reg = Ridge(alpha=0.1).fit(X, y)
x_query = [[1.5]]
ridge_reg.predict(x_query)
# array([[8.47099313]])
并使用随机梯度下降法:
from sklearn.linear_model import SGDRegressor
# SGD with an l2 penalty; the targets are flattened to 1-D for SGDRegressor.
sgd_reg = SGDRegressor(penalty='l2').fit(X, y.ravel())
x_query = [[1.5]]
sgd_reg.predict(x_query)
# array([8.56812845])
2、Lasso回归(最小绝对收缩和选择算子回归 Least Absolute Shrinkage and Selection Operator Regression)
倾向于完全消除最不重要特征的权重
使用Scikit-Learn执行Lasso回归
# lasso regression
from sklearn.linear_model import Lasso
# Lasso regression with regularization strength alpha=0.1.
lasso_reg = Lasso(alpha=0.1).fit(X, y)
x_query = [[1.5]]
lasso_reg.predict(x_query)
# array([8.33582843])
随机梯度下降法实现Lasso回归
from sklearn.linear_model import SGDRegressor
# SGD with an l1 penalty; the targets are flattened to 1-D for SGDRegressor.
sgd_reg = SGDRegressor(penalty='l1').fit(X, y.ravel())
x_query = [[1.5]]
sgd_reg.predict(x_query)
# array([8.57026917])
3、弹性网络
介于岭回归和Lasso回归之间: 正则项是两者正则项的简单混合, 混合比r可以控制(r=0时等效于岭回归, r=1时等效于Lasso回归)
# elastic net regression
from sklearn.linear_model import ElasticNet
# l1_ratio is the mixing ratio between the l1 and l2 penalties (0.5 here).
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X, y)
x_query = [[1.5]]
elastic_net.predict(x_query)
# array([8.23664857])
4、提前停止
在验证误差达到最小值时停止训练
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from copy import deepcopy

# Expand the features to degree-90 polynomial terms, then standardize them.
poly_scaler = Pipeline([
    ("poly_features", PolynomialFeatures(degree=90, include_bias=False)),
    ("std_scaler", StandardScaler()),
])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
# Fit the preprocessing on the training split only, then reuse it for validation.
X_train_poly_scaled = poly_scaler.fit_transform(X_train)
X_val_poly_scaled = poly_scaler.transform(X_val)
# SGDRegressor expects 1-D targets, so flatten the training targets once.
y_train_1d = y_train.ravel()

# max_iter=1 with warm_start=True: each fit() call runs one more epoch,
# continuing from the current weights. tol=None switches off the built-in
# stopping criterion (the former tol=-np.infty relied on an alias removed in
# NumPy 2.0 and on a negative tolerance that newer scikit-learn rejects).
sgd_reg = SGDRegressor(max_iter=1, tol=None, warm_start=True,
                       penalty=None, learning_rate="constant", eta0=0.0005)

minimum_val_error = float("inf")
best_epoch = None
best_model = None
for epoch in range(1000):
    sgd_reg.fit(X_train_poly_scaled, y_train_1d)
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    val_error = mean_squared_error(y_val, y_val_predict)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        # deepcopy keeps the learned weights; sklearn's clone() would return
        # an unfitted estimator with only the hyperparameters copied.
        best_model = deepcopy(sgd_reg)
逻辑回归
计算输入特征的加权和(加上偏置项), 但不直接输出该结果, 而是输出其逻辑函数(sigmoid)值, 即估计概率
# an example: detect virginica iris
# load and process the data
from sklearn import datasets
iris = datasets.load_iris()
iris.keys()
# dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

# Use only the last feature column (petal width) as input.
X = iris['data'][:, 3:]
# Binary target: 1 if Iris virginica (class 2), else 0.
is_virginica = iris['target'] == 2
y = is_virginica.astype(np.uint8)
训练逻辑回归模型:
from sklearn.linear_model import LogisticRegression
# fit() returns the estimator itself, so the trained model is bound directly.
log_reg = LogisticRegression().fit(X, y)
花瓣宽度在0-3cm之间的鸢尾花,模型估算的概率:
# 1000 evenly spaced petal widths from 0 to 3 cm, shaped as a single-feature column.
petal_widths = np.linspace(0, 3, 1000)
X_new = petal_widths.reshape(-1, 1)
y_proba = log_reg.predict_proba(X_new)

# Column 1 is the probability of the positive class, column 0 of the negative class.
proba_virginica = y_proba[:, 1]
proba_not_virginica = y_proba[:, 0]
plt.plot(X_new, proba_virginica, "g-", label="Iris virginica")
plt.plot(X_new, proba_not_virginica, "b--", label="Not Iris virginica")

log_reg.predict([[1.7], [1.5]])
# array([1, 0], dtype=uint8)
# predict() returns the predicted class label; predict_proba() returns the estimated class probabilities.
Softmax回归
多元逻辑回归
X = iris["data"][:, (2, 3)]  # petal length, petal width
y = iris["target"]

# With the lbfgs solver and more than two classes, LogisticRegression already
# fits a multinomial (softmax) model by default. The explicit
# multi_class="multinomial" argument is deprecated in recent scikit-learn
# releases, so it is omitted here.
softmax_reg = LogisticRegression(solver="lbfgs", C=10)
softmax_reg.fit(X, y)

softmax_reg.predict([[5, 2]])
# array([2])
softmax_reg.predict_proba([[5, 2]])
# array([[6.38014896e-07, 5.74929995e-02, 9.42506362e-01]])