多项式与泛化
一、多项式回归
import numpy as np
import matplotlib.pyplot as plt
# Generate a noisy quadratic dataset: y = 0.5*x^2 + x + 2 + N(0, 1).
x = np.random.uniform(-3,3,size=100)
X = x.reshape(-1,1)  # sklearn expects a 2-D feature matrix (n_samples, n_features)
y = 0.5 * x **2 +x + 2+ np.random.normal(0,1,size=100)
plt.scatter(x,y)
plt.show()
from sklearn.linear_model import LinearRegression
# Fit a plain linear model to the quadratic data.
lin_reg = LinearRegression()
lin_reg.fit(X,y)
y_predict = lin_reg.predict(X)
plt.scatter(x,y)
plt.plot(x,y_predict,color="r")  # a straight line cannot follow the parabola
plt.show()
- 不能很好地拟合数据。为了解决该问题，我们多加一个特征（x 的平方项）
# Manually append a squared column so the model becomes linear in [x, x^2].
(X**2).shape
X2 = np.hstack([X,X**2])
X2.shape
lin_reg2 = LinearRegression()
lin_reg2.fit(X2,y)
y_predict2 = lin_reg2.predict(X2)
plt.scatter(x,y)
# x is unsorted, so connecting consecutive points draws many crossing segments
plt.plot(x,y_predict2,color="r")
plt.show()
-
由于生成的x是没有顺序的,对应的y_predict2也无序,所以造成如此多的线
-
为了解决这个问题,对x进行排序
np.sort(x)，同时 y_predict2 也要按排序后 x 的顺序取对应的值，即 y_predict2[np.argsort(x)]
-
np.argsort(x),返回x升序排列的索引值,然后y_predict2进行按照索引进行排列。
plt.scatter(x,y)
# Sort x and reorder the predictions with argsort so the curve is drawn left-to-right.
plt.plot(np.sort(x),y_predict2[np.argsort(x)],color="r")
plt.show()
- 可通过 lin_reg2.coef_ 与 lin_reg2.intercept_ 查看拟合出的系数
二、关于PolynomialFeatures
# Same experiment, but using sklearn's PolynomialFeatures preprocessor.
x = np.random.uniform(-3,3,size=100)
X = x.reshape(-1,1)
y = 0.5 * x **2 +x + 2+ np.random.normal(0,1,size=100)
from sklearn.preprocessing import PolynomialFeatures # data preprocessing
poly = PolynomialFeatures(degree = 2) # expand features up to the 2nd power
poly.fit(X)
X2 = poly.transform(X) # X -> polynomial feature matrix [1, x, x^2]
lin_reg3 = LinearRegression()
lin_reg3.fit(X2,y)
y_predict2 = lin_reg3.predict(X2)
plt.scatter(x,y)
plt.plot(np.sort(x),y_predict2[np.argsort(x)],color="r")
plt.show()
- PolynomialFeatures详解
# PolynomialFeatures on a 2-column input: degree=2 also produces the
# cross term, i.e. [1, a, b, a^2, a*b, b^2] per sample.
X = np.arange(1,11).reshape(-1,2)
X.shape
poly = PolynomialFeatures(degree =2) # up to the 2nd power
poly.fit(X)
X2 = poly.transform(X)
将 degree 修改为 3：
poly = PolynomialFeatures(degree =3) # up to the 3rd power
poly.fit(X)
X3 = poly.transform(X)
X3
三、Pipeline（管道）
# Regenerate the quadratic dataset for the Pipeline demo.
x = np.random.uniform(-3,3,size=100)
X = x.reshape(-1,1)
y = 0.5 * x **2 +x + 2+ np.random.normal(0,1,size=100)
- Pipeline,填入每一步的元组
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Chain the three steps as (name, transformer/estimator) tuples:
# polynomial expansion -> standardization -> linear regression.
poly_reg = Pipeline([
    ('poly',PolynomialFeatures(degree=2)),
    ('std_scaler',StandardScaler()),
    ('lin_reg',LinearRegression())
])
poly_reg.fit(X,y)
y_predict = poly_reg.predict(X)
plt.scatter(x,y)
plt.plot(np.sort(x),y_predict[np.argsort(x)],color="r")
plt.show()
四、过拟合和欠拟合
4.1 原始数据集
# Fix the random seed so the over/underfitting comparison is reproducible.
np.random.seed(666)
x = np.random.uniform(-3,3,size=100)
X = x.reshape(-1,1)
y = 0.5 * x **2 +x + 2+ np.random.normal(0,1,size=100)
plt.scatter(x,y)
plt.show()
4.2 使用线性回归训练数据
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X,y)
lin_reg.score(X,y)  # R^2 on the training data
y_predict = lin_reg.predict(X)
plt.scatter(x,y)
plt.plot(np.sort(x),y_predict[np.argsort(x)],color="r")
plt.show()
from sklearn.metrics import mean_squared_error
y_predict = lin_reg.predict(X)
# Mean squared error is the common yardstick for all models below.
mean_squared_error(y,y_predict)
这里统一使用均方误差作为对比
4.3 使用多项式回归
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
def Polynomialregression(degree):
    """Build a polynomial-regression pipeline of the given degree.

    Steps: polynomial feature expansion -> standardization -> linear fit.
    """
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('std_scaler', StandardScaler()),
        ('lin_reg', LinearRegression()),
    ]
    return Pipeline(steps)
# degree=2 matches the true generating function; MSE drops sharply vs. linear.
poly2_reg = Polynomialregression(degree=2)
poly2_reg.fit(X,y)
y2_predict = poly2_reg.predict(X)
mean_squared_error(y,y2_predict)
plt.scatter(x,y)
plt.plot(np.sort(x),y2_predict[np.argsort(x)],color="r")
plt.show()
# degree=10: training MSE keeps shrinking, a hint of overfitting.
poly10_reg = Polynomialregression(degree=10)
poly10_reg.fit(X,y)
y10_predict = poly10_reg.predict(X)
mean_squared_error(y,y10_predict)
- 过拟合
plt.scatter(x,y)
plt.plot(np.sort(x),y10_predict[np.argsort(x)],color="r")
plt.show()
# degree=100: extreme overfitting — the curve chases the noise.
poly100_reg = Polynomialregression(degree=100)
poly100_reg.fit(X,y)
y100_predict = poly100_reg.predict(X)
mean_squared_error(y,y100_predict)
plt.scatter(x,y)
plt.plot(np.sort(x),y100_predict[np.argsort(x)],color="r")
plt.show()
# Predict on an evenly spaced grid to show the curve between sample points.
X_plot = np.linspace(-3,3,100).reshape(100,1)
y_plot = poly100_reg.predict(X_plot)
plt.scatter(x,y)
plt.plot(X_plot[:,0],y_plot,color="r")
plt.show()
X_plot = np.linspace(-3,3,100).reshape(100,1)
y_plot = poly100_reg.predict(X_plot)
plt.scatter(x,y)
plt.plot(X_plot[:,0],y_plot,color="r")
plt.axis([-3,3,-1,10])  # clip the y-range so the wild oscillations are visible
plt.show()
五、train_test_split
# Hold out a test set so generalization error can be measured.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=666)
- 欠拟合
# Plain linear regression: underfits; test MSE stays high.
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
y_predict = lin_reg.predict(X_test)
mean_squared_error(y_test,y_predict)
- 正好拟合（fitting）
# degree=2: the right model class; lowest test MSE of the sweep.
poly2_reg = Polynomialregression(degree=2)
poly2_reg.fit(X_train,y_train)
y2_predict = poly2_reg.predict(X_test)
mean_squared_error(y_test,y2_predict)
- 过拟合
# Higher degrees fit the training set ever better but the TEST MSE grows —
# the signature of overfitting.
poly10_reg = Polynomialregression(degree=10)
poly10_reg.fit(X_train,y_train)
y10_predict = poly10_reg.predict(X_test)
mean_squared_error(y_test,y10_predict)
poly100_reg = Polynomialregression(degree=100)
poly100_reg.fit(X_train,y_train)
y100_predict = poly100_reg.predict(X_test)
mean_squared_error(y_test,y100_predict)
六、学习曲线
- 原始数据
np.random.seed(666)
x = np.random.uniform(-3,3,size=100)
X = x.reshape(-1,1)
y = 0.5 * x **2 +x + 2+ np.random.normal(0,1,size=100)
plt.scatter(x,y)
plt.show()
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=10)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
train_score = []
test_score = []
# Learning curve: refit on the first i training samples only
# (default split leaves 75 training samples, hence range(1, 76)).
for i in range(1,76):
    lin_reg = LinearRegression()
    lin_reg.fit(X_train[:i],y_train[:i])
    y_train_predict = lin_reg.predict(X_train[:i])
    # training error on the samples seen so far
    train_score.append(mean_squared_error(y_train[:i],y_train_predict))
    y_test_predict = lin_reg.predict(X_test)
    # test error on the full held-out set
    test_score.append(mean_squared_error(y_test,y_test_predict))
# Plot RMSE (sqrt of the stored MSEs) against training-set size.
plt.plot([i for i in range(1,76)],np.sqrt(train_score),label= "train")
plt.plot([i for i in range(1,76)],np.sqrt(test_score),label= "test")
plt.legend()
plt.show()
- 重新封装函数
def plot_learning_curve(algo,X_train,X_test,y_train,y_test):
    """Plot train/test RMSE as the training-set size grows from 1 to all samples.

    `algo` is refit from scratch on each prefix of the training data; the
    test error is always measured on the full held-out set.
    """
    n = len(X_train)
    train_errors, test_errors = [], []
    for m in range(1, n + 1):
        algo.fit(X_train[:m], y_train[:m])
        # training error on the samples seen so far
        train_errors.append(mean_squared_error(y_train[:m], algo.predict(X_train[:m])))
        # test error on the full held-out set
        test_errors.append(mean_squared_error(y_test, algo.predict(X_test)))
    sizes = list(range(1, n + 1))
    plt.figure(figsize=(10, 8))
    plt.plot(sizes, np.sqrt(train_errors), label="train")
    plt.plot(sizes, np.sqrt(test_errors), label="test")
    plt.legend()
    plt.axis([0, n + 1, 0, 4])
    plt.show()
# Learning curve for plain linear regression.
plot_learning_curve(LinearRegression(),X_train,X_test,y_train,y_test)
- 欠拟合
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
def Polynomialregression(degree):
    """Polynomial-regression pipeline: feature expansion, scaling, linear fit."""
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('std_scaler', StandardScaler()),
        ('lin_reg', LinearRegression()),
    ]
    return Pipeline(steps)
# Learning curve at degree=2 (the true model degree).
poly2_reg = Polynomialregression(degree=2)
plot_learning_curve(poly2_reg,X_train,X_test,y_train,y_test)
- 过拟合
# Learning curve at degree=20 (NOTE: the variable name poly2_reg is reused
# for the degree-20 model here).
poly2_reg = Polynomialregression(degree=20)
plot_learning_curve(poly2_reg,X_train,X_test,y_train,y_test)
七、验证数据集和交叉验证(Cross Validation)
== K个模型的均值作为结果调参==
import numpy as np
from sklearn import datasets
# Handwritten-digits dataset: 8x8 pixel images, 10 classes.
digits = datasets.load_digits()
X = digits.data
y = digits.target
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.4)
# Hand-rolled grid search scored on the TEST set — this tunes the
# hyper-parameters to the test data, the very problem CV solves below.
best_score,best_p,best_k = 0,0,0
for k in range(2,11):
    for p in range(1,6):
        # p is the Minkowski distance parameter of KNN
        knn_clf = KNeighborsClassifier(weights = "distance",n_neighbors=k,p=p)
        knn_clf.fit(X_train,y_train)
        score = knn_clf.score(X_test,y_test)
        if score > best_score:
            best_score,best_p,best_k = score,p,k
print("best_k=",best_k)
print("best_p=",best_p)
print("best_score = ",best_score)
7.1使用交叉验证
from sklearn.model_selection import cross_val_score
knn_clf = KNeighborsClassifier()
# Cross-validation on the training set: returns one accuracy per fold.
cross_val_score(knn_clf,X_train,y_train)
import warnings
warnings.filterwarnings('ignore')  # silence sklearn warning noise during the sweep
# BUG FIX: reset the running best before this second search. Without the
# reset, best_score still holds the value from the previous test-set-based
# search, which can silently suppress every CV candidate here.
best_score,best_p,best_k = 0,0,0
# Same grid as before, but each candidate is scored by the MEAN of its
# cross-validation fold scores on the training set only.
for k in range(2,11):
    for p in range(1,6):
        # p is the Minkowski distance parameter of KNN
        knn_clf = KNeighborsClassifier(weights = "distance",n_neighbors=k,p=p)
        scores = cross_val_score(knn_clf,X_train,y_train)
        score = np.mean(scores)
        if score > best_score:
            best_score,best_p,best_k = score,p,k
print("best_k=",best_k)
print("best_p=",best_p)
print("best_score = ",best_score)
# Refit with the CV-chosen hyper-parameters, then evaluate ONCE on the test set.
# NOTE(review): n_neighbors=3, p=2 appear to be copied from the printed CV
# result above — confirm against the actual output.
best_knn_clf = KNeighborsClassifier(weights="distance",n_neighbors=3,p=2)
best_knn_clf.fit(X_train,y_train)
best_knn_clf.score(X_test,y_test)
7.2 网格搜索
from sklearn.model_selection import GridSearchCV
# Same search space as the manual loops above, expressed as a parameter grid.
param_grid = [
    {
        'weights':['distance'],
        'n_neighbors':[i for i in range(2,11)],
        'p':[i for i in range(1,6)]
    }
]
grid_search = GridSearchCV(KNeighborsClassifier(),param_grid,verbose=1)
grid_search.fit(X_train,y_train)
- 默认CV=3
grid_search.best_estimator_  # estimator refit with the winning parameters
grid_search.best_params_  # the winning parameter combination
grid_search.best_score_  # best mean cross-validated score
best_knn_clf = grid_search.best_estimator_
best_knn_clf.score(X_test,y_test)
# cv= controls the number of folds in both helpers.
cross_val_score(knn_clf,X_train,y_train,cv=5)
GridSearchCV(KNeighborsClassifier(),param_grid,verbose=1,cv=5)
== 网格搜索自带交叉验证==
7.3 留一法 LOO-CV
八、偏差与方差
- 偏差
== 主要是欠拟合==
- 方差
== 过拟合==
KNN高方差
线性回归 高偏差
== K越小,模型越复杂==,模型方差越大
- 解决过拟合
- 降低模型复杂度
- 减少数据维度,降噪
- 增加样本数
- 引入验证集
九、模型正则化
- 可解决过拟合问题
9.1 岭回归 - L2正则化
- 设置绘制图形函数
def plot_model(model):
    """Scatter the global (x, y) data and overlay `model`'s curve on [-3, 3]."""
    grid = np.linspace(-3, 3, 100).reshape(-1, 1)
    curve = model.predict(grid)
    plt.scatter(x, y)
    plt.plot(grid[:, 0], curve, color='r')
    plt.axis([-3, 3, 0, 6])
    plt.show()
- 导入岭回归,设置pipeline
from sklearn.linear_model import Ridge
def Ridgeregression(degree,alpha):
    """Polynomial ridge-regression pipeline.

    Steps: degree-`degree` feature expansion -> standardization ->
    Ridge fit with L2 strength `alpha`.
    """
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('std_scaler', StandardScaler()),
        ('ridge_reg', Ridge(alpha=alpha)),
    ]
    return Pipeline(steps)
- 岭回归训练模型,调整alpha的值
# Sweep alpha at fixed degree=20: larger alpha shrinks the coefficients,
# smoothing the fitted curve.
ridge1_reg = Ridgeregression(20,0.0001)  # tiny alpha: almost unregularized
ridge1_reg.fit(X_train,y_train)
y1_predict = ridge1_reg.predict(X_test)
mean_squared_error(y_test,y1_predict)
plot_model(ridge1_reg)
ridge2_reg = Ridgeregression(20,1)
ridge2_reg.fit(X_train,y_train)
y2_predict = ridge2_reg.predict(X_test)
mean_squared_error(y_test,y2_predict)
plot_model(ridge2_reg)
ridge3_reg = Ridgeregression(20,100)
ridge3_reg.fit(X_train,y_train)
y3_predict = ridge3_reg.predict(X_test)
mean_squared_error(y_test,y3_predict)
plot_model(ridge3_reg)
ridge4_reg = Ridgeregression(20,100000)  # huge alpha: coefficients driven toward 0
ridge4_reg.fit(X_train,y_train)
y4_predict = ridge4_reg.predict(X_test)
mean_squared_error(y_test,y4_predict)
plot_model(ridge4_reg)
9.2 LASSORegression - L1正则化
== LASSO 趋向于使得一部分theta值变为0,可作为特征选择用==
-
Ridge Regression
- alpha 趋近于无穷时，正则项主导目标函数，求导可得下式
-
LASSO Regression
# New dataset for the LASSO demo: LINEAR ground truth y = 0.5x + 3 + N(0, 1).
np.random.seed(42)
x = np.random.uniform(-3,3,size=100)
X = x.reshape(-1,1)
y = 0.5 * x + 3 + np.random.normal(0,1,size=100)
plt.scatter(x,y)
plt.show()
np.random.seed(666)
X_train,X_test,y_train,y_test = train_test_split(X,y)
def Polynomialregression(degree):
    """Polynomial-regression pipeline: expansion -> scaling -> linear fit."""
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('std_scaler', StandardScaler()),
        ('lin_reg', LinearRegression()),
    ]
    return Pipeline(steps)
# Unregularized degree-20 baseline (NOTE: variable is named poly10_reg but
# the degree is 20) — expect a large test MSE from overfitting.
poly10_reg = Polynomialregression(degree = 20)
poly10_reg.fit(X_train,y_train)
y10_predict = poly10_reg.predict(X_test)
mean_squared_error(y_test,y10_predict)
def plot_model(model):
    """Scatter the global (x, y) data and overlay `model`'s curve on [-3, 3]."""
    grid = np.linspace(-3, 3, 100).reshape(-1, 1)
    curve = model.predict(grid)
    plt.scatter(x, y)
    plt.plot(grid[:, 0], curve, color='r')
    plt.axis([-3, 3, 0, 6])
    plt.show()
# Visualize the wildly oscillating unregularized degree-20 fit.
plot_model(poly10_reg)
from sklearn.linear_model import Lasso
def Lassoregression(degree,alpha):
    """Polynomial LASSO pipeline.

    Steps: degree-`degree` feature expansion -> standardization ->
    Lasso fit with L1 strength `alpha`.
    """
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('std_scaler', StandardScaler()),
        ('lasso_reg', Lasso(alpha=alpha)),
    ]
    return Pipeline(steps)
# Sweep alpha at fixed degree=20: L1 drives some coefficients exactly to zero,
# simplifying the fitted curve (implicit feature selection).
lasso1_reg = Lassoregression(20,0.01)
lasso1_reg.fit(X_train,y_train)
y1_predict = lasso1_reg.predict(X_test)
mean_squared_error(y_test,y1_predict)
plot_model(lasso1_reg)
lasso2_reg = Lassoregression(20,0.1)
lasso2_reg.fit(X_train,y_train)
y2_predict = lasso2_reg.predict(X_test)
mean_squared_error(y_test,y2_predict)
plot_model(lasso2_reg)
lasso3_reg = Lassoregression(20,1)  # strong L1: most coefficients zeroed out
lasso3_reg.fit(X_train,y_train)
y3_predict = lasso3_reg.predict(X_test)
mean_squared_error(y_test,y3_predict)
plot_model(lasso3_reg)
9.3 Ridge VS LASSO
- 弹性网 - 结合L1和L2