3.2 得到最优的超参数 alpha,并计算模型在测试集(test)上的评价指标。
以下是对应的代码实现:
```python
# 导入数据集
import matplotlib.pyplot as plt
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Load the Boston housing data as a feature matrix X and a target vector y.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version still provides it.
boston = load_boston()
X = boston['data']
y = boston['target']

# Preprocessing: standardize the listed numeric columns and one-hot encode
# column 8. Columns not listed here are not passed to the models
# (ColumnTransformer's default is remainder='drop').
num_features = [0, 4, 5, 6, 7, 9, 10, 12]
cat_features = [8]
num_transformer = Pipeline(steps=[('scaler', StandardScaler())])
# handle_unknown='ignore' keeps transform() from failing on a category that
# was not present when the encoder was fitted.
cat_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

# Hold out 20% of the samples as the test set; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Baseline: ordinary least-squares regression on the untransformed target.
lr_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lr', LinearRegression()),
])
lr_y_pred = lr_model.fit(X_train, y_train).predict(X_test)
print("Linear Regression")
print("MSE:", mean_squared_error(y_test, lr_y_pred))
print("R^2:", r2_score(y_test, lr_y_pred))
print()

# Same model fitted on log(1 + y). Predictions are mapped back with expm1 so
# the metrics below are computed on the original target scale.
y_log = np.log1p(y_train)
lr_model2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lr', LinearRegression()),
])
lr_model2.fit(X_train, y_log)
lr_y_pred2 = np.expm1(lr_model2.predict(X_test))
print("Linear Regression with processed y")
print("MSE:", mean_squared_error(y_test, lr_y_pred2))
print("R^2:", r2_score(y_test, lr_y_pred2))
print()
# Ridge regression: choose alpha by 10-fold cross-validated grid search,
# scored by R^2. Requires GridSearchCV from sklearn.model_selection.
ridge_model = Pipeline([('preprocessor', preprocessor), ('ridge', Ridge())])
# 13 candidate alphas, log-spaced from 1e-3 to 1e3 (half-decade steps).
param_grid = [{'ridge__alpha': np.logspace(-3, 3, 13)}]
ridge_grid = GridSearchCV(ridge_model, param_grid=param_grid, cv=10, scoring='r2')
ridge_grid.fit(X_train, y_train)
# predict() goes through the best pipeline, which GridSearchCV refits on the
# whole training set by default (refit=True).
ridge_y_pred = ridge_grid.predict(X_test)
print("Ridge Regression")
# Report the selected hyperparameter alongside the test metrics.
print("Best alpha:", ridge_grid.best_params_['ridge__alpha'])
print("MSE:", mean_squared_error(y_test, ridge_y_pred))
print("R^2:", r2_score(y_test, ridge_y_pred))
print()

# Ridge regression on log(1 + y). The search scores R^2 on the log scale;
# test predictions are mapped back with expm1 so the reported metrics are on
# the original target scale.
y_log2 = np.log1p(y_train)
ridge_model2 = Pipeline([('preprocessor', preprocessor), ('ridge', Ridge())])
ridge_grid2 = GridSearchCV(ridge_model2, param_grid=param_grid, cv=10, scoring='r2')
ridge_grid2.fit(X_train, y_log2)
ridge_y_pred2 = np.expm1(ridge_grid2.predict(X_test))
print("Ridge Regression with processed y")
print("Best alpha:", ridge_grid2.best_params_['ridge__alpha'])
print("MSE:", mean_squared_error(y_test, ridge_y_pred2))
print("R^2:", r2_score(y_test, ridge_y_pred2))
print()
# Plot the mean cross-validated R^2 against alpha for the log-target ridge
# search. The scores come from fitting on log1p(y), so they are R^2 values on
# the log scale. The x axis is logarithmic because the alphas are log-spaced.
alphas = ridge_grid2.cv_results_['param_ridge__alpha'].data
r2_scores = ridge_grid2.cv_results_['mean_test_score']
fig, ax = plt.subplots()
ax.semilogx(alphas, r2_scores)
ax.set_xlabel('Alpha')
ax.set_ylabel('R^2')
ax.set_title('Ridge Regression with Processed y')
plt.show()
```
建议保存代码并在本地运行。注意:`load_boston` 已在 scikit-learn 1.2 中被移除,运行上述代码需要使用 1.2 之前的版本。