第一部分:核心思想
决策树回归的核心思想可以用成绩评级来类比:班里的同学被分为优秀、中等、差三类,如果某个学生被分到“优秀”这一组,我们就可以用这一组学生成绩的平均值作为该学生成绩的估计值。
第二部分:代码实现
(1)导包
#第一部分:导包
import matplotlib.pyplot as plt
import numpy as np
(2)创建数据集
#第二部分:创建数据集
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
# Load the Boston housing data from an online source (needs network access).
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regex and must be a raw string: "\s" is not a valid
# Python string escape and triggers a warning on recent interpreters.
# skiprows=22 skips the leading lines that precede the numeric data.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record spans two physical rows: every column of the even row plus the
# first two columns of the odd row are the features, and the third column of
# the odd row is the target.
x = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]
(3)划分数据集
# Part 3: split the data into training and test sets.
# No test_size is passed, so the library's default proportions apply; the
# fixed random_state keeps the split reproducible between runs.
split_parts = train_test_split(x, y, random_state=233)
x_train, x_test, y_train, y_test = split_parts
(4)调用决策树回归模型
# Part 4: fit a decision tree regressor with default hyperparameters.
from sklearn.tree import DecisionTreeRegressor

# fit() returns the estimator itself, so construction and training chain.
regressor = DecisionTreeRegressor().fit(x_train, y_train)
# For a regressor, score() reports R^2 on the data it is given.
baseline_test_score = regressor.score(x_test, y_test)
print("直接使用决策树得到的测试集效果:", baseline_test_score)
(5)开始绘制学习曲线
# Part 5: plot learning curves for several tree depths.
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

plt.rcParams["figure.figsize"] = (12, 8)
max_depth = [2, 5, 10, 20]
# R^2 is undefined for a single sample (r2_score returns NaN), so the curves
# start at two training samples. The size list is built once and shared by
# every subplot instead of being rebuilt for each plot call.
train_sizes = list(range(2, len(x_train) + 1))
for i, depth in enumerate(max_depth):
    reg = DecisionTreeRegressor(max_depth=depth)
    # These lists hold R^2 scores (higher is better), not errors.
    train_scores, test_scores = [], []
    for size in train_sizes:
        # Refit from scratch on the first `size` training samples only.
        reg.fit(x_train[:size], y_train[:size])
        y_train_pred = reg.predict(x_train[:size])
        train_scores.append(r2_score(y_train[:size], y_train_pred))
        y_test_pred = reg.predict(x_test)
        test_scores.append(r2_score(y_test, y_test_pred))
    # One subplot per depth, arranged in a 2x2 grid.
    plt.subplot(2, 2, i + 1)
    # Clip the y-axis so strongly negative R^2 values from tiny training
    # sets do not flatten the rest of the curve.
    plt.ylim(0, 1.1)
    plt.title("Depth: {0}".format(depth))
    plt.plot(train_sizes, train_scores, color="red", label="train")
    plt.plot(train_sizes, test_scores, color="blue", label="test")
    plt.legend()
plt.show()
(6)网格搜索寻找最优超参数
# Part 6: grid search for the best hyperparameters.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Candidate values: max_depth 2..14 and min_samples_leaf 3..19.
params = {
    "max_depth": list(range(2, 15)),
    "min_samples_leaf": list(range(3, 20)),
}
# n_jobs=-1 asks scikit-learn to use all available processors.
grid = GridSearchCV(DecisionTreeRegressor(), params, n_jobs=-1)
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
# Evaluate the best estimator found by the search on the held-out test set.
reg = grid.best_estimator_
tuned_test_score = reg.score(x_test, y_test)
print("使用网格搜索之后的测试集效果:", tuned_test_score)
(7)完整PyCharm代码实现
#第一部分:导包
import matplotlib.pyplot as plt
import numpy as np
#第二部分:创建数据集
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
# Load the Boston housing data from an online source (needs network access).
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regex and must be a raw string: "\s" is not a valid
# Python string escape and triggers a warning on recent interpreters.
# skiprows=22 skips the leading lines that precede the numeric data.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record spans two physical rows: every column of the even row plus the
# first two columns of the odd row are the features, and the third column of
# the odd row is the target.
x = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]
# Part 3: split the data into training and test sets.
# No test_size is passed, so the library's default proportions apply; the
# fixed random_state keeps the split reproducible between runs.
split_parts = train_test_split(x, y, random_state=233)
x_train, x_test, y_train, y_test = split_parts
# Part 4: fit a decision tree regressor with default hyperparameters.
from sklearn.tree import DecisionTreeRegressor

# fit() returns the estimator itself, so construction and training chain.
regressor = DecisionTreeRegressor().fit(x_train, y_train)
# For a regressor, score() reports R^2 on the data it is given.
baseline_test_score = regressor.score(x_test, y_test)
print("直接使用决策树得到的测试集效果:", baseline_test_score)
# Part 5: plot learning curves for several tree depths.
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

plt.rcParams["figure.figsize"] = (12, 8)
max_depth = [2, 5, 10, 20]
# R^2 is undefined for a single sample (r2_score returns NaN), so the curves
# start at two training samples. The size list is built once and shared by
# every subplot instead of being rebuilt for each plot call.
train_sizes = list(range(2, len(x_train) + 1))
for i, depth in enumerate(max_depth):
    reg = DecisionTreeRegressor(max_depth=depth)
    # These lists hold R^2 scores (higher is better), not errors.
    train_scores, test_scores = [], []
    for size in train_sizes:
        # Refit from scratch on the first `size` training samples only.
        reg.fit(x_train[:size], y_train[:size])
        y_train_pred = reg.predict(x_train[:size])
        train_scores.append(r2_score(y_train[:size], y_train_pred))
        y_test_pred = reg.predict(x_test)
        test_scores.append(r2_score(y_test, y_test_pred))
    # One subplot per depth, arranged in a 2x2 grid.
    plt.subplot(2, 2, i + 1)
    # Clip the y-axis so strongly negative R^2 values from tiny training
    # sets do not flatten the rest of the curve.
    plt.ylim(0, 1.1)
    plt.title("Depth: {0}".format(depth))
    plt.plot(train_sizes, train_scores, color="red", label="train")
    plt.plot(train_sizes, test_scores, color="blue", label="test")
    plt.legend()
plt.show()
# Part 6: grid search for the best hyperparameters.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Candidate values: max_depth 2..14 and min_samples_leaf 3..19.
params = {
    "max_depth": list(range(2, 15)),
    "min_samples_leaf": list(range(3, 20)),
}
# n_jobs=-1 asks scikit-learn to use all available processors.
grid = GridSearchCV(DecisionTreeRegressor(), params, n_jobs=-1)
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
# Evaluate the best estimator found by the search on the held-out test set.
reg = grid.best_estimator_
tuned_test_score = reg.score(x_test, y_test)
print("使用网格搜索之后的测试集效果:", tuned_test_score)