一、Pipeline和make_pipeline的作用
1、简化代码
2、结合网格搜索(GridSearchCV)进行模型参数最优搜索
二、用法
Pipeline和make_pipeline的区别在于make_pipeline不需要指定步骤名
make_pipeline创建的对象步骤名默认为各步骤对象类名的小写形式
如StandardScaler()的步骤名为standardscaler
2.1、Pipeline的用法
用法1:传入对象
# Hold out 20% of the samples as the test set (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2,
)

# Create every step as a standalone object first.
scaler = StandardScaler()  # feature standardization
degree = 2  # order of the polynomial expansion
poly_features = PolynomialFeatures(degree=degree)
alpha = 0.001  # value passed to Lasso's alpha parameter
lasso = Lasso(alpha=alpha, max_iter=10000)  # Lasso regression model

# Step names are free-form and chosen by the caller.
steps = [
    ('step1', scaler),
    ('step2', poly_features),
    ('step3', lasso),
]
pipe = Pipeline(steps)

# fit() standardizes and polynomial-expands the training FEATURES, then trains
# Lasso on the transformed features; the target y_train is not transformed.
pipe.fit(X_train, y_train)

# No manual preprocessing of the test set is needed: predict() applies the
# already-fitted transformers before calling the model.
y_pred = pipe.predict(X_test)

# Evaluate on the held-out data.
mse = mean_squared_error(y_test, y_pred)
print(f"均方误差(Mean Squared Error): {mse}")
用法2:传入构造方法
# Hold out 20% of the samples as the test set (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2
)

# Instantiate each step inline, directly inside the list of named steps.
pipe = Pipeline(
    steps=[
        ('step1', StandardScaler()),
        ('step2', PolynomialFeatures(degree=2)),
        ('step3', Lasso(alpha=0.001, max_iter=10000)),
    ]
)
pipe.fit(X_train, y_train)

# The fitted pipeline re-applies its preprocessing steps to X_test, so the
# test set does not have to be standardized by hand.
y_pred = pipe.predict(X_test)

# Evaluate on the held-out data.
mse = mean_squared_error(y_test, y_pred)
print(f"均方误差(Mean Squared Error): {mse}")
2.2、make_pipeline的用法
用法1:传入对象
# Create the three steps as standalone objects.
scaler = StandardScaler()  # feature standardization
degree = 2  # order of the polynomial expansion
poly_features = PolynomialFeatures(degree=degree)
alpha = 0.001  # value passed to Lasso's alpha parameter
lasso = Lasso(alpha=alpha, max_iter=10000)  # Lasso regression model

# make_pipeline takes the step objects positionally; no step names are given.
pipe = make_pipeline(scaler, poly_features, lasso)
用法2:传入构造方法
# Compared with Pipeline, the steps are passed without step names.
pipe = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    Lasso(),
)
pipe.fit(X_train, y_train)
2.3、原理
Pipeline和make_pipeline创建的对象的fit方法会按顺序自动调用各个步骤对象的fit方法和transform方法
对于中间步骤,次序为:先执行fit(),然后执行transform(),并把变换后的数据传给下一个步骤
最后一个步骤(通常是模型,没有transform方法)仅执行fit方法
# Three-step pipeline: two transformers followed by the final Lasso model.
pipe = make_pipeline(StandardScaler(),PolynomialFeatures(degree=2),Lasso())
# A single fit() call fits and applies each transformer in order, then fits
# the final model on the transformed training features.
pipe.fit(X_train, y_train)
对于这段代码而言,在pipe.fit()执行后,相当于先使用StandardScaler的fit方法计算X_train各特征的均值和标准差,然后使用StandardScaler的transform方法进行标准化,然后再以同样的流程使用PolynomialFeatures的fit方法和transform方法进行多项式特征变换,最后执行Lasso的fit方法完成模型训练
三、获得模型参数
不难发现,上述使用的是线性模型Lasso,我们自然关心该模型的权重列表和偏置项,如何获取?
# Look the step object up by its step name. 'lasso' is the name that
# make_pipeline assigns automatically; a Pipeline built with explicit step
# names must be indexed with the name given there instead (e.g. 'step3').
model_object = pipe.named_steps['lasso']

# Learned weights and bias of the fitted Lasso step.
weights = model_object.coef_
bias = model_object.intercept_
print(f"权重列表: {weights}")
print(f"偏置项: {bias}")
如果是通过make_pipeline创建的对象,并没有指定步骤名,如何获取?
方法1:
make_pipeline创建的对象步骤名默认为各步骤对象类名的小写形式
如StandardScaler()的步骤名为standardscaler
方法2:
# Collect the name of every step, in pipeline order.
names = [name for name, _ in pipe.steps]
names
# Output:
# ['standardscaler', 'polynomialfeatures', 'lasso']
其实最后一个步骤都是模型,只需要取列表最后一个元素就行了
# pipe.steps is a list of (name, estimator) pairs; its last entry is the
# final step, so both the step name and the model object come from it.
step_name, model_object = pipe.steps[-1]

# Learned weights and bias of the final model.
print(f"权重列表: {model_object.coef_}")
print(f"偏置项: {model_object.intercept_}")
四、结合网格搜索
# Hold out 20% of the samples as the test set (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2
)

# Pipeline with auto-generated step names.
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), Ridge())

# Grid keys have the form '<step name>__<parameter name>'; here the degree of
# the PolynomialFeatures step is searched over 1..5.
param_grid = {'polynomialfeatures__degree': [1, 2, 3, 4, 5]}

# 5-fold cross-validated grid search scored by negative MSE.
grid_search = GridSearchCV(
    pipe,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best parameter combination; the score is negated back into a plain MSE.
print("最佳参数: ", grid_search.best_params_)
print("最佳均方误差: ", -grid_search.best_score_)

# predict() on the search object uses the best estimator found.
y_pred = grid_search.predict(X_test)

# Evaluate on the held-out data.
mse = mean_squared_error(y_test, y_pred)
print(f"测试集均方误差(Mean Squared Error): {mse}")

# Weights and bias: take the final (name, estimator) pair of the best pipeline.
estimator = grid_search.best_estimator_
step_name, model_object = estimator.steps[-1]
print(f"权重列表: {model_object.coef_}")
print(f"偏置项: {model_object.intercept_}")
#执行结果
#最佳参数: {'polynomialfeatures__degree': 2}
# 最佳均方误差: 14.874345935688263
# 测试集均方误差(Mean Squared Error): 7.125129666414321
# 权重列表: [ 0. -0.16489284 -0.67000771 1.69546768 0.15132951 -1.25780925
# 3.54617897 -1.83322412 -2.4772359 1.32442866 -0.8421803 -1.13394853
# 1.39336348 -2.89603004 0.11134484 0.38640105 0.66150587 5.44348503
# -0.5509124 0.61287793 -0.723776 -1.26079077 -1.38917696 0.27205661
# -0.15035765 -0.24179656 0.90352914 -0.00629017 -0.73229711 -0.21214634
# -1.07387519 -0.3208589 -0.11096156 -0.61397699 0.12354678 2.1999797
# -0.40963319 0.74123677 -0.63313101 1.21842076 0.03066229 2.42076158
# 0.97826018 0.58088057 1.95378429 0.11360603 0.40760311 -0.0781846
# 0.54811964 -0.87021351 0.50209406 -1.07555374 -0.76275064 0.1103302
# 0.60778914 -0.77911338 0.67016015 -0.29767505 0.21540953 -0.33352648
# -0.7476588 0.0370018 -1.26888623 2.774194 -1.08098704 1.28251695
# -1.8534928 -0.07953803 1.10852843 0.04887559 -1.09541869 0.19648582
# -0.50713085 -1.92895772 -0.83140009 -0.34484423 -0.85837323 0.13621138
# -0.17129224 2.76342005 -1.55562408 -0.01285553 -2.00463088 -1.412321
# 1.87232813 -2.15342918 -1.64197408 -0.20892059 -1.10590194 1.04108378
# -3.869222 3.36993807 -0.64451948 0.37665302 -1.66301668 -1.19585597
# 1.96472125 -1.11275966 -1.23018163 -0.02302311 0.16729789 0.25092167
# -0.29644755 -0.36536331 0.67735243]
# 偏置项: 21.016874124354054
五、完整代码
以波士顿房价数据为例
该数据集已从scikit-learn中移除,不能再直接使用load_boston获取,以下给出了获取方式
5.1、数据获取
from pathlib import Path

import numpy as np
import pandas as pd

# Download the raw whitespace-separated Boston housing file.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string so that "\s" is a regex token and not an invalid string escape.
# skiprows=22 skips the first 22 lines of the file (presumably a descriptive
# header -- confirm against the file).
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Rows are consumed in pairs: every even-indexed row contributes all of its
# columns, every odd-indexed row contributes its first two columns as further
# features and its third column as the target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Turn the target into a column vector and append it as the last column.
target = target.reshape(len(target), 1)
df = np.hstack((data, target))
df = pd.DataFrame(df)

# Create the output directory first; to_csv does not create missing folders.
output_path = Path('./data/boston.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
5.2、Pipeline和make_pipeline的示例
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Load the CSV; the last column is the target, all preceding columns are
# the features.
boston = pd.read_csv('./data/boston.csv')
data = boston.iloc[:, :-1]
target = boston.iloc[:, -1]

# Hold out 20% of the samples as the test set (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2
)

# Alternative ways of building the pipeline; uncomment one to use it.
# ---------------------------------------------------------------------
# Variant A: Pipeline built from pre-created objects.
# transform = StandardScaler()               # feature standardization
# degree = 2                                 # order of the polynomial expansion
# poly = PolynomialFeatures(degree=degree)
# alpha = 0.001
# model = Lasso(alpha=alpha, max_iter=10000)  # Lasso regression model
# pipe = Pipeline([('step1', transform),
#                  ('step2', poly),
#                  ('step3', model)
#                  ])
#
# Variant B: Pipeline with the steps instantiated inline.
# pipe = Pipeline([
#     ('step1', StandardScaler()),
#     ('step2', PolynomialFeatures(degree=2)),
#     ('step3', Lasso(alpha=0.001, max_iter=10000)),
# ])
# ---------------------------------------------------------------------

# Active variant: make_pipeline with auto-generated step names.
pipe = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    Lasso(),
)
pipe.fit(X_train, y_train)

# Predict on the held-out data.
y_pred = pipe.predict(X_test)

# Evaluate on the held-out data.
mse = mean_squared_error(y_test, y_pred)
print(f"均方误差(Mean Squared Error): {mse}")

# Weights and bias: the last (name, estimator) pair of the pipeline is the model.
step_name, model_object = pipe.steps[-1]
print(f"权重列表: {model_object.coef_}")
print(f"偏置项: {model_object.intercept_}")
5.3、网格搜索
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Load the CSV; the last column is the target, all preceding columns are
# the features.
boston = pd.read_csv('./data/boston.csv')
data = boston.iloc[:, :-1]
target = boston.iloc[:, -1]

# Hold out 20% of the samples as the test set (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2
)

# Pipeline with auto-generated step names.
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), Ridge())

# Grid keys have the form '<step name>__<parameter name>'; here the degree of
# the PolynomialFeatures step is searched over 1..5.
param_grid = {'polynomialfeatures__degree': [1, 2, 3, 4, 5]}

# 5-fold cross-validated grid search scored by negative MSE.
grid_search = GridSearchCV(
    pipe,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best parameter combination; the score is negated back into a plain MSE.
print("最佳参数: ", grid_search.best_params_)
print("最佳均方误差: ", -grid_search.best_score_)

# predict() on the search object uses the best estimator found.
y_pred = grid_search.predict(X_test)

# Evaluate on the held-out data.
mse = mean_squared_error(y_test, y_pred)
print(f"测试集均方误差(Mean Squared Error): {mse}")

# Weights and bias: take the final (name, estimator) pair of the best pipeline.
estimator = grid_search.best_estimator_
step_name, model_object = estimator.steps[-1]
print(f"权重列表: {model_object.coef_}")
print(f"偏置项: {model_object.intercept_}")