#导入
from sklearn.pipeline import Pipeline
#导入“流水线”各个模块(标准化,降维,分类)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
# Define the pipeline: standardization -> PCA -> SVM classifier.
pl_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('svm', SVC()),
])
# Randomly generate 100 samples with 2 features belonging to 3 classes.
# NOTE: the old `sklearn.datasets.samples_generator` module path was
# deprecated (scikit-learn 0.22) and later removed; `make_blobs` must be
# imported from the public `sklearn.datasets` package instead.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=100, n_features=2, centers=3, random_state=6)
# Split into training and test sets (10% held out for testing).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=7
)
# Fit on the training split, then predict the test split once and reuse
# the result for the evaluation below.
pl_svm.fit(X_train, y_train)
y_pred = pl_svm.predict(X_test)
# Evaluate the model.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
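# Because the data has only two features to begin with, PCA merely rotates
# the samples here; no dimensionality reduction actually takes place.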
# Show the pipeline's processing steps as (name, estimator) pairs.
# A bare expression is only displayed inside a notebook cell; print it so the
# steps are also visible when this file runs as a script.
print(pl_svm.steps)
# Show the same processing steps in dictionary form, keyed by step name.
print(pl_svm.named_steps)
# Grid search + Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset and take its feature matrix and labels.
cancer = load_breast_cancer()
x = cancer.data
y = cancer.target
# Split: hold out 20% of the samples as the test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=8
)
# Define the pipeline: standardization -> PCA -> SVM classifier.
pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('svm', SVC()),
    ]
)
# Grid-search parameters; the "svm__" prefix addresses the pipeline step
# named 'svm'.
param_grid = {
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}
# Instantiate the grid search over the whole pipeline with 5-fold CV.
grid = GridSearchCV(pipeline, param_grid, cv=5)
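# In cross-validation the data must be split first and standardized afterwards:
# if the training and test data are standardized together, information about
# the test data leaks into the training step.
# A pipeline does not leak this information, because the scaler is part of the
# estimator being cross-validated and is therefore fitted on the training part
# of each split only.
# For details, see page 240 of 《python机器学习基础教程》
# (the Chinese edition of "Introduction to Machine Learning with Python").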
# Run the grid search (with cross-validation) on the training split.
grid.fit(X_train, y_train)
# Predict labels for the held-out test split. A bare expression is only
# displayed inside a notebook cell; print it so the result is visible when
# this file runs as a script.
print(grid.predict(X_test))
# Output of the prediction above:
# array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
# 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
# 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
# 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
# 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
# 1, 1, 1, 1])
from sklearn.pipeline import make_pipeline
# Same sequence of steps as
# Pipeline([('scaler', StandardScaler()), ('pca', PCA()), ('svm', SVC())])
# defined earlier; the only difference is that the steps are not named by hand.
pipe_short = make_pipeline(StandardScaler(), PCA(), SVC())
# Show the resulting steps by name. A bare expression is only displayed inside
# a notebook cell; print it so it is visible when this file runs as a script.
print(pipe_short.named_steps)