sklearn官方文档学习笔记 管道和特征联合:组合估计器
Pipeline and FeatureUnion: combining estimators
管道:链接估计器
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
estimators=[('reduce_dim',PCA()),('clf',SVC())]#其中的'reduce_dim'是自定义的步骤名字
pipe=Pipeline(estimators)
pipe
Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False))])
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Binarizer
make_pipeline(Binarizer(),MultinomialNB())#make_pipeline是上面代码的一种简写形式
Pipeline(steps=[('binarizer', Binarizer(copy=True, threshold=0.0)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
pipe.steps[0]#steps属性里以列表形式存着管道中的估计器
('reduce_dim',
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False))
pipe.named_steps['reduce_dim']#在named_steps属性中以dict形式存着步骤
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
pipe.set_params(clf__C=10)#以这种形式给指定名字的估计器(clf)的参数(C)赋值 <estimator>__<parameter>
Pipeline(steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False))])
这对于进行网格搜索尤其重要:
from sklearn.model_selection import GridSearchCV
params=dict(reduce_dim__n_components=[2,5,10],#设置reduce_dim的n_components参数为多个值以供选取模型最优值
clf__C=[0.1,10,100])#在模型选择过程中可以设置参数列表
grid_search=GridSearchCV(pipe,param_grid=params)
单独的步骤也可以作为参数被替换,并且非最终步骤可以通过设置为None来忽略:
from sklearn.linear_model import LogisticRegression
params=dict(reduce_dim=[None,PCA(5),PCA(10)],clf=[SVC(),LogisticRegression()],clf__C=[0.1,10,100])
grid_search=GridSearchCV(pipe,param_grid=params)
FeatureUnion:复合特征空间
一个FeatureUnion由一组transformer对象组成。在fit过程中,每个transformer对象独立地fit数据;在transform过程中,各transformer并行地作用于数据,它们输出的样本向量首尾相接,组成更大的向量。
FeatureUnion和前一节的Pipeline目的相近,都是为了使用方便,以及联合参数估计和验证。两者可以组合起来构建更复杂的模型。
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
estimators=[('linear_pca',PCA()),('kernel_pca',KernelPCA())]
combined=FeatureUnion(estimators)
combined
FeatureUnion(n_jobs=1,
transformer_list=[('linear_pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
fit_inverse_transform=False, gamma=None, kernel='linear',
kernel_params=None, max_iter=None, n_components=None, n_jobs=1,
random_state=None, remove_zero_eig=False, tol=0))],
transformer_weights=None)
像Pipeline一样,FeatureUnion有一个名为make_union的简写构造函数,不需要明确命名组件。
combined.set_params(kernel_pca=None)#像Pipeline一样,可以使用set_params替换单独的步骤,并通过设置为None来忽略该步骤
FeatureUnion(n_jobs=1,
transformer_list=[('linear_pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)), ('kernel_pca', None)],
transformer_weights=None)