1.数据准备和生成模型的Pipeline
Pipeline有助于最小化数据泄漏带来的评估偏差
Pipeline能够处理训练数据和评估数据集之间的数据泄漏问题:在交叉验证的每一折中,预处理步骤(如标准化处理)只在训练子集上拟合,再应用到对应的评估子集,从而保证所有数据子集经过相同且无泄漏的数据处理。
# Demo: build a Pipeline (StandardScaler -> LDA) and evaluate it with k-fold
# cross-validation. The Pipeline fits the scaler inside each training fold
# only, so test-fold statistics never leak into preprocessing.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the Pima Indians diabetes dataset: 8 numeric features + binary class.
filename = '/home/duan/pima indians.txt'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataset = read_csv(filename, names=names)
array = dataset.values
X = array[:, 0:8]
Y = array[:, 8]

# BUG FIX: num_folds was referenced but never defined (NameError).
# shuffle=True is also required: without it random_state has no effect, and
# scikit-learn >= 0.24 raises ValueError when random_state is set alone.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Assemble the two-step pipeline: standardize, then fit LDA.
steps = []
steps.append(('Standardize', StandardScaler()))
steps.append(('lda', LinearDiscriminantAnalysis()))
model = Pipeline(steps)

# Mean CV accuracy. NOTE(review): because folds are now shuffled, the value
# may differ slightly from the figure quoted in the surrounding text.
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
运行结果为:
0.773462064251538
2.特征选择和生成模型的Pipeline
Pipeline还提供了一个工具(FeatureUnion),可以把多个特征提取/选择步骤合并为一个整体,并与后续模型一起放入Pipeline,从而保证特征选择过程同样只在各折的训练数据上进行,避免数据泄漏。
# Demo: combine two feature-extraction steps (PCA + SelectKBest) with
# FeatureUnion, feed the union into LogisticRegression via a Pipeline, and
# evaluate with k-fold cross-validation. Running feature selection inside the
# Pipeline keeps it fold-safe (fitted only on each training fold).
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression

# Load the Pima Indians diabetes dataset: 8 numeric features + binary class.
filename = '/home/duan/pima indians.txt'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataset = read_csv(filename, names=names)
array = dataset.values
X = array[:, 0:8]
Y = array[:, 8]

# BUG FIX: num_folds was referenced but never defined (NameError).
# shuffle=True is also required: without it random_state has no effect, and
# scikit-learn >= 0.24 raises ValueError when random_state is set alone.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Build the FeatureUnion: PCA components concatenated with the 6 best
# univariate features.
features = []
features.append(('pca', PCA()))
features.append(('select_best', SelectKBest(k=6)))

# Build the Pipeline: feature union, then logistic regression.
# max_iter raised because the modern lbfgs solver often fails to converge on
# this dataset within the default 100 iterations.
steps = []
steps.append(('feature_union', FeatureUnion(features)))
steps.append(('Logistic', LogisticRegression(max_iter=1000)))
model = Pipeline(steps)

# Mean CV accuracy. NOTE(review): because folds are now shuffled, the value
# may differ slightly from the figure quoted in the surrounding text.
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
运行结果:
0.7799555707450445