1.数据准备和生成模型的Pipeline
Pipeline能够处理训练数据集和评估数据集之间的数据泄露问题,通常会在数据处理过程中对分离出的所有数据子集做同样的数据处理,如正态化处理。
from pandas import read_csv
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# 导入数据
filename = 'pima_data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataset = read_csv(filename, names=names)
# 将数据划分成输入数据和输出数据
array = dataset.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
steps=[]
steps.append(('Standardize',StandardScaler()))
steps.append(('lda',LinearDiscriminantAnalysis()))
model=Pipeline(steps)
result=cross_val_score(model,X,Y,cv=kfold)
print(result.mean())
2.特征选择和生成模型的Pipeline
Pipeline提供一个工具(FeatureUnion)来保证数据特征选择时数据的稳定性。
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
# 导入数据
filename = 'pima_data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataset = read_csv(filename, names=names)
# 将数据划分成输入数据和输出数据
array = dataset.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
# 生成FeatureUnion
features = []
features.append(('pca', PCA()))
features.append(('select_best', SelectKBest(k=6)))
# 生成Pipeline
steps = []
steps.append(('feature_union', FeatureUnion(features)))
steps.append(('logistic', LogisticRegression()))
model = Pipeline(steps)
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())