'''
Using PCA and LDA together in a machine-learning pipeline
'''
# import our plotting module
import matplotlib.pyplot as plt
# import the Iris dataset from scikit-learn
from sklearn.datasets import load_iris
# PCA and LDA are the two feature transformers compared in this script
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# Compare KNN on the raw Iris features against KNN fed by PCA or LDA.
# PCA and LinearDiscriminantAnalysis must be imported at the top of the file
# (sklearn.decomposition / sklearn.discriminant_analysis); without those
# imports this section fails with NameError on its first transformer.

# load the Iris dataset
iris = load_iris()
# create X and y variables to hold the features and the response column
iris_X, iris_y = iris.data, iris.target

# PCA transformer that keeps a single component
single_pca = PCA(n_components=1)
# LDA transformer that keeps a single component
single_lda = LinearDiscriminantAnalysis(n_components=1)
# KNN model reused as the final step of every pipeline below
knn = KNeighborsClassifier(n_neighbors=3)

# baseline: cross-validate KNN without any feature transformation
knn_average = cross_val_score(knn, iris_X, iris_y).mean()
# The original notes report roughly 98% accuracy for this baseline
# (figure not re-verified here).
# A bare name displays its value in a notebook/REPL and has no effect when
# the file is run as a script; the same applies to the bare names below.
knn_average

# LDA keeping only its single strongest component, followed by KNN
lda_pipeline = Pipeline([('lda', single_lda), ('knn', knn)])
lda_average = cross_val_score(lda_pipeline, iris_X, iris_y).mean()
lda_average

# PCA keeping a single component, followed by KNN
pca_pipeline = Pipeline([('pca', single_pca), ('knn', knn)])
pca_average = cross_val_score(pca_pipeline, iris_X, iris_y).mean()
pca_average

# try LDA with 2 components; this rebinds lda_pipeline and lda_average
lda_pipeline = Pipeline([('lda', LinearDiscriminantAnalysis(n_components=2)), ('knn', knn)])
lda_average = cross_val_score(lda_pipeline, iris_X, iris_y).mean()
# The original notes describe this result as just as good as using the
# original data (not re-verified here).
lda_average
# compare our feature transformation tools to a feature selection tool
from sklearn.feature_selection import SelectKBest

# try all possible values for k, excluding keeping all columns
for k in [1, 2, 3]:
    # keep the k best-scoring features, then classify with the shared KNN model
    select_pipeline = Pipeline([('select', SelectKBest(k=k)), ('knn', knn)])
    # mean cross-validated score of this pipeline
    select_average = cross_val_score(select_pipeline, iris_X, iris_y).mean()
    print(k, "best feature has accuracy:", select_average)
'''
Use the GridSearch module to find the best combination of:
Scaling data (with or without mean/std)
PCA components
LDA components
KNN neighbors
'''
def get_best_model_and_accuracy(model, params, X, y):
    """Grid-search ``params`` over ``model`` and print a short report.

    Fits a ``GridSearchCV`` on ``X`` and ``y``, then prints the best score,
    the parameter set that produced it, and the fit and score times averaged
    over every parameter combination in ``cv_results_``.

    Args:
        model: the estimator (or pipeline) to grid search.
        params: the parameter set to try.
        X: feature data passed to ``fit``.
        y: response values passed to ``fit``.

    Returns:
        None. The results are only printed.
    """
    grid = GridSearchCV(
        model,   # the model to grid search
        params,  # the parameter set to try
        # if a parameter set raises an error, continue and record its
        # performance as 0 instead of aborting the whole search
        error_score=0.,
    )
    grid.fit(X, y)  # fit the model for every parameter combination

    # our classical metric for performance
    print("Best Accuracy: {}".format(grid.best_score_))
    # the parameters that produced the best score
    print("Best Parameters: {}".format(grid.best_params_))

    # the average time it took a model to fit to the data (in seconds)
    avg_time_fit = round(grid.cv_results_['mean_fit_time'].mean(), 3)
    print("Average Time to Fit (s): {}".format(avg_time_fit))

    # the average time it took a model to score out-of-sample data (in
    # seconds); a hint at how the model would behave in real-time analysis
    avg_time_score = round(grid.cv_results_['mean_score_time'].mean(), 3)
    print("Average Time to Score (s):{}".format(avg_time_score))
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
# Build the larger pipeline first: scale -> PCA -> LDA as one nested
# preprocessing step, with KNN as the final classifier.
preprocessing = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('pca', PCA()),
    ('lda', LinearDiscriminantAnalysis()),
])
iris_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('clf', KNeighborsClassifier()),
])

# Search space; each key is the step path joined by double underscores,
# ending in the parameter name.
iris_params = {
    'preprocessing__scale__with_std': [True, False],
    'preprocessing__scale__with_mean': [True, False],
    'preprocessing__pca__n_components': [1, 2, 3, 4],
    # according to scikit-learn docs, max allowed n_components for LDA is
    # number of classes-1
    'preprocessing__lda__n_components': [1, 2],
    'clf__n_neighbors': range(1, 9),
}

# run the search and print the report
get_best_model_and_accuracy(iris_pipeline, iris_params, iris_X, iris_y)
'''output:
1 best feature has accuracy: 0.9538398692810457
2 best feature has accuracy: 0.9607843137254902
3 best feature has accuracy: 0.9738562091503268
Best Accuracy: 0.9866666666666667
Best Parameters: {'clf__n_neighbors': 3, 'preprocessing__lda__n_components': 2, 'preprocessing__pca__n_components': 3, 'preprocessing__scale__with_mean': True, 'preprocessing__scale__with_std': False}
Average Time to Fit (s): 0.003
Average Time to Score (s):0.003
'''