在机器学习pipeline中同时使用PCA和LDA

最新推荐文章于 2024-04-11 15:18:37 发布

Just Jump

最新推荐文章于 2024-04-11 15:18:37 发布

阅读量708

点赞数 1

分类专栏：特征工程、机器学习　文章标签：机器学习、特征值分解、python

本文链接：https://blog.csdn.net/eylier/article/details/105086143

版权

本文同时被 2 个专栏收录：机器学习

81 篇文章 9 订阅

订阅专栏

特征工程

22 篇文章 2 订阅

订阅专栏

'''
在机器学习pipeline中同时使用PCA和LDA
'''
# import our plotting module
import matplotlib.pyplot as plt
# import the Iris dataset from scikit-learn
from sklearn.datasets import load_iris
# PCA and LDA transformers used by the pipelines below
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# Load the Iris dataset and split it into the feature matrix (iris_X)
# and the response/target vector (iris_y).
iris = load_iris()
iris_X = iris.data
iris_y = iris.target

# KNN classifier (3 neighbours) reused as the final step of the pipelines below.
knn = KNeighborsClassifier(n_neighbors=3)
# LDA transformer that keeps a single component.
single_lda = LinearDiscriminantAnalysis(n_components=1)
# PCA transformer that keeps a single component.
single_pca = PCA(n_components=1)

# Baseline: cross-validate KNN on the untransformed features and average
# the per-fold scores. The original author's note cites roughly 98% accuracy
# for this baseline; the exact figure depends on the CV split used.
knn_average = cross_val_score(
    knn,
    iris_X,
    iris_y,
).mean()
# Bare expression: only displays the value in a REPL/notebook session.
knn_average

# Pipeline 1: single-component LDA followed by KNN.
lda_pipeline = Pipeline(steps=[
    ('lda', single_lda),
    ('knn', knn),
])
lda_average = cross_val_score(lda_pipeline, iris_X, iris_y).mean()
# Bare expression: only displays the value in a REPL/notebook session.
lda_average

# Pipeline 2: single-component PCA followed by KNN.
pca_pipeline = Pipeline(steps=[
    ('pca', single_pca),
    ('knn', knn),
])
pca_average = cross_val_score(pca_pipeline, iris_X, iris_y).mean()
pca_average

# Pipeline 3: LDA again, this time keeping two components.
# Note that this rebinds lda_pipeline and lda_average from Pipeline 1.
lda_pipeline = Pipeline(steps=[
    ('lda', LinearDiscriminantAnalysis(n_components=2)),
    ('knn', knn),
])
lda_average = cross_val_score(lda_pipeline, iris_X, iris_y).mean()
# Author's note: this scores about as well as KNN on the original data.
lda_average

# Compare the feature-transformation tools (PCA/LDA) against a
# feature-selection tool.
from sklearn.feature_selection import SelectKBest

# Try k = 1, 2, 3 selected features (every k short of keeping all columns).
for k in range(1, 4):
    # Keep the k best-scoring features, then classify with KNN.
    select_pipeline = Pipeline(steps=[
        ('select', SelectKBest(k=k)),
        ('knn', knn),
    ])
    # Mean cross-validated score of the pipeline for this k.
    select_average = cross_val_score(select_pipeline, iris_X, iris_y).mean()
    print(k, "best feature has accuracy:", select_average)

'''
用 GridSearch module 寻找最优组合：
Scaling data (with or without mean/std) 
PCA components
LDA components
KNN neighbors
'''
def get_best_model_and_accuracy(model, params, X, y):
    """Grid-search ``model`` over ``params`` and report the best result.

    Fits a ``GridSearchCV`` on ``(X, y)`` and prints the best
    cross-validated score, the parameter set that produced it, and the
    mean fit and score times averaged over every parameter combination.

    Args:
        model: Estimator (or pipeline) to tune.
        params: Parameter grid passed to ``GridSearchCV``.
        X: Feature matrix.
        y: Response/target values.

    Returns:
        A ``(best_estimator, best_score)`` tuple: the estimator refit with
        the best parameters and its mean cross-validated score. Earlier
        versions only printed these values and returned ``None``; callers
        that ignore the return value are unaffected.
    """
    grid = GridSearchCV(
        model,  # the model to grid search
        params,  # the parameter set to try
        # If a parameter set raises an error during fitting, record a score
        # of 0 for it and continue instead of aborting the whole search.
        error_score=0.,
    )
    grid.fit(X, y)  # fit the model over every parameter combination

    # Our classical metric for performance.
    print("Best Accuracy: {}".format(grid.best_score_))
    # The parameters that produced the best score.
    print("Best Parameters: {}".format(grid.best_params_))

    # Average time (in seconds) it took a model to fit to the data.
    avg_time_fit = round(grid.cv_results_['mean_fit_time'].mean(), 3)
    print("Average Time to Fit (s): {}".format(avg_time_fit))

    # Average time (in seconds) it took a model to score held-out data;
    # gives a feel for how the model would behave in real-time use.
    avg_time_score = round(grid.cv_results_['mean_score_time'].mean(), 3)
    print("Average Time to Score (s):{}".format(avg_time_score))

    # Hand the tuned model and its score back so callers can reuse them.
    return grid.best_estimator_, grid.best_score_

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Preprocessing chain: scale -> PCA -> LDA. It is wrapped as a single
# pipeline so its parameters are addressed in the grid as
# 'preprocessing__<step>__<param>'.
preprocessing = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('pca', PCA()),
    ('lda', LinearDiscriminantAnalysis()),
])

# Larger pipeline: the preprocessing chain followed by a KNN classifier.
iris_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('clf', KNeighborsClassifier()),
])

# Search space: scaler options, PCA/LDA component counts and KNN neighbours.
iris_params = {
    'preprocessing__scale__with_std': [True, False],
    'preprocessing__scale__with_mean': [True, False],
    'preprocessing__pca__n_components': [1, 2, 3, 4],
    # According to the scikit-learn docs, the max allowed n_components for
    # LDA is (number of classes - 1).
    'preprocessing__lda__n_components': [1, 2],
    'clf__n_neighbors': range(1, 9),
}

# Run the grid search and print the best score, parameters and timings.
get_best_model_and_accuracy(iris_pipeline, iris_params, iris_X, iris_y)


'''output:
1 best feature has accuracy: 0.9538398692810457
2 best feature has accuracy: 0.9607843137254902
3 best feature has accuracy: 0.9738562091503268
Best Accuracy: 0.9866666666666667
Best Parameters: {'clf__n_neighbors': 3, 'preprocessing__lda__n_components': 2, 'preprocessing__pca__n_components': 3, 'preprocessing__scale__with_mean': True, 'preprocessing__scale__with_std': False}
Average Time to Fit (s): 0.003
Average Time to Score (s):0.003
'''