Python机器学习库sklearn自动特征选择（训练集）

最新推荐文章于 2024-08-08 16:43:27 发布

thinker_1120

最新推荐文章于 2024-08-08 16:43:27 发布

阅读量1.4w

点赞数 6

分类专栏：算法实现

本文链接：https://blog.csdn.net/cymy001/article/details/78576272

版权

算法实现专栏收录该内容

38 篇文章 9 订阅

订阅专栏

1.单变量分析

from sklearn.feature_selection import SelectPercentile

%matplotlib inline
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html#sklearn.feature_selection.SelectPercentile
from sklearn import __version__ as sklearn_version
# train_test_split moved from sklearn.cross_validation to
# sklearn.model_selection in scikit-learn 0.18. Try the modern location first
# and fall back for old releases. This avoids distutils, which was removed
# from the standard library in Python 3.12 (PEP 632).
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

cancer = load_breast_cancer()   # cancer.data has shape (569, 30)
# get deterministic random numbers
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))   # standard-normal N(0, 1) noise array of shape (569, 50)
# add noise features to the data
# the first 30 features are from the dataset, the next 50 are noise
X_w_noise = np.hstack([cancer.data, noise])   # real features plus noise, shape (569, 80)

X_train, X_test, y_train, y_test = train_test_split(X_w_noise, cancer.target, random_state=0, test_size=.5)

# use f_classif (the default) and SelectPercentile to select 50% of features:
select = SelectPercentile(percentile=50)   # keep the top-scoring 50% of features; the default score function f_classif only applies to classification
select.fit(X_train, y_train)
# transform training set:
X_train_selected = select.transform(X_train)

print(X_train.shape)
print(X_train_selected.shape)
#Output:
#(284, 80)
#(284, 40)

不同的特征选择重要性计算函数：
f_classif：
Compute the ANOVA F-value for the provided sample.
f_regression：
1)Compute the correlation between each regressor and the target,

$\frac{E\left[(X[:, i] - \mathrm{mean}(X[:, i]))\,(y - \mathrm{mean}(y))\right]}{\mathrm{std}(X[:, i])\,\mathrm{std}(y)}$
2)It is converted to an F score then to a p-value.
# Imports come first so that f_classif is defined before it is called.
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_classif, f_regression, chi2

# Per-feature ANOVA F-statistics and their p-values against the class labels.
F, p = f_classif(X_train, y_train)

# One dot per feature: p-value on the y axis, feature index on the x axis.
plt.figure()
plt.plot(p, 'o')

这里写图片描述

mask = select.get_support()   # boolean mask of the selected features (integer indices are available via indices=True)
print(mask)
# visualize the mask. black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')   # plt.matshow renders the single-row array as an image

这里写图片描述

验证get_support的作用
def getnummask(mask):
    """Return how many entries of a boolean mask are set.

    Args:
        mask: One-dimensional sequence or array of booleans, such as the
            result of ``select.get_support()``.

    Returns:
        int: Number of true entries; 0 for an empty mask.
    """
    # Iterate over the elements directly rather than indexing through
    # range(len(mask)) and comparing each entry against True.
    return sum(1 for selected in mask if selected)
# count the True entries of the support mask, i.e. the number of selected features
getnummask(mask)
#Output:
#40

from sklearn.linear_model import LogisticRegression
# transform test data:
X_test_selected = select.transform(X_test)
lr = LogisticRegression()
# baseline: fit and score on all 80 columns (30 original + 50 noise)
lr.fit(X_train, y_train)
print("Score with all features: %f" % lr.score(X_test, y_test))
# refit the same estimator on the selected columns only and score again
lr.fit(X_train_selected, y_train)
print("Score with only selected features: %f" % lr.score(X_test_selected, y_test))
#Output:
#Score with all features: 0.929825
#Score with only selected features: 0.940351

2.基于模型的特征选择

from sklearn.feature_selection import SelectFromModel

from sklearn.feature_selection import SelectFromModel
# Fits a model that exposes a coef_ or feature_importances_ attribute, then drops the features whose importance is below the given threshold
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html#sklearn.feature_selection.SelectFromModel
from sklearn.ensemble import RandomForestClassifier
select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold="median")
# threshold sets the importance cutoff used to filter features; here it is the median importance
select.fit(X_train, y_train)
# NOTE(review): the "_l1" suffix is misleading, the selector here is a random forest, not an L1-penalised model
X_train_l1 = select.transform(X_train)
print(X_train.shape)
print(X_train_l1.shape)
#Output:
#(284, 80)
#(284, 40)

# support mask of the model-based selector
mask = select.get_support()
# visualize the mask. black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')

这里写图片描述

# reduce the test set with the same fitted selector, then score a logistic
# regression trained on the reduced training set
X_test_l1 = select.transform(X_test)
LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
#Output:
#0.9508771929824561

3.RFE，逐步特征删除

from sklearn.feature_selection import RFE

from sklearn.feature_selection import RFE
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE
# Repeatedly removes the least important features (ranked by coef_ or feature_importances_) until the requested number of features remains
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=40)
#select = RFE(LogisticRegression(penalty="l1"), n_features_to_select=40)
# n_features_to_select sets how many features to keep; by default half of them are kept

select.fit(X_train, y_train)
# visualize the selected features:
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')

这里写图片描述

# reduce both splits to the features kept by RFE
X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)

# logistic regression trained and scored on the RFE-selected features
LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
#Output:
#0.9508771929824561

# score of the estimator wrapped by RFE (the random forest) on the selected features
select.score(X_test, y_test)
#Output:
#0.9508771929824561

4.序列特征选择（Sequential Feature Selection，mlxtend）：http://rasbt.github.io/mlxtend/

from mlxtend.feature_selection import SequentialFeatureSelector

from mlxtend.feature_selection import SequentialFeatureSelector
#http://rasbt.github.io/mlxtend/
# Forward sequential selection of 40 features, scored by 5-fold
# cross-validated accuracy of a logistic regression.
sfs = SequentialFeatureSelector(LogisticRegression(), k_features=40, forward=True, scoring='accuracy',cv=5)
sfs = sfs.fit(X_train, y_train)
# Size the mask from the data instead of hard-coding the column count, so the
# snippet keeps working if the number of features changes.
mask = np.zeros(X_train.shape[1], dtype=bool)
mask[list(sfs.k_feature_idx_)] = True
plt.matshow(mask.reshape(1, -1), cmap='gray_r')

这里写图片描述

# logistic regression trained and scored on the features chosen by the sequential selector
LogisticRegression().fit(sfs.transform(X_train), y_train).score(sfs.transform(X_test), y_test)
#Output:
#0.93684210526315792

# Show the indices of the features chosen by the sequential selector.
# The output below is in ascending index order, so it tells which features
# were selected rather than the order in which they were added.
# NOTE(review): the per-step subsets are presumably available via sfs.subsets_ - confirm against the mlxtend docs
from mlxtend.feature_selection import SequentialFeatureSelector
#http://rasbt.github.io/mlxtend/
sfs = SequentialFeatureSelector(LogisticRegression(), k_features=40, forward=True, scoring='accuracy',cv=5)
sfs = sfs.fit(X_train, y_train)
np.array(sfs.k_feature_idx_)
#Output:
#array([ 0,  2,  4,  5,  6,  7,  8,  9, 14, 15, 16, 17, 18, 19, 20, 22, 24,
#       25, 26, 27, 28, 29, 30, 31, 32, 35, 36, 38, 39, 42, 43, 45, 50, 52,
#       59, 64, 70, 71, 72, 78])