谈谈特征选择及基于scikit-learn的示例

最新推荐文章于 2023-12-20 15:12:25 发布

frank_hetest

最新推荐文章于 2023-12-20 15:12:25 发布

阅读量208

点赞数

特征选择方法有多种，主要包括以下几种。

1. 移除低方差特征法
假设某个特征对应的方差为0或者非常小，通常可以认为该特征的作用可以忽略，因而可以移除该特征。

示例如下：

from sklearn.feature_selection import VarianceThreshold

# Six samples, each described by three boolean features.
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]

# The threshold is written as p * (1 - p) with p = 0.8.
p = .8
sel = VarianceThreshold(threshold=(p * (1 - p)))

# Fit the selector on X and return X restricted to the retained columns.
sel.fit_transform(X)

array([[0, 1],
       [1, 0],       
       [0, 0], 
       [1, 1], 
       [1, 0],       
       [1, 1]])

容易看出上述方法剔除了第一列：该列取值为 0 的比例为 $p = 5/6 > 0.8$，对应的方差 $p(1-p) = 5/36 \approx 0.139$ 低于给定阈值 $0.8 \times (1 - 0.8) = 0.16$。

2. 单变量特征选择法

这种方法通过单变量统计检验来选择最好的特征。

比如，通过卡方检验方法可以选择前两个最好的特征。

示例如下：

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2

# Load the iris data and split it into the feature matrix and the labels.
iris = load_iris()
X = iris.data
y = iris.target

# Shape before selection (the tuple below is the interpreter output).
X.shape

(150, 4)

# Score every feature with the chi-squared test and keep the two best.
selector = SelectKBest(chi2, k=2)
X_new = selector.fit_transform(X, y)

# Shape after selection (the tuple below is the interpreter output).
X_new.shape

(150, 2)

下面给出一个完整的示例，来说明特征选择对SVM的影响。

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets, svm
from sklearn.feature_selection import SelectPercentile, f_classif


def _squared_weight_profile(model):
    """Sum the squared coefficients of a fitted linear SVM per feature,
    scaled so that the largest entry equals 1."""
    weights = (model.coef_ ** 2).sum(axis=0)
    return weights / weights.max()


# --- Data: the iris features followed by 20 columns of uniform noise ---
iris = datasets.load_iris()
n_samples = len(iris.data)
E = np.random.uniform(0, 0.1, size=(n_samples, 20))
X = np.hstack((iris.data, E))
y = iris.target

# --- Figure set-up: one x position per feature column ---
plt.figure(1)
plt.clf()
X_indices = np.arange(X.shape[-1])

# --- Univariate selection: F-test scores, keep the top 10% of features ---
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X, y)

# Plot -log10(p-value) per feature, rescaled to a maximum of 1.
scores = -np.log10(selector.pvalues_)
scores = scores / scores.max()
plt.bar(
    X_indices - .45,
    scores,
    width=.2,
    label=r'Univariate score ($-Log(p_{value})$)',
    color='g',
)

# --- Linear SVM fitted on every feature ---
clf = svm.SVC(kernel='linear')
clf.fit(X, y)
svm_weights = _squared_weight_profile(clf)
plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', color='r')

# --- Linear SVM fitted only on the features kept by the selector ---
clf_selected = svm.SVC(kernel='linear')
clf_selected.fit(selector.transform(X), y)
svm_weights_selected = _squared_weight_profile(clf_selected)

# Bars are drawn only at the positions of the retained features.
kept_positions = X_indices[selector.get_support()]
plt.bar(
    kept_positions - .05,
    svm_weights_selected,
    width=.2,
    label='SVM weights after selection',
    color='b',
)

# --- Cosmetics and display ---
plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.yticks(())
plt.axis('tight')
plt.legend(loc='upper right')
plt.show()

实验结果如下

（此处原为实验结果图：各特征的单变量得分、SVM 权重以及特征选择后 SVM 权重的对比柱状图；图片链接已失效。）

3. 递归特征淘汰法

这种方法通过在训练所得模型中逐步去除不那么重要的特征，每次都会去掉权值绝对值最小的特征，直至特征个数达到预设数目。

示例如下：

print(__doc__)

from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

# Flatten every digit image into one row of pixel features.
digits = load_digits()
n_images = len(digits.images)
X = digits.images.reshape((n_images, -1))
y = digits.target

# Eliminate one pixel per step until a single pixel is left, which
# yields a rank for every pixel.
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)

# Put the flat ranking back into the shape of a single image.
image_shape = digits.images[0].shape
ranking = rfe.ranking_.reshape(image_shape)

# Show the ranking as an image with a colour scale.
plt.matshow(ranking)
plt.colorbar()
plt.title("Ranking of pixels with RFE")
plt.show()

实验结果如下：（此处原为 RFE 像素排名图，图片链接已失效。）

含有交叉验证方法的示例如下：

print(__doc__)

import matplotlib.pyplot as plt

from sklearn.svm import SVC

# StratifiedKFold lives in sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists.
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_selection import RFECV

from sklearn.datasets import make_classification

# Build a classification task using 3 informative features

X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
                           n_redundant=2, n_repeated=0, n_classes=8,
                           n_clusters_per_class=1, random_state=0)

# Create the RFE object and compute a cross-validated score.

svc = SVC(kernel="linear")

# The "accuracy" scoring is proportional to the number of correct
# classifications. The model_selection splitter takes only the number of
# folds; the labels are supplied later through rfecv.fit(X, y).

rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
              scoring='accuracy')

rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Mean cross-validation score for each candidate number of features.
# Current releases expose it through cv_results_; fall back to the
# legacy grid_scores_ attribute on older releases that lack cv_results_.

if hasattr(rfecv, "cv_results_"):
    mean_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
# (step=1, so entry i corresponds to i + 1 selected features)

plt.figure()

plt.xlabel("Number of features selected")

plt.ylabel("Cross validation score (nb of correct classifications)")

plt.plot(range(1, len(mean_scores) + 1), mean_scores)

plt.show()

（此处原为结果图：交叉验证得分随所选特征数量变化的曲线；图片链接已失效。）

4. 基于 SelectFromModel 的特征选择

这种方法先训练一个模型，再把对应权重（系数或特征重要性）小于设定阈值的特征舍弃。

示例如下：

# Author: Manoj Kumar <mks542@nyu.edu>

# License: BSD 3 clause

print(__doc__)

import matplotlib.pyplot as plt

import numpy as np

from sklearn.datasets import load_boston

from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LassoCV

# Load the boston dataset.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2; running this example needs an older release or a substitute
# regression dataset -- confirm the target version.

boston = load_boston()

X, y = boston['data'], boston['target']

# We use the base estimator LassoCV since
# the L1 norm promotes sparsity of features.

clf = LassoCV()

# Set a minimum threshold of 0.25

sfm = SelectFromModel(clf, threshold=0.25)

sfm.fit(X, y)

# Compute the transformed data once up front so that X_transform is
# defined even when the loop below never executes.

X_transform = sfm.transform(X)

n_features = X_transform.shape[1]

# Reset the threshold till the number of features equals two.

# Note that the attribute can be set directly instead of repeatedly

# fitting the metatransformer.

while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.

plt.title(
    "Features selected from Boston using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)

feature1 = X_transform[:, 0]

feature2 = X_transform[:, 1]

plt.plot(feature1, feature2, 'r.')

plt.xlabel("Feature number 1")

plt.ylabel("Feature number 2")

# Clip the y axis to the observed range of the second feature.

plt.ylim([np.min(feature2), np.max(feature2)])

plt.show()

（此处原为结果图：所选两个特征的散点图；图片链接已失效。）

5. 基于稀疏恢复的特征选择

这种方法跟压缩感知和L1正则的关联性比较大。

print(__doc__)

# Author: Alexandre Gramfort and Gael Varoquaux

# License: BSD 3 clause

import warnings

import matplotlib.pyplot as plt

import numpy as np

from scipy import linalg

from sklearn.linear_model import (RandomizedLasso,
                                  lasso_stability_path,
                                  LassoLarsCV)

from sklearn.feature_selection import f_regression

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import auc, precision_recall_curve

from sklearn.ensemble import ExtraTreesRegressor

from sklearn.utils.extmath import pinvh

from sklearn.utils import ConvergenceWarning

def mutual_incoherence(X_relevant, X_irelevant):
    """Return the mutual incoherence of two column blocks of a design.

    Follows formula (26a) of [Wainwright2006]: the largest absolute row
    sum of ``X_irelevant.T @ X_relevant @ pinv(X_relevant.T @ X_relevant)``.

    Parameters
    ----------
    X_relevant : array of shape (n_samples, n_relevant)
        Columns of the design matrix for the relevant features.
    X_irelevant : array of shape (n_samples, n_irrelevant)
        Columns of the design matrix for the remaining features.

    Returns
    -------
    float
        Maximum over the irrelevant features of the summed absolute
        projection coefficients.
    """
    # The Gram matrix is symmetric, so the Hermitian pseudo-inverse applies.
    # scipy.linalg.pinvh is used because sklearn.utils.extmath.pinvh has
    # been removed from scikit-learn.
    gram_pinv = linalg.pinvh(np.dot(X_relevant.T, X_relevant))
    projector = np.dot(np.dot(X_irelevant.T, X_relevant), gram_pinv)
    return np.max(np.abs(projector).sum(axis=1))

# Run the whole experiment twice, once per conditioning value. Each pass
# simulates a correlated regression design, plots the stability-selection
# path, and compares four feature-scoring methods against the ground truth.
# NOTE(review): RandomizedLasso and lasso_stability_path are believed to
# have been removed from scikit-learn (0.21) -- confirm the target version.
for conditioning in (1, 1e-4):
    #####################################
    # Simulate regression data with a correlated design
    n_features = 501
    n_relevant_features = 3
    noise_level = .2
    coef_min = .2
    # The Donoho-Tanner phase transition is 
    # around n_samples=25: below we
    # will completely fail to recover in the well-conditioned case
    n_samples = 25
    block_size = n_relevant_features

    # Fixed seed: both passes of the loop draw the same random numbers.
    rng = np.random.RandomState(42)

    # The coefficients of our model
    # (only the first n_relevant_features entries are non-zero)
    coef = np.zeros(n_features)
    coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)

    # The correlation of our design: variables correlated by blocs of 3
    corr = np.zeros((n_features, n_features))
    for i in range(0, n_features, block_size):
        corr[i:i + block_size, i:i + block_size] = 1 - conditioning
    # Set the main diagonal to 1 (stride n_features + 1 in the flat view).
    corr.flat[::n_features + 1] = 1
    corr = linalg.cholesky(corr)

    # Our design
    X = rng.normal(size=(n_samples, n_features))
    X = np.dot(X, corr)
    # Keep [Wainwright2006] (26c) constant
    # NOTE(review): X is (n_samples, n_features), so this slice selects the
    # first n_relevant_features rows (samples), not columns -- confirm
    # that this is intended.
    X[:n_relevant_features] /= np.abs(
        linalg.svdvals(X[:n_relevant_features])).max()
    X = StandardScaler().fit_transform(X.copy())

    # The output variable
    y = np.dot(X, coef)
    y /= np.std(y)
    # We scale the added noise as a function of the average correlation
    # between the design and the output variable
    # NOTE(review): the line below adds noise scaled by the fixed
    # noise_level only; no correlation term is computed here.
    y += noise_level * rng.normal(size=n_samples)
    mi = mutual_incoherence(X[:, :n_relevant_features],
                            X[:, n_relevant_features:])

    ####################################
    # Plot stability selection path, 
    # using a high eps for early stopping
    # of the path, to save computation time
    alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42,
                                                   eps=0.05)

    plt.figure()
    # We plot the path as a function of alpha/alpha_max 
    # to the power 1/3: the
    # power 1/3 scales the path less brutally than 
    # the log, and enables to
    # see the progression along the path
    # Relevant features (coef != 0) are drawn in red, the others in black;
    # the first grid point is skipped in both plots.
    hg = plt.plot(alpha_grid[1:] ** .333, 
                  scores_path[coef != 0].T[1:], 'r')
    hb = plt.plot(alpha_grid[1:] ** .333, 
                  scores_path[coef == 0].T[1:], 'k')
    ymin, ymax = plt.ylim()
    plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
    plt.ylabel('Stability score: proportion of times selected')
    plt.title('Stability Scores Path - Mutual incoherence: %.1f' % mi)
    plt.axis('tight')
    plt.legend((hg[0], hb[0]), 
               ('relevant features', 'irrelevant features'),
               loc='best')

    ####################################
    # Plot the estimated stability scores for a given alpha

    # Use 6-fold cross-validation rather than 
    # the default 3-fold: it leads to
    # a better choice of alpha:
    # Stop the user warnings outputs- they are 
    # not necessary for the example
    # as it is specifically set up to be challenging.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        lars_cv = LassoLarsCV(cv=6).fit(X, y)

    # Run the RandomizedLasso: we use a paths
    # going down to .1*alpha_max
    # to avoid exploring the regime in which
    # very noisy variables enter
    # the model
    alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
    clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
    trees = ExtraTreesRegressor(100).fit(X, y)
    # Compare with F-score
    F, _ = f_regression(X, y)

    plt.figure()
    # One curve per scoring method; each score vector is divided by its
    # maximum and floored at 1e-4 before being drawn on a log-scaled y axis.
    for name, score in [('F-test', F),
                        ('Stability selection', clf.scores_),
                        ('Lasso coefs', np.abs(lars_cv.coef_)),
                        ('Trees', trees.feature_importances_),
                        ]:
        precision, recall, thresholds = precision_recall_curve(coef != 0,
                                                               score)
        plt.semilogy(np.maximum(score / np.max(score), 1e-4),
                     label="%s. AUC: %.3f" % (name, auc(recall, precision)))

    # Mark the positions of the truly relevant features near the bottom
    # of the plot.
    plt.plot(np.where(coef != 0)[0], [2e-4] * n_relevant_features, 'mo',
             label="Ground truth")
    plt.xlabel("Features")
    plt.ylabel("Score")
    # Plot only the 100 first coefficients
    plt.xlim(0, 100)
    plt.legend(loc='best')
    plt.title('Feature selection scores - Mutual incoherence: %.1f'
              % mi)

# Display every figure created by both passes of the loop.
plt.show()

实验结果如下

（此处原为结果图：稳定性得分路径以及各方法特征得分的对比；图片链接已失效。）

6. 基于树（随机森林）的特征选择方法

示例如下：

print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

# Synthetic two-class problem: 10 features, 3 of them informative.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=3,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    random_state=0,
    shuffle=False,
)

# Fit the forest and read its per-feature importances.
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(X, y)
importances = forest.feature_importances_

# Spread of the importances across the individual trees of the forest.
per_tree = np.array([tree.feature_importances_
                     for tree in forest.estimators_])
std = per_tree.std(axis=0)

# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Report the ranking, one line per feature.
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx,
                                   importances[feature_idx]))

# Bar chart of the importances in ranked order, with error bars.
n_features = X.shape[1]
positions = range(n_features)

plt.figure()
plt.title("Feature importances")
plt.bar(positions, importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(positions, indices)
plt.xlim([-1, n_features])
plt.show()

实验结果如下

（此处原为结果图：按重要性排序的特征重要性柱状图；图片链接已失效。）

下面的示例利用森林方法（ExtraTreesClassifier）计算人脸图像中每个像素特征的重要性。

print(__doc__)

from time import time

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_olivetti_faces

from sklearn.ensemble import ExtraTreesClassifier

# Number of cores to use to perform parallel
# fitting of the forest model

n_jobs = 1

# Load the faces dataset and flatten every image into one row of pixels

data = fetch_olivetti_faces()

X = data.images.reshape((len(data.images), -1))

y = data.target

# Limit to 5 classes: keep only the samples whose label is below 5.
# (These statements were fused onto shared lines in the original text,
# which made the script a syntax error.)

mask = y < 5

X = X[mask]

y = y[mask]

# Build a forest and compute the pixel importances

print("Fitting ExtraTreesClassifier on faces data with %d cores..."
      % n_jobs)

t0 = time()

forest = ExtraTreesClassifier(n_estimators=1000,
                              max_features=128,
                              n_jobs=n_jobs,
                              random_state=0)

forest.fit(X, y)

print("done in %0.3fs" % (time() - t0))

# One importance per pixel, put back into the shape of a single image

importances = forest.feature_importances_

importances = importances.reshape(data.images[0].shape)

# Plot pixel importances

plt.matshow(importances, cmap=plt.cm.hot)

plt.title("Pixel importances with forests of trees")

plt.show()