Feature Selection

The classes in the sklearn.feature_selection module can be used for feature selection/dimensionality reduction on sample sets, either to improve estimators' accuracy scores or to boost their performance on very high-dimensional datasets.

Removing features with low variance

VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance does not meet a given threshold.

from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
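As a minimal sketch on a hand-made boolean dataset (the data below is illustrative, not from the original post): a boolean feature is a Bernoulli variable with variance p(1 - p), so threshold=.8 * (1 - .8) removes any feature that takes the same value in more than 80% of samples.

from sklearn.feature_selection import VarianceThreshold

X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
# The first column is 0 in 5 of 6 samples, so its variance
# (5/6) * (1/6) ≈ 0.14 falls below 0.8 * (1 - 0.8) = 0.16 and it is dropped.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
print(sel.fit_transform(X))  # shape (6, 2): only the last two columns remain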
Univariate feature selection

Univariate feature selection works by picking the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator. scikit-learn exposes these feature selection routines as objects that implement the transform method:

  • SelectKBest
  • SelectPercentile
  • SelectFpr/SelectFdr/SelectFwe
  • GenericUnivariateSelect
from sklearn.feature_selection import SelectKBest, chi2
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
# coding: utf-8
# Univariate Feature Selection

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif


X, y = load_iris(return_X_y=True)

# Append 20 noisy, non-informative features to the 4 real iris features
E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))
X = np.hstack((X, E))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0)

plt.figure(1)
plt.clf()

X_indices = np.arange(X.shape[-1])

# Univariate F-test feature selection; plot -log10(p-value) per feature
selector = SelectKBest(f_classif, k=4)
selector.fit(X_train, y_train)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - 0.45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)')

# Train an SVM on all features (no selection) for comparison
clf = make_pipeline(MinMaxScaler(), LinearSVC())
clf.fit(X_train, y_train)
print('Classification accuracy without selecting features: {:.3f}'
      .format(clf.score(X_test, y_test)))

svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
svm_weights /= svm_weights.sum()

plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight')

# The same SVM, now trained on only the 4 features kept by SelectKBest
clf_selected = make_pipeline(
    SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC())

clf_selected.fit(X_train, y_train)
print('Classification accuracy after univariate feature selection: {:.3f}'
      .format(clf_selected.score(X_test, y_test)))

svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
svm_weights_selected /= svm_weights_selected.sum()

plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
        width=.2, label='SVM weights after selection')

plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.yticks(())
plt.axis('tight')
plt.legend(loc='upper right')
plt.show()
Recursive feature elimination

Given an external estimator that assigns weights to features, recursive feature elimination (RFE) selects features by recursively considering smaller and smaller sets of features.

# coding: utf-8
# Recursive feature elimination with cross-validation

import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification


X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
                           n_redundant=2, n_repeated=0, n_classes=8,
                           n_clusters_per_class=1, random_state=0)

svc = SVC(kernel='linear')

# RFECV wraps RFE in cross-validation to pick the optimal number of features
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features: %d" % rfecv.n_features_)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
# Note: grid_scores_ was removed in scikit-learn 1.2; there, use
# rfecv.cv_results_['mean_test_score'] instead
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
Feature selection using SelectFromModel

SelectFromModel is a meta-transformer that can be used together with any estimator that exposes a coef_ or feature_importances_ attribute after fitting.

# coding: utf-8
# Feature selection using SelectFromModel and LassoCV

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

diabetes = load_diabetes()

X = diabetes.data
y = diabetes.target

feature_names = diabetes.feature_names

clf = LassoCV().fit(X, y)
importance = np.abs(clf.coef_)

# Pick a threshold just above the third-largest coefficient magnitude,
# so only the two most important features survive selection
idx_third = importance.argsort()[-3]
threshold = importance[idx_third] + 0.01

idx_features = (-importance).argsort()[:2]
name_features = np.array(feature_names)[idx_features]
print('Selected features: {}'.format(name_features))

sfm = SelectFromModel(clf, threshold=threshold)
sfm.fit(X, y)
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]

plt.title(
    "Features from diabetes using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("First feature: {}".format(name_features[0]))
plt.ylabel("Second feature: {}".format(name_features[1]))
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()
L1-based feature selection

Linear models penalized with the L1 norm have sparse solutions, so estimators such as linear_model.Lasso (regression) and linear_model.LogisticRegression or svm.LinearSVC (classification) can be combined with SelectFromModel to drop the features whose coefficients are zero; see the sketch below.
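A minimal sketch, assuming the iris data and an illustrative C=0.01 (neither is specified in the original post):

from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

X, y = load_iris(return_X_y=True)
# penalty="l1" requires dual=False; a smaller C gives a sparser solution
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)  # keeps only features with nonzero coefficients
print(X_new.shape)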

Tree-based feature selection

Tree ensembles such as ensemble.ExtraTreesClassifier compute impurity-based feature_importances_, which SelectFromModel can use to discard irrelevant features; see the sketch below.
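A minimal sketch along the same lines (the iris data and n_estimators value are illustrative assumptions):

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

X, y = load_iris(return_X_y=True)
clf = ExtraTreesClassifier(n_estimators=50).fit(X, y)
# feature_importances_ holds one impurity-based score per feature
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)  # keeps features scoring above the mean importance
print(X_new.shape)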

Feature selection as part of a pipeline

Feature selection is usually a preprocessing step before the actual learning. The recommended way to chain it with an estimator in scikit-learn is sklearn.pipeline.Pipeline:

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

clf = Pipeline([
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
    ('classification', RandomForestClassifier())])
clf.fit(X, y)