下面介绍两种之前竞赛使用到的特征选择方案
方案一
- 流程图
以上方法将方差过滤、卡方检验与决策树模型输出的特征重要性综合起来进行特征选择,该方案在"马上AI全球挑战赛"中发挥了比较大的作用。文末链接是我们的解决方案;开源是一种精神,仅供大家共同学习交流。
- python代码实现
#coding=utf-8
import numpy as np
import pandas as pd
'''单变量特征选取'''
from sklearn.feature_selection import SelectKBest, chi2
'''去除方差小的特征'''
from sklearn.feature_selection import VarianceThreshold
'''循环特征选取'''
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
'''RFE_CV'''
from sklearn.ensemble import ExtraTreesClassifier
class FeatureSelection(object):
    """Select features by combining several independent rankings.

    Each selector (variance, chi-squared, RFE over an SVC, ExtraTrees
    importance) returns the names of the top ``feature_num`` features as a
    set; ``return_feature_set`` intersects the sets of every enabled
    selector, so the final list contains features that all chosen methods
    agree on.
    """

    def __init__(self, feature_num):
        # feature_num: how many top-ranked features each selector keeps.
        self.feature_num = feature_num
        self.train_test, self.label, self.test = self.read_data()  # features #
        self.feature_name = list(self.train_test.columns)  # feature name #

    def read_data(self):
        """Load the train/test feature CSVs from the working directory.

        Returns ``(train_features, label_frame, test_features)``.
        NOTE(review): the column layout (label in the 'target' column,
        features from column 2 onward) is assumed from the original
        slicing — confirm against the actual CSV files.
        """
        test = pd.read_csv(r'test_feature.csv', encoding='utf-8')
        train_test = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
        print('读取数据完毕。。。')
        label = train_test[['target']]
        test = test.iloc[:, 1:]
        train_test = train_test.iloc[:, 2:]
        return train_test, label, test

    def _top_features(self, scores):
        """Return the names of the ``feature_num`` highest-scoring features.

        Replaces the score→sort→slice logic that was duplicated in three
        selector methods.
        """
        ranked = sorted(zip(self.feature_name, scores), key=lambda kv: kv[1])
        return {name for name, _ in ranked[-self.feature_num:]}

    def variance_threshold(self):
        """Top features ranked by raw variance (low variance = uninformative)."""
        sel = VarianceThreshold()
        sel.fit(self.train_test)  # fit is enough; the transform result was unused
        return self._top_features(sel.variances_)

    def select_k_best(self):
        """Top features ranked by chi-squared score.

        NOTE: chi2 requires non-negative feature values.
        """
        ch2 = SelectKBest(chi2, k=self.feature_num)
        ch2.fit(self.train_test, self.label.values.ravel())
        return self._top_features(ch2.scores_)

    def svc_select(self):
        """Top features chosen by recursive feature elimination over an SVC.

        BUG FIX: the original called ``self.label.ravel()`` (a DataFrame has
        no ``ravel``) and returned the raw ``ranking_`` array, which cannot
        be intersected with the feature-name sets in ``return_feature_set``.
        Now returns the set of selected feature names like the other methods.
        """
        svc = SVC(kernel='rbf', C=1, random_state=2018)  # linear #
        rfe = RFE(estimator=svc, n_features_to_select=self.feature_num, step=1)
        rfe.fit(self.train_test, self.label.values.ravel())
        print(rfe.ranking_)
        return {name for name, keep in zip(self.feature_name, rfe.support_) if keep}

    def tree_select(self):
        """Top features ranked by ExtraTrees feature importance."""
        clf = ExtraTreesClassifier(n_estimators=300, max_depth=7, n_jobs=4)
        clf.fit(self.train_test, self.label.values.ravel())
        return self._top_features(clf.feature_importances_)

    def return_feature_set(self, variance_threshold=False, select_k_best=False,
                           svc_select=False, tree_select=False):
        """Intersect the selections of every enabled method and return a list.

        BUG FIX: the original started from the empty set and intersected
        into it, so any flag combination that did not begin with
        ``variance_threshold=True`` always produced an empty result. The
        first enabled selector now seeds the running intersection.
        """
        selected = None  # None = no selector has run yet
        if variance_threshold is True:
            selected = self.variance_threshold()
        if select_k_best is True:
            picks = self.select_k_best()
            selected = picks if selected is None else selected & picks
        if svc_select is True:
            picks = self.svc_select()
            selected = picks if selected is None else selected & picks
        if tree_select is True:
            picks = self.tree_select()
            selected = picks if selected is None else selected & picks
        names = set() if selected is None else selected
        print(names)
        return list(names)
if __name__ == '__main__':
    # Guarded so importing this module does not trigger CSV reads and
    # model fitting. SVC-based RFE stays disabled (too slow on this data).
    selection = FeatureSelection(100)
    selection.return_feature_set(variance_threshold=True, select_k_best=True,
                                 svc_select=False, tree_select=True)
由于SVC方法(RFE循环特征消除)运行速度太慢,这里没有启用它。
方案二
方案二是使用遗传算法做特征选择。算法原理这里不再赘述,可参见我的另一篇博文:虽然那篇是用遗传算法求解TSP问题,但除了编码方式不同外,其余步骤几乎相同。
End:如有不当之处,还望不吝赐教