回归问题特征选择
波士顿房价数据
# Feature selection for a regression problem: rank features by random-forest
# importance on the Boston housing dataset.
# NOTE(review): load_boston was removed in scikit-learn 1.2; on modern
# versions substitute fetch_california_housing (same usage pattern).
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor

# Load the Boston housing dataset as an example.
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

# Fix the seed so the reported importances are reproducible between runs.
rf = RandomForestRegressor(random_state=0)
rf.fit(X, Y)

# Pair each rounded importance with its feature name, highest score first.
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names),
             reverse=True))
实际操作数据
# Same importance ranking on the real working dataset.
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Load the working dataset; 'isbad' is the target column.
data = pd.read_excel("C:/Users/yangge/Desktop/td.xlsx")

# Keyword form: the positional axis argument of DataFrame.drop was
# deprecated and removed in pandas 2.0.
X = data.drop(columns=['isbad'])
Y = data["isbad"]
names = [x for x in data.columns if x != 'isbad']

# Fixed seed for reproducible importance scores between runs.
rf = RandomForestRegressor(random_state=0)
rf.fit(X, Y)

# Pair each rounded importance with its feature name, highest score first.
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names),
             reverse=True))
分类问题特征选择
xgboost
# -*- coding: utf-8 -*-
"""
Created on Wed May 30 14:17:28 2018
@author: Administrator
"""
# -*- coding: utf-8 -*-
"""
Created on Wed May 30 11:50:15 2018
@author: Administrator
"""
import pandas as pd
import xgboost as xgb
import operator
import matplotlib.pyplot as plt
def ceate_feature_map(features):
    """Write an XGBoost feature-map file named 'xgb.fmap'.

    Each line has the form '<index>\\t<name>\\tq' — the format accepted by
    Booster.get_fscore(fmap=...); 'q' marks a quantitative feature.

    NOTE(review): the function name keeps the original typo ('ceate')
    so the existing caller below continues to work.

    Parameters
    ----------
    features : iterable of str
        Feature names, written in order with 0-based indices.
    """
    # Context manager guarantees the file is closed even if a write fails;
    # enumerate replaces the original hand-rolled counter.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
if __name__ == '__main__':
    # Load the training data; 'isbad' is the binary target column.
    train = pd.read_excel("C:/Users/yangge/Desktop/td.xlsx")

    params = {
        'min_child_weight': 100,
        'eta': 0.02,
        'colsample_bytree': 0.7,
        'max_depth': 12,
        'subsample': 0.7,
        'alpha': 1,
        'gamma': 1,
        # 'silent' was removed in xgboost 1.0; 'verbosity': 0 is the
        # supported replacement. 'verbose_eval' is a keyword argument of
        # xgb.train(), not a booster parameter, so it is dropped here.
        'verbosity': 0,
        'seed': 12,
    }
    rounds = 10

    y = train['isbad']
    # Keyword form: the positional axis argument of DataFrame.drop was
    # removed in pandas 2.0.
    X = train.drop(columns=['isbad'])

    xgtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train(params, xgtrain, num_boost_round=rounds)

    # Map the column names to the f0/f1/... ids XGBoost uses internally.
    features = [x for x in train.columns if x != 'isbad']
    ceate_feature_map(features)

    # get_fscore counts how often each feature is used in a split; sort
    # ascending so the most important feature ends at the top of the plot.
    importance = sorted(bst.get_fscore(fmap='xgb.fmap').items(),
                        key=operator.itemgetter(1))

    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    # Normalise counts into relative importances (they sum to 1).
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    df.to_excel("C:/Users/yangge/Desktop/feat_importance3.xlsx", index=False)

    plt.figure()
    df.plot(kind='barh', x='feature', y='fscore', legend=False,
            figsize=(6, 10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.show()
随机森林
# Feature selection for a classification problem: random-forest importances.
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Load the dataset; 'isbad' is the binary classification target.
data = pd.read_excel("C:/Users/yangge/Desktop/td.xlsx")

# Keyword form: the positional axis argument of DataFrame.drop was
# removed in pandas 2.0.
X = data.drop(columns=['isbad'])
Y = data["isbad"]
names = [x for x in data.columns if x != 'isbad']

# Fixed seed for reproducible importance scores between runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(X, Y)

# Pair each rounded importance with its feature name, highest score first.
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names),
             reverse=True))
如果对你有帮助,请点下赞,予人玫瑰手有余香!