# -*- coding: UTF-8 -*-
import numpy as np
from sklearn import cross_validation  # NOTE: removed in sklearn >= 0.20; kept only for old environments
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
###########################################
# Chi-squared (chi2) feature selection: keep the `select` highest-scoring
# features, re-append the label column, and save the reduced data set.
e14601 = np.loadtxt(open("/home/zlf/Documents/e14601_data_set_n0.csv", "rb"),
                    delimiter=",", skiprows=0)
a = e14601.shape
print(a[0], a[1])
data = e14601[:, 0:a[1] - 1]   # every column except the last is a feature
label = e14601[:, a[1] - 1]    # last column is the class label
select = 48                    # number of features to keep
# `k` must be passed by keyword: it is keyword-only in modern sklearn, so
# the old positional call SelectKBest(chi2, select) raises a TypeError.
data_new = SelectKBest(chi2, k=select).fit_transform(data, label)
print(data_new.shape)
# Stack the selected features with the label column (same result as
# filling a pre-allocated zeros array column-by-column).
e14601_new = np.column_stack((data_new, label))
print(e14601_new.shape)
np.savetxt('/home/zlf/Documents/e14601_data_set_n0_select.csv', e14601_new, delimiter=',')
###########################################
# Model-based feature ranking: score each feature on its own with a small
# random forest under shuffle-split cross-validation, then keep the 48
# best-scoring features (plus the label) and save them.
e14601 = np.loadtxt(open("/home/zlf/Documents/e14601_data_set.csv", "rb"),
                    delimiter=",", skiprows=0)
a = e14601.shape
data = e14601[:, 0:a[1] - 1]   # feature columns
label = e14601[:, a[1] - 1]    # class label column
rf = RandomForestClassifier(n_estimators=20, max_depth=4)
# The legacy cross_validation.ShuffleSplit(n, n_iter=...) API was removed;
# the model_selection version takes n_splits and infers the sample count.
# Hoisted out of the loop: the same splitter can be reused for every feature.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
scores = []
# Fit on each single feature: X[:, i:i+1] keeps the 2-D (n_samples, 1)
# shape sklearn expects, unlike the 1-D X[:, i].
for i in range(a[1] - 1):
    score = cross_val_score(rf, data[:, i:i+1], label, scoring="accuracy", cv=cv)
    scores.append((format(np.mean(score), '.3f'), i))
# Sorting the fixed-width '0.xxx' strings descending orders the features
# by accuracy, best first.
A = sorted(scores, reverse=True)
print(A)
print(A[0][1])  # column index of the single best feature
select = 48     # number of top-ranked features to keep
e14601_new = np.zeros([a[0], select + 1])
print(e14601_new.shape)
for i in range(select):
    e14601_new[:, i] = e14601[:, A[i][1]]  # copy the i-th best feature column
e14601_new[:, select] = label
print(e14601_new[:, 0])
np.savetxt('/home/zlf/Documents/e14601_data_set_n0_select0.csv', e14601_new, delimiter=',')
# (Blog-scrape residue removed: the page title "python feature selection" and a
#  publish-date footer were captured here — they are not part of the script.)