mRMR特征选择算法(feature_selection)的使用

源程序下载地址,本机电脑安装java环境,具体环境安装可自行百度,google.

用以实现用 mRMR 从特征集中提取特征的程序(python)

#import necessary packages

import csv  # used to write the csv file
import os  # used to change the working directory
import re
import subprocess  # used to run the external mRMR executable

import numpy as np
import pandas as pd

# Change the current working directory ("XXX" is a placeholder the user must
# replace). The relative "mRMR/..." paths used further down are resolved
# against this directory.
os.chdir("XXX")
# input path name: the headerless feature csv loaded with pd.read_csv below
datapath ="XXX"

# output path name: the csv (class column + features) written by this script
# and then passed to the mrmr executable via its -i option
outputpath="XXX"
# NOTE(review): the bare string literal below is an expression statement with
# no runtime effect; no SVM code is visible in this excerpt.
"""
    mrmr and svm
"""



# Read the raw feature matrix; the csv has neither a header row nor an index
# column, so every row is one sample and every column one feature.
train_data = pd.read_csv(datapath, header=None, index_col=None)
X = np.array(train_data)

# Build the class column: the first half of the rows is labelled 1 and the
# second half 0. The length is taken from the data instead of being
# hard-coded, so the script works for any (even) number of samples.
n_samples = len(train_data)
if n_samples % 2 != 0:
    raise ValueError(
        "expected an even number of rows (first half class 1, "
        "second half class 0), got " + str(n_samples))

# Column vector of shape (n_samples, 1) so it can be concatenated with X
# along axis 1.
Y = np.repeat([1, 0], n_samples // 2).reshape(-1, 1)

# Prepend the class column to the feature matrix: column 0 is the label,
# the remaining columns are the original features.
full_csv_with_class = np.concatenate([Y, X], axis=1)
# Single-argument print(...) produces the same output on Python 2 and 3.
print(full_csv_with_class)

# Report the shape of the original data and of the labelled matrix
print("the shape of data:" + str(X.shape))
print("the shape of data and class:" + str(full_csv_with_class.shape))

# Generate a synthetic header row: "class" for the label column followed by
# 0 .. n_features-1 for the feature columns.
columns = ["class"]
columns_numbers = np.arange(full_csv_with_class.shape[1] - 1)
columns.extend(columns_numbers)

# Write the header and all rows; the with-block guarantees the file is
# flushed and closed even if a write fails.
# NOTE(review): when running on Python 3 under Windows, open the file with
# newline='' to avoid blank lines between rows (see the csv module docs).
with open(outputpath, 'w') as csvFile2:
    writer = csv.writer(csvFile2)
    writer.writerow(columns)
    writer.writerows(full_csv_with_class)
[[ 1.     1.     1.    ...,  0.     1.     0.075]
 [ 1.     0.     0.    ...,  1.     1.     0.1  ]
 [ 1.     1.     0.    ...,  1.     0.     0.175]
 ..., 
 [ 0.     0.     0.    ...,  1.     1.     0.075]
 [ 0.     0.     0.    ...,  0.     1.     0.025]
 [ 0.     0.     0.    ...,  0.     1.     0.05 ]]
the shape of data:(2260, 200)
the shape of data and class:(2260, 201)
# Run the external mRMR executable on the csv written above, asking for 200
# ranked features, and capture its standard output in mRMR/output.mrmrout.
# The arguments are passed as a list (no shell), so a path containing spaces
# or shell metacharacters is handed to the program unchanged.
# Raises OSError if the executable or the mRMR directory is missing.
with open("mRMR/output.mrmrout", "w") as mrmr_report:
    mrmr_status = subprocess.call(
        ["./mRMR/mrmr", "-i", outputpath, "-n", "200"],
        stdout=mrmr_report)
if mrmr_status != 0:
    # Surface a failed run instead of silently continuing with a stale or
    # empty report.
    print("mrmr exited with non-zero status " + str(mrmr_status))
print("complete ")
complete 
# Read the mRMR report and collect the ranked feature indices

final_set = []
# 1 while the current line is inside the table that follows the
# "mRMR ... feature" heading, 0 otherwise.
location_mark = 0
with open("mRMR/output.mrmrout", 'r') as fn:
    for line in fn:
        if line.strip() == "":
            # A blank line ends the current table.
            location_mark = 0
        fields = line.split()
        # The second column holds the feature index; the table's own header
        # row has "Fea" there and is skipped. Lines with fewer than two
        # columns are ignored rather than raising IndexError.
        if location_mark == 1 and len(fields) > 1 and fields[1] != "Fea":
            final_set.append(int(fields[1]))
        # The heading line switches collection on starting with the next line.
        if "mRMR" in line and "feature" in line:
            location_mark = 1
print(final_set)
[133, 135, 140, 130, 145, 110, 115, 105, 120, 125, 150, 102, 185, 190, 180, 195, 100, 160, 165, 155, 170, 175, 101, 5, 85, 95, 98, 90, 99, 200, 177, 33, 50, 14, 8, 149, 109, 94, 121, 134, 113, 84, 21, 156, 71, 31, 6, 59, 189, 158, 122, 176, 58, 46, 64, 188, 10, 1, 38, 184, 19, 138, 2, 159, 81, 181, 44, 199, 26, 63, 82, 45, 148, 114, 172, 183, 32, 7, 48, 131, 146, 163, 83, 39, 49, 171, 80, 132, 197, 77, 88, 56, 9, 157, 198, 75, 164, 147, 70, 76, 196, 27, 182, 25, 96, 127, 13, 57, 126, 65, 107, 34, 108, 60, 139, 69, 55, 89, 30, 35, 40, 106, 20, 15, 104, 97, 111, 18, 103, 41, 78, 116, 61, 192, 3, 43, 67, 23, 118, 191, 4, 11, 194, 119, 66, 17, 87, 137, 136, 167, 141, 53, 117, 154, 28, 86, 42, 151, 52, 74, 68, 193, 51, 22, 179, 153, 62, 186, 152, 169, 12, 161, 129, 112, 166, 93, 47, 79, 162, 128, 29, 16, 143, 36, 187, 168, 144, 73, 124, 91, 54, 174, 178, 24, 173, 37, 142, 72, 123, 92]
# Initialise every "*_copy" slot. Nothing in this excerpt reads them back;
# the names suggest snapshots of evaluation results kept for a later step
# (TODO confirm against the code that consumes them).

# Scalar slots whose names look like score-type metrics
precision_copy = recall_copy = 0
SN_copy = SP_copy = GM_copy = 0
ACC_copy = F1_Score_copy = F_measure_copy = MCC_copy = 0

# Scalar slots whose names look like counts
TP_copy = TN_copy = FP_copy = FN_copy = 0
pos_copy = neg_copy = 0

# Two distinct empty lists (not aliases of one list)
y_pred_prob_copy, y_pred_copy = [], []

关键语句:
os.system("./mRMR/mrmr -i "+outputpath+" -n 200 >mRMR/output.mrmrout")

  • ./mRMR/mrmr代表执行程序,也即最上面github里面下载的
  • -i outputpath代表上一步写出的csv文件地址(即mrmr程序的输入),也即原始特征集合(以下会说明)
  • -n 200代表选取200个维度,依次按得分排列
  • mRMR/output.mrmrout代表输出的文件(文件情况如下)
    output.mrmrout


csv格式需要特别说明:分类的类别需要在第一列,同时必须要有列名(表头)一行,且第一列的列名必须为class
这里写图片描述

	[133, 135, 140, 130, 145, 110, 115, 105, 120, 125, 150, 102, 185, 190, 180, 195, 100, 160, 165, 155, 170, 175, 101, 5, 85, 95, 98, 90, 99, 200, 177, 33, 50, 14, 8, 149, 109, 94, 121, 134, 113, 84, 21, 156, 71, 31, 6, 59, 189, 158, 122, 176, 58, 46, 64, 188, 10, 1, 38, 184, 19, 138, 2, 159, 81, 181, 44, 199, 26, 63, 82, 45, 148, 114, 172, 183, 32, 7, 48, 131, 146, 163, 83, 39, 49, 171, 80, 132, 197, 77, 88, 56, 9, 157, 198, 75, 164, 147, 70, 76, 196, 27, 182, 25, 96, 127, 13, 57, 126, 65, 107, 34, 108, 60, 139, 69, 55, 89, 30, 35, 40, 106, 20, 15, 104, 97, 111, 18, 103, 41, 78, 116, 61, 192, 3, 43, 67, 23, 118, 191, 4, 11, 194, 119, 66, 17, 87, 137, 136, 167, 141, 53, 117, 154, 28, 86, 42, 151, 52, 74, 68, 193, 51, 22, 179, 153, 62, 186, 152, 169, 12, 161, 129, 112, 166, 93, 47, 79, 162, 128, 29, 16, 143, 36, 187, 168, 144, 73, 124, 91, 54, 174, 178, 24, 173, 37, 142, 72, 123, 92]

这些数字是从mRMR/output.mrmrout里面提取出来的特征维度的排序
读者可根据这些排序的维度逐渐提取以寻找最优的维度集合。

重申mrmr程序和特征提取程序地址

  • 2
    点赞
  • 53
    收藏
    觉得还不错? 一键收藏
  • 9
    评论
当然可以!下面是一个用Python实现的mRMR特征选择算法的程序:

```python
import numpy as np
from scipy.stats import entropy


def mrmr_feature_selection(X, y, n_features):
    num_samples, num_features = X.shape
    selected_features = []  # 存储被选择的特征索引
    remaining_features = list(range(num_features))  # 所有特征的初始索引列表

    # 计算每个特征与目标变量的互信息
    mi_scores = np.zeros(num_features)
    for i in range(num_features):
        mi_scores[i] = mutual_information(X[:, i], y)

    for _ in range(n_features):
        max_mrmr = -np.inf
        best_feature = None
        for feature in remaining_features:
            mrmr = mi_scores[feature] - average_conditional_mutual_information(X[:, feature], selected_features, y)
            if mrmr > max_mrmr:
                max_mrmr = mrmr
                best_feature = feature
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)

    return selected_features


def mutual_information(x, y):
    px = np.histogram(x, bins='auto')[0] / len(x)
    py = np.histogram(y, bins='auto')[0] / len(y)
    pxy = np.histogram2d(x, y, bins='auto')[0] / len(x)
    mi = 0
    for i in range(len(px)):
        for j in range(len(py)):
            if pxy[i][j] > 0:
                mi += pxy[i][j] * np.log2(pxy[i][j] / (px[i] * py[j]))
    return mi


def average_conditional_mutual_information(x, selected_features, y):
    acmi = 0
    for feature in selected_features:
        acmi += conditional_mutual_information(x, feature, y)
    return acmi / len(selected_features)


def conditional_mutual_information(x, z, y):
    pz = np.histogram(z, bins='auto')[0] / len(z)
    pxz = np.histogram2d(x, z, bins='auto')[0] / len(x)
    pyz = np.histogram2d(y, z, bins='auto')[0] / len(y)
    pxyz = np.histogramdd((x, y, z), bins='auto')[0] / len(x)
    cmi = 0
    for i in range(len(pz)):
        for j in range(len(pyz)):
            for k in range(len(pxz)):
                if pxyz[i][j][k] > 0:
                    cmi += pxyz[i][j][k] * np.log2(pxyz[i][j][k] / (pz[i] * pxz[k][i] * pyz[j][i]))
    return cmi


# 使用示例
X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])  # 样本特征矩阵
y = np.array([0, 1, 0])  # 目标变量
n_features = 2  # 需要选择的特征数量
selected_features = mrmr_feature_selection(X, y, n_features)
print("Selected features:", selected_features)
```
这个程序实现了mRMR(最大相关最小冗余)特征选择算法,通过计算互信息和条件互信息来评估特征与目标变量的关联性和特征之间的相关性。在示例中,我们使用一个简单的样本特征矩阵和目标变量,选择2个最相关且最不冗余的特征。你可以根据自己的数据和需求进行调整和扩展。
评论 9
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值