"""
特征选择中多类别标签的Fisher score计算
注意:.mat数据集必须满足
特征集为n*m(n为样本数,m为特征数)
标签集为n*1
如果用matlab打开之后不满足以上,可以进行转置
如果是csv文件,也可以进行转换
"""
import pandas as pd
from collections import defaultdict
import numpy as np
import scipy.io as sio
def F_S(sample, label, loop):
    """Rank features by multi-class Fisher score and return the top `loop`.

    Parameters
    ----------
    sample : array-like, shape (n_samples, n_features)
        Feature matrix (n*m, one row per sample).
    label : array-like, shape (n_samples, 1) or (n_samples,)
        Class label of each sample.
    loop : int
        Number of top-ranked feature indices to return.

    Returns
    -------
    numpy.ndarray
        Indices of the `loop` features with the highest Fisher score.
        Features with zero within-class scatter get NaN and rank last.
    """
    df_x = pd.DataFrame(sample)
    df_y = pd.DataFrame(label, columns=['label'])
    data = pd.concat([df_x, df_y], axis=1)  # one frame: features + label column

    # Group sample indices by class label.
    target_equivalence_class = defaultdict(list)
    for idx, lab in enumerate(data.label):
        target_equivalence_class[lab].append(idx)

    n = len(label)
    # Samples per class (the original computed this in two identical loops).
    n_class = {key: len(indices) for key, indices in target_equivalence_class.items()}

    scores = []
    feature_columns = list(data.columns)[:-1]  # all columns except 'label'
    for feature in feature_columns:
        # BUGFIX: SB/SW must be reset per feature. The original initialized
        # them once before this loop, so scatter terms accumulated across
        # features and every score after the first was wrong.
        SB = []  # between-class scatter contributions
        SW = []  # within-class scatter contributions
        all_feature_mean = data[feature].mean()  # loop-invariant: hoisted
        for key, n_k in n_class.items():
            data_key = data[data.label == key]
            class_mean = data_key[feature].mean()
            SW.append(((data_key[feature] - class_mean) ** 2).sum())
            # BUGFIX: weight by THIS class's size n_k. The original reused a
            # stale `n_value` left over from the last class of a prior loop.
            SB.append(float(n_k / n * (class_mean - all_feature_mean) ** 2))
        all_SB = sum(SB)
        all_SW = sum(SW) / n
        # Zero within-class scatter (constant feature per class) -> undefined
        # score; NaN sorts last under argsort so it never ranks in the top.
        scores.append(np.nan if all_SW == 0 else all_SB / all_SW)

    ranking = np.argsort(-np.array(scores))  # descending score; NaN last
    return ranking[:loop]
if __name__ == "__main__":
    # Guard so importing this module for F_S does not trigger the file
    # load and full Fisher-score computation at import time.
    # Expected .mat layout: 'X' is n*m features, 'Y' is n*1 labels.
    file = 'Data\\U_Sonar.mat'
    m = sio.loadmat(file)
    sample = m['X']
    label = m['Y']
    a = F_S(sample, label, 50)  # indices of the 50 top-ranked features
    print(a)
# 看到很多都是二分类标签的Fisher score的文章,多类标签的FS较少,从github上看到了大佬写的
# 一个二分类标签的FS,所以进行了一些改动;研一小菜鸟一枚,写的算法没有进行优化,可能比较
# 简单,希望大佬们批评指正~~