ReliefF: A Multi-class Feature Selection Algorithm, Implemented in Python

A limitation of the original Relief algorithm is that it can only handle two-class data, so in 1994 Kononenko extended it into the ReliefF algorithm, which handles multi-class problems (a further variant, RReliefF, handles regression problems where the target attribute is continuous). When dealing with a multi-class problem, ReliefF repeatedly draws a random sample R from the training set, finds the k nearest neighbors of R among the samples of the same class, also finds k nearest neighbors in each set of samples belonging to a different class, and then updates the weight of every feature.

ReliefF makes two changes relative to Relief:

1. When searching for nearHit and nearMiss, it no longer takes a single sample but k samples:

nearHit: find the k nearest samples within the same class, subtract each sample from the base sample feature-wise, then accumulate the k results and average them.

nearMiss: find k nearest samples in each of the n-1 other classes and process them the same way, except that the accumulated sum is divided by (n-1)*k to average the result.

2. When searching for nearHit and nearMiss, the L1 norm is used in place of L2 (i.e., the 1-norm replaces the 2-norm).

In the fit() method below, the parameter k is the number of nearHit and nearMiss samples to collect, and norm selects the distance formula (L1 should be the default, but I still provide L2 as an option); the short sketch below contrasts the two norms.
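As a quick illustration, separate from the implementation further down, this is how the two norms compare on a feature-difference vector using only numpy:

import numpy as np

# difference vector between two samples
D_value = np.array([1.0, -2.0, 0.5])

l1 = np.sum(np.abs(D_value))        # L1 norm: 3.5
l2 = np.sqrt(np.sum(D_value ** 2))  # L2 norm: about 2.29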

The steps of the ReliefF algorithm are as follows.

Suppose there are samples from several different classes; call the samples of each class Xn.

1. Randomly draw a sample a from all samples.

2. From the group of samples with the same class as a, take the k nearest neighbors.

3. From each group of samples with a class different from a, also take k nearest neighbors.

4. Compute the weight of each feature (the neighbor search in steps 2 and 3 is sketched right after this list).
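Steps 2 and 3 are ordinary k-nearest-neighbor queries restricted to one class at a time. As a minimal sketch of that idea, assuming integer labels and using sklearn's NearestNeighbors instead of the hand-rolled distance matrix of the implementation below (the function name is invented for illustration):

from sklearn.neighbors import NearestNeighbors
import numpy as np

def k_neighbors_in_class(features, labels, a, target_label, k=5):
    # restrict the search to samples of one class
    candidates = features[np.asarray(labels) == target_label]
    nn = NearestNeighbors(n_neighbors=k, p=1)  # p=1 selects the L1 distance
    nn.fit(candidates)
    _, idx = nn.kneighbors(a.reshape(1, -1))
    # note: when target_label is a's own class, a is returned as its own
    # nearest neighbor, so request k+1 there and drop the first hit
    return candidates[idx[0]]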

The weight update for each feature is:

W(A) \leftarrow W(A) - \sum_{j=1}^{k}\frac{\mathrm{diff}(A,R,H_j)}{mk} + \sum_{C \neq \mathrm{Class}(R)}\frac{p(C)}{1-p(\mathrm{Class}(R))}\sum_{j=1}^{k}\frac{\mathrm{diff}(A,R,M_j(C))}{mk}

where m is the number of sampling iterations, diff(A, R, X) is the difference between samples R and X on feature A, H_j is the j-th near hit, M_j(C) is the j-th near miss in class C, p(C) is the proportion of class C, and p(Class(R)) is the proportion of the class of the randomly drawn sample R.

As can be seen, the point of the weight update is to subtract the feature differences within the same class and add the feature differences across classes: if a feature is relevant to the classification, its values should be similar within a class and dissimilar across classes.
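As a concrete example of the prior weighting (numbers invented for illustration): with three classes of proportions 0.5, 0.3 and 0.2, and R drawn from the first class, the near-miss terms of the other two classes are scaled by 0.3/(1 - 0.5) = 0.6 and 0.2/(1 - 0.5) = 0.4, so the more common class contributes more to the update and the scale factors sum to 1.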

Finally, the features can be ranked by their weights to select a suitable subset.

The code is as follows (Python):

import numpy as np
from random import randrange
from sklearn.datasets import make_blobs
from sklearn.preprocessing import normalize


def distanceNorm(Norm, D_value):
    # distance between two samples, computed from their feature-wise difference D_value
    if Norm == '1':            # L1 norm (the ReliefF default)
        counter = np.sum(np.absolute(D_value))
    elif Norm == '2':          # L2 norm
        counter = np.sqrt(np.sum(np.power(D_value, 2)))
    elif Norm == 'Infinity':   # L-infinity norm
        counter = np.max(np.absolute(D_value))
    else:
        raise Exception('We will program this later......')
    return counter


def fit(features, labels, iter_ratio, k, norm):
    # initialization
    (n_samples, n_features) = np.shape(features)
    distance = np.zeros((n_samples, n_samples))
    weight = np.zeros(n_features)
    labels = list(labels)

    # precompute the pairwise distance matrix (symmetric, zero diagonal)
    for index_i in range(n_samples):
        for index_j in range(index_i + 1, n_samples):
            D_value = features[index_i] - features[index_j]
            distance[index_i, index_j] = distanceNorm(norm, D_value)
    distance += distance.T

    # start iteration
    for iter_num in range(int(iter_ratio * n_samples)):
        # randomly draw a sample R
        index_i = randrange(0, n_samples, 1)
        self_features = features[index_i]

        # nearHit collects k same-class neighbors;
        # nearMiss collects k neighbors per other class
        nearHit = list()
        nearMiss = dict()
        n_labels = list(set(labels))
        termination = np.zeros(len(n_labels))
        del n_labels[n_labels.index(labels[index_i])]
        for label in n_labels:
            nearMiss[label] = list()
        distance_sort = list()

        # search for nearHit and nearMiss
        distance[index_i, index_i] = np.max(distance[index_i])  # exclude the sample itself
        for index in range(n_samples):
            distance_sort.append([distance[index_i, index], index, labels[index]])
        distance_sort.sort(key=lambda x: x[0])
        for index in range(n_samples):
            if distance_sort[index][2] == labels[index_i]:  # search nearHit
                if len(nearHit) < k:
                    nearHit.append(features[distance_sort[index][1]])
                else:
                    termination[distance_sort[index][2]] = 1
            else:                                           # search nearMiss
                if len(nearMiss[distance_sort[index][2]]) < k:
                    nearMiss[distance_sort[index][2]].append(features[distance_sort[index][1]])
                else:
                    termination[distance_sort[index][2]] = 1
            if list(termination).count(0) == 0:
                break

        # update weight; squared differences are non-negative, so the original
        # np.abs(np.power(..., 2)) is simplified to np.power(..., 2)
        nearHit_term = np.zeros(n_features)
        for x in nearHit:
            nearHit_term += np.power(self_features - x, 2)  # bug fix: accumulate into nearHit_term, not nearHit
        nearMiss_term = np.zeros((len(list(set(labels))), n_features))
        for index, label in enumerate(nearMiss.keys()):
            for x in nearMiss[label]:
                nearMiss_term[index] += np.power(self_features - x, 2)
            # uniform average over the other classes; this approximates the
            # p(C)/(1 - p(Class(R))) prior weighting in the formula above
            weight += nearMiss_term[index] / (k * len(nearMiss))
        weight -= nearHit_term / k

    return weight / (iter_ratio * n_samples)


def test():
    (features, labels) = make_blobs(n_samples=500, n_features=10, centers=4)
    features = normalize(X=features, norm='l2', axis=0)
    for x in range(1, 11):
        weight = fit(features=features, labels=labels, iter_ratio=1, k=5, norm='2')
        print(weight)


if __name__ == '__main__':
    test()
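Once fit() returns the weight vector, the ranking mentioned above is a one-liner; for example, as a hypothetical continuation of test():

# rank features from most to least relevant and keep the top five
ranked = np.argsort(weight)[::-1]
selected = features[:, ranked[:5]]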
