【机器学习杂记|备忘】手写SBS类进行特征提取
利用**SBS(序列后向选择,Sequential Backward Selection)**进行特征选择,本质上是一种贪心的特征选择算法。在削减特征数量后,计算剩余的特征组合中表现最好的一组(通常用准确率作为度量),如此往复,直到减到你期望的特征数量。当然,由于每步只做局部最优的删除,它未必能达到全局最优。
贴上SBS类代码:
最核心的地方在line35-55
#In[]
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from itertools import combinations
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
class SBS():
    """Sequential Backward Selection (SBS).

    Greedily removes one feature at a time from the full feature set.
    At each step every subset obtained by dropping exactly one of the
    remaining features is scored (accuracy by default) on an internal
    hold-out split, and the best-scoring subset is kept.  The process
    stops when only ``k_features`` features remain.

    Note: this is a greedy procedure — it is not guaranteed to find the
    globally optimal subset of size ``k_features``.
    """

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        # Clone so the caller's estimator object is never refit in place.
        self.estimator = clone(estimator)
        # Target (minimum) number of features to keep.
        self.k_features = k_features
        # Fraction of the data held out for scoring candidate subsets.
        self.test_size = test_size
        # Seed making the internal train/test split reproducible.
        self.random_state = random_state

    def fit(self, X, y):
        """Run backward elimination, recording the best subset of each size.

        After fitting:
          * ``subsets_`` holds the best index tuple for every subset size,
            from the full feature set down to ``k_features`` features;
          * ``scores_`` holds the matching hold-out scores;
          * ``indices_`` is the final (smallest) subset;
          * ``k_score_`` is its score.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

        # X_train is 2-D: rows are samples, columns are features.
        n_features = X_train.shape[1]

        # Start from the complete set of column indices, e.g. (0, 1, 2).
        self.indices_ = tuple(range(n_features))
        self.subsets_ = [self.indices_]
        # Baseline score with no feature removed.
        self.scores_ = [self._calc_score(X_train, y_train,
                                         X_test, y_test, self.indices_)]

        while n_features > self.k_features:
            # Score every subset that drops exactly one remaining feature.
            candidates = [
                (self._calc_score(X_train, y_train, X_test, y_test, subset),
                 subset)
                for subset in combinations(self.indices_, r=n_features - 1)
            ]
            # Keep the first subset with the highest score (same
            # tie-breaking as argmax over the candidate list).
            best_score, best_subset = max(candidates, key=lambda c: c[0])

            self.indices_ = best_subset
            self.subsets_.append(best_subset)
            self.scores_.append(best_score)
            n_features -= 1

        # Score of the final, smallest subset — handy when the user only
        # cares whether k_features features can still hit a target score.
        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        """Reduce X to the columns selected during fit."""
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit on the given columns and score predictions on the hold-out set."""
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        return self.scoring(y_test, y_pred)
# In[]
# Load the raw Wine dataset (no header row in the file).
# NOTE(review): hard-coded local path — assumes wine.data is the UCI Wine
# dataset downloaded to this directory; adjust before running elsewhere.
url = 'D:/Practice_code/python_code/'
wine_data = pd.read_csv(url + 'wine.data',header=None)
# Name the columns: the first is the class label, the rest are features.
wine_data.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                    'Alcalinity of ash', 'Magnesium', 'Total phenols',
                    'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                    'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                    'Proline']
wine_data.head()
#In[]
# Inspect the distinct class labels.
np.unique(wine_data['Class label'])
# There are three classes: 1, 2, 3.
#In[]
# Split the DataFrame into a feature matrix X and a label vector y
# (column 0 is the class label, the remaining columns are features).
X, y = wine_data.iloc[:, 1:].values, wine_data.iloc[:, 0].values
# Use a 3-nearest-neighbour classifier as the scoring estimator.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
# Run SBS down to the 5 most important features.
sbs = SBS(knn, k_features=5)
sbs.fit(X, y)
# Show the index tuple of every subset kept along the way.
for subset in sbs.subsets_:
    print(subset)
'''输出结果:
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
(0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11)
(0, 1, 2, 3, 6, 7, 8, 9, 10, 11)
(0, 1, 2, 3, 6, 7, 8, 9, 11)
(0, 1, 2, 3, 6, 8, 9, 11)
(0, 1, 3, 6, 8, 9, 11)
(0, 1, 3, 6, 8, 9)
(0, 3, 6, 8, 9)
'''
#In[]
# Small tweak: show the selected feature names via a dictionary lookup.
# key_index starts at -1 because feature index 0 in X corresponds to entry 1
# of the column list (entry 0, mapped to key -1, is the class label).
key_index = list(range(-1, len(wine_data.columns) - 1))
feature_dict = dict(zip(key_index, wine_data.columns))
for subset in sbs.subsets_:
    for key in subset:
        print(feature_dict[key], end=',')
    print('\n')
'''输出结果:
Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline,
Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,
Alcohol,Malic acid,Ash,Alcalinity of ash,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,
Alcohol,Malic acid,Ash,Alcalinity of ash,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,
Alcohol,Malic acid,Ash,Alcalinity of ash,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,OD280/OD315 of diluted wines,
Alcohol,Malic acid,Ash,Alcalinity of ash,Flavanoids,Proanthocyanins,Color intensity,OD280/OD315 of diluted wines,
Alcohol,Malic acid,Alcalinity of ash,Flavanoids,Proanthocyanins,Color intensity,OD280/OD315 of diluted wines,
Alcohol,Malic acid,Alcalinity of ash,Flavanoids,Proanthocyanins,Color intensity,
Alcohol,Alcalinity of ash,Flavanoids,Proanthocyanins,Color intensity,
'''
# %%