基于sklearn的SVM和留一法(LOOCV)进行二分类
需要的导入包
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import scipy.io as scio # 用于读取matlab格式的数据
import copy
数据准备
首先导入两组数据的特征(每一个对象有n个特征,可以是一维或多维的特征),比如正常人组(m1个)和病人组(m2个),将两组人的特征合并到一起。之后为两组人生成label,比如正常人的label为-1,病人组的label为1。下面的代码假定train_data1、train_data2的形状分别为n*m1和n*m2(每一列对应一个对象)。数据整理完成后得到的数据集dataset为(m1+m2)*n的特征矩阵,得到的datalabels为长度为(m1+m2)的label列表。
# Merge the two imported groups into `dataset` and build `datalabels`, which
# holds one label per row of `dataset` (in the same order).
# The groups are stacked column-wise and then transposed, so every column of
# train_data1 / train_data2 becomes one row of `dataset`.
dataset = np.hstack((train_data1, train_data2)).T
data1_label = [-1] * np.size(train_data1, 1)  # first group is labelled -1
data2_label = [1] * np.size(train_data2, 1)  # second group is labelled 1
datalabels = data1_label + data2_label
留一法交叉验证(LOOCV)
# ---------------------------- SVM + LOOCV ----------------------------
# Leave-one-out cross-validation of a linear SVM: each fold holds out one
# sample for testing and trains on all the others.
loo = LeaveOneOut()
predictlabel_list = []  # per fold: list with the predicted label
reallabel_list = []  # per fold: list with the true label
scaler = StandardScaler()
clf = SVC(C=1, kernel='linear', gamma='auto')
label_array = np.asarray(datalabels)  # converted once, not on every fold
count_right_label = 0
count = 0  # number of completed folds
for train_index, test_index in loo.split(dataset):
    # Fit the scaler on the training fold only and reuse its statistics for
    # the held-out sample. Scaling the full dataset before splitting would
    # leak information about the test sample into training.
    X_train = scaler.fit_transform(dataset[train_index])
    X_test = scaler.transform(dataset[test_index])
    Y_train, Y_test = label_array[train_index], label_array[test_index]
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)  # predict once and reuse the result below
    predictlabel_list.append(list(Y_pred))
    reallabel_list.append(list(Y_test))
    # Each fold contains exactly one test sample, so compare element 0.
    if Y_test[0] == Y_pred[0]:
        count_right_label += 1
    count += 1
    print('第{}次循环'.format(count))
accurancy = count_right_label / len(datalabels)
print('******循环结束!************')
print('准确率为:%.2f%%' % (accurancy * 100))
print('******运行结束!************')
运行结束后可以得到predictlabel_list和reallabel_list,分别存放预测的label和真实的label,可用于后续的指标分析。
计算F1_SCORE,ACC(准确率),SEN(敏感度),SPE(特异性)
在得到真实的label和预测的label之后,调用自定义函数get_TpTN_FpFn得到TP、TN、FP、FN,再据此计算上述指标;该辅助函数的说明见后文。
# Confusion-matrix counts derived from the true and the predicted label lists.
TP_count, TN_count, FP_count, FN_count = get_TpTN_FpFn(reallabel_list, predictlabel_list)
# F1 = 2TP / (2TP + FP + FN)
F1_score = 2 * TP_count / (2 * TP_count + FP_count + FN_count)
# Accuracy = correctly classified samples / all counted samples
ACC = (TP_count + TN_count) / (TP_count + TN_count + FP_count + FN_count)
# Sensitivity = TP / (TP + FN)
SEN = TP_count / (TP_count + FN_count)
# Specificity = TN / (TN + FP)
SPE = TN_count / (TN_count + FP_count)
ACC:分类准确率
ACC = (TP +TN)/(TP+TN+FP+FN)
敏感性(sensitivity)
SEN = TP/(TP+FN)
特异度(specificity)
SPE = TN/(TN+FP)
F1分数
F1 = 2TP/(2TP + FN +FP)
计算TP,TN,FP,FN 辅助函数说明
此函数的输入list1为真实的label列表,list2为预测输出的label
# Count TP, TN, FP and FN for one set of predictions.
def get_TpTN_FpFn(list1, list2):
    """Count confusion-matrix entries for labels coded as 1 and -1.

    Both arguments are passed through ``flatten`` first, so nested lists
    are accepted.

    Args:
        list1: True labels.
        list2: Predicted labels, in the same order as ``list1``.

    Returns:
        A tuple ``(TP, TN, FP, FN)`` where 1 is the positive class and -1
        the negative class. Pairs in which either label is neither 1 nor
        -1 are not counted.
    """
    actual = list(flatten(list1))
    predicted = list(flatten(list2))
    tp = tn = fp = fn = 0
    for idx, truth in enumerate(actual):
        if truth == 1:
            guess = predicted[idx]
            if guess == 1:
                tp += 1
            elif guess == -1:
                fn += 1
        elif truth == -1:
            guess = predicted[idx]
            if guess == -1:
                tn += 1
            elif guess == 1:
                fp += 1
    return tp, tn, fp, fn
辅助函数2
# Flatten an arbitrarily nested list into a single flat sequence.
def flatten(nested):
    """Yield the leaf values of an arbitrarily nested iterable, in order.

    Args:
        nested: A value or an iterable whose items may themselves be
            iterables, nested to any depth.

    Yields:
        Every non-iterable leaf, depth first, left to right. Strings are
        treated as leaves and yielded whole.
    """
    # A one-character string iterates to itself, so recursing into strings
    # would never terminate; treat every string as an atomic value.
    if isinstance(nested, str):
        yield nested
        return
    try:
        children = iter(nested)
    except TypeError:
        # Not iterable: this is a leaf value.
        yield nested
        return
    for child in children:
        yield from flatten(child)
在我的另外一篇博文有说明上面函数的作用
函数功能说明
后续说明
这篇文章主要是给大家分享一下怎么使用,是一篇工具性的博文。我平时也主要把它当作工具使用,没有深入分析参数的选取、每种计算指标的意义等,欢迎大家留言讨论,有问题请指出,欢迎批评指正。
后面计划将在此基础上写一篇绘制ROC曲线的文章,以及对svm进行permutation验证求出分类的P值,也就是显著性。
完整代码
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import scipy.io as scio
###################功能性函数###################################################
# Flatten an arbitrarily nested list into a single flat sequence.
def flatten(nested):
    """Yield the leaf values of an arbitrarily nested iterable, in order.

    Args:
        nested: A value or an iterable whose items may themselves be
            iterables, nested to any depth.

    Yields:
        Every non-iterable leaf, depth first, left to right. Strings are
        treated as leaves and yielded whole.
    """
    # A one-character string iterates to itself, so recursing into strings
    # would never terminate; treat every string as an atomic value.
    if isinstance(nested, str):
        yield nested
        return
    try:
        children = iter(nested)
    except TypeError:
        # Not iterable: this is a leaf value.
        yield nested
        return
    for child in children:
        yield from flatten(child)
# Count TP, TN, FP and FN for one set of predictions.
def get_TpTN_FpFn(list1, list2):
    """Count confusion-matrix entries for labels coded as 1 and -1.

    Both arguments are passed through ``flatten`` first, so nested lists
    are accepted.

    Args:
        list1: True labels.
        list2: Predicted labels, in the same order as ``list1``.

    Returns:
        A tuple ``(TP, TN, FP, FN)`` where 1 is the positive class and -1
        the negative class. Pairs in which either label is neither 1 nor
        -1 are not counted.
    """
    actual = list(flatten(list1))
    predicted = list(flatten(list2))
    tp = tn = fp = fn = 0
    for idx, truth in enumerate(actual):
        if truth == 1:
            guess = predicted[idx]
            if guess == 1:
                tp += 1
            elif guess == -1:
                fn += 1
        elif truth == -1:
            guess = predicted[idx]
            if guess == -1:
                tn += 1
            elif guess == 1:
                fp += 1
    return tp, tn, fp, fn
if __name__ == '__main__':
    # ---------------------- Load and arrange the data ----------------------
    # This loading section is specific to the author's files; replace it with
    # your own data loading as needed.
    path2 = 'C:\\Users\\Administer\\Desktop\\classer\\dataset'
    MSN_HC = scio.loadmat(path2 + '\\HC_coupling_str_fun_20201022.mat')
    train_data1 = MSN_HC['all_coupling'].T
    MSN_GTCS = scio.loadmat(path2 + '\\GTCS_coupling_str_fun.mat')
    train_data2 = MSN_GTCS['all_coupling'].T

    # Stack both groups column-wise and transpose, so every column of
    # train_data1 / train_data2 becomes one row of `dataset`; `datalabels`
    # holds the matching label for each row.
    dataset = np.hstack((train_data1, train_data2)).T
    data1_label = [-1] * np.size(train_data1, 1)  # first group is labelled -1
    data2_label = [1] * np.size(train_data2, 1)  # second group is labelled 1
    datalabels = data1_label + data2_label

    # -------------------------------- LOOCV --------------------------------
    # Leave-one-out cross-validation of a linear SVM: each fold holds out one
    # sample for testing and trains on all the others.
    loo = LeaveOneOut()
    predictlabel_list = []  # per fold: list with the predicted label
    reallabel_list = []  # per fold: list with the true label
    scaler = StandardScaler()
    clf = SVC(C=1, kernel='linear', gamma='auto')
    label_array = np.asarray(datalabels)  # converted once, not on every fold
    count_right_label = 0
    count = 0  # number of completed folds
    for train_index, test_index in loo.split(dataset):
        # Fit the scaler on the training fold only and reuse its statistics
        # for the held-out sample. Scaling the full dataset before splitting
        # would leak information about the test sample into training.
        X_train = scaler.fit_transform(dataset[train_index])
        X_test = scaler.transform(dataset[test_index])
        Y_train, Y_test = label_array[train_index], label_array[test_index]
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)  # predict once and reuse the result below
        predictlabel_list.append(list(Y_pred))
        reallabel_list.append(list(Y_test))
        # Each fold contains exactly one test sample, so compare element 0.
        if Y_test[0] == Y_pred[0]:
            count_right_label += 1
        count += 1
        print('第{}次循环'.format(count))
    accurancy = count_right_label / len(datalabels)
    print('******循环结束!************')
    print('准确率为:%.2f%%' % (accurancy * 100))
    print('******运行结束!************')

    # ------------------------------- Metrics -------------------------------
    TP_count, TN_count, FP_count, FN_count = get_TpTN_FpFn(reallabel_list, predictlabel_list)
    # F1 = 2TP / (2TP + FP + FN)
    F1_score = (2 * TP_count) / (2 * TP_count + FP_count + FN_count)
    # Accuracy = correctly classified samples / all counted samples
    ACC = (TP_count + TN_count) / (TP_count + FN_count + TN_count + FP_count)
    # Sensitivity = TP / (TP + FN)
    SEN = TP_count / (TP_count + FN_count)
    # Specificity = TN / (TN + FP)
    SPE = TN_count / (TN_count + FP_count)
    print('F1_SCORE为:%.2f%%' % (F1_score * 100))
    print('ACC(准确率)为:%.2f%%' % (ACC * 100))
    print('SEN(敏感度)为:%.2f%%' % (SEN * 100))
    print('SPE(特异性)为:%.2f%%' % (SPE * 100))