https://blog.csdn.net/weixin_39531594/article/details/110804636
1、从excel数据中读取
import pandas as pd
# Load the OCT500 text-label spreadsheet.
# The r-prefix makes this a raw string, so backslashes in a Windows path are
# not interpreted as escape sequences.
file_path = r'OCT500-Text labels.xlsx'

# header=0: the first row of the sheet supplies the column names and is not
# returned as a data row.
raw_data = pd.read_excel(file_path, header=0)

# (rows, columns) of the loaded table.
print(raw_data.shape)

# Notebook-style inspection: these expressions are only displayed when they
# are the last statement of an interactive cell.
raw_data.head()
raw_data["number"].value_counts()
2、取出输入和输出数据
# Targets: the class label stored in the `number` column.
y = raw_data["number"].values
# Inputs: only the sample identifier (`ID` column) is carried through the
# split; the remaining columns are not used here.
X = raw_data["ID"].values

print(X)
print(y)
3、采用k-fold划分训练集和测试集
from sklearn.model_selection import StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
#kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) #StratifiedKFold, MultilabelStratifiedKFold
# Stratified 5-fold split of the sample IDs (X) by class label (y).
# shuffle=True with a fixed random_state keeps the folds reproducible.
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

for train_ix, test_ix in kfold.split(X, y):
    print('train_ix is ************', train_ix.shape)
    print('test_ix is ************', test_ix.shape)
    print('_____________________________________')

    # Select the rows that belong to this fold.
    train_X, test_X = X[train_ix], X[test_ix]
    train_y, test_y = y[train_ix], y[test_ix]

    # Summarize train and test composition: number of samples per class for
    # labels 0..6. Counting True values of the boolean mask is equivalent to
    # len(arr[arr == label]).
    train_0, train_1, train_2, train_3, train_4, train_5, train_6 = (
        int((train_y == label).sum()) for label in range(7)
    )
    test_0, test_1, test_2, test_3, test_4, test_5, test_6 = (
        int((test_y == label).sum()) for label in range(7)
    )

    # The message is built with implicit string concatenation instead of a
    # backslash continuation inside the literal, so source indentation can
    # never leak into the output; the spacing matches the recorded results.
    print(
        '>Train: 0=%d, 1=%d,2=%d,3=%d,4=%d,5=%d,6=%d; '
        'Test: 0=%d, 1=%d ,2=%d,3=%d,4=%d,5=%d,6=%d'
        % (
            train_0, train_1, train_2, train_3, train_4, train_5, train_6,
            test_0, test_1, test_2, test_3, test_4, test_5, test_6,
        )
    )
结果如下
train_ix is ************ (400,)
test_ix is ************ (100,)
_____________________________________
>Train: 0=200, 1=39,2=52,3=12,4=12,5=8,6=77; Test: 0=51, 1=10 ,2=12,3=4,4=2,5=2,6=19
train_ix is ************ (400,)
test_ix is ************ (100,)
_____________________________________
>Train: 0=201, 1=40,2=51,3=13,4=11,5=8,6=76; Test: 0=50, 1=9 ,2=13,3=3,4=3,5=2,6=20
train_ix is ************ (400,)
test_ix is ************ (100,)
_____________________________________
>Train: 0=201, 1=39,2=51,3=13,4=11,5=8,6=77; Test: 0=50, 1=10 ,2=13,3=3,4=3,5=2,6=19
train_ix is ************ (400,)
test_ix is ************ (100,)
_____________________________________
>Train: 0=201, 1=39,2=51,3=13,4=11,5=8,6=77; Test: 0=50, 1=10 ,2=13,3=3,4=3,5=2,6=19
train_ix is ************ (400,)
test_ix is ************ (100,)
_____________________________________
>Train: 0=201, 1=39,2=51,3=13,4=11,5=8,6=77; Test: 0=50, 1=10 ,2=13,3=3,4=3,5=2,6=19
版本2
from sklearn.model_selection import StratifiedKFold,StratifiedShuffleSplit
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.model_selection import KFold
#kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) #StratifiedKFold, MultilabelStratifiedKFold
# Outer split: stratified 5-fold over the sample IDs (X) and labels (y).
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

# Inner split: divides each fold's held-out part into two halves used as
# validation and test sets. A plain shuffled KFold is the splitter in effect;
# the StratifiedShuffleSplit that was assigned first was immediately
# overwritten and therefore never used.
ss = KFold(n_splits=2, shuffle=True, random_state=1)

for train_ix, test_ix in kfold.split(X, y):
    print('train_ix is ************', train_ix.shape)
    print('test_ix is ************', test_ix.shape)
    print('_____________________________________')

    # Select the rows of this fold. The held-out part gets its own names so
    # the inner loop can rebind test_X / test_y without losing it.
    train_X, train_y = X[train_ix], y[train_ix]
    holdout_X, holdout_y = X[test_ix], y[test_ix]

    # Summarize train and held-out composition (samples per class, labels 0..6).
    train_0, train_1, train_2, train_3, train_4, train_5, train_6 = (
        int((train_y == label).sum()) for label in range(7)
    )
    test_0, test_1, test_2, test_3, test_4, test_5, test_6 = (
        int((holdout_y == label).sum()) for label in range(7)
    )
    print(
        '>Train: 0=%d, 1=%d,2=%d,3=%d,4=%d,5=%d,6=%d; '
        'Test: 0=%d, 1=%d ,2=%d,3=%d,4=%d,5=%d,6=%d'
        % (
            train_0, train_1, train_2, train_3, train_4, train_5, train_6,
            test_0, test_1, test_2, test_3, test_4, test_5, test_6,
        )
    )

    for val_index, test_index in ss.split(holdout_X, holdout_y):
        print(val_index.shape, test_index.shape)

        # The indices produced by the inner splitter are positions within the
        # held-out subset, NOT within the full dataset, so they must index
        # holdout_X / holdout_y. Indexing X / y directly would pick unrelated
        # samples from the start of the dataset.
        test_X, test_y = holdout_X[test_index], holdout_y[test_index]
        val_X, val_y = holdout_X[val_index], holdout_y[val_index]

        test_0, test_1, test_2, test_3, test_4, test_5, test_6 = (
            int((test_y == label).sum()) for label in range(7)
        )
        val_0, val_1, val_2, val_3, val_4, val_5, val_6 = (
            int((val_y == label).sum()) for label in range(7)
        )
        print(
            '>Train: 0=%d, 1=%d,2=%d,3=%d,4=%d,5=%d,6=%d; '
            'Test: 0=%d, 1=%d ,2=%d,3=%d,4=%d,5=%d,6=%d: '
            'Val: 0=%d, 1=%d ,2=%d,3=%d,4=%d,5=%d,6=%d'
            % (
                train_0, train_1, train_2, train_3, train_4, train_5, train_6,
                test_0, test_1, test_2, test_3, test_4, test_5, test_6,
                val_0, val_1, val_2, val_3, val_4, val_5, val_6,
            )
        )
OCTA数据集两分类的结果
import pandas as pd
# Load the OCT300 spreadsheet used for the two-class experiment.
# The r-prefix makes this a raw string, so backslashes in a Windows path are
# not interpreted as escape sequences.
file_path = r'./OCT300.xlsx'

# header=0: the first row of the sheet supplies the column names and is not
# returned as a data row.
raw_data = pd.read_excel(file_path, header=0)

# (rows, columns) of the loaded table.
print(raw_data.shape)

# Notebook-style preview; displayed only when it ends an interactive cell.
raw_data.head()

# Targets come from the `label` column, inputs are the sample IDs only.
y = raw_data["label"].values
X = raw_data["ID"].values
#print(X)
#print(y)
from sklearn.model_selection import StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.model_selection import train_test_split
import numpy as np
#kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) #StratifiedKFold, MultilabelStratifiedKFold
import os

# Outer split: stratified 5-fold over the sample IDs (X) and labels (y).
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

# Directory that receives one train/val/test ID list per fold. It is created
# up front because np.savetxt does not create missing directories.
OUTPUT_DIR = './OCT300'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Amount subtracted from every ID before it is written out.
# NOTE(review): presumably maps dataset IDs starting at 10001 to zero-based
# indices -- confirm against the consumer of these files.
ID_OFFSET = 10000 + 1

# Folds are numbered from 1 so the file names read Kfold1 .. Kfold5.
for i, (train_val_ix, test_ix) in enumerate(kfold.split(X, y), start=1):
    print('_____________________________________')

    # Select the rows of this fold.
    train_val_X, train_val_y = X[train_val_ix], y[train_val_ix]
    test_X, test_y = X[test_ix], y[test_ix]

    # Carve the validation set out of the remaining 80%: 0.125 * 80% = 10% of
    # the whole dataset, giving a 70/10/20 train/val/test split.
    # NOTE(review): this split is not stratified (no `stratify=` argument), so
    # the class balance of train/val can drift; left unchanged so previously
    # exported folds stay reproducible.
    train_X, val_X, train_y, val_y = train_test_split(
        train_val_X,
        train_val_y,
        test_size=0.125,
        random_state=20,
        shuffle=True,
    )

    print('train_val_X is ************', train_val_X)
    print('train_val_ix ix is ************', train_val_ix)
    print('test is ************', test_X)
    print('test_ix is ************', test_ix)

    # Shift the IDs into the numbering that is written to disk.
    train_X = train_X - ID_OFFSET
    val_X = val_X - ID_OFFSET
    test_X = test_X - ID_OFFSET

    if i == 1:
        # Extra debug output for the first fold only. The first print emits
        # just its label, as in the original.
        print('train_val is ************',)
        # The values shown are exactly the ones saved below; the offset is
        # already applied and must not be subtracted a second time.
        print('val_X is ************', val_X)
        print('test_X is ************', test_X)

    # One text file per split and fold, e.g. ./OCT300/OCT300_train_Kfold1.txt.
    # Only the IDs are written; the labels are not saved.
    for split_name, split_ids in (
        ('train', train_X),
        ('val', val_X),
        ('test', test_X),
    ):
        out_path = '%s/OCT300_%s_Kfold%d.txt' % (OUTPUT_DIR, split_name, i)
        np.savetxt(out_path, split_ids, fmt='%d')

    # Summarize train, validation and test composition for the two classes.
    train_0, train_1 = (int((train_y == label).sum()) for label in range(2))
    val_0, val_1 = (int((val_y == label).sum()) for label in range(2))
    test_0, test_1 = (int((test_y == label).sum()) for label in range(2))
    print('>Train: 0=%d, 1=%d; Val: 0=%d, 1=%d; Test: 0=%d, 1=%d; ' % (train_0, train_1, val_0, val_1, test_0, test_1 ))