转:『Sklearn』数据划分方法及python代码

原理介绍

K折交叉验证:

KFold,GroupKFold,StratifiedKFold,

留一法:

LeaveOneGroupOut,LeavePGroupsOut,LeaveOneOut,LeavePOut,

随机划分法:

ShuffleSplit,GroupShuffleSplit,StratifiedShuffleSplit,

 

 

代码实现

流程:

实例化划分器(交叉验证迭代器) -> 调用 .split() 迭代得到每一组训练/测试索引

KFold(n_splits=2)

1
2
3
4
5
6
7
8
9
10
11
12
# KFold: split the samples into n_splits folds; each fold serves once as the
# test set while the remaining folds form the training set.
import numpy as np
from sklearn.model_selection import KFold

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])
kf = KFold(n_splits=2)  # number of folds to split into
# kf.get_n_splits(X) can be called to query the number of folds.
print(kf)
for train_index, test_index in kf.split(X):
    print("Train Index:", train_index, ",Test Index:", test_index)
    # Index the arrays with the yielded positions to materialise each split.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

 GroupKFold(n_splits=2)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# GroupKFold: K-fold variant driven by a per-sample group label; samples that
# share a group label are kept on the same side of every split.
# In this example each sample has its own group (labels 1..6), so the grouping
# constraint is trivially satisfied.
import numpy as np
from sklearn.model_selection import GroupKFold

# Six samples with two features each: rows [1, 2], [3, 4], ..., [11, 12].
X = np.arange(1, 13).reshape(6, 2)
y = np.arange(1, 7)       # targets 1..6
groups = np.arange(1, 7)  # one distinct group label per sample
group_kfold = GroupKFold(n_splits=2)
group_kfold.get_n_splits(X, y, groups)
print(group_kfold)
for train_index, test_index in group_kfold.split(X, y, groups):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

# Output recorded in the original post:
# GroupKFold(n_splits=2)
# Train Index: [0 2 4] ,Test Index: [1 3 5]
# Train Index: [1 3 5] ,Test Index: [0 2 4]

  StratifiedKFold(n_splits=3)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# StratifiedKFold: K-fold that keeps the proportion of each class (as far as
# possible) the same in every fold.
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 1, 1, 2, 2, 2])  # two classes, three samples each
skf = StratifiedKFold(n_splits=3)
skf.get_n_splits(X, y)
print(skf)
for train_index, test_index in skf.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Expected output (n_splits=3 yields three splits; every test fold holds one
# sample of each class):
# StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
# Train Index: [1 2 4 5] ,Test Index: [0 3]
# Train Index: [0 2 3 5] ,Test Index: [1 4]
# Train Index: [0 1 3 4] ,Test Index: [2 5]

 LeaveOneOut()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# LeaveOneOut: every split holds out exactly one sample as the test set and
# trains on all the others.
import numpy as np
from sklearn.model_selection import LeaveOneOut

# Six samples with two features each: rows [1, 2], [3, 4], ..., [11, 12].
X = np.arange(1, 13).reshape(6, 2)
y = np.arange(1, 7)  # targets 1..6
loo = LeaveOneOut()
loo.get_n_splits(X)
print(loo)
for train_index, test_index in loo.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

# Output recorded in the original post:
# LeaveOneOut()
# Train Index: [1 2 3 4 5] ,Test Index: [0]
# Train Index: [0 2 3 4 5] ,Test Index: [1]
# Train Index: [0 1 3 4 5] ,Test Index: [2]
# Train Index: [0 1 2 4 5] ,Test Index: [3]
# Train Index: [0 1 2 3 5] ,Test Index: [4]
# Train Index: [0 1 2 3 4] ,Test Index: [5]

LeavePOut(p=3)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# LeavePOut: every split holds out p samples as the test set; all
# combinations of p samples are enumerated (20 splits for 6 samples, p=3).
import numpy as np
from sklearn.model_selection import LeavePOut

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])
lpo = LeavePOut(p=3)
lpo.get_n_splits(X)
print(lpo)
for train_index, test_index in lpo.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Output recorded in the original post:
# LeavePOut(p=3)
# Train Index: [3 4 5] ,Test Index: [0 1 2]
# Train Index: [2 4 5] ,Test Index: [0 1 3]
# Train Index: [2 3 5] ,Test Index: [0 1 4]
# Train Index: [2 3 4] ,Test Index: [0 1 5]
# Train Index: [1 4 5] ,Test Index: [0 2 3]
# Train Index: [1 3 5] ,Test Index: [0 2 4]
# Train Index: [1 3 4] ,Test Index: [0 2 5]
# Train Index: [1 2 5] ,Test Index: [0 3 4]
# Train Index: [1 2 4] ,Test Index: [0 3 5]
# Train Index: [1 2 3] ,Test Index: [0 4 5]
# Train Index: [0 4 5] ,Test Index: [1 2 3]
# Train Index: [0 3 5] ,Test Index: [1 2 4]
# Train Index: [0 3 4] ,Test Index: [1 2 5]
# Train Index: [0 2 5] ,Test Index: [1 3 4]
# Train Index: [0 2 4] ,Test Index: [1 3 5]
# Train Index: [0 2 3] ,Test Index: [1 4 5]
# Train Index: [0 1 5] ,Test Index: [2 3 4]
# Train Index: [0 1 4] ,Test Index: [2 3 5]
# Train Index: [0 1 3] ,Test Index: [2 4 5]
# Train Index: [0 1 2] ,Test Index: [3 4 5]

 ShuffleSplit(n_splits=3,test_size=.25,random_state=0)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# ShuffleSplit: shuffle the samples, then cut a train set and a test set.
# The sizes are set by train_size / test_size (not chosen at random), and the
# two fractions may sum to less than 1, in which case some samples are left
# out of that split.
import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

# Only test_size given: the training set takes all remaining samples.
rs = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
rs.get_n_splits(X)
print(rs)
for train_index, test_index in rs.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
print("==============================")

# Both sizes given and summing to 0.75: one sample is unused in each split.
rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=0.25, random_state=0)
rs.get_n_splits(X)
print(rs)
for train_index, test_index in rs.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)

# Output recorded in the original post:
# ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
# Train Index: [1 3 0 4] ,Test Index: [5 2]
# Train Index: [4 0 2 5] ,Test Index: [1 3]
# Train Index: [1 2 4 0] ,Test Index: [3 5]
# ==============================
# ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=0.5)
# Train Index: [1 3 0] ,Test Index: [5 2]
# Train Index: [4 0 2] ,Test Index: [1 3]
# Train Index: [1 2 4] ,Test Index: [3 5]

 StratifiedShuffleSplit(n_splits=3,test_size=.5,random_state=0)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# StratifiedShuffleSplit: like ShuffleSplit (shuffle, then cut train/test sets
# whose sizes come from train_size / test_size and may sum to less than 1),
# but additionally keeps the class proportions of y as equal as possible in
# each split.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 1, 2, 1, 2])  # two classes, three samples each
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
sss.get_n_splits(X, y)
print(sss)
for train_index, test_index in sss.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Output recorded in the original post:
# StratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,train_size=None)
# Train Index: [5 4 1] ,Test Index: [3 2 0]
# Train Index: [5 2 3] ,Test Index: [0 4 1]
# Train Index: [5 0 4] ,Test Index: [3 1 2]
  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值