# 转：『Sklearn』数据划分方法及python代码 (Repost: "Sklearn" data-splitting methods with Python code)

 
    
         # KFold example: split X into n_splits folds and print the train/test indices of each split.
import numpy as np
from sklearn.model_selection import KFold

# Six samples with two features each, and one target value per sample.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

kf = KFold(n_splits=2)  # how many folds to split the data into
# kf.get_n_splits(X)  # query how many folds will be produced
print(kf)

for train_index, test_index in kf.split(X):
    print("Train Index:", train_index, ",Test Index:", test_index)
    # The yielded index arrays select the rows belonging to each side of the split.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train, X_test, y_train, y_test)
        
 
  

 
    
         # GroupKFold example (the original author noted not fully understanding this splitter).
# NOTE(review): per the scikit-learn documentation, GroupKFold never places samples
# that share a group label on both sides of a split. In this example every sample
# has its own distinct group label, so the groups do not constrain the split.
import numpy as np
from sklearn.model_selection import GroupKFold

# Six samples with two features each, one target value and one group label per sample.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])
groups = np.array([1, 2, 3, 4, 5, 6])

group_kfold = GroupKFold(n_splits=2)
group_kfold.get_n_splits(X, y, groups)  # return value (the number of splits) is not used here
print(group_kfold)

for train_index, test_index in group_kfold.split(X, y, groups):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train, X_test, y_train, y_test)

# Output recorded by the original author:
# GroupKFold(n_splits=2)
# Train Index: [0 2 4] ,Test Index: [1 3 5]
# Train Index: [1 3 5] ,Test Index: [0 2 4]
        
 
  

 
    
         # StratifiedKFold example: keeps the proportion of each class the same in every training set (as far as possible).
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Six samples with two features each; y holds two classes with three samples each.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 1, 1, 2, 2, 2])

skf = StratifiedKFold(n_splits=3)
skf.get_n_splits(X, y)  # return value (the number of splits) is not used here
print(skf)

# The class labels in y are passed to split() along with X.
for train_index, test_index in skf.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train, X_test, y_train, y_test)

# Output recorded by the original author:
# StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
# Train Index: [1 2 4 5] ,Test Index: [0 3]
# Train Index: [0 2 3 5] ,Test Index: [1 4]
# NOTE(review): the recording listed only two of the three splits; the third
# one follows the same pattern:
# Train Index: [0 1 3 4] ,Test Index: [2 5]
        
 
  

 
    
         # LeaveOneOut example: each split keeps exactly one sample as the test set.
import numpy as np
from sklearn.model_selection import LeaveOneOut

# Six samples with two features each, and one target value per sample.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

loo = LeaveOneOut()
loo.get_n_splits(X)  # return value (the number of splits) is not used here
print(loo)

for train_index, test_index in loo.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train, X_test, y_train, y_test)

# Output recorded by the original author (one split per sample):
# LeaveOneOut()
# Train Index: [1 2 3 4 5] ,Test Index: [0]
# Train Index: [0 2 3 4 5] ,Test Index: [1]
# Train Index: [0 1 3 4 5] ,Test Index: [2]
# Train Index: [0 1 2 4 5] ,Test Index: [3]
# Train Index: [0 1 2 3 5] ,Test Index: [4]
# Train Index: [0 1 2 3 4] ,Test Index: [5]
        
 
  

 
         # LeavePOut example: each split keeps P samples as the test set.
import numpy as np
from sklearn.model_selection import LeavePOut

# Six samples with two features each, and one target value per sample.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

lpo = LeavePOut(p=3)  # three samples go to the test set in every split
lpo.get_n_splits(X)  # return value (the number of splits) is not used here
print(lpo)

for train_index, test_index in lpo.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train, X_test, y_train, y_test)

# Output recorded by the original author (20 splits for 6 samples with p=3):
# LeavePOut(p=3)
# Train Index: [3 4 5] ,Test Index: [0 1 2]
# Train Index: [2 4 5] ,Test Index: [0 1 3]
# Train Index: [2 3 5] ,Test Index: [0 1 4]
# Train Index: [2 3 4] ,Test Index: [0 1 5]
# Train Index: [1 4 5] ,Test Index: [0 2 3]
# Train Index: [1 3 5] ,Test Index: [0 2 4]
# Train Index: [1 3 4] ,Test Index: [0 2 5]
# Train Index: [1 2 5] ,Test Index: [0 3 4]
# Train Index: [1 2 4] ,Test Index: [0 3 5]
# Train Index: [1 2 3] ,Test Index: [0 4 5]
# Train Index: [0 4 5] ,Test Index: [1 2 3]
# Train Index: [0 3 5] ,Test Index: [1 2 4]
# Train Index: [0 3 4] ,Test Index: [1 2 5]
# Train Index: [0 2 5] ,Test Index: [1 3 4]
# Train Index: [0 2 4] ,Test Index: [1 3 5]
# Train Index: [0 2 3] ,Test Index: [1 4 5]
# Train Index: [0 1 5] ,Test Index: [2 3 4]
# Train Index: [0 1 4] ,Test Index: [2 3 5]
# Train Index: [0 1 3] ,Test Index: [2 4 5]
# Train Index: [0 1 2] ,Test Index: [3 4 5]

 
    
         # ShuffleSplit example: shuffle the data set, then split it into a training set and a test set.
# The proportions are set explicitly through train_size / test_size and may sum
# to less than 1 (the second example below uses 0.5 + 0.25).
import numpy as np
from sklearn.model_selection import ShuffleSplit

# Six samples with two features each, and one target value per sample.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])

# First example: only the test proportion is given.
rs = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
rs.get_n_splits(X)  # return value (the number of splits) is not used here
print(rs)

for train_index, test_index in rs.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train, X_test, y_train, y_test)

print("==============================")

# Second example: both proportions are given, so some samples end up in neither set.
rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=0.25, random_state=0)
rs.get_n_splits(X)  # return value (the number of splits) is not used here
print(rs)

for train_index, test_index in rs.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)

# Output recorded by the original author:
# ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
# Train Index: [1 3 0 4] ,Test Index: [5 2]
# Train Index: [4 0 2 5] ,Test Index: [1 3]
# Train Index: [1 2 4 0] ,Test Index: [3 5]
# ==============================
# ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=0.5)
# Train Index: [1 3 0] ,Test Index: [5 2]
# Train Index: [4 0 2] ,Test Index: [1 3]
# Train Index: [1 2 4] ,Test Index: [3 5]
        
 
  

 
    
         # StratifiedShuffleSplit example: shuffle the data set, then split it into a training set and a test set.
# As with ShuffleSplit, the train and test proportions may sum to less than 1;
# in addition, the proportion of each class in the training set is kept the same.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Six samples with two features each; y holds two classes with three samples each.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 1, 2, 1, 2])

sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
sss.get_n_splits(X, y)  # return value (the number of splits) is not used here
print(sss)

# The class labels in y are passed to split() along with X.
for train_index, test_index in sss.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train, X_test, y_train, y_test)

# Output recorded by the original author.
# NOTE(review): the exact indices may depend on the scikit-learn version - confirm before relying on them.
# StratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,train_size=None)
# Train Index: [5 4 1] ,Test Index: [3 2 0]
# Train Index: [5 2 3] ,Test Index: [0 4 1]
# Train Index: [5 0 4] ,Test Index: [3 1 2]