A script for training XGBoost.

https://www.kaggle.com/happycube/bosch-production-line-performance/scirpus-extreme-bayes-faron-36312/code

Note: this is an XGBoost training script that is worth studying, especially for its XGBoost parameter settings.
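For quick reference, here is the parameter block the script below passes to `xgb.train`. The values are taken directly from the script; the annotations are mine and are only brief reminders of what each setting does.

```python
# Parameters used by the script below (annotations added for readability)
params = {
    'objective': 'binary:logistic',  # binary classification, predicts probabilities
    'eta': 0.021,                    # small learning rate (shrinkage per boosting round)
    'max_depth': 7,                  # fairly deep trees to capture feature interactions
    'colsample_bytree': 0.82,        # sample 82% of the columns for each tree
    'min_child_weight': 3,           # minimum hessian sum per leaf; mild regularization
    'base_score': 0.005,             # initial prediction; low, matching the rarity of positive Responses
    'silent': True,                  # suppress per-round logging (older xgboost API)
}
```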

```python
import gc
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold  # imported but unused in this script
from sklearn.metrics import matthews_corrcoef
from operator import itemgetter

# per raddar, all date features except for stations 24+25 are identical


def get_date_features():
    # Keep 'Id', every S24/S25 date column, and the first date column seen
    # for every other station.
    directory = '../input/'
    trainfile = 'train_date.csv'

    for i, chunk in enumerate(pd.read_csv(directory + trainfile,
                                          chunksize=1,
                                          low_memory=False)):
        features = list(chunk.columns)
        break

    seen = np.zeros(52)
    rv = []
    for f in features:
        if f == 'Id' or 'S24' in f or 'S25' in f:
            rv.append(f)
            continue

        station = int(f.split('_')[1][1:])
#        print(station)

        if seen[station]:
            continue

        seen[station] = 1
        rv.append(f)

    return rv


usefuldatefeatures = get_date_features()


def get_mindate():
    # Earliest timestamp across the kept date columns, for every train and test Id.
    directory = '../input/'
    trainfile = 'train_date.csv'
    testfile = 'test_date.csv'

    features = None
    subset = None

    for i, chunk in enumerate(pd.read_csv(directory + trainfile,
                                          usecols=usefuldatefeatures,
                                          chunksize=50000,
                                          low_memory=False)):
        print(i)

        if features is None:
            features = list(chunk.columns)
            features.remove('Id')

        df_mindate_chunk = chunk[['Id']].copy()
        df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values

        if subset is None:
            subset = df_mindate_chunk.copy()
        else:
            subset = pd.concat([subset, df_mindate_chunk])

        del chunk
        gc.collect()

    for i, chunk in enumerate(pd.read_csv(directory + testfile,
                                          usecols=usefuldatefeatures,
                                          chunksize=50000,
                                          low_memory=False)):
        print(i)

        df_mindate_chunk = chunk[['Id']].copy()
        df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values
        subset = pd.concat([subset, df_mindate_chunk])

        del chunk
        gc.collect()

    return subset


df_mindate = get_mindate()

# Order parts by start date, then compute Id differences between consecutive
# parts (and the same looking forward).
df_mindate.sort_values(by=['mindate', 'Id'], inplace=True)

df_mindate['mindate_id_diff'] = df_mindate.Id.diff()

midr = np.full_like(df_mindate.mindate_id_diff.values, np.nan)
midr[0:-1] = -df_mindate.mindate_id_diff.values[1:]

df_mindate['mindate_id_diff_reverse'] = midr


def mcc(tp, tn, fp, fn):
    # Matthews correlation coefficient from a confusion matrix.
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf == 0:
        return 0
    else:
        return sup / np.sqrt(inf)


def eval_mcc(y_true, y_prob, show=False):
    # Scan all probability thresholds and return the best achievable MCC
    # (and, if show=True, the threshold and the thresholded predictions).
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positive
    numn = n - nump  # number of negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    mccs = np.zeros(n)
    for i in range(n):
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
        new_mcc = mcc(tp, tn, fp, fn)
        mccs[i] = new_mcc
        if new_mcc >= best_mcc:
            best_mcc = new_mcc
            best_id = i
    if show:
        best_proba = y_prob[idx[best_id]]
        y_pred = (y_prob > best_proba).astype(int)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc


def mcc_eval(y_prob, dtrain):
    # Custom xgboost evaluation function: best achievable MCC.
    y_true = dtrain.get_label()
    best_mcc = eval_mcc(y_true, y_prob)
    return 'MCC', best_mcc


def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance


def LeaveOneOut(data1, data2, columnName, useLOO=False):
    # Mean-Response (target) encoding of columnName, computed on data1 and
    # applied to data2; with useLOO=True each row's own outcome is removed.
    grpOutcomes = data1.groupby(columnName)['Response'].mean().reset_index()
    grpCount = data1.groupby(columnName)['Response'].count().reset_index()
    grpOutcomes['cnt'] = grpCount.Response
    if(useLOO):
        grpOutcomes = grpOutcomes[grpOutcomes.cnt > 1]
    grpOutcomes.drop('cnt', inplace=True, axis=1)
    outcomes = data2['Response'].values
    x = pd.merge(data2[[columnName, 'Response']], grpOutcomes,
                 suffixes=('x_', ''),
                 how='left',
                 on=columnName,
                 left_index=True)['Response']
    if(useLOO):
        x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1)
        #  x = x + np.random.normal(0, .01, x.shape[0])
    return x.fillna(x.mean())


def GrabData():
    # Read a hand-picked subset of categorical/date/numeric columns in chunks,
    # merge them on Id, attach the mindate features, and target-encode the
    # categorical and date columns using the even-indexed half of the training data.
    directory = '../input/'
    trainfiles = ['train_categorical.csv',
                  'train_date.csv',
                  'train_numeric.csv']
    testfiles = ['test_categorical.csv',
                 'test_date.csv',
                 'test_numeric.csv']

    cols = [['Id',
             'L1_S24_F1559', 'L3_S32_F3851',
             'L1_S24_F1827', 'L1_S24_F1582',
             'L3_S32_F3854', 'L1_S24_F1510',
             'L1_S24_F1525'],
            ['Id',
             'L3_S30_D3496', 'L3_S30_D3506',
             'L3_S30_D3501', 'L3_S30_D3516',
             'L3_S30_D3511'],
            ['Id',
             'L1_S24_F1846', 'L3_S32_F3850',
             'L1_S24_F1695', 'L1_S24_F1632',
             'L3_S33_F3855', 'L1_S24_F1604',
             'L3_S29_F3407', 'L3_S33_F3865',
             'L3_S38_F3952', 'L1_S24_F1723',
             'Response']]
    traindata = None
    testdata = None
    for i, f in enumerate(trainfiles):
        print(f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(directory + f,
                                              usecols=cols[i],
                                              chunksize=50000,
                                              low_memory=False)):
            print(i)
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if traindata is None:
            traindata = subset.copy()
        else:
            traindata = pd.merge(traindata, subset.copy(), on="Id")
        del subset
        gc.collect()
    del cols[2][-1]  # Test doesn't have response!
    for i, f in enumerate(testfiles):
        print(f)
        subset = None
        for i, chunk in enumerate(pd.read_csv(directory + f,
                                              usecols=cols[i],
                                              chunksize=50000,
                                              low_memory=False)):
            print(i)
            if subset is None:
                subset = chunk.copy()
            else:
                subset = pd.concat([subset, chunk])
            del chunk
            gc.collect()
        if testdata is None:
            testdata = subset.copy()
        else:
            testdata = pd.merge(testdata, subset.copy(), on="Id")
        del subset
        gc.collect()

    traindata = traindata.merge(df_mindate, on='Id')
    testdata = testdata.merge(df_mindate, on='Id')

    testdata['Response'] = 0  # Add Dummy Value
    visibletraindata = traindata[::2]
    blindtraindata = traindata[1::2]
    print(blindtraindata.columns)
    for i in range(2):
        for col in cols[i][1:]:
            print(col)
            blindtraindata.loc[:, col] = LeaveOneOut(visibletraindata,
                                                     blindtraindata,
                                                     col, False).values
            testdata.loc[:, col] = LeaveOneOut(visibletraindata,
                                               testdata, col, False).values
    del visibletraindata
    gc.collect()
    testdata.drop('Response', inplace=True, axis=1)
    return blindtraindata, testdata


def Train():
    train, test = GrabData()
    print('Train:', train.shape)
    print('Test', test.shape)
    features = list(train.columns)
    features.remove('Response')
    features.remove('Id')
    print(features)
    num_rounds = 50
    params = {}
    params['objective'] = "binary:logistic"
    params['eta'] = 0.021
    params['max_depth'] = 7
    params['colsample_bytree'] = 0.82
    params['min_child_weight'] = 3
    params['base_score'] = 0.005
    params['silent'] = True

    print('Fitting')
    trainpredictions = None
    testpredictions = None

    dvisibletrain = \
        xgb.DMatrix(train[features],
                    train.Response,
                    silent=True)
    dtest = \
        xgb.DMatrix(test[features],
                    silent=True)

    # Average predictions over `folds` differently seeded runs.
    folds = 1
    for i in range(folds):
        print('Fold:', i)
        params['seed'] = i
        # Note: 'val' is the same DMatrix as 'train', so early stopping
        # monitors the training MCC here.
        watchlist = [(dvisibletrain, 'train'), (dvisibletrain, 'val')]
        clf = xgb.train(params, dvisibletrain,
                        num_boost_round=num_rounds,
                        evals=watchlist,
                        early_stopping_rounds=20,
                        feval=mcc_eval,
                        maximize=True
                        )
        limit = clf.best_iteration+1
        # limit = clf.best_ntree_limit
        predictions = \
            clf.predict(dvisibletrain, ntree_limit=limit)

        best_proba, best_mcc, y_pred = eval_mcc(train.Response,
                                                predictions,
                                                True)
        print('tree limit:', limit)
        print('mcc:', best_mcc)
        print(matthews_corrcoef(train.Response,
                                y_pred))
        if(trainpredictions is None):
            trainpredictions = predictions
        else:
            trainpredictions += predictions
        predictions = clf.predict(dtest, ntree_limit=limit)
        if(testpredictions is None):
            testpredictions = predictions
        else:
            testpredictions += predictions
        imp = get_importance(clf, features)
        print('Importance array: ', imp)

    best_proba, best_mcc, y_pred = eval_mcc(train.Response,
                                            trainpredictions/folds,
                                            True)
    print(matthews_corrcoef(train.Response,
                            y_pred))

    submission = pd.DataFrame({"Id": train.Id,
                               "Prediction": trainpredictions/folds,
                               "Response": train.Response})
    submission[['Id',
                'Prediction',
                'Response']].to_csv('rawtrainxgbsubmission'+str(folds)+'.csv',
                                    index=False)

    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": testpredictions/folds})
    submission[['Id', 'Response']].to_csv('rawxgbsubmission'+str(folds)+'.csv',
                                          index=False)
    y_pred = (testpredictions/folds > .08).astype(int)
    submission = pd.DataFrame({"Id": test.Id.values,
                               "Response": y_pred})
    submission[['Id', 'Response']].to_csv('xgbsubmission'+str(folds)+'.csv',
                                          index=False)


if __name__ == "__main__":
    print('Started')
    Train()
    print('Finished')
```

Points worth learning from this script (a standalone sketch of points 1, 2, and 4 follows the list):

1. The custom evaluation function (feval).

2. Using early stopping to guard against overfitting.

3. The concise way the train/test data are assembled.

4. Running the same model n times (n folds/seeds) and averaging the predictions.
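The following is a minimal, self-contained sketch of points 1, 2, and 4 in isolation, assuming the same (older) xgboost API that the script above uses (feval, ntree_limit). The synthetic data and the names mcc_feval and n_runs are illustrative, not taken from the original script.

```python
# Minimal sketch: custom MCC feval + early stopping + averaging over seeds.
import numpy as np
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef

# Illustrative, imbalanced synthetic data (not the Bosch data).
rng = np.random.RandomState(0)
X = rng.randn(2000, 10)
y = (X[:, 0] + 0.5 * rng.randn(2000) > 1.2).astype(int)
X_tr, X_val, y_tr, y_val = X[:1500], X[1500:], y[:1500], y[1500:]

dtrain = xgb.DMatrix(X_tr, label=y_tr)
dval = xgb.DMatrix(X_val, label=y_val)

def mcc_feval(y_prob, dmat):
    """Point 1: custom evaluation metric, MCC at a fixed 0.5 threshold."""
    y_true = dmat.get_label()
    return 'MCC', matthews_corrcoef(y_true, (y_prob > 0.5).astype(int))

params = {'objective': 'binary:logistic', 'eta': 0.05, 'max_depth': 4}

n_runs = 3
val_pred = np.zeros(len(y_val))
for seed in range(n_runs):                      # point 4: average over seeds
    params['seed'] = seed
    bst = xgb.train(params, dtrain,
                    num_boost_round=500,
                    evals=[(dtrain, 'train'), (dval, 'val')],
                    feval=mcc_feval,
                    maximize=True,
                    early_stopping_rounds=20,   # point 2: early stopping on 'val'
                    verbose_eval=False)
    val_pred += bst.predict(dval, ntree_limit=bst.best_iteration + 1)
val_pred /= n_runs

print('averaged validation MCC:',
      matthews_corrcoef(y_val, (val_pred > 0.5).astype(int)))
```

Unlike the script above, this sketch stops early on a held-out validation set rather than on the training set itself, which is the more common way to use early stopping.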
