导入第三方包
import os
import gc
import math
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn. linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn. preprocessing import MinMaxScaler
from sklearn. model_selection import StratifiedKFold, KFold
from sklearn. metrics import log_loss
from sklearn. model_selection import train_test_split
from sklearn. preprocessing import OneHotEncoder
from tqdm import tqdm
import matplotlib. pyplot as plt
import time
import warnings
warnings. filterwarnings( 'ignore' )
读取数据
# Load the raw competition files. Each training row holds an id, one
# comma-separated string of heartbeat signal samples, and a class label;
# the test file has the same layout without the label column.
train = pd. read_csv( 'train.csv' )
test= pd. read_csv( 'testA.csv' )
train. head( )
id heartbeat_signals label 0 0 0.9912297987616655,0.9435330436439665,0.764677... 0.0 1 1 0.9714822034884503,0.9289687459588268,0.572932... 0.0 2 2 1.0,0.9591487564065292,0.7013782792997189,0.23... 2.0 3 3 0.9757952826275774,0.9340884687738161,0.659636... 0.0 4 4 0.0,0.055816398940721094,0.26129357194994196,0... 2.0
test. head( )
id heartbeat_signals 0 100000 0.9915713654170097,1.0,0.6318163407681274,0.13... 1 100001 0.6075533139615096,0.5417083883163654,0.340694... 2 100002 0.9752726292239277,0.6710965234906665,0.686758... 3 100003 0.9956348033996116,0.9170249621481004,0.521096... 4 100004 1.0,0.8879490481178918,0.745564725322326,0.531...
数据预处理
def reduce_mem_usage(df):
    """Downcast every column of ``df`` in place to a narrower dtype.

    Integer columns are converted to the smallest of int8/int16/int32/int64
    whose range contains the column's minimum and maximum. Every other
    non-object column is treated as floating point and converted to float16,
    float32 or float64 by the same range test. Object columns are converted
    to ``category``. Memory usage before and after is printed.

    The float test checks range only, not precision: values that fit the
    float16 range are rounded to float16 precision.

    Args:
        df: pandas DataFrame; its columns are reassigned in place.

    Returns:
        The same DataFrame object, after conversion.
    """
    bytes_per_mb = 1024 ** 2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Bounds are inclusive: a column spanning exactly -128..127
                # fits int8 and must not be pushed to the next wider type.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. min/max are NaN for an empty column): keep dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))
    return df
# Expand each training row into [id, sample_0, ..., sample_n, label]: the
# comma-separated signal string becomes one float value per sample.
train_list = [
    [row[0]] + [float(sample) for sample in row[1].split(',')] + [row[2]]
    for row in train.values
]
train = pd.DataFrame(np.array(train_list))
# Every column except the leading id and trailing label is a signal sample.
train.columns = (
    ['id']
    + ['s_' + str(k) for k in range(len(train_list[0]) - 2)]
    + ['label']
)
train = reduce_mem_usage(train)

# Same expansion for the test rows, which carry no label.
test_list = [
    [row[0]] + [float(sample) for sample in row[1].split(',')]
    for row in test.values
]
test = pd.DataFrame(np.array(test_list))
test.columns = ['id'] + ['s_' + str(k) for k in range(len(test_list[0]) - 1)]
test = reduce_mem_usage(test)
Memory usage of dataframe is 157.93 MB
Memory usage after optimization is: 39.67 MB
Decreased by 74.9%
Memory usage of dataframe is 31.43 MB
Memory usage after optimization is: 7.90 MB
Decreased by 74.9%
train. head( )
id s_0 s_1 s_2 s_3 s_4 s_5 s_6 s_7 s_8 ... s_196 s_197 s_198 s_199 s_200 s_201 s_202 s_203 s_204 label 0 0.0 0.991211 0.943359 0.764648 0.618652 0.379639 0.190796 0.040222 0.026001 0.031708 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 1.0 0.971680 0.929199 0.572754 0.178467 0.122986 0.132324 0.094421 0.089600 0.030487 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2 2.0 1.000000 0.958984 0.701172 0.231812 0.000000 0.080688 0.128418 0.187500 0.280762 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 3 3.0 0.975586 0.934082 0.659668 0.249878 0.237061 0.281494 0.249878 0.249878 0.241455 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4 4.0 0.000000 0.055817 0.261230 0.359863 0.433105 0.453613 0.499023 0.542969 0.616699 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
5 rows × 207 columns
test. head( )
id s_0 s_1 s_2 s_3 s_4 s_5 s_6 s_7 s_8 ... s_195 s_196 s_197 s_198 s_199 s_200 s_201 s_202 s_203 s_204 0 100000.0 0.991699 1.000000 0.631836 0.136230 0.041412 0.102722 0.120850 0.123413 0.107910 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 1 100001.0 0.607422 0.541504 0.340576 0.000000 0.090698 0.164917 0.195068 0.168823 0.198853 ... 0.389893 0.386963 0.367188 0.364014 0.360596 0.357178 0.350586 0.350586 0.350586 0.36377 2 100002.0 0.975098 0.670898 0.686523 0.708496 0.718750 0.716797 0.720703 0.701660 0.596680 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 3 100003.0 0.995605 0.916992 0.520996 0.000000 0.221802 0.404053 0.490479 0.527344 0.518066 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 4 100004.0 1.000000 0.888184 0.745605 0.531738 0.380371 0.224609 0.091125 0.057648 0.003914 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000
5 rows × 206 columns
训练数据/测试数据准备
# The target is the 'label' column; the model inputs are the remaining
# columns with 'id' removed, for both the training and the test frame.
y_train = train['label']
x_train = train.drop(columns=['id', 'label'])
x_test = test.drop(columns=['id'])
模型训练
def abs_sum(y_pre, y_tru):
    """Return the sum of absolute element-wise differences.

    Both arguments are converted to NumPy arrays, subtracted element-wise
    (NumPy broadcasting applies) and the absolute differences are summed over
    every axis, so inputs of any dimensionality, including empty ones,
    produce a single scalar.

    Args:
        y_pre: array-like of predicted values.
        y_tru: array-like of reference values.

    Returns:
        NumPy scalar holding the total absolute difference.
    """
    y_pre = np.asarray(y_pre)
    y_tru = np.asarray(y_tru)
    # One vectorised reduction over all axes replaces the nested builtin
    # sum(), which only worked for exactly two-dimensional input.
    return np.abs(y_pre - y_tru).sum()
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train a 4-class LightGBM model with 5-fold CV and average test predictions.

    For each fold a model is trained on the training split with early
    stopping on the validation split, the validation split is scored with
    ``abs_sum`` against one-hot encoded labels, and the class probabilities
    predicted for ``test_x`` are accumulated. Per-fold scores and their
    mean/std are printed.

    Args:
        clf: the ``lightgbm`` module (its ``Dataset``, ``train`` and callback
            factories are used).
        train_x: DataFrame of training features (indexed with ``.iloc``).
        train_y: training labels aligned row-by-row with ``train_x``; values
            must be the class ids 0..3.
        test_x: features to predict; must expose ``.shape``.
        clf_name: model selector; only ``"lgb"`` is implemented.

    Returns:
        ndarray of shape ``(test_x.shape[0], 4)`` with the class
        probabilities averaged over the folds.

    Raises:
        ValueError: if ``clf_name`` is not ``"lgb"``.
    """
    if clf_name != "lgb":
        # Previously any other name fell through to a NameError on an
        # undefined prediction variable.
        raise ValueError("unsupported clf_name: %r (only 'lgb' is implemented)" % (clf_name,))

    folds = 5
    seed = 2021
    num_class = 4
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positional indices; a plain array makes the label lookup
    # positional regardless of the index carried by ``train_y``.
    labels = np.asarray(train_y)
    # Rows of the identity matrix are the one-hot vectors. Unlike an encoder
    # refit on each fold, this always has ``num_class`` columns even when a
    # validation fold happens to miss a class.
    one_hot_rows = np.eye(num_class)

    test_pred_sum = np.zeros((test_x.shape[0], num_class))
    cv_scores = []

    params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': num_class,
        'num_leaves': 2 ** 5,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.1,
        'seed': seed,
        # 'nthread' and 'n_jobs' are aliases of the same setting; supplying
        # both made LightGBM warn and use 24, so that value is kept.
        'num_threads': 24,
        'verbose': -1,
    }

    # Periodic logging is ``log_evaluation`` in newer LightGBM releases and
    # ``print_evaluation`` in older ones.
    make_log_callback = getattr(clf, 'log_evaluation', None) or getattr(clf, 'print_evaluation')

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        # Early stopping and logging go through callbacks; the equivalent
        # ``early_stopping_rounds`` / ``verbose_eval`` keyword arguments are
        # no longer accepted by recent LightGBM versions.
        model = clf.train(params,
                          train_set=train_matrix,
                          valid_sets=[valid_matrix],
                          num_boost_round=2000,
                          callbacks=[clf.early_stopping(stopping_rounds=200),
                                     make_log_callback(period=100)])
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        val_onehot = one_hot_rows[val_y.astype(int)]
        print('预测的概率矩阵为:')
        print(test_pred)
        test_pred_sum += test_pred
        score = abs_sum(val_onehot, val_pred)
        cv_scores.append(score)
        print(cv_scores)

    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return test_pred_sum / kf.n_splits
def lgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the LightGBM backend and return its result.

    Args:
        x_train: training features, forwarded to ``cv_model``.
        y_train: training labels, forwarded to ``cv_model``.
        x_test: test features, forwarded to ``cv_model``.

    Returns:
        Whatever ``cv_model`` returns for the ``"lgb"`` configuration.
    """
    return cv_model(lgb, x_train, y_train, x_test, "lgb")
# Run the cross-validated LightGBM training and keep the per-class
# predictions produced for the test features.
lgb_test = lgb_model( x_train, y_train, x_test)
************************************ 1 ************************************
[LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0525735
[200] valid_0's multi_logloss: 0.0422444
[300] valid_0's multi_logloss: 0.0407386
[400] valid_0's multi_logloss: 0.0421508
Early stopping, best iteration is:
[291] valid_0's multi_logloss: 0.0405414
预测的概率矩阵为:
[[9.99967165e-01 3.10638661e-05 9.29743793e-07 8.41756551e-07]
[7.41632330e-05 9.03442916e-04 9.99022377e-01 1.70465437e-08]
[4.51699884e-07 3.30405989e-08 2.11550199e-07 9.99999304e-01]
...
[7.28937568e-02 4.13186418e-04 9.26689719e-01 3.33742355e-06]
[9.99959509e-01 4.03563040e-05 9.53421322e-08 3.98128462e-08]
[9.86220731e-01 2.16631217e-03 7.06637757e-03 4.54657877e-03]]
[604.9584360073212]
************************************ 2 ************************************
[LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0566626
[200] valid_0's multi_logloss: 0.0450852
[300] valid_0's multi_logloss: 0.044078
[400] valid_0's multi_logloss: 0.0455546
Early stopping, best iteration is:
[275] valid_0's multi_logloss: 0.0437793
预测的概率矩阵为:
[[9.99991401e-01 7.69109547e-06 6.65504756e-07 2.42084688e-07]
[5.72380482e-05 1.32812809e-03 9.98614607e-01 2.66534396e-08]
[2.82123411e-06 4.13195205e-07 1.34026965e-06 9.99995425e-01]
...
[6.96398024e-02 6.52459907e-04 9.29685742e-01 2.19960932e-05]
[9.99972366e-01 2.75069005e-05 7.68142933e-08 5.07415018e-08]
[9.67263676e-01 7.26154408e-03 2.41533542e-02 1.32142531e-03]]
[604.9584360073212, 623.4313863731123]
************************************ 3 ************************************
[LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0498722
[200] valid_0's multi_logloss: 0.038028
[300] valid_0's multi_logloss: 0.0358066
[400] valid_0's multi_logloss: 0.0361478
[500] valid_0's multi_logloss: 0.0379597
Early stopping, best iteration is:
[340] valid_0's multi_logloss: 0.0354344
预测的概率矩阵为:
[[9.99972032e-01 2.62406774e-05 1.17282152e-06 5.54230651e-07]
[1.05242811e-05 6.50215805e-05 9.99924453e-01 6.93812546e-10]
[1.93240868e-06 1.10384984e-07 3.76773426e-07 9.99997580e-01]
...
[1.34894410e-02 3.84569683e-05 9.86471555e-01 5.46564350e-07]
[9.99987431e-01 1.25532882e-05 1.03902298e-08 5.46727770e-09]
[9.78722948e-01 1.06329839e-02 6.94192038e-03 3.70214810e-03]]
[604.9584360073212, 623.4313863731123, 508.02381607269535]
************************************ 4 ************************************
[LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0564768
[200] valid_0's multi_logloss: 0.0448698
[300] valid_0's multi_logloss: 0.0446719
[400] valid_0's multi_logloss: 0.0470399
Early stopping, best iteration is:
[250] valid_0's multi_logloss: 0.0438853
预测的概率矩阵为:
[[9.99979692e-01 1.70821979e-05 1.27048476e-06 1.95571841e-06]
[5.66207785e-05 4.02275314e-04 9.99541086e-01 1.82828519e-08]
[2.62267451e-06 3.58613522e-07 4.78645006e-06 9.99992232e-01]
...
[4.56636552e-02 5.69497433e-04 9.53758468e-01 8.37980573e-06]
[9.99896785e-01 1.02796802e-04 2.46636563e-07 1.72061021e-07]
[8.70911669e-01 1.73790185e-02 1.04478175e-01 7.23113697e-03]]
[604.9584360073212, 623.4313863731123, 508.02381607269535, 660.4867407547265]
************************************ 5 ************************************
[LightGBM] [Warning] num_threads is set with n_jobs=24, nthread=28 will be ignored. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0506398
[200] valid_0's multi_logloss: 0.0396422
[300] valid_0's multi_logloss: 0.0381065
[400] valid_0's multi_logloss: 0.0390162
[500] valid_0's multi_logloss: 0.0414986
Early stopping, best iteration is:
[324] valid_0's multi_logloss: 0.0379497
预测的概率矩阵为:
[[9.99993352e-01 6.02902202e-06 1.13002685e-07 5.06277302e-07]
[1.03959552e-05 5.03778956e-04 9.99485820e-01 5.07638601e-09]
[1.92568065e-07 5.07155306e-08 4.94690856e-08 9.99999707e-01]
...
[8.83103121e-03 2.51969353e-05 9.91142776e-01 9.96143937e-07]
[9.99984791e-01 1.51997858e-05 5.62426491e-09 3.80450197e-09]
[9.86084001e-01 8.75968498e-04 1.09742304e-02 2.06580027e-03]]
[604.9584360073212, 623.4313863731123, 508.02381607269535, 660.4867407547265, 539.2160054696062]
lgb_scotrainre_list: [604.9584360073212, 623.4313863731123, 508.02381607269535, 660.4867407547265, 539.2160054696062]
lgb_score_mean: 587.2232769354923
lgb_score_std: 55.803854529412156
lgb_test
array([[9.99980728e-01, 1.76213718e-05, 8.30311502e-07, 8.20013519e-07],
[4.17884592e-05, 6.40529371e-04, 9.99317669e-01, 1.35506068e-08],
[1.60411705e-06, 1.93189968e-07, 1.35290248e-06, 9.99996850e-01],
...,
[4.21035373e-02, 3.39759532e-04, 9.57549652e-01, 7.05120616e-06],
[9.99960176e-01, 3.96826162e-05, 8.69614967e-08, 5.43774298e-08],
[9.57840605e-01, 7.66316543e-03, 3.07228116e-02, 3.77341788e-03]])
# Wrap the prediction array in a DataFrame; its columns get the default
# integer labels, one per class.
temp= pd. DataFrame( lgb_test)
temp
0 1 2 3 0 0.999981 1.762137e-05 8.303115e-07 8.200135e-07 1 0.000042 6.405294e-04 9.993177e-01 1.355061e-08 2 0.000002 1.931900e-07 1.352902e-06 9.999968e-01 3 0.999971 1.813623e-05 1.051597e-05 3.590280e-08 4 0.999983 1.774501e-06 1.544578e-05 1.965755e-07 ... ... ... ... ... 19995 0.998071 3.111825e-04 1.091087e-04 1.508597e-03 19996 0.999858 1.306361e-04 1.085224e-05 8.328535e-08 19997 0.042104 3.397595e-04 9.575497e-01 7.051206e-06 19998 0.999960 3.968262e-05 8.696150e-08 5.437743e-08 19999 0.957841 7.663165e-03 3.072281e-02 3.773418e-03
20000 rows × 4 columns
# Fill the sample submission with one probability column per class
# (label_0 .. label_3) and save it without the index column.
result = pd.read_csv('sample_submit.csv')
for class_id in range(4):
    result['label_' + str(class_id)] = temp[class_id]
result.to_csv('submit.csv', index=False)
总结:在天池实验室的 GPU 环境中运行时,需要先通过 !pip install 安装一系列依赖包,耗时较长;真正的训练(training)大约需要 15 分钟,成绩能达到 558.9469。