import os
import gc
import math
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn. linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn. preprocessing import MinMaxScaler
from sklearn. model_selection import StratifiedKFold, KFold
from sklearn. metrics import log_loss
from sklearn. model_selection import train_test_split
from sklearn. preprocessing import OneHotEncoder
from tqdm import tqdm
import matplotlib. pyplot as plt
import time
import warnings
warnings. filterwarnings( 'ignore' )
# Load the raw competition files: `train.csv` has id / signal-string / label,
# `testA.csv` has id / signal-string only.
train = pd.read_csv('train.csv')
test = pd.read_csv('testA.csv')
train.head()
id heartbeat_signals label 0 0 0.9912297987616655,0.9435330436439665,0.764677... 0.0 1 1 0.9714822034884503,0.9289687459588268,0.572932... 0.0 2 2 1.0,0.9591487564065292,0.7013782792997189,0.23... 2.0 3 3 0.9757952826275774,0.9340884687738161,0.659636... 0.0 4 4 0.0,0.055816398940721094,0.26129357194994196,0... 2.0
train. shape
(100000, 3)
train. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 100000 non-null int64
1 heartbeat_signals 100000 non-null object
2 label 100000 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.3+ MB
test. head( )
id heartbeat_signals 0 100000 0.9915713654170097,1.0,0.6318163407681274,0.13... 1 100001 0.6075533139615096,0.5417083883163654,0.340694... 2 100002 0.9752726292239277,0.6710965234906665,0.686758... 3 100003 0.9956348033996116,0.9170249621481004,0.521096... 4 100004 1.0,0.8879490481178918,0.745564725322326,0.531...
def reduce_mem_usage(df):
    """Downcast every numeric column of *df* to the smallest dtype that can
    hold its value range, and convert object columns to 'category'.

    The DataFrame is modified in place and also returned, and the memory
    saving is printed.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same object with shrunken dtypes.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Inclusive bounds: a column whose extremes equal a dtype's
                # limits still fits that dtype (the original strict </>
                # comparisons needlessly promoted such columns one size up).
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                # NOTE: float16 keeps only ~3 significant decimal digits —
                # acceptable for these normalized heartbeat signals, lossy
                # in general.
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
# Expand each row's comma-separated `heartbeat_signals` string into one float
# column per sample point, keeping id first and label last.
expanded = [
    [row[0], *map(float, row[1].split(',')), row[2]]
    for row in train.values
]
train = pd.DataFrame(np.array(expanded))
n_signals = train.shape[1] - 2  # every column except id and label
train.columns = ['id'] + ['s_' + str(i) for i in range(n_signals)] + ['label']
train = reduce_mem_usage(train)
Memory usage of dataframe is 157.93 MB
Memory usage after optimization is: 39.67 MB
Decreased by 74.9%
train. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 207 entries, id to label
dtypes: float16(206), float32(1)
memory usage: 39.7 MB
train. columns
Index(['id', 'heartbeat_signals', 'label'], dtype='object')
# Same column expansion for the test set (which carries no label).
expanded_test = [
    [row[0], *map(float, row[1].split(','))]
    for row in test.values
]
test = pd.DataFrame(np.array(expanded_test))
test.columns = ['id'] + ['s_' + str(i) for i in range(test.shape[1] - 1)]
test = reduce_mem_usage(test)
Memory usage of dataframe is 31.43 MB
Memory usage after optimization is: 7.90 MB
Decreased by 74.9%
# Cache the expanded frames to disk, then reload them.
# BUG FIX: the test frame was written to 'test1.csv' but read back from
# 'test_1.csv', which raises FileNotFoundError at runtime — use one name
# consistently.
# NOTE(review): the round-trip through CSV discards the float16/category
# dtypes chosen by reduce_mem_usage (read_csv restores float64) — confirm
# that is intended before relying on the memory savings downstream.
train.to_csv('train_1.csv', index=False)
test.to_csv('test_1.csv', index=False)
train = pd.read_csv('train_1.csv')
test = pd.read_csv('test_1.csv')
test. shape
(20000, 206)
# Separate target from features and drop the identifier column.
y_train = train['label']
x_train = train.drop(['id', 'label'], axis=1)
x_test = test.drop(['id'], axis=1)
def abs_sum(y_pre, y_tru):
    """Competition metric: total absolute difference between the predicted
    and true (one-hot) probability matrices.

    Generalized to arrays of any dimensionality — the original
    ``sum(sum(abs(...)))`` raised TypeError on 1-D input because the inner
    ``sum`` already collapsed it to a scalar.

    Parameters
    ----------
    y_pre, y_tru : array-like of matching shape

    Returns
    -------
    numpy scalar — sum of element-wise absolute differences.
    """
    y_pre = np.asarray(y_pre)
    y_tru = np.asarray(y_tru)
    return np.abs(y_pre - y_tru).sum()
x_train. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 205 entries, s_0 to s_204
dtypes: float16(205)
memory usage: 39.1 MB
y_train
0 0.0
1 0.0
2 2.0
3 0.0
4 2.0
...
99995 0.0
99996 2.0
99997 3.0
99998 2.0
99999 0.0
Name: label, Length: 100000, dtype: float16
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """K-fold cross-validated training; returns the fold-averaged class
    probability matrix for ``test_x``.

    Parameters
    ----------
    clf : module exposing ``Dataset``/``train`` (only the LightGBM branch
        is implemented; any other ``clf_name`` raises NameError unchanged
        from the original).
    train_x, train_y : training features and integer-coded labels in 0..3.
    test_x : features to predict.
    clf_name : "lgb" selects the LightGBM branch.

    Returns
    -------
    np.ndarray of shape (len(test_x), 4) — probabilities averaged over folds.
    """
    folds = 2
    seed = 2021
    n_classes = 4
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    test_prob = np.zeros((test_x.shape[0], n_classes))
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'num_class': n_classes,
                'num_leaves': 2 ** 5,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': seed,
                'nthread': 28,
                'n_jobs': 24,
                'verbose': -1,
            }
            # NOTE(review): verbose_eval / early_stopping_rounds are keyword
            # arguments of lightgbm < 4.0; newer versions require callbacks
            # (log_evaluation / early_stopping) instead — confirm the pinned
            # version before upgrading.
            model = clf.train(params,
                              train_set=train_matrix,
                              valid_sets=valid_matrix,
                              num_boost_round=500,
                              verbose_eval=100,
                              early_stopping_rounds=50)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        # BUG FIX: the original re-fit a OneHotEncoder on each validation
        # fold, so a fold missing one of the 4 classes produced a matrix
        # with fewer columns than val_pred and corrupted the score.  Encode
        # against the fixed class set {0..3} instead.
        val_y_onehot = np.eye(n_classes)[np.asarray(val_y).astype(int)]
        print('预测的概率矩阵为:')
        print(test_pred)
        test_prob += test_pred
        score = abs_sum(val_y_onehot, val_pred)
        cv_scores.append(score)
        print(cv_scores)
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return test_prob / kf.n_splits
def lgb_model(x_train, y_train, x_test):
    """Thin wrapper: run the CV routine with the LightGBM branch selected."""
    return cv_model(lgb, x_train, y_train, x_test, "lgb")


lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1 ************************************
Training until validation scores don't improve for 50 rounds
[100] valid_0's multi_logloss: 0.0704711
[200] valid_0's multi_logloss: 0.0549909
[300] valid_0's multi_logloss: 0.0528245
Early stopping, best iteration is:
[307] valid_0's multi_logloss: 0.0527672
预测的概率矩阵为:
[[9.99957299e-01 3.65514675e-05 2.15328640e-06 3.99633487e-06]
[1.69790672e-04 1.68415918e-03 9.98145910e-01 1.39703148e-07]
[2.56058215e-06 1.92758463e-07 1.54282144e-06 9.99995704e-01]
...
[4.95018598e-02 1.97696400e-04 9.50268244e-01 3.21994803e-05]
[9.99949933e-01 4.93829878e-05 3.43103019e-07 3.41406778e-07]
[9.55306577e-01 4.78105718e-03 3.36174390e-02 6.29492698e-03]]
[1955.3947895231136]
************************************ 2 ************************************
Training until validation scores don't improve for 50 rounds
[100] valid_0's multi_logloss: 0.0682392
[200] valid_0's multi_logloss: 0.0526494
[300] valid_0's multi_logloss: 0.0504195
Early stopping, best iteration is:
[291] valid_0's multi_logloss: 0.0503616
预测的概率矩阵为:
[[9.99869813e-01 1.14722981e-04 2.99645841e-06 1.24677403e-05]
[9.79354868e-05 2.00967286e-03 9.97892104e-01 2.87704544e-07]
[4.55551526e-06 1.74254833e-07 3.71127623e-06 9.99991559e-01]
...
[4.87378712e-02 1.90985930e-04 9.51004592e-01 6.65512467e-05]
[9.99943945e-01 5.54682404e-05 3.49358046e-07 2.37799631e-07]
[9.08801290e-01 1.60743827e-03 6.74482993e-02 2.21429722e-02]]
[1955.3947895231136, 1935.860233737527]
lgb_scotrainre_list: [1955.3947895231136, 1935.860233737527]
lgb_score_mean: 1945.6275116303204
lgb_score_std: 9.767277892793231
# Write the submission: copy each averaged class-probability column into the
# host-provided template.
temp = pd.DataFrame(lgb_test)
result = pd.read_csv('sample_submit.csv')
for k in range(4):
    result['label_' + str(k)] = temp[k]
result.to_csv('submit.csv', index=False)