import pandas as pd
import numpy as np
import warnings
import matplotlib
import matplotlib. pyplot as plt
import seaborn as sns
from tsfresh import extract_features, select_features
warnings. filterwarnings( 'ignore' )
% matplotlib inline
import itertools
import matplotlib. gridspec as gridspec
from sklearn import datasets
from sklearn. linear_model import LogisticRegression
from sklearn. neighbors import KNeighborsClassifier
from sklearn. naive_bayes import GaussianNB
from sklearn. ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn. preprocessing import OneHotEncoder
from sklearn. model_selection import cross_val_score, train_test_split
from sklearn. model_selection import StratifiedKFold
from sklearn. model_selection import train_test_split
from sklearn. model_selection import StratifiedKFold
from sklearn. model_selection import train_test_split
import lightgbm as lgb
from sklearn. neural_network import MLPClassifier, MLPRegressor
from sklearn. metrics import mean_squared_error, mean_absolute_error
def reduce_mem_usage(df):
    """Downcast numeric columns in place to the smallest dtype that holds
    their observed value range, and convert object columns to 'category',
    printing the memory saved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Modified in place and also returned.

    Returns
    -------
    pandas.DataFrame
        The same frame, with downcast dtypes.
    """
    start_mem = df. memory_usage( ) . sum ( ) / 1024 ** 2
    print ( 'Memory usage of dataframe is {:.2f} MB' . format ( start_mem) )
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            # Repeated strings are far cheaper stored as a categorical.
            df[col] = df[col].astype('category')
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Use >= / <= (not > / <) so a value exactly at a dtype boundary
            # still fits the smallest type (e.g. 127 fits int8).
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
        else:
            # NOTE(review): float16 keeps only ~3 significant decimal digits.
            # That is tolerable for the tree models below, but confirm before
            # reusing this helper where precision matters.
            if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df. memory_usage( ) . sum ( ) / 1024 ** 2
    print ( 'Memory usage after optimization is: {:.2f} MB' . format ( end_mem) )
    print ( 'Decreased by {:.1f}%' . format ( 100 * ( start_mem - end_mem) / start_mem) )
    return df
# Load the precomputed tsfresh feature matrices and the raw training labels.
train_features = pd. read_csv( "train_features.csv" )
test_features = pd. read_csv( "test_features.csv" )
data_train = pd. read_csv( "train.csv" )
data_train_label = data_train[ "label" ]
# Keep only the features tsfresh finds statistically relevant to the label.
train_features_filtered = select_features( train_features, data_train_label)
# Notebook-style display of the filtered feature frame.
train_features_filtered
heartbeat_signals__sum_values heartbeat_signals__fft_coefficient__attr_"abs"__coeff_38 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_37 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_36 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_35 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_34 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_33 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_32 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_31 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_30 ... heartbeat_signals__fft_coefficient__attr_"abs"__coeff_84 heartbeat_signals__fft_coefficient__attr_"imag"__coeff_97 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_90 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_94 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_92 heartbeat_signals__fft_coefficient__attr_"real"__coeff_97 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_75 heartbeat_signals__fft_coefficient__attr_"real"__coeff_88 heartbeat_signals__fft_coefficient__attr_"real"__coeff_92 heartbeat_signals__fft_coefficient__attr_"real"__coeff_83 0 38.927945 0.660949 1.090709 0.848728 1.168685 0.982133 1.223496 1.236300 1.104172 1.497129 ... 0.531883 -0.047438 0.554370 0.307586 0.564596 0.562960 0.591859 0.504124 0.528450 0.473568 1 19.445634 1.718217 1.280923 1.850706 1.460752 1.924501 1.925485 1.715938 2.079957 1.818636 ... 0.563590 -0.109579 0.697446 0.398073 0.640969 0.270192 0.224925 0.645082 0.635135 0.297325 2 21.192974 1.814281 1.619051 1.215343 1.787166 2.146987 1.686190 1.540137 2.291031 2.403422 ... 0.712487 -0.074042 0.321703 0.390386 0.716929 0.316524 0.422077 0.722742 0.680590 0.383754 3 42.113066 2.109550 0.619634 2.366413 2.071539 1.000340 2.728281 1.391727 2.017176 2.610492 ... 0.601499 -0.184248 0.564669 0.623353 0.466980 0.651774 0.308915 0.550097 0.466904 0.494024 4 69.756786 0.194549 0.348882 0.092119 0.653924 0.231422 1.080003 0.711244 1.357904 1.237998 ... 
0.015292 0.070505 0.065835 0.051780 0.092940 0.103773 0.179405 -0.089611 0.091841 0.056867 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 99995 63.323449 0.840651 1.186210 1.396236 0.417221 2.036034 1.659054 0.500584 1.693545 0.859932 ... 0.779955 0.005525 0.486013 0.273372 0.705386 0.602898 0.447929 0.474844 0.564266 0.133969 99996 69.657534 1.557787 1.393960 0.989147 1.611333 1.793044 1.092325 0.507138 1.763940 2.677643 ... 0.539489 0.114670 0.579498 0.417226 0.270110 0.556596 0.703258 0.462312 0.269719 0.539236 99997 40.897057 0.469758 1.000355 0.706395 1.190514 0.674603 1.632769 0.229008 2.027802 0.302457 ... 0.282597 -0.474629 0.460647 0.478341 0.527891 0.904111 0.728529 0.178410 0.500813 0.773985 99998 42.333303 0.992948 1.354894 2.238589 1.237608 1.325212 2.785515 1.918571 0.814167 2.613950 ... 0.594252 -0.162106 0.694276 0.681025 0.357196 0.498088 0.433297 0.406154 0.324771 0.340727 99999 53.290117 1.624625 1.739088 2.936555 0.154759 2.921164 2.183932 1.485150 2.685922 0.583443 ... 0.463697 0.289364 0.285321 0.422103 0.692009 0.276236 0.245780 0.269519 0.681719 -0.053993
100000 rows × 708 columns
# Restrict the test set to exactly the columns selected on the training set,
# in the same order, so the two frames stay feature-aligned.
test_features= test_features. loc[ : , list ( train_features_filtered. columns) ]
test_features
heartbeat_signals__sum_values heartbeat_signals__fft_coefficient__attr_"abs"__coeff_38 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_37 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_36 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_35 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_34 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_33 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_32 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_31 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_30 ... heartbeat_signals__fft_coefficient__attr_"abs"__coeff_84 heartbeat_signals__fft_coefficient__attr_"imag"__coeff_97 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_90 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_94 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_92 heartbeat_signals__fft_coefficient__attr_"real"__coeff_97 heartbeat_signals__fft_coefficient__attr_"abs"__coeff_75 heartbeat_signals__fft_coefficient__attr_"real"__coeff_88 heartbeat_signals__fft_coefficient__attr_"real"__coeff_92 heartbeat_signals__fft_coefficient__attr_"real"__coeff_83 0 19.229863 2.381214 0.832151 2.509869 1.082112 2.517858 1.656104 2.257162 2.213421 1.815374 ... 0.563470 -0.040576 0.485441 0.472059 0.448018 0.449347 0.479950 0.480448 0.442279 0.355992 1 84.298932 0.987660 0.856174 0.616261 0.293339 0.191558 0.528684 1.010080 1.478182 1.713876 ... 0.037307 0.010074 0.272897 0.247538 0.286948 0.143829 0.189416 0.124293 0.154624 0.077530 2 47.789921 0.696393 1.165387 1.004378 0.951231 1.542114 0.946219 1.673430 1.445220 1.118439 ... 0.738423 -0.159505 0.418298 0.566628 0.849684 0.950851 0.779324 0.439255 0.839315 0.454957 3 47.069011 3.137668 0.044897 3.392946 3.054217 0.726293 3.582653 2.414946 1.257669 3.188068 ... 0.273142 0.366949 0.891690 0.214585 0.927562 0.648872 0.730178 0.606528 0.830105 0.662320 4 24.899397 0.496010 1.401020 0.536501 1.712592 1.044629 1.533405 1.330258 1.251771 1.441028 ... 
0.644046 -0.129700 0.578560 0.783258 0.480598 0.485003 0.667111 0.594234 0.447980 0.511133 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 19995 43.175130 1.776937 0.211527 1.986940 0.393550 1.693620 1.139395 1.459990 1.734535 1.025180 ... 0.546742 -0.060254 0.507950 0.560192 0.541534 0.249750 0.608796 0.455444 0.535306 0.268471 19996 31.030782 1.451045 2.483726 1.105440 1.979721 2.821799 0.475276 2.782573 2.827882 0.520034 ... 0.491662 0.016413 0.480380 0.459172 0.363756 0.427028 0.544692 0.754834 0.361866 0.536087 19997 31.648623 2.141301 0.546706 2.340499 1.362651 1.942634 2.043679 0.994065 2.248144 1.007128 ... 0.529880 0.001012 0.768960 0.834159 0.672114 0.520215 0.341519 0.713419 0.664354 0.370047 19998 19.305442 0.221708 2.355288 1.051282 1.742370 2.164058 0.435583 2.649994 1.190594 2.328580 ... 0.527500 -0.103574 0.521222 0.426435 0.636887 0.446365 0.551442 0.503703 0.635246 0.258394 19999 35.204569 0.827017 0.492990 1.627089 1.106799 0.639821 1.350155 0.533904 1.332401 1.229578 ... 0.248776 0.091218 0.659750 0.636282 0.319922 0.472824 0.355830 0.346311 0.312797 0.540855
20000 rows × 708 columns
# Replace the long tsfresh feature names with positional integers so the
# train/test frames carry simple, matching column labels.
test_features. columns = range ( test_features. shape[ 1 ] )
train_features_filtered. columns = range ( train_features_filtered. shape[ 1 ] )
train_features_filtered
0 1 2 3 4 5 6 7 8 9 ... 698 699 700 701 702 703 704 705 706 707 0 38.927945 0.660949 1.090709 0.848728 1.168685 0.982133 1.223496 1.236300 1.104172 1.497129 ... 0.531883 -0.047438 0.554370 0.307586 0.564596 0.562960 0.591859 0.504124 0.528450 0.473568 1 19.445634 1.718217 1.280923 1.850706 1.460752 1.924501 1.925485 1.715938 2.079957 1.818636 ... 0.563590 -0.109579 0.697446 0.398073 0.640969 0.270192 0.224925 0.645082 0.635135 0.297325 2 21.192974 1.814281 1.619051 1.215343 1.787166 2.146987 1.686190 1.540137 2.291031 2.403422 ... 0.712487 -0.074042 0.321703 0.390386 0.716929 0.316524 0.422077 0.722742 0.680590 0.383754 3 42.113066 2.109550 0.619634 2.366413 2.071539 1.000340 2.728281 1.391727 2.017176 2.610492 ... 0.601499 -0.184248 0.564669 0.623353 0.466980 0.651774 0.308915 0.550097 0.466904 0.494024 4 69.756786 0.194549 0.348882 0.092119 0.653924 0.231422 1.080003 0.711244 1.357904 1.237998 ... 0.015292 0.070505 0.065835 0.051780 0.092940 0.103773 0.179405 -0.089611 0.091841 0.056867 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 99995 63.323449 0.840651 1.186210 1.396236 0.417221 2.036034 1.659054 0.500584 1.693545 0.859932 ... 0.779955 0.005525 0.486013 0.273372 0.705386 0.602898 0.447929 0.474844 0.564266 0.133969 99996 69.657534 1.557787 1.393960 0.989147 1.611333 1.793044 1.092325 0.507138 1.763940 2.677643 ... 0.539489 0.114670 0.579498 0.417226 0.270110 0.556596 0.703258 0.462312 0.269719 0.539236 99997 40.897057 0.469758 1.000355 0.706395 1.190514 0.674603 1.632769 0.229008 2.027802 0.302457 ... 0.282597 -0.474629 0.460647 0.478341 0.527891 0.904111 0.728529 0.178410 0.500813 0.773985 99998 42.333303 0.992948 1.354894 2.238589 1.237608 1.325212 2.785515 1.918571 0.814167 2.613950 ... 0.594252 -0.162106 0.694276 0.681025 0.357196 0.498088 0.433297 0.406154 0.324771 0.340727 99999 53.290117 1.624625 1.739088 2.936555 0.154759 2.921164 2.183932 1.485150 2.685922 0.583443 ... 
0.463697 0.289364 0.285321 0.422103 0.692009 0.276236 0.245780 0.269519 0.681719 -0.053993
100000 rows × 708 columns
# Downcast dtypes (in place) to cut memory before model training.
train = reduce_mem_usage( train_features_filtered)
test = reduce_mem_usage( test_features)
Memory usage of dataframe is 540.16 MB
Memory usage after optimization is: 135.04 MB
Decreased by 75.0%
Memory usage of dataframe is 108.03 MB
Memory usage after optimization is: 27.01 MB
Decreased by 75.0%
# Hold out 20% of the training rows for validation.
# NOTE(review): no random_state is set, so this split is not reproducible
# across runs — consider fixing a seed.
X_train, X_val, y_train, y_val = train_test_split( train, data_train_label, test_size= 0.2 )
def build_model_rf(X_train, y_train):
    """Fit and return a 100-tree random-forest regressor on the given data."""
    forest = RandomForestRegressor(n_estimators=100)
    forest.fit(X_train, y_train)
    return forest
def build_model_lgb(X_train, y_train):
    """Fit and return a LightGBM regressor (63 leaves, lr 0.1, 100 trees)."""
    booster = lgb.LGBMRegressor(
        num_leaves=63,
        learning_rate=0.1,
        n_estimators=100,
    )
    booster.fit(X_train, y_train)
    return booster
def build_model_nn(X_train, y_train):
    """Fit and return a small (5, 2) lbfgs-trained MLP regressor."""
    net = MLPRegressor(
        alpha=1e-05,
        hidden_layer_sizes=(5, 2),
        random_state=1,
        solver='lbfgs',
    )
    net.fit(X_train, y_train)
    return net
# Fit each base model, then predict on the validation split and the test set.
print ( 'predict rf...' )
model_rf = build_model_rf(X_train, y_train)
val_rf = model_rf.predict(X_val)
subA_rf = model_rf.predict(test)
print ( 'predict lgb...' )
model_lgb = build_model_lgb(X_train, y_train)
val_lgb = model_lgb.predict(X_val)
# BUG FIX: the test-set predictions for lgb and nn previously reused
# model_rf, making all three stacking columns identical on the test set
# (visible in the stacking_X_test.head() output). Use the matching model.
subA_lgb = model_lgb.predict(test)
print ( 'predict NN...' )
model_nn = build_model_nn(X_train, y_train)
val_nn = model_nn.predict(X_val)
subA_nn = model_nn.predict(test)
predict rf...
predict lgb...
predict NN...
# Level-1 features: each base model's predictions on the training rows.
train_rf_pred = model_rf.predict(X_train)
train_lgb_pred = model_lgb.predict(X_train)
train_nn_pred = model_nn.predict(X_train)
# Assemble the three prediction vectors into the stacking design matrices
# for train, validation, and test.
stacking_X_train = pd.DataFrame({
    'Method_1': train_rf_pred,
    'Method_2': train_lgb_pred,
    'Method_3': train_nn_pred,
})
stacking_X_val = pd.DataFrame({
    'Method_1': val_rf,
    'Method_2': val_lgb,
    'Method_3': val_nn,
})
stacking_X_test = pd.DataFrame({
    'Method_1': subA_rf,
    'Method_2': subA_lgb,
    'Method_3': subA_nn,
})
stacking_X_test.head()
Method_1 Method_2 Method_3 0 0.07 0.07 0.07 1 2.00 2.00 2.00 2 3.00 3.00 3.00 3 0.02 0.02 0.02 4 0.00 0.00 0.00
# Train the level-2 (stacking) model on the base models' predictions.
# NOTE(review): despite the "lr" in its name, this is a random forest
# (build_model_rf), not a linear regression — confirm which was intended.
# NOTE(review): the base models were fit on this same training data, so the
# train MAE printed below is optimistic relative to the validation MAE;
# out-of-fold predictions would give an honest level-1 training signal.
model_lr_stacking = build_model_rf( stacking_X_train, y_train)
train_pre_Stacking = model_lr_stacking. predict( stacking_X_train)
print ( 'MAE of stacking:' , mean_absolute_error( y_train, train_pre_Stacking) )
val_pre_Stacking = model_lr_stacking. predict( stacking_X_val)
print ( 'MAE of stacking:' , mean_absolute_error( y_val, val_pre_Stacking) )
print ( 'Predict stacking...' )
subA_Stacking = model_lr_stacking. predict( stacking_X_test)
MAE of stacking: 0.0011627499999999997
MAE of stacking: 0.040751499999999996
Predict stacking...
# Notebook-style display of the final stacked test-set predictions.
subA_Stacking
array([0. , 2. , 3. , ..., 0. , 0. , 0.16])