Task05 HeartbeatClassification: Model Fusion

This post shows how to do feature selection with the tsfresh library and how to cut memory usage by downcasting data types. It then builds and trains random forest, LightGBM, and neural-network models, and stacks their predictions to improve performance. Finally, a random forest trained on the stacked predictions produces the final result.
import pandas as pd
import numpy as np
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from tsfresh import extract_features, select_features

warnings.filterwarnings('ignore')
%matplotlib inline

import itertools
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
# from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
# from mlxtend.plotting import plot_learning_curves
# from mlxtend.plotting import plot_decision_regions

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
def reduce_mem_usage(df):
    """Downcast each numeric column to the smallest dtype that holds its value range."""
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
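One caveat the function glosses over: float16 keeps only about three significant decimal digits, so aggressively downcast features lose some resolution (usually harmless for tree models, but worth knowing). A minimal usage sketch on made-up toy data, just to show the effect:

import numpy as np
import pandas as pd

demo = pd.DataFrame({
    'counter': np.arange(1000),      # int64 -> fits in int16
    'signal': np.random.rand(1000),  # float64 -> fits in float16
})
demo = reduce_mem_usage(demo)
print(demo.dtypes)  # counter: int16, signal: float16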
train_features = pd.read_csv("train_features.csv")
test_features = pd.read_csv("test_features.csv")
data_train = pd.read_csv("train.csv")
data_train_label = data_train["label"]
# Select features by the statistical relevance between each feature and the label
train_features_filtered = select_features(train_features, data_train_label)

train_features_filtered
[Output: a preview of the selected-feature DataFrame, 100000 rows × 708 columns. The visible columns include heartbeat_signals__sum_values and FFT-coefficient features such as heartbeat_signals__fft_coefficient__attr_"abs"__coeff_38 … coeff_30, plus assorted "real"/"imag" coefficients.]
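For context, select_features keeps only the columns whose univariate hypothesis test against the label survives tsfresh's multiple-testing correction, and it requires a matrix free of NaN/inf. A sketch of the usual preceding step, assuming train_features came out of extract_features in an earlier task of this series:

from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(train_features)  # in place: replaces NaN/±inf with finite values, required by select_features
train_features_filtered = select_features(train_features, data_train_label)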

test_features= test_features.loc[:,list(train_features_filtered.columns)]
test_features
[Output: a preview of the test-set DataFrame restricted to the same selected columns, 20000 rows × 708 columns.]

# Rename the columns, otherwise LightGBM raises an error: it rejects feature
# names containing special JSON characters such as the quotes in the tsfresh names
test_features.columns = range(test_features.shape[1])
train_features_filtered.columns = range(train_features_filtered.shape[1])
train_features_filtered
[Output: the same training DataFrame as above, now with columns renamed to the integers 0–707; 100000 rows × 708 columns.]
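The integer rename works but throws away the readable names. An alternative (a sketch, to be run instead of the rename above, not after it) is to strip only the characters LightGBM objects to:

import re
# Keep alphanumerics and underscores; note this can create duplicate names
# if two columns differ only in the stripped characters.
safe_cols = [re.sub(r'[^0-9a-zA-Z_]', '_', str(c)) for c in train_features_filtered.columns]
train_features_filtered.columns = safe_cols
test_features.columns = safe_cols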

train = reduce_mem_usage(train_features_filtered)
test = reduce_mem_usage(test_features)
Memory usage of dataframe is 540.16 MB
Memory usage after optimization is: 135.04 MB
Decreased by 75.0%
Memory usage of dataframe is 108.03 MB
Memory usage after optimization is: 27.01 MB
Decreased by 75.0%
X_train, X_val, y_train, y_val = train_test_split(train, data_train_label, test_size=0.2)
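The split above is purely random. Since the heartbeat classes in this competition tend to be imbalanced (class 0 dominates), a stratified and seeded variant keeps the class ratio identical in both folds and makes reruns comparable; a sketch, with an arbitrary seed:

X_train, X_val, y_train, y_val = train_test_split(
    train, data_train_label,
    test_size=0.2,
    stratify=data_train_label,  # preserve class proportions in both folds
    random_state=2021)          # any fixed seed works; 2021 is arbitrary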
# Single-model builder functions

# Random forest
def build_model_rf(X_train,y_train):
    model = RandomForestRegressor(n_estimators = 100)
    model.fit(X_train, y_train)
    return model

# LightGBM
def build_model_lgb(X_train,y_train):
    model = lgb.LGBMRegressor(num_leaves=63,learning_rate = 0.1,n_estimators = 100)
    model.fit(X_train, y_train)
    return model

# NN: multilayer-perceptron neural network
def build_model_nn(X_train,y_train):
    model = MLPRegressor(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1,solver='lbfgs')
    model.fit(X_train, y_train)
    return model
# Train the three single models; each of subA_rf / subA_lgb / subA_nn could be submitted on its own.
# None of them has been tuned, so they act as weak learners and their standalone scores may be modest.

print('predict rf...')
model_rf = build_model_rf(X_train,y_train)
val_rf = model_rf.predict(X_val)
subA_rf = model_rf.predict(test)


print('predict lgb...')
model_lgb = build_model_lgb(X_train,y_train)
val_lgb = model_lgb.predict(X_val)
subA_lgb = model_lgb.predict(test)


print('predict NN...')
model_nn = build_model_nn(X_train,y_train)
val_nn = model_nn.predict(X_val)
subA_nn = model_nn.predict(test)
predict rf...
predict lgb...
predict NN...
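Before stacking, it is worth checking each base model on the hold-out fold; mean_absolute_error is already imported at the top. A quick sketch:

for name, pred in [('rf', val_rf), ('lgb', val_lgb), ('nn', val_nn)]:
    print('MAE of {}: {:.4f}'.format(name, mean_absolute_error(y_val, pred)))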
## Stacking

## Layer 1: collect the base-model predictions
train_rf_pred = model_rf.predict(X_train)
train_lgb_pred = model_lgb.predict(X_train)
train_nn_pred = model_nn.predict(X_train)

stacking_X_train = pd.DataFrame()
stacking_X_train['Method_1'] = train_rf_pred
stacking_X_train['Method_2'] = train_lgb_pred
stacking_X_train['Method_3'] = train_nn_pred

stacking_X_val = pd.DataFrame()
stacking_X_val['Method_1'] = val_rf
stacking_X_val['Method_2'] = val_lgb
stacking_X_val['Method_3'] = val_nn

stacking_X_test = pd.DataFrame()
stacking_X_test['Method_1'] = subA_rf
stacking_X_test['Method_2'] = subA_lgb
stacking_X_test['Method_3'] = subA_nn
stacking_X_test.head()
   Method_1  Method_2  Method_3
0      0.07      0.07      0.07
1      2.00      2.00      2.00
2      3.00      3.00      3.00
3      0.02      0.02      0.02
4      0.00      0.00      0.00
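Note that the layer-1 training predictions are in-sample: each base model predicts on the very rows it was fit on, so the stacked training features are optimistically accurate. That is why the training MAE below comes out far smaller than the validation MAE. Classic stacking avoids this leakage with out-of-fold predictions; a sketch for the random forest column (the other two columns would be built the same way):

from sklearn.model_selection import KFold

oof_rf = np.zeros(len(X_train))
for trn_idx, oof_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X_train):
    fold_model = RandomForestRegressor(n_estimators=100)
    fold_model.fit(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
    oof_rf[oof_idx] = fold_model.predict(X_train.iloc[oof_idx])
# stacking_X_train['Method_1'] = oof_rf  # would replace the in-sample column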
# Layer 2: a random forest trained on the stacked layer-1 predictions
model_lr_stacking = build_model_rf(stacking_X_train,y_train)

## Training set
train_pre_Stacking = model_lr_stacking.predict(stacking_X_train)
print('MAE of stacking:',mean_absolute_error(y_train,train_pre_Stacking))

## Validation set
val_pre_Stacking = model_lr_stacking.predict(stacking_X_val)
print('MAE of stacking:',mean_absolute_error(y_val,val_pre_Stacking))

## Test set
print('Predict stacking...')
subA_Stacking = model_lr_stacking.predict(stacking_X_test)
MAE of stacking: 0.0011627499999999997
MAE of stacking: 0.040751499999999996
Predict stacking...
subA_Stacking
array([0.  , 2.  , 3.  , ..., 0.  , 0.  , 0.16])
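subA_Stacking holds continuous values even though the labels are the integers 0–3, because the regressors interpolate between classes. If the submission expects one-hot columns per class, which is an assumption here (the unused OneHotEncoder import at the top hints at it, but the exact format is not shown in this post), the predictions can be snapped to the nearest class and encoded:

classes = np.clip(np.round(subA_Stacking), 0, 3).astype(int)
submission = pd.DataFrame(np.eye(4)[classes],
                          columns=['label_0', 'label_1', 'label_2', 'label_3'])
# 'submission.csv' and the id scheme are hypothetical placeholders
submission.insert(0, 'id', range(len(classes)))
submission.to_csv('submission.csv', index=False)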