零基础入门数据挖掘-心跳信号分类预测baseline
导入第三方包
import os
import gc
import math
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
读取数据
# Load the labelled training set and the unlabelled test set from CSV files in the working directory
train = pd.read_csv('train.csv')
test=pd.read_csv('testA.csv')
train.head()
id | heartbeat_signals | label | |
---|---|---|---|
0 | 0 | 0.9912297987616655,0.9435330436439665,0.764677... | 0.0 |
1 | 1 | 0.9714822034884503,0.9289687459588268,0.572932... | 0.0 |
2 | 2 | 1.0,0.9591487564065292,0.7013782792997189,0.23... | 2.0 |
3 | 3 | 0.9757952826275774,0.9340884687738161,0.659636... | 0.0 |
4 | 4 | 0.0,0.055816398940721094,0.26129357194994196,0... | 2.0 |
test.head()
id | heartbeat_signals | |
---|---|---|
0 | 100000 | 0.9915713654170097,1.0,0.6318163407681274,0.13... |
1 | 100001 | 0.6075533139615096,0.5417083883163654,0.340694... |
2 | 100002 | 0.9752726292239277,0.6710965234906665,0.686758... |
3 | 100003 | 0.9956348033996116,0.9170249621481004,0.521096... |
4 | 100004 | 1.0,0.8879490481178918,0.745564725322326,0.531... |
# 查看数据
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 100000 non-null int64
1 heartbeat_signals 100000 non-null object
2 label 100000 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.3+ MB
train.describe()
id | label | |
---|---|---|
count | 100000.000000 | 100000.000000 |
mean | 49999.500000 | 0.856960 |
std | 28867.657797 | 1.217084 |
min | 0.000000 | 0.000000 |
25% | 24999.750000 | 0.000000 |
50% | 49999.500000 | 0.000000 |
75% | 74999.250000 | 2.000000 |
max | 99999.000000 | 3.000000 |
train.label.value_counts()
0.0 64327
3.0 17912
2.0 14199
1.0 3562
Name: label, dtype: int64
# Bar chart of the number of training samples per label value
train.label.value_counts().plot(kind="bar")
plt.title("label class histogram")
plt.xlabel("label")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequency')
对label作计数统计,发现存在样本不均衡现象:label为1的样本数量最少(3562条),label为0的样本数量最多(64327条)
train.heartbeat_signals.apply(lambda x:len(x.split(",")))
0 205
1 205
2 205
3 205
4 205
...
99995 205
99996 205
99997 205
99998 205
99999 205
Name: heartbeat_signals, Length: 100000, dtype: int64
查看heartbeat_signals列,发现每个样本的心跳信号是一串以逗号分隔的浮点数,取值分布在0~1之间,长度均为205,且多数信号的末尾存在大量的0
数据预处理
train.memory_usage()
Index 128
id 800000
heartbeat_signals 800000
label 800000
dtype: int64
# 减少内存
def reduce_mem_usage(df):
    """Shrink a DataFrame in place by downcasting its columns.

    Signed and unsigned integer columns are cast to the narrowest integer
    type of the same signedness that holds their value range. Float columns
    are cast to float16 or float32 when their range fits strictly inside
    that type (this trades precision for memory), otherwise they stay as
    they are. Object columns become ``category``. Every other dtype
    (bool, datetime, already-categorical, pandas extension types) is left
    untouched, which also makes the function safe to call repeatedly on
    its own output.

    Args:
        df: DataFrame to optimise; it is modified in place.

    Returns:
        The same DataFrame object, after conversion.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    # memory_usage() reports bytes per column; sum them and convert to MB
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable ints, strings, ...) have no
        # safe numpy downcast, so only plain numpy dtypes are processed.
        if not isinstance(col_type, np.dtype):
            continue
        kind = col_type.kind
        if kind == 'O':
            # category stores integer codes plus one copy of each distinct value
            df[col] = df[col].astype('category')
            continue
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if kind == 'f':
            # Strict bounds: NaN or infinite extremes fail every comparison,
            # so such columns keep their current float type.
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    # Total memory after the conversions
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))
    return df
# Basic preprocessing: expand each comma-separated heartbeat string into one numeric column per point
# Training set
print("===============train===============")
# Every row becomes [id, signal_0, ..., signal_n, label]
train_list = [
    [record[0], *map(float, record[1].split(',')), record[2]]
    for record in train.values
]
train = pd.DataFrame(np.array(train_list))
# The first and last entries of a row are id and label; everything in between is signal
train.columns = ['id', *('s_{}'.format(k) for k in range(len(train_list[0]) - 2)), 'label']
train = reduce_mem_usage(train)
# Test set (no label column)
print("===============test===============")
test_list = [
    [record[0], *map(float, record[1].split(','))]
    for record in test.values
]
test = pd.DataFrame(np.array(test_list))
test.columns = ['id', *('s_{}'.format(k) for k in range(len(test_list[0]) - 1))]
test = reduce_mem_usage(test)
===============train===============
Memory usage of dataframe is 157.93 MB
Memory usage after optimization is: 39.67 MB
Decreased by 74.9%
===============test===============
Memory usage of dataframe is 31.43 MB
Memory usage after optimization is: 7.90 MB
Decreased by 74.9%
train.head()
id | s_0 | s_1 | s_2 | s_3 | s_4 | s_5 | s_6 | s_7 | s_8 | ... | s_196 | s_197 | s_198 | s_199 | s_200 | s_201 | s_202 | s_203 | s_204 | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.991211 | 0.943359 | 0.764648 | 0.618652 | 0.379639 | 0.190796 | 0.040222 | 0.026001 | 0.031708 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 1.0 | 0.971680 | 0.929199 | 0.572754 | 0.178467 | 0.122986 | 0.132324 | 0.094421 | 0.089600 | 0.030487 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 2.0 | 1.000000 | 0.958984 | 0.701172 | 0.231812 | 0.000000 | 0.080688 | 0.128418 | 0.187500 | 0.280762 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 |
3 | 3.0 | 0.975586 | 0.934082 | 0.659668 | 0.249878 | 0.237061 | 0.281494 | 0.249878 | 0.249878 | 0.241455 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 4.0 | 0.000000 | 0.055817 | 0.261230 | 0.359863 | 0.433105 | 0.453613 | 0.499023 | 0.542969 | 0.616699 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 |
5 rows × 207 columns
训练数据、测试数据准备
# Separate the feature columns from the target
x_train = train.drop(columns=['id', 'label'])  # training features
y_train = train['label']  # training labels
x_test = test.drop(columns=['id'])  # test features
数据下采样
# Undersampling: cut classes 0, 2 and 3 down to the number of label-1 rows
# Index labels of the label-1 rows, and how many there are
records_1_indices = np.array(train[train.label == 1.0].index)
number_records_1 = len(records_1_indices)
# Index labels of the rows of every other class
records_2_indices = np.array(train[train.label == 2.0].index)
records_3_indices = np.array(train[train.label == 3.0].index)
records_0_indices = np.array(train[train.label == 0.0].index)
# Draw, without replacement, as many rows from each other class as there are label-1 rows.
# np.random.choice raises ValueError if a class has fewer rows than requested.
# NOTE(review): no random seed is set here, so the drawn subset differs between runs.
random_2_indices = np.random.choice(records_2_indices, number_records_1, replace=False)
random_3_indices = np.random.choice(records_3_indices, number_records_1, replace=False)
random_0_indices = np.random.choice(records_0_indices, number_records_1, replace=False)
# Assemble the balanced training set
under_sample_indices = np.concatenate([random_0_indices, random_2_indices, random_3_indices, records_1_indices])
# The values come from .index, i.e. they are labels, so select with .loc rather than positional .iloc
under_sample_train = train.loc[under_sample_indices, :]
x_undersample_train = under_sample_train.drop(['id', 'label'], axis=1).reset_index(drop=True)
y_undersample_train = under_sample_train['label'].reset_index(drop=True)
模型训练
# Absolute-error loss
def abs_sum(y_pre, y_tru):
    """Return the sum of absolute element-wise differences.

    Args:
        y_pre: Predicted values, any array-like.
        y_tru: True values, array-like of the same (or a broadcastable) shape.

    Returns:
        Sum of ``|y_pre - y_tru|`` over all elements; 0.0 for empty input.
    """
    y_pre = np.array(y_pre)
    y_tru = np.array(y_tru)
    # A single vectorised reduction works for any number of dimensions
    return np.abs(y_pre - y_tru).sum()
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train a 4-class LightGBM model with 5-fold CV and average the test predictions.

    For every fold a model is trained with early stopping on the held-out
    part, the fold is scored with ``abs_sum`` between the one-hot encoded
    validation labels and the predicted probabilities, and the prediction
    for ``test_x`` is accumulated.

    Args:
        clf: Training library; it must provide ``Dataset`` and ``train``
            (the caller in this file passes the ``lightgbm`` module).
        train_x: Training features as a DataFrame (rows are taken with ``.iloc``).
        train_y: Training labels, array-like aligned row by row with
            ``train_x``. Values must be the class ids 0..3 (floats are
            accepted and truncated to int for the one-hot encoding).
        test_x: Test features.
        clf_name: Model name used in the log output; only ``"lgb"`` is supported.

    Returns:
        Array of shape ``(test_x.shape[0], 4)`` holding the class
        probabilities averaged over the folds.

    Raises:
        ValueError: If ``clf_name`` is not ``"lgb"``.
    """
    if clf_name != "lgb":
        raise ValueError("unsupported clf_name: {!r} (only 'lgb' is implemented)".format(clf_name))

    folds = 5
    seed = 2021
    num_class = 4

    # The parameters are identical for every fold, so build them once.
    # Only one thread-count key is given: the original passed both
    # 'nthread': 28 and 'n_jobs': 24, and LightGBM warned that 24 wins.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': num_class,
        'num_leaves': 2 ** 5,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.1,
        'seed': seed,
        'num_threads': 24,
        'verbose': -1,
    }

    # Cross-validation splitter
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    # Running sum of the per-fold test probabilities
    test = np.zeros((test_x.shape[0], num_class))
    cv_scores = []
    # kf.split yields positions, so index a plain array instead of relying
    # on the Series index happening to be 0..n-1
    labels = np.asarray(train_y)
    # Row k of the identity matrix is the one-hot vector of class k; unlike
    # an encoder fitted per fold this always has num_class columns
    one_hot_rows = np.eye(num_class)

    for i, (train_index, valid_index) in enumerate(kf.split(train_x)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)
        # NOTE(review): verbose_eval / early_stopping_rounds were removed from
        # lightgbm.train in LightGBM 4.0 in favour of callbacks - confirm the
        # installed version before upgrading.
        model = clf.train(dict(params),
                          train_set=train_matrix,
                          valid_sets=valid_matrix,
                          num_boost_round=2000,
                          verbose_eval=100,
                          early_stopping_rounds=200)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        val_onehot = one_hot_rows[val_y.astype(int)]
        print('预测的概率矩阵为:')
        print(test_pred)
        test += test_pred
        score = abs_sum(val_onehot, val_pred)
        cv_scores.append(score)
        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return test / kf.n_splits
def lgb_model(x_train, y_train, x_test):
    """Run the cross-validated LightGBM training and return its averaged test predictions."""
    return cv_model(lgb, x_train, y_train, x_test, "lgb")
# Train on the full training set and keep the averaged test-set predictions
lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0525735
[200] valid_0's multi_logloss: 0.0422444
[300] valid_0's multi_logloss: 0.0407076
[400] valid_0's multi_logloss: 0.0420398
Early stopping, best iteration is:
[289] valid_0's multi_logloss: 0.0405457
预测的概率矩阵为:
[[9.99969791e-01 2.85197261e-05 1.00341946e-06 6.85357631e-07]
[7.93287264e-05 7.69060914e-04 9.99151590e-01 2.00810971e-08]
[5.75356884e-07 5.04051497e-08 3.15322414e-07 9.99999059e-01]
...
[6.79267940e-02 4.30206297e-04 9.31640185e-01 2.81516302e-06]
[9.99960477e-01 3.94098074e-05 8.34030725e-08 2.94638661e-08]
[9.88705846e-01 2.14081630e-03 6.67418381e-03 2.47915423e-03]]
[607.0736049372185]
************************************ 2 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0566626
[200] valid_0's multi_logloss: 0.0450852
[300] valid_0's multi_logloss: 0.044078
[400] valid_0's multi_logloss: 0.0455546
Early stopping, best iteration is:
[275] valid_0's multi_logloss: 0.0437793
预测的概率矩阵为:
[[9.99991401e-01 7.69109547e-06 6.65504756e-07 2.42084688e-07]
[5.72380482e-05 1.32812809e-03 9.98614607e-01 2.66534396e-08]
[2.82123411e-06 4.13195205e-07 1.34026965e-06 9.99995425e-01]
...
[6.96398024e-02 6.52459907e-04 9.29685742e-01 2.19960932e-05]
[9.99972366e-01 2.75069005e-05 7.68142933e-08 5.07415018e-08]
[9.67263676e-01 7.26154408e-03 2.41533542e-02 1.32142531e-03]]
[607.0736049372185, 623.4313863731124]
************************************ 3 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0498722
[200] valid_0's multi_logloss: 0.038028
[300] valid_0's multi_logloss: 0.0358066
[400] valid_0's multi_logloss: 0.0361478
[500] valid_0's multi_logloss: 0.0379597
Early stopping, best iteration is:
[340] valid_0's multi_logloss: 0.0354344
预测的概率矩阵为:
[[9.99972032e-01 2.62406774e-05 1.17282152e-06 5.54230651e-07]
[1.05242811e-05 6.50215805e-05 9.99924453e-01 6.93812546e-10]
[1.93240868e-06 1.10384984e-07 3.76773426e-07 9.99997580e-01]
...
[1.34894410e-02 3.84569683e-05 9.86471555e-01 5.46564350e-07]
[9.99987431e-01 1.25532882e-05 1.03902298e-08 5.46727770e-09]
[9.78722948e-01 1.06329839e-02 6.94192038e-03 3.70214810e-03]]
[607.0736049372185, 623.4313863731124, 508.02381607269535]
************************************ 4 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0564768
[200] valid_0's multi_logloss: 0.0448698
[300] valid_0's multi_logloss: 0.0446719
[400] valid_0's multi_logloss: 0.0470399
Early stopping, best iteration is:
[250] valid_0's multi_logloss: 0.0438853
预测的概率矩阵为:
[[9.99979692e-01 1.70821979e-05 1.27048476e-06 1.95571841e-06]
[5.66207785e-05 4.02275314e-04 9.99541086e-01 1.82828519e-08]
[2.62267451e-06 3.58613522e-07 4.78645006e-06 9.99992232e-01]
...
[4.56636552e-02 5.69497433e-04 9.53758468e-01 8.37980573e-06]
[9.99896785e-01 1.02796802e-04 2.46636563e-07 1.72061021e-07]
[8.70911669e-01 1.73790185e-02 1.04478175e-01 7.23113697e-03]]
[607.0736049372185, 623.4313863731124, 508.02381607269535, 660.4867407547266]
************************************ 5 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.0506398
[200] valid_0's multi_logloss: 0.0396422
[300] valid_0's multi_logloss: 0.0381065
[400] valid_0's multi_logloss: 0.0390162
[500] valid_0's multi_logloss: 0.0414986
Early stopping, best iteration is:
[324] valid_0's multi_logloss: 0.0379497
预测的概率矩阵为:
[[9.99993352e-01 6.02902202e-06 1.13002685e-07 5.06277302e-07]
[1.03959552e-05 5.03778956e-04 9.99485820e-01 5.07638601e-09]
[1.92568065e-07 5.07155306e-08 4.94690856e-08 9.99999707e-01]
...
[8.83103121e-03 2.51969353e-05 9.91142776e-01 9.96143937e-07]
[9.99984791e-01 1.51997858e-05 5.62426491e-09 3.80450197e-09]
[9.86084001e-01 8.75968498e-04 1.09742304e-02 2.06580027e-03]]
[607.0736049372185, 623.4313863731124, 508.02381607269535, 660.4867407547266, 539.2160054696064]
lgb_scotrainre_list: [607.0736049372185, 623.4313863731124, 508.02381607269535, 660.4867407547266, 539.2160054696064]
lgb_score_mean: 587.6463107214719
lgb_score_std: 55.944536405714565
由于样本标签分布不均匀,前面对数据集做了下采样,再次训练模型:
# Retrain on the undersampled training set and predict the same test set
lgb_test_undersamped = lgb_model(x_undersample_train, y_undersample_train, x_test)
************************************ 1 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.114481
[200] valid_0's multi_logloss: 0.121278
[300] valid_0's multi_logloss: 0.142431
Early stopping, best iteration is:
[129] valid_0's multi_logloss: 0.11133
预测的概率矩阵为:
[[9.97909504e-01 2.05690126e-03 1.37408064e-05 1.98534778e-05]
[5.29540833e-05 2.71891022e-03 9.97227460e-01 6.76094883e-07]
[1.67245916e-05 2.31546330e-06 4.99075771e-06 9.99975969e-01]
...
[7.31721572e-02 3.65607531e-02 8.87518521e-01 2.74856896e-03]
[9.97227859e-01 2.75894404e-03 1.05707283e-05 2.62636571e-06]
[1.72415730e-01 3.26642460e-01 6.65459626e-02 4.34395848e-01]]
[279.89382785738206]
************************************ 2 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.14047
[200] valid_0's multi_logloss: 0.156166
[300] valid_0's multi_logloss: 0.182637
Early stopping, best iteration is:
[107] valid_0's multi_logloss: 0.139486
预测的概率矩阵为:
[[9.97637026e-01 2.26617101e-03 7.61022611e-05 2.07008734e-05]
[1.73246625e-04 7.10584410e-03 9.92715902e-01 5.00693350e-06]
[2.54639765e-05 7.61968678e-06 3.51436283e-05 9.99931773e-01]
...
[7.12577218e-02 2.33032167e-01 6.92863112e-01 2.84699897e-03]
[9.96584919e-01 3.37795178e-03 2.52579171e-05 1.18714552e-05]
[2.97622604e-01 4.02905434e-01 1.41173366e-01 1.58298597e-01]]
[279.89382785738206, 364.5849925070536]
************************************ 3 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.117352
[200] valid_0's multi_logloss: 0.116617
[300] valid_0's multi_logloss: 0.135823
Early stopping, best iteration is:
[142] valid_0's multi_logloss: 0.111634
预测的概率矩阵为:
[[9.98782396e-01 1.15469066e-03 4.32136748e-05 1.96997210e-05]
[2.49293743e-05 1.90295485e-03 9.98071825e-01 2.90581742e-07]
[9.10024124e-06 2.03901034e-06 2.20466441e-06 9.99986656e-01]
...
[7.91691326e-02 7.11461731e-03 9.13350712e-01 3.65538181e-04]
[9.99115803e-01 8.77129392e-04 4.65368198e-06 2.41390238e-06]
[1.20920311e-01 5.60310816e-01 1.27816215e-01 1.90952657e-01]]
[279.89382785738206, 364.5849925070536, 291.81448543469014]
************************************ 4 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.118365
[200] valid_0's multi_logloss: 0.115946
[300] valid_0's multi_logloss: 0.128339
Early stopping, best iteration is:
[136] valid_0's multi_logloss: 0.112798
预测的概率矩阵为:
[[9.99333742e-01 6.43997815e-04 1.77285801e-05 4.53125727e-06]
[1.85250042e-05 2.45895523e-03 9.97522087e-01 4.32881200e-07]
[5.27576705e-06 9.55304395e-07 5.44191422e-06 9.99988327e-01]
...
[1.12449235e-01 3.01623502e-02 8.56576699e-01 8.11715852e-04]
[9.96615832e-01 3.36790701e-03 9.43435168e-06 6.82690020e-06]
[5.71821675e-01 2.57162461e-01 4.54324005e-02 1.25583463e-01]]
[279.89382785738206, 364.5849925070536, 291.81448543469014, 296.11137994407875]
************************************ 5 ************************************
[LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24
Training until validation scores don't improve for 200 rounds
[100] valid_0's multi_logloss: 0.116624
[200] valid_0's multi_logloss: 0.127048
[300] valid_0's multi_logloss: 0.148542
Early stopping, best iteration is:
[128] valid_0's multi_logloss: 0.114111
预测的概率矩阵为:
[[9.99799720e-01 1.73361480e-04 1.06621394e-05 1.62564574e-05]
[4.35885896e-05 1.67797711e-03 9.98277835e-01 5.99544292e-07]
[1.21090595e-05 4.21721375e-06 1.20332494e-05 9.99971640e-01]
...
[5.04861158e-02 5.72191215e-03 9.43613485e-01 1.78486939e-04]
[9.94659919e-01 5.32772250e-03 6.54049986e-06 5.81780206e-06]
[7.94347056e-02 2.73343032e-01 7.71439116e-02 5.70078350e-01]]
[279.89382785738206, 364.5849925070536, 291.81448543469014, 296.11137994407875, 296.42967075096817]
lgb_scotrainre_list: [279.89382785738206, 364.5849925070536, 291.81448543469014, 296.11137994407875, 296.42967075096817]
lgb_score_mean: 305.76687129883453
lgb_score_std: 30.013552465720718