特征工程
学习目标
学习时间序列数据的特征预处理方法
学习时间序列特征处理工具 tsfresh(Time Series FeatuRe extraction on basis of Scalable Hypothesis tests)的使用
数据预处理
时间序列数据格式处理、加入时间步特征time
特征工程
时间序列特征构造、特征筛选、使用tsfresh进行时间序列特征处理
# 库函数导入
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tsfresh as tsf
from tsfresh import extract_features,select_features
from tsfresh.utilities.dataframe_functions import impute
# Load the training and test sets, then print each frame's shape on its own line
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("testA.csv")
print(data_train.shape, data_test.shape, sep="\n")
(100000, 3)
(20000, 2)
# Preview the first rows of the training set (columns: id, heartbeat_signals, label)
data_train.head()
| | id | heartbeat_signals | label |
---|---|---|---|
0 | 0 | 0.9912297987616655,0.9435330436439665,0.764677... | 0.0 |
1 | 1 | 0.9714822034884503,0.9289687459588268,0.572932... | 0.0 |
2 | 2 | 1.0,0.9591487564065292,0.7013782792997189,0.23... | 2.0 |
3 | 3 | 0.9757952826275774,0.9340884687738161,0.659636... | 0.0 |
4 | 4 | 0.0,0.055816398940721094,0.26129357194994196,0... | 2.0 |
# Preview the first rows of the test set (columns: id, heartbeat_signals; no label column)
data_test.head()
| | id | heartbeat_signals |
---|---|---|
0 | 100000 | 0.9915713654170097,1.0,0.6318163407681274,0.13... |
1 | 100001 | 0.6075533139615096,0.5417083883163654,0.340694... |
2 | 100002 | 0.9752726292239277,0.6710965234906665,0.686758... |
3 | 100003 | 0.9956348033996116,0.9170249621481004,0.521096... |
4 | 100004 | 1.0,0.8879490481178918,0.745564725322326,0.531... |
数据预处理
# Reshape the heartbeat signals from wide to long: split each comma-separated
# string into one column per sample, then stack so every sample becomes its own
# row. After reset_index(), "level_0" carries data_train's row index and
# "level_1" the sample's position within its signal; "level_0" is then put back
# as the index so each long-format row still points at its source record.
train_heartbeat_df = (
    data_train["heartbeat_signals"]
    .str.split(",", expand=True)
    .stack()
    .reset_index()
    .set_index("level_0")
)
# Clear the leftover "level_0" label from the index.
train_heartbeat_df.index.name = None
train_heartbeat_df.rename(columns=