1、导入包并读取数据
# 包导入
import pandas as pd
import numpy as np
import tsfresh as tsf
from tsfresh import extract_features, select_features#时间序列特征处理工具 Tsfresh(TimeSeries Fresh)
from tsfresh.utilities.dataframe_functions import impute
# Load the labelled training set and the unlabelled test set
data_train = pd.read_csv("train.csv")
data_test_A = pd.read_csv("testA.csv")
# Report the (rows, columns) shape of each frame, one per line
print(data_train.shape, data_test_A.shape, sep="\n")
(100000, 3)
(20000, 2)
# Preview the first rows of the training data (id, raw signal string, label)
data_train.head()
id | heartbeat_signals | label | |
---|---|---|---|
0 | 0 | 0.9912297987616655,0.9435330436439665,0.764677... | 0.0 |
1 | 1 | 0.9714822034884503,0.9289687459588268,0.572932... | 0.0 |
2 | 2 | 1.0,0.9591487564065292,0.7013782792997189,0.23... | 2.0 |
3 | 3 | 0.9757952826275774,0.9340884687738161,0.659636... | 0.0 |
4 | 4 | 0.0,0.055816398940721094,0.26129357194994196,0... | 2.0 |
# Preview the first rows of the test data (id and raw signal string, no label)
data_test_A.head()
id | heartbeat_signals | |
---|---|---|
0 | 100000 | 0.9915713654170097,1.0,0.6318163407681274,0.13... |
1 | 100001 | 0.6075533139615096,0.5417083883163654,0.340694... |
2 | 100002 | 0.9752726292239277,0.6710965234906665,0.686758... |
3 | 100003 | 0.9956348033996116,0.9170249621481004,0.521096... |
4 | 100004 | 1.0,0.8879490481178918,0.745564725322326,0.531... |
2、数据预处理
# Reshape the comma-separated signal strings into long format (one row per
# sample) and add a per-signal time-step column named "time".
train_heartbeat_df = (
    data_train["heartbeat_signals"]
    .str.split(",", expand=True)  # one column per sample position
    .stack()  # columns -> rows, indexed by (original row, position)
    .reset_index()  # the two index levels become columns level_0 / level_1
    .set_index("level_0")  # level_0 is the index of the original data
    .rename(columns={"level_1": "time", 0: "heartbeat_signals"})
)
train_heartbeat_df.index.name = None  # the index name is not needed
# The split values are strings; convert them to floats
train_heartbeat_df["heartbeat_signals"] = train_heartbeat_df["heartbeat_signals"].astype(float)
train_heartbeat_df
# Step by step: split every signal string into one column per sample, then stack
# the columns into a single Series indexed by (original row, position).
train_heartbeat_df = data_train["heartbeat_signals"].str.split(",", expand=True).stack()
train_heartbeat_df
0 0 0.9912297987616655
1 0.9435330436439665
2 0.7646772997256593
3 0.6185708990212999
4 0.3796321642826237
...
99999 200 0.0
201 0.0
202 0.0
203 0.0
204 0.0
Length: 20500000, dtype: object
# Flatten the two index levels into ordinary columns named level_0 and level_1;
# the values end up in a column named 0.
train_heartbeat_df = train_heartbeat_df.reset_index()
train_heartbeat_df
level_0 | level_1 | 0 | |
---|---|---|---|
0 | 0 | 0 | 0.9912297987616655 |
1 | 0 | 1 | 0.9435330436439665 |
2 | 0 | 2 | 0.7646772997256593 |
3 | 0 | 3 | 0.6185708990212999 |
4 | 0 | 4 | 0.3796321642826237 |
... | ... | ... | ... |
20499995 | 99999 | 200 | 0.0 |
20499996 | 99999 | 201 | 0.0 |
20499997 | 99999 | 202 | 0.0 |
20499998 | 99999 | 203 | 0.0 |
20499999 | 99999 | 204 | 0.0 |
20500000 rows × 3 columns
# Use level_0 (the row number in the original data) as the index
train_heartbeat_df = train_heartbeat_df.set_index("level_0")
train_heartbeat_df
level_1 | 0 | |
---|---|---|
level_0 | ||
0 | 0 | 0.9912297987616655 |
0 | 1 | 0.9435330436439665 |
0 | 2 | 0.7646772997256593 |
0 | 3 | 0.6185708990212999 |
0 | 4 | 0.3796321642826237 |
... | ... | ... |
99999 | 200 | 0.0 |
99999 | 201 | 0.0 |
99999 | 202 | 0.0 |
99999 | 203 | 0.0 |
99999 | 204 | 0.0 |
20500000 rows × 2 columns
# Drop the index name ("level_0")
train_heartbeat_df.index.name = None
train_heartbeat_df
level_1 | 0 | |
---|---|---|
0 | 0 | 0.9912297987616655 |
0 | 1 | 0.9435330436439665 |
0 | 2 | 0.7646772997256593 |
0 | 3 | 0.6185708990212999 |
0 | 4 | 0.3796321642826237 |
... | ... | ... |
99999 | 200 | 0.0 |
99999 | 201 | 0.0 |
99999 | 202 | 0.0 |
99999 | 203 | 0.0 |
99999 | 204 | 0.0 |
20500000 rows × 2 columns
# Check the container type: after reset_index the stacked Series is a DataFrame
type(train_heartbeat_df)
pandas.core.frame.DataFrame
# level_1 is the position within each signal (the time step); column 0 holds the value
train_heartbeat_df.rename(columns={"level_1":"time", 0:"heartbeat_signals"}, inplace=True)# rename the columns
train_heartbeat_df
time | heartbeat_signals | |
---|---|---|
0 | 0 | 0.9912297987616655 |
0 | 1 | 0.9435330436439665 |
0 | 2 | 0.7646772997256593 |
0 | 3 | 0.6185708990212999 |
0 | 4 | 0.3796321642826237 |
... | ... | ... |
99999 | 200 | 0.0 |
99999 | 201 | 0.0 |
99999 | 202 | 0.0 |
99999 | 203 | 0.0 |
99999 | 204 | 0.0 |
20500000 rows × 2 columns
# The split values are still strings, so the column dtype is object
train_heartbeat_df.heartbeat_signals.dtypes
dtype('O')
# Convert the signal values from strings to floats
train_heartbeat_df["heartbeat_signals"] = train_heartbeat_df["heartbeat_signals"].astype(float)
train_heartbeat_df
time | heartbeat_signals | |
---|---|---|
0 | 0 | 0.991230 |
0 | 1 | 0.943533 |
0 | 2 | 0.764677 |
0 | 3 | 0.618571 |
0 | 4 | 0.379632 |
... | ... | ... |
99999 | 200 | 0.000000 |
99999 | 201 | 0.000000 |
99999 | 202 | 0.000000 |
99999 | 203 | 0.000000 |
99999 | 204 | 0.000000 |
20500000 rows × 2 columns
# Store the label column separately, then swap the raw signal strings for the
# long-format signal rows.
data_train_label = data_train["label"]
# DataFrame.join aligns the two frames on their index and defaults to a left
# join (how="left"), so each id row is repeated once per time step.
data_train = data_train.drop(columns=["label", "heartbeat_signals"]).join(train_heartbeat_df)
data_train
id | time | heartbeat_signals | |
---|---|---|---|
0 | 0 | 0 | 0.991230 |
0 | 0 | 1 | 0.943533 |
0 | 0 | 2 | 0.764677 |
0 | 0 | 3 | 0.618571 |
0 | 0 | 4 | 0.379632 |
... | ... | ... | ... |
99999 | 99999 | 200 | 0.000000 |
99999 | 99999 | 201 | 0.000000 |
99999 | 99999 | 202 | 0.000000 |
99999 | 99999 | 203 | 0.000000 |
99999 | 99999 | 204 | 0.000000 |
20500000 rows × 3 columns
# Inspect all time steps that belong to the signal with id 1
data_train.loc[data_train["id"] == 1]
id | time | heartbeat_signals | |
---|---|---|---|
1 | 1 | 0 | 0.971482 |
1 | 1 | 1 | 0.928969 |
1 | 1 | 2 | 0.572933 |
1 | 1 | 3 | 0.178457 |
1 | 1 | 4 | 0.122962 |
... | ... | ... | ... |
1 | 1 | 200 | 0.000000 |
1 | 1 | 201 | 0.000000 |
1 | 1 | 202 | 0.000000 |
1 | 1 | 203 | 0.000000 |
1 | 1 | 204 | 0.000000 |
205 rows × 3 columns
3、使用 tsfresh 进行时间序列特征处理
1.特征抽取 **Tsfresh(TimeSeries Fresh)** 是一个Python第三方工具包,它可以自动计算大量的时间序列数据的特征。此外,该包还包含了特征重要性评估、特征选择的方法,因此,不管是基于时序数据的分类问题还是回归问题,tsfresh都会是特征提取的一个不错的选择。官方文档:Introduction — tsfresh 0.17.1.dev24+g860c4e1 documentation
from tsfresh.feature_extraction import MinimalFCParameters, extract_features
# Feature extraction: rows are grouped by "id" and ordered by "time"; only the
# calculators listed in MinimalFCParameters are computed.
settings = MinimalFCParameters()
train_features = extract_features(
    data_train,
    column_id="id",
    column_sort="time",
    default_fc_parameters=settings,
)
train_features.head()
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 10/10 [01:02<00:00, 6.25s/it]
heartbeat_signals__sum_values | heartbeat_signals__median | heartbeat_signals__mean | heartbeat_signals__length | heartbeat_signals__standard_deviation | heartbeat_signals__variance | heartbeat_signals__root_mean_square | heartbeat_signals__maximum | heartbeat_signals__minimum | |
---|---|---|---|---|---|---|---|---|---|
0 | 38.927945 | 0.125531 | 0.189892 | 205.0 | 0.229783 | 0.052800 | 0.298093 | 1.000000 | 0.0 |
1 | 19.445634 | 0.030481 | 0.094857 | 205.0 | 0.169080 | 0.028588 | 0.193871 | 1.000000 | 0.0 |
2 | 21.192974 | 0.000000 | 0.103380 | 205.0 | 0.184119 | 0.033900 | 0.211157 | 1.000000 | 0.0 |
3 | 42.113066 | 0.241397 | 0.205430 | 205.0 | 0.186186 | 0.034665 | 0.277248 | 1.000000 | 0.0 |
4 | 69.756786 | 0.000000 | 0.340277 | 205.0 | 0.366213 | 0.134112 | 0.499901 | 0.999908 | 0.0 |
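2.特征选择 由于上面使用了MinimalFCParameters,train_features中只包含heartbeat_signals的9种基础统计特征(见上表);若使用tsfresh默认的完整参数设置,则可以提取数百种常见的时间序列特征(原教程中为779种,所有这些特征的解释可以去看官方文档)。提取出的特征中有的可能为NaN值(产生原因为当前数据不支持此类特征的计算),使用以下方式处理NaN值(impute会就地填充这些值,而不是删除对应的行或列):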
from tsfresh.utilities.dataframe_functions import impute
# Replace non-finite values in the extracted features. impute() is called for its
# in-place effect: the result is not assigned and train_features is reused below.
# Per the tsfresh documentation, NaN becomes the column median and -inf/+inf the
# column min/max (values are replaced, not dropped).
impute(train_features)
heartbeat_signals__sum_values | heartbeat_signals__median | heartbeat_signals__mean | heartbeat_signals__length | heartbeat_signals__standard_deviation | heartbeat_signals__variance | heartbeat_signals__root_mean_square | heartbeat_signals__maximum | heartbeat_signals__minimum | |
---|---|---|---|---|---|---|---|---|---|
0 | 38.927945 | 0.125531 | 0.189892 | 205.0 | 0.229783 | 0.052800 | 0.298093 | 1.000000 | 0.0 |
1 | 19.445634 | 0.030481 | 0.094857 | 205.0 | 0.169080 | 0.028588 | 0.193871 | 1.000000 | 0.0 |
2 | 21.192974 | 0.000000 | 0.103380 | 205.0 | 0.184119 | 0.033900 | 0.211157 | 1.000000 | 0.0 |
3 | 42.113066 | 0.241397 | 0.205430 | 205.0 | 0.186186 | 0.034665 | 0.277248 | 1.000000 | 0.0 |
4 | 69.756786 | 0.000000 | 0.340277 | 205.0 | 0.366213 | 0.134112 | 0.499901 | 0.999908 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
99995 | 63.323449 | 0.388402 | 0.308895 | 205.0 | 0.211636 | 0.044790 | 0.374441 | 1.000000 | 0.0 |
99996 | 69.657534 | 0.421138 | 0.339793 | 205.0 | 0.199966 | 0.039986 | 0.394266 | 1.000000 | 0.0 |
99997 | 40.897057 | 0.213306 | 0.199498 | 205.0 | 0.200657 | 0.040263 | 0.282954 | 1.000000 | 0.0 |
99998 | 42.333303 | 0.264974 | 0.206504 | 205.0 | 0.164380 | 0.027021 | 0.263941 | 1.000000 | 0.0 |
99999 | 53.290117 | 0.320124 | 0.259952 | 205.0 | 0.194868 | 0.037974 | 0.324883 | 1.000000 | 0.0 |
100000 rows × 9 columns
接下来,按照特征和响应变量之间的相关性进行特征选择,这一过程包含两步:首先单独计算每个特征和响应变量之间的相关性,然后利用Benjamini-Yekutieli procedure [1] 进行特征选择,决定哪些特征可以被保留。
from tsfresh import select_features
# Keep only the features that are statistically relevant to the label.
# NOTE(review): train_features is indexed by id while data_train_label keeps the
# original row index; the two coincide in the data preview (id == row number) —
# confirm before reordering or subsetting either one.
# NOTE(review): the label values are floats (0.0, 2.0, ...); tsfresh may treat a
# float target as regression when it infers the task — confirm whether
# classification should be requested explicitly.
train_features_filtered = select_features(train_features, data_train_label)
train_features_filtered
heartbeat_signals__sum_values | heartbeat_signals__median | heartbeat_signals__mean | heartbeat_signals__standard_deviation | heartbeat_signals__variance | heartbeat_signals__root_mean_square | heartbeat_signals__maximum | heartbeat_signals__minimum | |
---|---|---|---|---|---|---|---|---|
0 | 38.927945 | 0.125531 | 0.189892 | 0.229783 | 0.052800 | 0.298093 | 1.000000 | 0.0 |
1 | 19.445634 | 0.030481 | 0.094857 | 0.169080 | 0.028588 | 0.193871 | 1.000000 | 0.0 |
2 | 21.192974 | 0.000000 | 0.103380 | 0.184119 | 0.033900 | 0.211157 | 1.000000 | 0.0 |
3 | 42.113066 | 0.241397 | 0.205430 | 0.186186 | 0.034665 | 0.277248 | 1.000000 | 0.0 |
4 | 69.756786 | 0.000000 | 0.340277 | 0.366213 | 0.134112 | 0.499901 | 0.999908 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
99995 | 63.323449 | 0.388402 | 0.308895 | 0.211636 | 0.044790 | 0.374441 | 1.000000 | 0.0 |
99996 | 69.657534 | 0.421138 | 0.339793 | 0.199966 | 0.039986 | 0.394266 | 1.000000 | 0.0 |
99997 | 40.897057 | 0.213306 | 0.199498 | 0.200657 | 0.040263 | 0.282954 | 1.000000 | 0.0 |
99998 | 42.333303 | 0.264974 | 0.206504 | 0.164380 | 0.027021 | 0.263941 | 1.000000 | 0.0 |
99999 | 53.290117 | 0.320124 | 0.259952 | 0.194868 | 0.037974 | 0.324883 | 1.000000 | 0.0 |
100000 rows × 8 columns
可以看到经过特征选择,9个特征中留下了8个(取值恒为205的heartbeat_signals__length被剔除)。