TimesNet处理UEA数据集，用在InceptionTime上

最新推荐文章于 2024-10-31 17:33:22 发布

刘泓君

最新推荐文章于 2024-10-31 17:33:22 发布

阅读量2.8k

点赞数 4

文章标签： python pandas 机器学习

本文链接：https://blog.csdn.net/weixin_44907625/article/details/129666665

版权

class UEAloader(Dataset):
    """
    Dataset class for datasets included in:
        Time Series Classification Archive (www.timeseriesclassification.com)
    Argument:
        root_path: directory that contains the `.ts` file(s) of the dataset
        file_list: optional list of file names inside `root_path` to consider
        limit_size: for debugging; a value > 1 is used as an absolute number of samples,
            a value in (0, 1] as a proportion of the available samples
        flag: optional regular expression; only paths matching it (via `re.search`) are kept
    Attributes:
        all_df: (num_samples * seq_len, num_columns) dataframe indexed by integer indices, with multiple rows corresponding to the same index (sample).
            Each row is a time step; Each column contains either metadata (e.g. timestamp) or a feature.
        feature_df: (num_samples * seq_len, feat_dim) dataframe; contains the subset of columns of `all_df` which correspond to selected features
        feature_names: names of columns contained in `feature_df` (same as feature_df.columns)
        all_IDs: (num_samples,) series of IDs contained in `all_df`/`feature_df` (same as all_df.index.unique() )
        labels_df: (num_samples, num_labels) pd.DataFrame of label(s) for each sample
        class_names: categories of the label column, in the order of their integer codes
        max_seq_len: maximum sequence (time series) length. If None, script argument `max_seq_len` will be used.
            (Moreover, script argument overrides this attribute)
    """

    def __init__(self, root_path, file_list=None, limit_size=None, flag=None):
        self.root_path = root_path
        self.all_df, self.labels_df = self.load_all(root_path, file_list=file_list, flag=flag)
        self.all_IDs = self.all_df.index.unique()  # all sample IDs (integer indices 0 ... num_samples-1)

        if limit_size is not None:
            if limit_size > 1:
                limit_size = int(limit_size)
            else:  # interpret as proportion if in (0, 1]
                limit_size = int(limit_size * len(self.all_IDs))
            self.all_IDs = self.all_IDs[:limit_size]
            self.all_df = self.all_df.loc[self.all_IDs]

        # use all features
        self.feature_names = self.all_df.columns
        self.feature_df = self.all_df

        # pre_process
        normalizer = Normalizer()
        self.feature_df = normalizer.normalize(self.feature_df)
        print(len(self.all_IDs))

    def load_all(self, root_path, file_list=None, flag=None):
        """
        Loads the dataset from the `.ts` files contained in `root_path` into a dataframe, optionally
        keeping only the paths that match the regular expression `flag`
        Args:
            root_path: directory containing the dataset files
            file_list: optionally, provide a list of file paths within `root_path` to consider.
                Otherwise, entire `root_path` contents will be used.
            flag: optional regular expression applied with `re.search` to every candidate path
        Returns:
            all_df: a single (possibly concatenated) dataframe with all data corresponding to specified files
            labels_df: dataframe containing label(s) for each sample
        Raises:
            Exception: if no candidate path exists, or none of them is a `.ts` file matching `flag`
        """
        # Select paths for training and evaluation
        if file_list is None:
            data_paths = glob.glob(os.path.join(root_path, '*'))  # list of all paths
        else:
            data_paths = [os.path.join(root_path, p) for p in file_list]
        if len(data_paths) == 0:
            raise Exception('No files found using: {}'.format(os.path.join(root_path, '*')))
        if flag is not None:
            data_paths = list(filter(lambda x: re.search(flag, x), data_paths))
        input_paths = [p for p in data_paths if os.path.isfile(p) and p.endswith('.ts')]
        if len(input_paths) == 0:
            # `flag` is the only filtering pattern of this method; the name `pattern` used here
            # before was undefined and turned this error path into a NameError.
            raise Exception("No .ts files found using pattern: '{}'".format(flag))

        all_df, labels_df = self.load_single(input_paths[0])  # a single file contains dataset

        return all_df, labels_df

    def load_single(self, filepath):
        """
        Loads one `.ts` file and reshapes it into the flat layout used by this class.
        Also sets `self.class_names` and `self.max_seq_len`.
        Args:
            filepath: path of the `.ts` file
        Returns:
            df: dataframe with one row per time step, indexed by the integer ID of the sample
            labels_df: dataframe with the integer class code of every sample
        """
        df, labels = load_data.load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True,
                                                             replace_missing_vals_with='NaN')
        labels = pd.Series(labels, dtype="category")
        self.class_names = labels.cat.categories
        labels_df = pd.DataFrame(labels.cat.codes,
                                 dtype=np.int8)  # int8-32 gives an error when using nn.CrossEntropyLoss

        lengths = df.applymap(
            lambda x: len(x)).values  # (num_samples, num_dimensions) array containing the length of each series

        # difference between every dimension's length and the first dimension's length of the same sample
        horiz_diffs = np.abs(lengths - np.expand_dims(lengths[:, 0], -1))

        if np.sum(horiz_diffs) > 0:  # if any row (sample) has varying length across dimensions
            df = df.applymap(subsample)

        lengths = df.applymap(lambda x: len(x)).values
        # difference between every sample's lengths and the lengths of the first sample
        vert_diffs = np.abs(lengths - np.expand_dims(lengths[0, :], 0))
        if np.sum(vert_diffs) > 0:  # if any column (dimension) has varying length across samples
            self.max_seq_len = int(np.max(lengths[:, 0]))
        else:
            self.max_seq_len = int(lengths[0, 0])  # plain int in both branches

        # First create a (seq_len, feat_dim) dataframe for each sample, indexed by a single integer ("ID" of the sample)
        # Then concatenate into a (num_samples * seq_len, feat_dim) dataframe, with multiple rows corresponding to the
        # sample index (i.e. the same scheme as all datasets in this project)

        df = pd.concat((pd.DataFrame({col: df.loc[row, col] for col in df.columns}).reset_index(drop=True).set_index(
            pd.Series(lengths[row, 0] * [row])) for row in range(df.shape[0])), axis=0)

        # Replace NaN values (interpolation is done separately inside every sample)
        grp = df.groupby(by=df.index)
        df = grp.transform(interpolate_missing)

        return df, labels_df

    def instance_norm(self, case):
        """
        Normalizes a single sample, but only when `root_path` contains 'EthanolConcentration';
        every other dataset is returned unchanged.
        Args:
            case: torch tensor holding the values of one sample
        Returns:
            the (possibly normalized) tensor
        """
        if self.root_path.count('EthanolConcentration') > 0:  # special process for numerical stability
            mean = case.mean(0, keepdim=True)
            case = case - mean
            # NOTE(review): the mean is taken over dim 0 while the variance is taken over dim 1 -
            # confirm this asymmetry is intended.
            stdev = torch.sqrt(torch.var(case, dim=1, keepdim=True, unbiased=False) + 1e-5)
            case /= stdev
            return case
        else:
            return case

    def __getitem__(self, ind):
        """Returns the (features, labels) pair of tensors of the `ind`-th sample ID."""
        return self.instance_norm(torch.from_numpy(self.feature_df.loc[self.all_IDs[ind]].values)), \
               torch.from_numpy(self.labels_df.loc[self.all_IDs[ind]].values)

    def __len__(self):
        """Number of samples (after an optional `limit_size` truncation)."""
        return len(self.all_IDs)

比较关键的几个函数：

从tsfile文件中读取dataframe：

from sktime.utils import load_data
load_data.load_from_tsfile_to_dataframe

将数据集中categories变为分类标签：

labels = pd.Series(labels, dtype="category")
        self.class_names = labels.cat.categories
        labels_df = pd.DataFrame(labels.cat.codes,
                                 dtype=np.int8)

pd.Series.cat.codes 是 Pandas 库中 Series 对象的一个方法，用于将分类变量转换为分类编码（Category Codes）。分类变量是指具有有限可能取值的变量，例如性别、职业等。在机器学习中，经常需要将分类变量转换为数值编码，以便于机器学习算法的处理和分析。
pd.Series.cat.codes 方法可以将 Series 对象中的分类变量转换为整数编码。该方法返回一个包含整数编码的 Series 对象，其中每个不同的分类变量都被映射到一个唯一的整数值。例如，如果一个 Series 对象包含了不同的颜色分类变量，那么 pd.Series.cat.codes 方法将会将这些分类变量映射为 0、1、2、3 等整数编码，以便于进一步的处理和分析。
下面是一个示例代码：
python
Copy code
import pandas as pd
# Build a Series that holds a categorical (string-valued) variable.
colors = pd.Series(["red", "blue", "green", "blue", "green", "red"])
# Convert the values to integer category codes.
# Categories are sorted, so blue -> 0, green -> 1, red -> 2.
color_codes = colors.astype('category').cat.codes
# Print the resulting codes.
print(color_codes)
在上述示例中，colors.astype('category').cat.codes 将包含颜色分类变量的 Series 对象转换为分类编码。由于类别默认按字母顺序排序（blue、green、red），最终输出的编码为：2 0 1 0 1 2。其中，"blue" 被映射为 0，"green" 被映射为 1，"red" 被映射为 2。

统一数据长度：

    lengths = df.applymap(
        lambda x: len(x)).values  # (num_samples, num_dimensions) array containing the length of each series

    horiz_diffs = np.abs(lengths - np.expand_dims(lengths[:, 0], -1))
   	if np.sum(horiz_diffs) > 0:  # if any row (sample) has varying length across dimensions
        df = df.applymap(subsample)

def subsample(y, limit=256, factor=2):
    """
    Thin out a Series that has more than `limit` entries by keeping every `factor`-th entry
    (with a fresh 0-based index); a Series of at most `limit` entries is returned as is
    """
    if len(y) <= limit:
        return y
    kept = y[::factor]
    return kept.reset_index(drop=True)

在这段代码中，df.applymap() 被用于对数据框中的每个元素应用一个 lambda 函数，该函数用于计算序列的长度。注意这里数据框的每个单元格并不是字符串，而是一个 pd.Series（某个样本在某个维度上的整条时间序列）。具体地，len(x) 计算序列 x 的长度（即时间点数），而 df.applymap(lambda x: len(x)) 对数据框中的每个单元格都应用了这个函数，生成了一个包含每条序列长度的数据框。
接着，.values 被用于将该数据框转换为一个 Numpy 数组，其中每个元素都是一条序列的长度。这个 Numpy 数组的形状为 (num_samples, num_dimensions)，其中 num_samples 是数据集中的样本数，num_dimensions 是每个时间序列的维度或特征数。
最后，np.expand_dims(lengths[:, 0], -1) 被用于创建一个形状为 (num_samples, 1) 的数组，其中每个元素都是对应样本第一个维度的序列长度。np.abs(lengths - np.expand_dims(lengths[:, 0], -1)) 计算了同一个样本中每个维度的序列长度与该样本第一个维度的序列长度的绝对差异。这个数组的形状与 lengths 相同，即 (num_samples, num_dimensions)；只要其中存在非零元素，就说明某个样本在不同维度上的序列长度不一致。

在这段代码中（指 subsample 函数），y 是一个 Pandas Series 对象，表示某个样本在某个维度上的一条时间序列（而不是标签或类别）。[::factor] 通过 Python 的切片操作，每隔 factor 个元素保留一个元素，从而完成下采样。例如，如果 factor 为 2，则每隔一个元素保留一个元素，即将 y 的采样率减半。注意只有当序列长度超过 limit（默认 256）时才会进行下采样，否则原样返回。
.reset_index(drop=True) 将下采样后的 Series 对象的索引重新排序，并将其从 0 开始标号。drop=True 表示放弃原来的索引，使用新的索引。
最终，该函数返回下采样后的 Series 对象，其中包含了每隔 factor 个元素保留一个元素之后的时间序列取值。

4.统一维度：

    lengths = df.applymap(lambda x: len(x)).values
    vert_diffs = np.abs(lengths - np.expand_dims(lengths[0, :], 0))

最后，使用 np.abs(lengths - np.expand_dims(lengths[0, :], 0)) 计算每个时间序列在不同维度上的长度与第一个时间序列在不同维度上的长度的绝对差异。np.expand_dims(lengths[0, :], 0) 用于将第一个时间序列的长度扩展为 (1, num_dimensions) 的形状，以便于与每个时间序列的长度进行差异计算。计算完成后，vert_diffs 是一个 (num_samples, num_dimensions) 形状的 Numpy 数组，其中每个元素都表示对应时间序列在该维度上与第一个时间序列在该维度上长度的绝对差异。

合并重组：

    # First create a (seq_len, feat_dim) dataframe for each sample, indexed by a single integer ("ID" of the sample)
    # Then concatenate into a (num_samples * seq_len, feat_dim) dataframe, with multiple rows corresponding to the
    # sample index (i.e. the same scheme as all datasets in this project)

    df = pd.concat((pd.DataFrame({col: df.loc[row, col] for col in df.columns}).reset_index(drop=True).set_index(
        pd.Series(lengths[row, 0] * [row])) for row in range(df.shape[0])), axis=0)

这段代码首先使用生成器表达式 (pd.DataFrame({col: df.loc[row, col] for col in df.columns}).reset_index(drop=True).set_index(pd.Series(lengths[row, 0] * [row])) for row in range(df.shape[0]))，对数据集中的每个样本进行处理，为每个样本生成一个 Pandas DataFrame 对象。具体地，这个生成器表达式首先将一个样本的所有维度的时间序列数据组成一个 Pandas DataFrame 对象，其中每行表示该样本在一个时间点的特征向量，每列表示一个特征维度。生成的 DataFrame 对象的形状为 (seq_len, feat_dim)，其中 seq_len 表示该样本时间序列的长度（即时间点数），feat_dim 表示特征的维度。
接着，通过 reset_index(drop=True) 对该 DataFrame 对象的索引进行重置，即使用从 0 开始的连续整数作为新的索引，并将其保存到新的 DataFrame 对象中。
然后，通过 set_index(pd.Series(lengths[row, 0] * [row])) 对新的 DataFrame 对象的索引进行设置，将该样本所有行的索引都设置为当前样本的 ID。具体地，pd.Series(lengths[row, 0] * [row]) 生成一个形状为 (seq_len,) 的 Pandas Series 对象，其中每个元素都等于当前样本的 ID（即 row）。这个 Series 对象被用作新的索引，因此新的 DataFrame 对象的每一行都以样本 ID 作为索引：同一个样本的多行共享同一个索引值，索引中并不包含时间点信息，时间顺序由行的先后顺序隐式表示。由于样本 ID 存放在索引而不是列中，这个 DataFrame 对象的形状仍为 (seq_len, feat_dim)。
最后，通过 pd.concat() 将所有样本的时间序列数据拼接在一起，生成一个形状为 (num_samples * seq_len, feat_dim) 的 Pandas DataFrame 对象，其中每行表示一个样本在一个时间点的特征向量。样本 ID 保存在这个 DataFrame 对象的索引中，各列是该时间点在各个维度上的特征值。

做的最后一步是将所有样本的时间序列数据拼接在一起，生成一个 Pandas DataFrame 对象，其中每行表示一个样本在一个时间点的特征向量。这个 DataFrame 对象的形状为 (num_samples * seq_len, feat_dim)，其中 num_samples 是数据集中的样本数，seq_len 是每个样本的时间序列长度（拼接时不会做填充，如果各样本长度不同，总行数就是所有样本长度之和），feat_dim 是每个时间序列的特征数或维度。

这个操作的目的是将原始数据集转换为一个标准的格式，以便于进行后续的处理和分析。在转换后的数据集中，每行表示一个时间序列在一个样本中的一个时间点的特征向量，方便应用机器学习算法进行分类、聚类或其他任务。同时，由于数据集已经被整理为一个二维的表格形式，可以更方便地应用 Pandas 和其他 Python 库进行数据预处理和可视化，以及进一步的数据分析和建模。

batch：

def collate_fn(data, max_len=None):
    """Build mini-batch tensors from a list of (X, y) tuples, padding or clipping the time axis.
    Args:
        data: len(batch_size) list of tuples (X, y).
            - X: torch tensor of shape (seq_length, feat_dim); variable seq_length.
            - y: torch tensor of shape (num_labels,) : class indices or numerical targets
                (for classification or regression, respectively). num_labels > 1 for multi-task models
        max_len: global fixed sequence length. Used for architectures requiring fixed length input,
            where the batch length cannot vary dynamically. Longer sequences are clipped, shorter are padded with 0s.
            If None, the longest sequence of the batch is used.
    Returns:
        X: (batch_size, padded_length, feat_dim) torch tensor of zero-padded features (input)
        targets: (batch_size, num_labels) torch tensor of the stacked labels
        padding_masks: (batch_size, padded_length) boolean tensor, 1 means keep vector at this position, 0 means padding
    """

    batch_size = len(data)
    features, labels = zip(*data)

    # Stack and pad features and masks (convert 2D to 3D tensors, i.e. add batch dimension)
    lengths = [X.shape[0] for X in features]  # original sequence length for each time series
    if max_len is None:
        max_len = max(lengths)
    X = torch.zeros(batch_size, max_len, features[0].shape[-1])  # (batch_size, padded_length, feat_dim)
    for i in range(batch_size):
        end = min(lengths[i], max_len)  # clip sequences that are longer than max_len
        X[i, :end, :] = features[i][:end, :]

    targets = torch.stack(labels, dim=0)  # (batch_size, num_labels)

    # int64 instead of int16: an int16 tensor cannot hold sequence lengths above 32767.
    # NOTE(review): `padding_mask` is defined outside this block - confirm it accepts int64 lengths.
    padding_masks = padding_mask(torch.tensor(lengths, dtype=torch.int64),
                                 max_len=max_len)  # (batch_size, padded_length) boolean tensor, "1" means keep

    return X, targets, padding_masks

暂时还没搞懂：

这段代码实现了一个 PyTorch 的 collate_fn 函数，用于将一个 mini-batch 的数据进行整合和处理，以便于输入到神经网络中进行训练。该函数的输入是一个数据列表，其中每个元素都是一个 (X, y) 元组，表示一个样本的时间序列数据和标签。X 是一个 PyTorch 张量，表示时间序列数据，形状为 (seq_length, feat_dim)，其中 seq_length 是时间序列的长度，feat_dim 是每个时间点的特征数或维度；y 是一个 PyTorch 张量，表示标签或目标，形状为 (num_labels,)，其中 num_labels 是标签或目标的数量，通常为 1。
该函数的输出包括 3 个张量：X、targets 和 padding_masks（代码中并没有返回 target_masks）。其中，X 是一个形状为 (batch_size, padded_length, feat_dim) 的 PyTorch 张量，表示时间序列数据的 mini-batch。padded_length 等于 max_len；未指定 max_len 时，它是当前 batch 中最长的时间序列的长度，即短的序列用零填充到与最长序列等长，而超过 max_len 的序列会被截断。targets 是一个形状为 (batch_size, num_labels) 的 PyTorch 张量，表示标签或目标的 mini-batch。padding_masks 是一个形状为 (batch_size, padded_length) 的 PyTorch 张量，表示时间序列数据的掩码，其中 0 表示填充值，1 表示实际值。
具体实现时，函数首先从输入的数据列表中分离出所有的时间序列数据和标签，分别存储在 features 和 labels 变量中。然后，计算每个时间序列数据的原始长度，并将其存储在 lengths 列表中。如果没有指定 max_len 参数，则将 max_len 设置为所有时间序列数据的最大长度。接着，使用 PyTorch 的 torch.zeros() 创建一个全零张量 X，形状为 (batch_size, max_len, feat_dim)。对于每个样本，将其时间序列数据的前 max_len 个时间点复制到 X 对应位置，并将不足 max_len 个时间点的序列用零填充。最后，使用 PyTorch 的 torch.stack() 将所有标签组成一个 mini-batch 张量 targets。同时，使用 padding_mask() 函数生成一个时间序列数据的掩码张量 padding_masks，其中的 0 表示填充值，1 表示实际值。

应用InceptionTime

在应用在InceptionTime上时，发现不对：
InceptionTime使用UCR时，直接：

def readucr(filename, delimiter=','):
    """
    Read a UCR-style text file in which every row is one sample: the first column is the
    label and the remaining columns are the values of the series.
    Args:
        filename: path of the text file
        delimiter: column separator passed to `np.loadtxt`
    Returns:
        X: (num_samples, series_length) array with the series values
        Y: (num_samples,) array with the labels
    """
    # ndmin=2 keeps the array two-dimensional even for a file with a single row;
    # without it `data[:, 0]` fails on the 1-D array that np.loadtxt returns in that case.
    data = np.loadtxt(filename, delimiter=delimiter, ndmin=2)
    Y = data[:, 0]
    X = data[:, 1:]
    print(X)
    print(Y)
    return X, Y

出来的结果是：

[[1.9305 1.9125 1.891 … 1.9099 1.9233 1.9301]
[1.9178 1.9029 1.8877 … 1.8925 1.9101 1.9162]
[1.887 1.8693 1.8393 … 1.8794 1.887 1.8896]
…
[1.8765 1.8655 1.8442 … 1.8572 1.869 1.8754]
[1.8908 1.8816 1.8569 … 1.8389 1.8487 1.8721]
[1.8465 1.8114 1.7723 … 1.8941 1.8858 1.8642]]

在读取UEA时，需要使用：

#########读取UEA ts#########
def interpolate_missing(y):
    """
    Fill the NaN entries of pd.Series `y` by linear interpolation (applied in both directions);
    a Series without NaN is returned as is
    """
    if not y.isna().any():
        return y
    return y.interpolate(method='linear', limit_direction='both')

def subsample(y, limit=256, factor=2):
    """
    Return `y` unchanged when it has at most `limit` entries; otherwise return every
    `factor`-th entry of the Series, re-indexed from 0
    """
    too_long = len(y) > limit
    return y[::factor].reset_index(drop=True) if too_long else y


def readuea(filename, delimiter=','):
    """
    Load a UEA `.ts` file and hand the loaded frame and the label codes back as NumPy arrays.

    Unlike `UEAloader.load_single`, the per-sample flattening into a
    (num_samples * seq_len, feat_dim) frame is not performed here, so the frame is
    converted with `DataFrame.to_numpy()` exactly as it was loaded.
    NOTE(review): presumably every cell of the returned array is still a pd.Series - confirm.

    Args:
        filename: path of the `.ts` file
        delimiter: not used by this function
    Returns:
        the frame converted with `to_numpy()` and the integer category code of every label
    """
    frame, raw_labels = load_data.load_from_tsfile_to_dataframe(
        filename, return_separate_X_and_y=True, replace_missing_vals_with='NaN')

    # integer category code of every label
    label_codes = pd.Series(raw_labels, dtype="category").cat.codes
    print('type',type(label_codes))

    # (num_samples, num_dimensions) array containing the length of each series
    series_lengths = frame.applymap(len).values

    # subsample when any sample has dimensions of different length
    first_dim_lengths = np.expand_dims(series_lengths[:, 0], -1)
    if np.sum(np.abs(series_lengths - first_dim_lengths)) > 0:
        frame = frame.applymap(subsample)

    # Replace NaN values
    frame = frame.groupby(by=frame.index).transform(interpolate_missing)

    # convert the pandas objects to numpy
    values = frame.to_numpy()
    codes = label_codes.to_numpy()
    print(values)
    print(codes)

    return values, codes

结果是：

ength: 1152, dtype: float64 0 54.31
1 56.03
2 56.66
3 54.84
4 52.12
…
1147 64.03
1148 64.09
1149 64.16
1150 65.03
1151 66.16
Length: 1152, dtype: float64
0 45.78
1 42.78
2 41.94
3 42.81
4 43.47
…
1147 68.50
1148 68.75
1149 68.06
1150 66.53
1151 64.88
Length: 1152, dtype: float64 … 0 42.22
1 42.12
2 42.34
3 41.28
4 39.81
…
1147 63.94
1148 62.66
1149 61.16
1150 59.88
1151 58.56
Length: 1152, dtype: float64
0 43.25
1 43.06
2 43.84
3 45.34
4 46.12
…
1147 54.00
1148 52.94
1149 52.06
1150 51.88
1151 52.34
Length: 1152, dtype: float64 0 43.56
1 43.75
2 42.50
3 40.78
4 40.41
…
1147 57.81
1148 58.81
1149 60.19
1150 61.34
1151 62.00
Length: 1152, dtype: float64]
…
[0 22.50
1 21.84
2 20.53
3 17.06
4 14.56

目前我是打算把pandas的结果转换成numpy。

成功方法：

参考这个代码的数据处理方法：https://github.com/donalee/DTW-Pool/blob/c9a6c1162fa8a5a264c7dbec19c7ebf12a86b522/timeseries.py#L16

def readuea(filename, delimiter=','):
    """
    Load a UEA `.ts` file into plain NumPy arrays.

    Args:
        filename: path of the `.ts` file
        delimiter: not used by this function
    Returns:
        data: float array built from the loaded frame, one entry per sample, each holding
            one row of values per dimension
            (assumes all series have the same length - TODO confirm for the dataset at hand)
        labels: integer array; every label is replaced by its position in `np.unique(labels)`
    """
    data, labels = load_data.load_from_tsfile_to_dataframe(filename, return_separate_X_and_y=True,
                                                             replace_missing_vals_with='NaN')

    # Replace NaN values and convert the frame to numpy.
    # Interpolation is applied to each individual series. NOTE(review): the former
    # groupby/transform on the loaded frame saw whole Series objects as cell values, so it
    # presumably never filled the NaN values inside them - confirm.
    # The builtin `float` replaces `np.float`, which no longer exists in NumPy >= 1.24.
    data = np.array([np.array([interpolate_missing(data.values[iidx, vidx]).to_numpy(dtype=float)
                               for vidx in range(data.values.shape[1])])
                     for iidx in range(data.values.shape[0])])

    label2idx = {label: idx for idx, label in enumerate(np.unique(labels))}
    labels = np.array([label2idx[label] for label in labels])

    print(data)
    print(labels)

    return data, labels

也是将pd彻底转成numpy的方法，以后可能会经常使用（这个只是针对uea）