Time Series Analysis Source Code Study — Dataset Preparation (Line-by-Line Comments on the Core Code)

Based on the open-source time series analysis library:

https://github.com/thuml/Time-Series-Library

Today we focus on the dataset construction part of the library and explain the core code blocks line by line.

0. Source Code

The source mainly consists of the following two parts (only the custom dataset construction is covered here):

1. Definition of the Dataset Class

import os
import numpy as np
import pandas as pd
import glob
import re
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from utils.timefeatures import time_features
from data_provider.m4 import M4Dataset, M4Meta
from data_provider.uea import subsample, interpolate_missing, Normalizer
from sktime.datasets import load_from_tsfile_to_dataframe
import warnings

warnings.filterwarnings('ignore')

class Dataset_Custom(Dataset):
    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='OT', scale=True, timeenc=0, freq='h', seasonal_patterns=None):
        # size [seq_len, label_len, pred_len]
        # info
        if size == None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        # init
        assert flag in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))

        '''
        df_raw.columns: ['date', ...(other features), target feature]
        '''
        cols = list(df_raw.columns)
        cols.remove(self.target)
        cols.remove('date')
        df_raw = df_raw[['date'] + cols + [self.target]]
        num_train = int(len(df_raw) * 0.7)
        num_test = int(len(df_raw) * 0.2)
        num_vali = len(df_raw) - num_train - num_test
        border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len]
        border2s = [num_train, num_train + num_vali, len(df_raw)]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1)
            df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1)
            df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1)
            df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1)
            data_stamp = df_stamp.drop(['date'], 1).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        return len(self.data_x) - self.seq_len - self.pred_len + 1

    def inverse_transform(self, data):
        return self.scaler.inverse_transform(data)

2. Building the Dataset from the Dataset Class

from data_provider.data_loader import Dataset_ETT_hour, Dataset_ETT_minute, Dataset_Custom, Dataset_M4, PSMSegLoader, \
    MSLSegLoader, SMAPSegLoader, SMDSegLoader, SWATSegLoader, UEAloader
from data_provider.uea import collate_fn
from torch.utils.data import DataLoader

data_dict = {
    'ETTh1': Dataset_ETT_hour,
    'ETTh2': Dataset_ETT_hour,
    'ETTm1': Dataset_ETT_minute,
    'ETTm2': Dataset_ETT_minute,
    'custom': Dataset_Custom,
    'm4': Dataset_M4,
    'PSM': PSMSegLoader,
    'MSL': MSLSegLoader,
    'SMAP': SMAPSegLoader,
    'SMD': SMDSegLoader,
    'SWAT': SWATSegLoader,
    'UEA': UEAloader
}


def data_provider(args, flag):
    Data = data_dict[args.data]
    timeenc = 0 if args.embed != 'timeF' else 1

    if flag == 'test':
        shuffle_flag = False
        drop_last = True
        if args.task_name == 'anomaly_detection' or args.task_name == 'classification':
            batch_size = args.batch_size
        else:
            batch_size = 1  # bsz=1 for evaluation
        freq = args.freq
    else:
        shuffle_flag = True
        drop_last = True
        batch_size = args.batch_size  # bsz for train and valid
        freq = args.freq

    if args.task_name == 'anomaly_detection':
        drop_last = False
        data_set = Data(
            root_path=args.root_path,
            win_size=args.seq_len,
            flag=flag,
        )
        print(flag, len(data_set))
        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,
            drop_last=drop_last)
        return data_set, data_loader
    elif args.task_name == 'classification':
        drop_last = False
        data_set = Data(
            root_path=args.root_path,
            flag=flag,
        )

        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,
            drop_last=drop_last,
            collate_fn=lambda x: collate_fn(x, max_len=args.seq_len)
        )
        return data_set, data_loader
    else:
        if args.data == 'm4':
            drop_last = False
        data_set = Data(
            root_path=args.root_path,
            data_path=args.data_path,
            flag=flag,
            size=[args.seq_len, args.label_len, args.pred_len],
            features=args.features,
            target=args.target,
            timeenc=timeenc,
            freq=freq,
            seasonal_patterns=args.seasonal_patterns
        )
        print(flag, len(data_set))
        data_loader = DataLoader(
            data_set,
            batch_size=batch_size,
            shuffle=shuffle_flag,
            num_workers=args.num_workers,
            drop_last=drop_last)
        return data_set, data_loader


 


1. Analysis of the Dataset Class: Dataset_Custom

1.1 Initialization Parameters

class Dataset_Custom(Dataset):
    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh1.csv',
                 target='OT', scale=True, timeenc=0, freq='h', seasonal_patterns=None):
        # size [seq_len, label_len, pred_len]
        # info
        if size == None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        # init
        assert flag in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

The constructor declares the parameters needed to build the dataset and stores the passed-in arguments as attributes on self (the instance). If size is not provided, default window lengths are used; flag selects the train/validation/test split via type_map.

The last line calls the internal method __read_data__(), which actually reads the file and prepares the dataset. A minimal instantiation sketch is shown below.
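For reference, here is a small, hedged sketch of instantiating the class directly. The file name my_data.csv, the window sizes, and the feature count are illustrative assumptions, not values from the repository; only the keyword arguments come from the __init__ signature shown above.

# Hypothetical instantiation of Dataset_Custom (paths and column names are made up
# for illustration; the class itself is defined in data_provider/data_loader.py).
train_set = Dataset_Custom(
    root_path='./dataset/',        # directory containing the CSV file
    data_path='my_data.csv',       # CSV with a 'date' column plus feature columns
    flag='train',                  # which split to build: 'train' / 'val' / 'test'
    size=[96, 48, 24],             # [seq_len, label_len, pred_len]
    features='MS',                 # multivariate input, univariate target
    target='OT',                   # name of the target column
    scale=True,                    # fit StandardScaler on the training split
    timeenc=0,                     # 0: calendar features, 1: time_features encoding
    freq='h',                      # sampling frequency used by the time encoding
)
seq_x, seq_y, seq_x_mark, seq_y_mark = train_set[0]   # one sliding-window sample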

1.2 The Data Processing Function

    def __read_data__(self):
        self.scaler = StandardScaler()  # instantiate the standardization (scaler) object
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))  # read the raw data

        '''
        df_raw.columns: ['date', ...(other features), target feature]
        '''
        cols = list(df_raw.columns)  # get the names of all columns
        cols.remove(self.target)   # remove the target column name
        cols.remove('date')   # remove the date column name
        df_raw = df_raw[['date'] + cols + [self.target]]  # reorder columns into the canonical layout
        num_train = int(len(df_raw) * 0.7)   # number of training rows: 70%
        num_test = int(len(df_raw) * 0.2)   # number of test rows: 20%
        num_vali = len(df_raw) - num_train - num_test   # the remaining rows form the validation set
        border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len] # start index of each split
        border2s = [num_train, num_train + num_vali, len(df_raw)] # end index of each split
        border1 = border1s[self.set_type]  # start index of the split being built
        border2 = border2s[self.set_type]  # end index of the split being built
        # Example: for the training set, self.set_type = 0, so the split covers rows 0 to num_train

        # Select columns according to the forecasting mode
        # (M: multivariate -> multivariate, MS: multivariate -> univariate, S: univariate -> univariate)
        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:] 
            df_data = df_raw[cols_data]
        # For M or MS all features are used, so the data is every column except the first one ('date')
        elif self.features == 'S':
            df_data = df_raw[[self.target]]
        # For univariate S, only the target series (self.target) is needed

        # At this point all selected data lives in df_data

        # Standardize or not; if so, fit the scaler on the training split only and transform the whole series
        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values
        
        # Now all data is held in the variable data

        # The code below builds the temporal (position) embedding: the raw 'date' column is turned into
        # numeric features that can express relationships between timestamps. We will not elaborate here;
        # roughly, one non-computable date is converted into several computable values (data_stamp)
        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1)
            df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1)
            df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1)
            df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1)
            data_stamp = df_stamp.drop(['date'], 1).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)
        
        # Store the split
        self.data_x = data[border1:border2]  # the X side of the data
        self.data_y = data[border1:border2]  # the y side of the data
        self.data_stamp = data_stamp         # the time-stamp features (marks) for this split
        # Note: self.data_x / self.data_y are not the final model inputs and targets; they only store the split.
        # The actual input/output windows are produced by the __getitem__ method
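To make the border arithmetic concrete, here is a small sketch that reproduces the split computation for a hypothetical series of 1000 rows with seq_len = 96; the numbers are illustrative, not from any real dataset.

# Illustrative recomputation of the train/val/test borders used in __read_data__.
n, seq_len = 1000, 96                       # hypothetical series length and input window
num_train = int(n * 0.7)                    # 700
num_test = int(n * 0.2)                     # 200
num_vali = n - num_train - num_test         # 100

border1s = [0, num_train - seq_len, n - num_test - seq_len]   # [0, 604, 704]
border2s = [num_train, num_train + num_vali, n]               # [700, 800, 1000]

# For set_type = 1 (validation) the split covers rows 604..800: the extra seq_len
# rows at the front provide enough history so that the first validation window is complete.
print(border1s[1], border2s[1])             # 604 800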

1.3 The Indexing Function (__getitem__)

    def __getitem__(self, index):
        s_begin = index  # for the index-th sample, the model input starts at index
        s_end = s_begin + self.seq_len # the model input ends at index + seq_len
        r_begin = s_end - self.label_len # the model output window starts at s_end - label_len
        # Note on label_len: suppose the model takes 5 input steps and predicts 1 step, with input [1 2 3 4 5].
        # Without label_len the expected output would just be [6]. In practice the decoder target usually also
        # includes the tail of the input: with label_len = 2 the target is the label part [4 5] plus the
        # expected output [6], i.e. [4 5 6]
        r_end = r_begin + self.label_len + self.pred_len # the model output window ends at s_end + pred_len

        seq_x = self.data_x[s_begin:s_end]  # slice by the boundaries to get the model input
        seq_y = self.data_y[r_begin:r_end]  # slice by the boundaries to get the reference output
        seq_x_mark = self.data_stamp[s_begin:s_end] # time-stamp features (marks) for the input window
        seq_y_mark = self.data_stamp[r_begin:r_end] # time-stamp features (marks) for the prediction window

        return seq_x, seq_y, seq_x_mark, seq_y_mark # return the four arrays

__getitem__(index) is a special method of the class: if the instantiated object is named dataset, it is invoked through indexing, i.e. dataset[index].
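A small sketch of the window arithmetic, assuming seq_len = 96, label_len = 48, pred_len = 24 (arbitrary illustrative values):

# Illustrative slice boundaries produced by __getitem__ for one sample.
seq_len, label_len, pred_len = 96, 48, 24    # hypothetical window sizes
index = 10                                   # the 10th sliding window

s_begin = index                              # 10
s_end = s_begin + seq_len                    # 106
r_begin = s_end - label_len                  # 58
r_end = r_begin + label_len + pred_len       # 130

# seq_x = data_x[10:106]   (length seq_len)
# seq_y = data_y[58:130]   (length label_len + pred_len, overlapping the last 48 input rows)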

1.4 The Length Function and the Inverse Transform

    def __len__(self):
        return len(self.data_x) - self.seq_len - self.pred_len + 1
        # Defines how many samples the dataset contains. For an instance named dataset, it is invoked as len(dataset)

    def inverse_transform(self, data):
        # Undo the standardization applied in __read_data__
        return self.scaler.inverse_transform(data)
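For completeness, a hedged sketch of how inverse_transform might be used to map scaled model outputs back to the original units. The array preds_scaled is hypothetical; its last dimension must match the number of columns the scaler was fitted on (7 is an assumption), and train_set is the illustrative instance from the sketch in section 1.1.

import numpy as np

# Hypothetical predictions in the standardized space, shape [pred_len, num_features]
preds_scaled = np.random.randn(24, 7)

# Map back to the original scale of the CSV columns; this assumes the scaler was
# fitted on the same 7 columns inside __read_data__ (i.e. scale=True was used).
preds = train_set.inverse_transform(preds_scaled)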

2. The Data Provider Function

Because the library covers a large number of application scenarios, the full function is fairly bulky, so this post only walks through the dataset construction path for the common forecasting case. The simplified version below differs slightly from the full listing above (for example, it forwards a train_only argument):

from data_provider.data_loader import Dataset_ETT_hour, Dataset_ETT_minute, Dataset_Custom
from torch.utils.data import DataLoader

data_dict = {
    'ETTh1': Dataset_ETT_hour,
    'ETTh2': Dataset_ETT_hour,
    'ETTm1': Dataset_ETT_minute,
    'ETTm2': Dataset_ETT_minute,
    'custom': Dataset_Custom,
}


def data_provider(args, flag):
    Data = data_dict[args.data] # look up the dataset class (not instantiated yet)
    timeenc = 0 if args.embed != 'timeF' else 1 # choose the temporal (position) embedding method
    train_only = args.train_only # whether to train only (no validation/test)

    if flag == 'test':   # test case
        shuffle_flag = False # do not shuffle
        drop_last = False # keep the last (possibly incomplete) batch
        batch_size = args.batch_size # batch size
        freq = args.freq # frequency parameter for the temporal embedding
    else:
        shuffle_flag = True
        drop_last = True
        batch_size = args.batch_size
        freq = args.freq

    # Instantiate the dataset class
    data_set = Data(
        root_path=args.root_path, # root directory
        data_path=args.data_path, # data file name
        flag=flag, # train / test / val
        size=[args.seq_len, args.label_len, args.pred_len], # window sizes
        features=args.features, # M / MS / S, multivariate or univariate
        target=args.target, # target series
        timeenc=timeenc, # temporal embedding method
        freq=freq, # frequency
        train_only=train_only # whether to train only
    )
    print(flag, len(data_set)) # print the size of this split
    # Wrap the dataset with torch's built-in DataLoader
    data_loader = DataLoader(
        data_set, # the dataset instance
        batch_size=batch_size, # batch size
        shuffle=shuffle_flag, # whether to shuffle
        num_workers=args.num_workers, # number of worker processes
        drop_last=drop_last) # whether to drop the last incomplete batch
    # The function returns both data_set and data_loader: data_set is the dataset instance,
    # data_loader wraps it with the batching and shuffling strategy
    return data_set, data_loader
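As a rough sketch of how the function might be called outside the library's experiment classes, here is a hypothetical args namespace; all field values are illustrative assumptions (in the library they come from the argparse configuration in run.py).

from types import SimpleNamespace

# Hypothetical configuration for the simplified data_provider above.
args = SimpleNamespace(
    data='custom', root_path='./dataset/', data_path='my_data.csv',
    features='MS', target='OT', embed='timeF', freq='h',
    seq_len=96, label_len=48, pred_len=24,
    batch_size=32, num_workers=0, train_only=False,
)

# Note: this simplified data_provider forwards train_only to the dataset class,
# which matches an older Dataset_Custom signature; drop that argument if your
# Dataset_Custom (like the one shown earlier) does not accept it.
train_set, train_loader = data_provider(args, flag='train')
val_set, val_loader = data_provider(args, flag='val')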

Practical use (excerpt from the training routine; the loader returned for flag='train' is iterated below):

# fetch the data
train_data, train_loader = data_provider(self.args, flag='train')
# iterative training over the dataset (excerpt)
for epoch in range(self.args.train_epochs):
    iter_count = 0
    train_loss = []
    self.model.train()
    epoch_time = time.time()
    for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
        pass
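Each iteration of the loader yields four tensors. Assuming batch_size = 32, seq_len = 96, label_len = 48, pred_len = 24, and 7 feature columns (illustrative numbers carried over from the earlier sketches), their shapes would be roughly as follows; the last dimension of the mark tensors depends on the freq / timeenc setting.

# Peek at one batch (illustrative shapes, not library output):
for batch_x, batch_y, batch_x_mark, batch_y_mark in train_loader:
    # With the assumed settings: batch_x is [32, 96, 7], batch_y is [32, 72, 7]
    # (label_len + pred_len rows), and the *_mark tensors hold the time-stamp
    # features, e.g. [32, 96, 4] and [32, 72, 4] for hourly data.
    print(batch_x.shape, batch_y.shape, batch_x_mark.shape, batch_y_mark.shape)
    break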
