将时间序列转化为监督学习问题

这里提供两种不同的数据划分方式,看喜好选择了鸭

第一种数据划分的方式

pandas的shift()函数

import pandas as pd
# Toy frame with a single "time" column holding the integers 0..9.
df = pd.DataFrame({"time": list(range(10))})
df
time
00
11
22
33
44
55
66
77
88
99
df["time-1"] = df["time"].shift(1)
df
timetime-1
00NaN
110.0
221.0
332.0
443.0
554.0
665.0
776.0
887.0
998.0
df["time+1"] = df["time"].shift(-1)
df
timetime-1time+1
00NaN1.0
110.02.0
221.03.0
332.04.0
443.05.0
554.06.0
665.07.0
776.08.0
887.09.0
998.0NaN
df["time+2"] = df["time"].shift(-2)
df
timetime-1time+1time+2
00NaN1.02.0
110.02.03.0
221.03.04.0
332.04.05.0
443.05.06.0
554.06.07.0
665.07.08.0
776.08.09.0
887.09.0NaN
998.0NaNNaN

在时间序列预测问题中,当前时间 t 以及未来时间 (t+1, …, t+n) 的观测值是需要预测的目标(输出),而过去的观测值 (t-1, …, t-n) 则作为用于预测的输入。

新的数据集被构造为Dataframe,每列根据变量的编号以及该列左移或者右移的步长命名

def series_to_supervisied(data,step_in,step_out,dropnan = True):
    """
    Reframe a time series as a supervised-learning table of shifted columns.

    param data: observed sequence; a list, a 1-D or 2-D numpy array, or
        anything else ``pd.DataFrame`` accepts (one column per variable)
    param step_in: number of lagged observations (t-step_in ... t-1) used as input (x)
    param step_out: number of observations (t+0 ... t+step_out-1) used as output (y)
    param dropnan: whether to drop the rows that contain NaN after shifting; default True

    return: DataFrame whose columns are named ``var<k>(t-<i>)`` for the inputs
        and ``var<k>(t+<i>)`` for the outputs
    """
    df = pd.DataFrame(data)
    # Count the variables on the frame itself: probing ``data.shape[1]`` fails
    # for 1-D arrays / Series, and a nested list has more than one column.
    n_vars = df.shape[1]
    cols,names = [],[]
    # Input sequence, oldest lag first: t-step_in, ..., t-1
    for i in range(step_in,0,-1):
        cols.append(df.shift(i))
        names+=[("var%d(t-%d)"%(j+1,i)) for j in range(n_vars)]

    # Forecast sequence: t+0 (current step), t+1, ..., t+step_out-1
    for i in range(0,step_out):
        cols.append(df.shift(-i))
        names+=[("var%d(t+%d)"%(j+1,i)) for j in range(n_vars)]

    agg = pd.concat(cols,axis=1)
    agg.columns = names
    if dropnan:
        # Shifting leaves NaN at the head/tail; those rows are incomplete samples.
        agg.dropna(inplace=True)
    return  agg
values = [x for x in range(10)]
data = series_to_supervisied(data=values,step_in=1,step_out=1)
data
var1(t-1)var1(t+0)
10.01
21.02
32.03
43.04
54.05
65.06
76.07
87.08
98.09
def series_to_supervisied_(data,step_in,step_out,dropnan = True):
    """
    Reframe a univariate series as lag columns, the current value, and lead columns.

    The name of every generated column is printed before and after the column
    is built, as a simple progress trace.

    param data: observed univariate sequence (list or 1-D array-like)
    param step_in: number of lagged observations (t-step_in ... t-1) used as input (x)
    param step_out: number of future observations (t+1 ... t+step_out) used as output (y)
    param dropnan: whether to drop the rows that contain NaN after shifting; default True

    return: DataFrame with columns ``step_time-<i>``, ``time``, ``step_time+<i>``
    """
    # No ``data.shape[1]`` probe here: the value was never used and it raised
    # IndexError for 1-D numpy arrays.
    df = pd.DataFrame()
    df_time_in = pd.DataFrame()
    df_time_out = pd.DataFrame()
    df["time"] = data
    # Input sequence, oldest lag first: t-step_in, ..., t-1
    for i in range(step_in,0,-1):
        name = "step_"+"time-"+str(i)
        print(name)
        df_time_in[name] = df["time"].shift(i)
        print(name)

    # Forecast sequence: t+1, t+2, ..., t+step_out
    for i in range(1,step_out+1):
        name = "step_"+"time+"+str(i)
        print(name)
        df_time_out[name] = df["time"].shift(-i)
        print(name)
    # The current observation ("time") sits between the lags and the leads.
    df_re = pd.concat([df_time_in,df,df_time_out],axis =1)
    if dropnan:
        # Shifting leaves NaN at the head/tail; those rows are incomplete samples.
        df_re.dropna(inplace=True)
    return  df_re
values = [x for x in range(10)]
data = series_to_supervisied_(data=values,step_in=3,step_out=0)
data
step_time-3
step_time-3
step_time-2
step_time-2
step_time-1
step_time-1
step_time-3step_time-2step_time-1time
30.01.02.03
41.02.03.04
52.03.04.05
63.04.05.06
74.05.06.07
85.06.07.08
96.07.08.09

单步单变量预测

用(t-1)作为输入变量预测当前时间的观测值(t),同理,可以指定任意长度的输入

def series_to_supervisied_(data,step_in,step_out,dropnan = True):
    """
    Reframe a univariate series as lagged input columns plus output columns.

    param data: observed univariate sequence (list or 1-D array-like)
    param step_in: number of lagged observations (t-step_in ... t-1) used as input (x)
    param step_out: number of observations (t+0 ... t+step_out-1) used as output (y);
        ``step_time+0`` is the current observation
    param dropnan: whether to drop the rows that contain NaN after shifting; default True

    return: DataFrame with columns ``step_time-<i>`` followed by ``step_time+<i>``
    """
    # No ``data.shape[1]`` probe here: the value was never used and it raised
    # IndexError for 1-D numpy arrays.
    df = pd.DataFrame()
    df_time_in = pd.DataFrame()
    df_time_out = pd.DataFrame()
    df["time"] = data
    # Input sequence, oldest lag first: t-step_in, ..., t-1
    for i in range(step_in,0,-1):
        name = "step_"+"time-"+str(i)
        df_time_in[name] = df["time"].shift(i)

    # Forecast sequence: t+0 (current step), t+1, ..., t+step_out-1
    for i in range(0,step_out):
        name = "step_"+"time+"+str(i)
        df_time_out[name] = df["time"].shift(-i)

    df_re = pd.concat([df_time_in,df_time_out],axis =1)
    if dropnan:
        # Shifting leaves NaN at the head/tail; those rows are incomplete samples.
        df_re.dropna(inplace=True)
    return  df_re

values = [x for x in range(10)]
data = series_to_supervisied_(data=values,step_in=1,step_out=1)
data
step_time-1step_time+0
10.01
21.02
32.03
43.04
54.05
65.06
76.07
87.08
98.09
def series_to_supervisied_(data,step_in,step_out,dropnan = True):
    """
    Reframe a univariate series as lagged input columns plus output columns.

    param data: observed univariate sequence (list or 1-D array-like)
    param step_in: number of lagged observations (t-step_in ... t-1) used as input (x)
    param step_out: number of observations (t+0 ... t+step_out-1) used as output (y);
        ``step_time+0`` is the current observation
    param dropnan: whether to drop the rows that contain NaN after shifting; default True

    return: DataFrame with columns ``step_time-<i>`` followed by ``step_time+<i>``
    """
    # No ``data.shape[1]`` probe here: the value was never used and it raised
    # IndexError for 1-D numpy arrays.
    df = pd.DataFrame()
    df_time_in = pd.DataFrame()
    df_time_out = pd.DataFrame()
    df["time"] = data
    # Input sequence, oldest lag first: t-step_in, ..., t-1
    for i in range(step_in,0,-1):
        name = "step_"+"time-"+str(i)
        df_time_in[name] = df["time"].shift(i)

    # Forecast sequence: t+0 (current step), t+1, ..., t+step_out-1
    for i in range(0,step_out):
        name = "step_"+"time+"+str(i)
        df_time_out[name] = df["time"].shift(-i)

    df_re = pd.concat([df_time_in,df_time_out],axis =1)
    if dropnan:
        # Shifting leaves NaN at the head/tail; those rows are incomplete samples.
        df_re.dropna(inplace=True)
    return  df_re

values = [x for x in range(10)]
data = series_to_supervisied_(data=values,step_in=2,step_out=1)
data
step_time-2step_time-1step_time+0
20.01.02
31.02.03
42.03.04
53.04.05
64.05.06
75.06.07
86.07.08
97.08.09

多步预测

def series_to_supervisied_(data,step_in,step_out,dropnan = True):
    """
    Reframe a univariate series as lagged input columns plus multi-step output columns.

    param data: observed univariate sequence (list or 1-D array-like)
    param step_in: number of lagged observations (t-step_in ... t-1) used as input (x)
    param step_out: number of observations (t+0 ... t+step_out-1) used as output (y);
        ``step_time+0`` is the current observation
    param dropnan: whether to drop the rows that contain NaN after shifting; default True

    return: DataFrame with columns ``step_time-<i>`` followed by ``step_time+<i>``
    """
    # No ``data.shape[1]`` probe here: the value was never used and it raised
    # IndexError for 1-D numpy arrays.
    df = pd.DataFrame()
    df_time_in = pd.DataFrame()
    df_time_out = pd.DataFrame()
    df["time"] = data
    # Input sequence, oldest lag first: t-step_in, ..., t-1
    for i in range(step_in,0,-1):
        name = "step_"+"time-"+str(i)
        df_time_in[name] = df["time"].shift(i)

    # Forecast sequence: t+0 (current step), t+1, ..., t+step_out-1
    for i in range(0,step_out):
        name = "step_"+"time+"+str(i)
        df_time_out[name] = df["time"].shift(-i)

    df_re = pd.concat([df_time_in,df_time_out],axis =1)
    if dropnan:
        # Shifting leaves NaN at the head/tail; those rows are incomplete samples.
        df_re.dropna(inplace=True)
    return  df_re

values = [x for x in range(10)]
data = series_to_supervisied_(data=values,step_in=2,step_out=2)
data
step_time-2step_time-1step_time+0step_time+1
20.01.023.0
31.02.034.0
42.03.045.0
53.04.056.0
64.05.067.0
75.06.078.0
86.07.089.0

多变量预测

def series_to_superivsed(data,step_in =1,step_out=1,dropnan = True):
    """
    Reframe a (possibly multivariate) time series as a supervised-learning table.

    param data: observed sequence; a list, a 1-D or 2-D numpy array, or
        anything else ``pd.DataFrame`` accepts (one column per variable)
    param step_in: number of lagged observations (t-step_in ... t-1) used as input (x)
    param step_out: number of observations (t ... t+step_out-1) used as output (y)
    param dropnan: whether to drop the rows that contain NaN after shifting; default True

    return: DataFrame whose columns are named ``var<k>(t-<i>)``, ``var<k>(t)``
        and ``var<k>(t+<i>)``
    """
    df = pd.DataFrame(data)
    # Count the variables on the frame itself: probing ``data.shape[1]`` fails
    # for 1-D arrays / Series, and a nested list has more than one column.
    n_vars = df.shape[1]
    cols = []
    names = []
    # Input sequence, oldest lag first: (t-step_in), ..., (t-1)
    for i in range(step_in,0,-1):
        cols.append(df.shift(i))
        names+=[("var%d(t-%d)"%(j+1,i)) for j in range(n_vars)]
    # Forecast sequence: t, (t+1), ..., (t+step_out-1)
    for i in range(0,step_out):
        cols.append(df.shift(-i))
        if i ==0:
            # The current step is labelled plain "(t)" rather than "(t+0)".
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names+=[("var%d(t+%d)"%(j+1,i)) for j in range(n_vars)]

    df_re = pd.concat(cols,axis=1)
    df_re.columns = names
    if dropnan:
        # Shifting leaves NaN at the head/tail; those rows are incomplete samples.
        df_re.dropna(inplace =True)
    return df_re
# Two toy observation series side by side: ob1 = 0..9 and ob2 = 50..59.
raw = pd.DataFrame({
    'ob1': list(range(10)),
    'ob2': list(range(50, 60)),
})
values = raw.values
data = series_to_superivsed(values)
data
var1(t-1)var2(t-1)var1(t)var2(t)
10.050.0151
21.051.0252
32.052.0353
43.053.0454
54.054.0555
65.055.0656
76.056.0757
87.057.0858
98.058.0959
raw['ob1'] = [x for x in range(10)]
raw['ob2'] = [x for x in range(50, 60)]
values = raw.values
data = series_to_superivsed(values,1,2)
data
var1(t-1)var2(t-1)var1(t)var2(t)var1(t+1)var2(t+1)
10.050.01512.052.0
21.051.02523.053.0
32.052.03534.054.0
43.053.04545.055.0
54.054.05556.056.0
65.055.06567.057.0
76.056.07578.058.0
87.057.08589.059.0

第二种数据划分的方式

索引数据
010
120
230
340
450
560
670
780
890
9100
10110

假设用 time_step = 3 个时间步的观测值作为输入,预测下一个时间步的值,可以得到下表

索引xy
010,20,3040
120,30,4050
230,40,5060
340,50,6070
450,60,7080
560,70,8090
670,80,90100
780,90,100110
890,100,110?
9100,110,???
10110,?,?????
import numpy as np 
def split_sequence(sequence,n_steps):
    """
    Slice a univariate sequence into (window, next value) samples.

    param sequence: observed sequence supporting ``len``, slicing and indexing
    param n_steps: number of consecutive observations in each input window

    return: tuple ``(x, y)`` of numpy arrays; ``x[k]`` is the window starting
        at position k and ``y[k]`` is the observation right after it
    """
    windows, targets = [], []
    total = len(sequence)
    for start in range(total):
        # Position of the value that immediately follows the window.
        stop = start + n_steps
        if stop >= total:
            # No observation left to serve as the target.
            break
        window = sequence[start:stop]
        target = sequence[stop]
        windows.append(window)
        targets.append(target)
    return np.array(windows), np.array(targets)
# Demo: windows of three observations, each paired with the value that follows.
raw_seq = list(range(10, 100, 10))
n_steps = 3
x, y = split_sequence(raw_seq, n_steps)
for window, target in zip(x, y):
    print(window, target)
[10 20 30] 40
[20 30 40] 50
[30 40 50] 60
[40 50 60] 70
[50 60 70] 80
[60 70 80] 90

多变量时间序列是指每个时间步长有多个观测值的数据

多个输入序列

索引x1,x2y
010,1525
120,2545
230,3565
340,4585
450,55105
560,65125
670,75145
780,85165
890,95185
# Two parallel input series; the target series is their element-wise sum.
in_seq1 = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])
in_seq2 = np.array([15, 25, 35, 45, 55, 65, 75, 85, 95])
out_seq = in_seq1 + in_seq2

array([ 25,  45,  65,  85, 105, 125, 145, 165, 185])
# Turn every series into a column vector, then join them side by side
# into a [rows, columns] array.
in_seq1 = in_seq1.reshape(-1, 1)
in_seq2 = in_seq2.reshape(-1, 1)
out_seq = out_seq.reshape(-1, 1)
data = np.concatenate([in_seq1, in_seq2, out_seq], axis=1)
data
array([[ 10,  15,  25],
       [ 20,  25,  45],
       [ 30,  35,  65],
       [ 40,  45,  85],
       [ 50,  55, 105],
       [ 60,  65, 125],
       [ 70,  75, 145],
       [ 80,  85, 165],
       [ 90,  95, 185]])
索引x1,x2y
010,15
120,25
230,3565
320,25
430,35
540,4585
6
def split_sequence(sequence,n_steps):
    """
    Slice a 2-D array into multi-input windows with a single aligned target.

    param sequence: 2-D numpy array; all columns but the last are inputs,
        the last column is the target series
    param n_steps: number of consecutive rows in each input window

    return: tuple ``(x, y)`` of numpy arrays; ``x[k]`` holds the input columns
        of rows k .. k+n_steps-1 and ``y[k]`` is the target value on the last
        row of that window
    """
    windows, targets = [], []
    total = len(sequence)
    start = 0
    while start < total:
        stop = start + n_steps
        if stop > total:
            # The window would run past the end of the data.
            break
        features = sequence[start:stop, :-1]
        label = sequence[stop - 1, -1]
        windows.append(features)
        targets.append(label)
        start += 1
    return np.array(windows), np.array(targets)

n_steps = 3
x,y = split_sequence(data,n_steps)
for i in range(len(x)):
    print(x[i], y[i])
    print("="*15)

[[10 15]
 [20 25]
 [30 35]] 65
===============
[[20 25]
 [30 35]
 [40 45]] 85
===============
[[30 35]
 [40 45]
 [50 55]] 105
===============
[[40 45]
 [50 55]
 [60 65]] 125
===============
[[50 55]
 [60 65]
 [70 75]] 145
===============
[[60 65]
 [70 75]
 [80 85]] 165
===============
[[70 75]
 [80 85]
 [90 95]] 185
===============

多个序列输出

索引x1,x2y
010,1525
120,2545
230,3565
340,4585
450,55105
560,65125
670,75145
780,85165
890,95185
索引x1,x2y
010,1525
120,2545
230,3565
输出
340,4585
def split_sequence(sequences,n_step):
    """
    Slice a 2-D array into windows of full rows, each paired with the next row.

    param sequences: 2-D numpy array, one row per time step, one column per series
    param n_step: number of consecutive rows in each input window

    return: tuple ``(x, y)`` of numpy arrays; ``x[k]`` is rows k .. k+n_step-1
        (all columns) and ``y[k]`` is the row right after that window
    """
    windows, next_rows = [], []
    total = len(sequences)
    for start in range(total):
        stop = start + n_step
        if stop >= total:
            # No row left after the window to serve as the target.
            break
        window = sequences[start:stop, :]
        following = sequences[stop, :]
        windows.append(window)
        next_rows.append(following)
    return np.array(windows), np.array(next_rows)

# Build the demo dataset: two input series plus their element-wise sum.
in_seq1 = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])
in_seq2 = np.array([15, 25, 35, 45, 55, 65, 75, 85, 95])
out_seq = in_seq1 + in_seq2

# Column vectors of shape (n, 1) so the series can be joined side by side.
in_seq1 = in_seq1.reshape(-1, 1)
in_seq2 = in_seq2.reshape(-1, 1)
out_seq = out_seq.reshape(-1, 1)

data = np.hstack((in_seq1, in_seq2, out_seq))
n_steps = 3
x,y = split_sequence(data,n_steps)
for i in range(len(x)):
    print(x[i], y[i])
    print("="*20)
[[10 15 25]
 [20 25 45]
 [30 35 65]] [40 45 85]
====================
[[20 25 45]
 [30 35 65]
 [40 45 85]] [ 50  55 105]
====================
[[ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]] [ 60  65 125]
====================
[[ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]] [ 70  75 145]
====================
[[ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]] [ 80  85 165]
====================
[[ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]] [ 90  95 185]
====================

多个输入,多个输出

索引数据
010
120
230
340
450
560
670
780
890
9100
10110
索引数据
010
120
230
输出
340
450
560
def split_sequence(sequence,n_steps_in,n_steps_out):
    """
    Slice a univariate sequence into (input window, multi-step output) samples.

    param sequence: observed sequence supporting ``len`` and slicing
    param n_steps_in: number of consecutive observations in each input window
    param n_steps_out: number of observations following the window that form the output

    return: tuple ``(x, y)`` of numpy arrays; ``x[k]`` is the window starting
        at position k and ``y[k]`` is the ``n_steps_out`` values after it
    """
    inputs, outputs = [], []
    total = len(sequence)
    start = 0
    while start < total:
        in_stop = start + n_steps_in
        out_stop = in_stop + n_steps_out
        if out_stop > total:
            # Not enough observations left for a complete output span.
            break
        inputs.append(sequence[start:in_stop])
        outputs.append(sequence[in_stop:out_stop])
        start += 1
    return np.array(inputs), np.array(outputs)



raw_seq = [10, 20, 30, 40, 50, 60, 70, 80, 90]

n_steps_in, n_steps_out = 3, 2
x, y = split_sequence(raw_seq, n_steps_in, n_steps_out)
for i in range(len(x)):
    print(x[i], y[i])
    print("="*20)
[10 20 30] [40 50]
====================
[20 30 40] [50 60]
====================
[30 40 50] [60 70]
====================
[40 50 60] [70 80]
====================
[50 60 70] [80 90]
====================

多个维度的输入输出

索引x1,x2y
010,1525
120,2545
230,3565
340,4585
450,55105
560,65125
670,75145
780,85165
890,95185
索引x1,x2y
010,15
120,25
230,35
输出
365
485
def split_sequence(sequences,n_steps_in,n_steps_out):
    """
    Slice a 2-D array into multi-input windows with a multi-step target.

    param sequences: 2-D numpy array; all columns but the last are inputs,
        the last column is the target series
    param n_steps_in: number of consecutive rows in each input window
    param n_steps_out: number of target values per sample

    return: tuple ``(x, y)`` of numpy arrays; ``x[k]`` holds the input columns
        of rows k .. k+n_steps_in-1 and ``y[k]`` holds ``n_steps_out`` target
        values starting on the last row of that window
    """
    inputs, outputs = [], []
    total = len(sequences)
    for start in range(total):
        in_stop = start + n_steps_in
        # The target span starts on the window's last row, hence the -1.
        out_stop = in_stop + n_steps_out - 1
        if out_stop > total:
            break
        features = sequences[start:in_stop, :-1]
        labels = sequences[in_stop - 1:out_stop, -1]
        inputs.append(features)
        outputs.append(labels)
    return np.array(inputs), np.array(outputs)


# Demo dataset: two input series and a target equal to their element-wise sum.
in_seq1 = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])
in_seq2 = np.array([15, 25, 35, 45, 55, 65, 75, 85, 95])
out_seq = in_seq1 + in_seq2
# Convert each series to a [rows, 1] column, then stack them as columns.
in_seq1 = in_seq1.reshape(-1, 1)
in_seq2 = in_seq2.reshape(-1, 1)
out_seq = out_seq.reshape(-1, 1)
data = np.hstack((in_seq1, in_seq2, out_seq))
n_steps_in, n_steps_out = 3, 2
x, y = split_sequence(data, n_steps_in, n_steps_out)
for i in range(len(x)):
    print(x[i], y[i])
    print("="*30)
[[10 15]
 [20 25]
 [30 35]] [65 85]
==============================
[[20 25]
 [30 35]
 [40 45]] [ 85 105]
==============================
[[30 35]
 [40 45]
 [50 55]] [105 125]
==============================
[[40 45]
 [50 55]
 [60 65]] [125 145]
==============================
[[50 55]
 [60 65]
 [70 75]] [145 165]
==============================
[[60 65]
 [70 75]
 [80 85]] [165 185]
==============================
索引x1,x2y
010,1525
120,2545
230,3565
340,4585
450,55105
560,65125
670,75145
780,85165
890,95185
索引x1,x2y
010,1525
120,2545
230,3565
340,4585
输出
450,55105
560,65125
def split_sequences(sequences,n_steps_in,n_steps_out):
    """
    Slice a 2-D array into input windows and multi-step outputs over all columns.

    param sequences: 2-D numpy array, one row per time step, one column per series
    param n_steps_in: number of consecutive rows in each input window
    param n_steps_out: number of rows following the window that form the output

    return: tuple ``(x, y)`` of numpy arrays; ``x[k]`` is rows k .. k+n_steps_in-1
        and ``y[k]`` is the next ``n_steps_out`` rows, all columns in both
    """
    inputs, outputs = [], []
    total = len(sequences)
    for start in range(total):
        in_stop = start + n_steps_in
        out_stop = in_stop + n_steps_out
        if out_stop > total:
            # Not enough rows left for a complete output span.
            break
        past_rows = sequences[start:in_stop, :]
        future_rows = sequences[in_stop:out_stop, :]
        inputs.append(past_rows)
        outputs.append(future_rows)
    return np.array(inputs), np.array(outputs)

# Demo dataset: two input series and a target equal to their element-wise sum.
in_seq1 = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])
in_seq2 = np.array([15, 25, 35, 45, 55, 65, 75, 85, 95])
out_seq = in_seq1 + in_seq2

# Reshape every series to a single column before joining them horizontally.
in_seq1 = in_seq1.reshape(-1, 1)
in_seq2 = in_seq2.reshape(-1, 1)
out_seq = out_seq.reshape(-1, 1)

dataset = np.hstack((in_seq1, in_seq2, out_seq))
n_steps_in, n_steps_out = 3, 2
X, y = split_sequences(dataset, n_steps_in, n_steps_out)
for i in range(len(X)):
    print(X[i], y[i])
    print("="*30)
[[10 15 25]
 [20 25 45]
 [30 35 65]] [[ 40  45  85]
 [ 50  55 105]]
==============================
[[20 25 45]
 [30 35 65]
 [40 45 85]] [[ 50  55 105]
 [ 60  65 125]]
==============================
[[ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]] [[ 60  65 125]
 [ 70  75 145]]
==============================
[[ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]] [[ 70  75 145]
 [ 80  85 165]]
==============================
[[ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]] [[ 80  85 165]
 [ 90  95 185]]
==============================

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值