Sktime中的窗口拆分
在此notebook中,我们将描述 sktime.forecasting.model_selection
模块下的窗口拆分。窗口拆分可以和 ForecastingGridSearchCV
一起来做模型选择(参考forecasting notebook)。
**注意:**需要强调的是,在做时间序列的交叉验证时,不能随机打散(shuffle)数据集,否则会导致信息泄漏(leaking information)。
参考:
准备工作
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.ticker import MaxNLocator
from sktime.datasets import load_airline
from sktime.forecasting.base import ForecastingHorizon
# 4种窗口拆分方式
from sktime.forecasting.model_selection import (
CutoffSplitter,
SingleWindowSplitter,
SlidingWindowSplitter,
temporal_train_test_split,
)
# 时序绘图
from sktime.utils.plotting import plot_series
# Plotting defaults: seaborn "whitegrid" theme, 12x6 figures rendered at 100 dpi.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": [12, 6], "figure.dpi": 100})
数据
我们将使用航空公司数据集(Box-Jenkins,单变量)的一小部分,该数据集包含1949-1960年间每个月国际航空公司乘客的数量。
# Work with the first 30 observations of the airline series only.
y = load_airline().head(30)
y.head()
# Period
# 1949-01 112.0
# 1949-02 118.0
# 1949-03 132.0
# 1949-04 129.0
# 1949-05 121.0
# Freq: M, Name: Number of airline passengers, dtype: float64
fig, ax = plot_series(y)
# Split into train/test using a forecasting horizon of steps 1..5.
fh = ForecastingHorizon(list(range(1, 6)))
y_train, y_test = temporal_train_test_split(y=y, fh=fh)
plot_series(y_train, y_test, labels=["y_train", "y_test"]);
窗口拆分
现在我们描述不同的拆分方法
1.temporal_train_test_split
把数据集拆分为 training 和 test 两部分。你可以采用以下两种方式:
- 设定training 或者 test 的 size
- 设定预测期数(forecasting horizon)
# Variant 1: choose the split point through the relative test-set size.
y_train, y_test = temporal_train_test_split(y=y, test_size=0.25)
fig, ax = plot_series(y_train, y_test, labels=["y_train", "y_test"])
# Variant 2: choose the split point through a forecasting horizon (steps 1..5).
fh = ForecastingHorizon(list(range(1, 6)))
y_train, y_test = temporal_train_test_split(y=y, fh=fh)
plot_series(y_train, y_test, labels=["y_train", "y_test"]);
2.SingleWindowSplitter
这个类一次性把时间序列拆分为training和test两个窗口 。这和temporal_train_test_split
有些类似。
首先,先定义好折数(fold)中的参数:
# Fold configuration: forecasting horizon of steps 1..3 and a window of 10 points.
fh = ForecastingHorizon(list(range(1, 4)))
fh_length = len(fh)
window_length = 10
cv = SingleWindowSplitter(fh=fh, window_length=window_length)
n_splits = cv.get_n_splits(y)
print(f"Number of Folds = {n_splits}")
# Number of Folds = 1
我们来绘图看看是怎么产生这1个fold的。首先定义以下一些帮助函数
def get_folds_arrays(y, cv):
    """Collect the folds produced by a splitter into two integer arrays.

    Parameters
    ----------
    y : series to split; passed unchanged to ``cv.split``.
    cv : object exposing ``split(y)``, which yields one
        ``(window_indices, fh_indices)`` pair per fold.

    Returns
    -------
    windows : np.ndarray of int, one row of window indices per fold.
    fhs : np.ndarray of int, one row of forecasting-horizon indices per fold.

    Notes
    -----
    Row widths come from the folds themselves, so the function does not
    rely on module-level ``window_length`` / ``fh_length`` variables.
    The builtin ``int`` is used as dtype because the ``np.int`` alias was
    removed from NumPy.
    """
    folds = list(cv.split(y))
    if not folds:
        # Nothing to stack: return empty 2-D arrays so indexing code still works.
        return np.empty((0, 0), dtype=int), np.empty((0, 0), dtype=int)
    windows = np.array([window for window, _ in folds], dtype=int)
    fhs = np.array([horizon for _, horizon in folds], dtype=int)
    return windows, fhs
def get_y(length, split):
    """Build a float vector of ``length`` entries that all equal ``split``."""
    level = np.full(length, split, dtype=float)
    return level
# Gather the fold indices and pick one colour per segment type.
windows, fhs = get_folds_arrays(y, cv)
window_color, fh_color = sns.color_palette("colorblind")[:2]

fig, ax = plt.subplots()
n_timepoints = len(y)
for i, (window, horizon) in enumerate(zip(windows, fhs)):
    # Grey baseline: every time point of the series, drawn at height i.
    ax.plot(np.arange(n_timepoints), get_y(n_timepoints, i), marker="o", c="lightgray")
    ax.plot(window, get_y(len(window), i), marker="o", c=window_color, label="Window")
    ax.plot(
        horizon,
        get_y(len(horizon), i),
        marker="o",
        c=fh_color,
        label="Forecasting horizon",
    )
ax.invert_yaxis()
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.set(
    title="SingleWindowSplitter Fold",
    ylabel="Window number",
    xlabel="Time",
    ylim=(-1, 1),
)
# Tick labels are only meaningful when the tick positions are fixed too.
# Keep the automatically chosen ticks that are valid positions in y and
# label each one with the index entry at that position.
tick_positions = [
    int(tick)
    for tick in ax.get_xticks()
    if float(tick).is_integer() and 0 <= tick < n_timepoints
]
ax.set_xticks(tick_positions)
ax.set_xticklabels(y.index[tick_positions])
# Every fold contributes the same two labelled lines; keep the first pair only.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[:2], labels[:2]);
3.SlidingWindowSplitter
该拆分器生成随时间向前滑动的折(fold)。每个折中 training 和 test 窗口的长度保持不变。
# Sliding-window splitter reusing the window length and horizon defined earlier.
cv = SlidingWindowSplitter(window_length=window_length, fh=fh, start_with_window=True)
n_splits = cv.get_n_splits(y)
print(f"Number of Folds = {n_splits}")
windows, fhs = get_folds_arrays(y, cv)

fig, ax = plt.subplots()
n_timepoints = len(y)
for i, (window, horizon) in enumerate(zip(windows, fhs)):
    # Grey baseline: every time point of the series, drawn at height i.
    ax.plot(np.arange(n_timepoints), get_y(n_timepoints, i), marker="o", c="lightgray")
    ax.plot(window, get_y(len(window), i), marker="o", c=window_color, label="Window")
    ax.plot(
        horizon,
        get_y(len(horizon), i),
        marker="o",
        c=fh_color,
        label="Forecasting horizon",
    )
ax.invert_yaxis()
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.set(
    title="SlidingWindowSplitter Folds",
    ylabel="Window number",
    xlabel="Time",
)
# Tick labels are only meaningful when the tick positions are fixed too.
# Keep the automatically chosen ticks that are valid positions in y and
# label each one with the index entry at that position.
tick_positions = [
    int(tick)
    for tick in ax.get_xticks()
    if float(tick).is_integer() and 0 <= tick < n_timepoints
]
ax.set_xticks(tick_positions)
ax.set_xticklabels(y.index[tick_positions])
# Every fold contributes the same two labelled lines; keep the first pair only.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[:2], labels[:2]);
4.CutoffSplitter
使用该拆分器,我们可以手动指定截断点(cutoff)。
# Specify cutoff points (by array index).
cutoffs = np.array([10, 15, 20, 25])
cv = CutoffSplitter(cutoffs=cutoffs, window_length=window_length, fh=fh)
n_splits = cv.get_n_splits(y)
print(f"Number of Folds = {n_splits}")
windows, fhs = get_folds_arrays(y, cv)

fig, ax = plt.subplots()
n_timepoints = len(y)
for i, (window, horizon) in enumerate(zip(windows, fhs)):
    # Grey baseline: every time point of the series, drawn at height i.
    ax.plot(np.arange(n_timepoints), get_y(n_timepoints, i), marker="o", c="lightgray")
    ax.plot(window, get_y(len(window), i), marker="o", c=window_color, label="Window")
    ax.plot(
        horizon,
        get_y(len(horizon), i),
        marker="o",
        c=fh_color,
        label="Forecasting horizon",
    )
ax.invert_yaxis()
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.set(
    title="CutoffSplitter Folds",
    ylabel="Window number",
    xlabel="Time",
)
# Tick labels are only meaningful when the tick positions are fixed too.
# Keep the automatically chosen ticks that are valid positions in y and
# label each one with the index entry at that position.
tick_positions = [
    int(tick)
    for tick in ax.get_xticks()
    if float(tick).is_integer() and 0 <= tick < n_timepoints
]
ax.set_xticks(tick_positions)
ax.set_xticklabels(y.index[tick_positions])
# Every fold contributes the same two labelled lines; keep the first pair only.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[:2], labels[:2]);