调参简例:SARIMA模型_手工调参过程

一、SARIMA背景简介

SARIMA,简单说就是AR+MA+差分+季节性因素+趋势。因此在 statsmodels.tsa.statespace.sarimax.SARIMAX 中,核心设置由3个参数涵盖:order=(p,d,q)、seasonal_order=(P,D,Q,s) 和 trend。
SARIMAX 的全称是 Seasonal AutoRegressive Integrated Moving Average with eXogenous regressors model。

翻译原文链接这里

二、代码关键节点

2.1 准备部分

2.1.1 定义待传入参数的模型和评分

"""打分函数"""
def walk_forward_validation(data, n_test, cfg):
    """Score one SARIMA configuration by walk-forward validation.

    The last ``n_test`` observations of ``data`` are held out. For every
    held-out step a SARIMAX model is refit on all observations seen so far,
    a one-step-ahead prediction is made, and the true observation is then
    appended to the history before the next step.

    Args:
        data: Sliceable sequence of observations in time order.
        n_test: Number of trailing observations to hold out. Must satisfy
            ``0 < n_test < len(data)``.
        cfg: Triple ``(order, seasonal_order, trend)`` forwarded to SARIMAX.

    Returns:
        Mean squared error between the held-out observations and the
        one-step-ahead predictions.

    Raises:
        ValueError: If ``n_test`` would leave the training or the test
            split empty.
    """
    # data[:-0] is empty and data[-0:] is the whole series, so n_test == 0
    # (or a negative / too-large value) must be rejected explicitly.
    if not 0 < n_test < len(data):
        raise ValueError(
            f'n_test must be between 1 and len(data) - 1, got {n_test}')
    # The configuration does not change between steps; unpack it once.
    order, sorder, trend = cfg
    train, test = data[:-n_test], data[-n_test:]
    history = list(train)
    predictions = []
    for actual in test:
        model = SARIMAX(history,
                        order=order, seasonal_order=sorder, trend=trend,
                        enforce_stationarity=False,
                        enforce_invertibility=False)
        model_fit = model.fit(disp=False)
        # start == end == len(history): predict only the next, unseen step.
        yhat = model_fit.predict(len(history), len(history))
        # predict() returns a length-1 array here; keep just the value.
        predictions.append(yhat[0])
        history.append(actual)
    return mean_squared_error(test, predictions)

保证顺利得到评分:通过过滤各种已知或未知报错,达到防止运行中断的目的

def score_model(data, n_test, cfg, debug=False):
    """Score one configuration and return it together with its error.

    Args:
        data: Observations in time order, forwarded to
            ``walk_forward_validation``.
        n_test: Number of trailing observations held out for testing.
        cfg: Configuration forwarded to ``walk_forward_validation``.
        debug: When true, exceptions and warnings are not suppressed so a
            failing configuration can be inspected.

    Returns:
        Tuple ``(key, error)`` where ``key`` is ``str(cfg)`` and ``error`` is
        the validation error, or ``None`` if scoring failed.
    """
    key = str(cfg)
    if debug:
        error = walk_forward_validation(data, n_test, cfg)
    else:
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')
                error = walk_forward_validation(data, n_test, cfg)
        except Exception:
            # A failing configuration is skipped rather than aborting the
            # whole search. ``Exception`` (not a bare ``except``) keeps
            # KeyboardInterrupt / SystemExit able to stop the run.
            error = None
    if error is not None:
        print(f'> Model{key} {error:.3f}')
    return key, error

2.1.2 定义一组要测试的参数组合

def sarima_config():
    """Build the list of SARIMA configurations to try.

    Returns:
        List of ``[(p, d, q), (P, D, Q, s), trend]`` entries covering every
        combination of the candidate values below, ordered with ``p`` varying
        slowest and ``trend`` varying fastest.
    """
    ar_terms = [0, 1, 2]
    diff_terms = [0, 1]
    ma_terms = [0, 1, 2]
    seasonal_ar_terms = [0, 1, 2]
    seasonal_diff_terms = [0, 1]
    seasonal_ma_terms = [0, 1, 2]
    # Original author's note: with s=0 the seasonal P, D, Q are all forced to 0.
    periods = [2, 4, 12]
    trends = ['n', 'c', 't', 'ct']

    # Non-seasonal (p, d, q) triples.
    orders = []
    for p in ar_terms:
        for d in diff_terms:
            for q in ma_terms:
                orders.append((p, d, q))

    # Seasonal (P, D, Q, s) quadruples.
    seasonal_orders = []
    for P in seasonal_ar_terms:
        for D in seasonal_diff_terms:
            for Q in seasonal_ma_terms:
                for period in periods:
                    seasonal_orders.append((P, D, Q, period))

    # Cross the two groups with every trend option.
    configs = []
    for order in orders:
        for seasonal_order in seasonal_orders:
            for trend in trends:
                configs.append([order, seasonal_order, trend])
    return configs

或者另一种更简洁的写法

def sarima_config():
    """Return every SARIMA configuration in the search grid.

    Each entry is ``[(p, d, q), (P, D, Q, s), trend]``; entries are ordered
    with ``p`` varying slowest and ``trend`` varying fastest.
    """
    ar_terms = [0, 1, 2]
    diff_terms = [0, 1]
    ma_terms = [0, 1, 2]
    seasonal_ar_terms = [0, 1, 2]
    seasonal_diff_terms = [0, 1]
    seasonal_ma_terms = [0, 1, 2]
    periods = [2, 4, 12]
    trends = ['n', 'c', 't', 'ct']

    # Non-seasonal and seasonal orders are built separately, then crossed.
    orders = [(p, d, q)
              for p in ar_terms
              for d in diff_terms
              for q in ma_terms]
    seasonal_orders = [(P, D, Q, period)
                       for P in seasonal_ar_terms
                       for D in seasonal_diff_terms
                       for Q in seasonal_ma_terms
                       for period in periods]

    return [[order, seasonal_order, trend]
            for order in orders
            for seasonal_order in seasonal_orders
            for trend in trends]

2.2 调参部分

2.2.1 给各参数组合评分

def grid_search(data, cfg_list, n_test, parallel=True):
    """Score every configuration and return the results sorted by error.

    Args:
        data: Observations in time order, forwarded to ``score_model``.
        cfg_list: Iterable of configurations to evaluate.
        n_test: Number of trailing observations held out for testing.
        parallel: When true, configurations are scored through a joblib
            ``Parallel`` executor with one job per CPU; otherwise they are
            scored one after another in this process.

    Returns:
        List of ``(key, error)`` tuples, lowest error first. Configurations
        whose score is ``None`` are left out.
    """
    if parallel:
        executor = joblib.Parallel(n_jobs=cpu_count(),
                                   backend='multiprocessing')
        tasks = (joblib.delayed(score_model)(data, n_test, cfg)
                 for cfg in cfg_list)
        scores = executor(tasks)
    else:
        scores = [score_model(data, n_test, cfg) for cfg in cfg_list]
    # Identity check: ``!= None`` would defer to the error object's __ne__.
    scores = [result for result in scores if result[1] is not None]
    scores.sort(key=lambda result: result[1])
    return scores

2.2.2 得到最优评分模型,并使用

2.3 完整代码:(在原文基础上有调整)

from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import pandas as pd
import warnings
import joblib
from multiprocessing import cpu_count

def walk_forward_validation(data, n_test, cfg):
    """Score one SARIMA configuration by walk-forward validation.

    The last ``n_test`` observations of ``data`` are held out. For every
    held-out step a SARIMAX model is refit on all observations seen so far,
    a one-step-ahead prediction is made, and the true observation is then
    appended to the history before the next step.

    Args:
        data: Sliceable sequence of observations in time order.
        n_test: Number of trailing observations to hold out. Must satisfy
            ``0 < n_test < len(data)``.
        cfg: Triple ``(order, seasonal_order, trend)`` forwarded to SARIMAX.

    Returns:
        Mean squared error between the held-out observations and the
        one-step-ahead predictions.

    Raises:
        ValueError: If ``n_test`` would leave the training or the test
            split empty.
    """
    # data[:-0] is empty and data[-0:] is the whole series, so n_test == 0
    # (or a negative / too-large value) must be rejected explicitly.
    if not 0 < n_test < len(data):
        raise ValueError(
            f'n_test must be between 1 and len(data) - 1, got {n_test}')
    # The configuration does not change between steps; unpack it once.
    order, sorder, trend = cfg
    train, test = data[:-n_test], data[-n_test:]
    history = list(train)
    predictions = []
    for actual in test:
        model = SARIMAX(history,
                        order=order, seasonal_order=sorder, trend=trend,
                        enforce_stationarity=False,
                        enforce_invertibility=False)
        model_fit = model.fit(disp=False)
        # start == end == len(history): predict only the next, unseen step.
        yhat = model_fit.predict(len(history), len(history))
        # predict() returns a length-1 array here; keep just the value.
        predictions.append(yhat[0])
        history.append(actual)
    return mean_squared_error(test, predictions)


def score_model(data, n_test, cfg, debug=False):
    """Score one configuration and return it together with its error.

    Args:
        data: Observations in time order, forwarded to
            ``walk_forward_validation``.
        n_test: Number of trailing observations held out for testing.
        cfg: Configuration forwarded to ``walk_forward_validation``.
        debug: When true, exceptions and warnings are not suppressed so a
            failing configuration can be inspected.

    Returns:
        Tuple ``(key, error)`` where ``key`` is ``str(cfg)`` and ``error`` is
        the validation error, or ``None`` if scoring failed.
    """
    key = str(cfg)
    if debug:
        error = walk_forward_validation(data, n_test, cfg)
    else:
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')
                error = walk_forward_validation(data, n_test, cfg)
        except Exception:
            # A failing configuration is skipped rather than aborting the
            # whole search. ``Exception`` (not a bare ``except``) keeps
            # KeyboardInterrupt / SystemExit able to stop the run.
            error = None
    if error is not None:
        print(f'> Model{key} {error:.3f}')
    return key, error

def grid_search(data, cfg_list, n_test, parallel=True):
    """Score every configuration and return the results sorted by error.

    Args:
        data: Observations in time order, forwarded to ``score_model``.
        cfg_list: Iterable of configurations to evaluate.
        n_test: Number of trailing observations held out for testing.
        parallel: When true, configurations are scored through a joblib
            ``Parallel`` executor with one job per CPU; otherwise they are
            scored one after another in this process.

    Returns:
        List of ``(key, error)`` tuples, lowest error first. Configurations
        whose score is ``None`` are left out.
    """
    if parallel:
        executor = joblib.Parallel(n_jobs=cpu_count(),
                                   backend='multiprocessing')
        tasks = (joblib.delayed(score_model)(data, n_test, cfg)
                 for cfg in cfg_list)
        scores = executor(tasks)
    else:
        scores = [score_model(data, n_test, cfg) for cfg in cfg_list]
    # Identity check: ``!= None`` would defer to the error object's __ne__.
    scores = [result for result in scores if result[1] is not None]
    scores.sort(key=lambda result: result[1])
    return scores

def sarima_config():
    """Build the list of SARIMA configurations to try.

    Returns:
        List of ``[(p, d, q), (P, D, Q, s), trend]`` entries covering every
        combination of the candidate values below, ordered with ``p`` varying
        slowest and ``trend`` varying fastest.
    """
    p_params = [0, 1, 2]
    d_params = [0, 1]
    q_params = [0, 1, 2]
    P_params = [0, 1, 2]
    D_params = [0, 1]
    Q_params = [0, 1, 2]
    # Original author's note: with s=0 the seasonal P, D, Q are all forced to 0.
    s_params = [2, 4, 12]
    t_params = ['n', 'c', 't', 'ct']
    # The loop variables must not reuse the candidate lists' names: rebinding
    # the list inside its own loop breaks the next pass of the outer loops.
    return [[(p, d, q), (P, D, Q, s), t]
            for p in p_params
            for d in d_params
            for q in q_params
            for P in P_params
            for D in D_params
            for Q in Q_params
            for s in s_params
            for t in t_params]


if __name__ == '__main__':
    # Load the series; the path is a placeholder to be replaced by the reader.
    df = pd.read_csv('filepath+filename.csv')
    data = df.values
    # Placeholder: set this to the number of trailing observations to hold out.
    n_test = number_of_test
    cfg_list = sarima_config()
    # grid_search expects (data, cfg_list, n_test); passing n_test second
    # would hand an int to the loop over configurations.
    scores = grid_search(data, cfg_list, n_test)
    print('Done')
    # Show the five configurations with the lowest error.
    for cfg, error in scores[:5]:
        print(cfg, error)

三、手工选参 vs 自动选参的 auto_arima()

3.1 针对如下的原始数据

在这里插入图片描述

3.2 应用auto_arima()简介&结果案例

pmdarima.arima.auto_arima():可以自动使用训练数据集得到参数。但是很多人都不用它,应该是因为它找到的参数并不怎么好的缘故。
例如,如下的直接使用的话并不好

import numpy as np
import pmdarima as pm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# 1/2 Model & results
df = pd.read_csv('path+filename')
y = df.values
# shuffle=False keeps the observations in time order; the default shuffles
# them, which destroys the temporal structure the model relies on.
train, test = train_test_split(y, train_size=0.7, shuffle=False)
# m is the s in (P, D, Q, s). m=10 was read off the line plot by eye
# (the x axis of the raw data is time).
model = pm.arima.auto_arima(train, seasonal=True, m=10)
print(model.summary())

# 2/2 Plot
forecasts = model.predict(len(test))
# x must span the whole series: it is plotted against y, and its tail
# x[len(train):] lines up with the len(test) forecasts.
x = np.arange(len(y))
plt.plot(x, y, c='black')
plt.plot(x[len(train):], forecasts, c='blue')
plt.show()

结果&图如下:
在这里插入图片描述
在这里插入图片描述

3.3 对照组:使用手工选参的模型

结果貌似更好些,从AIC/BIC/HQIC看。结果&图如下:
在这里插入图片描述在这里插入图片描述
代码如下

from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings

df = pd.read_csv('path+filename')
y = df.values
x = np.arange(len(y))

warnings.filterwarnings('ignore')

# shuffle=False keeps the observations in time order; the default shuffles
# them, which destroys the temporal structure the model relies on.
train, test = train_test_split(y, train_size=0.7, shuffle=False)

# Best parameter combination found by the manual grid search.
model = SARIMAX(train,
                order=(0, 1, 0),
                seasonal_order=(2, 1, 0, 10),
                trend='n')
model_fit = model.fit(disp=False)
print(model_fit.summary())

# Predict from index 1 through 10 steps past the end of the full series.
start, end = 1, len(y) + 10
f = model_fit.predict(start, end=end)
plt.plot(x, y, c='black')
# predict() includes both endpoints, so plot at indices start..end to keep
# each prediction aligned with the observation it refers to.
plt.plot(np.arange(start, end + 1), f, c='blue')
plt.show()
评论 10
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值