-
机器学习之ARIMA算法
-
原始数据
- 用ARIMA模型实现
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 23 16:58:08 2018
@author: muli
"""
# ARIMA time-series model: load the sales data and plot the raw series.
import matplotlib.pyplot as plt
import pandas as pd

# Script parameters: input workbook path and forecast horizon.
discfile = './data/arima_data.xls'
forecastnum = 5

# Read the workbook using the "日期" (date) column as the index.
# Per the original note, pandas recognizes that column as datetimes automatically.
data = pd.read_excel(discfile, index_col=u'日期')

# Matplotlib settings so that Chinese labels and the minus sign render correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Time-series plot of the raw data.
data.plot()
plt.show()
# Diagnostics on the raw and first-differenced series:
# autocorrelation plots, ADF stationarity tests and a Ljung-Box white-noise test.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import adfuller as ADF

# Autocorrelation plot of the raw series.
raw_acf_fig = plot_acf(data)
raw_acf_fig.show()

# ADF stationarity test on the raw sales column.
# Per the original note, the returned values are, in order: adf, pvalue,
# usedlag, nobs, critical values, icbest, regresults, resstore.
raw_adf_result = ADF(data[u'销量'])
print(u'原始序列的ADF检验结果为:', raw_adf_result)

# First-order difference of the series; the leading NaN row is dropped.
D_data = data.diff().dropna()
D_data.columns = [u'销量差分']

# Time-series plot of the differenced data.
D_data.plot()
plt.show()

# Autocorrelation plot of the differenced series.
diff_acf_fig = plot_acf(D_data)
diff_acf_fig.show()

# Partial autocorrelation plot of the differenced series.
diff_pacf_fig = plot_pacf(D_data)
diff_pacf_fig.show()

# ADF stationarity test on the differenced series.
diff_adf_result = ADF(D_data[u'销量差分'])
print(u'差分序列的ADF检验结果为:', diff_adf_result)

# White-noise (Ljung-Box) test on the differenced series;
# per the original note it returns the test statistic and the p-value.
ljungbox_result = acorr_ljungbox(D_data, lags=1)
print(u'差分序列的白噪声检验结果为:', ljungbox_result)
# Order selection: grid-search ARIMA(p, 1, q) over p, q in [0, len(D_data) // 10]
# and keep the (p, q) pair with the smallest BIC.
# NOTE(review): statsmodels.tsa.arima_model was deprecated in statsmodels 0.12 and
# removed in 0.13; on newer versions this import fails and the script must be
# ported to statsmodels.tsa.arima.model.ARIMA (whose forecast API differs) -- confirm
# the installed statsmodels version before changing it.
from statsmodels.tsa.arima_model import ARIMA

# The model is fitted on floating-point values.
data[u'销量'] = data[u'销量'].astype(float)

# Rule of thumb from the original author: the order should not exceed length / 10.
pmax = len(D_data) // 10
qmax = len(D_data) // 10

# bic_matrix[p][q] holds the BIC of ARIMA(p, 1, q), or NaN when the fit failed.
bic_matrix = []
for p in range(pmax + 1):
    tmp = []
    for q in range(qmax + 1):
        try:
            tmp.append(ARIMA(data, (p, 1, q)).fit().bic)
        except Exception:
            # Some (p, q) combinations fail to fit; skip them. NaN (rather than
            # None) keeps the table numeric so idxmin works on it, and catching
            # Exception (rather than a bare except) lets Ctrl-C through.
            tmp.append(float('nan'))
    bic_matrix.append(tmp)

bic_matrix = pd.DataFrame(bic_matrix)

# Flatten to a Series indexed by (p, q), drop the failed fits, and locate the minimum.
bic_values = bic_matrix.stack().dropna()
if bic_values.empty:
    raise ValueError('no ARIMA(p, 1, q) candidate could be fitted; cannot select an order')
p, q = bic_values.idxmin()
print(u'BIC最小的p值和q值为:%s、%s' %(p,q))
# Fit the final ARIMA(p, 1, q) model with the (p, q) order chosen by minimum BIC
# (ARIMA(0, 1, 1) for the sample data shown in the article).
model = ARIMA(data, (p, 1, q)).fit()

# Print a model report.
print(model.summary2())

# Forecast `forecastnum` periods ahead (5 days, as configured at the top of the
# script) instead of repeating the literal 5. With the legacy ARIMA results class
# used here, the printed tuple is (forecast values, standard errors, confidence
# intervals).
print(model.forecast(forecastnum))
- 模型报告及未来5天的预测结果(依次返回预测值、标准误差、置信区间)
BIC最小的p值和q值为:0、1
Results: ARIMA
====================================================================
Model: ARIMA BIC: 422.5101
Dependent Variable: D.销量 Log-Likelihood: -205.88
Date: 2018-12-23 21:47 Scale: 1.0000
No. Observations: 36 Method: css-mle
Df Model: 2 Sample: 01-02-2015
Df Residuals: 34 02-06-2015
Converged: 1.0000 S.D. of innovations: 73.086
AIC: 417.7595 HQIC: 419.418
----------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
----------------------------------------------------------------------
const 49.9555 20.1390 2.4805 0.0182 10.4838 89.4272
ma.L1.D.销量 0.6710 0.1648 4.0712 0.0003 0.3480 0.9941
-----------------------------------------------------------------------------
Real Imaginary Modulus Frequency
-----------------------------------------------------------------------------
MA.1 -1.4902 0.0000 1.4902 0.5000
====================================================================
(array([4873.96625288, 4923.92173955, 4973.87722621, 5023.83271288,
5073.78819955]), array([ 73.08574135, 142.32683622, 187.54287785, 223.8028904 ,
254.95712673]), array([[4730.72083205, 5017.2116737 ],
[4644.96626651, 5202.87721258],
[4606.29994008, 5341.45451235],
[4585.18710806, 5462.4783177 ],
[4574.08141355, 5573.49498555]]))