数据来源于CensusAtSchool网站(https://new.censusatschool.org.nz/)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from statsmodels.sandbox.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.api import VAR
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import adfuller as ADF
# 多元时间序列分析 VAR
# 1. 读取数据
dt = pd.read_csv("NZAccomodation.csv")
dt['Date']=pd.to_datetime(dt['DATE'])
data = dt.drop(['Date'],axis=1)
data.set_index('DATE',inplace=True)
print(data.describe())
print(data.info())
# 2. 可视化 图像可以看出数据具有周期性
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig, ax = plt.subplots(3, 1, sharex='col', sharey='row') # 分区
ax[0].plot(data['Hotel'],color='green',marker='o',linestyle='dashed',linewidth=1)
ax[1].plot(data['BackPackers'],color='blue',marker='*',linestyle='dashed',linewidth=1)
ax[2].plot(data['Motel'],color='red',marker='+',linestyle='dashed',linewidth=1)
ax[0].set_ylabel("Hotel")
ax[1].set_ylabel("BackPackers")
ax[2].set_ylabel("Motel")
ax[0].set_title('新西兰住宿人数图')
tick_spacing = 20
ax[0].xaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))
plt.show()
# 3. 切分数据 75%训练 25%测试
trainnum = np.int64(data.shape[0]*0.75)
traindata = data.iloc[0:trainnum,:]
testdata = data.iloc[trainnum:data.shape[0],:]
print(traindata.shape) # (115, 3)
print(testdata.shape) # (39, 3)
# 4. 单位根检验:检验序列平稳性
def Adf_diy(data):
dftest = ADF(data,autolag='BIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','Lags Used','Number of Observations Used'])
print("检验结果:")
print(dfoutput)
Adf_diy(traindata.Hotel) # p-value 0.211125 不平稳
Adf_diy(traindata.BackPackers) # p-value 0.843871 不平稳
Adf_diy(traindata.Motel) # p-value 0.983517 不平稳
# ACF(自相关图)、PACF(偏自相关图)
def Acf(data):
f = plt.figure(facecolor='white')
ax1 = f.add_subplot(211)
plot_acf(data, lags=31, ax=ax1)
ax2 = f.add_subplot(212)
plot_pacf(data, lags=31, ax=ax2)
plt.show()
Acf(traindata.Hotel)
Acf(traindata.BackPackers)
Acf(traindata.Motel)
# 5.差分并输出图像
data1 = traindata['Hotel']
data2 = traindata['BackPackers']
data3 = traindata['Motel']
tr_diff = traindata.diff().diff()
ts_diff1 = data1.diff().diff() # 二阶差分
ts_diff2 = data2.diff().diff()
ts_diff3 = data3.diff().diff()
f = plt.figure(facecolor='white')
data1.plot(color='green', label='Hotel原始数据')
data2.plot(color='blue', label='BackPackers原始数据')
data3.plot(color='red', label='Motel原始数据')
ts_diff1.plot(color='green', linestyle='dashed',label='Hotel 2阶差分')
ts_diff2.plot(color='blue', linestyle='dashed',label='BackPackers 2阶差分')
ts_diff3.plot(color='red', linestyle='dashed',label='Motel 2阶差分')
# print(ts_diff1,ts_diff2,ts_diff3)
# plt.legend(loc='center left')
plt.title('原始数据(上)与差分后(下)')
plt.show()
# 检验一阶差分后序列是否平稳 二阶单整
tr_diff = tr_diff.dropna()
ts_diff1=ts_diff1.dropna() # 去除空值
ts_diff2=ts_diff2.dropna()
ts_diff3=ts_diff3.dropna()
Adf_diy(ts_diff1) # 一阶:p约等于0.0008 95%平稳
Adf_diy(ts_diff2) # p约等于0.1808 不平稳
Adf_diy(ts_diff3) # p约等于0.0791 90%下平稳
# 白噪声检验
def LB_test(timeseries):
[[lb], [p]] = acorr_ljungbox(timeseries, lags=1)
if p < 0.05:
print(u"原始序列为非白噪声序列")
else:
print(u"原始序列为白噪声序列")
LB_test(ts_diff1) # 差分后非白噪声序列
LB_test(ts_diff2)
LB_test(ts_diff3)
# 6. 协整(原始数据):存在则VECM(向量误差修正模型) 不存在则为伪回归
# EG两步协整检验法:仅适用于二元 1. 回归 2.残差单位根检验
# Johansen检验:特征根迹检验;最大特征值检验
# coint_johansen(endog, det_order, k_ar_diff)
# det_order(int):-1-无确定性 0-常数项 1-线性趋势
# k_ar_diff(非负int):模型中滞后差异的数量
def joh_output(res):
output = pd.DataFrame([res.lr2, res.lr1],
index=['max_eig_stat', "trace_stat"]) # max_eig_stat最大特征值 trace_stat 迹检验值
print(output.T, '\n')
print("Critical values(90%, 95%, 99%) of max_eig_stat\n", res.cvm, '\n') # 最大特征值检验临界值
print("Critical values(90%, 95%, 99%) of trace_stat\n", res.cvt, '\n') # 跟踪统计临界值
data = data.dropna()
joh_model1 = coint_johansen(data, 0, 1) # k_ar_diff +1 = K
joh_output(joh_model1) #(0,0)(0,1)(1,0)(1,1)(1,2)(1,3)(1,4)
def adjust(val, length= 6):
return str(val).ljust(length)
def cointegration_test(df, alpha=0.05):
"""Perform Johanson's Cointegration Test and Report Summary"""
out = coint_johansen(df,1,1)
d = {'0.90':0, '0.95':1, '0.99':2}
traces = out.lr1
cvts = out.cvt[:, d[str(1-alpha)]]
# Summary
print('Name : Test Stat > C(95%) => Signif \n', '--'*20)
for col, trace, cvt in zip(df.columns, traces, cvts):
print(adjust(col), ': ', adjust(round(trace,2), 9), ">", adjust(cvt, 8), ' => ' , trace > cvt)
data = data.dropna()
cointegration_test(data)
# 平稳后变量选择
# 7. 阶数确定:LR;信息准则(AIC BIC SC)
# VAR模型的滞后阶数越大,自由度就越小
# VAR(p) 根据bic来推荐最优的模型
model = VAR(tr_diff)
for i in range(20):
result = model.fit(i)
print('Lag Order =', i)
print('AIC : ', result.aic)
print('BIC : ', result.bic, '\n')
# 参数估计 拟合模型
model_fitted = model.fit(14)
model_fitted.summary()
#8. 模型检验:模型稳定性检验;残差检验;各阶系数联合显著性检验
out = durbin_watson(model_fitted.resid) # 杜宾-瓦特森检验
for col, val in zip(data.columns, out):
print(adjust(col), ':', round(val, 2)) # 检验值越接近2,说明模型越好
# 9. 模型应用:格兰杰因果检验(平稳);脉冲响应;方差分解;预测
# 预测
lag_order = model_fitted.k_ar
# print(lag_order)
forecast_input = tr_diff.values[-lag_order:]
# print(forecast_input)
fc = model_fitted.forecast(y=forecast_input, steps=39)
df_forecast = pd.DataFrame(fc, index=data.index[-39:], columns=data.columns + '_2d')
# print(df_forecast)
# 还原
def invert_transformation(df_train, df_forecast,second_diff=False):
df_fc = df_forecast.copy()
columns = df_train.columns
for col in columns:
# 二阶差分还原
if second_diff:
df_fc[str(col) + '_1d'] = (df_train[col].iloc[-1] - df_train[col].iloc[-2]) + df_fc[
str(col) + '_2d'].cumsum()
# 一阶差分还原
df_fc[str(col) + '_forecast'] = df_train[col].iloc[-1] + df_fc[str(col) + '_1d'].cumsum()
return df_fc
df_results = invert_transformation(traindata, df_forecast,second_diff=True)
# print(df_results.Hotel_forecast,df_results.BackPackers_forecast,df_results.Motel_forecast)
# 存入csv文件
# df_results.to_csv("NZA预测.csv")
# 可视化
fig, axes = plt.subplots(3, 1, dpi=150, figsize=(10,10))
for i, (col,ax) in enumerate(zip(data.columns, axes.flatten())):
df_results[col+'_forecast'].plot(legend=False, ax=ax).autoscale(axis='x',tight=True)
testdata[col][-39:].plot(legend=False, ax=ax)
ax.set_ylabel(col)
plt.show()