[数据挖掘] 44 时间序列预测3 自相关与ARIMA模型基于回归的预测美国铁路（美铁）客运公司数据

本文链接：https://blog.csdn.net/uforfor1/article/details/131796090

from statsmodels.graphics import tsaplots


# 序列自相关模型绘图
tsaplots.plot_acf(train_df['1991-01-01':'1993-01-01'].Ridership,lags=23)
# plt.show()

# 趋势与季节性效应的残差序列自相关
formula = 'Ridership~trend + np.square(trend) + C(Month)'
ridership_lm_trendseason = smf.ols(formula=formula, data=train_df_season).fit()
ridership_lm_trendseason.predict(train_df_season)

residual2 = train_df_season['Ridership'] - ridership_lm_trendseason.predict(train_df_season)
print(residual2[-1]) #12.107558346749784
tsaplots.plot_acf(residual2,lags=(len(residual2[:14])-1))
plt.title('趋势与季节性效应模型的残差序列自相关')
# plt.show()



def singleGraphLayout(ax, ylim, train_ts, test_ts):
    if test_ts is not None:
        ax.set_xlim('1990', '2004-6')
        ax.set_ylim(*ylim)
        ax.set_xlabel('Time')
        one_month = pd.Timedelta('31 days')
        xtrain = (min(train_ts.index), max(train_ts.index) - one_month)
        xtest = (min(test_ts.index) + one_month, max(test_ts.index) - one_month)
        xtv = xtrain[1] + 0.5 * (xtest[0] - xtrain[1])
        ypos = 0.9 * ylim[1] + 0.1 * ylim[0]
        ax.add_line(plt.Line2D(xtrain, (ypos, ypos), color='black', linewidth=0.5))
        ax.add_line(plt.Line2D(xtest, (ypos, ypos), color='black', linewidth=0.5))
        ax.axvline(x=xtv, ymin=0, ymax=1, color='black', linewidth=0.5)
        ypos = 0.925 * ylim[1] + 0.075 * ylim[0]
        ax.text('1995', ypos, 'Training')
        ax.text('2002-3', ypos, 'test')
    else:
        one_month = pd.Timedelta('31 days')
        xtrain = (min(train_df.index), max(train_df.index) - one_month)
        xtest = (min(test_df.index) + one_month, max(test_df.index) - one_month)
        xtv = xtrain[1] + 0.5 * (xtest[0] - xtrain[1])
        ax.set_xlim('1990', '2004-6')
        ax.set_ylim(-250, 250)
        ax.set_xlabel('Time')
        one_month = pd.Timedelta('31 days')
        xtrain = (min(train_ts.index), max(train_ts.index) - one_month)
        ypos = 0.9 * 250 + 0.1 * (-250)
        ax.add_line(plt.Line2D(xtrain, (ypos, ypos), color='black', linewidth=0.5))
        ax.axvline(x=xtv, ymin=0, ymax=1, color='black', linewidth=0.5)
        ypos = 0.925 * ylim[1] + 0.075 * ylim[0]
        ax.text('1995', 0.9 * 250 - 12.5, 'Training')
        ax.text('2002-3', 0.9 * 250 - 12.5, 'test')


def graphLayout(axes, train_ts, test_ts):
    if test_ts is not None:
        singleGraphLayout(axes[0], [1300, 2550], train_ts, test_ts)
        singleGraphLayout(axes[1], [-550, 550], train_ts, test_ts)
        test_ts.plot(y='Ridership', ax=axes[0], color='C0', linestyle='dashed', linewidth=0.75)
        train_ts.plot(y='Ridership', ax=axes[0], color='C0', linewidth=0.75)

        axes[1].axhline(y=0, xmin=0, xmax=1, color='black', linewidth=0.5)
        axes[0].set_xlabel('')
        axes[0].set_ylabel('Ridership (in 000s)')
        axes[1].set_ylabel('Forecast Errors')
        if axes[0].get_legend():
            axes[0].get_legend().remove()
    else:
        singleGraphLayout(axes, [-550, 550], train_ts, test_ts=None)
        axes.set_ylabel('Forecast Errors')


# 加入自相关信息提高预测准确性
formula = 'Ridership~trend + np.square(trend) + C(Month)'
train_lm_trendseason = smf.ols(formula=formula, data=train_df_season).fit()
train_res_arima = ARIMA(train_lm_trendseason.resid, order=(1, 0, 0), freq='MS', trend='n').fit()

forecast = train_res_arima.get_forecast(1) # 预测未来1个时间点
forecast_mean = forecast.predicted_mean[0]  # 获取预测值的均值
conf_int = forecast.conf_int() # 获取预测值的置信区间
conf_int_low, conf_int_high = conf_int.iloc[0,0], conf_int.iloc[0,1] # 获取置信区间上下限
# print(forecast)
print(conf_int)
print(f'Forecast {forecast_mean:.3f} [{conf_int_low:.3f}, {conf_int_high:.3f}]')
print(pd.DataFrame({'coef': train_res_arima.params, 'std err': train_res_arima.bse}))
# print(train_res_arima.summary())

ridership_trendseason = ridership_df
ridership_trendseason['Month'] = ridership_trendseason.index.month
train_df_trendseason = ridership_trendseason[:nTrain]
valid_df_trendseason = ridership_trendseason[nTrain:]

# 提取训练集的残差序列
fitted_values = train_res_arima.fittedvalues  # 拟合的值
pred_residuals = train_res_arima.resid        # 预测的残差
actual_rresiduals = train_df_trendseason['Ridership'] - train_lm_trendseason.fittedvalues  # 实际的残差

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(9, 3.725), dpi=200)
actual_rresiduals.plot(ax=axes, color='green')
pred_residuals.plot(ax=axes,color='purple')
graphLayout(axes, train_df_season, test_ts=None)
plt.show()

#预测残差之间的自相关性
tsaplots.plot_acf(pred_residuals,lags=(len(residual2[:14])-1))
plt.show()