Day59
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
# 设置中文显示
plt.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体
plt.rcParams['axes.unicode_minus'] = False # 解决保存图像时负号'-'显示为方块的问题
# 1. 加载太阳黑子数据集
def load_sunspots_data():
"""加载太阳黑子数据集并返回时间序列"""
# 使用statsmodels内置的太阳黑子数据集
data = sm.datasets.sunspots.load_pandas()
df = data.data
# 设置年份为索引
df['YEAR'] = pd.to_datetime(df['YEAR'], format='%Y')
df.set_index('YEAR', inplace=True)
return df
# 2. 数据可视化与探索
def visualize_data(series):
"""可视化原始时间序列、分解图、ACF和PACF"""
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# 原始时间序列
axes[0, 0].plot(series)
axes[0, 0].set_title('原始太阳黑子序列')
# 分解图(使用11年周期,太阳黑子活动的平均周期)
decomposition = seasonal_decompose(series, model='additive', period=11)
decomposition.plot()
plt.tight_layout()
# ACF图
plot_acf(series, lags=40, ax=axes[0, 1])
axes[0, 1].set_title('原始序列的ACF')
# PACF图
plot_pacf(series, lags=40, ax=axes[1, 0])
axes[1, 0].set_title('原始序列的PACF')
plt.tight_layout()
plt.show()
# 打印基本统计信息
print(f"数据基本统计信息:")
print(series.describe())
# 3. 平稳性检验
def test_stationarity(series, title='原始序列'):
"""进行ADF检验并输出结果"""
print(f"\n{title}的ADF平稳性检验:")
result = adfuller(series)
print(f'ADF统计量: {result[0]:.4f}')
print(f'p值: {result[1]:.4f}')
print('临界值:')
for key, value in result[4].items():
print(f' {key}: {value:.4f}')
if result[1] <= 0.05:
print("序列平稳(拒绝原假设,p值 <= 0.05)")
return True
else:
print("序列非平稳(不能拒绝原假设,p值 > 0.05)")
return False
# 4. 确定差分阶数d和D
def determine_differencing(series):
"""确定差分阶数d(非季节性)和D(季节性)"""
# 检验原始序列
is_stationary = test_stationarity(series, '原始序列')
d = 0
if not is_stationary:
# 一阶非季节性差分
diff1 = series.diff().dropna()
is_stationary = test_stationarity(diff1, '一阶非季节性差分序列')
d = 1
if not is_stationary:
# 二阶非季节性差分
diff2 = diff1.diff().dropna()
is_stationary = test_stationarity(diff2, '二阶非季节性差分序列')
d = 2
# 季节性差分(使用11年周期)
seasonal_diff = series.diff(periods=11).dropna()
D = 1 if test_stationarity(seasonal_diff, '季节性差分序列') else 0
print(f"\n确定差分阶数: d = {d}, D = {D}")
# 可视化差分后的序列
if d > 0 or D > 0:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
if d == 1:
axes[0].plot(diff1)
axes[0].set_title('一阶非季节性差分序列')
elif d == 2:
axes[0].plot(diff2)
axes[0].set_title('二阶非季节性差分序列')
if D == 1:
axes[1].plot(seasonal_diff)
axes[1].set_title('季节性差分序列')
plt.tight_layout()
plt.show()
return d, D
# 5. 确定SARIMAX的p, q, P, Q参数
def determine_sarimax_params(series, d, D, s=11):
"""基于差分后的序列确定SARIMAX的p, q, P, Q参数"""
# 对序列进行差分
if d == 1:
diff_series = series.diff().dropna()
elif d == 2:
diff_series = series.diff().diff().dropna()
else:
diff_series = series
if D == 1:
diff_series = diff_series.diff(periods=s).dropna()
# 绘制ACF和PACF图
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
plot_acf(diff_series, lags=40, ax=axes[0])
axes[0].set_title('差分序列的ACF')
plot_pacf(diff_series, lags=40, ax=axes[1])
axes[1].set_title('差分序列的PACF')
plt.tight_layout()
plt.show()
# 根据ACF和PACF图确定参数
print("\n请根据ACF和PACF图确定SARIMAX参数:")
print("非季节性部分:")
print(" - PACF图中显著截断的滞后阶数为p")
print(" - ACF图中显著截断的滞后阶数为q")
print("季节性部分 (周期s=11):")
print(" - 季节性PACF图中显著截断的滞后阶数为P")
print(" - 季节性ACF图中显著截断的滞后阶数为Q")
# 使用自动化方法(AIC最小化)
best_aic = float('inf')
best_p, best_q, best_P, best_Q = 0, 0, 0, 0
# 设置参数搜索范围
max_p = 3
max_q = 3
max_P = 2
max_Q = 2
print("\n正在搜索最优SARIMAX参数组合...")
for p in range(0, max_p + 1):
for q in range(0, max_q + 1):
for P in range(0, max_P + 1):
for Q in range(0, max_Q + 1):
try:
model = sm.tsa.SARIMAX(
series,
order=(p, d, q),
seasonal_order=(P, D, Q, s),
enforce_stationarity=False,
enforce_invertibility=False
)
results = model.fit(disp=False)
if results.aic < best_aic:
best_aic = results.aic
best_p, best_q, best_P, best_Q = p, q, P, Q
print(f'SARIMAX({p},{d},{q})({P},{D},{Q},{s}) - AIC: {results.aic:.2f}')
except:
continue
print(f"\nAIC最优参数: p={best_p}, q={best_q}, P={best_P}, Q={best_Q} (AIC={best_aic:.2f})")
# 允许用户手动选择或使用AIC推荐值
use_auto = input("\n是否使用AIC推荐的参数?(y/n): ").lower().strip()
if use_auto == 'y':
p, q, P, Q = best_p, best_q, best_P, best_Q
else:
p = int(input("请输入p值: "))
q = int(input("请输入q值: "))
P = int(input("请输入P值: "))
Q = int(input("请输入Q值: "))
print(f"\n确定最终参数: p={p}, q={q}, P={P}, Q={Q}")
return p, q, P, Q
# 6. 构建并训练SARIMAX模型
def build_sarimax_model(series, p, d, q, P, D, Q, s=11):
"""构建并训练SARIMAX模型"""
print(f"\n构建SARIMAX模型: SARIMAX({p},{d},{q})({P},{D},{Q},{s})")
model = sm.tsa.SARIMAX(
series,
order=(p, d, q),
seasonal_order=(P, D, Q, s),
enforce_stationarity=False,
enforce_invertibility=False
)
model_fit = model.fit(disp=False)
print("\n模型摘要:")
print(model_fit.summary())
# 绘制残差图
residuals = pd.DataFrame(model_fit.resid)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# 残差时序图
axes[0].plot(residuals)
axes[0].set_title('模型残差')
# 残差直方图
residuals.hist(ax=axes[1])
axes[1].set_title('残差直方图')
plt.tight_layout()
plt.show()
# 残差的ACF和PACF图
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
plot_acf(residuals, lags=40, ax=axes[0])
axes[0].set_title('残差的ACF')
plot_pacf(residuals, lags=40, ax=axes[1])
axes[1].set_title('残差的PACF')
plt.tight_layout()
plt.show()
# Ljung-Box检验(残差白噪声检验)
lb_test = sm.stats.acorr_ljungbox(residuals, lags=[10], return_df=True)
print("\nLjung-Box检验(残差白噪声检验):")
print(lb_test)
# 正态性检验
print("\n残差正态性检验:")
print(f'偏度: {residuals.skew()[0]:.4f}')
print(f'峰度: {residuals.kurt()[0]:.4f}')
return model_fit
# 7. 模型预测与评估
def make_predictions(model_fit, series, steps=30, s=11):
"""进行模型预测并可视化结果"""
# 获取预测结果
forecast = model_fit.get_forecast(steps=steps)
forecast_mean = forecast.predicted_mean
forecast_ci = forecast.conf_int(alpha=0.05) # 95%置信区间
# 生成预测日期
last_date = series.index[-1]
forecast_dates = pd.date_range(start=last_date + pd.DateOffset(years=1), periods=steps, freq='AS')
# 可视化预测结果
plt.figure(figsize=(14, 7))
# 绘制历史数据
plt.plot(series.index, series, label='历史数据')
# 绘制预测数据
plt.plot(forecast_dates, forecast_mean, 'r--', label='预测值')
plt.fill_between(forecast_dates, forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1], color='r', alpha=0.1, label='95%置信区间')
plt.title('太阳黑子数量预测(SARIMAX模型)')
plt.xlabel('年份')
plt.ylabel('太阳黑子数量')
plt.legend()
plt.grid(True)
plt.show()
# 打印预测结果
print("\n未来30年太阳黑子数量预测:")
forecast_df = pd.DataFrame({
'年份': forecast_dates.year,
'预测值': forecast_mean.values,
'下限': forecast_ci.iloc[:, 0].values,
'上限': forecast_ci.iloc[:, 1].values
})
print(forecast_df.round(2))
# 可视化季节性模式
fig, ax = plt.subplots(figsize=(12, 6))
seasonal_effects = model_fit.params[model_fit.param_names[-s:]]
ax.bar(range(1, s+1), seasonal_effects)
ax.set_title('季节性影响因子')
ax.set_xlabel('季节周期(年)')
ax.set_ylabel('影响强度')
plt.xticks(range(1, s+1))
plt.show()
return forecast_df
# 主函数
def main():
print("===== 太阳黑子数据集SARIMAX分析 =====")
# 1. 加载数据
df = load_sunspots_data()
series = df['SUNACTIVITY']
# 2. 数据可视化
visualize_data(series)
# 3. 确定差分阶数
d, D = determine_differencing(series)
# 4. 确定SARIMAX参数
p, q, P, Q = determine_sarimax_params(series, d, D)
# 5. 构建并训练SARIMAX模型
model_fit = build_sarimax_model(series, p, d, q, P, D, Q)
# 6. 模型预测
forecast = make_predictions(model_fit, series, steps=30)
print("\n===== SARIMAX分析完成 =====")
if __name__ == "__main__":
main()
8744

被折叠的 条评论
为什么被折叠?



