1.https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/
2.https://blog.csdn.net/qq_30219017/article/details/79539376
3.pandas时间序列操作
1)数据读取:
# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pylab as plt
# 读取数据,pd.read_csv默认生成DataFrame对象,需将其转换成Series对象
df = pd.read_csv('AirPassengers.csv', encoding='utf-8', index_col='date')
df.index = pd.to_datetime(df.index) # 将字符串索引转换成时间索引
ts = df['x'] # 生成pd.Series对象
# 查看数据格式
ts.head()
ts.head().index
2)查看某日的值既可以使用字符串作为索引,又可以直接使用时间对象作为索引:
ts['1949']
ts[datetime(1949,1,1)]
3)切片操作:
ts['1949-1' : '1949-6']
注意时间索引的切片操作起点和尾部都是包含的,这点与数值索引有所不同
4)平稳性检验:
平稳时间序列有两种定义:严平稳和宽平稳
严平稳顾名思义,是一种条件非常苛刻的平稳性,它要求序列随着时间的推移,其统计性质保持不变。对于任意的τ,其联合概率密度函数满足:
严平稳的条件只是理论上的存在,现实中用得比较多的是宽平稳的条件。
宽平稳也叫弱平稳或者二阶平稳(均值和方差平稳),它应满足:
- 常数均值
- 常数方差
- 常数自协方差
平稳性检验:观察法和单位根检验法
注:对于时间序列判断上篇博客有介绍
from statsmodels.tsa.stattools import adfuller
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# 移动平均图
def draw_trend(timeSeries, size):
f = plt.figure(facecolor='white')
# 对size个数据进行移动平均
rol_mean = timeSeries.rolling(window=size).mean()
# 对size个数据进行加权移动平均
rol_weighted_mean = pd.ewma(timeSeries, span=size)
timeSeries.plot(color='blue', label='Original')
rolmean.plot(color='red', label='Rolling Mean')
rol_weighted_mean.plot(color='black', label='Weighted Rolling Mean')
plt.legend(loc='best')
plt.title('Rolling Mean')
plt.show()
def draw_ts(timeSeries):
f = plt.figure(facecolor='white')
timeSeries.plot(color='blue')
plt.show()
def testStationarity(ts):
dftest = adfuller(ts)
# 对上述函数求得的值进行语义描述
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
return dfoutput
# 自相关和偏相关图,默认阶数为31阶
def draw_acf_pacf(ts, lags=31):
f = plt.figure(facecolor='white')
ax1 = f.add_subplot(211)
plot_acf(ts, lags=31, ax=ax1)
ax2 = f.add_subplot(212)
plot_pacf(ts, lags=31, ax=ax2)
plt.show()
5)平稳性处理:
a. 对数变换
对数变换主要是为了减小数据的振动幅度,使其线性规律更加明显。对数变换相当于增加了一个惩罚机制,数据越大其惩罚越大,数据越小惩罚越小。这里强调一下,变换的序列需要满足大于0,小于0的数据不存在对数变换。
ts_log = np.log(ts)
test_stationarity.draw_ts(ts_log)
b. 平滑法
根据平滑技术的不同,平滑法具体分为移动平均法和指数平均法。
移动平均即利用一定时间间隔内的平均值作为某一期的估计值,而指数平均则是用变权的方法来计算均值
test_stationarity.draw_trend(ts_log, 12)
指数平均法是对周期内的数据进行了加权,能在一定程度上减小年周期因素,但并不能完全剔除,如要完全剔除可以进一步进行差分操作。
c. 差分
时间序列最常用来剔除周期性因素的方法当属差分了,它主要是对等周期间隔的数据进行线性求减。前面我们说过,ARIMA模型相对ARMA模型,仅多了差分操作,ARIMA模型几乎是所有时间序列软件都支持的,差分的实现与还原都非常方便。而statsmodel中,对差分的支持不是很好,它不支持高阶和多阶差分。我们可以先用pandas将序列差分好,然后在对差分好的序列进行ARIMA拟合,只不过这样后面会多了一步人工还原的工作。
diff_12 = ts_log.diff(12)#12阶差分
diff_12.dropna(inplace=True)
diff_12_1 = diff_12.diff(1)
diff_12_1.dropna(inplace=True)
test_stationarity.testStationarity(diff_12_1)
d. 分解
所谓分解就是将时序数据分离成不同的成分。statsmodels使用的X-11分解过程,它主要将时序数据分离成长期趋势、季节趋势和随机成分。与其它统计软件一样,statsmodels也支持两类分解模型,加法模型和乘法模型,这里我只实现加法,乘法只需将model的参数设置为"multiplicative"即可。
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(ts_log, model="additive")
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
完整代码:
#https://www.cnblogs.com/foley/p/5582358.html
import pandas as pd
import numpy as np
from statsmodels.tsa.arima_model import ARMA
import sys
from dateutil.relativedelta import relativedelta
from copy import deepcopy
import matplotlib.pyplot as plt
class arima_model:
def __init__(self, ts, maxLag=9):
self.data_ts = ts
self.resid_ts = None
self.predict_ts = None
self.maxLag = maxLag
self.p = maxLag
self.q = maxLag
self.properModel = None
self.bic = sys.maxint
# 计算最优ARIMA模型,将相关结果赋给相应属性
def get_proper_model(self):
self._proper_model()
self.predict_ts = deepcopy(self.properModel.predict())
self.resid_ts = deepcopy(self.properModel.resid)
# 对于给定范围内的p,q计算拟合得最好的arima模型,这里是对差分好的数据进行拟合,故差分恒为0
def _proper_model(self):
for p in np.arange(self.maxLag):
for q in np.arange(self.maxLag):
# print p,q,self.bic
model = ARMA(self.data_ts, order=(p, q))
try:
results_ARMA = model.fit(disp=-1, method='css')
except:
continue
bic = results_ARMA.bic
# print 'bic:',bic,'self.bic:',self.bic
if bic < self.bic:
self.p = p
self.q = q
self.properModel = results_ARMA
self.bic = bic
self.resid_ts = deepcopy(self.properModel.resid)
self.predict_ts = self.properModel.predict()
# 参数确定模型
def certain_model(self, p, q):
model = ARMA(self.data_ts, order=(p, q))
try:
self.properModel = model.fit( disp=-1, method='css')
self.p = p
self.q = q
self.bic = self.properModel.bic
self.predict_ts = self.properModel.predict()
self.resid_ts = deepcopy(self.properModel.resid)
except:
print 'You can not fit the model with this parameter p,q, ' \
'please use the get_proper_model method to get the best model'
# 预测第二日的值
def forecast_next_day_value(self, type='day'):
# 我修改了statsmodels包中arima_model的源代码,添加了constant属性,需要先运行forecast方法,为constant赋值
self.properModel.forecast()
if self.data_ts.index[-1] != self.resid_ts.index[-1]:
raise ValueError('''The index is different in data_ts and resid_ts, please add new data to data_ts.
If you just want to forecast the next day data without add the real next day data to data_ts,
please run the predict method which arima_model included itself''')
if not self.properModel:
raise ValueError('The arima model have not computed, please run the proper_model method before')
para = self.properModel.params
# print self.properModel.params
if self.p == 0: # It will get all the value series with setting self.data_ts[-self.p:] when p is zero
ma_value = self.resid_ts[-self.q:]
values = ma_value.reindex(index=ma_value.index[::-1])
elif self.q == 0:
ar_value = self.data_ts[-self.p:]
values = ar_value.reindex(index=ar_value.index[::-1])
else:
ar_value = self.data_ts[-self.p:]
ar_value = ar_value.reindex(index=ar_value.index[::-1])
ma_value = self.resid_ts[-self.q:]
ma_value = ma_value.reindex(index=ma_value.index[::-1])
values = ar_value.append(ma_value)
predict_value = np.dot(para[1:], values) + self.properModel.constant[0]
self._add_new_data(self.predict_ts, predict_value, type)
return predict_value
# 动态添加数据函数,针对索引是月份和日分别进行处理
def _add_new_data(self, ts, dat, type='day'):
if type == 'day':
new_index = ts.index[-1] + relativedelta(days=1)
elif type == 'month':
new_index = ts.index[-1] + relativedelta(months=1)
ts[new_index] = dat
def add_today_data(self, dat, type='day'):
self._add_new_data(self.data_ts, dat, type)
if self.data_ts.index[-1] != self.predict_ts.index[-1]:
raise ValueError('You must use the forecast_next_day_value method forecast the value of today before')
self._add_new_data(self.resid_ts, self.data_ts[-1] - self.predict_ts[-1], type)
if __name__ == '__main__':
df = pd.read_csv('AirPassengers.csv', encoding='utf-8', index_col='date')
df.index = pd.to_datetime(df.index)
ts = df['x']
# 数据预处理
ts_log = np.log(ts)
rol_mean = ts_log.rolling(window=12).mean()
rol_mean.dropna(inplace=True)
ts_diff_1 = rol_mean.diff(1)
ts_diff_1.dropna(inplace=True)
ts_diff_2 = ts_diff_1.diff(1)
ts_diff_2.dropna(inplace=True)
# 模型拟合
model = arima_model(ts_diff_2)
# 这里使用模型参数自动识别
model.get_proper_model()
print 'bic:', model.bic, 'p:', model.p, 'q:', model.q
print model.properModel.forecast()[0]
print model.forecast_next_day_value(type='month')
# 预测结果还原
predict_ts = model.properModel.predict()
diff_shift_ts = ts_diff_1.shift(1)
diff_recover_1 = predict_ts.add(diff_shift_ts)
rol_shift_ts = rol_mean.shift(1)
diff_recover = diff_recover_1.add(rol_shift_ts)
rol_sum = ts_log.rolling(window=11).sum()
rol_recover = diff_recover*12 - rol_sum.shift(1)
log_recover = np.exp(rol_recover)
log_recover.dropna(inplace=True)
# 预测结果作图
ts = ts[log_recover.index]
plt.figure(facecolor='white')
log_recover.plot(color='blue', label='Predict')
ts.plot(color='red', label='Original')
plt.legend(loc='best')
plt.title('RMSE: %.4f'% np.sqrt(sum((log_recover-ts)**2)/ts.size))
plt.show()