时间序列分析 - AirPassengers
加载pandas、matplotlib等包,处理时间序列
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
# 解决坐标轴刻度负号乱码
plt.rcParams['axes.unicode_minus'] = False
# 解决中文乱码问题
plt.rcParams['font.sans-serif'] = ['Simhei']
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 6
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv('AirPassengers.csv')
data.head()
Month | #Passengers | |
---|---|---|
0 | 1949-01 | 112 |
1 | 1949-02 | 118 |
2 | 1949-03 | 132 |
3 | 1949-04 | 129 |
4 | 1949-05 | 121 |
data.dtypes
Month object
#Passengers int64
dtype: object
data.columns
Index(['Month', '#Passengers'], dtype='object')
Month 是object类型,在时间序列分析中,我们需要现将数据转化为时间序列
第一种方法:将Month以索引读入,转化为时间格式
# 读取数据,pd.read_csv默认生成DataFrame对象,需将其转换成Series对象
df = pd.read_csv('AirPassengers.csv', encoding='utf-8', index_col='Month')
df.head()
#Passengers | |
---|---|
Month | |
1949-01 | 112 |
1949-02 | 118 |
1949-03 | 132 |
1949-04 | 129 |
1949-05 | 121 |
df.index = pd.to_datetime(df.index) # 将字符串索引转换成时间索引
df.head()
#Passengers | |
---|---|
Month | |
1949-01-01 | 112 |
1949-02-01 | 118 |
1949-03-01 | 132 |
1949-04-01 | 129 |
1949-05-01 | 121 |
ts = df['#Passengers'] # 生成pd.Series对象
# 查看数据格式
ts.head()
Month
1949-01-01 112
1949-02-01 118
1949-03-01 132
1949-04-01 129
1949-05-01 121
Name: #Passengers, dtype: int64
ts.head().index
DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01',
'1949-05-01'],
dtype='datetime64[ns]', name='Month', freq=None)
第二种方法:以时间格式读入Month,并将其设置为索引(推荐)
# pd.read_csv?
dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m')
dateparse('1962-01')
datetime.datetime(1962, 1, 1, 0, 0)
df = pd.read_csv('AirPassengers.csv', parse_dates=['Month'], index_col='Month', date_parser=dateparse)
df.head()
#Passengers | |
---|---|
Month | |
1949-01-01 | 112 |
1949-02-01 | 118 |
1949-03-01 | 132 |
1949-04-01 | 129 |
1949-05-01 | 121 |
df.shape
(144, 1)
# 检查索引格式
df.index
DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01',
'1949-05-01', '1949-06-01', '1949-07-01', '1949-08-01',
'1949-09-01', '1949-10-01',
...
'1960-03-01', '1960-04-01', '1960-05-01', '1960-06-01',
'1960-07-01', '1960-08-01', '1960-09-01', '1960-10-01',
'1960-11-01', '1960-12-01'],
dtype='datetime64[ns]', name='Month', length=144, freq=None)
# convert to time series:
ts = df['#Passengers']
ts.head(10)
Month
1949-01-01 112
1949-02-01 118
1949-03-01 132
1949-04-01 129
1949-05-01 121
1949-06-01 135
1949-07-01 148
1949-08-01 148
1949-09-01 136
1949-10-01 119
Name: #Passengers, dtype: int64
检索时间序列
检索特定时间的序列值
# 1. 检索特定时间的序列值
ts['1949-01-01']
112
# 2. 导入datetime的datetime模块,生成索引检索特定时间的序列值
from datetime import datetime
ts[datetime(1949,1,1)]
112
获取一定时间区间的序列值
# 1. 切片,指定区间
ts['1949-01-01':'1949-05-01']
Month
1949-01-01 112
1949-02-01 118
1949-03-01 132
1949-04-01 129
1949-05-01 121
Name: #Passengers, dtype: int64
# 2. 切片,从开始到指定时间点
ts[:'1949-05-01']
Month
1949-01-01 112
1949-02-01 118
1949-03-01 132
1949-04-01 129
1949-05-01 121
Name: #Passengers, dtype: int64
Note: ends included here
# 所有1949年的数据
ts['1949']
Month
1949-01-01 112
1949-02-01 118
1949-03-01 132
1949-04-01 129
1949-05-01 121
1949-06-01 135
1949-07-01 148
1949-08-01 148
1949-09-01 136
1949-10-01 119
1949-11-01 104
1949-12-01 118
Name: #Passengers, dtype: int64
min(ts.index), max(ts.index)
(Timestamp('1949-01-01 00:00:00'), Timestamp('1960-12-01 00:00:00'))
平稳性检验
绘制时序图
plt.plot(ts);
[外链图片转存(img-y6tQm2EE-1562729474892)(output_30_0.png)]
平稳性检验
# 移动平均
# pd.rolling_mean?
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
# Determing rolling statistics
rolmean = timeseries.rolling(window=12, center=False).mean()
rolstd = timeseries.rolling(window=12, center=False).std()
# Plot rolling statistics:
orig = plt.plot(timeseries, color='blue', label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
# Perform Dickey-Fuller test:
print('Results of Dickey-Fuller Test:')
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value ({})'.format(key)] = value
print(dfoutput)
test_stationarity(ts)
[外链图片转存(img-FSxx9adZ-1562729474893)(output_34_0.png)]
Results of Dickey-Fuller Test:
Test Statistic 0.815369
p-value 0.991880
#Lags Used 13.000000
Number of Observations Used 130.000000
Critical Value (1%) -3.481682
Critical Value (5%) -2.884042
Critical Value (10%) -2.578770
dtype: float64
p = 0.991880 > 0.05 序列非平稳, 同时也可以注意到均值和方差很不稳定
# acorr_ljungbox?
from statsmodels.stats.diagnostic import acorr_ljungbox
# 白噪声检验:Ljung-Box test
def randomness(ts, lags=10):
rdtest = acorr_ljungbox(ts,lags=lags)
# 对上述函数求得的值进行语义描述
rddata = np.c_[range(1,lags+1),rdtest[1:][0]]
rdoutput = pd.DataFrame(rddata,columns=['lags','p-value'])
return rdoutput.set_index('lags')
randomness(ts)
p-value | |
---|---|
lags | |
1.0 | 1.393231e-30 |
2.0 | 4.556318e-54 |
3.0 | 5.751088e-74 |
4.0 | 2.817731e-91 |
5.0 | 7.360195e-107 |
6.0 | 4.264008e-121 |
7.0 | 1.305463e-134 |
8.0 | 6.496271e-148 |
9.0 | 5.249370e-162 |
10.0 | 1.100789e-177 |
p = 1.3932314e-30 < 0.05,序列为非白噪声序列
平稳性处理 - 估计和消除趋势
消除趋势的第一个诀窍是对数变换。因为,能清楚地看到我们的数据呈现一个显著的上升趋势。因此,我们可以应用log变换。当然还有其他的变换方式,如:平方根,立方根等等。
[外链图片转存失败(img-lWuJFawA-1562729474893)(log 函数曲线.jpg)]
对数变换(log transform)
ts_log = np.log(ts)
# Plot original ts and log(ts):
plt.subplot(211)
plt.title('Original')
plt.plot(ts,label='Original ts')
plt.legend(loc='best')
plt.subplot(212)
plt.title('Log')
plt.plot(ts_log, label='Log(ts)')
plt.legend(loc='best'<