关于Statsmodels模块部分应用
回归分析–python应用篇(statsmodels)
以下代码取自 《Python数据分析》第七章 信号处理与时间序列
- 移动平均值
import matplotlib.pyplot as plt
import statsmodels.api as sm
from pandas.stats.moments import rolling_mean
data_loader=sm.datasets.sunspots.load_pandas()
df= data_loader.data
year=df['TEAR'].values
plt.plot(year,df['SUNACTIVITY'].values,label="Original") # 原始数据
plt.plot(year,df.rolling(window=11).mean()['SUNACTIVITY'].values,label='SMA 11') # 11均线
plt.plot(year,df.rolling(window=22).mean()['SUNACTIVITY'].values,label='SMA 22') # 22均线
plt.legend()
plt.show()
#还可以实现递减加权策略
weights=np.exp(np.linspace(-1.,0.,N))
weights/=weights.sum()
#和等量加权策略
def sma(arr,n):
weights=np.ones(n)/n
return np.convolve(weights,arr)[n-1,n+1]
- 窗口函数
窗口函数是定义在一个区间(窗口)上的函数,超出定义域,函数值取零。可以使用它们来分析频谱,设计滤波器等。
import matplotlib.pyplot as plt
import statsmodels.api as sm
from pandas.stats.moments import rolling_window
import pandas as pd
data_loader = sm.datasets.sunspots.load_pandas()
df = data_loader.data.tail(150)
df = pd.DataFrame({'SUNACTIVITY':df['SUNACTIVITY'].values},index = df['YESR'])
ax = df.plot()
def plot_window(wintype):
df2 = df.rolling(window=22,win_type=wintype,center=False,axis=0).mean()
df.columns=[wintype]
df2.plot(ax=ax)
plot_window('boxcar')
plot_window('triang')
plot_window('blackman')
plot_window('banning')
plot_window('bartlett')
plt.show()
- 协整
如果两个时间序列x(t)和y(t)的线性组合是稳态的,那么就称这两个序列具有共整合性或协整性。
import numpy as np
import statsmodels.api as sm
from pandas.stats.moments import rolling_window
import pandas as pd
import statsmodels.tsa.stattools as ts
def calc_adf(x,y):
result = sm.OLS(x,y).fit()
return ts.adfuller(result.resid)
data_loader=sm.datasets.sunspots.load_pandas()
data= data_loader.data.values
N=len(data)
t = np.linspace(-2*np.pi,2*np.pi,N)
sine = np.sin(np.sin(t))
print(calc_adf(sine,sine)) # 自相关
noise=np.random.normal(0,0.01,N)
print(calc_adf(sine,sine+noise)) # 添加噪音后的自相关
cosine= 100*np.cos(t)+10
print(calc_adf(sine,cosine+noise)) # 添加噪音的余弦波
print(calc_adf(sine,data)) # 太阳黑子与正弦之间的协整验证
- 自相关
自相关是数据集内部的相关性,可用来指明趋势。
import numpy as np
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import autocorrelation_plot
data_loader=sm.datasets.sunspots.load_pandas()
data= data_loader.data['SUNACTIVITY'].values
y = data-np.mean(data)
norm = np.sum(y **2)
correlated=np.correlate(y,y,model='full')/norm
res=correlated[len(correlated)/2:]
plt.plot(res)
plt.grid(True)
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.show() # plt绘图结果
autocorrelation_plot(data) #pandas绘图结果
plt.show()
- 自回归模型
自回归模型可以用于预测时间序列将来的值。
from scipy.optimize import leastsq
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
def model(p,x1,x10):
p1,p10 = p
return p1*x1+p10*x10
def error(p,data,x1,x10):
return data - model(p,x1,x10)
def fit(data):
p0=[0.5,0.5]
params = leastsq(error ,p0,args=(data[10:],data[9:-1],data[:-10]))[0]
return params
data_loader=sm.datasets.sunspots.load_pandas()
sunspots= data_loader.data['SUNACTIVITY'].values
cutoff = 0.9*len(sunspots)
params = fit(sunspots[:cutoff])
print(params)
pred = params[0]*sunspots[cutoff-1:-1]+params[1]*sunspots[cutoff-10:-10]
actual = sunspots[cutoff:]
year_range =data_loader.data['YEAR'].values[cutoff:]
plt.plot(year_range,actual,'o',label='Sunspots')
plt.plot(year_range,pred,'x',label='Prediction')
plt.grid(True) # 设置网格线
plt.xlabel('YEAR')
plt.ylabel('SUNACTIVITY')
plt.legend() # 设置图例
plt.show()
- ARMA模型
ARMA模型由自回归模型和移动平均模型结合而成,常用于时间序列的预测。
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
import datetime
data_loader=sm.datasets.sunspots.load_pandas()
df= data_loader.data
years=df['YEAR'].values.astype(int)
df.index = pd.Index(sm.tsa.datetools.dates_from_range(str(years[0]),str(years[-1])))
def df['YEAR']
model = sm.tsa.ARMA(df,(10,1)).fit()
prediction = model.predict('1975',str(years[-1]),dynamic =True)
df['1975'].plot()
prediction.plot(style ='--',label='Prediction')
plt.legend()
plt.show()
- 傅里叶分析
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
from scipy.fftpack import rfft,fftshift
data_loader=sm.datasets.sunspots.load_pandas()
sunspots= data_loader.data['SUNACTIVITY'].values
N=len(data)
t = np.linspace(-2*np.pi,2*np.pi,N)
mid=np.ptp(sunspots)/2
sine = mid+mid*np.sin(np.sin(t))
sine_fft = np.abs(fftshift(rfft(sine)))
np.argsort(sine_fft)[-5:] # 5个最大振幅的索引
transformed = np.abs(fftshift(rfft(sunspots)))
np.argsort(transformed)[-5:] # 5个最大振幅值
plt.subplot(311)
plt.plot(sunspots,label='Sunspots')
plt.plot(sine,lw=2,label='Sine')
plt.grid(True)
plt.legend()
plt.subplot(312)
plt.plot(transformed,label='Transformed Sunspots')
plt.grid(True)
plt.legend()
plt.subplot(313)
plt.plot(sine_fft,lw=2,label='Transformed Sine')
plt.grid(True)
plt.legend()
plt.show()