2021-10-12

This post uses Python to forecast the price of Chongqing carbon emission allowances as a time series, combining empirical mode decomposition (EMD), SVM, LSTM, and ARIMA-family methods. The data are first preprocessed, then decomposed with EMD to extract the main trend. An ADF test checks the stationarity of the series before an ARIMA-type (SARIMAX) model produces short-term forecasts; an SVM regressor models the series over a sliding window; and an LSTM network is used for longer-term prediction. The experiments show that each model has its own strengths and weaknesses in forecasting the allowance price.


#!/usr/bin/env python
# coding: utf-8

# In[3]:


import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# workaround for the duplicate-OpenMP-runtime crash; this setting is reported
# to carry other risks, so use it with care

from datetime import datetime  # used to convert the data index to timestamps
from math import sqrt

import numpy as np
import pandas as pd
from pandas import DataFrame, Series, concat
import matplotlib.pyplot as plt

import scipy.signal as signal
from scipy import interpolate

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense, LSTM

import statsmodels.api as sm  # ACF/PACF plots
from statsmodels.tsa.stattools import adfuller  # ADF test
from statsmodels.tsa.statespace.sarimax import SARIMAX

from pyhht.emd import EMD
from pyhht.visualization import plot_imfs

# note: the original `from pandas import datetime` and
# `from statsmodels.tsa.arima_model import ARIMA` have been removed from
# current pandas/statsmodels releases; datetime and SARIMAX are used instead


# In[4]:


df1 = pd.read_excel('国内外碳排放交易.xlsx')
df1.index = df1.指标名称  # use the date column 指标名称 as the index

df1 = df1.dropna(axis=0, thresh=4)  # drop rows with fewer than 4 non-NA values


# In[5]:


gsyh = df1['重庆碳排放权(CQEA):成交均价'].pct_change()  # daily simple returns
print(type(gsyh))
plt.subplot(221)  # top-left panel
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False
plt.title('碳排放收益率频数分布图')
plt.xlabel('收益率')
plt.ylabel('天数')
plt.hist(gsyh.dropna())  # drop the leading NaN from pct_change before binning
plt.show()


# In[6]:


cum_daily_return = (1 + gsyh).cumprod()  # cumulative daily return

# print the cumulative daily return
print(cum_daily_return)

# plot the cumulative daily return curve
plt.subplot(222)
cum_daily_return.plot(figsize=(12, 8))

plt.xlabel('天数')
plt.title('碳排放累计收益率')


# In[7]:


df = df1['重庆碳排放权(CQEA):成交均价']

df.index = pd.to_datetime(df1.index)


# In[8]:


# log returns: r_t = ln(P_t / P_{t-1})
logret = np.log(df / df.shift(1))[1:]

# label each log return with its calendar year (logret.index already starts at
# the second observation, so dates and returns line up)
year = [d.strftime("%Y") for d in logret.index]

y = pd.DataFrame(logret.values, index=year, columns=['年收益率'])

# annual simple return: exp(sum of the year's log returns) - 1
ret_annual = np.exp(y.groupby(y.index).sum()) - 1

ret_annual


# In[9]:


ret_annual.plot(figsize=(12,8))


# In[10]:



##############  EMD decomposition

# a series counts as "monotonic" for sifting once it has no interior extrema left
def ismonotonic(x):
    max_peaks = signal.argrelextrema(x, np.greater)[0]
    min_peaks = signal.argrelextrema(x, np.less)[0]
    all_num = len(max_peaks) + len(min_peaks)
    if all_num > 0:
        return False
    else:
        return True


# indices of the local maxima of the series
def findpeaks(x):
    return signal.argrelextrema(x, np.greater)[0]


# In[70]:



# IMF test: the number of zero crossings and the number of extrema
# may differ by at most one
def isImf(x):
    N = np.size(x)
    pass_zero = np.sum(x[0:N-1] * x[1:N] < 0)  # zero crossings (adjacent sign changes)
    peaks_num = np.size(findpeaks(x)) + np.size(findpeaks(-x))  # number of extrema
    if abs(pass_zero - peaks_num) > 1:
        return False
    else:
        return True



# In[71]:


# upper-envelope spline through the local maxima
def getspline(x):
    N = np.size(x)
    peaks = findpeaks(x)
    print('number of extrema:', len(peaks))
    if len(peaks) <= 3:
        if len(peaks) < 2:
            # pad with the endpoints so there are enough knots to interpolate
            peaks = np.concatenate(([0], peaks))
            peaks = np.concatenate((peaks, [N-1]))
        # lower the spline degree to match the number of knots
        t = interpolate.splrep(peaks, y=x[peaks], w=None, xb=None, xe=None, k=len(peaks)-1)
        return interpolate.splev(np.arange(N), t)
    t = interpolate.splrep(peaks, y=x[peaks])
    return interpolate.splev(np.arange(N), t)
#     f=interp1d(np.concatenate(([0,1],peaks,[N+1])),np.concatenate(([0,1],x[peaks],[0])),kind='cubic')
#     f=interp1d(peaks,x[peaks],kind='cubic')
#     return f(np.linspace(1,N,N))


# In[72]:


# empirical mode decomposition via sifting
def emd(x):
    imf = []
    while not ismonotonic(x):
        x1 = x
        sd = np.inf
        # sift until the normalized squared difference drops below 0.1
        # and the candidate passes the IMF test
        while sd > 0.1 or (not isImf(x1)):
            print(isImf(x1))
            s1 = getspline(x1)        # upper envelope
            s2 = -getspline(-1 * x1)  # lower envelope
            x2 = x1 - (s1 + s2) / 2   # subtract the envelope mean
            sd = np.sum((x1 - x2)**2) / np.sum(x1**2)
            x1 = x2
        imf.append(x1)
        x = x - x1  # remove the extracted IMF and continue on the residue
    imf.append(x)   # final monotonic residue
    return imf
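
# Usage sketch for the hand-rolled emd() above (toy signal, illustrative only;
# the notebook itself uses pyhht's EMD below, which rebinds the name `emd`):
# _t = np.linspace(0, 1, 500)
# _sig = np.sin(8 * np.pi * _t) + 0.5 * _t
# _imfs = emd(_sig)
# print('extracted %d IMFs plus a residue' % (len(_imfs) - 1))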


# In[11]:


# pyhht implementation; note this rebinds the name `emd`, shadowing the
# hand-rolled function defined above
emd = EMD(df[:10000])
imfs = emd.decompose()
plt.plot(df)


# In[12]:



df = df.ffill().fillna(0)  # forward-fill gaps, then zero out any leading NaNs
temp = np.array(df.diff())
t = adfuller(df)  # ADF test on the level series


# In[13]:



output = pd.DataFrame(
    {'value': [t[0], t[1], t[2], t[3], t[4]['1%'], t[4]['5%'], t[4]['10%']]},
    index=['Test Statistic Value', 'p-value', 'Lags Used',
           'Number of Observations Used', 'Critical Value(1%)',
           'Critical Value(5%)', 'Critical Value(10%)'])
output
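
# Interpretation sketch (the standard ADF reading, not from the original): a
# p-value above 0.05 means the level series is non-stationary, so difference
# once and re-test before fitting ARIMA-family models.
if t[1] > 0.05:
    print('p-value after one difference:', adfuller(df.diff().dropna())[1])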


# In[14]:


plt.figure(figsize=(20,12))

plot_imfs(df , imfs, df.index) 


# In[15]:


# plot each IMF in its own figure; iterate over however many IMFs were extracted
for i in range(len(imfs)):
    plt.figure(figsize=[40, 20])
    plt.subplot(311)
    plt.plot(df.index, imfs[i], 'b', label="IMF %d" % i)
    plt.xlabel("Time")
    plt.ylabel("signal")
    plt.title("IMF %d" % i)


# In[16]:


print(np.mean(imfs, axis=1))  # time-average of each IMF (near zero except the residue)


# In[17]:


plt.figure(figsize=[40, 20])
plt.subplot(311)
# low-frequency reconstruction: IMFs 2 and 5 (grouped by inspection)
plt.plot(df.index, imfs[2] + imfs[5], 'b', label="low frequency")
plt.xlabel("Time")
plt.ylabel("signal")


# In[18]:


plt.figure(figsize=[40, 20])
plt.subplot(311)
# high-frequency reconstruction: the remaining IMFs
plt.plot(df.index, imfs[0] + imfs[1] + imfs[3] + imfs[4] + imfs[6], 'b', label="high frequency")
plt.xlabel("Time")
plt.ylabel("signal")




# In[20]:


# stack the two reconstructions into a 2 x T frame (rows 低频/高频, columns = dates)
df2 = pd.DataFrame([imfs[2] + imfs[5], imfs[0] + imfs[1] + imfs[3] + imfs[4] + imfs[6]],
                   columns=df.index, index=['低频', '高频'])


# In[21]:


df2.T.to_csv('cq4.csv')


# In[22]:



#########  SARIMAX
df1 = pd.read_csv('cq4.csv')

df1 = df1.dropna(axis=0, thresh=3)

df1 = df1.ffill()  # forward-fill gaps (fillna(method='pad') is deprecated)

df1



df1.index = pd.to_datetime(df1.指标名称)
df2 = df1['低频']  # model the low-frequency component
C_diff = df2.diff()

# In[111]:


df2[-20:]

# In[73]:


C_week = df2.resample('W-MON').mean()  # weekly means (weeks starting Monday)
C_year = df1.resample('Y').sum()       # yearly totals
C_day = df2.resample('D').mean()       # daily means
df_week_test = C_week[-20:]            # hold out the last 20 weeks

# In[74]:


C_day

# In[75]:


C_diff = C_day.diff()

# In[49]:


C_week[-20:]

# In[15]:


C_year.to_csv("重庆C成交量.csv")

# In[16]:


# In[76]:


plt.figure()
plt.plot(C_diff)

# In[78]:


fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(C_diff.dropna(), lags=20, ax=ax1)   # drop NaNs from diff/resample
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(C_diff.dropna(), lags=20, ax=ax2)
plt.show()

# In[93]:


temp = np.array(df2)

# In[94]:


t = adfuller(df2)  # ADF检验

# In[95]:


output = pd.DataFrame(
    {'value': [t[0], t[1], t[2], t[3], t[4]['1%'], t[4]['5%'], t[4]['10%']]},
    index=['Test Statistic Value', 'p-value', 'Lags Used',
           'Number of Observations Used', 'Critical Value(1%)',
           'Critical Value(5%)', 'Critical Value(10%)'])
output

# In[29]:


# To keep the computation manageable, cap the AR order at 6 and the MA order at 4.
# sm.tsa.arma_order_select_ic(df2, max_ar=6, max_ma=4, ic='aic')['aic_min_order']  # AIC

# BIC


# In[96]:


# no order is passed here, so SARIMAX falls back to its default order (1, 0, 0)
mod = sm.tsa.statespace.SARIMAX(df2, enforce_stationarity=False, enforce_invertibility=False)
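
# Order-selection sketch (an illustrative assumption, not the author's
# procedure): pick (p, q) for a first-differenced model by AIC over a small grid.
best = None
for p in range(4):
    for q in range(4):
        aic = sm.tsa.statespace.SARIMAX(
            df2, order=(p, 1, q),
            enforce_stationarity=False, enforce_invertibility=False
        ).fit(disp=False).aic
        if best is None or aic < best[0]:
            best = (aic, p, q)
print(best)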

# In[97]:


C_week[:-10]

# In[98]:


res = mod.fit()

# In[99]:


# res.get_prediction(start=pd.to_datetime('2017-12-1'))


# In[100]:


# # inspect the trend via seasonal decomposition
# plt.figure(figsize=[15, 7])
# sm.tsa.seasonal_decompose(df2).plot()
# print("air_passengers test: p={}".format(adfuller(df2)[1]))
# # air_passengers test: p=0.996129346920727


# In[101]:


# model = ARIMA(df2, order=(2,1,4)).fit()  # (arima_model.ARIMA has since been removed from statsmodels)

# model.summary2()        # produce a model report


# In[102]:


sm.graphics.tsa.plot_acf(res.resid.values.squeeze(), lags=48)

# In[103]:


predictions_ARIMA_diff = pd.Series(res.fittedvalues, copy=True)
print(predictions_ARIMA_diff.head())

# In[104]:


# Residual diagnostics below: the residuals appear approximately normal and
# show no remaining autocorrelation.
res.plot_diagnostics(lags=30, figsize=(16, 12))


# In[105]:


def SARIMA_model(series, predict_num=1):  # predict_num: number of steps to forecast ahead
    # refit on the growing series and append one one-step-ahead forecast per step
    for i in range(predict_num):
        model = SARIMAX(series, order=(3, 0, 4))
        model_fit = model.fit(disp=False)
        predictions = model_fit.predict(start=len(series), end=len(series), dynamic=False)
        series = pd.concat([series, predictions])  # Series.append was removed in pandas 2.x
    return series
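
# Usage sketch (illustrative 3-step horizon, not from the original); note that
# each appended step refits the model on the grown series:
# extended = SARIMA_model(C_week, predict_num=3)
# extended[-3:]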


# In[122]:


from statsmodels.tsa.statespace.sarimax import SARIMAX

df_month2 = df_week_test
# best_model.predict() takes explicit start/end timestamps
# an inverse Box-Cox step would go here if the series had been Box-Cox transformed
# extend the 20-step SARIMAX forecast by one more recursive step (21 values in all)
df_month2['forecast'] = SARIMA_model(res.forecast(steps=20), 1)
plt.figure(figsize=(15, 7))

# compute the MSE
# print('mean_squared_error: {}'.format(mean_squared_error(df_month2.y,  df_month2.forecast)))


# In[115]:


df2

# In[121]:


df_month2['forecast'].index

# In[129]:


df2.plot()
df_month2.forecast.plot(color='r', ls='--', label='Predicted')  # forecast overlay

# In[126]:


# realign the forecast onto the last dates of the low-frequency series (lengths must agree)
df_month2['forecast'].index = df2.index[-21:]

# In[127]:


df_month2['forecast']

# In[128]:
df_SMA = df_month2['forecast'][-20:]  # keep the last 20 forecasts for the comparison

# In[ ]:


plt.show()


# In[23]:


df_month2['forecast'][-20:]


# In[24]:



##          SVM model



# forecast horizon (number of held-out points)
long_predict = 20


def svm_timeseries_prediction(c_parameter, gamma_parameter):
    X_data = df_cq.指标名称   # dates (x-axis)
    Y_data = df_cq['低频']    # low-frequency component (target)
    print(len(X_data))
    # total length of the data
    long = len(X_data)
    # window size: how many past values are used to predict the next one
    X_long = 1
    error = []
    svr_rbf = SVR(kernel='rbf', C=c_parameter, gamma=gamma_parameter)
    # svr_rbf = SVR(kernel='rbf', C=1e5, gamma=1e1)
    # svr_rbf = SVR(kernel='linear',C=1e5)
    # svr_rbf = SVR(kernel='poly',C=1e2, degree=1)
    X = []
    Y = []
    for k in range(len(X_data) - X_long - 1):
        t = k + X_long
        X.append(Y_data[k:t])
        # note: the target is Y_data[t + 1], two steps past the window's last
        # element, so this predicts slightly further ahead than a lag-1 setup
        Y.append(Y_data[t + 1])
    # train on all but the last long_predict windows, then predict over everything
    y_rbf = svr_rbf.fit(X[:-long_predict], Y[:-long_predict]).predict(X[:])
    for e in range(len(y_rbf)):
        error.append(Y_data[X_long + 1 + e] - y_rbf[e])
    return X_data, Y_data, X_data[X_long + 1:], y_rbf, error
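
# For reference, a standard lag-window construction (a sketch, not the author's
# code): here the target is the value immediately after the window, unlike the
# two-steps-ahead target used above.
def make_windows(values, window=1):
    X, y = [], []
    for k in range(len(values) - window):
        X.append(values[k:k + window])  # window of past values
        y.append(values[k + window])    # the next value
    return np.array(X), np.array(y)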


# In[34]:


df_cq = pd.read_csv("cq4.csv")
df_cq

# In[35]:


X_data, Y_data, X_prediction, y_prediction, error = svm_timeseries_prediction(10, 1)

# In[36]:


plt.plot(X_data, Y_data, label='data', color='green', linestyle='-')
# note: this vertical marker sits long_predict points from the start of the
# series, although the held-out segment is the last long_predict points
plt.axvline(x=X_data[long_predict], alpha=0.2, color='gray')
plt.plot(X_prediction, y_prediction, label='prediction', color='red', linestyle='--')

# In[45]:


# np.array(y_prediction[-20:])

# In[43]:


# tick_plot.plot(X_prediction,error)
# plt.show()
error

# In[41]:


df_SVM = y_prediction[-20:]  # SVM forecasts over the held-out window

# In[42]:


Y_data

# In[ ]:






# In[25]:




###   LSTM model

# date-time parsing function for loading the dataset (kept for reference; the
# read_csv call below relies on parse_dates instead)
def parser(x):
  return datetime.strptime(x, '%d.%m.%Y')

# frame a sequence as a supervised learning problem
def timeseries_to_supervised(data, lag=1):
  df = DataFrame(data)
  columns = [df.shift(i) for i in range(1, lag+1)]
  columns.append(df)
  df = concat(columns, axis=1)
  df.fillna(0, inplace=True)
  return df

# create a differenced series
def difference(dataset, interval=1):
  diff = list()
  for i in range(interval, len(dataset)):
      value = dataset[i] - dataset[i - interval]
      diff.append(value)
  return Series(diff)

# invert differenced value
def inverse_difference(history, yhat, interval=1):
  return yhat + history[-interval]

# scale train and test data to [-1, 1]
def scale(train, test):
  # fit scaler
  scaler = MinMaxScaler(feature_range=(-1, 1))
  scaler = scaler.fit(train)
  # transform train
  train = train.reshape(train.shape[0], train.shape[1])
  train_scaled = scaler.transform(train)
  # transform test
  test = test.reshape(test.shape[0], test.shape[1])
  test_scaled = scaler.transform(test)
  return scaler, train_scaled, test_scaled

# inverse scaling for a forecasted value
def invert_scale(scaler, X, value):
  new_row = [x for x in X] + [value]
  array = np.array(new_row)
  array = array.reshape(1, len(array))
  inverted = scaler.inverse_transform(array)
  return inverted[0, -1]

# fit an LSTM network to training data
def fit_lstm(train, batch_size, nb_epoch, neurons):
  X, y = train[:, 0:-1], train[:, -1]
  X = X.reshape(X.shape[0], 1, X.shape[1])
  model = Sequential()
  model.add(LSTM(neurons, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True))
  model.add(Dense(1))
  model.compile(loss='mean_squared_error', optimizer='adam')
  for i in range(nb_epoch):
      model.fit(X, y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
      model.reset_states()
  return model

# make a one-step forecast
def forecast_lstm(model, batch_size, X):
  X = X.reshape(1, 1, len(X))
  yhat = model.predict(X, batch_size=batch_size)
  return yhat[0,0]
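
# Round-trip sanity check for the helpers above (a sketch on toy data, not part
# of the original pipeline): differencing and min-max scaling should each be
# exactly undone by their inverse helpers.
_demo = np.array([1.0, 3.0, 2.0, 5.0])
_d = difference(_demo, 1)                                    # Series([2.0, -1.0, 3.0])
assert inverse_difference(_demo[:1], _d[0], 1) == _demo[1]   # 2.0 + 1.0 == 3.0
_train2d = _demo.reshape(-1, 1)
_scaler, _scaled, _ = scale(_train2d, _train2d)
assert abs(invert_scale(_scaler, [], _scaled[0, 0]) - _demo[0]) < 1e-9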




# load dataset (value columns 低频 and 高频, indexed by the parsed 指标名称 dates);
# the original squeeze=True was dropped: it has been removed from pandas and had
# no effect here, since the file has two value columns
series = pd.read_csv('cq4.csv', header=0, parse_dates=[0], index_col=0)



series

series.sort_index(ascending=True, inplace=True)  # ensure chronological order


# In[79]:



# the low-frequency component is the modelling target
raw_values = series['低频']


# In[80]:


series.info()


# In[86]:


raw_values


# In[87]:


diff_values = difference(raw_values.astype('float'), 1)  # for inspection; the supervised frame below is built from the raw values



# In[88]:


diff_values


# In[107]:


# frame the series as a lag-1 supervised learning problem
supervised = timeseries_to_supervised(raw_values, 1)

supervised_values = supervised.values

# split data into train and test sets: hold out the last 21 rows
# (one seed row plus 20 forecast targets)
train, test = supervised_values[0:-21], supervised_values[-21:]



# In[108]:


raw_values


# In[109]:


supervised


# In[110]:


supervised_values[-20:]


# In[111]:


supervised_values


# In[112]:


train


# In[113]:


test


# In[114]:


# transform the scale of the data to [-1, 1]
scaler, train_scaled, test_scaled = scale(train, test)

# fit the model: batch size 1, 30 epochs, 4 LSTM units
lstm_model = fit_lstm(train_scaled, 1, 30, 4)




# In[26]:



# forecast the entire training dataset to build up state for forecasting
train_reshaped = train_scaled[:, 0].reshape(len(train_scaled), 1, 1)
lstm_model.predict(train_reshaped, batch_size=1)


# In[27]:


len(test_scaled)


# In[28]:


# test_scaled


# In[29]:





# In[117]:


len(test_scaled)


# In[120]:


# walk-forward validation on the test data

predictions = list()
for i in range(len(test_scaled) - 1):  # 21 held-out rows -> 20 one-step forecasts
  # make one-step forecast
  X, y = test_scaled[i, 0:-1], test_scaled[i, -1]
  yhat = forecast_lstm(lstm_model, 1, X)
  # invert scaling
  yhat = invert_scale(scaler, X, yhat)
  # note: the supervised frame was built from the raw series rather than
  # diff_values, yet a halved inverse-differencing step is still applied here;
  # this looks like a leftover from a differenced variant and is kept as-is
  yhat = inverse_difference(raw_values, yhat, len(test_scaled)+1-i)/2
  # store forecast
  predictions.append(yhat)
#   expected = raw_values[len(train) + i + 1]
#   print('Month=%d, Predicted=%f, Expected=%f' % (i+1, yhat, expected))


# In[ ]:


# report performance
rmse = sqrt(mean_squared_error(raw_values[-20:], predictions))
print('Test RMSE: %.3f' % rmse)
# line plot of observed vs predicted
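
# A minimal sketch of that observed-vs-predicted line plot (labels are assumptions):
plt.plot(raw_values[-20:].values, label='observed')
plt.plot(predictions, label='predicted')
plt.legend()
plt.show()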



df_LST = predictions  # keep the LSTM forecasts for the model comparison







# In[30]:


# put the three 20-step forecasts next to the actual low-frequency values
df_merge = pd.DataFrame(
    np.stack([np.array(df_SMA), df_SVM, np.array(df_LST), np.array(df2[-20:])]).T,
    index=df_SMA.index,
    columns=["SARIMAX预测", "SVM预测", "LSTM预测", "真实值"])


# In[31]:


df_merge


# In[32]:


c1 = ((df_merge['SARIMAX预测'] - df_merge['真实值']) / df_merge['真实值']).mean()  # mean relative error


# In[33]:


c2 = ((df_merge['SVM预测'] - df_merge['真实值']) / df_merge['真实值']).mean()


# In[34]:


c3 = ((df_merge['LSTM预测'] - df_merge['真实值']) / df_merge['真实值']).mean()


# In[35]:


[c1, c2, c3]  # mean relative error of each model
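
# Complementary error metrics (a sketch: RMSE and mean absolute percentage
# error per model, using the comparison table built above):
for col in ['SARIMAX预测', 'SVM预测', 'LSTM预测']:
    rmse_col = sqrt(mean_squared_error(df_merge['真实值'], df_merge[col]))
    mape_col = (abs(df_merge[col] - df_merge['真实值']) / df_merge['真实值']).mean()
    print(col, 'RMSE=%.4f' % rmse_col, 'MAPE=%.4f' % mape_col)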


# In[36]:


# df_merge.to_csv("低频预测.csv")


# In[ ]:




