使用最大重叠离散小波变换（MODWT）和支持向量回归（SVR）的金融时间序列预测

哥廷根数学学派

已于 2023-08-13 13:55:47 修改

阅读量2.7k

点赞数 3

分类专栏：小波分析　文章标签：回归、金融、数据挖掘

于 2022-08-12 10:05:15 首次发布

本文链接：https://blog.csdn.net/weixin_39402231/article/details/126298253

版权

小波分析专栏收录该内容

74 篇文章 40 订阅

订阅专栏

第一部分，原始时间序列SVR + 滑动窗方法

首先读取数据

# Load the daily AUD/JPY quotes from a ';'-delimited CSV; the 'Date' column is parsed into datetime values.
prices = pd.read_csv('../Data/AUD-JPY-2003-2014-day.csv',delimiter=";", header=0, encoding='utf-8', parse_dates=['Date'])
# Bare expression: echoes the DataFrame when evaluated interactively (e.g. as the last line of a notebook cell).
prices

删除不使用的列

# The Open/High/Low columns are not used by the rest of the walkthrough, so remove them in place.
prices.drop(columns=["Open", "High", "Low"], inplace=True)

定义变量

# Work on independent copies of the two columns rather than on the `prices` frame itself.
dates = prices['Date'].copy()
closing_prices = prices['Close'].copy()


# Plot the original time series with matplotlib
plt.subplots(figsize=(16,4))
plt.plot(dates, closing_prices, label='Original series AUD-JPY 2003-2014')
plt.legend(loc = 'best')
plt.show()

SVR + 滑动窗，定义滑动窗函数

def slideWindow(series, window_lenght = 2):
    """Turn a 1-D series into supervised (input, target) pairs.

    Every run of ``window_lenght + 1`` consecutive values becomes one sample:
    the first ``window_lenght`` values form the input and the final value is
    the target.

    Returns a tuple ``(inputs, targets)`` of two equally long lists: the
    input windows (array slices) and the matching target values.
    """
    # One row per window position; each row holds the inputs followed by the target.
    windows = sliding_window_view(series, window_lenght+1)
    inputs = [row[:-1] for row in windows]
    targets = [row[-1] for row in windows]
    return inputs, targets
# Number of past observations used as the input of each sample.
window_lenght = 2
# Call the sliding-window function
X, Y = slideWindow(closing_prices,window_lenght)
# Position in the original series of the first test target: the first 75% of
# the samples are used for training, and the target of sample k is the value
# at position k + window_lenght.
idx_test_date = int(0.75*len(Y)) + window_lenght
# Dates from that position onward, kept for plotting the test targets.
df = pd.DataFrame(columns = ['test_date']) 
df['test_date'] = prices['Date'].iloc[idx_test_date:]

拆分并绘制测试数据，将数据拆分为训练集（75%）和测试集（25%），shuffle = False 表示并非随机打乱数据

# Split with shuffle=False so the sample order is preserved; test_size=0.25 reserves a quarter of the samples for testing.
x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.25, random_state=None, shuffle=False)

# Top panel: the full series with the test targets overlaid; bottom panel: the test targets on their own.
fig, ax = plt.subplots(2,1,figsize=(16,8))
ax[0].plot(dates, closing_prices, label='Original')
ax[0].plot(df['test_date'], y_test, label='Values to test the model out',color='orange')
ax[1].plot(df['test_date'], y_test, label='Values to test the model out',color='orange')

ax[0].legend(loc = 'best')
ax[1].legend(loc = 'best')
plt.show()

定义训练函数并拟合

def evaluateSVR(_x_train,_y_train,_x_test,_y_test, kernel = 'rbf'):
    """Fit a support-vector regressor on the training data and predict the test inputs.

    ``kernel`` selects the SVR configuration: ``'rbf'`` (C=1e3, gamma=0.1) or
    ``'poly'`` (C=1e3, degree=2); any other value selects a linear kernel
    (C=1e3). ``_y_test`` is accepted but not used.

    Returns the predictions for ``_x_test``.
    """
    if kernel == 'rbf':
        settings = {'kernel': 'rbf', 'C': 1e3, 'gamma': 0.1}
    elif kernel == 'poly':
        settings = {'kernel': 'poly', 'C': 1e3, 'degree': 2}
    else:
        # Every other kernel name falls back to the linear model.
        settings = {'kernel': 'linear', 'C': 1e3}

    regressor = svm.SVR(**settings)
    regressor.fit(_x_train, _y_train)
    return regressor.predict(_x_test)

# Predict the test targets with the default (RBF) SVR configuration.
y_predict = evaluateSVR(x_train,y_train,x_test,y_test)

# Keep a copy of the test targets: `y_test` is reassigned in a later section, while this copy is reused for the final comparison.
plotValuesWt = y_test.copy()

# Plot the predicted values against the real ones
plt.subplots(figsize=(18, 6))
plt.plot(df['test_date'], y_test, label = "Real")
plt.plot(df['test_date'], y_predict, label = "Predicted")
plt.legend(loc = 'best')
plt.show()

第二部分，使用 MODWT 将时间序列分解

使用“sym4”小波，modwt分解为4层（4 个细节系数 (dC) 和 1 个近似系数 (aC)）

def applyModwt(_data, type='sym4', _level=3):
    """Decompose ``_data`` with ``modwt`` and return the resulting coefficients.

    ``type`` (the wavelet name) and ``_level`` (the decomposition depth) are
    forwarded to ``modwt`` unchanged.
    """
    return modwt(_data, type, _level)

# Decomposition depth used for the rest of the walkthrough.
level = 4
coeff = applyModwt(closing_prices,type='sym4',_level=level)


# Inspect the coefficients: an array with len(closing_prices) columns and 5 rows
print(np.shape(coeff))

# One panel per coefficient row; the last row is labelled cA (approximation), the others cD (detail).
fig, ax =  plt.subplots(len(coeff), 1, figsize=(16, 8))
for i in range(len(coeff)):
    label_format = 'cA[%.0f]' if i == len(coeff)-1 else 'cD[%.0f]'
    ax[i].plot(coeff[i], label = label_format%(i))
    ax[i].legend(loc = 'best')

重建原始时间序列

# Keep a copy of every approximation and detail coefficient for the SVR section
aCdC = coeff.copy()

# Zero-filled array with the same number of rows and columns as the coefficients
recwt = np.zeros(np.shape(coeff)[:2])

# Fill only the rows from index `level` onward with coeff[level]; every other row stays zero
recwt[level:] = coeff[level]

# Reconstruct the series from the approximation coefficients only
dFs = imodwt(recwt,'sym4')

# Reconstruct the series again, this time from all coefficients
rFs = imodwt(coeff,'sym4')

# Compare the reconstructions with the original
fig, ax = plt.subplots(4,1,figsize=(16,8))
# Panel 0: original series
ax[0].plot(dates, closing_prices, label='Original')
# Panel 1: reconstruction from all aC and dC coefficients
ax[1].plot(dates, rFs, label='Re-constructed (using all coeff)', color = 'green')
# Panel 2: reconstruction from the aC coefficients only
ax[2].plot(dates, dFs, label='Re-constructed (using just aC)', color = 'orange')
# Panel 3: original signal against the denoised one
ax[3].plot(dates, closing_prices, label='Original')
ax[3].plot(dates, dFs, label='Re-constructed (using just aC)', color = 'orange')

for panel in ax:
    panel.legend(loc = 'best')

plt.show()

第三部分，使用 SVR 估计小波系数

new_coeff = []
# For every coefficient row: build X and Y with the sliding window, then predict its test part
for i in range(len(aCdC)):
    X, Y = slideWindow(aCdC[i], window_lenght=5)
    # Unshuffled split: the test targets are therefore the final values of the row
    x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.25, random_state=None, shuffle=False)
    # Evaluate this coefficient row with the SVR function
    y_predict = evaluateSVR(x_train,y_train,x_test,y_test)
    # The predictions estimate the last len(y_predict) values of the row, so
    # splice them in at exactly that offset. This keeps every row at its
    # original length and each prediction at the position of the value it
    # estimates (a fixed 75% cut-off ignores the window length and shifts them).
    index = len(aCdC[i]) - len(y_predict)
    new_coeff.append(np.concatenate((aCdC[i][:index], y_predict)))
    # Plot the predicted values of this coefficient row
    plt.subplots(figsize=(18, 6))
    plt.plot(y_test, label = "Real")
    plt.plot(y_predict, label = "Predicted")
    plt.legend(loc = 'best')
    plt.show()

用预测值绘制新的时间序列

# Reconstruct the series from the coefficient rows that contain the SVR predictions
rpFs = imodwt(new_coeff,'sym4')

# Take exactly as many trailing values as there are test targets, so both
# sequences have the same length and end on the same sample.
index = len(rpFs) - len(plotValuesWt)

fig, ax = plt.subplots(3,1,figsize=(16,8))
ax[0].plot(df['test_date'], plotValuesWt, label='Original')
# Reconstruction from all predicted dC and aC coefficients
ax[1].plot(rpFs[index:] ,label='Re-constructed (using all coeff)', color = 'green')

ax[2].plot(df['test_date'], plotValuesWt, label='Original')
ax[2].plot(df['test_date'], rpFs[index:] ,label='Re-constructed (using all coeff)', color = 'green')

ax[0].legend(loc = 'best')
ax[1].legend(loc = 'best')
ax[2].legend(loc = 'best')

# The reported figure is the root-mean-squared error, so label it as such.
# Taking np.sqrt of the plain MSE avoids the `squared` keyword, which is
# deprecated in recent scikit-learn releases.
print('RMSE',np.sqrt(mean_squared_error(plotValuesWt, rpFs[index:])))

第四部分，构建预测模型（使用所有系数进行预测）

def evaluateModel(svr, X, Y, prediction_days, past_days):
    """Forecast ``prediction_days`` steps ahead by feeding predictions back in.

    Starting from the last input window ``X[-1]`` and the last known target
    ``Y[-1]``, each step drops the oldest value of the window, appends the
    most recent value (known or predicted) and asks ``svr`` for the next one.

    Parameters:
        svr: fitted model exposing ``predict`` on a ``(1, window)`` array.
        X: sequence of input windows; only the last one is used.
        Y: sequence of targets; only the last one is used.
        prediction_days: number of values to forecast.
        past_days: kept for interface compatibility. The window length is
            taken from ``X[-1]`` itself, which also makes a one-value window
            work.

    Returns:
        1-D array of length ``prediction_days + 1``: ``Y[-1]`` followed by
        the forecasts, so callers skip the first element to get only the
        new values.
    """
    window = np.asarray(X[-1])
    values = [np.asarray(Y)[-1]]
    for _ in range(prediction_days):
        # Slide by one: `window[1:]` stays correct for a window of length 1,
        # where a negative-offset slice would select the whole window instead.
        window = np.concatenate((window[1:], [values[-1]]))
        values.append(svr.predict(window.reshape(1, -1))[0])
    return np.array(values)

def predictValue(past_days = 7, prediction_days = 5, file_Path = 'Data/AUD-JPY-2003-2014-day.csv', dateColName = 'Date', 
                 closingPColName = 'Close', delimiter = ';'):
    """Load a price file, decompose the closing prices and forecast every coefficient row.

    The file is read with ``getDatesAndPrices``, the closing prices are
    decomposed with ``getCoeffFromSeries`` and the coefficients are handed to
    ``trainModel`` together with ``prediction_days`` and ``past_days``.

    Returns ``(predictedCoeff, dates, closing_prices)``.
    """
    # NOTE(review): this default path has no '../' prefix, unlike the other
    # data paths in this walkthrough — confirm which location is intended.
    series_dates, series_close = getDatesAndPrices(file_Path, dateColName, closingPColName, delimiter)
    # Wavelet coefficients of the closing prices, extended with SVR estimates.
    extended_coeff = trainModel(getCoeffFromSeries(series_close), prediction_days, past_days)
    return extended_coeff, series_dates, series_close
    
def getDatesAndPrices(filePath, dateColName, closingPColName, _delimiter):
    """Read a price CSV and return ``(dates, closing_prices)``.

    The file is read as UTF-8 with its first line as the header and
    ``_delimiter`` as the separator; ``dateColName`` is parsed into datetime
    values. Both returned Series are copies of the corresponding columns.
    """
    table = pd.read_csv(filePath,delimiter=_delimiter, header=0, encoding='utf-8', parse_dates=[dateColName])
    return table[dateColName].copy(), table[closingPColName].copy()

def getCoeffFromSeries(closing_prices):
    """Return the level-4 'sym4' coefficients of ``closing_prices`` as produced by ``applyModwt``."""
    return applyModwt(closing_prices, type='sym4', _level=4)

def trainModel(coeff, prediction_days, past_days):
    """Extend every coefficient row with ``prediction_days`` SVR forecasts.

    For each row of ``coeff`` an RBF-kernel SVR is fitted on sliding windows
    of ``past_days`` values and rolled forward with ``evaluateModel``. The
    shapes of the input and of the result are printed.

    Returns a list with one extended row per input row.
    """
    print('coeff shape: ',np.shape(coeff))
    extended_rows = []
    for row_idx in range(len(coeff)):
        row = coeff[row_idx]
        X, Y = slideWindow(row, past_days)
        svr = svm.SVR(kernel ='rbf', C=1e3, gamma=0.1).fit(X, Y)
        forecast = evaluateModel(svr, X, Y, prediction_days, past_days)
        # `forecast` starts with the last known target, so drop the row's
        # final value before joining to avoid duplicating it.
        extended_rows.append(np.concatenate((row[:-1], forecast)))
    print('NEW coeff shape: ',np.shape(extended_rows))
    return extended_rows




# Forecast horizon; every other argument of predictValue keeps its default (past_days = 7).
daysToPredict = 7
predictedCoeff, dates, closing_prices = predictValue(prediction_days = daysToPredict)

接下来准备绘图进行对比

def plotValues(dates, original, predicted, prediction_days):
    """Draw three stacked panels comparing the original series with the predicted one.

    Top: ``original`` against ``dates``. Middle: ``predicted`` against its
    positional index. Bottom: both series together, with ``predicted``
    plotted against ``dates`` extended by ``addDayToDates``.
    """
    fig, ax = plt.subplots(3,1,figsize=(16,8))
    top, middle, bottom = ax

    top.plot(dates, original, label='Original')
    # Series rebuilt from the dC and aC coefficients
    middle.plot(predicted, label='Re-constructed (using all coeff)', color = 'green')

    # Date axis extended to cover the forecast values
    newDates = addDayToDates(dates, prediction_days)
    bottom.plot(dates, original, label='Original')
    bottom.plot(newDates, predicted, label='Re-constructed (using all coeff)', color = 'green')

    for panel in ax:
        panel.legend(loc = 'best')

def addDayToDates(dates, prediction_days):
    """Return a copy of ``dates`` extended by ``prediction_days`` calendar days.

    ``dates`` is expected to be a pandas Series of dates carrying the default
    0..n-1 integer index; it is not modified. The appended entries follow the
    last date one calendar day at a time and continue the index contiguously
    at n, n+1, ...

    Returns the extended Series; an unchanged copy when ``prediction_days``
    is zero or negative.
    """
    _dates = copy.deepcopy(dates)
    if prediction_days <= 0:
        return _dates

    lastDate = pd.to_datetime(_dates.iloc[-1])
    # Capture the length once: recomputing it while the series grows would
    # skip index labels for the appended entries.
    base_len = len(_dates)
    future = pd.Series(
        [lastDate + pd.DateOffset(days=step) for step in range(1, prediction_days + 1)],
        index=range(base_len, base_len + prediction_days),
        name=_dates.name,
    )
    # A single concat instead of enlarging the Series one element at a time.
    return pd.concat([_dates, future])
# Invert the transform on the extended coefficient rows, then plot the result against the original prices
rpFs = imodwt(predictedCoeff,'sym4')
plotValues(dates, closing_prices ,rpFs, daysToPredict)

然后，仅使用近似系数（外加最后一层细节系数）进行预测

def readData(past_days = 7, prediction_days = 5, file_Path = '../Data/AUD-JPY-2003-2014-day.csv', 
                        dateColName = 'Date', closingPColName = 'Close', delimiter = ';'):
    """Read the price file and return ``(dates, closing_prices)``.

    Thin wrapper around ``getDatesAndPrices``; ``past_days`` and
    ``prediction_days`` are accepted but not used.
    """
    return getDatesAndPrices(file_Path, dateColName, closingPColName, delimiter)


def getApproxCoeffFromSeries(closing_prices):
    """Return the level-4 'sym4' coefficients of ``closing_prices`` from ``applyModwt``.

    Despite the name, the full result of ``applyModwt`` is returned
    unchanged; the caller picks the rows it needs.
    """
    return applyModwt(closing_prices, type='sym4', _level=4)

def trainModelApprox(X, Y, past_days):
    """Fit an RBF-kernel SVR (C=1e3, gamma=0.1) on ``X``/``Y`` and return it.

    ``past_days`` is accepted but not used.
    """
    return svm.SVR(kernel ='rbf', C=1e3, gamma=0.1).fit(X, Y)

# Forecast horizon (number of values to predict)
daysToPredict = 7
# Number of past values in each input window
past_days = 7
# Wavelet decomposition depth
level = 4
dates, closing_prices = readData(past_days = past_days, prediction_days = daysToPredict)

然后

# Decompose the closing prices; only the approximation row and the last detail row are used below
approxCoeff = getApproxCoeffFromSeries(closing_prices)

# Zero-filled array with the same number of rows and columns as the coefficients
recwt = np.zeros((np.shape(approxCoeff)[0], np.shape(approxCoeff)[1]))

# Store the two rows that are kept. The order matters: the first assignment
# writes approxCoeff[-2] into every row from level-1 onward, and the second
# then overwrites the rows from `level` onward with approxCoeff[-1].
recwt[(level-1):] = approxCoeff[-2]
recwt[level:] = approxCoeff[-1]


# Reconstruct the series from those rows only (all other rows are zero), which amounts to denoising the financial time series
dFs = imodwt(recwt,'sym4')

使用近似系数训练模型

# Supervised samples from the denoised series: past_days inputs per target
X, Y = slideWindow(dFs, past_days)

# The third parameter of trainModelApprox is `past_days`, so pass the window
# length rather than the forecast horizon.
svr = trainModelApprox(X, Y, past_days)

执行预测

# Roll the fitted model forward daysToPredict steps
predictedValues = evaluateModel(svr, X, Y, prediction_days=daysToPredict, past_days=past_days) 
# evaluateModel returns the last known target first, so skip it when appending the forecasts
rpFs = np.concatenate((dFs, predictedValues[1:]))
# Plot
plotValues(dates, closing_prices ,rpFs, daysToPredict)

基于最大重叠离散小波变换（MODWT）和支持向量回归（SVR）的金融时间序列预测的步骤大致如此。前面基于滑动窗+SVR的金融序列预测还比较好理解，到小波这边可能就难以理解了，实际上还是各种倒腾小波系数：在每一层的小波系数上分别进行预测，最后再综合。小波分析还是有很大的灵活性的，不管是只使用近似系数进行预测，还是挑选近似系数+几个细节系数进行预测，并没有一个明确的指导方案，还是要靠自己多试几次。

关于最大重叠离散小波变换，找了几篇金融相关的文章，可以参考一下：

[1]王健.中美股市联动性——基于极大重叠离散小波变换的研究[J].世界经济文汇,2014(02):72-89.

[2]隋新,何建敏,李亮.时变视角下基于MODWT的沪深300指数现货与期货市场间波动溢出效应[J].系统工程,2015,33(01):31-38.

[3]徐梅. 金融波动分析的小波和频域方法研究[D].天津大学,2004.