【金融】【pytorch】使用深度学习预测期货收盘价涨跌——数据处理

【金融】【pytorch】使用深度学习预测期货收盘价涨跌——数据处理

读取数据

其中pandas_techinal_indicators参考jmartinezheras/reproduce-stock-market-direction-random-forests

import torch
from torch.autograd import Variable
import torch.nn as nn
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import random

# Technical-indicator helpers, from
# jmartinezheras/reproduce-stock-market-direction-random-forests
import pandas_techinal_indicators as ta
# Fix both RNGs so the shuffled batches / splits below are reproducible.
np.random.seed(42)
random.seed(42)
# Jupyter display magics (no effect when run as a plain script):
# %matplotlib inline
# %config InlineBackend.figure_format = 'svg'

# Placeholder path -- replace with the real location of the CSV.
df1 = pd.read_csv(r'xxxxxxx/hengSheng_0404.csv')
# df1 = pd.read_csv(r'D:/MINE_FILE/dataSet/market_index_data/hengSheng_0404.csv')
df1.head()

在这里插入图片描述

将数据按3年+3月分段

每次取3年为训练集,后三个月为测试集。下一组训练集和测试集后推三个月。每组训练集中80%为训练集,20%为验证集。参考:M'Ng J., Mehralizadeh M. Forecasting East Asian Indices Futures via a Novel Hybrid of Wavelet-PCA Denoising and Artificial Neural Network Models[J]. PLOS ONE, 2016, 11.

# Segment the data into rolling 3-year training / 3-month test windows.
# train_ptr[i] : row index where the i-th 3-year training window starts
# test_ptr[i]  : row index where the i-th 3-month test window starts
# end_ptr[i]   : row index where the i-th window ends
train_ptr = []
test_ptr = []
end_ptr = []

# Month counters relative to 2006-01 (see `num` below): 2 -> 2006-02,
# 3*12+2 -> 2009-02, 3*12+5 -> 2009-05.  (The original comment said
# 2000.2/2003.2/2003.5, which contradicted the 2006 base year in the code.)
# Each counter advances by 3 months once its matching row is found.
date_flag = [2, 3*12+2, 3*12+5]

# Iterate the two columns directly; this avoids the per-row df1.iloc[i]
# lookups of the original loop, which are O(1) frame indexing each time.
for i, (year, month) in enumerate(zip(df1['year'], df1['month'])):
    num = (year - 2006) * 12 + month
    if num == date_flag[0]:
        train_ptr.append(i)
        date_flag[0] += 3
    if num == date_flag[1]:
        test_ptr.append(i)
        date_flag[1] += 3
    if num == date_flag[2]:
        end_ptr.append(i)
        date_flag[2] += 3

print(len(end_ptr))
print(train_ptr)

取数据并进行初步处理

# Keep only the OHLCV columns used for feature engineering.
aapl = df1[['Open', 'High', 'Low', 'Close', 'Volume']]
aapl.head()

def get_exp_preprocessing(df, alpha=0.9):
    """Return an exponentially smoothed copy of *df*.

    Every column is replaced by its exponentially weighted moving
    average with smoothing factor *alpha* (pandas ``ewm`` defaults
    otherwise, i.e. ``adjust=True``).
    """
    return df.ewm(alpha=alpha).mean()

# Smooth the raw OHLCV series before computing the indicators below.
saapl = get_exp_preprocessing(aapl)
saapl.head() #saapl stands for smoothed aapl

处理出各种期货金融指标

def feature_extraction(data):
    """Append technical-indicator columns to *data* for several window sizes.

    Each helper in ``pandas_techinal_indicators`` returns the frame with
    extra columns added; the frame is threaded through every call.
    NOTE(review): column names/semantics come from that module and are
    not visible here -- verify against its source.
    """
    # Indicator window lengths (in rows/days).
    for x in [5, 14, 26, 44, 66]:
#     for x in [14]:
        data = ta.relative_strength_index(data, n=x)
        data = ta.stochastic_oscillator_d(data, n=x)
        data = ta.accumulation_distribution(data, n=x)
        data = ta.average_true_range(data, n=x)
        data = ta.momentum(data, n=x)
        data = ta.money_flow_index(data, n=x)
        data = ta.rate_of_change(data, n=x)
        data = ta.on_balance_volume(data, n=x)
        data = ta.commodity_channel_index(data, n=x)
        data = ta.ease_of_movement(data, n=x)
        data = ta.trix(data, n=x)
        data = ta.vortex_indicator(data, n=x)
    
    # Close relative to its exponentially weighted mean.
    # NOTE(review): ewm(50) passes 50 positionally as `com` (center of
    # mass), not `span`; if a 50-day EMA was intended this should likely
    # be ewm(span=50) -- confirm before relying on these features.
    data['ema50'] = data['Close'] / data['Close'].ewm(50).mean()
    data['ema21'] = data['Close'] / data['Close'].ewm(21).mean()
    data['ema14'] = data['Close'] / data['Close'].ewm(14).mean()
    data['ema5'] = data['Close'] / data['Close'].ewm(5).mean()
    
    #Williams %R is missing
    data = ta.macd(data, n_fast=12, n_slow=26)
    
#     del(data['Open'])
#     del(data['High'])
#     del(data['Low'])
#     del(data['Volume'])
    
    return data
   
def compute_prediction_int(df, n):
    """Binary label: 1 if Close *n* rows ahead is >= the current Close.

    The last *n* rows have no look-ahead value and are dropped, so the
    result is *n* rows shorter than *df*.
    """
    future_close = df.shift(-n)['Close']
    label = (future_close >= df['Close']).iloc[:-n]
    return label.astype(int)

def prepare_data(df, horizon):
    """Build the indicator frame plus a 'pred' label column.

    Indicator warm-up rows (NaN) and the trailing *horizon* rows, which
    have no future close to label against, are discarded.
    """
    frame = feature_extraction(df).dropna()
    frame = frame.iloc[:-horizon]
    frame['pred'] = compute_prediction_int(frame, n=horizon)
    # ('Close' is intentionally kept as an input feature here.)
    return frame.dropna()
# Label: whether the close price is up 10 days later.
data = prepare_data(saapl, 10)

y = data['pred']

#remove the output from the input
# NOTE(review): no 'gain' column is created in this file -- presumably a
# leftover from another version; harmless here.
features = [x for x in data.columns if x not in ['gain', 'pred']]
X = data[features]

print(list(X.columns))

在这里插入图片描述

制作数据集

# Min-max scale every feature column to [0, 1].
miData = X.values
scalarX = np.max(miData, axis=0) - np.min(miData, axis=0)
# NOTE(review): a constant column makes scalarX == 0 and this division
# produces NaN/inf -- confirm no feature column is constant.
miData = (miData - np.min(miData, axis=0)) / scalarX

yData = y.values
print(yData)
# Build shuffled mini-batches of sliding windows for training.
def create_dataset(dataset, label, look_back, batch_size=50):
    """Cut the series into shuffled mini-batches of sliding windows.

    Parameters
    ----------
    dataset : np.ndarray, shape (T, n_features)
        Scaled feature matrix.
    label : np.ndarray, shape (T,)
        Binary target per row.
    look_back : int
        Window length (time steps per sample).
    batch_size : int, default 50
        Samples per batch (the original hard-coded value).

    Returns
    -------
    (list of FloatTensor, list of LongTensor)
        x batches shaped (look_back, batch_size, n_features) and
        y batches shaped (look_back, batch_size, 1); y holds int64
        class indices for the loss.
    """
    data_x = []
    data_y = []

    # Shuffle the window start positions; starts whose window would run
    # past the end of the series are excluded up front.
    ind = list(range(len(dataset) - look_back))
    random.shuffle(ind)

    n_features = dataset.shape[1]
    for i in range(len(ind) // batch_size):
        # Collect the windows of one batch, then concatenate once.
        # The original duplicated the first-sample construction and grew
        # the tensors with repeated torch.cat inside the loop, which is
        # O(batch_size^2) in copied data.
        xs = []
        ys = []
        for j in range(batch_size):
            ptr = ind[i * batch_size + j]
            x = torch.from_numpy(dataset[ptr:ptr + look_back, :].astype(np.float32))
            y = torch.from_numpy(label[ptr:ptr + look_back].astype(np.float32))
            xs.append(torch.reshape(x, (look_back, 1, n_features)))
            ys.append(torch.reshape(y, (look_back, 1, 1)))
        x_item = torch.cat(xs, dim=1)
        y_item = torch.cat(ys, dim=1).long()

        data_x.append(x_item)
        data_y.append(y_item)
    return data_x, data_y

def create_Test_dataset(dataset, label, look_back):
    """Build one big sequential batch for evaluation (no shuffling).

    Every sliding window of length *look_back* becomes one column of a
    single batch, so each returned list holds exactly one tensor:
    x shaped (look_back, T - look_back, n_features) and
    y shaped (look_back, T - look_back, 1) with int64 class indices.
    """
    xs = []
    ys = []
    for start in range(len(dataset) - look_back):
        x = torch.from_numpy(dataset[start:start + look_back, :].astype(np.float32))
        y = torch.from_numpy(label[start:start + look_back].astype(np.float32))
        xs.append(torch.reshape(x, (look_back, 1, dataset.shape[1])))
        ys.append(torch.reshape(y, (look_back, 1, 1)))

    # Concatenate once instead of growing a tensor inside the loop (the
    # original repeated torch.cat was O(n^2) in copied data).
    if xs:
        x_item = torch.cat(xs, dim=1)
        y_item = torch.cat(ys, dim=1).long()
    else:
        # Series shorter than look_back: keep the original's
        # empty-tensor result.
        x_item = torch.tensor([])
        y_item = torch.tensor([]).long()

    return [x_item], [y_item]

def trainSet_split(dataX, dataY):
    """Randomly split the paired batch lists 80% train / 20% validation.

    The same shuffled index order is applied to both lists, so x/y
    pairs stay aligned.
    """
    n_train = int(len(dataX) * 0.8)

    order = list(range(len(dataX)))
    random.shuffle(order)
    train_idx = order[:n_train]
    val_idx = order[n_train:]

    trainLoaderX = [dataX[k] for k in train_idx]
    trainLoaderY = [dataY[k] for k in train_idx]
    validateLoaderX = [dataX[k] for k in val_idx]
    validateLoaderY = [dataY[k] for k in val_idx]
    return trainLoaderX, trainLoaderY, validateLoaderX, validateLoaderY
  • 1
    点赞
  • 16
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
PyTorch是一个用于深度学习的开源框架,它提供了一组工具和接口,使得我们可以轻松地进行模型训练、预测和部署。在PyTorch中,数据处理深度学习应用的重要部分之一。 PyTorch中的数据处理主要涉及以下几个方面: 1.数据预处理:包括数据清洗、数据归一化、数据增强等操作,以提高模型的鲁棒性和泛化能力。 2.数据加载:PyTorch提供了多种数据加载方式,包括内置的数据集、自定义的数据集数据加载器等,以便我们更好地管理和使用数据。 3.数据可视化:为了更好地理解数据和模型,PyTorch提供了多种数据可视化工具,如Matplotlib、TensorBoard等。 下面是一个简单的数据预处理示例,展示如何将图像进行归一化和数据增强: ```python import torch import torchvision.transforms as transforms from torchvision.datasets import CIFAR10 # 定义一个数据预处理管道 transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) ]) # 加载CIFAR10数据集,进行预处理 trainset = CIFAR10(root='./data', train=True, download=True, transform=transform_train) trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2) ``` 在上面的例子中,我们首先定义了一个数据预处理管道,其中包括了对图像进行随机裁剪、水平翻转、归一化等操作。然后,我们使用PyTorch内置的CIFAR10数据集,并将其预处理后,使用DataLoader进行批量加载。这个过程可以帮助我们更好地管理和使用数据,同时提高模型的训练效率和泛化能力。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值