CNTK API文档翻译(8)——使用Pandas和金融数据进行时序数据基本分析

最新推荐文章于 2024-02-21 14:34:56 发布

阔活洵信

最新推荐文章于 2024-02-21 14:34:56 发布

阅读量1.2k

点赞数

分类专栏： CNTK文档翻译文章标签： CNTK 时序数据机器学习

CNTK文档翻译专栏收录该内容

25 篇文章 0 订阅

订阅专栏

本期将带来使用CNTK处理时间序列数据的教程。本教程中会展示怎样为深度学习算法准备时间数据、训练神经网络和评估神经网络。具体来说，我们会探究对交易型开放式指数基金（Exchange-Traded Funds，ETF）的涨跌进行分类预测是否靠谱，进而通过这种简单的分类来决定是买是卖。本教程仅仅是CNTK分析时序数据的例子，不保证训练的结果可以用于基金买卖的决策，股票市场过于复杂，非常难以预测，目前为止做的最好的依然是该领域的专家。

本教程介绍了pandas数据读取器和pandas库的使用方法，之前用过的numpy数据结构在Pandas数据帧中也会表现良好。

from __future__ import print_function
import datetime
import numpy as np
import os
import pandas as pd

# Turn off pandas' chained-assignment check (the option's default is 'warn');
# later cells add columns to a date-sliced frame.
pd.options.mode.chained_assignment = None

import cntk as C

在下面的代码中，我们通过检查在CNTK内部定义的环境变量来选择正确的设备（GPU或者CPU）来运行代码，如果不检查的话，会使用CNTK的默认策略来使用最好的设备（如果GPU可用的话就使用GPU，否则使用CPU）

# When this notebook is being tested, TEST_DEVICE chooses the target device:
# 'cpu' selects the CPU, any other value selects GPU 0. When the variable is
# absent nothing is set here and CNTK keeps its own default device policy.
if os.environ.get('TEST_DEVICE') is not None:
    C.device.try_set_default_device(
        C.device.cpu() if os.environ['TEST_DEVICE'] == 'cpu' else C.device.gpu(0)
    )

导入股票数据

首先我们使用get_stock_data函数获取股票数据。这个函数的功能是从谷歌金融获取以天为单位的股票数据（也可以换成雅虎金融或者其他数据源）。下面的代码中展示了pandas数据读取器的多个用例。

# A method which obtains stock data from Google Finance
# Requires that you have an internet connection to retrieve stock data from Google Finance

import time
try:
    from  pandas_datareader import data
except ImportError:
    !pip install pandas_datareader
    from  pandas_datareader import data 

# Seed NumPy's global random generator with a fixed value so that values drawn
# from np.random (e.g. the retry delay in get_stock_data) repeat between runs.
np.random.seed(123)

def get_stock_data(contract, s_year, s_month, s_day, e_year, e_month, e_day):
    """
    Download daily bars for one contract, retrying on failure.

    The request is sent to the "google" source of pandas_datareader and is
    attempted up to three times. After every failed attempt the function
    sleeps for a random 1-9 seconds before trying again.

    Args:
        contract (str): the name of the stock/etf
        s_year (int): start year for data
        s_month (int): start month
        s_day (int): start day
        e_year (int): end year
        e_month (int): end month
        e_day (int): end day
    Returns:
        Pandas Dataframe: Daily OHLCV bars
    Raises:
        Exception: if none of the attempts succeeded.
    """
    start = datetime.datetime(s_year, s_month, s_day)
    end = datetime.datetime(e_year, e_month, e_day)

    max_num_retry = 3
    last_error = None

    for _ in range(max_num_retry):
        try:
            # NOTE: `data` must still be the pandas_datareader module here;
            # later cells rebind the same global name to a DataFrame.
            return data.DataReader(contract, "google", start, end)
        except Exception as err:
            # Catch Exception instead of using a bare except so that Ctrl-C
            # and SystemExit stop the retry loop immediately.
            last_error = err
            time.sleep(np.random.randint(1,10))

    print("Google Finance is not reachable")
    # Surface the underlying cause, which used to be discarded silently.
    print("Last error:", repr(last_error))
    raise Exception('Google Finance is not reachable')


import pickle as  pkl

# The SPY data set is looked up in a cache first.
# Name of an environment variable defined in CNTK's test infrastructure.
envvar = 'CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'


def is_test():
    """Return True when the CNTK test-data environment variable is set."""
    return os.environ.get(envvar) is not None

def download(data_file):
    """
    Download SPY daily bars and cache them as a pickle file.

    The bars are fetched with get_stock_data for 2000-01-02 through
    2017-01-01. The directory of `data_file` is created when missing, and the
    bars are pickled to `data_file` unless a file already exists there.

    Args:
        data_file (str): path of the pickle cache file
    Returns:
        Pandas Dataframe: the downloaded bars
    Raises:
        Exception: if the data could not be downloaded.
    """
    try:
        bars = get_stock_data("SPY", 2000, 1,2,2017,1,1)
    except Exception:
        # Not a bare except: Ctrl-C / SystemExit must not be converted
        # into a "download failed" error.
        raise Exception("Data could not be downloaded")

    target_dir = os.path.dirname(data_file)

    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    if target_dir and not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if not os.path.isfile(data_file):
        print("Saving", data_file )
        with open(data_file, 'wb') as f:
            # Protocol 2 is the newest pickle protocol Python 2 can read.
            pkl.dump(bars, f, protocol = 2)
    return bars

data_file = os.path.join("data", "Stock", "stock_SPY.pkl")

# Lookup order: local cache file, then (only under CNTK's test
# infrastructure) the test data directory, and finally a fresh download
# that also populates the local cache.
if os.path.exists(data_file):
    print("File already exists", data_file)
    data = pd.read_pickle(data_file)
elif is_test():
    test_file = os.path.join(os.environ[envvar], 'Tutorials','data','stock','stock_SPY.pkl')
    if not os.path.isfile(test_file):
        print("Test data directory missing file", test_file)
        print("Downloading data from Google Finance")
        data = download(data_file)
    else:
        print("Reading data from test data directory")
        data = pd.read_pickle(test_file)
else:
    # Neither a local cache nor a test environment: download and cache.
    # Please check if there is trade data for the chosen stock symbol during this period
    data = download(data_file)

创建训练参数

股市的涨跌很大程度上表现出自相关性。我们使用ETF的SPY指数代表股市，这个指数涵盖了美国上市公司五百强。我们的交易决策会根据这种通过股市自相关预测出来的短期结果来做。

预测

股市接下来一天的数据比当天的数据高还是低。

预测器

通过前八天的数据，接下来一天是否会比当天好。
数据波动百分比
前一天数据波动的百分比

注意，我们不会直接把股价作为神经网络的输入值。金融时序数据充满噪音，因此防止过度拟合至关重要。我们在进行训练之前可以做很多事情，比如数据平滑、添加更多的要素等，不过为了使教程简单，我们就不做处理，仅仅表现一下CNTK处理时间序列数据的能力就可以了。

# Names of the columns that are fed to the network as features
predictor_names = []

# Absolute day-over-day change of the closing price ("diff") and of the
# volume ("v_diff"), each divided by the current day's value. The first row
# has no previous day, so its NaN is replaced by 0.
for source_column, feature_name in (("Close", "diff"), ("Volume", "v_diff")):
    series = data[source_column]
    data[feature_name] = np.abs((series - series.shift(1)) / series).fillna(0)
    predictor_names.append(feature_name)

# Up (1) / down (0) flags: is the current close above the close `offset`
# rows earlier, for every look-back offset from 1 to num_days_back
num_days_back = 8

for offset in range(1, num_days_back + 1):
    flag_column = "p_" + str(offset)
    data[flag_column] = np.where(data["Close"] > data["Close"].shift(offset), 1, 0)
    predictor_names.append(flag_column)

# If you want to save the file to your local drive
#data.to_csv("PATH_TO_SAVE.csv")
data.head(10)

我们要预测什么

我们要预测下一天股市数据是涨是跌，如果涨，我们用1表示，否则用0（我们忽略了不大可能出现的情况：不变）。

# Label: 1 when the next row's close is above the current close, else 0.
# The last row has no successor; its comparison is False and it is labelled 0.
data["next_day"] = np.where(data["Close"].shift(-1) > data["Close"], 1, 0)
data["next_day_opposite"] = np.where(data["next_day"]==1,0,1) # The label must be one-hot encoded

# Establish the start and end date of our training timeseries (picked 2000 days before the market crash)
training_data = data["2001-02-05":"2009-01-20"]

# Date-string slices include both end points, so the test period starts on
# the day AFTER the last training day; starting it on 2009-01-20 would put
# that row into both sets. An explicit copy is taken because later cells add
# columns to test_data.
test_data = data["2009-01-21":"2016-12-29"].copy()
training_features = np.asarray(training_data[predictor_names], dtype = "float32")
training_labels = np.asarray(training_data[["next_day","next_day_opposite"]], dtype="float32")

现在我们来创建神经网络，这里将使用简单的前馈神经网络。注意，我们会使用CNTK的Layer库，不会自建网络层。

# Lets build the network
# Two output classes: the one-hot encoded "market goes up" / "market goes down"
num_output_classes = 2
num_hidden_layers = 2

# One input per feature: the two change features plus one up/down flag per
# look-back day. The hidden layers are as wide as the input.
input_dim = 2 + num_days_back
hidden_layers_dim = input_dim

# Features and labels both vary along the default batch axis only
input_dynamic_axes = [C.Axis.default_batch_axis()]
input = C.input_variable(input_dim, dynamic_axes=input_dynamic_axes)
label = C.input_variable(num_output_classes, dynamic_axes=input_dynamic_axes)

def create_model(input, num_output_classes):
    """
    Stack the feed-forward classifier on top of `input`.

    Builds `num_hidden_layers` dense ReLU layers of width `hidden_layers_dim`
    (both module-level settings), followed by a dense layer without activation
    that has `num_output_classes` outputs. All layers use Glorot-uniform
    initialisation.

    Args:
        input: the variable or function the first layer is applied to
        num_output_classes (int): number of outputs of the final layer
    Returns:
        The output of the final dense layer.
    """
    with C.layers.default_options(init = C.glorot_uniform()):
        hidden = input
        for _ in range(num_hidden_layers):
            hidden_layer = C.layers.Dense(hidden_layers_dim, activation = C.relu)
            hidden = hidden_layer(hidden)
        output_layer = C.layers.Dense(num_output_classes, activation=None)
        return output_layer(hidden)

# Model output, training criterion and evaluation metric
z = create_model(input, num_output_classes)
loss = C.cross_entropy_with_softmax(z, label)
label_error = C.classification_error(z, label)

# Plain SGD with a constant learning rate of 0.125 per minibatch
lr_per_minibatch = C.learning_rate_schedule(0.125, C.UnitType.minibatch)
learner = C.sgd(z.parameters, lr=lr_per_minibatch)
trainer = C.Trainer(z, (loss, label_error), [learner])


# Training uses large minibatches that are fed in sequential order
minibatch_size = 100
num_minibatches = len(training_data) // minibatch_size

# Report the training progress after every minibatch
training_progress_output_freq = 1

# Per-minibatch history, collected for the plots
plotdata = dict(batchsize=[], loss=[], error=[])

我们如何训练时序数据：数据使用的次数

与我们之前的数据不同，我们这次不随机往训练器里面送数据了，这次我们的每个取样包都是按时间序列组织的。在处理时间数据时，我们需要让更新的数据权重稍大一些。你也可以把这些数据使用几次，不过这会导致过度拟合，进而导致训练结果不理想。当然也可以使用另外的一些方法来避免过度拟合，比如L1正则化（详情请看我的Python与人工神经网络系列第六期）。

# Cut the features into num_minibatches equal, time-ordered pieces
tf = np.split(training_features, num_minibatches)

for caption, value in (
    ("Number of mini batches", len(tf)),
    ("The shape of the training feature minibatch", tf[0].shape),
):
    print(caption)
    print(value)

# The labels are cut the same way so that they line up with the features
tl = np.split(training_labels, num_minibatches)

# It is key that we make only one pass through the data linearly in time
num_passes = 1

# Utility that reports the trainer's most recent minibatch statistics
def print_training_progress(trainer, mb, frequency, verbose=1):
    """
    Read (and optionally print) the latest minibatch loss and error.

    Args:
        trainer: object exposing previous_minibatch_loss_average and
            previous_minibatch_evaluation_average
        mb (int): minibatch number
        frequency (int): statistics are read only when mb is a multiple of it
        verbose: when truthy, the statistics are also printed
    Returns:
        tuple: (mb, loss, error); loss and error are the string "NA" for
        minibatches that are not a multiple of `frequency`.
    """
    if mb % frequency != 0:
        return mb, "NA", "NA"

    training_loss = trainer.previous_minibatch_loss_average
    eval_error = trainer.previous_minibatch_evaluation_average
    if verbose:
        print ("Minibatch: {0}, Loss: {1:.4f}, Error: {2:.2f}%".format(mb, training_loss, eval_error*100))
    return mb, training_loss, eval_error

# Train our neural network
tf = np.split(training_features, num_minibatches)
tl = np.split(training_labels, num_minibatches)

# One iteration per minibatch, repeated for every pass over the data
for i in range(num_minibatches * num_passes):
    batch_index = i % num_minibatches
    features = np.ascontiguousarray(tf[batch_index])
    labels = np.ascontiguousarray(tl[batch_index])

    # Map the model's input variables to the minibatch data to train with
    trainer.train_minibatch({input : features, label : labels})
    batchsize, loss, error = print_training_progress(trainer, i, training_progress_output_freq, verbose=1)

    # Record only the minibatches for which statistics were actually read
    if loss != "NA" and error != "NA":
        plotdata["batchsize"].append(batchsize)
        plotdata["loss"].append(loss)
        plotdata["error"].append(error)

训练结果

Minibatch: 0, Loss: 0.7874, Error: 54.00%
Minibatch: 1, Loss: 0.7570, Error: 51.00%
Minibatch: 2, Loss: 0.7579, Error: 61.00%
Minibatch: 3, Loss: 0.6916, Error: 47.00%
Minibatch: 4, Loss: 0.7127, Error: 54.00%
Minibatch: 5, Loss: 0.7286, Error: 59.00%
Minibatch: 6, Loss: 0.7056, Error: 50.00%
Minibatch: 7, Loss: 0.6975, Error: 48.00%
Minibatch: 8, Loss: 0.7059, Error: 56.00%
Minibatch: 9, Loss: 0.7037, Error: 54.00%
Minibatch: 10, Loss: 0.7567, Error: 60.00%
Minibatch: 11, Loss: 0.8480, Error: 52.00%
Minibatch: 12, Loss: 0.6917, Error: 45.00%
Minibatch: 13, Loss: 0.7526, Error: 58.00%
Minibatch: 14, Loss: 0.6823, Error: 47.00%
Minibatch: 15, Loss: 0.8856, Error: 40.00%
Minibatch: 16, Loss: 0.8299, Error: 48.00%
Minibatch: 17, Loss: 1.1737, Error: 51.00%
Minibatch: 18, Loss: 0.7951, Error: 53.00%
Minibatch: 19, Loss: 0.7809, Error: 48.00%

可视化

import matplotlib.pyplot as plt

plt.figure(1)

# Training loss goes into the upper subplot, label prediction error into the
# lower one; both are drawn against the minibatch number.
for position, series_key, line_style, y_label, title in (
    (211, "loss", 'b--', 'Loss', 'Minibatch run vs. Training loss '),
    (212, "error", 'r--', 'Label Prediction Error', 'Minibatch run vs. Label Prediction Error '),
):
    plt.subplot(position)
    plt.plot(plotdata["batchsize"], plotdata[series_key], line_style)
    plt.xlabel('Minibatch number')
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()

ERROR值在百分之五十左右。注意这些数据都是根据时间变化的，因此系统会随着时间推移产生一些噪声。与此同时，模型还在继续从市场数据中学习。在有很多噪音的情况下，达到低于50%的ERROR值是一个不错的结果。对于投资公司来说，他们有很高的交易频率，因此可以在较低正确率的情况下获利。试图通过分类来决定每天的交易从交易成本的角度看是比较昂贵的，那么我们是不是可以训练出一个模型，来判定是否会盈利呢？

让我们试试看：

# With the network trained, run an out-of-sample test on the held-out period
# and report the average classification error.

label_columns = ["next_day","next_day_opposite"]
test_features = np.ascontiguousarray(test_data[predictor_names], dtype = "float32")
test_labels = np.ascontiguousarray(test_data[label_columns], dtype="float32")

test_feed = {input : test_features, label : test_labels}
avg_error = trainer.test_minibatch(test_feed)
print("Average error: {0:2.2f}%".format(avg_error * 100))

这里打印的结果应该也在50%左右，似乎跟瞎猜差不多，让我们再做一点点工作，看看有没有点预测的效果：

# Turn the network output into class probabilities. Column 0 pairs with the
# "next_day" (up) label and column 1 with "next_day_opposite" (down), which
# is the order the labels were fed in.
out = C.softmax(z)
predicted_label_prob = out.eval({input: test_features})
test_data["p_up"] = pd.Series(predicted_label_prob[:, 0], index=test_data.index)
test_data["p_down"] = predicted_label_prob[:, 1]

# Go long (+1) / short (-1) only when the matching probability is above 0.55;
# the position is the sum of both entry signals.
test_data["long_entries"] = np.where(test_data["p_up"] > 0.55, 1, 0)
test_data["short_entries"] = np.where(test_data["p_down"] > 0.55, -1, 0)
long_side = test_data["long_entries"].fillna(0)
short_side = test_data["short_entries"].fillna(0)
test_data["positions"] = long_side + short_side

评估我们的数据

通过上面的代码得到了测试数据的输出值，通过softmax函数归一化成了概率，但仅仅是概率还不够，换句话说，我们需要找到一个明显的标志，告诉我们可以买，而不是仅仅得出股市明天会涨会跌：数据有太多噪音，而且频繁交易需要支付较多的交易费。

在代码中我们设定的概率是55%。如果预测出来下一天高于当天的概率是55%，我们就买，如果预测出来低于当天的概率是55%，我们就卖。

我们还需要使用其他参数来评估时序数据：月平均回报率、月回报率标准差、夏普比率、最大资金回撤。夏普比率是平均回报率减去无风险收益率再除以回报率标准差，上述数据均以年为计量单位。

一般来说，夏普比率越高，表示获得相同回报需要承担的风险越小，这个参数建立在平均回报率和回报率标准差足以描述回报分布的基础之上。

def create_drawdowns(equity_curve):
    """
    Calculate the largest peak-to-trough drawdown of an equity curve
    as well as the longest time spent in a drawdown.

    The high water mark starts at 0 (the level before the first period) and
    is raised by every new peak of the curve. A period's drawdown is the
    distance between the high water mark and the curve; its duration counts
    the consecutive periods, up to and including that one, with a non-zero
    drawdown.

    Parameters:
    equity_curve - A pandas Series (or any 1-d sequence of numbers) holding
                   the cumulative returns per period.

    Returns:
    drawdown, duration - Highest peak-to-trough drawdown and longest drawdown
                         duration in periods, both as floats. (0.0, 0.0) for
                         an empty curve.
    """
    # Work on a plain float array: integer indexing of a date-indexed Series
    # relies on positional fallback, which newer pandas versions deprecate.
    values = np.asarray(equity_curve, dtype="float64")
    if values.size == 0:
        return 0.0, 0.0

    # Both arrays start fully initialised; leaving element 0 unset let NaN
    # leak into every duration up to the first recovery.
    drawdown = np.zeros(values.size)
    duration = np.zeros(values.size)

    high_water_mark = 0.0
    # Start at the first period so that it can set the high water mark and
    # can itself count as a drawdown.
    for t, equity in enumerate(values):
        high_water_mark = max(high_water_mark, equity)
        drawdown[t] = high_water_mark - equity
        if drawdown[t] == 0:
            duration[t] = 0
        else:
            duration[t] = (duration[t - 1] if t > 0 else 0) + 1
    return float(drawdown.max()), float(duration.max())


# Overlay the histograms of the predicted "up" and "down" probabilities
plt.figure()
for probability_column in ("p_up", "p_down"):
    test_data[probability_column].hist(bins=20, alpha=0.4)
plt.title("Distribution of Probabilities")
plt.legend(["p_up", "p_down"])
plt.ylabel("Frequency")
plt.xlabel("Probablity")
plt.show()

# Daily strategy return: the next day's close-to-close move relative to
# today's close, signed by the position held. A zero close is replaced by 1
# in the denominator to avoid dividing by zero.
test_data["pnl"] = test_data["Close"].diff().shift(-1).fillna(0)*test_data["positions"]/np.where(test_data["Close"]!=0,test_data["Close"],1)
# Daily return of simply holding SPY
test_data["perc"] = (test_data["Close"] - test_data["Close"].shift(1)) / test_data["Close"].shift(1)

# Monthly figures are sums of the daily returns
monthly = test_data.pnl.resample("M").sum()
monthly_spy = test_data["perc"].resample("M").sum()
avg_return = np.mean(monthly)
std_return = np.std(monthly)
# Annualised from monthly figures, with no risk-free rate subtracted
sharpe = np.sqrt(12) * avg_return / std_return
drawdown = create_drawdowns(monthly.cumsum())
spy_drawdown = create_drawdowns(monthly_spy.cumsum())
print("TRADING STATS")
print("AVG Monthly Return :: " + "{0:.2f}".format(round(avg_return*100,2))+ "%")
print("STD Monthly        :: " + "{0:.2f}".format(round(std_return*100,2))+ "%")
print("SHARPE             :: " + "{0:.2f}".format(round(sharpe,2)))
print("MAX DRAWDOWN       :: " + "{0:.2f}".format(round(drawdown[0]*100,2)) + "%, " + str(drawdown[1]) + " months" )
# NOTE(review): "diff" is the absolute relative price change, not SPY's
# signed return; confirm this is the intended series for the correlation.
print("Correlation to SPY :: " + "{0:.2f}".format(round(np.corrcoef(test_data["pnl"], test_data["diff"])[0][1],2)))
print("NUMBER OF TRADES   :: " + str(np.sum(test_data.positions.abs())))
# Count the days of the traded (test) period, like every other statistic
# here, instead of the days of the whole data set.
print("TOTAL TRADING DAYS :: " + str(len(test_data)))
print("SPY MONTHLY RETURN :: " + "{0:.2f}".format(round(monthly_spy.mean()*100,2)) + "%")
print("SPY STD RETURN     :: " + "{0:.2f}".format(round(monthly_spy.std()*100,2)) + "%")
print("SPY SHARPE         :: " + "{0:.2f}".format(round(monthly_spy.mean()/monthly_spy.std()*np.sqrt(12),2)))
print("SPY DRAWDOWN       :: " + "{0:.2f}".format(round(spy_drawdown[0]*100,2)) + "%, "  + str(spy_drawdown[1]) + " months" )

print(drawdown[0])
# Cumulative return in percent: the strategy ("NN") against holding SPY
(monthly.cumsum()*100).plot()
(monthly_spy.cumsum()*100).plot()
plt.legend(["NN", "SPY"],loc=2)
plt.ylabel("% Return")
plt.title("TRADING SPY OUT OF SAMPLE")
plt.show()

输出结果：

TRADING STATS
AVG Monthly Return :: -0.45%
STD Monthly        :: 3.17%
SHARPE             :: -0.49
MAX DRAWDOWN       :: 48.20%, nan months
Correlation to SPY :: -0.01
NUMBER OF TRADES   :: 1175
TOTAL TRADING DAYS :: 4000
SPY MONTHLY RETURN :: 1.19%
SPY STD RETURN     :: 3.92%
SPY SHARPE         :: 1.05
SPY DRAWDOWN       :: 17.25%, 11.0 months
0.482027152898