【金融】【随机森林】使用随机森林对期货数据(涨跌)进行回归
参考《jmartinezheras/reproduce-stock-market-direction-random-forests》《基于随机森林做回归任务(数据预处理、MAPE指标评估、可视化展示、特征重要性、预测和实际值差异显示图)》
RF-RF_train3year3month
读取数据
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (7,4.5) # Make the default figures a bit bigger
import numpy as np
import random
#Let's make this notebook reproducible
np.random.seed(42)
random.seed(42)
import pandas_techinal_indicators as ta #https://github.com/Crypto-toolbox/pandas-technical-indicators/blob/master/technical_indicators.py
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, confusion_matrix, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
df_org = pd.read_csv(r'F:/data/market_index_data/hengSheng_0404.csv')
df_org.head()
划分训练集与数据集,3年+3月,以此类推
为什么要这么划分?详见:
M’Ng J , Mehralizadeh M . Forecasting East Asian Indices Futures via a Novel Hybrid of Wavelet-PCA Denoising and Artificial Neural Network Models[J]. PLOS ONE, 2016, 11.
train_ptr = []
test_ptr = []
end_ptr = []
date_flag = [2, 3*12+2, 3*12+5] # 分别代表2000.2,2003.2,2003.5
# date_flag = [12, 3*12+12, 3*12+12+3] # 分别代表2000.2,2003.2,2003.5
for i in range(0, len(df_org)):
num = (df_org.iloc[i]['year'] - 2006) * 12 + df_org.iloc[i]['month']
if num == date_flag[0]:
train_ptr.append(i)
date_flag[0] += 3
if num == date_flag[1]:
test_ptr.append(i)
date_flag[1] += 3
if num == date_flag[2]:
end_ptr.append(i)
date_flag[2] += 3
print(len(end_ptr))
取特定数据
aapl = df_org[['Open', 'High', 'Low', 'Close', 'Volume']]
aapl.head()
Exponential smoothing
def get_exp_preprocessing(df, alpha=0.9):
edata = df.ewm(alpha=alpha).mean()
return edata
saapl = get_exp_preprocessing(aapl)
saapl.head() #saapl stands for smoothed aapl
Feature Extraction - Technical Indicators
def feature_extraction(data):
for x in [5, 14, 26, 44, 66]:
# for x in [14]:
data = ta.relative_strength_index(data, n=x)
data = ta.stochastic_oscillator_d(data, n=x)
data = ta.accumulation_distribution(data, n=x)
data = ta.average_true_range(data, n=x)
data = ta.momentum(data, n=x)
data = ta.money_flow_index(data, n=x)
data = ta.rate_of_change(data, n=x)
data = ta.on_balance_volume(data, n=x)
data = ta.commodity_channel_index(data, n=x)
data = ta.ease_of_movement(data, n=x)
data = ta.trix(data, n=x)
data = ta.vortex_indicator(data, n=x)
data['ema50'] = data['Close'] / data['Close'].ewm(50).mean()
data['ema21'] = data['Close'] / data['Close'].ewm(21).mean()
data['ema14'] = data['Close'] / data['Close'].ewm(14).mean()
data['ema5'] = data['Close'] / data['Close'].ewm(5).mean()
#Williams %R is missing
data = ta.macd(data, n_fast=12, n_slow=26)
del(data['Open'])
del(data['High'])
del(data['Low'])
del(data['Volume'])
return data
def compute_prediction_int(df, n):
pred = (df.shift(-n)['Close'] >= df['Close'])
pred = pred.iloc[:-n]
return pred.astype(int)
def prepare_data(df, horizon):
data = feature_extraction(df).dropna().iloc[:-horizon]
data['pred'] = compute_prediction_int(data, n=horizon)
del(data['Close'])
return data.dropna()
Prepare the data with a prediction horizon of 10 days
# 10天后收盘价是否上涨
data = prepare_data(saapl, 10)
y = data['pred']
#remove the output from the input
features = [x for x in data.columns if x not in ['gain', 'pred']]
X = data[features]
print(list(data.columns))
分数据集依次训练
rf = RandomForestClassifier(n_jobs=-1, n_estimators=200, random_state=42)
accuracy_his = []
recall_his = []
f1_his = []
precision_his = []
all_p = np.array([])
all_prob = np.array([])
for k in range(len(end_ptr)):
if end_ptr[k] >= len(X):
break
# 使用从头到当前时刻的所有数据
X_train = X[train_ptr[0]:test_ptr[k]]
y_train = y[train_ptr[0]:test_ptr[k]]
X_test = X[test_ptr[k]:end_ptr[k]]
y_test = y[test_ptr[k]:end_ptr[k]]
print('\nDataSet No.{} data row {}-{}-{}'.format(k, train_ptr[k], test_ptr[k], end_ptr[k]))
rf.fit(X_train, y_train.values.ravel());
pred = rf.predict(X_test)
prob = rf.predict_proba(X_test)
if k == 0:
all_prob = prob
else :
all_prob = np.concatenate((all_prob, prob), axis=0)
all_p = np.concatenate((all_p,pred))
precision = precision_score(y_pred=pred, y_true=y_test)
recall = recall_score(y_pred=pred, y_true=y_test)
f1 = f1_score(y_pred=pred, y_true=y_test)
accuracy = accuracy_score(y_pred=pred, y_true=y_test)
print('precision: {0:1.2f}, recall: {1:1.2f}, f1: {2:1.2f}, accuracy: {3:1.2f}'.format(precision, recall, f1, accuracy))
accuracy_his.append(accuracy)
recall_his.append(recall)
f1_his.append(f1)
precision_his.append(precision)
confusion = confusion_matrix(y_pred=pred, y_true=y_test)
print('Confusion Matrix')
print(confusion)
查看指标
print(np.mean(accuracy_his))
print(np.mean(recall_his))
print(np.mean(precision_his))
Acc = accuracy_score(all_p, y[test_ptr[0]:end_ptr[42]])
print('------------- DataSet:, Accuracy:{:.5f} -------------'.format(Acc))
Pc = precision_score(all_p, y[test_ptr[0]:end_ptr[42]])
print('------------- DataSet:, Precision:{:.5f} -------------'.format(Pc))
Recall = recall_score(all_p, y[test_ptr[0]:end_ptr[42]])
print('------------- DataSet:, Recall:{:.5f} -------------'.format(Recall))
f1 = f1_score(all_p, y[test_ptr[0]:end_ptr[42]])
print('------------- DataSet:, Specificity:{:.5f} -------------'.format(f1))
year_accuracy_his = []
year_precision_his = []
year_recall_his = []
year_f1_his = []
for i in range(int(len(accuracy_his) / 4)):
left = test_ptr[i*4] - test_ptr[0]
right = end_ptr[i*4+3] - test_ptr[0]
item_y = y[left:right]
item_p = all_p[left:right]
Acc = accuracy_score(item_p, item_y)
year_accuracy_his.append(Acc)
print('------------- DataSet:{}, Accuracy:{:.5f} -------------'.format(i, Acc))
Pc = precision_score(item_p, item_y)
year_precision_his.append(Pc)
print('------------- DataSet:{}, Precision:{:.5f} -------------'.format(i, Pc))
Recall = recall_score(item_p, item_y)
year_recall_his.append(Recall)
print('------------- DataSet:{}, Recall:{:.5f} -------------'.format(i, Recall))
f1 = f1_score(item_p, item_y)
year_f1_his.append(f1)
print('------------- DataSet:{}, Specificity:{:.5f} -------------'.format(i, f1))
if len(accuracy_his)%4 != 0:
left = test_ptr[len(accuracy_his)-(len(accuracy_his)%4)-1] - test_ptr[0]
right = end_ptr[len(accuracy_his)-1] - test_ptr[0]
item_y = y[left:right]
item_p = all_p[left:right]
Acc = accuracy_score(item_p, item_y)
year_accuracy_his.append(Acc)
print('------------- DataSet:final, Accuracy:{:.5f} -------------'.format(Acc))
Pc = precision_score(item_p, item_y)
year_precision_his.append(Pc)
print('------------- DataSet:fianl, Precision:{:.5f} -------------'.format(Pc))
Recall = recall_score(item_p, item_y)
year_recall_his.append(Recall)
print('------------- DataSet:fianl, Recall:{:.5f} -------------'.format(Recall))
f1 = f1_score(item_p, item_y)
year_f1_his.append(f1)
print('------------- DataSet:fianl, Specificity:{:.5f} -------------'.format(f1))
plt.figure(figsize=(20,7))
plt.plot(np.arange(len(year_accuracy_his)), year_accuracy_his, label='year_accuracy')
plt.plot(np.arange(len(year_recall_his)), year_recall_his, label='year_recall')
plt.plot(np.arange(len(year_f1_his)), year_f1_his, label='year_f1_his')
plt.plot(np.arange(len(year_precision_his)), year_precision_his, label='year_precision')
# plt.grid(True, ls=':', c='r')
plt.axhline(y=0.5, c='r', ls='--', lw=2)
plt.legend();
plt.show()
plt.figure(figsize=(20,7))
plt.plot(np.arange(len(accuracy_his)), accuracy_his, label='accuracy')
plt.plot(np.arange(len(recall_his)), recall_his, label='recall')
plt.plot(np.arange(len(f1_his)), f1_his, label='f1')
plt.plot(np.arange(len(precision_his)), precision_his, label='precision')
plt.grid(True, ls=':', c='r')
plt.axhline(y=0.5, c='r', ls='--', lw=2)
plt.legend();
plt.show()