忽然看到一篇关于用机器学习预测股票的文章,尝试复现了一下代码并补充了一点注释,代码可以直接运行。
原文链接:没忍住,还是用机器学习预测了一下股票 - 知乎 (zhihu.com)
事先准备:python安装numpy,pandas,scikit-learn(即sklearn),torch(即pytorch),tensorflow,matplotlib,tushare库
python代码如下:
import tushare as ts
import numpy as np
import pandas as pd
import sklearn
import torch
import tensorflow.keras as keras
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
import tensorflow as tf
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
# Stock to model; replace with whichever stock code you want to study.
ts_code = '000005.SZ'
# First and last date of the data to request. The tushare `daily` endpoint
# documents its date parameters as YYYYMMDD strings, so that form is used
# here instead of ISO dashes.
start_date = '20060101'
end_date = '20230101'
# Put your personal tushare token between the quotes: register at
# https://tushare.pro/user/token and copy the token from your profile page.
# The account needs a certain number of credit points to query this data.
pro = ts.pro_api(' ')
# Fetch the data for the chosen stock and date range.
df = pro.daily(
    ts_code=ts_code,
    start_date=start_date,
    end_date=end_date)
def transform_dataset(train_set, test_set, y_train, y_test, n_input, n_output):
    """Cut feature/target tables into the 3D sliding-window samples an LSTM needs.

    The train and test rows are stacked into one sequence. Sample ``i`` uses
    rows ``i .. i + n_input - 1`` of the features as input and rows
    ``i + n_input .. i + n_input + n_output - 1`` of the first target column
    as output.

    Args:
        train_set: 2D array ``(rows, features)`` of training features.
        test_set: 2D array ``(rows, features)`` of test features.
        y_train: 2D array of training targets; only column 0 is used.
        y_test: 2D array of test targets; only column 0 is used.
        n_input: number of consecutive rows in each input window.
        n_output: number of consecutive target rows predicted per sample.

    Returns:
        ``(train_X, train_y, test_X, test_y)``. The X arrays have shape
        ``(samples, n_input, features)`` and the y arrays
        ``(samples, n_output)``, all float64. A sample goes to the training
        split when its first target row lies inside ``train_set``; every
        remaining sample goes to the test split.
    """
    all_data = np.vstack((train_set, test_set))
    y_set = np.vstack((y_train, y_test))[:, 0]

    # Keep the established sample count (rows - n_input - n_output) so that
    # downstream alignment is unchanged, but clamp at zero: when the data is
    # too short for a single window the result must be empty rather than
    # contain an uninitialised row.
    n_samples = max(all_data.shape[0] - n_input - n_output, 0)

    # Build every window with one fancy-indexing call instead of growing the
    # arrays with np.append inside a loop (which copies everything each time).
    starts = np.arange(n_samples)[:, None]
    X = all_data[starts + np.arange(n_input)].astype(float)
    y = y_set[starts + n_input + np.arange(n_output)].astype(float)

    # First sample whose target falls in the test rows. Clamp at zero so a
    # training set shorter than n_input does not turn into a negative index
    # that would wrap around and mislabel samples.
    split = max(train_set.shape[0] - n_input, 0)
    train_X, test_X = X[:split], X[split:]
    train_y, test_y = y[:split], y[split:]
    return train_X, train_y, test_X, test_y
# Feature columns fed to the model. 'close' is kept first because later code
# reads column 0 as the closing price.
use_cols = ['close', 'open', 'high', 'pre_close', 'vol', 'amount']

# Split the data by date with truncate(). truncate() selects on the index
# labels and needs a sorted index, while trade_date is only a column here, so
# order the rows oldest-first and install the dates as the index before
# truncating. The column itself is kept.
df['trade_date'] = pd.to_datetime(df['trade_date'])
df = df.sort_values('trade_date')
df.index = pd.DatetimeIndex(df['trade_date'].values)
# NOTE(review): per the pandas docs truncate() reads '2017-01' as
# 2017-01-01 00:00, so rows dated within January 2017 appear to fall in
# neither split - confirm whether that gap is intended.
atrain = df.truncate(after='2017-01')
btest = df.truncate(before='2017-02')

train_raw = atrain[use_cols]
test_raw = btest[use_cols]
y_train_raw = train_raw[['close']]
y_test_raw = test_raw[['close']]

# Scale the data to [0, 1]. Both scalers are fitted on the training rows only
# and then applied to the test rows; y_sc handles the closing price alone so
# that predictions can be mapped back to prices later.
sc = MinMaxScaler(feature_range=(0, 1))
y_sc = MinMaxScaler(feature_range=(0, 1))
training_scaled = sc.fit_transform(train_raw.values)
test_scaled = sc.transform(test_raw.values)
y_train_scaled = y_sc.fit_transform(y_train_raw.values)
y_test_scaled = y_sc.transform(y_test_raw.values)

n_input = 63  # number of past rows used for each prediction
n_output = 1  # number of future days to predict
train_X, train_y, test_X, test_y = transform_dataset(
    training_scaled, test_scaled, y_train_scaled, y_test_scaled,
    n_input, n_output)
# A simple network: an LSTM input layer with 10 units followed by a plain
# Dense output layer, trained with mean_absolute_error as the loss.
n_timesteps, n_features, n_outputs = (
    train_X.shape[1], train_X.shape[2], train_y.shape[1])

# Build the container from the same tensorflow.keras namespace as the layers
# instead of mixing in the standalone keras package.
model = keras.Sequential()
model.add(keras.layers.LSTM(
    10,
    input_shape=(n_timesteps, n_features),
    kernel_initializer='glorot_uniform',
    kernel_regularizer=keras.regularizers.l2(0.0)))
model.add(keras.layers.Dense(
    n_outputs,
    kernel_initializer='glorot_uniform',
    kernel_regularizer=keras.regularizers.l2(0.0)))
model.compile(optimizer='adam', loss='mean_absolute_error')
# summary() prints the model overview itself; wrapping it in print() would
# add a stray "None" line.
model.summary()
def dropout(x, level):
    """Return a copy of ``x`` with dropout applied; use it if the model overfits.

    Each element is kept with probability ``1 - level`` and zeroed otherwise.
    The kept elements are divided by ``1 - level``.

    Args:
        x: array or array-like of numbers. It is not modified.
        level: drop probability, which must satisfy ``0 <= level < 1``.

    Returns:
        A new floating-point array with the same shape as ``x``.

    Raises:
        ValueError: if ``level`` is outside ``[0, 1)``.
    """
    if level < 0. or level >= 1:  # level is a probability in [0, 1)
        raise ValueError('Dropout level must be in interval [0, 1[.')
    retain_prob = 1. - level

    # Work on a floating-point copy: the caller's array stays untouched, and
    # integer or list input no longer breaks the in-place division below.
    x = np.asarray(x)
    if np.issubdtype(x.dtype, np.floating):
        x = x.copy()
    else:
        x = x.astype(float)

    # One coin flip per element (n=1) that comes up 1 with probability
    # retain_prob; a 0 means that element is switched off.
    random_tensor = np.random.binomial(n=1, p=retain_prob, size=x.shape)
    x *= random_tensor
    x /= retain_prob
    return x
# Train the model with fit(). Because the series has a clear trend, the
# training samples are shuffled - but only after the validation samples have
# been set aside. Neighbouring windows overlap almost entirely, so shuffling
# first and then letting validation_split take the last 30% would validate on
# near-copies of training samples. Hold out the last 30% of the windows as
# built, then shuffle what remains.
val_start = int(len(train_X) * (1 - 0.3))
val_X, val_y = train_X[val_start:], train_y[val_start:]
train_X, train_y = shuffle(
    train_X[:val_start], train_y[:val_start], random_state=42)
# fit the RNN model
history = model.fit(
    train_X,
    train_y,
    epochs=300,
    batch_size=512,
    validation_data=(val_X, val_y))
# Plot training and validation loss per epoch. plt.figure() opens the pyplot
# figure that the plot calls below draw on; plt.Figure() would only construct
# a detached Figure object that nothing uses.
figure = plt.figure()
plt.plot(history.history['loss'], 'b', label='Training loss')
plt.plot(history.history['val_loss'], 'r', label='Validation loss')
plt.legend(loc='upper right')
plt.xlabel('Epochs')
plt.show()
from sklearn.metrics import mean_absolute_error

# Finalizing predictions
scaled_preds = model.predict(test_X)
# y_sc was fitted on a single column, so hand it one column and restore the
# (samples, steps) shape afterwards.
test_preds = y_sc.inverse_transform(
    scaled_preds.reshape(-1, 1)).reshape(scaled_preds.shape)
test_preds_df = pd.DataFrame(
    test_preds,
    columns=[f'pred_{i+1}_step' for i in range(test_preds.shape[1])])
# Take the ground truth from test_y, the targets that belong to test_X.
# transform_dataset does not emit the final window, so slicing the tail of
# test_raw would pair every prediction with the following day's price.
test_preds_df['true_value'] = y_sc.inverse_transform(test_y[:, :1])[:, 0]
# Naive baseline: the last close seen in each input window. Feature column 0
# is 'close' and is scaled exactly like y_sc's column (both are fitted on the
# training closes), so y_sc can map it back to a price.
test_preds_df['naive_pred'] = y_sc.inverse_transform(
    test_X[:, -1, :1])[:, 0]
test_preds_df[['pred_1_step', 'true_value']].plot()
plt.show()  # without this the comparison plot never appears in a script run
# Arguments follow scikit-learn's (y_true, y_pred) order.
err = mean_absolute_error(
    test_preds_df['true_value'].values,
    test_preds_df['pred_1_step'].values)
print(f'abs error for testset is {err}')
#预测值几乎相当于真实值随时间的偏移,基本没啥参考价值。over
没啥价值,不过可以试着玩玩。over