Stock Trading with RL
Defining the trading environment
import gym
import gym.spaces
from gym.utils import seeding
import enum
import numpy as np
from . import data
DEFAULT_BARS_COUNT = 10
DEFAULT_COMMISSION_PERC = 0.1
class Actions(enum.Enum):
    Skip = 0   # do nothing on this bar
    Buy = 1    # buy a share (open a position)
    Close = 2  # close the position (sell)
class State:
def __init__(self, bars_count, commission_perc, reset_on_close, reward_on_close=True, volumes=True):
assert isinstance(bars_count, int)
assert bars_count > 0
assert isinstance(commission_perc, float)
assert commission_perc >= 0.0
assert isinstance(reset_on_close, bool)
assert isinstance(reward_on_close, bool)
        self.bars_count = bars_count            # number of bars in each observation, 10 by default
        self.commission_perc = commission_perc  # percentage of the share price paid to the broker, 0.1% by default
        # If True (the default), the episode ends every time the agent closes (sells) its position;
        # otherwise the episode lasts until the end of the time series, i.e. one year of data.
        self.reset_on_close = reset_on_close
        # If True, the agent gets a reward only when the position is closed;
        # otherwise a small reward is given on every bar.
        self.reward_on_close = reward_on_close
        # Whether to include traded volumes in the observation (disabled by default at the StocksEnv level).
        self.volumes = volumes
    # Reset the state: store the passed price data and the starting offset.
def reset(self, prices, offset):
assert isinstance(prices, data.Prices)
assert offset >= self.bars_count-1
self.have_position = False
self.open_price = 0.0
        self._prices = prices  # data.Prices container with open/high/low/close/volume arrays for one instrument
self._offset = offset
    # Shape of the state represented as a numpy array.
@property
def shape(self):
# [h, l, c] * bars + position_flag + rel_profit (since open)
if self.volumes:
return (4 * self.bars_count + 1 + 1, )
else:
return (3*self.bars_count + 1 + 1, )
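        # With the default bars_count of 10 this is a vector of 32 values (42 with volumes):
        # 3 (or 4) numbers per bar plus the position flag and the relative profit since opening.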
    # Encode the prices at the current offset into a numpy array - the agent's observation.
def encode(self):
"""
Convert current state into numpy array.
"""
res = np.ndarray(shape=self.shape, dtype=np.float32)
shift = 0
for bar_idx in range(-self.bars_count+1, 1):
res[shift] = self._prices.high[self._offset + bar_idx]
shift += 1
res[shift] = self._prices.low[self._offset + bar_idx]
shift += 1
res[shift] = self._prices.close[self._offset + bar_idx]
shift += 1
if self.volumes:
res[shift] = self._prices.volume[self._offset + bar_idx]
shift += 1
res[shift] = float(self.have_position)
shift += 1
if not self.have_position:
res[shift] = 0.0
else:
res[shift] = (self._cur_close() - self.open_price) / self.open_price
return res
    # Calculate the real close price of the current bar.
def _cur_close(self):
"""
Calculate real close price for the current bar
"""
open = self._prices.open[self._offset]
rel_close = self._prices.close[self._offset]
return open * (1.0 + rel_close)
    # Perform one step in the environment and return the reward (in percent) plus the done flag.
def step(self, action):
"""
        Perform one step over the price series, adjust the offset, check for the end of prices
and handle position change
:param action:
:return: reward, done
"""
assert isinstance(action, Actions)
reward = 0.0
done = False
close = self._cur_close()
        # Buy a share: change the state and pay the commission.
if action == Actions.Buy and not self.have_position:
self.have_position = True
self.open_price = close
reward -= self.commission_perc
        # Close the position: pay the commission, set the done flag if in reset_on_close mode, give the reward and change the state.
elif action == Actions.Close and self.have_position:
reward -= self.commission_perc
done |= self.reset_on_close
if self.reward_on_close:
reward += 100.0 * (close - self.open_price) / self.open_price
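                # e.g. with hypothetical prices: open_price == 100.0 and close == 105.0 add +5.0 here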
self.have_position = False
self.open_price = 0.0
        # Shift the current offset and give a reward for the last bar's price movement.
self._offset += 1
prev_close = close
close = self._cur_close()
done |= self._offset >= self._prices.close.shape[0]-1
if self.have_position and not self.reward_on_close:
reward += 100.0 * (close - prev_close) / prev_close
return reward, done
class State1D(State):
"""
State with shape suitable for 1D convolution
"""
@property
def shape(self):
if self.volumes:
return (6, self.bars_count)
else:
return (5, self.bars_count)
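    # Rows of the 2D state: high, low, close, (volume,) position flag, relative profit;
    # each row holds the last bars_count values, which suits a 1D convolution over time.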
    # Encode the prices into a matrix, depending on the current offset, whether volumes are needed and whether we hold a position.
def encode(self):
res = np.zeros(shape=self.shape, dtype=np.float32)
ofs = self.bars_count-1
res[0] = self._prices.high[self._offset-ofs:self._offset+1]
res[1] = self._prices.low[self._offset-ofs:self._offset+1]
res[2] = self._prices.close[self._offset-ofs:self._offset+1]
if self.volumes:
res[3] = self._prices.volume[self._offset-ofs:self._offset+1]
dst = 4
else:
dst = 3
if self.have_position:
res[dst] = 1.0
res[dst+1] = (self._cur_close() - self.open_price) / self.open_price
return res
class StocksEnv(gym.Env):
metadata = {'render.modes': ['human']}
"""
prices:包含若干个机构的若干支股票价格 键时机构名 值是容器对象data.Prices
bars_count:观察中经历的bar 默认10
commission:买卖股票时给经纪人的股票价格百分比 默认0.1%
reset_on_close:默认True 则agent平仓时都会停止该片段 否则将持续到时间序列结束(一年的数据)
state_1d:传递给agent的观察的价格数据的不同表现形式
True时 观察为2D 不同bar同类型的价格(最高 最低 收盘)放在同一行(最高价第一行 最低价第二行..)
False时,所有bar的数据组件都放在一行 H1 L1 C1 V1 H2 L2 C2 V2 ...
random_ofs_on_reset:默认True 则环境重置时将从时间序列的随机偏移开始 否则从头开始
reward_on_close:True时agent仅在平仓时获得奖励 否则将在每个bar都给一个小额奖励
volumes:决定是否在观察中增加交易量 默认为禁用
"""
def __init__(self, prices, bars_count=DEFAULT_BARS_COUNT,
commission=DEFAULT_COMMISSION_PERC, reset_on_close=True, state_1d=False,
random_ofs_on_reset=True, reward_on_close=False, volumes=False):
assert isinstance(prices, dict)
self._prices = prices
if state_1d:
self._state = State1D(bars_count, commission, reset_on_close, reward_on_close=reward_on_close,
volumes=volumes)
else:
self._state = State(bars_count, commission, reset_on_close, reward_on_close=reward_on_close,
volumes=volumes)
self.action_space = gym.spaces.Discrete(n=len(Actions))
self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=self._state.shape, dtype=np.float32)
self.random_ofs_on_reset = random_ofs_on_reset
self.seed()
    # Randomly switch the time series to work with and pick the starting offset.
def reset(self):
        # make a selection of the instrument and its offset, then reset the state
self._instrument = self.np_random.choice(list(self._prices.keys()))
prices = self._prices[self._instrument]
bars = self._state.bars_count
if self.random_ofs_on_reset:
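            # pick a random offset that keeps at least `bars` bars of history before it
            # and leaves room in the series to play an episode after it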
offset = self.np_random.choice(prices.high.shape[0]-bars*10) + bars
else:
offset = bars
self._state.reset(prices, offset)
return self._state.encode()
    # Handle the agent's action and return the next observation, the reward and the done flag.
def step(self, action_idx):
action = Actions(action_idx)
reward, done = self._state.step(action)
obs = self._state.encode()
info = {"instrument": self._instrument, "offset": self._state._offset}
return obs, reward, done, info
    # Peek into the internal state of the environment (nothing is rendered here).
def render(self, mode='human', close=False):
pass
    # Release allocated resources.
def close(self):
pass
    # Seed the random number generators used by the environment.
def seed(self, seed=None):
self.np_random, seed1 = seeding.np_random(seed)
seed2 = seeding.hash_seed(seed1 + 1) % 2 ** 31
return [seed1, seed2]
    # Create an instance by calling the from_dir class method with a data directory as the argument:
    # it loads quotes from every CSV file in the directory and builds the environment.
@classmethod
def from_dir(cls, data_dir, **kwargs):
prices = {file: data.load_relative(file) for file in data.price_files(data_dir)}
return StocksEnv(prices, **kwargs)
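A minimal usage sketch for the environment above - assuming this module is saved as lib/environ.py next to the book's lib/data.py, and that the Yandex CSV used later for training is available:
from lib import environ, data

# load relative prices for one instrument and wrap them into the environment
stock_data = {"YNDX": data.load_relative("data/YNDX_160101_161231.csv")}
env = environ.StocksEnv(stock_data, bars_count=10, volumes=False)

obs = env.reset()
print(obs.shape)        # (32,): 3 values per bar * 10 bars + position flag + relative profit

obs, reward, done, info = env.step(environ.Actions.Buy.value)    # open a position, pay commission
obs, reward, done, info = env.step(environ.Actions.Close.value)  # close it again
print(reward, done, info)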
Model definitions
Architecture: a three-layer feed-forward network, and a network that uses 1D convolution as a feature extractor; in both cases the extractor is followed by two fully connected heads producing the Q-values.
The convolutional models share a common 1D-convolution feature-extraction stage and two fully connected heads that output the state value and the action advantages, combined in the dueling-DQN fashion. (A quick shape check of the models follows the code below.)
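A toy illustration (with made-up numbers) of how the two heads are combined inside each model's forward():
import torch

val = torch.tensor([[0.5]])              # V(s) for one sample
adv = torch.tensor([[0.1, 0.4, -0.2]])   # A(s, a) for Skip, Buy, Close
q = val + (adv - adv.mean(dim=1, keepdim=True))
print(q)                                 # tensor([[0.5000, 0.8000, 0.2000]])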
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
class NoisyLinear(nn.Linear):
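    # NoisyNet-style linear layer (independent Gaussian noise): trainable sigma parameters plus
    # non-trainable epsilon buffers resampled on every forward pass. It is kept here for experiments
    # and is not used by the three models below.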
def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)
self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init))
self.register_buffer("epsilon_weight", torch.zeros(out_features, in_features))
if bias:
self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init))
self.register_buffer("epsilon_bias", torch.zeros(out_features))
self.reset_parameters()
def reset_parameters(self):
std = math.sqrt(3 / self.in_features)
self.weight.data.uniform_(-std, std)
self.bias.data.uniform_(-std, std)
def forward(self, input):
self.epsilon_weight.normal_()
bias = self.bias
if bias is not None:
self.epsilon_bias.normal_()
bias = bias + self.sigma_bias * self.epsilon_bias
return F.linear(input, self.weight + self.sigma_weight * self.epsilon_weight, bias)
class SimpleFFDQN(nn.Module):
def __init__(self, obs_len, actions_n):
super(SimpleFFDQN, self).__init__()
self.fc_val = nn.Sequential(
nn.Linear(obs_len, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 1)
)
self.fc_adv = nn.Sequential(
nn.Linear(obs_len, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, actions_n)
)
def forward(self, x):
val = self.fc_val(x)
adv = self.fc_adv(x)
return val + (adv - adv.mean(dim=1, keepdim=True))
class DQNConv1D(nn.Module):
def __init__(self, shape, actions_n):
super(DQNConv1D, self).__init__()
self.conv = nn.Sequential(
nn.Conv1d(shape[0], 128, 5),
nn.ReLU(),
nn.Conv1d(128, 128, 5),
nn.ReLU(),
)
out_size = self._get_conv_out(shape)
self.fc_val = nn.Sequential(
nn.Linear(out_size, 512),
nn.ReLU(),
nn.Linear(512, 1)
)
self.fc_adv = nn.Sequential(
nn.Linear(out_size, 512),
nn.ReLU(),
nn.Linear(512, actions_n)
)
def _get_conv_out(self, shape):
o = self.conv(torch.zeros(1, *shape))
return int(np.prod(o.size()))
def forward(self, x):
conv_out = self.conv(x).view(x.size()[0], -1)
val = self.fc_val(conv_out)
adv = self.fc_adv(conv_out)
return val + (adv - adv.mean(dim=1, keepdim=True))
class DQNConv1DLarge(nn.Module):
def __init__(self, shape, actions_n):
super(DQNConv1DLarge, self).__init__()
self.conv = nn.Sequential(
nn.Conv1d(shape[0], 32, 3),
nn.MaxPool1d(3, 2),
nn.ReLU(),
nn.Conv1d(32, 32, 3),
nn.MaxPool1d(3, 2),
nn.ReLU(),
nn.Conv1d(32, 32, 3),
nn.MaxPool1d(3, 2),
nn.ReLU(),
nn.Conv1d(32, 32, 3),
nn.MaxPool1d(3, 2),
nn.ReLU(),
nn.Conv1d(32, 32, 3),
nn.ReLU(),
nn.Conv1d(32, 32, 3),
nn.ReLU(),
)
out_size = self._get_conv_out(shape)
self.fc_val = nn.Sequential(
nn.Linear(out_size, 512),
nn.ReLU(),
nn.Linear(512, 1)
)
self.fc_adv = nn.Sequential(
nn.Linear(out_size, 512),
nn.ReLU(),
nn.Linear(512, actions_n)
)
def _get_conv_out(self, shape):
o = self.conv(torch.zeros(1, *shape))
return int(np.prod(o.size()))
def forward(self, x):
conv_out = self.conv(x).view(x.size()[0], -1)
val = self.fc_val(conv_out)
adv = self.fc_adv(conv_out)
return val + (adv - adv.mean(dim=1, keepdim=True))
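A quick shape check tying the models to the two state encodings of the environment (assuming bars_count=10, volumes disabled and three actions):
import torch

obs_len = 3 * 10 + 2                          # State.shape without volumes
net_ff = SimpleFFDQN(obs_len, actions_n=3)
print(net_ff(torch.zeros(1, obs_len)).shape)  # torch.Size([1, 3]) - one Q-value per action

net_conv = DQNConv1D((5, 10), actions_n=3)    # State1D.shape without volumes
print(net_conv(torch.zeros(1, 5, 10)).shape)  # torch.Size([1, 3])
# DQNConv1DLarge is left out: with its four pooling layers it needs a larger bars_count than 10.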
Model training
Feed-forward model
#!/usr/bin/env python3
import ptan
import pathlib
import argparse
import gym.wrappers
import numpy as np
import torch
import torch.optim as optim
from ignite.engine import Engine
from ignite.contrib.handlers import tensorboard_logger as tb_logger
from lib import environ, data, models, common, validation
SAVES_DIR = pathlib.Path("saves")
STOCKS = "data/YNDX_160101_161231.csv"
VAL_STOCKS = "data/YNDX_150101_151231.csv"
BATCH_SIZE = 32
BARS_COUNT = 10
EPS_START = 1.0
EPS_FINAL = 0.1
EPS_STEPS = 1000000
GAMMA = 0.99
REPLAY_SIZE = 100000
REPLAY_INITIAL = 10000
REWARD_STEPS = 2
LEARNING_RATE = 0.0001
STATES_TO_EVALUATE = 1000
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--cuda", help="Enable cuda", default=False, action="store_true")
parser.add_argument("--data", default=STOCKS, help=f"Stocks file or dir, default={STOCKS}")
parser.add_argument("--year", type=int, help="Year to train on, overrides --data")
parser.add_argument("--val", default=VAL_STOCKS, help="Validation data, default=" + VAL_STOCKS)
parser.add_argument("-r", "--run", required=True, help="Run name")
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")
saves_path = SAVES_DIR / f"simple-{args.run}"
saves_path.mkdir(parents=True, exist_ok=True)
data_path = pathlib.Path(args.data)
val_path = pathlib.Path(args.val)
if args.year is not None or data_path.is_file():
if args.year is not None:
stock_data = data.load_year_data(args.year)
else:
stock_data = {"YNDX": data.load_relative(data_path)}
env = environ.StocksEnv(
stock_data, bars_count=BARS_COUNT)
env_tst = environ.StocksEnv(
stock_data, bars_count=BARS_COUNT)
elif data_path.is_dir():
env = environ.StocksEnv.from_dir(
data_path, bars_count=BARS_COUNT)
env_tst = environ.StocksEnv.from_dir(
data_path, bars_count=BARS_COUNT)
else:
raise RuntimeError("No data to train on")
env = gym.wrappers.TimeLimit(env, max_episode_steps=1000)
val_data = {"YNDX": data.load_relative(val_path)}
env_val = environ.StocksEnv(val_data, bars_count=BARS_COUNT)
net = models.SimpleFFDQN(env.observation_space.shape[0],
env.action_space.n).to(device)
tgt_net = ptan.agent.TargetNet(net)
selector = ptan.actions.EpsilonGreedyActionSelector(EPS_START)
eps_tracker = ptan.actions.EpsilonTracker(
selector, EPS_START, EPS_FINAL, EPS_STEPS)
agent = ptan.agent.DQNAgent(net, selector, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
env, agent, GAMMA, steps_count=REWARD_STEPS)
buffer = ptan.experience.ExperienceReplayBuffer(
exp_source, REPLAY_SIZE)
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
def process_batch(engine, batch):
optimizer.zero_grad()
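        # ExperienceSourceFirstLast returns REWARD_STEPS-step transitions, so the discount
        # applied to the bootstrapped value below is GAMMA ** REWARD_STEPS.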
loss_v = common.calc_loss(
batch, net, tgt_net.target_model,
gamma=GAMMA ** REWARD_STEPS, device=device)
loss_v.backward()
optimizer.step()
eps_tracker.frame(engine.state.iteration)
if getattr(engine.state, "eval_states", None) is None:
eval_states = buffer.sample(STATES_TO_EVALUATE)
eval_states = [np.array(transition.state, copy=False)
for transition in eval_states]
engine.state.eval_states = np.array(eval_states, copy=False)
return {
"loss": loss_v.item(),
"epsilon": selector.epsilon,
}
engine = Engine(process_batch)
tb = common.setup_ignite(engine, exp_source, f"simple-{args.run}",
extra_metrics=('values_mean',))
@engine.on(ptan.ignite.PeriodEvents.ITERS_1000_COMPLETED)
def sync_eval(engine: Engine):
tgt_net.sync()
mean_val = common.calc_values_of_states(
engine.state.eval_states, net, device=device)
engine.state.metrics["values_mean"] = mean_val
if getattr(engine.state, "best_mean_val", None) is None:
engine.state.best_mean_val = mean_val
if engine.state.best_mean_val < mean_val:
print("%d: Best mean value updated %.3f -> %.3f" % (
engine.state.iteration, engine.state.best_mean_val,
mean_val))
path = saves_path / ("mean_value-%.3f.data" % mean_val)
torch.save(net.state_dict(), path)
engine.state.best_mean_val = mean_val
@engine.on(ptan.ignite.PeriodEvents.ITERS_10000_COMPLETED)
def validate(engine: Engine):
res = validation.validation_run(env_tst, net, device=device)
print("%d: tst: %s" % (engine.state.iteration, res))
for key, val in res.items():
engine.state.metrics[key + "_tst"] = val
res = validation.validation_run(env_val, net, device=device)
print("%d: val: %s" % (engine.state.iteration, res))
for key, val in res.items():
engine.state.metrics[key + "_val"] = val
val_reward = res['episode_reward']
if getattr(engine.state, "best_val_reward", None) is None:
engine.state.best_val_reward = val_reward
if engine.state.best_val_reward < val_reward:
print("Best validation reward updated: %.3f -> %.3f, model saved" % (
engine.state.best_val_reward, val_reward
))
engine.state.best_val_reward = val_reward
path = saves_path / ("val_reward-%.3f.data" % val_reward)
torch.save(net.state_dict(), path)
event = ptan.ignite.PeriodEvents.ITERS_10000_COMPLETED
tst_metrics = [m + "_tst" for m in validation.METRICS]
tst_handler = tb_logger.OutputHandler(
tag="test", metric_names=tst_metrics)
tb.attach(engine, log_handler=tst_handler, event_name=event)
val_metrics = [m + "_val" for m in validation.METRICS]
val_handler = tb_logger.OutputHandler(
tag="validation", metric_names=val_metrics)
tb.attach(engine, log_handler=val_handler, event_name=event)
engine.run(common.batch_generator(buffer, REPLAY_INITIAL, BATCH_SIZE))
1D convolution model
#!/usr/bin/env python3
import ptan
import pathlib
import argparse
import gym.wrappers
import numpy as np
import torch
import torch.optim as optim
from ignite.engine import Engine
from ignite.contrib.handlers import tensorboard_logger as tb_logger
from lib import environ, data, models, common, validation
SAVES_DIR = pathlib.Path("saves")
STOCKS = "data/YNDX_160101_161231.csv"
VAL_STOCKS = "data/YNDX_150101_151231.csv"
BATCH_SIZE = 32
BARS_COUNT = 10
EPS_START = 1.0
EPS_FINAL = 0.1
EPS_STEPS = 1000000
GAMMA = 0.99
REPLAY_SIZE = 100000
REPLAY_INITIAL = 10000
REWARD_STEPS = 2
LEARNING_RATE = 0.0001
STATES_TO_EVALUATE = 1000
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--cuda", help="Enable cuda", default=False, action="store_true")
parser.add_argument("--data", default=STOCKS, help=f"Stocks file or dir, default={STOCKS}")
parser.add_argument("--year", type=int, help="Year to train on, overrides --data")
parser.add_argument("--val", default=VAL_STOCKS, help="Validation data, default=" + VAL_STOCKS)
parser.add_argument("-r", "--run", required=True, help="Run name")
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")
saves_path = SAVES_DIR / f"conv-{args.run}"
saves_path.mkdir(parents=True, exist_ok=True)
data_path = pathlib.Path(args.data)
val_path = pathlib.Path(args.val)
if args.year is not None or data_path.is_file():
if args.year is not None:
stock_data = data.load_year_data(args.year)
else:
stock_data = {"YNDX": data.load_relative(data_path)}
env = environ.StocksEnv(stock_data, bars_count=BARS_COUNT, state_1d=True)
env_tst = environ.StocksEnv(stock_data, bars_count=BARS_COUNT, state_1d=True)
elif data_path.is_dir():
env = environ.StocksEnv.from_dir(data_path, bars_count=BARS_COUNT, state_1d=True)
env_tst = environ.StocksEnv.from_dir(data_path, bars_count=BARS_COUNT, state_1d=True)
else:
raise RuntimeError("No data to train on")
env = gym.wrappers.TimeLimit(env, max_episode_steps=1000)
val_data = {"YNDX": data.load_relative(val_path)}
env_val = environ.StocksEnv(val_data, bars_count=BARS_COUNT, state_1d=True)
net = models.DQNConv1D(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = ptan.agent.TargetNet(net)
selector = ptan.actions.EpsilonGreedyActionSelector(EPS_START)
eps_tracker = ptan.actions.EpsilonTracker(
selector, EPS_START, EPS_FINAL, EPS_STEPS)
agent = ptan.agent.DQNAgent(net, selector, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
env, agent, GAMMA, steps_count=REWARD_STEPS)
buffer = ptan.experience.ExperienceReplayBuffer(
exp_source, REPLAY_SIZE)
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
def process_batch(engine, batch):
optimizer.zero_grad()
loss_v = common.calc_loss(
batch, net, tgt_net.target_model,
gamma=GAMMA ** REWARD_STEPS, device=device)
loss_v.backward()
optimizer.step()
eps_tracker.frame(engine.state.iteration)
if getattr(engine.state, "eval_states", None) is None:
eval_states = buffer.sample(STATES_TO_EVALUATE)
eval_states = [np.array(transition.state, copy=False)
for transition in eval_states]
engine.state.eval_states = np.array(eval_states, copy=False)
return {
"loss": loss_v.item(),
"epsilon": selector.epsilon,
}
engine = Engine(process_batch)
tb = common.setup_ignite(engine, exp_source, f"conv-{args.run}",
extra_metrics=('values_mean',))
@engine.on(ptan.ignite.PeriodEvents.ITERS_1000_COMPLETED)
def sync_eval(engine: Engine):
tgt_net.sync()
mean_val = common.calc_values_of_states(
engine.state.eval_states, net, device=device)
engine.state.metrics["values_mean"] = mean_val
if getattr(engine.state, "best_mean_val", None) is None:
engine.state.best_mean_val = mean_val
if engine.state.best_mean_val < mean_val:
print("%d: Best mean value updated %.3f -> %.3f" % (
engine.state.iteration, engine.state.best_mean_val,
mean_val))
path = saves_path / ("mean_value-%.3f.data" % mean_val)
torch.save(net.state_dict(), path)
engine.state.best_mean_val = mean_val
@engine.on(ptan.ignite.PeriodEvents.ITERS_10000_COMPLETED)
def validate(engine: Engine):
res = validation.validation_run(env_tst, net, device=device)
print("%d: tst: %s" % (engine.state.iteration, res))
for key, val in res.items():
engine.state.metrics[key + "_tst"] = val
res = validation.validation_run(env_val, net, device=device)
print("%d: val: %s" % (engine.state.iteration, res))
for key, val in res.items():
engine.state.metrics[key + "_val"] = val
val_reward = res['episode_reward']
if getattr(engine.state, "best_val_reward", None) is None:
engine.state.best_val_reward = val_reward
if engine.state.best_val_reward < val_reward:
print("Best validation reward updated: %.3f -> %.3f, model saved" % (
engine.state.best_val_reward, val_reward
))
engine.state.best_val_reward = val_reward
path = saves_path / ("val_reward-%.3f.data" % val_reward)
torch.save(net.state_dict(), path)
event = ptan.ignite.PeriodEvents.ITERS_10000_COMPLETED
tst_metrics = [m + "_tst" for m in validation.METRICS]
tst_handler = tb_logger.OutputHandler(
tag="test", metric_names=tst_metrics)
tb.attach(engine, log_handler=tst_handler, event_name=event)
val_metrics = [m + "_val" for m in validation.METRICS]
val_handler = tb_logger.OutputHandler(
tag="validation", metric_names=val_metrics)
tb.attach(engine, log_handler=val_handler, event_name=event)
engine.run(common.batch_generator(buffer, REPLAY_INITIAL, BATCH_SIZE))