训练数据预处理
# -*- coding: utf-8 -*-
# @Time : 2021/5/15 10:25 下午
# @Author : jiangming
# @FileName: data_preprocess.py
# @Software: PyCharm
# @Desc : 数据预处理,生成训练数据
import datetime
from tqdm import tqdm
from pandas import DataFrame
import pandas as pd
import numpy as np
from vnpy.trader.constant import Exchange, Interval
from vnpy.trader.database import database_manager
from mycode.utils import bardata_to_dataframe
from joblib import Parallel,delayed
class DataProcesser:
    """Load bar data for one symbol and label it as long/short training samples.

    The bars are read once in ``__init__`` and kept in ``self.barframe``; the
    ``*_scheme1`` methods then derive one labelled row per bar from them.
    """

    def __init__(self, symbol: str, exchanges_type: str, exchange: "Exchange", interval: "Interval",
                 start_date: str, end_date: str):
        """Read the bar data set for ``symbol`` from the database.

        Args:
            symbol: Instrument symbol. Lower-cased when ``exchanges_type`` is
                ``'spot'``, upper-cased when it is ``'future'``, otherwise
                used exactly as given.
            exchanges_type: ``'spot'`` or ``'future'``.
            exchange: Exchange forwarded to ``database_manager.load_bar_data``.
            interval: Bar interval forwarded to ``load_bar_data``.
            start_date: Start of the range, formatted ``%Y-%m-%d``.
            end_date: End of the range, formatted ``%Y-%m-%d``.

        Raises:
            ValueError: If a date string does not match ``%Y-%m-%d``.
        """
        # Parameters of the last multi_apply_scheme1 call; read by tmp_func.
        self.retrace_rate = None
        self.profit_rate = None
        self.profit_hour = None

        if exchanges_type == 'spot':
            symbol = symbol.lower()
        elif exchanges_type == 'future':
            symbol = symbol.upper()
        sd = datetime.datetime.strptime(start_date, '%Y-%m-%d')
        ed = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        data = database_manager.load_bar_data(symbol, exchange, interval, sd, ed)
        # NOTE(review): scheme1 expects this frame to carry the columns
        # id, time, open_price, close_price, high_price, low_price -- confirm
        # against bardata_to_dataframe.
        self.barframe = bardata_to_dataframe(data)

    def apply_scheme1(self, retrace_rate, profit_rate, profit_hour):
        """Label every bar with ``scheme1`` in the current process.

        Args:
            retrace_rate: Maximum tolerated adverse move, as a fraction.
            profit_rate: Required favourable move, as a fraction.
            profit_hour: Length of the look-ahead window in hours.

        Returns:
            The result of applying ``scheme1`` row by row, with every entry
            that contains a missing value dropped.
        """
        res = self.barframe.apply(self.scheme1, barframe=self.barframe, retrace_rate=retrace_rate,
                                  profit_rate=profit_rate, profit_hour=profit_hour, axis=1)
        res = res.dropna(how='any')
        return res

    def multi_apply_scheme1(self, retrace_rate, profit_rate, profit_hour, n_jobs: int = 2):
        """Label every bar with ``scheme1`` using joblib workers and a tqdm bar.

        The parameters are stored on the instance because ``tmp_func`` (the
        function handed to the workers) reads them from ``self``.

        Args:
            retrace_rate: Maximum tolerated adverse move, as a fraction.
            profit_rate: Required favourable move, as a fraction.
            profit_hour: Length of the look-ahead window in hours.
            n_jobs: Number of joblib workers; defaults to 2.

        Returns:
            The concatenated per-group results with every entry that contains
            a missing value dropped.
        """
        self.retrace_rate = retrace_rate
        self.profit_rate = profit_rate
        self.profit_hour = profit_hour
        # One group per distinct index value of the bar frame.
        df_grouped = self.barframe.groupby(self.barframe.index)
        res = self.apply_parallel(df_grouped, self.tmp_func, n_jobs=n_jobs)
        res = res.dropna(how='any')
        return res

    def tmp_func(self, df):
        """Apply ``scheme1`` to each row of ``df`` with the stored parameters.

        Uses ``self.retrace_rate``, ``self.profit_rate`` and
        ``self.profit_hour`` as set by ``multi_apply_scheme1``.
        """
        return df.apply(self.scheme1, barframe=self.barframe, retrace_rate=self.retrace_rate,
                        profit_rate=self.profit_rate, profit_hour=self.profit_hour, axis=1)

    def apply_parallel(self, df_grouped, func, n_jobs: int = 2):
        """Run ``func`` on every group of ``df_grouped`` and concatenate the results.

        Args:
            df_grouped: Iterable of ``(name, group)`` pairs, e.g. a pandas GroupBy.
            func: Callable invoked with each group.
            n_jobs: Number of joblib workers; defaults to 2.

        Returns:
            ``pd.concat`` of the per-group results.
        """
        results = Parallel(n_jobs=n_jobs)(delayed(func)(group) for name, group in tqdm(df_grouped))
        return pd.concat(results)

    def scheme1(self, bi, barframe: DataFrame, retrace_rate: float = 0.04, profit_rate: float = 0.02,
                profit_hour: int = 1):
        """Label one bar as a long sample, a short sample, or no sample.

        Scheme 1, with the defaults: within the next hour the price must move
        more than 2% in the trade direction while never moving 4% or more
        against it. (With 10x leverage that corresponds to more than 20%
        profit with less than 40% drawdown.)

        Args:
            bi: The bar being labelled; must expose ``id``, ``time``,
                ``open_price`` and ``close_price``.
            barframe: All bars, with ``time``, ``low_price`` and
                ``high_price`` columns.
            retrace_rate: Maximum tolerated adverse move, as a fraction.
            profit_rate: Required favourable move, as a fraction.
            profit_hour: Length of the look-ahead window in hours.

        Returns:
            A ``pd.Series`` indexed by ``id, time, subject_price, deal_price,
            real_profit_rate, real_profit_hour, expect_profit_hour,
            max_retrace_rate, is_short``. ``is_short`` is 0 for a long sample
            and 1 for a short sample. When neither direction qualifies, the
            metric fields are NaN. ``None`` is returned when the look-ahead
            window contains no bars, or when the take-profit level is already
            reached on the first bar of the window.
        """
        t = bi.time
        t_profit = t + datetime.timedelta(hours=profit_hour)
        # Reference entry price: midpoint of the bar's open and close.
        subject_price = (bi.open_price + bi.close_price) / 2

        # Stop and take-profit levels for a long position.
        up_retrace_price = subject_price * (1 - retrace_rate)
        up_profit_price = subject_price * (1 + profit_rate)
        # Stop and take-profit levels for a short position.
        short_retrace_price = subject_price * (1 + retrace_rate)
        short_profit_price = subject_price * (1 - profit_rate)

        columns = ['id', 'time', 'subject_price', 'deal_price', 'real_profit_rate', 'real_profit_hour',
                   'expect_profit_hour',
                   'max_retrace_rate', 'is_short']

        # Bars strictly after this one, up to and including the window end.
        sub_frame = barframe[(barframe['time'] > t) & (barframe['time'] <= t_profit)].reset_index(drop=True)
        if len(sub_frame) == 0:
            return None
        low_price = sub_frame.low_price.to_list()
        high_price = sub_frame.high_price.to_list()
        bar_times = sub_frame['time']

        # Long: the low never reaches the stop and some high exceeds the target.
        if min(low_price) > up_retrace_price and max(high_price) > up_profit_price:
            # Earliest bar whose high exceeds the target; max() above guarantees one.
            hit = next(i for i, p in enumerate(high_price) if p > up_profit_price)
            if hit == 0:
                return None
            deal_price = high_price[hit]
            real_profit_rate = deal_price / subject_price - 1
            # total_seconds() keeps whole days, so windows of 24h or more stay correct.
            real_profit_hour = (bar_times.iloc[hit] - t).total_seconds() / 3600
            # Worst drawdown over the bars before the take-profit bar.
            max_retrace_rate = (subject_price - min(low_price[:hit])) / subject_price
            return pd.Series(
                [bi.id, t, subject_price, deal_price, real_profit_rate, real_profit_hour, profit_hour,
                 max_retrace_rate, 0],
                index=columns)

        # Short: the high never reaches the stop and some low falls below the target.
        if max(high_price) < short_retrace_price and min(low_price) < short_profit_price:
            # Earliest bar whose low falls below the target; min() above guarantees one.
            hit = next(i for i, p in enumerate(low_price) if p < short_profit_price)
            if hit == 0:
                return None
            deal_price = low_price[hit]
            real_profit_rate = subject_price / deal_price - 1
            real_profit_hour = (bar_times.iloc[hit] - t).total_seconds() / 3600
            # Worst adverse move over the bars before the take-profit bar.
            max_retrace_rate = (max(high_price[:hit]) - subject_price) / subject_price
            return pd.Series(
                [bi.id, t, subject_price, deal_price, real_profit_rate, real_profit_hour, profit_hour,
                 max_retrace_rate, 1],
                index=columns)

        # Neither direction qualifies: NaN metrics let callers drop the row with dropna().
        return pd.Series(
            [bi.id, t, subject_price, np.nan, np.nan, np.nan, profit_hour, np.nan, np.nan],
            index=columns)
用法
from vnpy.trader.constant import Exchange, Interval
import pandas as pd
import numpy as np
from mycode.utils import BarPlot, bardata_to_dataframe
from mycode.data_preprocess import DataProcesser

# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)

# Sampling configuration.
symbol = "STORJUSDT"  # coin symbol
interval = Interval.MINUTE
retrace_rate, profit_rate, profit_hour = 0.04, 0.02, 1

# Load the bars and turn them into labelled training rows.
data_processer = DataProcesser(
    symbol=symbol,
    exchanges_type='future',
    exchange=Exchange.BINANCE,
    interval=interval,
    start_date='2021-05-01',
    end_date='2021-05-15',
)
schema_data = data_processer.apply_scheme1(
    retrace_rate=retrace_rate,
    profit_rate=profit_rate,
    profit_hour=profit_hour,
)
# Parallel alternative:
# schema_data = data_processer.multi_apply_scheme1(retrace_rate, profit_rate, profit_hour)
bar_frame = data_processer.barframe

# Visualize the training data.
bar_plot = BarPlot(schema_data, bar_frame)
sample_schema_data = bar_plot.grid_5K_line()