数据预处理

最新推荐文章于 2023-12-25 22:56:03 发布

BeKnown

最新推荐文章于 2023-12-25 22:56:03 发布

阅读量298

点赞数

分类专栏： Python

本文链接：https://blog.csdn.net/m0_37953759/article/details/116903417

版权

Python 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

训练数据预处理

# -*- coding: utf-8 -*-
# @Time    : 2021/5/15 10:25 下午
# @Author  : jiangming
# @FileName: data_preprocess.py
# @Software: PyCharm
# @Desc    : 数据预处理，生成训练数据
import datetime

from tqdm import tqdm
from pandas import DataFrame
import pandas as pd
import numpy as np
from vnpy.trader.constant import Exchange, Interval
from vnpy.trader.database import database_manager
from mycode.utils import bardata_to_dataframe
from joblib import Parallel,delayed



class DataProcesser:
    """Load bar data for one symbol and label every bar as a long/short training sample.

    The bars are read once in ``__init__`` and kept in ``self.barframe``.  ``scheme1`` reads the
    columns ``id``, ``time``, ``open_price``, ``close_price``, ``low_price`` and ``high_price``
    from that frame.
    """

    def __init__(self, symbol: str, exchanges_type: str, exchange: "Exchange", interval: "Interval",
                 start_date: str, end_date: str):
        """Read the data set for ``symbol`` between ``start_date`` and ``end_date``.

        symbol: instrument symbol; lower-cased for ``'spot'`` and upper-cased for ``'future'``,
            left untouched for any other ``exchanges_type``.
        exchanges_type: ``'spot'`` or ``'future'``.
        exchange, interval: passed through to ``database_manager.load_bar_data``.
        start_date, end_date: dates in ``'%Y-%m-%d'`` format.

        Raises ``ValueError`` (from ``strptime``) when a date does not match the format.
        """
        # Parameters of the most recent multi_apply_scheme1 call; read by tmp_func.
        self.retrace_rate = None
        self.profit_rate = None
        self.profit_hour = None

        if exchanges_type == 'spot':
            symbol = symbol.lower()
        elif exchanges_type == 'future':
            symbol = symbol.upper()

        sd = datetime.datetime.strptime(start_date, '%Y-%m-%d')
        ed = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        data = database_manager.load_bar_data(symbol, exchange, interval, sd, ed)
        self.barframe = bardata_to_dataframe(data)

    def apply_scheme1(self, retrace_rate, profit_rate, profit_hour):
        """Label every bar with ``scheme1`` sequentially.

        Returns the result of the row-wise ``apply`` after dropping every row that contains a
        missing value, i.e. bars for which ``scheme1`` found neither a long nor a short signal.
        NOTE(review): ``scheme1`` returns ``None`` for some bars; how those combine with the
        ``Series`` results depends on pandas' ``apply`` result inference - confirm.
        """
        res = self.barframe.apply(self.scheme1, barframe=self.barframe, retrace_rate=retrace_rate,
                                  profit_rate=profit_rate, profit_hour=profit_hour, axis=1)
        res = res.dropna(how='any')
        return res

    def multi_apply_scheme1(self, retrace_rate, profit_rate, profit_hour):
        """Label every bar with ``scheme1`` in parallel (joblib), with a tqdm progress bar.

        The parameters are stored on the instance so that ``tmp_func`` can read them.  The frame
        is grouped by its own index, and each group is labelled as a separate joblib task.
        Rows containing missing values are dropped from the concatenated result.
        """
        self.retrace_rate = retrace_rate
        self.profit_rate = profit_rate
        self.profit_hour = profit_hour

        df_grouped = self.barframe.groupby(self.barframe.index)
        res = self.apply_parallel(df_grouped, self.tmp_func)
        res = res.dropna(how='any')
        return res

    def tmp_func(self, df):
        """Apply ``scheme1`` to every row of ``df`` using the parameters stored on the instance."""
        return df.apply(self.scheme1, barframe=self.barframe, retrace_rate=self.retrace_rate,
                        profit_rate=self.profit_rate, profit_hour=self.profit_hour, axis=1)

    def apply_parallel(self, df_grouped, func):
        """Run ``func`` on every group of ``df_grouped`` via joblib and concatenate the results."""
        # Only this line needs changing to tune the parallelism (n_jobs).
        results = Parallel(n_jobs=2)(delayed(func)(group) for _, group in tqdm(df_grouped))
        return pd.concat(results)

    def scheme1(self, bi, barframe: DataFrame, retrace_rate: float = 0.04, profit_rate: float = 0.02,
                profit_hour: int = 1):
        """Label one bar as a positive long sample, a positive short sample, or no signal.

        Scheme 1, with the defaults: a bar is a positive sample when, within the next hour, the
        price reaches a profit of more than 2% while the adverse move stays below 4%.  With 10x
        leverage that corresponds to a profit above 20% with a drawdown below 40%.

        bi: the bar (row) being labelled; its ``id``, ``time``, ``open_price`` and
            ``close_price`` are read.
        barframe: all bars; the bars with ``t < time <= t + profit_hour`` form the look-ahead
            window.
        retrace_rate: maximum tolerated adverse move, as a fraction of the entry price.
        profit_rate: take-profit threshold, as a fraction of the entry price.
        profit_hour: length of the look-ahead window in hours.

        Returns a ``pd.Series`` indexed by ``id, time, subject_price, deal_price,
        real_profit_rate, real_profit_hour, expect_profit_hour, max_retrace_rate, is_short``
        (``is_short`` is 0 for long, 1 for short).  When there is no signal the outcome fields
        are NaN.  Returns ``None`` when the look-ahead window is empty or when the take-profit
        level is already reached on the first bar of the window.
        """
        t = bi.time
        t_profit = t + datetime.timedelta(hours=profit_hour)
        subject_price = (bi.open_price + bi.close_price) / 2  # entry (reference) price

        # Stop and take-profit levels for a long position.
        up_retrace_price = subject_price * (1 - retrace_rate)
        up_profit_price = subject_price * (1 + profit_rate)
        # Stop and take-profit levels for a short position.
        short_retrace_price = subject_price * (1 + retrace_rate)
        short_profit_price = subject_price * (1 - profit_rate)

        columns = ['id', 'time', 'subject_price', 'deal_price', 'real_profit_rate', 'real_profit_hour',
                   'expect_profit_hour',
                   'max_retrace_rate', 'is_short']

        in_window = (barframe['time'] > t) & (barframe['time'] <= t_profit)
        sub_frame = barframe[in_window].reset_index(drop=True)
        if sub_frame.empty:
            return None
        low_price = sub_frame.low_price.to_list()
        high_price = sub_frame.high_price.to_list()

        # Long: the lows never breach the stop and some high exceeds the take-profit level.
        if min(low_price) > up_retrace_price and max(high_price) > up_profit_price:
            # Position of the first bar whose high exceeds the take-profit level; the check
            # above guarantees that such a bar exists.
            hit = next(i for i, p in enumerate(high_price) if p > up_profit_price)
            if hit == 0:
                # No earlier bar in the window to measure the drawdown on.
                return None
            hit_bar = sub_frame.iloc[hit]
            deal_price = hit_bar.high_price
            real_profit_rate = deal_price / subject_price - 1
            # total_seconds() keeps whole days; .seconds would wrap around at 24 hours.
            real_profit_hour = (hit_bar.time - t).total_seconds() / 3600
            # Worst low seen before the take-profit bar.
            max_retrace_rate = (subject_price - min(low_price[:hit])) / subject_price
            return pd.Series(
                [bi.id, t, subject_price, deal_price, real_profit_rate, real_profit_hour, profit_hour,
                 max_retrace_rate, 0],
                index=columns)

        # Short: the highs never breach the stop and some low falls below the take-profit level.
        if max(high_price) < short_retrace_price and min(low_price) < short_profit_price:
            hit = next(i for i, p in enumerate(low_price) if p < short_profit_price)
            if hit == 0:
                return None
            hit_bar = sub_frame.iloc[hit]
            deal_price = hit_bar.low_price  # the low of the take-profit bar is the exit price
            real_profit_rate = subject_price / deal_price - 1
            real_profit_hour = (hit_bar.time - t).total_seconds() / 3600
            # Worst high seen before the take-profit bar.
            max_retrace_rate = (max(high_price[:hit]) - subject_price) / subject_price
            return pd.Series(
                [bi.id, t, subject_price, deal_price, real_profit_rate, real_profit_hour, profit_hour,
                 max_retrace_rate, 1],
                index=columns)

        # No signal: the outcome fields stay NaN so that the callers' dropna() removes the row.
        return pd.Series(
            [bi.id, t, subject_price, np.nan, np.nan, np.nan, profit_hour, np.nan, np.nan],
            index=columns)

用法

from vnpy.trader.constant import Interval, Exchange
import pandas as pd
import numpy as np
from mycode.utils import bardata_to_dataframe, BarPlot
from mycode.data_preprocess import DataProcesser

# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)

# Instrument and bar interval to load.
symbol = "STORJUSDT"  # trading symbol
interval = Interval.MINUTE

# Labelling parameters handed to scheme1.
retrace_rate = 0.04
profit_rate = 0.02
profit_hour = 1

# Load the bars, then label them one by one.
data_processer = DataProcesser(
    symbol=symbol,
    exchanges_type='future',
    exchange=Exchange.BINANCE,
    interval=interval,
    start_date='2021-05-01',
    end_date='2021-05-15',
)
schema_data = data_processer.apply_scheme1(
    retrace_rate=retrace_rate,
    profit_rate=profit_rate,
    profit_hour=profit_hour,
)
# Parallel alternative with the same arguments: data_processer.multi_apply_scheme1(...)
bar_frame = data_processer.barframe

# Visualise the training data.
bar_plot = BarPlot(schema_data, bar_frame)
sample_schema_data = bar_plot.grid_5K_line()

BeKnown

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
数据预处理

训练数据预处理：# -*- coding: utf-8 -*- / # @Time : 2021/5/15 10:25 下午 / # @Author : jiangming / # @FileName: data_preprocess.py / # @Software: PyCharm / # @Desc : 数据预处理，生成训练数据 / import datetime / from pandas import DataFrame / import pandas as pd / import numpy as np / from …
复制链接

扫一扫