import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import time
import warnings
warnings.filterwarnings("ignore")
from tqz_data_parser.tqz_footPrint_data import TQZTimestampTool
class TQZBarFilter:
    """Restrict bar data to the time span covered by a footprint data set."""

    @classmethod
    def get_filter_bar_data(cls, bar_data: pd.DataFrame, footPrint_data: pd.DataFrame) -> pd.DataFrame:
        """
        keep only the bars whose 'datetime' lies inside the footprint time span.
        :param bar_data: bar data with a 'datetime' column.
        :param footPrint_data: footprint data with a 'timestamp' column.
        :return: new DataFrame with the bars where
                 min(timestamp) <= datetime <= max(timestamp) (both ends inclusive),
                 re-indexed 0..n-1; bar_data itself is left untouched.
        """
        first_timestamp = footPrint_data['timestamp'].min()
        last_timestamp = footPrint_data['timestamp'].max()

        inside_span = (first_timestamp <= bar_data['datetime']) & (bar_data['datetime'] <= last_timestamp)

        # drop=True discards the old index directly.  The previous
        # "reset_index() then del ['index']" approach raised KeyError when the
        # index was named, and removed a genuine column called 'index'.
        return bar_data[inside_span].reset_index(drop=True)
class TQZHFTFeatures:
    """Builders that attach footprint (trade) and order-book features to bar data."""

    # Per-bar columns written by trades_features_marker, in creation order.
    _PER_BAR_TRADE_COLUMNS = (
        'poc', 'big_trades_ratio', 'poc_weight', 'pus_pls',
        'pwus_pwls', 'sas_sbs', 'ps_ss_ratio', 'sbar_trades_ratio',
    )

    # Zero-padded suffixes of the ten order-book levels ('01' .. '10').
    _ORDERBOOK_LEVELS = tuple(f'{level:02d}' for level in range(1, 11))

    @staticmethod
    def _left_sum(terms):
        """Add the terms strictly left to right (same order as a hand-written a + b + c ...)."""
        return sum(terms[1:], terms[0])

    @staticmethod
    def _bar_trade_features(bar_trades, bar_open, bar_close, price_digital_points):
        """
        compute the footprint features of one bar.
        :param bar_trades: footprint rows of a single timestamp, with columns
                           'price', 'ask_size', 'bid_size' and 'ab_size'.
        :param bar_open: open price of the matching bar (NaN when there is none).
        :param bar_close: close price of the matching bar (NaN when there is none).
        :param price_digital_points: price digital point counts.
        :return: dict {column name: value}, ordered like _PER_BAR_TRADE_COLUMNS.
        """
        prices = bar_trades['price']
        sizes = bar_trades['ab_size']
        total_size = sizes.sum()

        # poc: price level(s) carrying the largest ask+bid size (mean price on ties).
        at_poc = sizes == sizes.max()
        poc = prices[at_poc].mean()
        poc_size = sizes[at_poc].mean()

        # "big" levels: size above mean + 3 * std of the bar's levels.
        is_big = sizes > (sizes.mean() + 3 * sizes.std())
        big_size = sizes[is_big].sum()

        # bar body: price levels between open and close, both inclusive.
        body_low, body_high = min(bar_open, bar_close), max(bar_open, bar_close)
        in_body = (body_low <= prices) & (prices <= body_high)

        # A bar without big levels (or without volume) yields 0 / 0; that must
        # become NaN silently rather than emit a numpy RuntimeWarning.
        with np.errstate(divide='ignore', invalid='ignore'):
            # size-weighted mean price of the big levels.
            poc_weight = round((prices[is_big] * sizes[is_big]).sum() / big_size, price_digital_points)
            return {
                'poc': poc,
                'big_trades_ratio': big_size / total_size,
                'poc_weight': poc_weight,
                # size above poc minus size below poc
                'pus_pls': sizes[prices > poc].sum() - sizes[prices < poc].sum(),
                # size above poc_weight minus size below poc_weight
                'pwus_pwls': sizes[prices > poc_weight].sum() - sizes[prices < poc_weight].sum(),
                # sum(ask_size) - sum(bid_size)
                'sas_sbs': bar_trades['ask_size'].sum() - bar_trades['bid_size'].sum(),
                # poc_size / sum(size)
                'ps_ss_ratio': poc_size / total_size,
                # size traded inside the bar body / sum(size)
                'sbar_trades_ratio': sizes[in_body].sum() / total_size,
            }

    @classmethod
    def trades_features_marker(cls, bar_data, footPrint_data, price_digital_points: int = 1):
        """
        add trades features.
        :param price_digital_points: price digital point counts
        :param bar_data: whatever day; needs 'datetime', 'open', 'high', 'low', 'close'.
                         It is modified in place and also returned.
        :param footPrint_data: single day; needs 'timestamp', 'price', 'ask_size', 'bid_size'.
                               It is NOT modified (ab_size is computed on a copy).
        :return: bar_data with trades features. Bars without footprint rows keep NaN
                 in the per-bar feature columns; an empty footPrint_data therefore
                 yields all-NaN feature columns instead of a KeyError.
        """
        # bar_data_filter = TQZBarFilter.get_filter_bar_data(bar_data=bar_data, footPrint_data=footPrint_data)
        bar_data_filter = bar_data

        # Work on a copy so the caller's footprint frame does not grow an 'ab_size' column.
        trades = footPrint_data.assign(ab_size=footPrint_data['ask_size'] + footPrint_data['bid_size'])

        # One pass over the footprint, grouped by (sorted) timestamp, instead of
        # re-scanning the whole frame for every timestamp.
        for timestamp, bar_trades in trades.groupby('timestamp'):
            bar_rows = bar_data_filter['datetime'] == timestamp
            features = cls._bar_trade_features(
                bar_trades,
                bar_open=bar_data_filter.loc[bar_rows, 'open'].mean(),
                bar_close=bar_data_filter.loc[bar_rows, 'close'].mean(),
                price_digital_points=price_digital_points,
            )
            for column, value in features.items():
                bar_data_filter.loc[bar_rows, column] = value

        # With no usable footprint rows the loop never creates the columns the
        # derived features below read, so make sure they exist.
        for column in cls._PER_BAR_TRADE_COLUMNS:
            if column not in bar_data_filter.columns:
                bar_data_filter[column] = np.nan

        bar_open, bar_high = bar_data_filter['open'], bar_data_filter['high']
        bar_low, bar_close = bar_data_filter['low'], bar_data_filter['close']
        poc, poc_weight = bar_data_filter['poc'], bar_data_filter['poc_weight']

        # hp / pl ratio: f(x) = (high - poc) / (poc - low); f(x)↗ long_market↗
        bar_data_filter['hp_pl_ratio'] = (bar_high - poc) / (poc - bar_low)
        # hpw / pwl ratio: f(x) = (high - poc_weight) / (poc_weight - low); f(x)↗ long_market↗
        bar_data_filter['hpw_pwl_ratio'] = (bar_high - poc_weight) / (poc_weight - bar_low)
        # cp: f(x) = close - poc; f(x)↗ long_market↗
        bar_data_filter['cp'] = bar_close - poc
        # cpw: f(x) = close - poc_weight; f(x)↗ long_market↗
        bar_data_filter['cpw'] = bar_close - poc_weight
        # co / hl ratio: f(x) = (close - open) / (high - low); |f(x)|↗ trend_degree↗
        bar_data_filter['co_hl_ratio'] = (bar_close - bar_open) / (bar_high - bar_low)
        # op+cp: f(x) = (open - poc) + (close - poc); f(x)↗ long_market↗
        bar_data_filter['op+cp'] = (bar_open - poc) + (bar_close - poc)
        # opw+cpw: f(x) = (open - poc_weight) + (close - poc_weight); f(x)↗ long_market↗
        bar_data_filter['opw+cpw'] = (bar_open - poc_weight) + (bar_close - poc_weight)

        return bar_data_filter

    @classmethod
    def orderbook_marker(cls, source_orderbook_data, price_digital_points: int = 1, size_digital_points: int = 3):
        """
        add base orderbook features.
        :param source_orderbook_data: source orderbook data with columns
                                      Ask_Price_01..10, Bid_Price_01..10,
                                      Ask_Volume_01..10, Bid_Volume_01..10.
                                      It is modified in place and also returned.
        :param price_digital_points: price digital point counts.
        :param size_digital_points: size digital point counts.
        :return: source_orderbook_data with the 'o_*' feature columns added.
        """
        book = source_orderbook_data
        levels = cls._ORDERBOOK_LEVELS
        ask_prices = [book[f'Ask_Price_{level}'] for level in levels]
        bid_prices = [book[f'Bid_Price_{level}'] for level in levels]
        ask_volumes = [book[f'Ask_Volume_{level}'] for level in levels]
        bid_volumes = [book[f'Bid_Volume_{level}'] for level in levels]

        # price, size: 1 gear
        book['o_s_1gear_price'] = round((ask_prices[0] + bid_prices[0]) * 0.5, price_digital_points)
        book['o_1gear_size'] = round(ask_volumes[0] + bid_volumes[0], size_digital_points)

        # price, size: 10 gears (simple mean of the 20 quoted prices)
        book['o_s_10gears_price'] = round(cls._left_sum(ask_prices + bid_prices) * 0.05, price_digital_points)
        book['o_10gears_ask_size'] = round(cls._left_sum(ask_volumes), size_digital_points)
        book['o_10gears_bid_size'] = round(cls._left_sum(bid_volumes), size_digital_points)
        book['o_10gears_size'] = round(book['o_10gears_ask_size'] + book['o_10gears_bid_size'], size_digital_points)

        # weight price: 1 gear (each side's price is weighted by the opposite side's volume)
        book['o_w_1gear_price'] = round(
            (ask_prices[0] * bid_volumes[0] + bid_prices[0] * ask_volumes[0]) / book['o_1gear_size'],
            price_digital_points,
        )

        # weight price: 10 gears (same cross weighting, summed level by level)
        cross_weighted_terms = []
        for ask_price, bid_price, ask_volume, bid_volume in zip(ask_prices, bid_prices, ask_volumes, bid_volumes):
            cross_weighted_terms.append(ask_price * bid_volume)
            cross_weighted_terms.append(bid_price * ask_volume)
        book['o_w_10gears_price'] = round(
            cls._left_sum(cross_weighted_terms) / book['o_10gears_size'],
            price_digital_points,
        )

        # max ask_price_diff, bid_price_diff
        book['o_max_ap_diff'] = ask_prices[-1] - ask_prices[0]
        book['o_max_bp_diff'] = bid_prices[0] - bid_prices[-1]

        return book

    @classmethod
    def orderbook_features_marker(cls, bar_data_filter, orderbook_data, price_digital_points: int = 1, size_digital_points: int = 3, bar_interval: int = 15, start_date_str: str = '2022-07-01', end_date_str: str = '2022-07-02'):
        """
        add orderbook features to bar_data.
        :param bar_interval: bar interval(type is int); multiplied by 60000 to get the
                             window width, so presumably minutes on a millisecond
                             timestamp axis - TODO confirm against TQZTimestampTool.
        :param price_digital_points: price digital point counts.
        :param size_digital_points: size digital point counts.
        :param bar_data_filter: single day; modified in place and also returned.
        :param orderbook_data: source orderbook data with base features
                               (needs 'Datatime', 'o_1gear_size', 'o_10gears_size',
                               'o_max_ap_diff', 'o_max_bp_diff').
        :param start_date_str: first day to process, passed to TQZTimestampTool.get_timestamp
                               (default keeps the previously hard-coded '2022-07-01').
        :param end_date_str: end of the range, passed to TQZTimestampTool.get_timestamp
                             (default keeps the previously hard-coded '2022-07-02').
        :return: bar_data_filter with orderbook features.
        :raises ValueError: if bar_interval is not positive (the window would never advance).
        """
        if bar_interval <= 0:
            raise ValueError(f'bar_interval must be positive, got {bar_interval}')

        window_start = TQZTimestampTool.get_timestamp(date_str=start_date_str)
        end_timestamp = TQZTimestampTool.get_timestamp(date_str=end_date_str)
        timestamps_interval = bar_interval * 60000
        snapshot_times = orderbook_data['Datatime']

        while True:
            # snapshots inside the half-open window [window_start, window_start + interval)
            in_window = (window_start <= snapshot_times) & (snapshot_times < window_start + timestamps_interval)
            window_snapshots = orderbook_data[in_window]
            bar_rows = bar_data_filter['datetime'] == window_start

            # orderbook counts
            bar_data_filter.loc[bar_rows, 'o_counts'] = len(window_snapshots)

            # o_10gears_size_mean | o_1gear_size_mean
            bar_data_filter.loc[bar_rows, 'o_1gear_size_mean'] = round(window_snapshots['o_1gear_size'].mean(), size_digital_points)
            bar_data_filter.loc[bar_rows, 'o_10gears_size_mean'] = round(window_snapshots['o_10gears_size'].mean(), size_digital_points)

            # o_max_ap_diff_mean | o_max_bp_diff_mean
            bar_data_filter.loc[bar_rows, 'o_max_ap_diff_mean'] = round(window_snapshots['o_max_ap_diff'].mean(), price_digital_points)
            bar_data_filter.loc[bar_rows, 'o_max_bp_diff_mean'] = round(window_snapshots['o_max_bp_diff'].mean(), price_digital_points)

            # NOTE(review): the exit check runs after the window is processed, so the
            # window starting at (or first past) end_timestamp is still written.
            # Kept as-is for backward compatibility - confirm whether the end
            # should be exclusive.
            if window_start >= end_timestamp:
                break
            window_start = window_start + timestamps_interval

        return bar_data_filter
class TQZHFTFeaturesEngineering:
    """Day-by-day driver that adds footprint trade features to bar data."""

    @classmethod
    def refresh_bar_data(cls, bar_data_df: pd.DataFrame, start_date_str: str, end_date_str: str, footPrint_data_fold: str, footPrint_file_template: str = 'BTCUSDT_15mBar_footPrint_{date}.csv') -> pd.DataFrame:
        """
        add trades features to bar_data_df for every day of a date range.
        :param bar_data_df: bar data handed to TQZHFTFeatures.trades_features_marker.
        :param start_date_str: first day, format 'YYYY-MM-DD' (inclusive).
        :param end_date_str: last day, format 'YYYY-MM-DD' (inclusive).
        :param footPrint_data_fold: folder holding one footprint csv per day.
        :param footPrint_file_template: file name inside the folder; '{date}' is replaced
                                        by the day as 'YYYY-MM-DD'. The default keeps the
                                        previously hard-coded BTCUSDT 15m file name.
        :return: bar data without the rows that contain any NaN; when start is after
                 end no file is read and only that NaN filtering is applied.
        :raises ValueError: if a date string does not match 'YYYY-MM-DD'.
        """
        current_date = datetime.datetime.strptime(start_date_str, '%Y-%m-%d').date()
        last_date = datetime.datetime.strptime(end_date_str, '%Y-%m-%d').date()
        one_day = datetime.timedelta(days=1)

        while current_date <= last_date:
            footprint_path = f'{footPrint_data_fold}/{footPrint_file_template.format(date=current_date)}'
            # A missing daily file is not skipped: the error from read_csv propagates.
            bar_data_df = TQZHFTFeatures.trades_features_marker(
                bar_data=bar_data_df,
                footPrint_data=pd.read_csv(footprint_path)
            )
            current_date += one_day

        # Bars that received no footprint features carry NaN and are dropped here.
        return bar_data_df.dropna(inplace=False, axis=0)
from public_module.tqz_extern.tools.pandas_operator.pandas_operator import pandas
if __name__ == '__main__':
    # Load the 15-minute BTCUSDT bars; the first csv column becomes the index.
    _btc_bar_15m_data = pd.read_csv('../source_data/BTCUSDT_15m_end.csv', index_col=0)

    # The raw Binance data is faulty, so the 'datetime' column is overwritten here
    # with a value rebuilt from the index strings (int64 view divided by 1,000,000).
    _bar_times = pd.to_datetime(_btc_bar_15m_data.index.values, format='%Y/%m/%d %H:%M')
    _btc_bar_15m_data['datetime'] = _bar_times.astype('int64') / 1000000

    # Disabled order-book example, parked in a bare string so it never executes.
    # NOTE(review): it refers to TQZFeatureEngineering and _bar_data_filter, neither
    # of which is defined in the visible part of this file.
    """
    _btc_orderbook_data = pd.read_csv('../source_data/Order_Book_BTCUSDT_2022-07-01.csv')
    _bar_data_filter = TQZFeatureEngineering.orderbook_features_marker(
    bar_data_filter=_bar_data_filter,
    orderbook_data=TQZFeatureEngineering.orderbook_marker(source_orderbook_data=_btc_orderbook_data)
    )
    """

    # Add the footprint trade features for 2022-07-20 .. 2022-07-22.
    _bar_data = TQZHFTFeaturesEngineering.refresh_bar_data(
        bar_data_df=_btc_bar_15m_data,
        start_date_str='2022-07-20',
        end_date_str='2022-07-22',
        footPrint_data_fold=f'F:/Tick_Binance/BTCUSDT/data/trades/footPrint'
    )
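# Quantitative trading, cryptocurrency series - high-frequency data feature engineering (down-sampling HFT data to a lower frequency)
# Latest recommended article published on 2023-02-24 14:55:02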