import datetime
import os
import re
import shutil
from constant import Exchange
import numpy as np
import pandas as pd
pd.set_option('mode.chained_assignment', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
from base_dict import dateToContract, clear_off_trading
class HftDataPath:
    """Hard-coded filesystem locations used by the HFT data pipeline.

    All methods are classmethods returning plain ``str`` paths with forward
    slashes; nothing is created or checked on disk here.
    """

    @classmethod
    def data_path(cls, date_str: str, exchange: Exchange) -> str:
        """Return the raw L2 log path for one day on one exchange.

        :param date_str: day identifier, inserted verbatim into the file name
        :param exchange: exchange enum; only ``Exchange.DCE`` is mapped
        :return: the log path, or ``''`` for every other exchange
        """
        if exchange == Exchange.DCE:
            return f'{cls.source_data_dir()}/ht_dce_L2-{date_str}.log'
        # SHFE / CZCE / CFFEX (and anything else) have no source layout yet.
        # The empty string makes the callers' os.path.exists() checks fail.
        return ''

    @classmethod
    def source_data_dir(cls) -> str:
        """Return the directory that holds the raw exchange log files."""
        return "D:/ret_datas/source_datas"

    @classmethod
    def output_data_dir(cls) -> str:
        """Return the root directory for per-contract CSV output."""
        return 'D:/ret_datas/output_data'

    @classmethod
    def output_main_data_dir(cls) -> str:
        """Return the root directory for cleaned main-contract CSV output."""
        return 'D:/ret_datas/output_main_data'
class HftDataOperator:
    """Load one day of raw L2 ticks, normalise the columns and export per contract.

    Typical use is the fluent chain
    ``HftDataOperator(...).split_by_symbol().dump_to_csv()``.
    """

    # Positional column names of the raw, header-less log file.
    __columns: list = [
        "ContractID",
        "UpdateTime",
        "LastPrice",
        "MatchTotQty",
        "LastOpenInterest",
        "OpenInterest",
        "InterestChg",
        "Turnover",
        "BidPrice1",
        "BidVolume1",
        "BidImplyVolume1",
        "BidPrice2",
        "BidVolume2",
        "BidImplyVolume2",
        "BidPrice3",
        "BidVolume3",
        "BidImplyVolume3",
        "BidPrice4",
        "BidVolume4",
        "BidImplyVolume4",
        "BidPrice5",
        "BidVolume5",
        "BidImplyVolume5",
        "AskPrice1",
        "AskVolume1",
        "AskImplyVolume1",
        "AskPrice2",
        "AskVolume2",
        "AskImplyVolume2",
        "AskPrice3",
        "AskVolume3",
        "AskImplyVolume3",
        "AskPrice4",
        "AskVolume4",
        "AskImplyVolume4",
        "AskPrice5",
        "AskVolume5",
        "AskImplyVolume5",
    ]
    # Columns (and their order) of the normalised frame; anything else is dropped.
    __format_columns: list = [
        'Date',
        'Time',
        'Symbol',
        'LastPrice',
        'AccVolume',
        'OpenInterest',
        'Turnover',
        'HighLimit',
        'LowLimit',
        'BidPrice1',
        'BidVolume1',
        'BidPrice2',
        'BidVolume2',
        'BidPrice3',
        'BidVolume3',
        'BidPrice4',
        'BidVolume4',
        'BidPrice5',
        'BidVolume5',
        'AskPrice1',
        'AskVolume1',
        'AskPrice2',
        'AskVolume2',
        'AskPrice3',
        'AskVolume3',
        'AskPrice4',
        'AskVolume4',
        'AskPrice5',
        'AskVolume5',
        'LocalTime',
        'LocalNS',
        'TotalBuyQty',
        'TotalSellQty',
        'AvgBuyPrice',
        'AvgSellPrice',
        'timestamp',
    ]
    # Leading one to three letters of a contract / instrument code ("eb" in "eb00").
    __product_pattern = re.compile(r"^[a-zA-Z]{1,3}")

    def __init__(self, date_str: str, exchange: Exchange, target_instruments=None, filter_option: bool = True):
        """Read and normalise the raw log for ``date_str``.

        :param date_str: day in ``YYYY_MM_DD`` form (underscores are stripped or
            replaced by dashes further down)
        :param exchange: exchange enum forwarded to ``HftDataPath.data_path``
        :param target_instruments: instrument codes to keep in
            ``split_by_symbol``; ``None`` or empty keeps every contract
        :param filter_option: when true, drop rows whose ContractID contains
            ``'-'`` (presumably combination/spread contracts -- confirm)
        :raises AssertionError: if the source file does not exist
        """
        if target_instruments is None:
            target_instruments = []
        path = HftDataPath.data_path(date_str=date_str, exchange=exchange)
        assert os.path.exists(path), f'{path} not exists.'
        self.__date_str: str = date_str
        self.__data: pd.DataFrame = pd.read_csv(path, low_memory=False, header=None)
        self.__data.columns = self.__columns
        self.__exchange = exchange
        # Filled by split_by_symbol(): {symbol: DataFrame}.
        self.__rets_map = None
        self.__target_instruments: list = target_instruments
        if filter_option:
            keep = ~self.__data.ContractID.str.contains('-')
            self.__data = self.__data[keep].reset_index(drop=True)
        self.__reset_format()
        self.__data = self.__data.dropna()

    @classmethod
    def __product_code(cls, code: str) -> str:
        """Return the leading letters of ``code``, or ``''`` if it has none."""
        found = cls.__product_pattern.match(code)
        return found.group() if found else ''

    def split_by_symbol(self) -> 'HftDataOperator':
        """Partition the normalised frame into one frame per contract symbol.

        When target instruments were given, only symbols whose leading letters
        match one of them are kept; with no targets every symbol is kept.
        Results are available through ``rets_map()``.

        :return: ``self``, so the call can be chained
        """
        assert self.__data is not None, '__data is None'
        # unique() keeps first-appearance order, which makes the output deterministic.
        symbols = self.__data['Symbol'].unique().tolist()
        if self.__target_instruments:
            wanted = {self.__product_code(target) for target in self.__target_instruments}
            wanted.discard('')
            symbols = [symbol for symbol in symbols if self.__product_code(symbol) in wanted]
        self.__rets_map = {
            symbol: self.__data[self.__data['Symbol'] == symbol].reset_index(drop=True)
            for symbol in symbols
        }
        return self

    def dump_to_csv(self):
        """Write every per-symbol frame to
        ``<output>/data<product>00/<YYYYMMDD>/<symbol>_<YYYYMMDD>.csv``.

        :raises AssertionError: if ``split_by_symbol`` has not been called
        """
        assert self.__rets_map is not None, '__rets_map is None'
        target_dir = HftDataPath.output_data_dir()
        # The output root is created even when there is nothing to write.
        os.makedirs(target_dir, exist_ok=True)
        date_str = self.__date_str.replace("_", "")
        for symbol, symbol_df in self.__rets_map.items():
            symbol_date_dir = f'{target_dir}/data{self.__product_code(symbol)}00/{date_str}'
            os.makedirs(symbol_date_dir, exist_ok=True)
            symbol_df.to_csv(f'{symbol_date_dir}/{symbol}_{date_str}.csv', index=False)

    def data(self) -> pd.DataFrame:
        """Return the normalised frame holding all contracts."""
        return self.__data

    def rets_map(self) -> dict:
        """Return ``{symbol: DataFrame}``, or ``None`` before ``split_by_symbol``."""
        return self.__rets_map

    def __reset_format(self):
        """Convert the raw column layout into the ``__format_columns`` layout."""
        assert self.__data is not None, '__data is None.'
        # rename() returns a new frame, so the assignments below never write
        # into a slice of the raw data.
        frame = self.__data.rename(columns={'UpdateTime': 'Time', 'ContractID': 'Symbol'})
        frame['Date'] = self.__date_str.replace('_', '')
        # The raw columns carry no price limits; constant placeholders are used.
        frame['HighLimit'] = 1000000
        frame['LowLimit'] = 0
        # LocalTime / timestamp: the Time string with ':' and '.' removed.
        # regex=False is essential: as a regex, '.' matches every character
        # and would erase the whole string.
        compact_time = frame['Time'].str.replace(':', '', regex=False)
        compact_time = compact_time.str.replace('.', '', regex=False)
        frame['LocalTime'] = compact_time
        frame['timestamp'] = compact_time
        # LocalNS: "<date> <time>" parsed as a timestamp, in nanoseconds since epoch.
        stamps = pd.to_datetime(self.__date_str.replace('_', '-') + ' ' + frame['Time'])
        frame['LocalNS'] = stamps.apply(lambda stamp: stamp.value)
        # Not derivable from the raw columns; zero-filled.
        frame['TotalBuyQty'] = 0
        frame['TotalSellQty'] = 0
        frame['AvgBuyPrice'] = 0
        frame['AvgSellPrice'] = 0
        frame['AccVolume'] = frame['MatchTotQty']
        self.__data = frame[self.__format_columns]
class HftDataManager:
    """Batch drivers that run the per-day cleaning steps over a date range."""

    # Columns stored as int64 in the cleaned main-contract files:
    # Turnover plus the five bid/ask price and volume levels.
    _INT64_COLUMNS = ['Turnover'] + [
        f'{side}{field}{level}'
        for field in ('Price', 'Volume')
        for side in ('Bid', 'Ask')
        for level in range(1, 6)
    ]

    @staticmethod
    def _parse_date(date_str: str) -> datetime.date:
        """Parse a ``YYYY_MM_DD`` string; raises ``ValueError`` if malformed."""
        return datetime.datetime.strptime(date_str, '%Y_%m_%d').date()

    @staticmethod
    def _iter_dates(start_date: datetime.date, end_date: datetime.date):
        """Yield every calendar day from ``start_date`` to ``end_date`` inclusive."""
        current = start_date
        while current <= end_date:
            yield current
            current += datetime.timedelta(days=1)

    @staticmethod
    def _load_clean_ticks(csv_path: str) -> pd.DataFrame:
        """Read one contract CSV and keep the first row per ``timestamp``."""
        frame = pd.read_csv(csv_path, low_memory=False)
        # The return value is ignored, as in the original call sites; presumably
        # clear_off_trading filters the frame in place -- TODO confirm.
        clear_off_trading(frame, "timestamp")
        frame = frame.drop_duplicates(subset='timestamp', keep='first')
        return frame.reset_index(drop=True)

    @classmethod
    def parser(cls, start_date_str: str, end_date_str: str, exchange: Exchange, target_instruments: list):
        """
        Clear source hft data: split each day's raw log into per-contract CSVs.
        Days without a source file are skipped silently.
        :param target_instruments: instruments which need parser
        :param start_date_str: begin date of source file (``YYYY_MM_DD``, inclusive)
        :param end_date_str: end date of source file (``YYYY_MM_DD``, inclusive)
        :param exchange: enum of exchange
        :raises ValueError: if either date string is malformed
        """
        start_date = cls._parse_date(start_date_str)
        end_date = cls._parse_date(end_date_str)
        for day in cls._iter_dates(start_date, end_date):
            format_date_str = day.strftime('%Y_%m_%d')
            source_path = HftDataPath.data_path(date_str=format_date_str, exchange=exchange)
            if not os.path.exists(source_path):
                continue
            HftDataOperator(
                date_str=format_date_str,
                exchange=exchange,
                target_instruments=target_instruments
            ).split_by_symbol().dump_to_csv()
            # NOTE(review): progress is reported only for days that were parsed.
            print(f'date({day}) parser over.' + " local_time: " + str(current_time()))

    @classmethod
    def refresh_main_contracts_dir(cls, target_instruments: list, start_date_str: str, end_date_str: str):
        """
        Refresh main contract's dir: wipe the main-contract output tree and
        rebuild it from the per-contract output, one cleaned CSV per day.
        :param target_instruments: target contracts which need refresh
        :param start_date_str: begin date of source file (``YYYY_MM_DD``, inclusive)
        :param end_date_str: end date of source file (``YYYY_MM_DD``, inclusive)
        :raises AssertionError: if the per-contract output directory is missing
        :raises ValueError: if either date string is malformed
        """
        source_dir = HftDataPath.output_data_dir()
        assert os.path.exists(path=source_dir), f'{source_dir} not exist.'
        # Validate the dates before anything is deleted.
        start_date = cls._parse_date(start_date_str)
        end_date = cls._parse_date(end_date_str)
        target_dir = HftDataPath.output_main_data_dir()
        if os.path.exists(path=target_dir):
            shutil.rmtree(target_dir)
        os.makedirs(target_dir)
        for contract in target_instruments:
            source_sub_dir = f'{source_dir}/data{contract}'
            target_sub_dir = f'{target_dir}/data{contract}'
            os.makedirs(target_sub_dir, exist_ok=True)
            for day in cls._iter_dates(start_date, end_date):
                format_date_str = day.strftime('%Y%m%d')
                source_day_dir = f'{source_sub_dir}/{format_date_str}'
                if not os.path.exists(path=source_day_dir):
                    continue
                target_day_dir = f'{target_sub_dir}/{format_date_str}'
                os.makedirs(target_day_dir, exist_ok=True)
                main_contract = dateToContract(instrument=contract, date=format_date_str)
                file_name = f'{main_contract}_{format_date_str}.csv'
                source_path = f'{source_day_dir}/{file_name}'
                if not os.path.exists(path=source_path):
                    continue
                # clear main contract data.
                target_df = cls._load_clean_ticks(source_path)
                for column in cls._INT64_COLUMNS:
                    target_df[column] = target_df[column].to_numpy().astype(np.int64)
                target_df.to_csv(f'{target_day_dir}/{file_name}', index=False)

    @classmethod
    def _check_main_contract_files(cls, target_instruments: list, start_date_str: str, end_date_str: str):
        """
        Check main contract files: for each instrument and day, print the path
        of the contract file whose last ``AccVolume`` value is the largest.
        :param target_instruments: target contracts which need check main contract
        :param start_date_str: begin date of source file (``YYYY_MM_DD``, inclusive)
        :param end_date_str: end date of source file (``YYYY_MM_DD``, inclusive)
        :return: None; results are only printed
        :raises ValueError: if either date string is malformed
        """
        source_dir = HftDataPath.output_data_dir()
        start_date = cls._parse_date(start_date_str)
        end_date = cls._parse_date(end_date_str)
        for contract in target_instruments:
            source_sub_dir = f'{source_dir}/data{contract}'
            for day in cls._iter_dates(start_date, end_date):
                format_date_str = day.strftime('%Y%m%d')
                source_day_dir = f'{source_sub_dir}/{format_date_str}'
                if not os.path.exists(path=source_day_dir):
                    continue
                main_contract_path, max_total_volume = '', 0
                for file_name in os.listdir(source_day_dir):
                    file_path = f'{source_day_dir}/{file_name}'
                    contract_df = cls._load_clean_ticks(file_path)
                    if contract_df.empty:
                        # Nothing left after cleaning: cannot be the main contract.
                        continue
                    total_volume = contract_df['AccVolume'].iloc[-1]
                    if total_volume > max_total_volume:
                        max_total_volume, main_contract_path = total_volume, file_path
                print("main_contract_path: " + str(main_contract_path))
import time
def current_time():
    """Return the current local time as a ``'YYYY:MM:DD HH:MM:SS'`` string."""
    whole_seconds = int(time.time())
    local_parts = time.localtime(whole_seconds)
    return time.strftime('%Y:%m:%d %H:%M:%S', local_parts)
if __name__ == '__main__':
    # Inclusive date window in 'YYYY_MM_DD' form.
    _start_date_str, _end_date_str = "2021_06_01", "2021_12_30"

    # Step 1: split the raw DCE logs into per-contract CSV files.
    HftDataManager.parser(
        exchange=Exchange.DCE,
        target_instruments=["eb00"],
        start_date_str=_start_date_str,
        end_date_str=_end_date_str,
    )

    # Step 2 (disabled): rebuild the cleaned main-contract tree.
    # HftDataManager.refresh_main_contracts_dir(
    #     target_instruments=['v00'],
    #     start_date_str=_start_date_str,
    #     end_date_str=_end_date_str
    # )
def _test_check():
    """Placeholder for a frame-equality regression check; currently a no-op."""
    # Sketch of the intended check (not enabled):
    #   import pandas.testing
    #   pandas.testing.assert_frame_equal(_ret_format_main_contract_df, _ret_output_main_data_df)
    return None
量化交易之HFT篇 - 大商所L2高频数据清洗&筛选主力合约&自定义筛选当日主力合约
最新推荐文章于 2024-10-30 13:16:11 发布
本文介绍了一个用于处理高频交易数据的Python程序,该程序能够读取特定格式的日志文件,并将其转换为便于分析的数据格式。此外,还实现了数据按品种拆分、导出至CSV文件的功能,并提供了针对主合约数据的清理和更新流程。
摘要由CSDN通过智能技术生成