Quantitative Trading, HFT Series - Cleaning DCE (Dalian Commodity Exchange) L2 High-Frequency Data (stable version)

import datetime
import os
import re
import shutil

from tqdm import tqdm

from constant import Exchange, HftDataPath, current_time

import numpy as np
import pandas as pd
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from base_dict import dateToContract, clear_off_trading


class HftDataOperator:
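    """Load one trading day's raw DCE L2 records, normalise them into the cleaned
    `__format_columns` layout, split them by contract symbol and dump one CSV per
    symbol. `__columns` describes the raw feed (5-level book with implied volumes);
    `__format_columns` is the output layout written to disk."""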

    __columns: list = [
            "ContractID",
            "UpdateTime",
            "LastPrice",
            "MatchTotQty",
            "LastOpenInterest",
            "OpenInterest",
            "InterestChg",
            "Turnover",
            "BidPrice1",
            "BidVolume1",
            "BidImplyVolume1",
            "BidPrice2",
            "BidVolume2",
            "BidImplyVolume2",
            "BidPrice3",
            "BidVolume3",
            "BidImplyVolume3",
            "BidPrice4",
            "BidVolume4",
            "BidImplyVolume4",
            "BidPrice5",
            "BidVolume5",
            "BidImplyVolume5",
            "AskPrice1",
            "AskVolume1",
            "AskImplyVolume1",
            "AskPrice2",
            "AskVolume2",
            "AskImplyVolume2",
            "AskPrice3",
            "AskVolume3",
            "AskImplyVolume3",
            "AskPrice4",
            "AskVolume4",
            "AskImplyVolume4",
            "AskPrice5",
            "AskVolume5",
            "AskImplyVolume5",
        ]
    __format_columns: list = [
        'Date',
        'Time',
        'Symbol',
        'LastPrice',
        'AccVolume',
        'OpenInterest',
        'Turnover',
        'HighLimit',
        'LowLimit',
        'BidPrice1',
        'BidVolume1',
        'BidPrice2',
        'BidVolume2',
        'BidPrice3',
        'BidVolume3',
        'BidPrice4',
        'BidVolume4',
        'BidPrice5',
        'BidVolume5',
        'AskPrice1',
        'AskVolume1',
        'AskPrice2',
        'AskVolume2',
        'AskPrice3',
        'AskVolume3',
        'AskPrice4',
        'AskVolume4',
        'AskPrice5',
        'AskVolume5',
        'LocalTime',
        'LocalNS',
        'TotalBuyQty',
        'TotalSellQty',
        'AvgBuyPrice',
        'AvgSellPrice',
        'timestamp'
    ]

    def __init__(self, date_str: str, exchange: Exchange, target_instruments=None, filter_option: bool = True):
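        """
        :param date_str: trading day, e.g. '2021_01_01'
        :param exchange: Exchange enum member used to locate the raw file
        :param target_instruments: instruments to keep (e.g. ['y00', 'pg00']); empty keeps all
        :param filter_option: if True, drop option records (ContractID containing '-')
        """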
        if target_instruments is None:
            target_instruments = []
        path = HftDataPath.data_path(date_str=date_str, exchange=exchange)
        # path = f'ht_dce_L2-{date_str}.log'
        # path = f'test_{path}'
        assert os.path.exists(path), f'{path} not exists.'

        self.__date_str: str = date_str
        self.__data: pd.DataFrame = pd.read_csv(path, low_memory=False, header=None)
        self.__data.columns = self.__columns
        self.__exchange = exchange
        self.__rets_map: dict = None
        self.__target_instruments: list = target_instruments

        if filter_option:
            self.__data = self.__data[~self.__data.ContractID.str.contains('-')]
            self.__data.reset_index(drop=True, inplace=True)

        self.__reset_format()

        self.__data.dropna(inplace=True)

    def split_by_symbol(self) -> "HftDataOperator":
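        """Split the cleaned frame into one DataFrame per contract symbol,
        restricted to the target instruments when given; returns self for chaining."""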
        assert self.__data is not None, f'__data is None'

        self.__rets_map: dict = {}
        symbols = list(set(self.__data.Symbol.tolist()))

        if self.__target_instruments:
            reset_symbols = []
            for target_instrument in self.__target_instruments:
                target_instru = re.match(r"^[a-zA-Z]{1,3}", target_instrument).group()
                for symbol in symbols:
                    ret_sym = re.match(r"^[a-zA-Z]{1,3}", symbol).group()
                    if ret_sym == target_instru:
                        reset_symbols.append(symbol)
            symbols = reset_symbols

        for symbol in symbols:
            sub_data = self.__data[self.__data['Symbol'] == symbol]
            sub_data.reset_index(drop=True, inplace=True)

            self.__rets_map[symbol] = sub_data

        return self

    def dump_to_csv(self):
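        """Write each symbol's DataFrame to
        <output_dir>/data<instrument>00/<yyyymmdd>/<symbol>_<yyyymmdd>.csv."""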
        assert self.__rets_map is not None, f'__rets_map is None'

        # make sure the output root directory exists
        target_dir = HftDataPath.output_data_dir()
        if os.path.exists(path=target_dir) is False:
            os.mkdir(target_dir)

        # make sure the per-instrument and per-date output directories exist
        symbol_dir_map: dict = {}
        date_str = self.__date_str.replace("_", "")
        for symbol in self.rets_map().keys():
            symbol_dir = f'{target_dir}/data{re.match(r"^[a-zA-Z]{1,3}", symbol).group()}00'
            if os.path.exists(path=symbol_dir) is False:
                os.mkdir(symbol_dir)

            symbol_date_dir = f'{symbol_dir}/{date_str}'
            if os.path.exists(path=symbol_date_dir) is False:
                os.mkdir(symbol_date_dir)

            symbol_dir_map[symbol] = symbol_date_dir

        # write to csv
        for symbol, symbol_df in self.rets_map().items():
            target_path = f'{symbol_dir_map[symbol]}/{symbol}_{date_str}.csv'
            symbol_df.to_csv(target_path, index=False)

    def data(self) -> pd.DataFrame:
        return self.__data

    def rets_map(self) -> dict:
        return self.__rets_map

    def __reset_format(self):
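        """Derive the output columns (Date, LocalTime/timestamp, LocalNS, AccVolume
        and placeholder fields) from the raw feed columns and reorder the frame
        to `__format_columns`."""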
        assert self.__data is not None, f'__data is None.'

        # Date
        self.__data['Date'] = self.__date_str.replace('_', '')

        # Time, Symbol
        self.__data.rename(columns={'UpdateTime': 'Time'}, inplace=True)
        self.__data.rename(columns={'ContractID': 'Symbol'}, inplace=True)

        # HighLimit, LowLimit
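        # the raw L2 columns carry no price-limit fields, so fixed placeholder bounds are written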
        self.__data['HighLimit'] = 1000000
        self.__data['LowLimit'] = 0

        # LocalTime, timestamp
        self.__data['LocalTime'] = self.__data.Time
        self.__data['LocalTime'] = self.__data.LocalTime.str.replace(':', '', regex=False)
        self.__data['LocalTime'] = self.__data.LocalTime.str.replace('.', '', regex=False)  # literal '.', not the regex wildcard
        self.__data['timestamp'] = self.__data.LocalTime

        # LocalNS
        self.__data["LocalNS"] = pd.to_datetime(self.__date_str.replace('_', '-') + ' ' + self.__data['Time']).apply(lambda x: x.value)

        # TotalBuyQty, TotalSellQty, AvgBuyPrice, AvgSellPrice
        self.__data['TotalBuyQty'] = 0
        self.__data['TotalSellQty'] = 0
        self.__data['AvgBuyPrice'] = 0
        self.__data['AvgSellPrice'] = 0

        # AccVolume
        self.__data['AccVolume'] = self.__data.MatchTotQty

        # to format
        self.__data = self.__data[self.__format_columns]


class HftDataManager:
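    """Batch driver: parse the raw daily files into per-symbol CSVs, rebuild the
    main-contract directory tree, and derive per-instrument main-contract date ranges."""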

    @classmethod
    def parser(cls, start_date_str: str, end_date_str: str, exchange: Exchange, target_instruments: list):
        """
        Clear source hft data
        :param target_instruments: instruments which need parser
        :param start_date_str: begin date of source file
        :param end_date_str: end date of source file
        :param exchange: enum of exchang
        """
        start_date, end_date = datetime.datetime.strptime(start_date_str, '%Y_%m_%d').date(), datetime.datetime.strptime(end_date_str, '%Y_%m_%d').date()

        while start_date <= end_date:

            format_date_str = str(start_date).replace("-", "_")
            if os.path.exists(path=HftDataPath.data_path(date_str=format_date_str, exchange=exchange)):
                HftDataOperator(
                    date_str=format_date_str,
                    exchange=exchange,
                    target_instruments=target_instruments
                ).split_by_symbol().dump_to_csv()

            print(f'date({start_date}) parsed.  local_time: {current_time()}')
            start_date += datetime.timedelta(days=1)

    @classmethod
    def refresh_main_contracts_dir(cls, target_instruments: list, start_date_str: str, end_date_str: str):
        """
        Refresh main contract's dir
        :param target_instruments: target contracts which need refresh
        :param start_date_str: begin date of source file
        :param end_date_str: end date of source file
        """

        source_dir = HftDataPath.output_data_dir()
        assert os.path.exists(path=source_dir), f'{source_dir} not exist.'

        target_dir = HftDataPath.output_main_data_dir()
        if os.path.exists(path=target_dir) is True:
            shutil.rmtree(target_dir)
        os.mkdir(target_dir)

        for contract in target_instruments:
            start_date, end_date = datetime.datetime.strptime(start_date_str, '%Y_%m_%d').date(), datetime.datetime.strptime(end_date_str, '%Y_%m_%d').date()

            source_sub_dir = f'{source_dir}/data{contract}'
            target_sub_dir = f'{target_dir}/data{contract}'
            if os.path.exists(path=target_sub_dir) is False:
                os.mkdir(target_sub_dir)

            while start_date <= end_date:

                format_date_str = str(start_date).replace('-', '')
                main_contract = dateToContract(instrument=contract, date=format_date_str)

                source_main_contract_dir = f'{source_sub_dir}/{format_date_str}'
                target_main_contract_dir = f'{target_sub_dir}/{format_date_str}'
                if os.path.exists(path=source_main_contract_dir):
                    os.mkdir(target_main_contract_dir)

                source_main_contract_path = f'{source_main_contract_dir}/{main_contract}_{format_date_str}.csv'
                target_main_contract_path = f'{target_main_contract_dir}/{main_contract}_{format_date_str}.csv'

                if os.path.exists(path=source_main_contract_path):
                    shutil.copy(src=source_main_contract_path, dst=target_main_contract_path)

                    # clear main contract data.
                    target_df = pd.read_csv(target_main_contract_path)

                    clear_off_trading(target_df, "timestamp")
                    target_df = target_df.drop_duplicates(subset='timestamp', keep='first', inplace=False)
                    target_df = target_df.reset_index(drop=True)

                    # cast turnover and all book price/volume columns to int64
                    int_columns = (
                        ['Turnover']
                        + [f'{side}Price{level}' for side in ('Bid', 'Ask') for level in range(1, 6)]
                        + [f'{side}Volume{level}' for side in ('Bid', 'Ask') for level in range(1, 6)]
                    )
                    target_df[int_columns] = target_df[int_columns].astype(np.int64)

                    target_df.to_csv(target_main_contract_path, index=False)

                start_date += datetime.timedelta(days=1)

    @classmethod
    def calculate_dateList_leadingList(cls, target_instruments: list):
        """
        Calculate date_list and leading_list with target_instruments
        :param target_instruments: instruments which need calculate date_list and leading_list
        """
        output_data_dir = HftDataPath.output_data_dir()
        assert os.path.exists(path=output_data_dir), f'{output_data_dir} not exist.'

        ret_df_map: dict = {}
        for contract in target_instruments:
            output_data_sub_dir = f'{output_data_dir}/data{contract}'
            assert os.path.exists(path=output_data_sub_dir), f'{output_data_sub_dir} not exist.'

            ret_contract_df = pd.DataFrame(columns=["trade_date", "main_contract"])
            for date_dir_str in tqdm(sorted(os.listdir(path=output_data_sub_dir))):
                date_dir = f'{output_data_sub_dir}/{date_dir_str}'
                assert os.path.exists(path=date_dir), f'{date_dir} not exist.'

                date_dir_sub_files = os.listdir(path=date_dir)
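                # the day's main contract is the file with the largest end-of-day cumulative volume (AccVolume)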
                max_total_volume, today_main_contract = 0, ''
                for file in sorted(date_dir_sub_files):
                    file_path = f'{date_dir}/{file}'
                    assert os.path.exists(path=file_path), f'{file_path} not exist.'

                    contract_df = pd.read_csv(file_path, low_memory=False)
                    total_volume = contract_df.iloc[len(contract_df) - 1]['AccVolume']
                    if total_volume > max_total_volume:
                        max_total_volume, today_main_contract = total_volume, file.split("_")[0]

                row = len(ret_contract_df)
                ret_contract_df.loc[row, 'trade_date'] = date_dir_str
                ret_contract_df.loc[row, 'main_contract'] = today_main_contract

            # ret_contract_df.to_csv(f'{contract}.csv', index=False)
            ret_df_map[contract] = ret_contract_df

        ret_contract_map = {}
        for contract, contract_df in ret_df_map.items():
            contract_df = contract_df.groupby(["main_contract"]).apply(
                lambda x: list(x['trade_date'])
            ).reset_index(
                name='trade_date'
            )
            contract_df_map = contract_df.set_index("main_contract", drop=True).T.to_dict('list')

            date_list: list = []
            leading_list: list = []
            for key, value in contract_df_map.items():
                value = value[0]
                date_list.append([value[0], value[-1]])
                leading_list.append(key)

            ret_contract_map[contract] = {
                'date_list': date_list,
                'leading_list': leading_list
            }

        return ret_contract_map

    @classmethod
    def _check_main_contract_files(cls, target_instruments: list, start_date_str: str, end_date_str: str):
        """
        Check main contract files
        :param target_instruments: target contracts which need check main contract
        :param start_date_str: begin date of source file
        :param end_date_str: end date of source file
        """

        output_data_dir = HftDataPath.output_data_dir()
        for contract in target_instruments:
            start_date, end_date = datetime.datetime.strptime(start_date_str, '%Y_%m_%d').date(), datetime.datetime.strptime(end_date_str, '%Y_%m_%d').date()

            output_data_sub_dir = f'{output_data_dir}/data{contract}'
            while start_date <= end_date:

                format_date_str = str(start_date).replace('-', '')
                source_main_contract_dir = f'{output_data_sub_dir}/{format_date_str}'
                if os.path.exists(path=source_main_contract_dir):
                    contracts_path_df: dict = {}
                    for file_name in os.listdir(source_main_contract_dir):
                        file_path = f'{source_main_contract_dir}/{file_name}'
                        target_contract_df = pd.read_csv(file_path, low_memory=False)

                        clear_off_trading(target_contract_df, "timestamp")
                        target_contract_df = target_contract_df.drop_duplicates(subset='timestamp', keep='first', inplace=False)
                        target_contract_df = target_contract_df.reset_index(drop=True)

                        contracts_path_df[file_path] = target_contract_df

                    main_contract_path, max_total_volume = '', 0
                    for contract_path, contract_df in contracts_path_df.items():
                        total_volume = contract_df.iloc[len(contract_df)-1]['AccVolume']
                        if total_volume > max_total_volume:
                            max_total_volume, main_contract_path = total_volume, contract_path

                    print("main_contract_path: " + str(main_contract_path))

                start_date += datetime.timedelta(days=1)


if __name__ == '__main__':
    _start_date_str = "2021_01_01"
    _end_date_str = "2022_11_01"
    _target_instruments = ["y00", "pg00"]

    HftDataManager.parser(
        start_date_str=_start_date_str,
        end_date_str=_end_date_str,
        target_instruments=_target_instruments,
        exchange=Exchange.DCE
    )

    # HftDataManager.refresh_main_contracts_dir(
    #     target_instruments=_target_instruments,
    #     start_date_str=_start_date_str,
    #     end_date_str=_end_date_str
    # )

    # HftDataManager.calculate_dateList_leadingList(target_instruments=_target_instruments)


def _test_check():
    # import pandas.testing
    # pandas.testing.assert_frame_equal(_ret_format_main_contract_df, _ret_output_main_data_df)
    pass
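
The listing imports two local helper modules, constant and base_dict, that the post does not include. The stand-in below is not the author's code: it is a minimal sketch inferred from the call sites above (the raw-file name follows the commented-out ht_dce_L2-{date_str}.log hint; the output directory names, the session filter and the main-contract mapping are assumptions), just enough to make the script importable for experimentation.

# constant.py / base_dict.py -- hypothetical stand-ins, NOT the originals
import datetime
from enum import Enum

import pandas as pd


class Exchange(Enum):
    DCE = "dce"  # Dalian Commodity Exchange, the only member this script uses


class HftDataPath:
    # directory layout is an assumption; point these at your own storage
    @staticmethod
    def data_path(date_str: str, exchange: Exchange) -> str:
        # raw daily L2 dump, e.g. 'ht_dce_L2-2021_01_04.log'
        return f"ht_{exchange.value}_L2-{date_str}.log"

    @staticmethod
    def output_data_dir() -> str:
        return "output"

    @staticmethod
    def output_main_data_dir() -> str:
        return "output_main"


def current_time() -> str:
    # wall-clock string used only for progress logging
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def dateToContract(instrument: str, date: str) -> str:
    # the real helper maps (instrument, yyyymmdd) to that day's main contract,
    # e.g. ('y00', '20210104') -> a symbol such as 'y2105'; supply your own rollover table
    raise NotImplementedError("provide a date -> main-contract mapping")


def clear_off_trading(df: pd.DataFrame, time_col: str) -> None:
    # expected to drop rows whose time falls outside the trading sessions, in place;
    # left as a no-op placeholder here -- plug in your own session filter
    pass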
