量化交易之HFT篇 - 大商所L2高频数据清洗&筛选主力合约

# base_dict.py

import sys

def price_lis(mktdata, key, horizon=10):
    """Return the forward change of column ``key``, computed separately per ``mktdate``.

    For every row the result is ``value[i + horizon] - value[i]`` inside the
    row's own ``mktdate`` group. The last ``horizon`` rows of each group have
    no row that far ahead, so they fall back to ``last_value_of_group - value[i]``.

    :param mktdata: DataFrame holding a ``mktdate`` column and the ``key`` column.
    :param key: name of the column to difference.
    :param horizon: number of rows to look ahead (default 10, the original
        hard-coded value).
    :return: 1-D numpy array, one entry per input row, groups kept in order of
        first appearance (``sort=False``).
    :raises ValueError: if ``horizon`` is smaller than 1.
    """
    if horizon < 1:
        raise ValueError(f'horizon must be >= 1, got {horizon}')

    def _forward_change(x):
        # x.diff(-horizon) is x[i] - x[i + horizon] (NaN near the group end);
        # negating it turns it into the look-ahead change.
        return -x.diff(-horizon).fillna(x - x.iloc[-1])

    grouped = mktdata.groupby('mktdate', group_keys=False, sort=False)[key]
    return grouped.apply(_forward_change).values

# Flat lookup tables keyed by continuous-contract code (e.g. 'rb00').
# The same numbers are repeated as 'price_tick' / 'factor' inside
# instrument_dic for every symbol that appears in both places.

# symbol -> price tick.
# NOTE(review): presumably the minimum price increment of the contract — confirm.
price_tick_dic = {
    'IF00': 0.2, 'IH00': 0.2, 'IC00': 0.2, 'TF00': 0.005, 'T00': 0.005,

    'cu00': 10.0, 'al00': 5.0, 'zn00': 5.0, 'ni00': 10.0, 'sn00': 10.0, 'au00': 0.05, 'ag00': 1.0,
    'rb00': 1.0, 'fu00': 1.0, 'bu00': 2.0, 'ru00': 5,

    'm00': 1.0, 'y00': 2.0, 'a00': 1.0, 'p00': 2.0, 'c00': 1.0, 'cs00': 1.0, 'l00': 5.0, 'v00': 1.0,
    'eg00': 1.0, 'pp00': 1.0, 'j00': 0.5, 'jm00': 0.5, 'i00': 0.5, 'pg00': 1.0,

    'SR00': 1.0, 'CF00': 5, 'ZC00': 0.2, 'FG00': 1.0, 'TA00': 2.0, 'MA00': 1.0, 'SA00': 1.0,
    'OI00': 1.0, 'RM00': 1.0, 'AP00': 1.0, 'CJ00': 5.0, 'PF00': 2.0,

    'sc00': 0.1, 'sc01': 0.1,
    'AgTD': 1.0, 'hc00': 1,
}
# symbol -> factor.
# NOTE(review): presumably the contract multiplier (units per lot) — confirm.
factor_dic = {
    'IF00': 300, 'IH00': 300, 'IC00': 200, 'TF00': 10000, 'T00': 10000,

    'cu00': 5.0, 'al00': 5.0, 'zn00': 5.0, 'ni00': 1.0, 'sn00': 1.0, 'au00': 1000, 'ag00': 15,
    'rb00': 10.0, 'fu00': 10.0, 'bu00': 10.0, 'ru00': 10,

    'm00': 10, 'y00': 10, 'a00': 10, 'p00': 10, 'c00': 10, 'cs00': 10, 'l00': 5, 'v00': 5,
    'eg00': 10, 'pp00': 5.0, 'j00': 100, 'jm00': 60, 'i00': 100, 'pg00': 20,

    'SR00': 10, 'CF00': 5, 'ZC00': 100, 'FG00': 20, 'TA00': 5.0, 'MA00': 10.0, 'SA00': 20,
    'OI00': 10.0, 'RM00': 10.0, 'AP00': 10.0, 'CJ00': 5.0, 'PF00': 5.0,

    'sc00': 1000, 'sc01': 1000,
    'AgTD': 1, 'hc00': 10,
}

# symbol -> fee parameter; only a subset of the symbols has an entry.
# NOTE(review): the sign/unit convention is not established anywhere in this
# file. The small positive entries look like proportional rates and the
# negative entries look like fixed amounts — confirm with the consumer of
# this table before relying on either reading.
fees_dic = {
    'IF00': 0.23/10000,
    'rb00': 0.5 * 1/10000,
    'ru00': -0.75,
    'ag00': 0.05 * 1/1000 * 1/2 * 1,
    'v00': -1 * 0.3,
    'TA00': -2.1,
    'CF00': -3.01,
    'i00': 1/10000,
    'sn00': -1.5,
    'p00': -2.5 * 0.6,
    'zn00': -3 * 0.5 * 0.5,
    'eg00': -3 * 0.3,
}

# Per-instrument configuration, keyed by continuous-contract code.
#   'price_tick'   - same number as price_tick_dic[symbol]
#   'factor'       - same number as factor_dic[symbol]
#   'date_list'    - [first_day, last_day] windows as YYYYMMDD integers (both ends inclusive)
#   'leading_list' - contract code returned for the window at the same position
# dateToContract() requires 'date_list' and 'leading_list' to have equal length
# and returns the FIRST window containing the date, so when two windows overlap
# the earlier entry wins. Instruments with empty lists never resolve to a contract.
instrument_dic = {
    'IF00': {
        'price_tick': 0.2,
        'factor': 300,
        'date_list': [[20200601, 20200617],
                      [20200618, 20200715],
                      [20200716, 20200819],
                      [20200820, 20200916],
                      [20200917, 20201015],
                      [20201016, 20201118],
                      [20201119, 20201216],
                      [20201217, 20210114],
                      [20210115, 20210210],
                      [20210218, 20210318],
                      [20210319, 20210414],
                      [20210415, 20210519],
                      [20210520, 20210617],
                      [20210618, 20210714],
                      [20210715, 20210818],
                      [20210819, 20210915],
                      [20210916, 20211014],
                      [20211015, 20211117],
                      [20211118, 20211216],
                      [20211217, 20220120],
                      [20220121, 20220216],
                      [20220217, 20220316],
                      [20220317, 20220414],
                      # NOTE(review): 20220414 is also the last day of the previous window,
                      # so that day resolves to IF2204 — confirm which contract is intended.
                      [20220414, 20220518],
                      [20220519, 20220616],
                      [20220617, 20220714],
                      [20220715, 20220817],
                      [20220818, 20220908],
                      ],
        'leading_list': ['IF2006', 'IF2007', 'IF2008', 'IF2009', 'IF2010', 'IF2011', 'IF2012', 'IF2101',
                      'IF2102', 'IF2103', 'IF2104', 'IF2105', 'IF2106', 'IF2107', 'IF2108', 'IF2109',
                      'IF2110', 'IF2111', 'IF2112', 'IF2201', 'IF2202', 'IF2203', 'IF2204', 'IF2205', 'IF2206', 'IF2207', 'IF2208', 'IF2209']
    },
    'IH00': {
        'price_tick': 0.2,
        'factor': 300,
        'date_list': [],
        'leading_list': [],
    },
    'IC00': {
        'price_tick': 0.2,
        'factor': 200,
        'date_list': [
            [20210319, 20210414],
            [20210415, 20210519],
            [20210520, 20210617],
            [20210618, 20210714],
            [20210715, 20210818],
            [20210819, 20210915],
            [20210916, 20211014],
            [20211015, 20211117],
            [20211118, 20211215],
            [20211216, 20220119],
            [20220120, 20220216],
            [20220217, 20220316],
            [20220317, 20220413],
            [20220414, 20220517],
            [20220518, 20220615],
            [20220616, 20220714],
            [20220715, 20220817],
            [20220818, 20220908],
        ],
        'leading_list': ['IC2104', 'IC2105', 'IC2106', 'IC2107', 'IC2108', 'IC2109', 'IC2110', 'IC2111', 'IC2112', 'IC2201', 'IC2202', 'IC2203', 'IC2204', 'IC2205', 'IC2206', 'IC2207', 'IC2208', 'IC2209'],
    },
    'hc00': {
        'price_tick': 1,
        'factor': 10,
        'date_list': [
            [20210409, 20210813],
            [20210816, 20211201],
            [20211202, 20220329],
            [20220330, 20220830],
            [20220831, 20221120],
        ],
        'leading_list': ['hc2110', 'hc2201', 'hc2205', 'hc2210', 'hc2301'],
    },
    'TF00': {
        'price_tick': 0.005,
        'factor': 10000,
        'date_list': [],
        'leading_list': [],
    },
    'T00': {
        'price_tick': 0.005,
        'factor': 10000,
        'date_list': [],
        'leading_list': [],
    },
    'cu00': {
        'price_tick': 10.0,
        'factor': 5.0,
        'date_list': [],
        'leading_list': [],
    },
    'al00': {
        'price_tick': 5.0,
        'factor': 5.0,
        'date_list': [],
        'leading_list': [],
    },
    'zn00': {
        'price_tick': 5.0,
        'factor': 5.0,
        'date_list': [
            [20210115, 20210222],
            [20210223, 20210318],
            [20210319, 20210419],
            [20210420, 20210520],
            [20210521, 20210623],
            [20210624, 20210723],
            [20210726, 20210820],
            [20210823, 20210923],
            [20210924, 20211022],
            [20211025, 20211111],
            [20211112, 20211221],
            [20211222, 20220118],
            [20220119, 20220218],
            [20220221, 20220317],
            [20220318, 20220420],
            [20220421, 20220524],
            [20220525, 20220623],
            [20220624, 20220725],
            [20220726, 20220824],
            [20220825, 20220926],
            [20220927, 20221025],
            [20221026, 20221125],
        ],
        'leading_list': ['zn2103', 'zn2104', 'zn2105', 'zn2106', 'zn2107', 'zn2108', 'zn2109', 'zn2110', 'zn2111', 'zn2112', 'zn2201', 'zn2202', 'zn2203', 'zn2204', 'zn2205', 'zn2206', 'zn2207', 'zn2208', 'zn2209', 'zn2210', 'zn2211', 'zn2212'],
    },
    'rb00': {
        'price_tick': 1.0,
        'factor': 10,
        'date_list': [
            [20200601, 20200824],
            [20200825, 20201204],
            [20201207, 20210406],
            # NOTE(review): 20210406 is also the last day of the previous window,
            # so that day resolves to rb2105 — confirm which contract is intended.
            [20210406, 20210809],
            [20210810, 20211124],
            [20211125, 20220327],
            [20220328, 20220829],
            [20220830, 20221129],
        ],
        'leading_list': ['rb2010', 'rb2101', 'rb2105', 'rb2110', 'rb2201', 'rb2205', 'rb2210', 'rb2301'],
    },
    'sn00': {
        'price_tick': 10.0,
        'factor': 1.0,
        'date_list': [
            [20210308, 20210331],
            [20210401, 20210506],
            [20210507, 20210616],
            [20210617, 20210722],
            [20210723, 20210826],
            [20210827, 20210924],
            [20210927, 20211022],
            [20211025, 20211123],
            [20211124, 20211221],
            [20211222, 20220122],
            # NOTE(review): this window starts before the previous one ends
            # (20220120-20220122 are in both), so those days resolve to sn2202 —
            # confirm the intended roll date.
            [20220120, 20220222],
            [20220223, 20220425],
            [20220426, 20220529],
            [20220530, 20220627],
            [20220628, 20220725],
            [20220726, 20220817],
            [20220818, 20220923],
            [20220924, 20221012],
            [20221013, 20221130],
        ],
        'leading_list': ['sn2105', 'sn2106', 'sn2107', 'sn2108', 'sn2109', 'sn2110', 'sn2111', 'sn2112', 'sn2201', 'sn2202', 'sn2203', 'sn2205', 'sn2206', 'sn2207', 'sn2208', 'sn2209', 'sn2210', 'sn2211', 'sn2212'],
    },
    'au00': {
        'price_tick': 0.05,
        'factor': 1000,
        'date_list': [],
        'leading_list': [],
    },
    'ag00': {
        'price_tick': 1.0,
        'factor': 15,
        'date_list': [
            [20200601, 20201125],
            [20201126, 20210125],
            [20210126, 20210525],
            [20210526, 20211126],
            [20211129, 20220525],
            [20220526, 20221025],
        ],
        'leading_list': ['ag2012', 'ag2102', 'ag2106', 'ag2112', 'ag2206', 'ag2212'],
    },
    'fu00': {
        'price_tick': 1.0,
        'factor': 10.0,
        'date_list': [],
        'leading_list': [],
    },
    'bu00': {
        'price_tick': 2.0,
        'factor': 10.0,
        'date_list': [],
        'leading_list': [],
    },
    'ru00': {
        'price_tick': 5,
        'factor': 10,
        'date_list': [
            [20200601, 20200804],
            [20200805, 20201130],
            [20201201, 20210401],
            [20210402, 20210808],
            [20210809, 20211123],
            [20211124, 20220330],
            [20220331, 20220808],
            [20220809, 20221030],
        ],
        'leading_list': ['ru2009', 'ru2101', 'ru2105', 'ru2109', 'ru2201', 'ru2205', 'ru2209', 'ru2301'],
    },
    'm00': {
        'price_tick': 1.0,
        'factor': 10,
        'date_list': [
            [20200601, 20200803],
            [20200804, 20201111],
            [20201112, 20210330],
            [20210331, 20210809],
            [20210810, 20211202],
            [20211203, 20220329],
            [20220330, 20220506],
        ],
        'leading_list': ['m2009', 'm2101', 'm2105', 'm2109', 'm2201', 'm2205', 'm2209'],
    },
    'y00': {
        'price_tick': 2.0,
        'factor': 10,
        'date_list': [
            [20200601, 20200804],
            [20200805, 20201210],
            [20201211, 20210408],
            [20210409, 20210813],
            [20210816, 20211208],
            [20211209, 20220323],
            [20220324, 20220506],
        ],
        'leading_list': ['y2009', 'y2101', 'y2105', 'y2109', 'y2201', 'y2205', 'y2209'],
    },
    'a00': {
        'price_tick': 1.0,
        'factor': 10,
        'date_list': [
            [20201217, 20210420],
            [20210421, 20210813],
            [20210816, 20211019],
            [20211020, 20211207],
            [20211208, 20220222],
            [20220223, 20220614],
            [20220615, 20220824],
            [20220825, 20221014],
        ],
        'leading_list': ['a2105', 'a2109', 'a2111', 'a2201', 'a2203', 'a2207', 'a2209', 'a2211'],
    },
    'p00': {
        'price_tick': 2.0,
        'factor': 10,
        'date_list': [
            [20200601, 20200812],
            [20200813, 20201208],
            [20201209, 20201210],
            [20201211, 20210411],
            [20210412, 20210816],
            [20210817, 20211209],
            [20211210, 20220331],
            [20220401, 20220816],
            [20220817, 20221124],
        ],
        'leading_list': ['p2009', 'p2101', 'p2102', 'p2105', 'p2109', 'p2201', 'p2205', 'p2209', 'p2301'],
    },
    'c00': {
        'price_tick': 1.0,
        'factor': 10,
        'date_list': [],
        'leading_list': [],
    },
    'cs00': {
        'price_tick': 1.0,
        'factor': 10,
        'date_list': [],
        'leading_list': [],
    },
    'l00': {
        'price_tick': 5.0,
        'factor': 5,
        'date_list': [],
        'leading_list': [],
    },
    'v00': {
        'price_tick': 1.0,
        'factor': 5,
        'date_list': [
            [20200601, 20200814],
            [20200817, 20201216],
            [20201217, 20210413],
            [20210414, 20210818],
            [20210819, 20211207],
            [20211208, 20220420],
            [20220421, 20220821],
            [20220822, 20221124],
        ],
        'leading_list': ['v2009', 'v2101', 'v2105', 'v2109', 'v2201', 'v2205', 'v2209', 'v2301'],
    },
    'eg00': {
        'price_tick': 1.0,
        'factor': 10,
        'date_list': [
            [20201211, 20210414],
            [20210415, 20210823],
            [20210824, 20211214],
            [20211215, 20220422],
            [20220425, 20220819],
            [20220820, 20221120],
        ],
        'leading_list': ['eg2105', 'eg2109', 'eg2201', 'eg2205', 'eg2209', 'eg2301'],
    },
    # NOTE(review): the only key without the '00' suffix, and it has no entry in
    # price_tick_dic / factor_dic — possibly meant to be 'eb00'; confirm.
    'eb': {
        'price_tick': 1.0,
        'factor': 5,
        'date_list': [],
        'leading_list': [],
    },
    'pp00': {
        'price_tick': 1.0,
        'factor': 5.0,
        'date_list': [],
        'leading_list': [],
    },
    'j00': {
        'price_tick': 0.5,
        'factor': 100,
        'date_list': [],
        'leading_list': [],
    },
    'jm00': {
        'price_tick': 0.5,
        'factor': 60,
        'date_list': [],
        'leading_list': [],
    },
    'i00': {
        'price_tick': 0.5,
        'factor': 100,
        'date_list': [
            [20210331, 20210803],
            [20210804, 20211130],
            [20211201, 20220323],
            [20220324, 20220803],
            [20220804, 20220809],
        ],
        'leading_list': ['i2109', 'i2201', 'i2205', 'i2209', 'i2301'],
    },
    'pg00': {
        'price_tick': 1.0,
        'factor': 20,
        'date_list': [],
        'leading_list': [],
    },
    'SR00': {
        'price_tick': 1.0,
        'factor': 10,
        'date_list': [],
        'leading_list': [],
    },
    'CF00': {
        'price_tick': 5,
        'factor': 5,
        'date_list': [
            [20200601, 20200810],
            [20200811, 20201207],
            [20201208, 20210408],
            [20210409, 20210805],
            [20210806, 20211201],
            [20211202, 20220414],
            [20220415, 20220711],
        ],
        'leading_list': ['CF009', 'CF101', 'CF105', 'CF109', 'CF201', 'CF205', 'CF209'],
    },
    'ZC00': {
        'price_tick': 0.2,
        'factor': 100,
        'date_list': [],
        'leading_list': [],
    },
    'FG00': {
        'price_tick': 1.0,
        'factor': 20,
        'date_list': [],
        'leading_list': [],
    },
    'TA00': {
        'price_tick': 2.0,
        'factor': 5.0,
        'date_list': [
            [20211210, 20220411],
            [20220412, 20220620],
        ],
        'leading_list': ['TA205', 'TA209'],
    },
    'MA00': {
        'price_tick': 1.0,
        'factor': 10.0,
        'date_list': [],
        'leading_list': [],
    },
    'SA00': {
        'price_tick': 1.0,
        'factor': 20,
        'date_list': [],
        'leading_list': [],
    },
    'OI00': {
        'price_tick': 1.0,
        'factor': 10.0,
        'date_list': [],
        'leading_list': [],
    },
    'RM00': {
        'price_tick': 1.0,
        'factor': 10.0,
        'date_list': [],
        'leading_list': [],
    },
    'AP00': {
        'price_tick': 1.0,
        'factor': 10.0,
        'date_list': [],
        'leading_list': [],
    },
    'CJ00': {
        'price_tick': 5.0,
        'factor': 5.0,
        'date_list': [],
        'leading_list': [],
    },
    'PF00': {
        'price_tick': 2.0,
        'factor': 5.0,
        'date_list': [],
        'leading_list': [],
    },
    'sc00': {
        'price_tick': 0.1,
        'factor': 1000,
        'date_list': [],
        'leading_list': [],
    },
    'sc01': {
        'price_tick': 0.1,
        'factor': 1000,
        'date_list': [],
        'leading_list': [],
    },
    'AgTD': {
        'price_tick': 1.0,
        'factor': 1,
        'date_list': [],
        'leading_list': [],
    }
}


def dateToContract(instrument, date):
    """Look up the leading contract configured for ``instrument`` on ``date``.

    :param instrument: key of ``instrument_dic`` (e.g. ``'v00'``).
    :param date: trading day as ``YYYYMMDD``; anything ``int()`` accepts.
    :return: the contract code of the first window ``[first_day, last_day]``
        (both ends inclusive) that contains the date, or ``False`` when no
        window matches.

    Terminates the process through ``sys.exit`` when the instrument's
    ``date_list`` and ``leading_list`` differ in length.
    """
    spec = instrument_dic[instrument]
    windows = spec['date_list']
    contracts = spec['leading_list']

    # The two lists are matched by position, so they must line up one-to-one.
    if len(windows) != len(contracts):
        sys.exit('dismatch')

    for window, contract in zip(windows, contracts):
        if window[0] <= int(date) <= window[1]:
            return contract

    return False


import numpy as np

def clear_off_trading(df, axis, off_range=((23000000, 90000000),  # (230000000, 240000000),
                                           (113000000, 133000000), (150000000, 210000000))):
    """Drop, in place, every row whose ``df[axis]`` value lies strictly inside an off-range.

    :param df: DataFrame to clean; it is modified in place and nothing is returned.
    :param axis: name of the column holding the time value to test.
    :param off_range: iterable of ``(lower, upper)`` pairs. A row is removed when
        ``lower < value < upper`` for at least one pair; values equal to a
        bound are kept. The default is an immutable tuple so that it cannot be
        altered between calls.

    NOTE(review): the default bounds presumably encode HHMMSSmmm integers
    (e.g. 90000000 = 09:00:00.000), matching the ``timestamp`` column produced
    by the parser — confirm against the caller's data.

    Rows are removed by index label, so with a non-unique index every row
    sharing a matched label is dropped as well.
    """
    values = df[axis]
    mask = np.zeros(len(df), dtype=bool)
    for bounds in off_range:
        lower, upper = bounds[0], bounds[1]
        # Accumulate with plain numpy arrays so the in-place OR never depends on index alignment.
        mask |= ((values > lower) & (values < upper)).to_numpy()
    df.drop(df.index[mask], axis=0, inplace=True)


if __name__ == '__main__':
    # Smoke check: print the leading contract configured for one sample day.
    v00_main_contract = dateToContract(instrument='v00', date='20221020')
    eg00_main_contract = dateToContract(instrument='eg00', date='20221020')

    print(f"v00_main_contract: {v00_main_contract}")
    print(f"eg00_main_contract: {eg00_main_contract}")
# constant.py


from enum import Enum

class Exchange(Enum):
    """Exchanges the data tools can be asked about; each value equals its member name."""

    # NOTE(review): DCE is the Dalian exchange whose L2 logs the parser reads;
    # the other members are presumably the remaining mainland futures
    # exchanges — confirm.
    CFFEX = "CFFEX"
    DCE = "DCE"
    SHFE = "SHFE"
    CZCE = "CZCE"
# main.py

import datetime
import os
import re
import shutil

from constant import Exchange

import pandas as pd
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

from base_dict import dateToContract, clear_off_trading


class HftDataPath:
    """Central place for the directories and file names used by the HFT data tools."""

    @classmethod
    def data_path(cls, date_str: str, exchange: "Exchange") -> str:
        """Return the raw log path for ``date_str`` on ``exchange``.

        :param date_str: date exactly as it appears in the log file name
            (the callers pass ``YYYY_MM_DD``).
        :param exchange: exchange whose log is wanted.
        :return: ``<source_data_dir>/ht_dce_L2-<date_str>.log`` for DCE, and an
            empty string for every other exchange because no file layout is
            defined for them yet.
        """
        if exchange == Exchange.DCE:
            return f'{cls.source_data_dir()}/ht_dce_L2-{date_str}.log'

        # SHFE, CZCE, CFFEX and anything else: not supported yet.
        return ''

    @classmethod
    def source_data_dir(cls) -> str:
        """Return the directory that holds the raw L2 log files."""
        # Raw string: the Windows separators must stay literal backslashes.
        # Written as a normal literal, sequences such as "\I" and "\h" are
        # invalid escapes (SyntaxWarning on Python 3.12+, an error later on).
        return r"E:\海通期货\大连L2\ITfuwu_dce_l2\ITfuwu_dce_l2_2021\ht_dce_L2-2021_05"

    @classmethod
    def output_data_dir(cls) -> str:
        """Return the directory that receives the per-symbol CSV files."""
        return f'{cls.source_data_dir()}/output_data'

    @classmethod
    def output_main_data_dir(cls) -> str:
        """Return the directory that receives the cleaned leading-contract CSV files."""
        return f'{cls.source_data_dir()}/output_main_data'


class HftDataOperator:
    """Load one day's raw L2 log, convert it to the output layout and split it per symbol.

    Typical use: ``HftDataOperator(date_str, exchange).split_by_symbol().dump_to_csv()``.
    """

    # Column names assigned to the header-less raw log, in file order.
    __columns: list = [
            "ContractID",
            "UpdateTime",
            "LastPrice",
            "MatchTotQty",
            "LastOpenInterest",
            "OpenInterest",
            "InterestChg",
            "Turnover",
            "BidPrice1",
            "BidVolume1",
            "BidImplyVolume1",
            "BidPrice2",
            "BidVolume2",
            "BidImplyVolume2",
            "BidPrice3",
            "BidVolume3",
            "BidImplyVolume3",
            "BidPrice4",
            "BidVolume4",
            "BidImplyVolume4",
            "BidPrice5",
            "BidVolume5",
            "BidImplyVolume5",
            "AskPrice1",
            "AskVolume1",
            "AskImplyVolume1",
            "AskPrice2",
            "AskVolume2",
            "AskImplyVolume2",
            "AskPrice3",
            "AskVolume3",
            "AskImplyVolume3",
            "AskPrice4",
            "AskVolume4",
            "AskImplyVolume4",
            "AskPrice5",
            "AskVolume5",
            "AskImplyVolume5",
        ]
    # Columns (and their order) of the converted frame and of every CSV written.
    __format_columns: list = [
        'Date',
        'Time',
        'Symbol',
        'LastPrice',
        'AccVolume',
        'OpenInterest',
        'Turnover',
        'HighLimit',
        'LowLimit',
        'BidPrice1',
        'BidVolume1',
        'BidPrice2',
        'BidVolume2',
        'BidPrice3',
        'BidVolume3',
        'BidPrice4',
        'BidVolume4',
        'BidPrice5',
        'BidVolume5',
        'AskPrice1',
        'AskVolume1',
        'AskPrice2',
        'AskVolume2',
        'AskPrice3',
        'AskVolume3',
        'AskPrice4',
        'AskVolume4',
        'AskPrice5',
        'AskVolume5',
        'LocalTime',
        'LocalNS',
        'TotalBuyQty',
        'TotalSellQty',
        'AvgBuyPrice',
        'AvgSellPrice',
        'timestamp'
    ]

    def __init__(self, date_str: str, exchange: "Exchange", filter_option: bool = True):
        """Read and convert the raw log of one day.

        :param date_str: date as used in the log file name (``YYYY_MM_DD``).
        :param exchange: exchange whose log should be read.
        :param filter_option: when true, rows whose contract id contains ``'-'``
            are discarded before conversion.
            NOTE(review): presumably these are option / combination
            instruments — confirm.
        :raises AssertionError: if the log file does not exist.
        """
        path = HftDataPath.data_path(date_str=date_str, exchange=exchange)
        assert os.path.exists(path), f'{path} not exists.'

        self.__date_str: str = date_str
        self.__exchange = exchange
        self.__rets_map: dict = None
        self.__data: pd.DataFrame = pd.read_csv(path, low_memory=False, header=None)
        self.__data.columns = self.__columns

        if filter_option:
            # Literal match; na=False keeps the mask boolean when an id is missing
            # (such rows are removed by the dropna below).
            has_dash = self.__data.ContractID.str.contains('-', regex=False, na=False)
            self.__data = self.__data[~has_dash].reset_index(drop=True)

        self.__reset_format()

        # Discard every row that has a missing value in any output column.
        self.__data = self.__data.dropna()

    def split_by_symbol(self) -> "HftDataOperator":
        """Split the converted data into one frame per symbol.

        Each frame gets a fresh 0-based index. The result is available
        through :meth:`rets_map`.

        :return: ``self``, so that :meth:`dump_to_csv` can be chained.
        """
        assert self.__data is not None, '__data is None'

        # One grouped pass instead of one full boolean scan per symbol.
        self.__rets_map = {
            symbol: frame.reset_index(drop=True)
            for symbol, frame in self.__data.groupby('Symbol', sort=False)
        }

        return self

    def dump_to_csv(self):
        """Write every per-symbol frame to
        ``<output_data_dir>/data<letters>00/<YYYYMMDD>/<symbol>_<YYYYMMDD>.csv``.

        ``<letters>`` is the run of up to three leading ASCII letters of the
        symbol. Missing directories are created on demand.

        :raises AssertionError: if :meth:`split_by_symbol` has not been called.
        """
        assert self.__rets_map is not None, '__rets_map is None'

        target_dir = HftDataPath.output_data_dir()
        date_str = self.__date_str.replace("_", "")

        for symbol, symbol_df in self.__rets_map.items():
            product = re.match(r"^[a-zA-Z]{1,3}", symbol).group()
            symbol_date_dir = f'{target_dir}/data{product}00/{date_str}'

            # Creates the output root, the product folder and the date folder as needed.
            os.makedirs(symbol_date_dir, exist_ok=True)

            symbol_df.to_csv(f'{symbol_date_dir}/{symbol}_{date_str}.csv', index=False)

    def data(self) -> pd.DataFrame:
        """Return the converted frame holding all symbols."""
        return self.__data

    def rets_map(self) -> dict:
        """Return the ``symbol -> DataFrame`` mapping, or ``None`` before :meth:`split_by_symbol`."""
        return self.__rets_map

    def __reset_format(self):
        """Convert the raw columns into the ``__format_columns`` layout."""
        assert self.__data is not None, '__data is None.'

        data = self.__data.rename(columns={'UpdateTime': 'Time', 'ContractID': 'Symbol'})

        data['Date'] = self.__date_str.replace('_', '')

        # The raw log carries no price limits; fixed placeholders are written instead.
        data['HighLimit'] = 1000000
        data['LowLimit'] = 0

        # Strip ':' and '.' from the time text. Both replacements must be literal:
        # as a regular expression '.' matches every character, which on
        # pandas >= 2.0 would erase the whole value.
        compact_time = data['Time'].str.replace(':', '', regex=False).str.replace('.', '', regex=False)
        data['LocalTime'] = compact_time
        data['timestamp'] = compact_time

        # Date plus time as integer nanoseconds (``Timestamp.value``).
        moments = pd.to_datetime(self.__date_str.replace('_', '-') + ' ' + data['Time'])
        data['LocalNS'] = moments.apply(lambda moment: moment.value)

        # Not available in the raw log; written as zeros.
        for column in ('TotalBuyQty', 'TotalSellQty', 'AvgBuyPrice', 'AvgSellPrice'):
            data[column] = 0

        data['AccVolume'] = data['MatchTotQty']

        self.__data = data[self.__format_columns]


class HftDataManager:
    """Batch entry points that run the data tools over an inclusive range of dates."""

    @staticmethod
    def _iter_dates(start_date_str: str, end_date_str: str):
        """Yield every calendar date from start to end (both inclusive), given as ``YYYY_MM_DD``."""
        current = datetime.datetime.strptime(start_date_str, '%Y_%m_%d').date()
        last = datetime.datetime.strptime(end_date_str, '%Y_%m_%d').date()
        while current <= last:
            yield current
            current += datetime.timedelta(days=1)

    @classmethod
    def parser(cls, start_date_str: str, end_date_str: str, exchange: Exchange):
        """
        Convert the raw HFT log of every day in the range into per-symbol CSV files.

        Days without a log file are skipped; a progress line is printed for
        every day either way.

        :param start_date_str: first date (``YYYY_MM_DD``, inclusive)
        :param end_date_str: last date (``YYYY_MM_DD``, inclusive)
        :param exchange: exchange whose logs are parsed
        """
        for day in cls._iter_dates(start_date_str, end_date_str):
            format_date_str = str(day).replace("-", "_")
            if os.path.exists(path=HftDataPath.data_path(date_str=format_date_str, exchange=exchange)):
                HftDataOperator(date_str=format_date_str, exchange=exchange).split_by_symbol().dump_to_csv()

            print(f'date({day}) parser over.')

    @classmethod
    def refresh_main_contracts_dir(cls, target_contracts: list, start_date_str: str, end_date_str: str):
        """
        Rebuild the leading-contract output directory from the per-symbol CSV files.

        The whole target directory is deleted and recreated first. For every
        requested instrument and every day, the CSV of that day's leading
        contract (as returned by ``dateToContract``) is cleaned — off-session
        rows removed, duplicate ``timestamp`` rows reduced to their first
        occurrence, index renumbered — and written to the mirrored location.

        :param target_contracts: instrument codes to refresh (e.g. ``['v00', 'eg00']``)
        :param start_date_str: first date (``YYYY_MM_DD``, inclusive)
        :param end_date_str: last date (``YYYY_MM_DD``, inclusive)
        :raises AssertionError: if the per-symbol output directory does not exist
        """
        source_dir = HftDataPath.output_data_dir()
        assert os.path.exists(path=source_dir), f'{source_dir} not exist.'

        # Parse the window before touching the disk, so that a malformed date
        # cannot wipe the previous output.
        days = list(cls._iter_dates(start_date_str, end_date_str))

        target_dir = HftDataPath.output_main_data_dir()
        if os.path.exists(path=target_dir):
            shutil.rmtree(target_dir)
        os.mkdir(target_dir)

        for contract in target_contracts:
            source_sub_dir = f'{source_dir}/data{contract}'
            target_sub_dir = f'{target_dir}/data{contract}'
            os.makedirs(target_sub_dir, exist_ok=True)

            for day in days:
                format_date_str = str(day).replace('-', '')
                main_contract = dateToContract(instrument=contract, date=format_date_str)

                source_day_dir = f'{source_sub_dir}/{format_date_str}'
                target_day_dir = f'{target_sub_dir}/{format_date_str}'
                if os.path.exists(path=source_day_dir):
                    os.makedirs(target_day_dir, exist_ok=True)

                # dateToContract returns False when no window covers the day;
                # there is nothing to look for in that case.
                if not main_contract:
                    continue

                file_name = f'{main_contract}_{format_date_str}.csv'
                source_path = f'{source_day_dir}/{file_name}'
                if not os.path.exists(path=source_path):
                    continue

                # Read the source directly and write the cleaned result once;
                # copying the raw file first only to overwrite it is wasted I/O.
                main_df = pd.read_csv(source_path)
                clear_off_trading(main_df, "timestamp")
                main_df = main_df.drop_duplicates(subset='timestamp', keep='first').reset_index(drop=True)
                main_df.to_csv(f'{target_day_dir}/{file_name}', index=False)


if __name__ == '__main__':
    # Inclusive window to process, written as YYYY_MM_DD.
    _start_date_str, _end_date_str = "2021_05_14", "2021_05_24"

    # Step 1: split each day's raw DCE log into per-symbol CSV files.
    HftDataManager.parser(
        start_date_str=_start_date_str,
        end_date_str=_end_date_str,
        exchange=Exchange.DCE,
    )

    # Step 2 (disabled): rebuild the cleaned leading-contract directory.
    # HftDataManager.refresh_main_contracts_dir(
    #     target_contracts=['v00', 'eg00'],
    #     start_date_str=_start_date_str,
    #     end_date_str=_end_date_str
    # )

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值