Sharing a Baseline Model for Jane Street Real-Time Market Data Forecasting

Some people like to say that joining a quant startup means bigger opportunities, but what nobody tells you is...

Kaggle is running yet another (and another, and another...) quant competition, and the host is one of the companies with the highest new-grad pay.

To start, here is a baseline model. It builds on a baseline I shared earlier and implements the competition metric as custom GBDT metric functions, which makes it easy to monitor training.
Baseline link

Competition Overview

A first look at the data suggests the organizers picked instruments with decent liquidity that are nonetheless hard to predict. Public leaderboard scores after submitting are underwhelming; my offline validation score is around 0.007, so I expect plenty of overfitting by the end. I haven't fully worked out what the label is, either; if it is a fairly long-horizon prediction, the randomness in the final results is going to be painful...
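For reference, my understanding is that the competition scores submissions with a sample-weighted zero-mean R² on responder_6 (this is also what the custom metric functions later in this post implement):

$$R^2 = 1 - \frac{\sum_i w_i \, (y_i - \hat{y}_i)^2}{\sum_i w_i \, y_i^2}$$

where $w_i$ is the provided weight column. Note the denominator uses the raw squared target rather than the variance around the mean, so this differs from sklearn's r2_score.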

Kaggle has also changed the API format, and the new API comes with a lot of restrictions. I don't know which genius designed it, but an API whose error output is a single printed line is truly something.

Dataset Description

train.parquet - The training set, containing historical data and returns. For convenience, the training set is split into ten partitions.
date_id and time_id - Sequentially ordered integer values that give the data a temporal structure, although the real time gap between time_id values may vary.
symbol_id - The stock identifier.
weight - The weight used in the scoring function.
feature_{00…78} - Anonymized features.
responder_{0…8} - Anonymized labels in the range -5 to 5. The responder_6 field is the one you must predict.

A quick look at the data: since there are very few symbols and the early history has a large number of missing values, the baseline code includes an option to drop part of the opening stretch of data (see the sketch below for checking sparsity). I don't know whether symbols will be removed from the future data; delisted symbols would be awkward. Because each inference batch provides very little data, prediction throughput is low, and given how the new API is designed, speeding up the code will be a pain to write. At submission time only a dummy test set is provided (you would need at least several days of data to debug effectively?), so it is basically useless for debugging...
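A minimal sketch for eyeballing that sparsity, assuming the standard Kaggle input layout (the glob pattern and the choice of feature_00 are just for illustration):

import polars as pl

# Lazily scan the partitioned training set and compute, per symbol,
# the fraction of nulls in one example feature column
lf = pl.scan_parquet(
    '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/**/*.parquet'
)
null_share = (
    lf.group_by('symbol_id')
      .agg(pl.col('feature_00').is_null().mean().alias('feature_00_null_share'))
      .collect()
      .sort('symbol_id')
)
print(null_share)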

Baseline Code

Loading the Data

import os
import copy
import joblib

import pandas as pd
import polars as pl
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
import numpy as np 

from joblib import Parallel, delayed

import kaggle_evaluation.jane_street_inference_server

# !pip install lightgbm==4.2.0 -i https://mirrors.aliyun.com/pypi/simple/
# !pip install catboost==1.2.7 -i https://mirrors.aliyun.com/pypi/simple/
# !pip install xgboost==2.0.3 -i https://mirrors.aliyun.com/pypi/simple/
# !pip install joblib==1.4.2 -i https://mirrors.aliyun.com/pypi/simple/


def reduce_mem_usage(df, float16_as32=True):
    # memory_usage() gives per-column memory in bytes; sum and convert B -> KB -> MB
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:  # iterate over column names
        col_type = df[col].dtype
        if col_type != object and str(col_type) != 'category':  # only handle numeric columns
            c_min, c_max = df[col].min(), df[col].max()  # column min and max
            if str(col_type)[:3] == 'int':  # integer columns (int8/int16/int32/int64)
                # Downcast to the smallest integer type whose range covers [c_min, c_max]
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:  # -128 to 127
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:  # -32,768 to 32,767
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:  # floating-point columns
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    # Value range fits float16; keep float32 if more precision is needed
                    if float16_as32:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    # Memory after downcasting
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Percentage saved relative to the starting memory
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
# Define the path to the input data directory
# If the local directory exists, use it; otherwise, use the Kaggle input directory
input_path = './jane-street-real-time-market-data-forecasting/' if os.path.exists('./jane-street-real-time-market-data-forecasting') else '/kaggle/input/jane-street-real-time-market-data-forecasting/'

# Flag to determine if the script is in training mode or not
TRAINING = False

# Define the feature names based on the number of features (79 in this case)
feature_names = [f"feature_{i:02d}" for i in range(79)]

# Number of validation dates to use
num_valid_dates = 100

# Number of dates to skip from the beginning of the dataset
skip_dates = 500

# Number of folds for cross-validation
N_fold = 5

# If in training mode, load the training data
if TRAINING:
   # Load the training data from a Parquet file
   df = pd.read_parquet(f'{input_path}/train.parquet')
   
    # Reduce memory usage of the DataFrame using the helper defined above
    # (float16_as32=False, so float columns may be downcast all the way to float16)
   df = reduce_mem_usage(df, False)
   
   # Filter the DataFrame to include only dates greater than or equal to skip_dates
   df = df[df['date_id'] >= skip_dates].reset_index(drop=True)
   
   # Get unique dates from the DataFrame
   dates = df['date_id'].unique()
   
   # Define validation dates as the last `num_valid_dates` dates
   valid_dates = dates[-num_valid_dates:]
   
   # Define training dates as all dates except the last `num_valid_dates` dates
   train_dates = dates[:-num_valid_dates]
   
   # Display the last few rows of the DataFrame (for debugging purposes)
   print(df.tail())
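If memory is tight, a lazier alternative (a sketch, assuming the same partitioned parquet layout) is to push the column selection and the date filter down into polars before materializing anything:

# Select only the columns the models use and filter early, then convert once
cols = ['date_id', 'time_id', 'symbol_id', 'weight', 'responder_6'] + feature_names
df_pl = (
    pl.scan_parquet(f'{input_path}train.parquet/**/*.parquet')
      .filter(pl.col('date_id') >= skip_dates)
      .select(cols)
      .collect()
)
# df = df_pl.to_pandas()  # drop-in replacement for the pandas path above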

Training the Models

Below, a custom R² metric function matching the competition metric is implemented for each of XGBoost, LightGBM, and CatBoost, so training can be monitored on the actual objective.

# Create a directory to store the trained models
os.makedirs('models', exist_ok=True)

# Define the path to load pre-trained models (if not in training mode)
model_path = '/kaggle/input/jsbaselinezyz'

# If in training mode, prepare validation data
if TRAINING:
    # Extract features, target, and weights for validation dates
    X_valid = df[feature_names].loc[df['date_id'].isin(valid_dates)]
    y_valid = df['responder_6'].loc[df['date_id'].isin(valid_dates)]
    w_valid = df['weight'].loc[df['date_id'].isin(valid_dates)]

# Initialize a list to store trained models
models = []

# Function to train a model for one fold, or load a pre-trained model
def train(model_dict, model_name='lgb', fold=0):
    if TRAINING:
        # Leave out every N_fold-th date for this fold (date-level CV split)
        selected_dates = [date for ii, date in enumerate(train_dates) if ii % N_fold != fold]

        # Take a fresh copy so each fold trains an independent estimator
        # (appending the same object for every fold would make the in-memory
        # ensemble degenerate)
        model = copy.deepcopy(model_dict[model_name])

        # Extract features, target, and weights for the selected training dates
        X_train = df[feature_names].loc[df['date_id'].isin(selected_dates)]
        y_train = df['responder_6'].loc[df['date_id'].isin(selected_dates)]
        w_train = df['weight'].loc[df['date_id'].isin(selected_dates)]

        # Train the model based on the type (LightGBM, XGBoost, or CatBoost)
        if model_name == 'lgb':
            # Train LightGBM with early stopping and evaluation logging;
            # validation weights go through eval_sample_weight, not eval_set
            model.fit(X_train, y_train, w_train,
                      eval_metric=[r2_lgb],
                      eval_set=[(X_valid, y_valid)],
                      eval_sample_weight=[w_valid],
                      callbacks=[
                          lgb.early_stopping(100),
                          lgb.log_evaluation(10)
                      ])

        elif model_name == 'cbt':
            # Prepare evaluation set for CatBoost
            evalset = cbt.Pool(X_valid, y_valid, weight=w_valid)

            # Train CatBoost model with early stopping and verbose logging
            model.fit(X_train, y_train, sample_weight=w_train,
                      eval_set=[evalset],
                      verbose=10,
                      early_stopping_rounds=100)

        else:
            # Train XGBoost with verbose logging; early stopping is configured
            # in the constructor (xgboost >= 2.0 removed it from fit())
            model.fit(X_train, y_train, sample_weight=w_train,
                      eval_set=[(X_valid, y_valid)],
                      sample_weight_eval_set=[w_valid],
                      verbose=10)

        # Append the trained model to the list
        models.append(model)

        # Save the trained model to a file
        joblib.dump(model, f'./models/{model_name}_{fold}.model')

        # Delete training data to free up memory
        del X_train
        del y_train
        del w_train

        # Collect garbage to free up memory
        import gc
        gc.collect()

    else:
        # If not in training mode, load the pre-trained model from the specified path
        models.append(joblib.load(f'{model_path}/{model_name}_{fold}.model'))

    return

# Custom R² metric for XGBoost (negated, because XGBoost's early stopping
# minimizes custom metrics)
def r2_xgb(y_true, y_pred, sample_weight):
    r2 = 1 - np.average((y_pred - y_true) ** 2, weights=sample_weight) / (np.average((y_true) ** 2, weights=sample_weight) + 1e-38)
    return -r2

# Custom R² metric for LightGBM (returns (name, value, is_higher_better))
def r2_lgb(y_true, y_pred, sample_weight):
    r2 = 1 - np.average((y_pred - y_true) ** 2, weights=sample_weight) / (np.average((y_true) ** 2, weights=sample_weight) + 1e-38)
    return 'r2', r2, True

# Custom R2 metric for CatBoost
class r2_cbt(object):
    def get_final_error(self, error, weight):
        return 1 - error / (weight + 1e-38)

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w * (target[i] ** 2)
            error_sum += w * ((approx[i] - target[i]) ** 2)

        return error_sum, weight_sum
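As a quick sanity check (an illustrative sketch on random data), the three implementations should agree on the same batch, up to XGBoost's sign flip:

rng = np.random.default_rng(42)
y = rng.normal(size=1000)
pred = y + rng.normal(scale=0.5, size=1000)
w = rng.uniform(0.5, 2.0, size=1000)

print(-r2_xgb(y, pred, w))                 # weighted zero-mean R²
print(r2_lgb(y, pred, w)[1])               # same value
metric = r2_cbt()
err, wsum = metric.evaluate([pred], y, w)
print(metric.get_final_error(err, wsum))   # same value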

# Dictionary to store different models with their configurations
model_dict = {
    'lgb': lgb.LGBMRegressor(n_estimators=500, device='gpu', gpu_use_dp=True, objective='l2'),
    'xgb': xgb.XGBRegressor(n_estimators=2000, learning_rate=0.1, max_depth=6, tree_method='hist', device="cuda", objective='reg:squarederror', eval_metric=r2_xgb, disable_default_eval_metric=True, early_stopping_rounds=100),
    'cbt': cbt.CatBoostRegressor(iterations=1000, learning_rate=0.05, task_type='GPU', loss_function='RMSE', eval_metric=r2_cbt()),
}
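# Optional CPU fallback (a sketch: same hyperparameters, GPU flags dropped),
# in case you want to debug locally without a GPU:
# model_dict = {
#     'lgb': lgb.LGBMRegressor(n_estimators=500, objective='l2'),
#     'xgb': xgb.XGBRegressor(n_estimators=2000, learning_rate=0.1, max_depth=6, tree_method='hist', objective='reg:squarederror', eval_metric=r2_xgb, disable_default_eval_metric=True, early_stopping_rounds=100),
#     'cbt': cbt.CatBoostRegressor(iterations=1000, learning_rate=0.05, loss_function='RMSE', eval_metric=r2_cbt()),
# }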

# Train models for each fold
for i in range(N_fold):
    train(model_dict, 'lgb', i)
    train(model_dict, 'xgb', i)
    train(model_dict, 'cbt', i)

Submitting Results

lags_ : pl.DataFrame | None = None

# Replace this function with your inference code.
# You can return either a Pandas or Polars dataframe, though Polars is recommended.
# Each batch of predictions (except the very first) must be returned within 10 minutes of the batch features being provided.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction."""
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.
    global lags_
    if lags is not None:
        lags_ = lags

    predictions = test.select(
        'row_id',
        pl.lit(0.0).alias('responder_6'),
    )
    
    feat = test[feature_names].to_numpy()
    
    pred = [model.predict(feat) for model in models]
    pred = np.mean(pred, axis=0)
    
    predictions = predictions.with_columns(pl.Series('responder_6', pred.ravel()))

    # The predict function must return a DataFrame
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    # with columns 'row_id', 'responder_6'
    assert list(predictions.columns) == ['row_id', 'responder_6']
    # and as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions
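The lags frame is saved in lags_ but never used above. A hypothetical sketch of how it could become extra features (the responder_{i}_lag_1 column names are my assumption about the lags schema; check lags_.columns before relying on them):

def make_lag_features(test: pl.DataFrame, lags: pl.DataFrame) -> pl.DataFrame:
    # Keep the previous day's last observation per symbol and left-join it
    # onto the current batch as extra feature columns
    lag_cols = ['symbol_id'] + [f'responder_{i}_lag_1' for i in range(9)]
    last_lags = lags.select(lag_cols).group_by('symbol_id').last()
    return test.join(last_lags, on='symbol_id', how='left')

A model trained on these columns would, of course, need the same columns at fit time.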
    
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )