# Import basic dependencies
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
from sklearn.metrics import f1_score
import shutil
# Prepare the working folders (recreate them from scratch)
folders = ['logs', 'submit']
for folder in folders:
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.makedirs(folder, exist_ok=True)
# Collect the data file paths
def get_paths(dir_path):
    """Collect the CSV paths under dir_path and parse symbol, date and session from each filename."""
    df_paths = pd.DataFrame({'path': glob(f'{dir_path}/*.csv')})
    df_paths['sym'] = df_paths['path'].apply(lambda x: int(x.split('_')[-3][3:]))   # e.g. 'sym8' -> 8
    df_paths['date'] = df_paths['path'].apply(lambda x: int(x.split('_')[-2][4:]))  # e.g. 'date0' -> 0
    df_paths['seg'] = df_paths['path'].apply(lambda x: x.split('_')[-1][:2])        # session, e.g. 'am'
    return df_paths.sort_values('date').reset_index(drop=True)
df_train_paths = get_paths('data/train')
df_test_paths = get_paths('data/test')
df_train_paths.head()
                                    path  sym  date seg
0  data/train/snapshot_sym8_date0_am.csv    8     0  am
1  data/train/snapshot_sym9_date0_am.csv    9     0  am
2  data/train/snapshot_sym1_date0_am.csv    1     0  am
3  data/train/snapshot_sym3_date0_am.csv    3     0  am
4  data/train/snapshot_sym0_date0_am.csv    0     0  am
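# A quick sanity check of the filename parsing above (a minimal sketch; the
# sample path is taken from the head() output): split('_') yields
# ['data/train/snapshot', 'sym8', 'date0', 'am.csv'].
sample = 'data/train/snapshot_sym8_date0_am.csv'
parts = sample.split('_')
assert (int(parts[-3][3:]), int(parts[-2][4:]), parts[-1][:2]) == (8, 0, 'am')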
# Label the test set locally so we can validate offline.
# Note: never submit these generated labels directly -- they would effectively
# be a ~0.999-score answer key.
def labeling(series, ticks, alpha):
    """
    Re-implementation of the official labeling rule: compare the mid-price
    `ticks` steps ahead with the current one and bucket the move by `alpha`.
    """
    series_shifted = series.shift(-ticks)
    deltas = series_shifted - series
    labels = []
    for x in deltas:
        if x < -alpha:        # down move
            labels.append(0)
        elif x > alpha:       # up move
            labels.append(2)
        else:                 # flat (also catches the trailing NaNs)
            labels.append(1)
    return labels
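# An equivalent vectorized form of labeling() (a sketch using np.select, not
# the official implementation). Trailing NaN deltas fail both comparisons and
# fall through to the default class 1, matching the loop above.
def labeling_vectorized(series, ticks, alpha):
    deltas = series.shift(-ticks) - series
    return np.select([deltas < -alpha, deltas > alpha], [0, 2], default=1).tolist()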
labels = [  # [label name, prediction horizon in ticks, threshold alpha]
['label_5', 5, 0.0005], ['label_10', 10, 0.0005],
['label_20', 20, 0.001], ['label_40', 40, 0.001],
['label_60', 60, 0.001]
]
for path in tqdm(df_test_paths['path']):
df = pd.read_csv(path)
for item in labels:
label_i, ticks, alpha = item
df[label_i] = labeling(df['n_midprice'], ticks, alpha)
df.to_csv(path, index=False)
# Data extraction
# A "magic" row of values used as a separator between files
cypher = [
2, 0, 2, 3,
0, 8, 0, 5,
1, 3, 1, 4,
0, 4, 1, 7,
1, 3, 1, 4,
0, 4, 1, 7,
9,
1, 1, 1, 1, 1
]
def feature_process(path):
    """
    Extract the features and labels from a CSV file into a numpy array,
    and append one separator row (the cypher) at the end of each file.
    """
    df = pd.read_csv(path)
df['time_hour'] = df['time'].apply(lambda x: int(x.split(':')[0]))
df['time_min'] = df['time'].apply(lambda x: int(x.split(':')[1]))
feature_cols = [
'n_close',
'amount_delta',
'n_midprice',
'n_bid1',
'n_bsize1',
'n_bid2',
'n_bsize2',
'n_bid3',
'n_bsize3',
'n_bid4',
'n_bsize4',
'n_bid5',
'n_bsize5',
'n_ask1',
'n_asize1',
'n_ask2',
'n_asize2',
'n_ask3',
'n_asize3',
'n_ask4',
'n_asize4',
'n_ask5',
'n_asize5',
'time_hour',
'time_min',
]
target_cols = ['label_5', 'label_10', 'label_20', 'label_40', 'label_60']
    try:
        x = df[feature_cols + target_cols].values
    except KeyError:
        # unlabeled files (e.g. the raw test set): pad the label columns with zeros
        x = df[feature_cols].values
        t = np.zeros((x.shape[0], len(target_cols)))
        x = np.concatenate((x, t), axis=1)
split = np.array([cypher])
return np.concatenate((x, split), axis=0)
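# A quick structural check (a sketch): every processed file should yield
# 25 feature columns + 5 label columns = 30 columns, with the cypher row last.
sample_arr = feature_process(df_train_paths['path'][0])
assert sample_arr.shape[1] == 30
assert (sample_arr[-1] == np.array(cypher)).all()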
# Merge the data from all files
train_data = np.vstack([feature_process(path) for path in tqdm(df_train_paths['path'])])
test_data = np.vstack([feature_process(path) for path in tqdm(df_test_paths['path'])])
print(train_data.shape, test_data.shape)
100%|██████████| 1225/1225 [00:07<00:00, 158.16it/s]
100%|██████████| 296/296 [00:01<00:00, 165.19it/s]
(2450000, 30) (592000, 30)
# Build boolean masks selecting the valid data rows (separator rows are excluded)
train_index = train_data.sum(axis=1) != np.sum(cypher)
test_index = test_data.sum(axis=1) != np.sum(cypher)
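# Sanity check (a sketch): exactly one separator row per file should be masked
# out, assuming no real data row happens to sum to exactly sum(cypher).
assert (~train_index).sum() == len(df_train_paths)
assert (~test_index).sum() == len(df_test_paths)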
# In a live setting the mean/std of future data are unavailable, so only training-set statistics are used
mean = train_data[train_index,:-5].mean(axis=0)
std = train_data[train_index,:-5].std(axis=0)
# Standardize the features to zero mean and unit variance
train_data[train_index,:-5] = (train_data[train_index,:-5] - mean) / std
test_data[test_index,:-5] = (test_data[test_index,:-5] - mean) / std
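# At inference time the very same training statistics must be reused, so it is
# worth persisting them (a minimal sketch; the file names are illustrative):
np.save('logs/norm_mean.npy', mean)
np.save('logs/norm_std.npy', std)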
# Build the data loading pipeline
import torch
from torch.utils.data import Dataset, DataLoader
class MyDataset(Dataset):
"""
自定义数据集
"""
def __init__(self, data, timestep, feat_dim) -> None:
super(MyDataset, self).__init__()
self.data = data
self.timestep = timestep
self.feat_dim = feat_dim
self.num_sample = data.shape[0]
self.offset = 0
def __getitem__(self, index):
"""
获取一个样本的输入x 和标签y
Note: 将timestep帧历史数据(包含当前时刻)作为输入, 缺失的历史帧用0填充
"""
        current_values = self.data[index]
        if self.offset > index:
            # a new pass over the data has started: reset the file boundary
            self.offset = 0
        if current_values.sum() == np.sum(cypher):
            # landed on a separator row: step to the first row of the next file
            index += 1
            self.offset = index
            current_values = self.data[index]
        head = index + 1
        tail = index + 1 - self.timestep
        tail = 0 if tail < 0 else tail
        tail = self.offset if tail < self.offset else tail  # never cross the file boundary
x = self.data[tail:head, :self.feat_dim]
if x.shape[0] < self.timestep:
pad_values = np.zeros((self.timestep-x.shape[0], x.shape[1]))
x = np.concatenate((pad_values, x), axis=0)
y = current_values[self.feat_dim:]
x = torch.from_numpy(x).float()
return x, y
    def __len__(self):
        # the final row of the array is a separator, so it is never indexed
        return self.num_sample - 1
timestep = 30  # use a history window of 30 ticks (including the current tick) as input
feat_dim = 25
batch_size = 512
num_workers = 2
train_loader = DataLoader(
dataset=MyDataset(
data=train_data,
timestep=timestep,
feat_dim=feat_dim
),
batch_size=batch_size,
shuffle=False,
num_workers=num_workers
)
test_loader = DataLoader(
dataset=MyDataset(
data=test_data,
timestep=timestep,
feat_dim=feat_dim
),
batch_size=batch_size,
shuffle=False,
num_workers=num_workers
)
for (batch_x, batch_y) in train_loader:
print(batch_x.shape, batch_y.shape)
break
torch.Size([512, 30, 25]) torch.Size([512, 5])
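# Spot-check of the separator handling (a sketch): indexing the dataset at the
# first separator position should transparently return the first row of the
# next file, zero-padded to a full window.
sep_positions = np.where(~train_index)[0]
ds = MyDataset(train_data, timestep, feat_dim)
x_sep, _ = ds[int(sep_positions[0])]
assert x_sep.shape == (timestep, feat_dim)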
# Model design
import torch.nn as nn
import torch.optim as optim
class BranchNet(nn.Module):
"""
单分支网络,可用于单个标签的建模
"""
def __init__(self, input_size, hidden_size, num_layers, output_size) -> None:
super(BranchNet, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.block = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
self.head = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.BatchNorm1d(hidden_size),
nn.ReLU(),
nn.Linear(hidden_size, output_size)
)
def forward(self, x):
output = self.block(x)[0]
logits = self.head(output[:,-1,:])
return logits
class Nets(nn.Module):
"""
模型组网,同时输出多个标签的预测结果
"""
def __init__(self, input_size, hidden_size:list, num_layers:list, output_size):
super(Nets, self).__init__()
self.branchs = nn.ModuleList([
BranchNet(input_size,h_s,n_l,output_size) for h_s,n_l in zip(hidden_size, num_layers)
])
def forward(self, x):
outputs = torch.stack([bi(x) for bi in self.branchs], dim=1)
return outputs
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Nets(feat_dim,
hidden_size=[32, 32, 32, 32, 32],
num_layers=[3, 3, 3, 3, 3],
output_size=3).to(device)
logits = model(batch_x.to(device))
logits.shape
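# Model size check (a sketch): five small 3-layer GRU branches keep the total
# parameter count modest.
num_params = sum(p.numel() for p in model.parameters())
print(f'total parameters: {num_params:,}')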
# Model training
epochs = 50  # number of training epochs
verbose = len(train_loader) // 4
save_per_epoch = 10  # checkpoint saving period (in epochs)
loss_fun = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(epochs):
model.train()
train_loss = []
for batch_idx, (batch_x, batch_y) in enumerate(tqdm(train_loader)):
batch_x, batch_y = batch_x.to(device), batch_y.to(device)
logits = model(batch_x)
        # compute the cross-entropy loss for each of the 5 labels separately
        loss_list = torch.stack([loss_fun(logits[:,i,:], batch_y[:,i].long()) for i in range(5)])
        # average the 5 losses into a single scalar for backpropagation
        loss = torch.mean(loss_list)
optimizer.zero_grad()
loss.backward()
        # gradient clipping
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10, norm_type=2)
        # weight update
optimizer.step()
train_loss.append(loss.item())
        if batch_idx % verbose == 0:
            print(f'Batch [{batch_idx}] --> loss: {loss_list.detach().cpu().numpy()}')
train_loss = np.mean(train_loss)
print(f'[{epoch+1}/{epochs}] train_loss: {train_loss}\n')
if (epoch+1)%save_per_epoch == 0:
torch.save(model.state_dict(), f'./logs/model-{epoch+1}.pt')
0%| | 16/4786 [00:00<00:54, 86.80it/s]
Batch [0] --> loss: [1.1785672 1.4807242 1.1306101 1.2786355 1.0270504]
25%|██▌ | 1218/4786 [00:09<00:27, 131.37it/s]
Batch [1196] --> loss: [0.5144607 0.68563956 0.43091246 0.56429195 0.65197116]
50%|█████ | 2408/4786 [00:18<00:17, 134.28it/s]
Batch [2392] --> loss: [0.47660968 0.6510711 0.7598877 0.78759354 0.890825 ]
75%|███████▌ | 3598/4786 [00:27<00:09, 119.70it/s]
Batch [3588] --> loss: [0.50301814 0.6622437 0.7138787 0.8827888 0.9295403 ]
100%|██████████| 4786/4786 [00:35<00:00, 133.05it/s]
Batch [4784] --> loss: [0.48527205 0.7702761 0.65639585 0.9325873 1.0255642 ]
[1/50] train_loss: 0.8931142623784201
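# To evaluate or resume from a saved checkpoint later, reload the weights
# (a sketch; 'model-50.pt' assumes the full 50-epoch run finished):
# model.load_state_dict(torch.load('./logs/model-50.pt', map_location=device))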
# Prediction
@torch.no_grad()
def predict(data_loader):
model.eval()
predict_outputs = []
    for batch_x, _ in tqdm(data_loader):
logits = model(batch_x.to(device))
predicts = torch.argmax(logits, -1)
predicts = predicts.detach().cpu().numpy()
predict_outputs.append(predicts)
return np.concatenate(predict_outputs)
predict_tests = predict(test_loader)
predict_trains = predict(train_loader)
# Validation scores
def eval_metric(predicts, trues):
assert predicts.shape == trues.shape
num_label = predicts.shape[1]
for i in range(num_label):
y_pred, y_true = predicts[:,i], trues[:,i]
acc = np.mean(y_pred == y_true)
f1_s = f1_score(y_true, y_pred, average='macro')
print(f'[{i+1}/{num_label}] Acc: {round(acc,5)} | macro-F1: {round(f1_s,5)}')
print('eval train: ')
eval_metric(predict_trains[train_index[:-1]], train_data[train_index,feat_dim:])
print('\neval test: ')
eval_metric(predict_tests[test_index[:-1]], test_data[test_index,feat_dim:])
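# A closer per-label error breakdown (a sketch using sklearn's
# confusion_matrix; rows are true classes 0/1/2, columns are predictions):
from sklearn.metrics import confusion_matrix
y_true_test = test_data[test_index, feat_dim:]
y_pred_test = predict_tests[test_index[:-1]]
for i, name in enumerate(['label_5', 'label_10', 'label_20', 'label_40', 'label_60']):
    print(name)
    print(confusion_matrix(y_true_test[:, i], y_pred_test[:, i]))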
# Generate the submission files
submit_tests = predict_tests[test_index[:-1]]
offset = 0
for path in tqdm(df_test_paths['path']):
df_sub = pd.read_csv(path)[['uuid']]
length = df_sub.shape[0]
temp = submit_tests[offset:offset+length]
offset += length
df_temp = pd.DataFrame(temp, columns=['label_5', 'label_10', 'label_20', 'label_40', 'label_60'])
df_sub = pd.concat([df_sub, df_temp], axis=1)
filename = path.split('/')[-1]
df_sub.to_csv(f'submit/{filename}',index=False)
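# A final sanity check before zipping (a sketch): each submission file should
# keep exactly the uuid rows of its source file plus the five label columns.
sample_name = df_test_paths['path'][0].split('/')[-1]
df_check = pd.read_csv(f'submit/{sample_name}')
assert df_check.shape[0] == pd.read_csv(df_test_paths['path'][0]).shape[0]
assert list(df_check.columns) == ['uuid', 'label_5', 'label_10', 'label_20', 'label_40', 'label_60']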
! zip -r submit.zip submit/