# Import basic dependencies
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
from sklearn.metrics import f1_score
import shutil
# Prepare the working folders (recreate them from scratch)
folders = ['logs', 'submit']
for folder in folders:
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.makedirs(folder, exist_ok=True)
# Collect the data file paths
def get_paths(dir_path):
    """Collect the CSV paths under dir_path and parse symbol, date and session from each filename."""
    df_paths = pd.DataFrame({'path': glob(f'{dir_path}/*.csv')})
    df_paths['sym'] = df_paths['path'].apply(lambda x: int(x.split('_')[-3][3:]))   # e.g. 'sym8' -> 8
    df_paths['date'] = df_paths['path'].apply(lambda x: int(x.split('_')[-2][4:]))  # e.g. 'date0' -> 0
    df_paths['seg'] = df_paths['path'].apply(lambda x: x.split('_')[-1][:2])        # session, e.g. 'am'
    return df_paths.sort_values('date').reset_index(drop=True)
df_train_paths = get_paths('data/train')
df_test_paths = get_paths('data/test')
df_train_paths.head()
                                    path  sym  date seg
0  data/train/snapshot_sym8_date0_am.csv    8     0  am
1  data/train/snapshot_sym9_date0_am.csv    9     0  am
2  data/train/snapshot_sym1_date0_am.csv    1     0  am
3  data/train/snapshot_sym3_date0_am.csv    3     0  am
4  data/train/snapshot_sym0_date0_am.csv    0     0  am
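# A quick sanity check of the filename parsing above (a minimal sketch; the
# sample path is taken from the head() output): split('_') yields
# ['data/train/snapshot', 'sym8', 'date0', 'am.csv'].
sample = 'data/train/snapshot_sym8_date0_am.csv'
parts = sample.split('_')
assert (int(parts[-3][3:]), int(parts[-2][4:]), parts[-1][:2]) == (8, 0, 'am')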
# Label the test set locally so we can validate offline.
# Note: never submit these generated labels directly -- they would effectively
# be a ~0.999-score answer key.
def labeling(series, ticks, alpha):
    """
    Re-implementation of the official labeling rule: compare the mid-price
    `ticks` steps ahead with the current one and bucket the move by `alpha`.
    """
    series_shifted = series.shift(-ticks)
    deltas = series_shifted - series
    labels = []
    for x in deltas:
        if x < -alpha:        # down move
            labels.append(0)
        elif x > alpha:       # up move
            labels.append(2)
        else:                 # flat (also catches the trailing NaNs)
            labels.append(1)
    return labels
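# An equivalent vectorized form of labeling() (a sketch using np.select, not
# the official implementation). Trailing NaN deltas fail both comparisons and
# fall through to the default class 1, matching the loop above.
def labeling_vectorized(series, ticks, alpha):
    deltas = series.shift(-ticks) - series
    return np.select([deltas < -alpha, deltas > alpha], [0, 2], default=1).tolist()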
labels = [  # [label name, prediction horizon in ticks, threshold alpha]
['label_5', 5, 0.0005], ['label_10', 10, 0.0005],
['label_20', 20, 0.001], ['label_40', 40, 0.001],
['label_60', 60, 0.001]
]
for path in tqdm(df_test_paths['path']):
df = pd.read_csv(path)
for item in labels:
label_i, ticks, alpha = item
df[label_i] = labeling(df['n_midprice'], ticks, alpha)
df.to_csv(path, index=False)
# Data extraction
# A "magic" row of values used as a separator between files
cypher = [
2, 0, 2, 3,
0, 8, 0, 5,
1, 3, 1, 4,
0, 4, 1, 7,
1, 3, 1, 4,
0, 4, 1, 7,
9,
1, 1, 1, 1, 1
]
def feature_process(path):
    """
    Extract the features and labels from a CSV file into a numpy array,
    and append one separator row (the cypher) at the end of each file.
    """
    df = pd.read_csv(path)
df['time_hour'] = df['time'].apply(lambda x: int(x.split(':')[0]))
df['time_min'] = df['time'].apply(lambda x: int(x.split(':')[1]))
feature_cols = [
'n_close',
'amount_delta',
'n_midprice',
'n_bid1',
'n_bsize1',
'n_bid2',
'n_bsize2',
'n_bid3',
'n_bsize3',
'n_bid4',
'n_bsize4',
'n_bid5',
'n_bsize5',
'n_ask1',
'n_asize1',
'n_ask2',
'n_asize2',
'n_ask3',
'n_asize3',
'n_ask4',
'n_asize4',
'n_ask5',
'n_asize5',
'time_hour',
'time_min',
]
target_cols = ['label_5', 'label_10', 'label_20', 'label_40', 'label_60']
    try:
        x = df[feature_cols + target_cols].values
    except KeyError:
        # unlabeled files (e.g. the raw test set): pad the label columns with zeros
        x = df[feature_cols].values
        t = np.zeros((x.shape[0], len(target_cols)))
        x = np.concatenate((x, t), axis=1)
split = np.array([cypher])
return np.concatenate((x, split), axis=0)
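# A quick structural check (a sketch): every processed file should yield
# 25 feature columns + 5 label columns = 30 columns, with the cypher row last.
sample_arr = feature_process(df_train_paths['path'][0])
assert sample_arr.shape[1] == 30
assert (sample_arr[-1] == np.array(cypher)).all()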
# Merge the data from all files
train_data = np.vstack([feature_process(path) for path in tqdm(df_train_paths['path'])])
test_data = np.vstack([feature_process(path) for path in tqdm(df_test_paths['path'])])
print(train_data.shape, test_data.shape)
100%|██████████| 1225/1225 [00:07<00:00, 158.16it/s]
100%|██████████| 296/296 [00:01<00:00, 165.19it/s]
(2450000, 30) (592000, 30)
# Build boolean masks selecting the valid data rows (separator rows are excluded)
train_index = train_data.sum(axis=1) != np.sum(cypher)
test_index = test_data.sum(axis=1) != np.sum(cypher)
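# Sanity check (a sketch): exactly one separator row per file should be masked
# out, assuming no real data row happens to sum to exactly sum(cypher).
assert (~train_index).sum() == len(df_train_paths)
assert (~test_index).sum() == len(df_test_paths)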
# In a live setting the mean/std of future data are unavailable, so only training-set statistics are used
mean = train_data[train_index,:-5].mean(axis=0)
std = train_data[train_index,:-5].std(axis=0)
# Standardize the features to zero mean and unit variance
train_data[train_index,:-5] = (train_data[train_index,:-5] - mean) / std
test_data[test_index,:-5] = (test_data[test_index,:-5] - mean) / std
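# At inference time the very same training statistics must be reused, so it is
# worth persisting them (a minimal sketch; the file names are illustrative):
np.save('logs/norm_mean.npy', mean)
np.save('logs/norm_std.npy', std)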
# Build the data loading pipeline
import torch
from torch.utils.data import Dataset, DataLoader
class MyDataset(Dataset):
"""
自定义数据集
"""
def __init__(self, data, timestep, feat_dim) -> None:
super(MyDataset, self).__init__()
self.data = data
self.timestep = timestep
self.feat_dim = feat_dim
self.num_sample = data.shape[0]
self.offset = 0
def __getitem__(self, index):
"""
获取一个样本的输入x 和标签y
Note: 将timestep帧历史数据(包含当前时刻)作为输入, 缺失的历史帧用0填充
"""
        current_values = self.data[index]
        if self.offset > index:
            # a new pass over the data has started: reset the file boundary
            self.offset = 0
        if current_values.sum() == np.sum(cypher):
            # landed on a separator row: step to the first row of the next file
            index += 1
            self.offset = index
            current_values = self.data[index]
        head = index + 1
        tail = index + 1 - self.timestep
        tail = 0 if tail < 0 else tail
        tail = self.offset if tail < self.offset else tail  # never cross the file boundary
x = self.data[tail:head, :self.feat_dim]
if x.shape[0] < self.timestep:
pad_values = np.zeros((self.timestep-x.shape[0], x.shape[1]))
x = np.concatenate((pad_values, x), axis=0)
y = current_values[self.feat_dim:]
x = torch.from_numpy(x).float()
return x, y
    def __len__(self):
        # the final row of the array is a separator, so it is never indexed
        return self.num_sample - 1
timestep = 30  # use a history window of 30 ticks (including the current tick) as input
feat_dim = 25
batch_size = 512
num_workers = 2
train_loader = DataLoader(
dataset=MyDataset(
data=train_data,
timestep=timestep,
feat_dim=feat_dim
),
batch_size=batch_size,
shuffle=False,
num_workers=num_workers
)
test_loader = DataLoader(
dataset=MyDataset(
data=test_data,
timestep=timestep,
feat_dim=feat_dim
),
batch_size=batch_size,
shuffle=False,
num_workers=num_workers
)
for (batch_x, batch_y) in train_loader:
print(batch_x.shape, batch_y.shape)
break
torch.Size([512, 30, 25]) torch.Size([512, 5])
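# Spot-check of the separator handling (a sketch): indexing the dataset at the
# first separator position should transparently return the first row of the
# next file, zero-padded to a full window.
sep_positions = np.where(~train_index)[0]
ds = MyDataset(train_data, timestep, feat_dim)
x_sep, _ = ds[int(sep_positions[0])]
assert x_sep.shape == (timestep, feat_dim)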
# Model design
import torch.nn as nn
import torch.optim as optim
class BranchNet(nn.Module):
"""
单分支网络,可用于单个标签的建模
"""
def __init__(self, input_size, hidden_size, num_layers, output_size) -> None:
super(BranchNet, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.block = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
self.head = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.BatchNorm1d(hidden_size),
nn.ReLU(),
nn.Linear(hidden_size, output_size)
)
def forward(self, x):
output = self.block(x)[0]
logits = self.head(output[:,-1,:])
return logits
class Nets(nn.Module):
"""
模型组网,同时输出多个标签的预测结果
"""
def __init__(self, input_size, hidden_size:list, num_layers:list, output_size):
super(Nets, self).__init__()
self.branchs = nn.ModuleList([
BranchNet(input_size,h_s,n_l,output_size) for h_s,n_l in zip(hidden_size, num_layers)
])
def forward(self, x):
outputs = torch.stack([bi(x) for bi in self.branchs], dim=1)
return outputs
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Nets(feat_dim,
hidden_size=[32, 32, 32, 32, 32],
num_layers=[3, 3, 3, 3, 3],
output_size=3).to(device)
logits = model(batch_x.to(device))
logits.shape
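# Model size check (a sketch): five small 3-layer GRU branches keep the total
# parameter count modest.
num_params = sum(p.numel() for p in model.parameters())
print(f'total parameters: {num_params:,}')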
# Model training
epochs = 50  # number of training epochs
verbose = len(train_loader) // 4
save_per_epoch = 10  # checkpoint saving period (in epochs)
loss_fun = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(epochs):
model.train()
train_loss = []
for batch_idx, (batch_x, batch_y) in enumerate(tqdm(train_loader)):
batch_x, batch_y = batch_x.to(device), batch_y.to(device)
logits = model(batch_x)
        # compute the cross-entropy loss for each of the 5 labels separately
        loss_list = torch.stack([loss_fun(logits[:,i,:], batch_y[:,i].long()) for i in range(5)])
        # average the 5 losses into a single scalar for backpropagation
        loss = torch.mean(loss_list)
optimizer.zero_grad()
loss.backward()
        # gradient clipping
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10, norm_type=2)
        # weight update
optimizer.step()
train_loss.append(loss.item())
        if batch_idx % verbose == 0:
            print(f'Batch [{batch_idx}] --> loss: {loss_list.detach().cpu().numpy()}')
train_loss = np.mean(train_loss)
print(f'[{epoch+1}/{epochs}] train_loss: {train_loss}\n')
if (epoch+1)%save_per_epoch == 0:
torch.save(model.state_dict(), f'./logs/model-{epoch+1}.pt')
0%| | 16/4786 [00:00<00:54, 86.80it/s]
Batch [0] --> loss: [1.1785672 1.4807242 1.1306101 1.2786355 1.0270504]
25%|██▌ | 1218/4786 [00:09<00:27, 131.37it/s]
Batch [1196] --> loss: [0.5144607 0.68563956 0.43091246 0.56429195 0.65197116]
50%|█████ | 2408/4786 [00:18<00:17, 134.28it/s]
Batch [2392] --> loss: [0.47660968 0.6510711 0.7598877 0.78759354 0.890825 ]
75%|███████▌ | 3598/4786 [00:27<00:09, 119.70it/s]
Batch [3588] --> loss: [0.50301814 0.6622437 0.7138787 0.8827888 0.9295403 ]
100%|██████████| 4786/4786 [00:35<00:00, 133.05it/s]
Batch [4784] --> loss: [0.48527205 0.7702761 0.65639585 0.9325873 1.0255642 ]
[1/50] train_loss: 0.8931142623784201
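# To evaluate or resume from a saved checkpoint later, reload the weights
# (a sketch; 'model-50.pt' assumes the full 50-epoch run finished):
# model.load_state_dict(torch.load('./logs/model-50.pt', map_location=device))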
# Prediction
@torch.no_grad()
def predict(data_loader):
model.eval()
predict_outputs = []
    for batch_x, _ in tqdm(data_loader):
logits = model(batch_x.to(device))
predicts = torch.argmax(logits, -1)
predicts = predicts.detach().cpu().numpy()
predict_outputs.append(predicts)
return np.concatenate(predict_outputs)
predict_tests = predict(test_loader)
predict_trains = predict(train_loader)
# Validation scores
def eval_metric(predicts, trues):
assert predicts.shape == trues.shape
num_label = predicts.shape[1]
for i in range(num_label):
y_pred, y_true = predicts[:,i], trues[:,i]
acc = np.mean(y_pred == y_true)
f1_s = f1_score(y_true, y_pred, average='macro')
print(f'[{i+1}/{num_label}] Acc: {round(acc,5)} | macro-F1: {round(f1_s,5)}')
print('eval train: ')
eval_metric(predict_trains[train_index[:-1]], train_data[train_index,feat_dim:])
print('\neval test: ')
eval_metric(predict_tests[test_index[:-1]], test_data[test_index,feat_dim:])
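# A closer per-label error breakdown (a sketch using sklearn's
# confusion_matrix; rows are true classes 0/1/2, columns are predictions):
from sklearn.metrics import confusion_matrix
y_true_test = test_data[test_index, feat_dim:]
y_pred_test = predict_tests[test_index[:-1]]
for i, name in enumerate(['label_5', 'label_10', 'label_20', 'label_40', 'label_60']):
    print(name)
    print(confusion_matrix(y_true_test[:, i], y_pred_test[:, i]))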
# Generate the submission files
submit_tests = predict_tests[test_index[:-1]]
offset = 0
for path in tqdm(df_test_paths['path']):
df_sub = pd.read_csv(path)[['uuid']]
length = df_sub.shape[0]
temp = submit_tests[offset:offset+length]
offset += length
df_temp = pd.DataFrame(temp, columns=['label_5', 'label_10', 'label_20', 'label_40', 'label_60'])
df_sub = pd.concat([df_sub, df_temp], axis=1)
filename = path.split('/')[-1]
df_sub.to_csv(f'submit/{filename}',index=False)
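# A final sanity check before zipping (a sketch): each submission file should
# keep exactly the uuid rows of its source file plus the five label columns.
sample_name = df_test_paths['path'][0].split('/')[-1]
df_check = pd.read_csv(f'submit/{sample_name}')
assert df_check.shape[0] == pd.read_csv(df_test_paths['path'][0]).shape[0]
assert list(df_check.columns) == ['uuid', 'label_5', 'label_10', 'label_20', 'label_40', 'label_60']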
! zip -r submit.zip submit/