AI4Code PyTorch DistilBERT Baseline (Kaggle Competition)

Imports

import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

BERT_PATH = "../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased"

data_dir = Path('../input/AI4Code')

Reading the training files

NUM_TRAIN = 10000  # optional cap on the number of notebooks to read (see the commented slice below)


def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )


paths_train = list((data_dir / 'train').glob('*.json'))  # append [:NUM_TRAIN] to subsample
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

df


Reading the cell-order file

df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,  # return a Series; on pandas >= 2.0 drop this kwarg and call .squeeze("columns") instead
).str.split()  # split the space-separated string of cell_ids into a list

df_orders

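To make the structure concrete, here is a minimal sketch, with hypothetical notebook and cell ids, of what the split produces:

s = pd.Series({'nb_a': 'c1 c2 c3'}, name='cell_order')  # hypothetical id and cell ids
s.str.split()  # nb_a -> ['c1', 'c2', 'c3']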

Displaying an example notebook

# Get an example notebook
nb_id = df.index.unique('id')[6]
print('Notebook:', nb_id)

print("The disordered notebook:")
nb = df.loc[nb_id, :]
display(nb)
print()
cell_order = df_orders.loc[nb_id]

print("The ordered notebook:")
nb.loc[cell_order, :]


Building df_ranks

def get_ranks(base, derived):
    """Return, for each cell_id in derived, its index (rank) in the base order."""
    return [base.index(d) for d in derived]


cell_ranks = get_ranks(cell_order, list(nb.index))
nb.insert(0, 'rank', cell_ranks)

nb

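A quick sanity check of get_ranks on hypothetical ids:

# base order ['a', 'b', 'c']; derived (shuffled) order ['c', 'a', 'b']
get_ranks(['a', 'b', 'c'], ['c', 'a', 'b'])  # -> [2, 0, 1]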

For each id, collect the ordered and the disordered cell_ids side by side

df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)
df_orders_


ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')  # name the index axis
    .apply(pd.Series.explode)  # expand each list into one row per element
    .set_index('cell_id', append=True)  # index by (id, cell_id)
)

df_ranks

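The apply(pd.Series.explode) step is the least obvious part; here is a minimal sketch, on a toy frame with hypothetical values, of what it does:

toy = pd.DataFrame({'cell_id': [['c1', 'c2']], 'rank': [[1, 0]]}, index=['nb_a'])
toy.apply(pd.Series.explode)
# the single nb_a row becomes two rows: (cell_id='c1', rank=1) and (cell_id='c2', rank=0)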

Reading ancestor information

df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
df_ancestors


Merge the ancestor and rank information into df

df = df.reset_index().merge(df_ranks, on=["id", "cell_id"]).merge(df_ancestors, on=["id"])
df


df["pct_rank"] = df["rank"] / df.groupby("id")["cell_id"].transform("count") # 计算每个notebook有多少cell,然后对rank进行归一化处理 其实是定义好了cell的排列顺序
# 对归一化后的rank进行分箱统计
df["pct_rank"].hist(bins=10)


Data preprocessing

Train/validation split

from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # size of validation set

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))  # group-wise split; see reference 2 for details

train_df = df.loc[train_ind].reset_index(drop=True)
val_df = df.loc[val_ind].reset_index(drop=True)

train_df.head()

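A minimal sketch, on toy data and reusing the GroupShuffleSplit class imported above, of the property this splitter guarantees: every group lands entirely on one side of the split, so notebooks that share an ancestor never leak between train and validation:

toy_X = np.arange(8).reshape(-1, 1)
toy_groups = ['g1', 'g1', 'g2', 'g2', 'g3', 'g3', 'g4', 'g4']
tr_idx, va_idx = next(GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
                      .split(toy_X, groups=toy_groups))
# each of g1..g4 appears only in tr_idx or only in va_idx, never in both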

Processing the data by cell_type

train_df_mark = train_df[train_df["cell_type"] == "markdown"].reset_index(drop=True)
val_df_mark = val_df[val_df["cell_type"] == "markdown"].reset_index(drop=True)

Metric functions

from bisect import bisect


def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

Example computation:

y_dummy = val_df.groupby('id')['cell_id'].apply(list)
kendall_tau(df_orders.loc[y_dummy.index], y_dummy)

0.42511216883092573
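
As a sanity check on the metric itself, a perfectly ordered prediction scores 1 and a fully reversed one scores -1:

kendall_tau([['a', 'b', 'c']], [['a', 'b', 'c']])  # 1.0
kendall_tau([['a', 'b', 'c']], [['c', 'b', 'a']])  # -1.0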

Modeling

Choosing the NLP model

from tqdm import tqdm
import sys, os
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.metrics import mean_squared_error  # used in train() below
import torch.nn.functional as F
import torch.nn as nn
import torch

MAX_LEN = 128

class MarkdownModel(nn.Module):
    def __init__(self):
        super(MarkdownModel, self).__init__()
        self.distill_bert = DistilBertModel.from_pretrained(BERT_PATH)
        self.top = nn.Linear(768, 1)
        
    def forward(self, ids, mask):
        x = self.distill_bert(ids, mask)[0]
        # project the last-layer [CLS] embedding down to a single scalar
        x = self.top(x[:, 0, :])
        return x
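
A quick shape check (a sketch, assuming the pretrained weights under BERT_PATH are available): the model maps a batch of token ids and attention masks to one scalar per example:

m = MarkdownModel()
dummy_ids = torch.randint(0, 1000, (2, MAX_LEN))       # two hypothetical token-id sequences
dummy_mask = torch.ones(2, MAX_LEN, dtype=torch.long)  # attend to every position
with torch.no_grad():
    print(m(dummy_ids, dummy_mask).shape)  # expected: torch.Size([2, 1])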

Wrapping the data in a Dataset

from torch.utils.data import DataLoader, Dataset


class MarkdownDataset(Dataset):

    def __init__(self, df, max_len):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = DistilBertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])

        return ids, mask, torch.FloatTensor([row.pct_rank])

    def __len__(self):
        return self.df.shape[0]


train_ds = MarkdownDataset(train_df_mark, max_len=MAX_LEN)
val_ds = MarkdownDataset(val_df_mark, max_len=MAX_LEN)

val_ds[0]


BS = 32  # batch size
NW = 8   # num_workers: number of CPU worker processes used to prepare data

train_loader = DataLoader(train_ds, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)
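
A quick peek at one batch to confirm the shapes the model receives:

ids, mask, target = next(iter(train_loader))
print(ids.shape, mask.shape, target.shape)
# expected: torch.Size([32, 128]) torch.Size([32, 128]) torch.Size([32, 1])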

Utility functions

def adjust_lr(optimizer, epoch):
    """Set the learning rate according to the epoch."""
    if epoch < 1:
        lr = 5e-5
    elif epoch < 2:
        lr = 1e-3
    elif epoch < 5:
        lr = 1e-4
    else:
        lr = 1e-5

    for p in optimizer.param_groups:
        p['lr'] = lr
    return lr


def get_optimizer(net):
    """Build an Adam optimizer over the model's trainable parameters."""
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=3e-4, betas=(0.9, 0.999),
                                 eps=1e-08)
    return optimizer

def read_data(data):
    """Move a batch to the GPU; return the first n-1 tensors as the input tuple and the last one separately, i.e. split X and y."""
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()

Defining the training and validation functions

def validate(model, val_loader):
    model.eval()
    
    tbar = tqdm(val_loader, file=sys.stdout)
    
    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            pred = model(inputs[0], inputs[1])

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())
    
    return np.concatenate(labels), np.concatenate(preds)  # concatenate per-batch results


def train(model, train_loader, val_loader, epochs):
    np.random.seed(0)
    
    optimizer = get_optimizer(model)

    criterion = torch.nn.MSELoss()
    
    for e in range(epochs):   
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)
        
        lr = adjust_lr(optimizer, e)
        
        loss_list = []
        preds = []
        labels = []

        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            optimizer.zero_grad()
            pred = model(inputs[0], inputs[1])

            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()
            
            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())
            
            avg_loss = np.round(np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss} lr: {lr}")
            
        y_val, y_pred = validate(model, val_loader)
            
        print("Validation MSE:", np.round(mean_squared_error(y_val, y_pred), 4))
        print()
    return model, y_pred

Training and prediction

model = MarkdownModel()  # predicts the percentile rank of markdown cells
model = model.cuda()
model, y_pred = train(model, train_loader, val_loader, epochs=3)  # y_pred holds the validation-set predictions

# Initialize pred with the within-group percentile rank; markdown entries are overwritten below
val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
val_df


# Insert the model's predictions for markdown cells; code cells keep their pct_rank-based pred
val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred
val_df


# Sorting by pred recovers the predicted cell_id order for each id
y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
y_dummy


# Evaluate the predicted ordering
kendall_tau(df_orders.loc[y_dummy.index], y_dummy)
# 0.6471850137972645

Predicting on the test set

Reading the test files

paths_test = list((data_dir / 'test').glob('*.json'))
notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]
test_df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
).reset_index()
test_df


# The test set lacks ancestor and rank information,
# so initialize each cell's rank with its running index within the notebook
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()  # cumulative count within (id, cell_type)
test_df

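A minimal sketch, with hypothetical cells, of what cumcount yields per (id, cell_type) group:

toy = pd.DataFrame({'id': ['n1'] * 4,
                    'cell_type': ['code', 'markdown', 'code', 'markdown']})
toy.groupby(['id', 'cell_type']).cumcount()  # -> 0, 0, 1, 1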

# Compute pct_rank from rank and store it in the pred column
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
test_df


Preprocessing the test set

# Initialize pct_rank to 0; the Dataset expects a label column even though it is unused at inference
test_df["pct_rank"] = 0
test_ds = MarkdownDataset(test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), max_len=MAX_LEN)
test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

len(test_ds), test_ds[0]


_, y_test = validate(model, test_loader)
# Fill in the predicted values for the markdown cells
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test

Saving the submission file

# Sort by pred and join each notebook's cell_ids into the submission format
sub_df = test_df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
sub_df.head()


sub_df.to_csv("submission.csv", index=False)

Summary

  1. Data preprocessing: the training set's id, cell_id, cell_type, source, rank, ancestor_id, and parent_id are combined into one table; to suit the pretrained model, the markdown cells' source is extracted as X and the normalized rank serves as y.
  2. For the train/validation split, GroupShuffleSplit uses ancestor_id as the group label, so the model learns ordering relations within groups of related notebooks without leakage across the split.
  3. Training on this data lets the model predict where each markdown cell belongs within its notebook.

References

  1. https://www.kaggle.com/code/aerdem4/ai4code-pytorch-distilbert-baseline
  2. https://blog.csdn.net/y1040468929/article/details/124977901