K折交叉验证

导包

import pandas as pd
from sklearn.model_selection import StratifiedKFold

加载时

这里把 train 这个 DataFrame 按标签分层切分成 5 折,并在 fold 列中记录每一行属于哪一折的验证集

# Assign every row of `train` to one of 5 stratified validation folds.
# The "fold" column is created up front as an integer column (-1 = unassigned);
# every row is overwritten below because the validation folds partition the data.
skf = StratifiedKFold(n_splits=5)
train["fold"] = -1
fold_col = train.columns.get_loc("fold")
for fold, (_, val_) in enumerate(skf.split(X=train, y=train.label_id)):
    # split() yields positional indices, so write through .iloc; label-based
    # .loc would hit the wrong rows if `train` does not have a default index.
    train.iloc[val_, fold_col] = fold

训练时

可以通过手动调整 fold 的值,来选择用哪一折作为验证集(其余各折作为训练集)

# Choose which fold is held out for validation; all other rows are used for training.
fold = 0
is_val = train['fold'] == fold
tr_data = train[~is_val].reset_index(drop=True)
va_data = train[is_val].reset_index(drop=True)
# Both splits go through the same dataset class and share one tokenizer.
tr_dataset = TrainDataset(tr_data, tokenizer)
va_dataset = TrainDataset(va_data, tokenizer)
val_result = train_loop(fold, model, tr_dataset, va_dataset)

inference加载时

class TestDataset(Dataset):
    """Inference-time dataset built from title / assignee / abstract columns.

    Each item is the three text fields of one row joined with the tokenizer's
    separator token and then tokenized.

    Args:
        df: DataFrame providing the ``title``, ``assignee`` and ``abstract``
            columns. The values are concatenated with ``+``, so they must be
            strings.
        tokenizer: Callable exposing a ``sep_token`` attribute. It is called
            with the joined text plus ``truncation=True``, ``max_length`` and
            ``padding='max_length'`` and must return a mapping containing
            ``input_ids`` and ``attention_mask``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
            Defaults to 400, the value that used to be hard-coded.
    """

    def __init__(self, df, tokenizer, max_length=400):
        self.title = df['title'].values
        self.assignee = df['assignee'].values
        self.abstract = df['abstract'].values
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the source DataFrame."""
        return len(self.title)

    def __getitem__(self, item):
        """Return ``(input_ids, attention_mask)`` as ``torch.long`` tensors."""
        title = self.title[item]
        assignee = self.assignee[item]
        abstract = self.abstract[item]
        input_text = title + self.sep_token + assignee + self.sep_token + abstract
        inputs = self.tokenizer(
            input_text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
        )
        input_ids = torch.as_tensor(inputs['input_ids'], dtype=torch.long)
        attention_mask = torch.as_tensor(inputs['attention_mask'], dtype=torch.long)
        return input_ids, attention_mask

def infer(test_loader, model, device):
    """Run the model over ``test_loader`` and return hard predictions.

    Args:
        test_loader: Iterable of batches where ``batch[0]`` holds the input ids
            and ``batch[1]`` the attention mask.
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        device: Device the model and each batch are moved to.

    Returns:
        Tuple ``(predictions, probs)`` of NumPy arrays concatenated over all
        batches: the arg-max index along the last logits axis, and the softmax
        probability of that index.

    Raises:
        ValueError: If ``test_loader`` yields no batches (``np.concatenate``
            cannot concatenate an empty list).
    """
    model.to(device)
    model.eval()
    preds = []
    probs = []
    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and display a total; the step counter was never used.
    for batch in tqdm(test_loader):
        input_ids = batch[0].to(device)
        mask = batch[1].to(device)
        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=mask)
            class_probs = F.softmax(output.logits, dim=-1)
        prob, y_preds = class_probs.max(dim=-1)
        probs.append(prob.to('cpu').numpy())
        preds.append(y_preds.to('cpu').numpy())

    predictions = np.concatenate(preds)
    probs = np.concatenate(probs)
    return predictions, probs

def infer_5folds(test_loader, model, device):
    """Return the softmax probabilities of one model for every test sample.

    Unlike a hard-prediction pass, this keeps the full probability vector per
    sample so that the outputs of several fold models can be averaged.

    Args:
        test_loader: Iterable of batches where ``batch[0]`` holds the input ids
            and ``batch[1]`` the attention mask.
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        device: Device the model and each batch are moved to.

    Returns:
        NumPy array with the per-batch softmax outputs concatenated along
        axis 0, in loader order. Assuming the logits are
        ``(batch, num_classes)``, the result is ``(num_samples, num_classes)``.

    Raises:
        ValueError: If ``test_loader`` yields no batches (``np.concatenate``
            cannot concatenate an empty list).
    """
    model.to(device)
    model.eval()
    fold_probs = []
    for batch in tqdm(test_loader):
        input_ids = batch[0].to(device)
        mask = batch[1].to(device)
        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=mask)
            batch_probs = F.softmax(output.logits, dim=-1)
        # Collect every batch on the CPU; previously only the last batch's
        # tensor survived the loop and was passed to np.concatenate.
        fold_probs.append(batch_probs.to('cpu').numpy())

    return np.concatenate(fold_probs, axis=0)

# Ensemble the 5 fold checkpoints: average their class probabilities per test
# sample, take the arg-max as the label and write the submission file.

# The test loader does not depend on the fold, so it is built once.
test_dataset = TestDataset(test, tokenizer)
test_dataloader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size * 2,
                             shuffle=False,  # keep row order aligned with `test`
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

res = []
for fold in range(5):
    saved_path = CFG.OUTPUT_DIR + "{}_best{}.pth".format(CFG.model_path.replace('/', '_'), fold)
    model.load_state_dict(torch.load(saved_path)['model'])
    result_1fold = infer_5folds(test_dataloader, model, CFG.device)
    res.append(result_1fold)

# `res` stacks to (n_folds, n_samples, n_classes): average over the fold axis
# (axis 0), not over the samples, then pick the most probable class per sample.
res = np.mean(res, axis=0)
res = np.argmax(res, axis=-1)
test['label'] = res

test = test[['id', 'label']]
test.to_csv('submit_A.csv', index=False)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值