# Imports
import pandas as pd
from sklearn.model_selection import StratifiedKFold
# Data loading: split the `train` DataFrame into 5 stratified train/validation folds.
# Assign every row of `train` to one of 5 stratified folds, balanced on label_id.
# NOTE: StratifiedKFold.split ignores the `groups` argument, so the original
# groups=train.label_id had no effect and is dropped here.
skf = StratifiedKFold(n_splits=5)
for fold, (_, val_idx) in enumerate(skf.split(X=train, y=train.label_id)):
    train.loc[val_idx, "fold"] = int(fold)
# Training: choose the held-out fold by manually setting `fold` below.
# Hold out one fold for validation and train on the remaining four.
fold = 0
is_val = train['fold'] == fold
tr_data = train[~is_val].reset_index(drop=True)
va_data = train[is_val].reset_index(drop=True)
tr_dataset = TrainDataset(tr_data, tokenizer)
va_dataset = TrainDataset(va_data, tokenizer)
val_result = train_loop(fold, model, tr_dataset, va_dataset)
# Inference: data loading
class TestDataset(Dataset):
    """Inference dataset: joins title, assignee and abstract with the
    tokenizer's separator token and returns (input_ids, attention_mask)
    as long tensors, padded/truncated to 400 tokens.

    `df` is expected to have 'title', 'assignee' and 'abstract' columns.
    """

    def __init__(self, df, tokenizer):
        self.title = df['title'].values
        self.assignee = df['assignee'].values
        self.abstract = df['abstract'].values
        self.tokenizer = tokenizer
        self.sep_token = tokenizer.sep_token

    def __len__(self):
        return len(self.title)

    def __getitem__(self, item):
        # str() guards against missing (NaN) fields, which would otherwise
        # raise a TypeError when concatenated with the separator token.
        input_text = self.sep_token.join(
            (str(self.title[item]), str(self.assignee[item]), str(self.abstract[item]))
        )
        inputs = self.tokenizer(input_text, truncation=True, max_length=400,
                                padding='max_length')
        return torch.as_tensor(inputs['input_ids'], dtype=torch.long), \
               torch.as_tensor(inputs['attention_mask'], dtype=torch.long)
def infer(test_loader, model, device):
    """Run `model` over `test_loader` and return a pair of numpy arrays:
    (predicted class ids, softmax probability of each prediction)."""
    model.to(device)
    model.eval()
    all_preds = []
    all_probs = []
    for _, batch in tqdm(enumerate(test_loader)):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask)
        scores = F.softmax(output.logits, dim=-1)
        top_prob, top_class = scores.max(dim=-1)
        all_probs.append(top_prob.to('cpu').numpy())
        all_preds.append(top_class.to('cpu').numpy())
    return np.concatenate(all_preds), np.concatenate(all_probs)
def infer_5folds(test_loader, model, device):
    """Run one fold's model over `test_loader` and return the full softmax
    probability matrix, shape (num_samples, num_classes), for fold ensembling.

    Bug fix: the original assigned `predictions = np.concatenate(logits)`
    INSIDE the batch loop, overwriting the result each iteration (only the
    last batch survived) and flattening the 2-D batch of probabilities into
    1-D. Probabilities are now accumulated per batch and concatenated along
    the sample axis after the loop.
    """
    model.to(device)
    model.eval()
    batch_probs = []
    for _, batch in tqdm(enumerate(test_loader)):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = F.softmax(output.logits, dim=-1)
        batch_probs.append(probs.to('cpu').numpy())
    return np.concatenate(batch_probs, axis=0)
# 5-fold ensemble inference: average each fold's per-sample class
# probabilities, then take the argmax as the final label.
res = []
# The test dataset/dataloader are fold-invariant, so build them once.
test_dataset = TestDataset(test, tokenizer)
test_dataloader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size * 2,
                             shuffle=False,
                             num_workers=CFG.num_workers,
                             pin_memory=True, drop_last=False)
for fold in range(5):
    saved_path = CFG.OUTPUT_DIR + "{}_best{}.pth".format(CFG.model_path.replace('/', '_'), fold)
    # map_location lets a checkpoint saved on GPU load onto the current device.
    model.load_state_dict(torch.load(saved_path, map_location=CFG.device)['model'])
    # infer_5folds is expected to yield per-sample class probabilities.
    res.append(infer_5folds(test_dataloader, model, CFG.device))
# Bug fixes vs. original: average across folds along axis 0 (not axis=1),
# and take the argmax of the averaged probabilities `res` (the original
# referenced an undefined name `a`, a NameError at runtime).
res = np.mean(res, axis=0)
res = np.argmax(res, axis=-1)
test['label'] = res
test = test[['id', 'label']]
test.to_csv('submit_A.csv', index=None)