CCF Few-Shot Text Classification Baseline 0.565+
This is a text classification task with 36 classes. The baseline fine-tunes BERT with PyTorch, loading the hfl/chinese-roberta-wwm-ext pretrained model from Hugging Face. After parameter tuning, the best online score is 0.565 (seed 42, lr 2e-4, bert lr 5e-5, batch 16, epoch 19). Offline validation is extremely unstable, with gaps of 10+ points at worst; none of several different evaluation strategies tracks the leaderboard consistently, and they do not even rise and fall together. My guess is that with so little data the validation split is simply not good enough: if the val size is too small or the seed is unlucky, some hard classes end up underrepresented or missing entirely. A stratified split (sketched below) is one cheap mitigation.
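A minimal sketch of such a stratified train/validation split, assuming the same one-JSON-object-per-line format and `label_id` field used in data_helper.py below (file name and split ratio are illustrative):

```python
import json
from sklearn.model_selection import train_test_split

# One JSON object per line, each carrying a 'label_id' field.
with open('train.json', encoding='utf8') as f:
    anns = [json.loads(line) for line in f]
labels = [a['label_id'] for a in anns]

# Stratified 90/10 split: every class keeps roughly its overall proportion,
# so hard/rare classes do not vanish from the validation set.
# Note: stratification requires at least 2 samples per class.
train_anns, val_anns = train_test_split(
    anns, test_size=0.1, random_state=42, stratify=labels)
```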
Parameter settings: config.py
```python
import argparse

datapath = '/media/zsy/CCF/data/'


def parse_args():
    parser = argparse.ArgumentParser(description="Baseline for CCF Challenge 2022")
    parser.add_argument("--seed", type=int, default=42, help="random seed")
    parser.add_argument('--dropout', type=float, default=0.3, help='dropout ratio')
    parser.add_argument('--cls_dropout', type=float, default=0.1, help='dropout ratio of the classification head')
    parser.add_argument('--ema', type=bool, default=True, help='use EMA of model weights')
    parser.add_argument('--attack', type=str, default=None, help='adversarial training: None / fgm / pgd')
    parser.add_argument('--use_fp16', type=bool, default=False, help='mixed-precision training')
    parser.add_argument('--all', type=bool, default=False, help='train on all data without a validation split')

    # ========================= Data Configs ==========================
    parser.add_argument('--train_annotation', type=str, default=datapath + 'train.json')
    parser.add_argument('--test_annotation', type=str, default=datapath + 'testA.json')
    parser.add_argument('--test_output_csv', type=str, default=datapath + 'submission.csv')
    parser.add_argument('--val_ratio', default=0.1, type=float, help='split 10 percent of the training data as validation')
    parser.add_argument('--batch_size', default=16, type=int, help='batch size for training')
    parser.add_argument('--val_batch_size', default=128, type=int, help='batch size for validation')
    parser.add_argument('--test_batch_size', default=512, type=int, help='batch size for testing')
    parser.add_argument('--prefetch', default=16, type=int, help='prefetch factor for dataloaders')
    parser.add_argument('--num_workers', default=4, type=int, help='num_workers for dataloaders')

    # ======================== SavedModel Configs =========================
    parser.add_argument('--savedmodel_path', type=str, default='save')
    parser.add_argument('--ckpt_file', type=str, default='/media/zsy/CCF/save/flod_/model_epoch_19_mean_f1_0.6616.bin')
    parser.add_argument('--best_score', default=-0.5, type=float, help='save checkpoint if mean_f1 > best_score')

    # ========================= Learning Configs ==========================
    parser.add_argument('--max_epochs', type=int, default=30, help='how many epochs to train')
    parser.add_argument('--max_steps', default=50000, type=int, metavar='N', help='total number of training steps for the scheduler')
    parser.add_argument('--print_steps', type=int, default=20, help='number of steps between training-metric logs')
    parser.add_argument('--warmup_steps', default=200, type=int, help='warmup steps for parameters not in bert')
    parser.add_argument('--minimum_lr', default=0., type=float, help='minimum learning rate')
    parser.add_argument('--learning_rate', default=2e-4, type=float, help='initial learning rate')
    parser.add_argument("--weight_decay", default=0.01, type=float, help="weight decay if we apply some")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="epsilon for the Adam optimizer")

    # ========================== Title BERT =============================
    parser.add_argument('--bert_dir', type=str, default='hfl/chinese-roberta-wwm-ext')
    parser.add_argument('--test_bert_dir', type=str, default='roberta_wwm_chinese')
    parser.add_argument('--bert_cache', type=str, default='data/cache')
    parser.add_argument('--bert_learning_rate', type=float, default=5e-5)
    parser.add_argument('--bert_warmup_steps', type=int, default=5000)
    parser.add_argument('--bert_max_steps', type=int, default=30000)
    parser.add_argument("--bert_hidden_dropout_prob", type=float, default=0.1)
    parser.add_argument("--bert_output_dim", type=float, default=768)
    parser.add_argument("--bert_hidden_size", type=float, default=768)

    return parser.parse_args()
```
Data loading: data_helper.py
- The data is split 9:1 with a simple random.shuffle; StratifiedKFold multi-fold splits could also be used (see the sketch after the code below).
- The title, assignee, and abstract texts are tokenized separately with the RoBERTa tokenizer and then concatenated into a single input sequence.
- Because the data is so small, a simple repeat augmentation is used: the training data is duplicated once.
```python
import json
import random

import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer


def create_dataloaders(args, test_mode=False):
    val_ratio = args.val_ratio
    anns = list()
    with open(args.train_annotation, 'r', encoding='utf8') as f:
        for line in f.readlines():
            ann = json.loads(line)
            anns.append(ann)
    random.shuffle(anns)
    val_anns = anns[:int(val_ratio * len(anns))]
    train_anns = anns[int(val_ratio * len(anns)):]

    # repeat <offline enhance>: duplicate the training data once
    train_anns = train_anns + train_anns

    val_dataset = MultiModalDataset(args, val_anns)
    train_dataset = MultiModalDataset(args, train_anns)

    train_sampler = RandomSampler(train_dataset)
    val_sampler = SequentialSampler(val_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=train_sampler,
                                  drop_last=True,
                                  pin_memory=True,
                                  num_workers=args.num_workers,
                                  prefetch_factor=args.prefetch)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=args.val_batch_size,
                                sampler=val_sampler,
                                drop_last=False,
                                pin_memory=True,
                                num_workers=args.num_workers,
                                prefetch_factor=args.prefetch)
    return train_dataloader, val_dataloader


class MultiModalDataset(Dataset):
    def __init__(self, args, anns, test_mode: bool = False):
        self.test_mode = test_mode
        if test_mode:
            self.tokenizer = BertTokenizer.from_pretrained(args.test_bert_dir)
        else:
            self.tokenizer = BertTokenizer.from_pretrained(args.bert_dir)
        self.anns = anns

    def __len__(self) -> int:
        return len(self.anns)

    def __getitem__(self, idx: int) -> dict:
        title = self.anns[idx]['title']
        assignee = self.anns[idx]['assignee']
        abstract = self.anns[idx]['abstract']

        # <online enhance here>
        # Alternative: tokenize the concatenated text in one shot
        # text = title + assignee + abstract
        # text_inputs = self.tokenizer(title, max_length=512, padding='max_length', truncation=True)

        # Tokenize the three fields separately, drop the leading token of the
        # second and third segments, and concatenate everything into one input.
        text_inputs = {}
        title_inputs = self.tokenizer(title, max_length=30, padding='max_length', truncation=True)
        assignee_inputs = self.tokenizer(assignee, max_length=15, padding='max_length', truncation=True)
        abstract_inputs = self.tokenizer(abstract, max_length=450, padding='max_length', truncation=True)
        title_inputs['input_ids'][0] = 101  # force [CLS] at the very front
        assignee_inputs['input_ids'] = assignee_inputs['input_ids'][1:]
        abstract_inputs['input_ids'] = abstract_inputs['input_ids'][1:]
        assignee_inputs['attention_mask'] = assignee_inputs['attention_mask'][1:]
        abstract_inputs['attention_mask'] = abstract_inputs['attention_mask'][1:]
        assignee_inputs['token_type_ids'] = assignee_inputs['token_type_ids'][1:]
        abstract_inputs['token_type_ids'] = abstract_inputs['token_type_ids'][1:]
        for each in title_inputs:
            text_inputs[each] = title_inputs[each] + assignee_inputs[each] + abstract_inputs[each]
        text_inputs = {k: torch.LongTensor(v) for k, v in text_inputs.items()}
        text_mask = text_inputs['attention_mask']

        data = dict(
            text_inputs=text_inputs['input_ids'],
            text_mask=text_mask,
            text_type_ids=text_inputs['token_type_ids'],
        )

        # Load the label if not in test mode
        if not self.test_mode:
            data['label'] = torch.LongTensor([self.anns[idx]['label_id']])
        return data
```
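The notes above mention StratifiedKFold as an alternative to the plain shuffle split. A hedged sketch of how the same annotation list could be split into stratified folds (`make_folds` is a hypothetical helper, not part of the baseline):

```python
from sklearn.model_selection import StratifiedKFold


def make_folds(anns, n_splits=5, seed=42):
    """Yield (train_anns, val_anns) pairs whose validation sets are class-balanced."""
    labels = [a['label_id'] for a in anns]
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_idx, val_idx in skf.split(anns, labels):
        yield [anns[i] for i in train_idx], [anns[i] for i in val_idx]

# Usage: train one model per fold and ensemble the checkpoints with infer_multi.py.
# for fold, (train_anns, val_anns) in enumerate(make_folds(anns)):
#     ...
```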
Model: model.py
- RoBERTa weights are loaded into a standard bert-base model.
- The final feature is a last-4 mean pooling: each of BERT's last four hidden layers is mean-pooled over the sequence and the results are concatenated.
- A linear classification head then maps this feature to the 36 classes.
- The loss is plain cross-entropy; focal loss and similar alternatives could be tried later (a sketch follows the model code below).
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel


class clsModel(nn.Module):
    def __init__(self, args):
        super(clsModel, self).__init__()
        self.bert = BertModel.from_pretrained(args.bert_dir, output_hidden_states=True)
        # config = BertConfig(output_hidden_states=True)
        # self.bert = BertModel(config=config)
        self.cls = nn.Linear(768 * 4, 36)   # last-4-layer features -> 36 classes
        self.text_embedding = self.bert.embeddings
        self.text_cls = nn.Linear(768, 36)

    def build_pre_input(self, data):
        text_inputs = data['text_inputs']
        text_mask = data['text_mask']
        textembedding = self.text_embedding(text_inputs.cuda(), data['text_type_ids'].cuda())
        return textembedding, text_mask

    def forward(self, data, inference=False, multi=False):
        inputs_embeds, mask = self.build_pre_input(data)
        bert_out = self.bert(attention_mask=mask, inputs_embeds=inputs_embeds)

        # last 4 mean pooling: mean-pool each of the last four hidden layers
        # over the sequence dimension, then concatenate
        hidden_states = bert_out.hidden_states[-4:]
        hidden_states = [i.mean(dim=1) for i in hidden_states]
        out = self.cls(torch.cat(hidden_states, dim=1))

        if inference:
            if multi:
                return out                      # raw logits, used for ensembling
            else:
                return torch.argmax(out, dim=1)
        else:
            all_loss, all_acc, all_pre, label = self.cal_loss(out, data['label'].cuda())
            return all_loss, all_acc, all_pre, label

    @staticmethod
    def cal_loss(prediction, label):
        label = label.squeeze(dim=1)
        loss = F.cross_entropy(prediction, label)
        with torch.no_grad():
            pred_label_id = torch.argmax(prediction, dim=1)
            accuracy = (label == pred_label_id).float().sum() / label.shape[0]
        return loss, accuracy, pred_label_id, label
```
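A minimal multi-class focal-loss sketch that could replace `F.cross_entropy` inside `cal_loss`. The function and the `gamma`/`alpha` defaults are illustrative, not tuned values from the baseline:

```python
import torch.nn.functional as F


def focal_loss(logits, labels, gamma=2.0, alpha=None):
    """Multi-class focal loss: down-weights easy examples so rare/hard classes
    contribute more to the gradient. logits: (batch, 36), labels: (batch,)."""
    log_probs = F.log_softmax(logits, dim=-1)                            # (batch, 36)
    ce = F.nll_loss(log_probs, labels, weight=alpha, reduction='none')   # per-sample CE
    pt = log_probs.gather(1, labels.unsqueeze(1)).squeeze(1).exp()       # prob of true class
    return ((1.0 - pt) ** gamma * ce).mean()
```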
Basic setup, evaluation functions, and common tricks: util.py
- Layered learning rates: the BERT encoder uses a smaller learning rate than the classification head.
- Several different evaluation metrics are computed when debugging the validation set.
- Common tricks: EMA, SWA, FGM, PGD, R-Drop, F1 optimization, etc.; only EMA is used so far (an R-Drop sketch follows the util.py code below).
```python
import logging
import random
import warnings

import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import f1_score, accuracy_score
from transformers import AdamW, get_linear_schedule_with_warmup

warnings.filterwarnings("ignore")


def setup_device(args):
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    args.n_gpu = torch.cuda.device_count()


def setup_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)


def setup_logging():
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    return logger


# Layered learning rates: BERT parameters get bert_learning_rate,
# everything else (the classification head) gets learning_rate.
def build_optimizer(args, model):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [j for i, j in model.named_parameters()
                    if (not 'bert' in i and not any(nd in i for nd in no_decay))],
         'lr': args.learning_rate, 'weight_decay': args.weight_decay},
        {'params': [j for i, j in model.named_parameters()
                    if (not 'bert' in i and any(nd in i for nd in no_decay))],
         'lr': args.learning_rate, 'weight_decay': 0.0},
        {'params': [j for i, j in model.named_parameters()
                    if ('bert' in i and not any(nd in i for nd in no_decay))],
         'lr': args.bert_learning_rate, 'weight_decay': args.weight_decay},
        {'params': [j for i, j in model.named_parameters()
                    if ('bert' in i and any(nd in i for nd in no_decay))],
         'lr': args.bert_learning_rate, 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_steps,
                                                num_training_steps=args.max_steps)
    return optimizer, scheduler


def evaluate(predictions, labels):
    # predictions and labels are flat lists of class ids.
    # f1_macro_2 / f1_macro_4 ignore classes with very few validation samples.
    temp_dict = dict()
    no_ignores_2 = list()
    no_ignores_4 = list()
    for key in labels:
        temp_dict[key] = temp_dict.get(key, 0) + 1
    for i in range(36):
        if i in temp_dict.keys():
            if temp_dict[i] > 2:
                no_ignores_2.append(i)
            if temp_dict[i] > 4:
                no_ignores_4.append(i)
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_micro = f1_score(labels, predictions, average='micro')
    f1_weight = f1_score(labels, predictions, average='weighted')
    f1_macro_2 = f1_score(labels, predictions, labels=no_ignores_2, average='macro')
    f1_macro_4 = f1_score(labels, predictions, labels=no_ignores_4, average='macro')
    eval_results = {'f1_macro': f1_macro, 'f1_micro': f1_micro, 'f1_weight': f1_weight,
                    'f1_macro_2': f1_macro_2, 'f1_macro_4': f1_macro_4}
    return eval_results


# FGM: single-step adversarial perturbation on the word embeddings
class FGM:
    def __init__(self, model: nn.Module, eps=1.):
        self.model = (model.module if hasattr(model, "module") else model)
        self.eps = eps
        self.backup = {}

    # only attack the word embeddings
    def attack(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm and not torch.isnan(norm):
                    r_at = self.eps * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, para in self.model.named_parameters():
            if para.requires_grad and emb_name in name:
                assert name in self.backup
                para.data = self.backup[name]
        self.backup = {}


# PGD: multi-step adversarial perturbation projected back into an eps-ball
class PGD:
    def __init__(self, model, eps=1., alpha=0.3):
        self.model = (model.module if hasattr(model, "module") else model)
        self.eps = eps
        self.alpha = alpha
        self.emb_backup = {}
        self.grad_backup = {}

    def attack(self, emb_name='embeddings', is_first_attack=False):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                if is_first_attack:
                    self.emb_backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = self.alpha * param.grad / norm
                    param.data.add_(r_at)
                    param.data = self.project(name, param.data)

    def restore(self, emb_name='embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.emb_backup
                param.data = self.emb_backup[name]
        self.emb_backup = {}

    def project(self, param_name, param_data):
        r = param_data - self.emb_backup[param_name]
        if torch.norm(r) > self.eps:
            r = self.eps * r / torch.norm(r)
        return self.emb_backup[param_name] + r

    def backup_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                self.grad_backup[name] = param.grad.clone()

    def restore_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                param.grad = self.grad_backup[name]


# EMA: exponential moving average of the weights, applied at evaluation time
class EMA():
    def __init__(self, model, decay):
        self.model = model
        self.decay = decay
        self.shadow = {}
        self.backup = {}

    def register(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.shadow
                new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]
                self.shadow[name] = new_average.clone()

    def apply_shadow(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.shadow
                self.backup[name] = param.data
                param.data = self.shadow[name]

    def restore(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}
```
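Among the tricks listed above, R-Drop is not implemented in util.py. The idea is to run each batch through the model twice (dropout makes the two passes differ) and add a symmetric KL penalty between the two output distributions. A minimal sketch; the `rdrop_alpha` weight and the usage comments are illustrative:

```python
import torch.nn.functional as F


def rdrop_loss(logits1, logits2, labels, rdrop_alpha=4.0):
    """R-Drop: cross-entropy on both forward passes plus a symmetric KL term
    that forces the two dropout-perturbed predictions to agree."""
    ce = 0.5 * (F.cross_entropy(logits1, labels) + F.cross_entropy(logits2, labels))
    kl1 = F.kl_div(F.log_softmax(logits1, dim=-1), F.softmax(logits2, dim=-1), reduction='batchmean')
    kl2 = F.kl_div(F.log_softmax(logits2, dim=-1), F.softmax(logits1, dim=-1), reduction='batchmean')
    return ce + rdrop_alpha * 0.5 * (kl1 + kl2)

# Possible usage inside the training loop (two forward passes over the same batch):
# logits1 = model(batch, inference=True, multi=True)
# logits2 = model(batch, inference=True, multi=True)
# loss = rdrop_loss(logits1, logits2, batch['label'].squeeze(1).cuda())
```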
Training code: train.py
- Freezing part of the BERT layers was tried and did not help.
```python
import logging
import os
import time

import torch
from torch.cuda.amp import autocast as ac

from config import parse_args
from data_helper import create_dataloaders
from model import clsModel
from util import *

# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'


def validate(model, val_dataloader):
    model.eval()
    predictions = []
    labels = []
    losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            loss, _, pred_label_id, label = model(batch)
            loss = loss.mean()
            predictions.extend(pred_label_id.cpu().numpy())
            labels.extend(label.cpu().numpy())
            losses.append(loss.cpu().numpy())
    loss = sum(losses) / len(losses)
    results = evaluate(predictions, labels)
    model.train()
    return loss, results


def train_and_validate(args):
    # 1. load data
    if not os.path.exists(f'{args.savedmodel_path}/flod_'):
        os.makedirs(f'{args.savedmodel_path}/flod_')
    train_dataloader, val_dataloader = create_dataloaders(args)

    # 2. build model and optimizers
    model = clsModel(args)

    # Freezing part of the BERT layers was tried but did not help:
    # unfreeze_layers = ['layer.10', 'layer.11', 'bert.pooler', 'out.']
    # for name, param in model.bert.named_parameters():
    #     param.requires_grad = False
    #     for ele in unfreeze_layers:
    #         if ele in name:
    #             param.requires_grad = True
    #             break

    optimizer, scheduler = build_optimizer(args, model)
    if args.device == 'cuda':
        model = torch.nn.parallel.DataParallel(model.to(args.device))

    # ------- ema here -----------------
    if args.ema:
        ema = EMA(model, 0.999)
        ema.register()

    fgm, pgd = None, None
    if args.attack == 'fgm':
        fgm = FGM(model=model)
        print('fgming')
    elif args.attack == 'pgd':
        pgd = PGD(model=model)
        pgd_k = 3
        print('pgding')

    if args.use_fp16:
        scaler = torch.cuda.amp.GradScaler()

    model.train()
    loss, results = validate(model, val_dataloader)

    # 3. training
    step = 0
    best_score = args.best_score
    start_time = time.time()
    num_total_steps = len(train_dataloader) * args.max_epochs
    for epoch in range(args.max_epochs):
        for i, batch in enumerate(train_dataloader):
            model.train()
            if args.use_fp16:
                with ac():
                    loss, accuracy, _, _ = model(batch)
                loss = loss.mean()
                accuracy = accuracy.mean()
                scaler.scale(loss).backward()
            else:
                loss, accuracy, _, _ = model(batch)
                loss = loss.mean()
                accuracy = accuracy.mean()
                loss.backward()

            if fgm is not None:
                # FGM: perturb embeddings, accumulate the adversarial gradient, then restore
                fgm.attack()
                if args.use_fp16:
                    with ac():
                        loss_adv, _, _, _ = model(batch)
                else:
                    loss_adv, _, _, _ = model(batch)
                loss_adv = loss_adv.mean()
                if args.use_fp16:
                    scaler.scale(loss_adv).backward()
                else:
                    loss_adv.backward()
                fgm.restore()
            elif pgd is not None:
                # PGD: multi-step attack; only the last step keeps the accumulated gradients
                pgd.backup_grad()
                for _t in range(pgd_k):
                    pgd.attack(is_first_attack=(_t == 0))
                    if _t != pgd_k - 1:
                        model.zero_grad()
                    else:
                        pgd.restore_grad()
                    if args.use_fp16:
                        with ac():
                            loss_adv, _, _, _ = model(batch)
                    else:
                        loss_adv, _, _, _ = model(batch)
                    loss_adv = loss_adv.mean()
                    if args.use_fp16:
                        scaler.scale(loss_adv).backward()
                    else:
                        loss_adv.backward()
                pgd.restore()

            if args.use_fp16:
                scaler.unscale_(optimizer)
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            model.zero_grad()
            scheduler.step()

            if args.ema:
                # ------ ema update --------
                ema.update()

            step += 1

            # Mid-epoch validation (only triggers on long epochs)
            if i % (100000 // args.batch_size // 4) == 0 and i > 0 \
                    and i < (100000 // args.batch_size - 100000 // args.batch_size // 3 - 100) and epoch > 1:
                if args.ema:
                    ema.apply_shadow()
                loss, results = validate(model, val_dataloader)
                results = {k: round(v, 4) for k, v in results.items()}
                logging.info(f"Epoch {epoch} step {step}: loss {loss:.3f}, {results}")
                mean_f1 = results['f1_macro']
                if mean_f1 >= best_score:
                    best_score = mean_f1
                    torch.save({'epoch': epoch,
                                'model_state_dict': model.module.state_dict(),
                                'mean_f1': mean_f1},
                               f'{args.savedmodel_path}/flod_/model_epoch_{epoch}_{i}_mean_f1_{mean_f1}.bin')
                if args.ema:
                    # switch back to the live weights before continuing to train
                    ema.restore()

            if step % args.print_steps == 0:
                time_per_step = (time.time() - start_time) / max(1, step)
                remaining_time = time_per_step * (num_total_steps - step)
                remaining_time = time.strftime('%H:%M:%S', time.gmtime(remaining_time))
                logging.info(f"Epoch {epoch} step {step} eta {remaining_time}: loss {loss:.3f}, accuracy {accuracy:.3f}")

        if args.ema:
            # -------- ema shadow --------
            ema.apply_shadow()

        # 4. validation
        loss, results = validate(model, val_dataloader)
        results = {k: round(v, 4) for k, v in results.items()}
        logging.info(f"Epoch {epoch} step {step}: loss {loss:.3f}, {results}")

        # 5. save checkpoint
        mean_f1 = results['f1_macro']
        if mean_f1 > best_score:
            best_score = mean_f1
            torch.save({'epoch': epoch,
                        'model_state_dict': model.module.state_dict(),
                        'mean_f1': mean_f1},
                       f'{args.savedmodel_path}/flod_/model_epoch_{epoch}_mean_f1_{mean_f1}.bin')

        if args.ema:
            # -------- ema restore -------
            ema.restore()


def main():
    args = parse_args()
    setup_logging()
    setup_device(args)
    setup_seed(args)
    os.makedirs(args.savedmodel_path, exist_ok=True)
    logging.info("Training/evaluation parameters: %s", args)
    train_and_validate(args)


if __name__ == '__main__':
    main()
```
Inference: infer.py
```python
import json
import os

import torch
from torch.utils.data import SequentialSampler, DataLoader
from tqdm import tqdm

from config import parse_args
from data_helper import MultiModalDataset
from model import clsModel

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'


def inference():
    args = parse_args()
    print(args.ckpt_file)
    print(args.test_batch_size)

    # 1. load data (one JSON object per line, same format as training)
    anns = []
    with open(args.test_annotation, 'r', encoding='utf8') as f:
        for line in f.readlines():
            anns.append(json.loads(line))
    dataset = MultiModalDataset(args, anns, test_mode=True)
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset,
                            batch_size=args.test_batch_size,
                            sampler=sampler,
                            drop_last=False,
                            pin_memory=True,
                            num_workers=args.num_workers,
                            prefetch_factor=args.prefetch)

    # 2. load model
    model = clsModel(args)
    checkpoint = torch.load(args.ckpt_file, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'], strict=False)
    # model.half()
    if torch.cuda.is_available():
        model = torch.nn.parallel.DataParallel(model.cuda())
    model.eval()

    # 3. inference
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            pred_label_id = model(data=batch, inference=True)
            predictions.extend(pred_label_id.cpu().numpy())

    # 4. dump results
    with open(args.test_output_csv, 'w') as f:
        f.write('id,label\n')
        for pred_label_id, ann in zip(predictions, dataset.anns):
            f.write(f"{ann['id']},{pred_label_id}\n")


if __name__ == '__main__':
    inference()
```
Multi-model ensemble inference: infer_multi.py
```python
import json
import os

import numpy as np
import torch
from torch.utils.data import SequentialSampler, DataLoader

from config import parse_args
from data_helper import MultiModalDataset
from model import clsModel

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'


def inference():
    args = parse_args()

    # 1. load data
    anns = []
    with open(args.test_annotation, 'r', encoding='utf8') as f:
        for line in f.readlines():
            anns.append(json.loads(line))
    dataset = MultiModalDataset(args, anns, test_mode=True)
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset,
                            batch_size=args.test_batch_size,
                            sampler=sampler,
                            drop_last=False,
                            pin_memory=True,
                            num_workers=args.num_workers,
                            prefetch_factor=args.prefetch)

    # 2. load one model per fold
    models = []
    for i in range(5):
        model = clsModel(args)
        save_path = f'save/flod_{i}'
        best_model = os.path.join(save_path, 'model_best.bin')
        checkpoint = torch.load(best_model, map_location='cpu')
        model.load_state_dict(checkpoint['model_state_dict'])
        if torch.cuda.is_available():
            model = torch.nn.parallel.DataParallel(model.cuda())
        model.eval()
        models.append(model)

    # 3. inference: sum the logits of all models, then take the argmax
    all_outs = []
    for model in models:
        print('infering')
        predictions = []
        with torch.no_grad():
            for batch in dataloader:
                outs = model(batch, inference=True, multi=True)
                predictions.extend(outs.cpu().numpy())
        all_outs.append(np.array(predictions))
    all_outs = np.array(all_outs)
    out = np.sum(all_outs, axis=0)
    predictions = np.argmax(out, axis=1)

    # 4. dump results
    with open(args.test_output_csv, 'w') as f:
        f.write('id,label\n')
        for pred_label_id, ann in zip(predictions, dataset.anns):
            f.write(f"{ann['id']},{pred_label_id}\n")


if __name__ == '__main__':
    inference()
```
Final notes
Current issues with the baseline:
- Offline validation is unstable because of the data issues described above: the best offline score is around 0.67 while the online score is 0.565.
- The repeat augmentation is too naive.
- The hyperparameters are not yet tuned to the optimum, and I am honestly a bit lost here.
Directions worth trying:
- Text augmentation, feature engineering, bag-of-words / term-frequency models, etc.
- Adversarial training, R-Drop, F1 optimization, and similar tricks
- Doing your own MLM pretraining on the competition corpus
- Contrastive learning, contrastive losses
- Pseudo labels, switching to larger pretrained models (a pseudo-labeling sketch follows this list)
- Ensemble is all you need
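For the pseudo-label direction, one common recipe is to predict the unlabeled test set with the current model, keep only high-confidence predictions, and mix them back into the training annotations for another round of fine-tuning. A hedged sketch under those assumptions; `make_pseudo_labels` and the 0.9 threshold are illustrative, not part of the baseline:

```python
import torch
import torch.nn.functional as F


def make_pseudo_labels(model, dataloader, anns, threshold=0.9):
    """Return annotation dicts for test samples whose top predicted class
    probability exceeds `threshold`, labeled with the model's prediction."""
    model.eval()
    probs_all = []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch, inference=True, multi=True)   # raw 36-way logits
            probs_all.append(F.softmax(logits, dim=-1).cpu())
    probs = torch.cat(probs_all, dim=0)
    conf, pred = probs.max(dim=-1)
    pseudo = []
    for ann, c, p in zip(anns, conf.tolist(), pred.tolist()):
        if c >= threshold:
            pseudo.append({**ann, 'label_id': p})
    return pseudo

# The returned records could then be appended to train_anns in data_helper.py.
```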
This code is adapted from the 2022 WeChat Big Data Challenge baseline. I hope everyone gets a good score!