1 Preparation
Competition page: 2023 iFLYTEK A.I. Developer Competition - iFLYTEK Open Platform
Data download: iFLYTEK Academic Paper Classification Challenge dataset - PaddlePaddle AI Studio
Environment
- HuggingFace official site: GitHub - huggingface/transformers: 🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX.
- HuggingFace model hub: https://huggingface.co/
Files required by the model (typically config.json, vocab.txt, and pytorch_model.bin):
1 Download the files and save them locally:
2 Load the model
# Import transformers
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
PRE_TRAINED_MODEL_NAME = '../bert-base-uncased' # English BERT pre-trained model
tokenizer = BertTokenizer.from_pretrained("../bert-base-uncased/")
tokenizer
"""
BertTokenizer(name_or_path='../bert-base-uncased/', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
('[PAD]', 0)
('[UNK]', 100)
('[CLS]', 101)
('[SEP]', 102)
('[MASK]', 103)
"""
2 Data Analysis
# Import transformers
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
# Import torch
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
# Common utilities
import re
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from tqdm import tqdm
%matplotlib inline
%config InlineBackend.figure_format='retina' # high-resolution inline figures
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
train = pd.read_csv('../data/train.csv', sep='\t')
test = pd.read_csv('../data/test.csv', sep='\t')
sub = pd.read_csv('../data/sample_submit.csv')
# Concatenate title and abstract into a single text field
train['text'] = train['title'] + ' ' + train['abstract']
test['text'] = test['title'] + ' ' + test['abstract']
label_id2cate = dict(enumerate(train.categories.unique()))
label_cate2id = {value: key for key, value in label_id2cate.items()}
train['label'] = train['categories'].map(label_cate2id)
RANDOM_SEED = 2021
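Before building the dataset it is worth checking the data size and how the categories are distributed; a minimal sketch using the columns created above (the class count should match the n_class used for the model further below):
print(train.shape, test.shape)
print(train['categories'].nunique())               # number of classes (39 in this competition)
print(train['categories'].value_counts().head())   # most frequent categories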
class MyDataSet(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer(text=text,
                                  max_length=self.max_len,
                                  padding='max_length',   # pad/truncate to max_len
                                  truncation=True,
                                  add_special_tokens=True,
                                  return_attention_mask=True,
                                  return_token_type_ids=True,
                                  return_tensors='pt')
        return {"text": text,
                "input_ids": encoding['input_ids'].flatten(),
                "attention_mask": encoding['attention_mask'].flatten(),
                "labels": torch.tensor(label, dtype=torch.long)}
df_train, df_test = train_test_split(train, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)
df_train.shape, df_val.shape, df_test.shape
def create_data_loader(df, tokenizer, max_len, batch_size=4):
    ds = MyDataSet(texts=df['text'].values,
                   labels=df['label'].values,
                   tokenizer=tokenizer,
                   max_len=max_len)
    return DataLoader(ds, batch_size=batch_size)
MAX_LEN = 64
BATCH_SIZE = 4
train_data_loader = create_data_loader(df_train,tokenizer,max_len=MAX_LEN, batch_size=BATCH_SIZE)
val_data_loader = create_data_loader(df_val,tokenizer,max_len=MAX_LEN, batch_size=BATCH_SIZE)
test_data_loader = create_data_loader(df_test,tokenizer,max_len=MAX_LEN, batch_size=BATCH_SIZE)
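Pulling one batch from the loader confirms that it yields tensors of shape [BATCH_SIZE, MAX_LEN]:
batch = next(iter(train_data_loader))
print(batch['input_ids'].shape)       # torch.Size([4, 64])
print(batch['attention_mask'].shape)  # torch.Size([4, 64])
print(batch['labels'].shape)          # torch.Size([4])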
class BaseBertModel(nn.Module):
    def __init__(self, n_class=39):
        super(BaseBertModel, self).__init__()
        self.bert = BertModel.from_pretrained("../bert-base-uncased/")
        self.drop = nn.Dropout(0.2)
        self.out = nn.Linear(self.bert.config.hidden_size, n_class)

    def forward(self, input_ids, attention_mask):
        # with return_dict=False, BertModel returns (sequence_output, pooled_output)
        _, pooled_output = self.bert(input_ids=input_ids,
                                     attention_mask=attention_mask,
                                     return_dict=False)
        out = self.drop(pooled_output)
        return self.out(out)
model = BaseBertModel()
model = model.to(device)
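Before training, a single forward pass on one batch verifies the wiring end to end; a minimal sketch using the loader defined above:
batch = next(iter(train_data_loader))
with torch.no_grad():
    logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
print(logits.shape)  # torch.Size([4, 39])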
EPOCHS = 5 # number of training epochs
optimizer = AdamW(model.parameters(),lr=2e-5,correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss().to(device)
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in tqdm(data_loader):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['labels'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        correct_predictions += torch.sum(preds == targets)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()  # evaluation mode
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
# train model
history = defaultdict(list)  # record loss and accuracy for each epoch
best_accuracy = 0
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc
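Since matplotlib is already imported, the recorded history can be plotted to spot overfitting; a minimal sketch (the accuracies are stored as tensors above, so they are converted with float() first):
train_acc_hist = [float(a) for a in history['train_acc']]
val_acc_hist = [float(a) for a in history['val_acc']]
plt.plot(train_acc_hist, label='train accuracy')
plt.plot(val_acc_hist, label='validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()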
# Model evaluation on the held-out test split
test_acc, _ = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(df_test)
)
test_acc.item()
def get_predictions(model, data_loader):
    model = model.eval()
    texts = []
    predictions = []
    prediction_probs = []
    real_values = []
    with torch.no_grad():
        for d in data_loader:
            batch_texts = d["text"]  # raw input text for this batch
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            probs = F.softmax(outputs, dim=1)
            texts.extend(batch_texts)
            predictions.extend(preds)
            prediction_probs.extend(probs)
            real_values.extend(targets)
    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return texts, predictions, prediction_probs, real_values
y_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader
)
class_names = [label_id2cate[i] for i in range(len(label_id2cate))]  # category names in label-id order
print(classification_report(y_test, y_pred, target_names=[str(label) for label in class_names]))
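A confusion matrix gives a finer view of which categories are confused with each other; seaborn and confusion_matrix are already imported, so a minimal sketch is:
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, xticklabels=class_names, yticklabels=class_names, cmap='Blues')
plt.xlabel('predicted')
plt.ylabel('true')
plt.show()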
# Model prediction on a single sample
sample_text = 'Hard but Robust, Easy but Sensitive: How Encod.'
encoded_text = tokenizer.encode_plus(
    sample_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded_text['input_ids'].to(device)
attention_mask = encoded_text['attention_mask'].to(device)
output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)
print(f'Sample text: {sample_text}')
print(f'Predicted category: {label_id2cate[prediction.cpu().numpy()[0]]}')
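The sample_submit.csv loaded at the beginning is meant for the competition submission. Below is a minimal sketch of scoring the unlabeled test set and writing that file; the submission column name 'categories' is an assumption, so check the actual header of sample_submit.csv before using it.
# MyDataSet expects a label column, so add a placeholder for the unlabeled test set
test['label'] = 0
submit_loader = create_data_loader(test, tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE)

# reload the best checkpoint saved during training
model.load_state_dict(torch.load('best_model_state.bin'))
model = model.eval()

all_preds = []
with torch.no_grad():
    for d in submit_loader:
        outputs = model(d['input_ids'].to(device), d['attention_mask'].to(device))
        all_preds.extend(torch.max(outputs, dim=1)[1].cpu().numpy().tolist())

# the loader does not shuffle, so predictions line up with the rows of sub;
# 'categories' is an assumed column name, adapt to the real sample_submit.csv schema
sub['categories'] = [label_id2cate[p] for p in all_preds]
sub.to_csv('submit.csv', index=False)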