一:
之前学习了一下transformers 的简单用法,这里通过完成李宏毅老师的机器学习作业来实战应用。本文的数据集和代码都来源于该老师的作业,我根据作业要求在评估数据集上完成了strong baseline的要求;至于boss baseline,还差一些,就没有继续研究。这里分享一下我整理过的代码,后续会慢慢加些注释。
二:
import json
import numpy as np
import random
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast
from tqdm.auto import tqdm
import logging
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import StepLR
from transformers import get_cosine_schedule_with_warmup
from transformers import AutoModelForQuestionAnswering,AutoTokenizer
# Module-level logger; handlers are configured in main_run().
logger = logging.getLogger(__name__)
# Tokenizer must match the QA model loaded in main_run().
# tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")
tokenizer = AutoTokenizer.from_pretrained('luhua/chinese_pretrain_mrc_roberta_wwm_ext_large')
# TensorBoard event files are written under ./log
writer = SummaryWriter('./log')
def same_seeds(seed):
    """Seed every RNG used during training (random, numpy, torch/CUDA) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
def read_data(file):
    """Load one HW7-format JSON file and return its (questions, paragraphs) pair."""
    with open(file, 'r', encoding="utf-8") as fp:
        payload = json.load(fp)
    return payload["questions"], payload["paragraphs"]
class QA_Dataset(Dataset):
    """Dataset of (question, paragraph) windows for extractive QA.

    For split == "train", __getitem__ returns a single window that contains
    the answer span, with the answer placed at a random offset inside the
    window. For any other split, it returns every window of the paragraph
    (stride ``doc_stride``) so the caller can pick the most probable answer
    across windows.

    Args:
        split: "train" for training behavior, anything else for dev/test.
        questions: list of dicts with "paragraph_id" and, for training,
            "answer_start"/"answer_end" character positions.
        tokenized_questions / tokenized_paragraphs: fast-tokenizer encodings
            exposing ``.ids`` and (for paragraphs) ``char_to_token``.
    """

    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 40
        self.max_paragraph_len = 150
        # Small stride -> heavily overlapping evaluation windows, improving
        # the chance that some window fully contains the answer.
        self.doc_stride = 35
        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to
            # start/end positions in tokenized_paragraph.
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])
            # Place the answer at a random offset inside the window so the
            # model does not learn that answers are always centered.
            # Bug fix: max(1, ...) guards against spans at least as long as
            # the window, where np.random.randint(0, 0) raises ValueError.
            span_len = answer_end_token - answer_start_token
            distance_start = np.random.randint(0, max(1, self.max_paragraph_len - span_len))
            paragraph_start = max(0, min(answer_start_token - distance_start,
                                         len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len
            # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start: paragraph_end] + [102]
            # Shift answer token positions from paragraph space to window space.
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(
                attention_mask), answer_start_token, answer_end_token

        # Validation/Testing: paragraph is split into several windows, each
        # with start positions separated by "doc_stride".
        else:
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []
            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
                input_ids_paragraph = tokenized_paragraph.ids[i: i + self.max_paragraph_len] + [102]
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)
            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)

    def padding(self, input_ids_question, input_ids_paragraph):
        """Pad one window to max_seq_len; return (input_ids, token_type_ids, attention_mask)."""
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary (0 = [PAD]).
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        # Segment ids: 0 for question part, 1 for paragraph part, 0 for padding.
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Attention mask: 1 for real tokens, 0 for padding.
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len
        return input_ids, token_type_ids, attention_mask
def evaluate(data, output):
    """Pick the best answer string across all windows of one QA pair.

    Args:
        data: (input_ids, token_type_ids, attention_mask), each shaped
            (1, num_windows, seq_len) — dev/test loader uses batch size 1.
        output: model output whose start_logits/end_logits are shaped
            (num_windows, seq_len).

    Returns:
        The decoded answer with tokenizer spaces removed, or '' if every
        window was discarded.

    Postprocessing notes: empty predictions were caused by windows where the
    best end index precedes the best start index — such inverted spans cannot
    be decoded and almost certainly do not contain the answer, so they are
    skipped. Bug fix: the original skipped on ``start_index >= end_index``,
    which also threw away valid single-token answers (start == end); the
    guard must be strict ``>``.
    """
    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]
    for k in range(num_of_windows):
        # Obtain answer by choosing the most probable start/end positions.
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        end_prob, end_index = torch.max(output.end_logits[k], dim=0)
        # Discard inverted spans only; single-token spans are valid.
        if start_index > end_index:
            continue
        # Probability of answer is the sum of start and end logits.
        prob = start_prob + end_prob
        # Keep the most probable span across windows.
        if prob > max_prob:
            max_prob = prob
            # Convert tokens to chars (e.g. [1920, 7032] --> "大 金")
            answer = tokenizer.decode(data[0][0][k][start_index: end_index + 1])
    # Remove spaces the tokenizer inserts between CJK tokens ("大 金" --> "大金").
    return answer.replace(' ', '')
def main_run():
    """Train, validate, and optionally test a Chinese extractive-QA model (HW7)."""
    # logging setup: mirror debug output to ./log/log and the console
    logging.basicConfig(
        level=logging.DEBUG,
        handlers=[logging.FileHandler(filename='./log/log'),logging.StreamHandler()],
        format='%(message)s'
    )
    # initialization
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # print(f'使用{device}训练')
    logger.debug(f'使用{device}训练')
    same_seeds(0)
    # mixed-precision setup (disabled). NOTE(review): Accelerator(fp16=True)
    # is the old accelerate API; newer versions use mixed_precision="fp16".
    fp16_training = False
    if fp16_training:
        from accelerate import Accelerator
        accelerator = Accelerator(fp16=True)
        device = accelerator.device
    # prepare datasets
    # read data
    train_questions, train_paragraphs = read_data("hw7_train.json")
    dev_questions, dev_paragraphs = read_data("hw7_dev.json")
    test_questions, test_paragraphs = read_data("hw7_test.json")
    # tokenization
    # Tokenize questions and paragraphs separately.
    # 「add_special_tokens」 is set to False since special tokens will be added when tokenized questions and paragraphs are combined in dataset __getitem__
    train_questions_tokenized = tokenizer([train_question["question_text"] for train_question in train_questions], add_special_tokens=False)
    dev_questions_tokenized = tokenizer([dev_question["question_text"] for dev_question in dev_questions], add_special_tokens=False)
    test_questions_tokenized = tokenizer([test_question["question_text"] for test_question in test_questions], add_special_tokens=False)
    train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)
    dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)
    test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)
    # build datasets
    train_set = QA_Dataset("train", train_questions, train_questions_tokenized, train_paragraphs_tokenized)
    dev_set = QA_Dataset("dev", dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized)
    test_set = QA_Dataset("test", test_questions, test_questions_tokenized, test_paragraphs_tokenized)
    # data = dev_set[5]
    train_batch_size = 32
    # Note: Do NOT change batch size of dev_loader / test_loader !
    # Although batch size=1, it is actually a batch consisting of several windows from the same QA pair
    train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True, pin_memory=True)
    dev_loader = DataLoader(dev_set, batch_size=1, shuffle=False, pin_memory=True)
    test_loader = DataLoader(test_set, batch_size=1, shuffle=False, pin_memory=True)
    # prepare model
    # model = BertForQuestionAnswering.from_pretrained("bert-base-chinese").to(device)
    model = AutoModelForQuestionAnswering.from_pretrained('luhua/chinese_pretrain_mrc_roberta_wwm_ext_large').to(device)
    num_epoch = 5
    validation = True
    logging_step = 100
    learning_rate = 5e-5
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # NOTE(review): scheduler.step() is called once per batch below, so with
    # step_size=5 the LR decays by gamma every 5 batches, not every 5 epochs.
    scheduler = StepLR(optimizer, step_size=5, gamma=0.999)
    # training
    if fp16_training:
        model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
    model.train()
    print("Start Training ...")
    for epoch in range(num_epoch):
        step = 1
        train_loss = train_acc = 0
        for j,data in enumerate(tqdm(train_loader)):
            # Load all data into GPU
            data = [i.to(device) for i in data]
            # Model inputs: input_ids, token_type_ids, attention_mask, start_positions, end_positions (Note: only "input_ids" is mandatory)
            # Model outputs: start_logits, end_logits, loss (returned when start_positions/end_positions are provided)
            output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3],
                           end_positions=data[4])
            # Choose the most probable start position / end position
            start_index = torch.argmax(output.start_logits, dim=1)
            end_index = torch.argmax(output.end_logits, dim=1)
            # Prediction is correct only if both start_index and end_index are correct
            train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()
            train_loss += output.loss
            if fp16_training:
                accelerator.backward(output.loss)
            else:
                output.loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # NOTE(review): 'j' resets each epoch, so TensorBoard overwrites
            # earlier epochs' LR points at the same global step.
            writer.add_scalar('lr', optimizer.param_groups[0]['lr'], j)
            scheduler.step()
            step += 1
            ##### TODO: Apply linear learning rate decay #####
            # Print training loss and accuracy over past logging step
            if step % logging_step == 0:
                # print(
                #     f"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / logging_step:.3f}, acc = {train_acc / logging_step:.3f}")
                logger.debug( f"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / logging_step:.3f}, acc = {train_acc / logging_step:.3f}")
                train_loss = train_acc = 0
        if validation:
            # print("Evaluating Dev Set ...")
            logger.debug("Evaluating Dev Set ...")
            model.eval()
            with torch.no_grad():
                dev_acc = 0
                for i, data in enumerate(tqdm(dev_loader)):
                    output = model(input_ids=data[0].squeeze().to(device), token_type_ids=data[1].squeeze().to(device),
                                   attention_mask=data[2].squeeze().to(device))
                    # prediction is correct only if answer text exactly matches
                    dev_acc += evaluate(data, output) == dev_questions[i]["answer_text"]
                # print(f"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}")
                logger.debug(f"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}")
            model.train()
    # Save a model and its configuration file to the directory 「saved_model」
    # i.e. there are two files under the directory 「saved_model」: 「pytorch_model.bin」 and 「config.json」
    # Saved model can be re-loaded using 「model = BertForQuestionAnswering.from_pretrained("saved_model")」
    print("Saving Model ...")
    model_save_dir = "saved_model"
    model.save_pretrained(model_save_dir)
    # testing (disabled)
    # print("Evaluating Test Set ...")
    # result = []
    # model.eval()
    # with torch.no_grad():
    #     for data in tqdm(test_loader):
    #         output = model(input_ids=data[0].squeeze(dim=0).to(device),
    #                        token_type_ids=data[1].squeeze(dim=0).to(device),
    #                        attention_mask=data[2].squeeze(dim=0).to(device))
    #         result.append(evaluate(data, output))
    # result_file = "result.csv"
    # with open(result_file, 'w') as f:
    #     f.write("ID,Answer\n")
    #     for i, test_question in enumerate(test_questions):
    #         # Replace commas in answers with empty strings (since csv is separated by comma)
    #         # Answers in kaggle are processed in the same way
    #         f.write(f"{test_question['id']},{result[i].replace(',', '')}\n")
    # print(f"Completed! Result is in {result_file}")
if __name__ == "__main__":  # lesson learned: "__main__" must be spelled exactly — no typos like "__mian__", no extra spaces
    main_run()
三: 链接