Environment
- python==3.7
- transformers==4.9.2
- rouge-score==0.0.4
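Besides the packages pinned above, the code below also uses datasets, nltk (for sentence splitting in the metrics) and numpy, whose versions are not pinned in the original. A typical install would be:

pip install transformers==4.9.2 rouge-score==0.0.4 datasets nltk numpy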
Data preparation
Put the data in a single txt file, one example per line. Each line holds three tab-separated fields, matching what the loader below expects: the article URL, the article body, and the reference (label) summary.
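For illustration only, a made-up line in that file could be written from Python like this, so the tab separators are explicit:

# each line: url \t article body \t reference summary (placeholder content)
line = "https://example.com/article-1\tThe council met on Tuesday to discuss next year's budget ...\tCouncil discusses next year's budget."
with open('data_example.txt', 'w') as f:
    f.write(line + '\n')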
Building the dataset
from datasets import Dataset

class Data:
    def __init__(self, data_path, tokenizer):
        self.path = data_path
        self.max_input_length = 1024
        self.max_target_length = 150
        # self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path)
        self.tokenizer = tokenizer

    def preprocess(self, train_scale=0.8):
        with open(self.path, 'r') as f:
            raw_data = f.readlines()
        print(f"=======data_len: {len(raw_data)}")
        start = int(len(raw_data) * train_scale)
        print(f"======train_len: {start}")
        raw_train_data = raw_data[:start]
        raw_test_data = raw_data[start:]

        raw_train_test_data = {'train': {'id': [], 'document': [], 'summary': []},
                               'test': {'id': [], 'document': [], 'summary': []}}
        for i, item in enumerate(raw_train_data):
            # skip malformed lines that do not have exactly url, text and label
            if len(item.split('\t')) != 3:
                continue
            url, text, label = item.split('\t')
            raw_train_test_data['train']['id'].append(i)
            # document is the model input, summary is the label
            raw_train_test_data['train']['summary'].append(label.strip())
            raw_train_test_data['train']['document'].append(text.strip())
        for j, item in enumerate(raw_test_data):
            if len(item.split('\t')) != 3:
                continue
            url, text, label = item.split('\t')
            # ids continue after the last train index
            raw_train_test_data['test']['id'].append(i + j + 1)
            raw_train_test_data['test']['summary'].append(label.strip())
            raw_train_test_data['test']['document'].append(text.strip())

        def preprocess_function(examples):
            # document is the model input
            inputs = examples['document']
            model_inputs = self.tokenizer(inputs, max_length=self.max_input_length, padding='max_length', truncation=True)
            # summary is the label
            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(examples['summary'], max_length=self.max_target_length, padding='max_length', truncation=True)
            model_inputs['labels'] = labels['input_ids']
            return model_inputs

        train_dataset = Dataset.from_dict(raw_train_test_data['train'])
        test_dataset = Dataset.from_dict(raw_train_test_data['test'])
        tokenized_train_dataset = train_dataset.map(preprocess_function)
        tokenized_test_dataset = test_dataset.map(preprocess_function)
        return tokenized_train_dataset, tokenized_test_dataset
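As a quick sanity check, assuming the tokenizer from the next section has already been created, the class can be exercised like this (file path is a placeholder):

data = Data('data_example.txt', tokenizer)
tokenized_train, tokenized_test = data.preprocess(train_scale=0.8)
print(len(tokenized_train), len(tokenized_test))
print(tokenized_train[0].keys())  # id, document, summary, input_ids, attention_mask, labels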
Model loading
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import BartForConditionalGeneration

checkpoint = "distilbart-xsum-9-6"
model = BartForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
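An optional sanity check after loading, just to confirm the distilled model came up as expected:

print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
print(model.config.max_position_embeddings)  # BART variants typically accept up to 1024 input positions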
Metrics
from rouge_score import rouge_scorer, scoring

def compute(predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False):
    if rouge_types is None:
        rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
    if use_aggregator:
        aggregator = scoring.BootstrapAggregator()
    else:
        scores = []

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        if use_aggregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_aggregator:
        result = aggregator.aggregate()
    else:
        result = {}
        for key in scores[0]:
            result[key] = list(score[key] for score in scores)

    return result
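A toy call, to show the shape of the returned result: with the aggregator enabled, each ROUGE type maps to an AggregateScore with low/mid/high bounds, each carrying precision, recall and fmeasure.

toy = compute(predictions=["the cat sat on the mat"],
              references=["a cat was sitting on the mat"])
print(toy["rouge1"].mid.fmeasure)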
# metrics callback passed to the Trainer
import numpy as np
import nltk  # sent_tokenize requires the 'punkt' data: nltk.download('punkt')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}
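A small smoke test with hand-built token ids, assuming the tokenizer from the previous section and that nltk's punkt data has been downloaded:

fake_preds = np.array(tokenizer(["The cat sat on the mat."], padding='max_length', max_length=20)['input_ids'])
fake_labels = np.array(tokenizer(["A cat was sitting on the mat."], padding='max_length', max_length=20)['input_ids'])
print(compute_metrics((fake_preds, fake_labels)))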
Training
Hyperparameter configuration
batch_size = 1
args = Seq2SeqTrainingArguments(
    "/data/yuhengshi/europe_summary/model",
    evaluation_strategy='steps',
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.1,
    save_steps=200,
    save_total_limit=10,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    eval_steps=200,
    logging_dir="/data/yuhengshi/europe_summary/log",
    logging_first_step=True)
Training with the transformers API
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
data = Data('/data/yuhengshi/europe_summary/data_no_daily_news.txt', tokenizer)
tokenized_train_dataset, tokenized_test_dataset = data.preprocess()

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)
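Fine-tuning is then launched with:

trainer.train()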
Results
From the evaluation steps logged during training, pick a checkpoint whose loss and ROUGE scores are both good.
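To load the selected checkpoint for inference, something like the following works; the checkpoint-200 directory name is an assumption (the Trainer writes one checkpoint-{step} folder every save_steps under the output_dir configured above):

best_ckpt = "/data/yuhengshi/europe_summary/model/checkpoint-200"  # pick the step with the best metrics
model = BartForConditionalGeneration.from_pretrained(best_ckpt)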
Prediction: generating summaries
def predict(sentence):
    inputs = tokenizer([sentence], max_length=1024, truncation=True, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], num_beams=70, max_length=150, min_length=50, early_stopping=True)
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    return ' '.join(summary)
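Example usage (the article text is a placeholder; if the model still sits on GPU after training, the input ids inside predict need to be moved to model.device first):

article = "The European Parliament met on Tuesday to debate ..."
print(predict(article))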