Generating English Text Summaries with the Pretrained BART Model

Environment

  • python==3.7
  • transformers==4.9.2
  • rouge-score==0.0.4
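
The code below also imports datasets, nltk, and numpy, which are not pinned above; a typical setup looks like this (the unpinned versions are an assumption):

pip install transformers==4.9.2 rouge-score==0.0.4 datasets nltk numpy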

Data Preparation

Put the data in a single txt file, one example per line. Each line holds three fields separated by \t: the article URL, the article body, and the reference (label) summary; the parsing code below skips lines that do not have exactly three fields.
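
For concreteness, a minimal sketch of writing one line in this layout (hypothetical example data):

# Each line: "<url>\t<article body>\t<reference summary>\n"
with open('data.txt', 'w') as f:
    f.write('https://example.com/a1\tFull article text ...\tA short reference summary.\n')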

Building the Dataset

from datasets import Dataset


class Data:
    def __init__(self, data_path, tokenizer):
        self.path = data_path
        self.max_input_length = 1024
        self.max_target_length = 150
        # self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path)
        self.tokenizer = tokenizer
        
    def preprocess(self, train_scale=0.8):
        with open(self.path,'r') as f:
            raw_data = f.readlines()
        print(f"=======data_len: {len(raw_data)}")
        start = int(len(raw_data)*train_scale)
        print(f"======train_len: {start}")
        
        raw_train_data = raw_data[:start]
        raw_test_data = raw_data[start:]
        raw_train_test_data = {'train': {'id': [], 'document': [], 'summary': []},
                               'test': {'id': [], 'document': [], 'summary': []}}
        for i,item in enumerate(raw_train_data):
            if len(item.split('\t')) != 3:
                continue
            url,text,label = item.split('\t')
            raw_train_test_data['train']['id'].append(i)
            
            # document is the model input; summary is the label
            raw_train_test_data['train']['summary'].append(label.strip())
            raw_train_test_data['train']['document'].append(text.strip())

        for j,item in enumerate(raw_test_data):
            if len(item.split('\t')) != 3:
                continue
            url,text,label = item.split('\t')
            raw_train_test_data['test']['id'].append(start + j)  # continue ids after the train split
            raw_train_test_data['test']['summary'].append(label.strip())
            raw_train_test_data['test']['document'].append(text.strip())
        
        def preprocess_function(examples):
            # document is the model input
            inputs = examples['document']
            model_inputs = self.tokenizer(inputs, max_length=self.max_input_length, padding='max_length', truncation=True)
            # summary is the label; tokenize it with the target tokenizer
            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(examples['summary'], max_length=self.max_target_length, padding='max_length', truncation=True)
            model_inputs['labels'] = labels['input_ids']
            return model_inputs
        
        train_dataset = Dataset.from_dict(raw_train_test_data['train'])
        test_dataset = Dataset.from_dict(raw_train_test_data['test'])
        tokenized_train_dataset = train_dataset.map(preprocess_function)
        tokenized_test_dataset = test_dataset.map(preprocess_function)
        return tokenized_train_dataset, tokenized_test_dataset
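
One caveat: the labels are padded with the tokenizer's pad token, so the training loss is also computed on padding positions. A common refinement (not in the original code) is to replace pad token ids in the labels with -100, which the cross-entropy loss ignores; compute_metrics below already maps -100 back to the pad token before decoding, so it stays compatible. A minimal sketch, with mask_label_padding as a hypothetical helper:

def mask_label_padding(example, pad_token_id):
    # -100 is ignored by the loss; real tokens are left untouched.
    example['labels'] = [t if t != pad_token_id else -100 for t in example['labels']]
    return example

# e.g.:
# tokenized_train_dataset = tokenized_train_dataset.map(
#     lambda ex: mask_label_padding(ex, tokenizer.pad_token_id))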

Loading the Model

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer, BartForConditionalGeneration


checkpoint = "distilbart-xsum-9-6"  # local copy; the Hugging Face Hub id is sshleifer/distilbart-xsum-9-6
model = BartForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
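
A quick way to confirm the checkpoint loads and generates before fine-tuning (the input sentence is just illustrative):

# Smoke test: encode a sentence and decode one generated sequence.
sample = "The quick brown fox jumps over the lazy dog."
ids = tokenizer(sample, return_tensors='pt').input_ids
print(tokenizer.decode(model.generate(ids, max_length=20)[0], skip_special_tokens=True))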

Metrics

import nltk
import numpy as np
from rouge_score import rouge_scorer, scoring

# nltk.sent_tokenize below needs the punkt model:
# nltk.download('punkt')


def compute(predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False):
    if rouge_types is None:
        rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
    if use_aggregator:
        aggregator = scoring.BootstrapAggregator()
    else:
        scores = []

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        if use_aggregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_aggregator:
        result = aggregator.aggregate()
    else:
        result = {}
        for key in scores[0]:
            result[key] = [score[key] for score in scores]

    return result
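
For example, scoring a toy prediction/reference pair (illustrative strings):

preds = ["the cat sat on the mat"]
refs = ["the cat was sitting on the mat"]
scores = compute(predictions=preds, references=refs)
print(scores["rouge1"].mid.fmeasure)  # aggregated ROUGE-1 F1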


# metrics callback passed to Seq2SeqTrainer
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

Training

Hyperparameter Configuration

batch_size = 1
args = Seq2SeqTrainingArguments(
    "/data/yuhengshi/europe_summary/model",
    evaluation_strategy='steps',
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.1,
    save_steps=200,
    save_total_limit=10,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    eval_steps=200,
    logging_dir="/data/yuhengshi/europe_summary/log",
    logging_first_step=True)
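
With a per-device batch size of 1, gradient accumulation is a common way to approximate a larger effective batch. An optional tweak, not part of the original configuration:

# Optional: add gradient_accumulation_steps to the arguments above.
args = Seq2SeqTrainingArguments(
    "/data/yuhengshi/europe_summary/model",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=8,  # effective batch size = 8
    # ... plus the remaining arguments shown above ...
)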

Training with the transformers API

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
data = Data('/data/yuhengshi/europe_summary/data_no_daily_news.txt', tokenizer)
tokenized_train_dataset, tokenized_test_dataset = data.preprocess()
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)
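
Then launch fine-tuning; with the configuration above, the Trainer evaluates and saves a checkpoint every 200 steps:

trainer.train()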

Results

From the evaluation steps, pick the checkpoint where both the loss and the ROUGE scores look good.
Training results


Inference: Generating the Summary

def predict(sentence):
    # Truncate long articles to the model's 1024-token input limit.
    inputs = tokenizer([sentence], max_length=1024, truncation=True, return_tensors='pt')
    summary_ids = model.generate(inputs['input_ids'], num_beams=70, max_length=150, min_length=50, early_stopping=True)
    summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    return ' '.join(summary)
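
For example (hypothetical article text):

article = "European lawmakers met on Tuesday to debate the bloc's new budget proposal ..."
print(predict(article))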