Not a great outcome, since I trained on only 10,000 samples instead of 50,000 to avoid an 8-hour wait. This is more of a worked example than proper notes.
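For completeness, the code below assumes roughly the following imports. This is only a sketch: the exact module paths differ between mindnlp releases (e.g. mindnlp.transformers vs mindnlp.models, mindnlp.engine vs mindnlp.engine.callbacks), so adjust them to whatever the installed version exposes.
import json
import numpy as np
from mindspore import nn, ops
from mindspore.dataset import TextFileDataset
from mindspore.nn.learning_rate_schedule import LearningRateSchedule
from mindnlp.utils import http_get
# module paths below are an assumption for a recent mindnlp layout
from mindnlp.transformers import BertTokenizer, GPT2Config, GPT2LMHeadModel
from mindnlp.engine import Trainer, CheckpointCallback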
Load the data:
url = 'https://download.mindspore.cn/toolkits/mindnlp/dataset/text_generation/nlpcc2017/train_with_summ.txt'
path = http_get(url, './')
dataset = TextFileDataset(str(path), shuffle=False)
train_dataset, test_dataset = dataset.split([0.9, 0.1], randomize=False)
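Each line of train_with_summ.txt is one JSON record with 'article' and 'summarization' fields, and the column produced by TextFileDataset is called 'text'. A quick peek, in the same spirit as the check done on the test set later, just to confirm the raw format:
print(next(train_dataset.create_dict_iterator(output_numpy=True))['text'])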
We then process the data:
def process_dataset(dataset, tokenizer, batch_size=6, max_seq_len=1024, shuffle=False):
    # each line of the raw file is a JSON object; extract article and summary
    def read_map(text):
        data = json.loads(text.tobytes())
        return np.array(data['article']), np.array(data['summarization'])

    # concatenate article and summary into one sequence, pad/truncate to max_seq_len
    def merge_and_pad(article, summary):
        tokenized = tokenizer(text=article, text_pair=summary,
                              padding='max_length', truncation='only_first', max_length=max_seq_len)
        # labels are the same ids as the inputs; the shift happens inside the model
        return tokenized['input_ids'], tokenized['input_ids']

    dataset = dataset.map(read_map, 'text', ['article', 'summary'])
    dataset = dataset.map(merge_and_pad, ['article', 'summary'], ['input_ids', 'labels'])
    dataset = dataset.batch(batch_size)
    if shuffle:
        dataset = dataset.shuffle(batch_size)
    return dataset
We use BertTokenizer ('bert-base-chinese') instead of GPT2Tokenizer, since the GPT-2 tokenizer has no Chinese vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
train_dataset = process_dataset(train_dataset, tokenizer, batch_size = 4)
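As a quick sanity check (an optional sketch), one batch from the processed pipeline should yield two integer arrays of shape (batch_size, max_seq_len), i.e. (4, 1024) here:
input_ids, labels = next(train_dataset.create_tuple_iterator(output_numpy=True))
print(input_ids.shape, labels.shape)   # expect (4, 1024) (4, 1024)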
Define the model.
class GPT2ForSummarization(GPT2LMHeadModel):
    def construct(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
    ):
        outputs = super().construct(input_ids=input_ids, attention_mask=attention_mask)
        # shift so that the logits at position t predict the token at position t+1
        shift_logits = outputs.logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        # flatten the tokens and ignore the padded positions
        loss = ops.cross_entropy(shift_logits.view(-1, shift_logits.shape[-1]),
                                 shift_labels.view(-1), ignore_index=tokenizer.pad_token_id)
        return loss
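The slicing above aligns the logits at position t with the label at position t+1, and ignore_index drops the padded tail from the loss. A tiny standalone illustration of the ignore_index behaviour with made-up numbers (a sketch; nothing here comes from the real model):
from mindspore import Tensor, ops
import numpy as np

pad_id = 0                                                           # bert-base-chinese uses id 0 for [PAD]
logits = Tensor(np.random.randn(4, 10).astype(np.float32))           # 4 positions, toy vocabulary of 10
labels = Tensor(np.array([3, 7, pad_id, pad_id], dtype=np.int32))    # last two positions are padding
print(ops.cross_entropy(logits, labels, ignore_index=pad_id))        # only the first two positions contribute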
Define a dynamic learning rate schedule (linear warmup followed by linear decay):
class LinearWithWarmUp(LearningRateSchedule):
    def __init__(self, learning_rate, num_warmup_steps, num_training_steps):
        super().__init__()
        self.learning_rate = learning_rate
        self.num_warmup_steps = num_warmup_steps
        self.num_training_steps = num_training_steps

    def construct(self, global_step):
        if global_step < self.num_warmup_steps:
            return global_step / float(max(1, self.num_warmup_steps)) * self.learning_rate
        return ops.maximum(
            0.0, (self.num_training_steps - global_step) / (max(1, self.num_training_steps - self.num_warmup_steps))
        ) * self.learning_rate
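To see the shape of the schedule, it can be evaluated directly at a few steps (a sketch; during training the optimizer calls it with the global step, here we pass a tensor by hand):
import mindspore
from mindspore import Tensor

demo_schedule = LinearWithWarmUp(learning_rate=1.5e-4, num_warmup_steps=2000, num_training_steps=10000)
for step in (0, 1000, 2000, 6000, 10000):
    print(step, demo_schedule(Tensor(step, mindspore.float32)))   # ramps up to 1.5e-4, then decays to 0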
Next we train the model.
num_epochs = 1
warmup_steps = 2000
learning_rate = 1.5e-4
num_training_steps = num_epochs*train_dataset.get_dataset_size()
config = GPT2Config(vocab_size = len(tokenizer))
model = GPT2ForSummarization(config)
lr_scheduler = LinearWithWarmUp(learning_rate = learning_rate, num_warmup_steps = warmup_steps, num_training_steps = num_training_steps)
optimizer = nn.AdamWeightDecay(model.trainable_params(), learning_rate=lr_scheduler)
print('number of model parameters: {}'.format(model.num_parameters()))
ckpoint_cb = CheckpointCallback(save_path = 'checkpoint', ckpt_name = 'gpt2_summarization', epochs = 1, keep_checkpoint_max = 2)
trainer = Trainer(network=model, train_dataset=train_dataset,
                  epochs=1, optimizer=optimizer, callbacks=ckpoint_cb)
trainer.set_amp(level='O1')  # mixed precision
trainer.run(tgt_columns = 'labels')
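Once the run finishes, the CheckpointCallback should have written the weights into ./checkpoint; listing that directory (a quick sketch) shows the file name used for loading below:
import os
print(os.listdir('checkpoint'))   # expect something like gpt2_summarization_epoch_0.ckpt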
Finally, we run inference on the test set.
def process_test_dataset(dataset, tokenizer, batch_size=1, max_seq_len=1024, max_summary_len=100):
    def read_map(text):
        data = json.loads(text.tobytes())
        return np.array(data['article']), np.array(data['summarization'])

    # only tokenize the article; leave room for max_summary_len generated tokens
    def pad(article):
        tokenized = tokenizer(text=article, truncation=True, max_length=max_seq_len - max_summary_len)
        return tokenized['input_ids']

    dataset = dataset.map(read_map, 'text', ['article', 'summary'])
    dataset = dataset.map(pad, 'article', ['input_ids'])
    dataset = dataset.batch(batch_size)
    return dataset
test_dataset = process_test_dataset(test_dataset, tokenizer, batch_size = 1)
print(next(test_dataset.create_tuple_iterator(output_numpy = True)))
model = GPT2LMHeadModel.from_pretrained('./checkpoint/gpt2_summarization_epoch_0.ckpt', config = config)
model.set_train(False)
model.config.eos_token_id = model.config.sep_token_id
i = 0
for (input_ids, raw_summary) in test_dataset.create_tuple_iterator():
    output_ids = model.generate(input_ids, max_new_tokens=50, num_beams=5, no_repeat_ngram_size=2)
    output_text = tokenizer.decode(output_ids[0].tolist())
    print(output_text)
    i += 1
    if i == 1:
        break
Here we only check a single example to see whether the output looks reasonable.
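Note that generate() returns the prompt ids followed by the newly generated ids, so decoding output_ids[0] prints the article again before the summary. A small sketch (assuming the variables from the last loop iteration are still in scope) that decodes only the generated part and shows the reference summary next to it:
gen_ids = output_ids[0][len(input_ids[0]):]   # drop the prompt tokens
print('generated:', tokenizer.decode(gen_ids.tolist(), skip_special_tokens=True))
print('reference:', raw_summary)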