Not a great outcome, since I trained on only 10,000 samples instead of 50,000 to avoid an 8-hour wait. This is more of a worked example than proper notes.
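For completeness, the code below assumes roughly the following imports. This is only a sketch: the exact module paths differ between mindnlp releases (e.g. mindnlp.transformers vs mindnlp.models, mindnlp.engine vs mindnlp.engine.callbacks), so adjust them to whatever the installed version exposes.
import json
import numpy as np
from mindspore import nn, ops
from mindspore.dataset import TextFileDataset
from mindspore.nn.learning_rate_schedule import LearningRateSchedule
from mindnlp.utils import http_get
# module paths below are an assumption for a recent mindnlp layout
from mindnlp.transformers import BertTokenizer, GPT2Config, GPT2LMHeadModel
from mindnlp.engine import Trainer, CheckpointCallback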
Load the data:
url = 'https://download.mindspore.cn/toolkits/mindnlp/dataset/text_generation/nlpcc2017/train_with_summ.txt'
path = http_get(url, './')
dataset = TextFileDataset(str(path), shuffle=False)
train_dataset, test_dataset = dataset.split([0.9, 0.1], randomize=False)
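Each line of train_with_summ.txt is one JSON record with 'article' and 'summarization' fields, and the column produced by TextFileDataset is called 'text'. A quick peek, in the same spirit as the check done on the test set later, just to confirm the raw format:
print(next(train_dataset.create_dict_iterator(output_numpy=True))['text'])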
We then process the data:
def process_dataset(dataset, tokenizer, batch_size=6, max_seq_len=1024, shuffle=False):
    # each line of the raw file is a JSON object; extract article and summary
    def read_map(text):
        data = json.loads(text.tobytes())
        return np.array(data['article']), np.array(data['summarization'])

    # concatenate article and summary into one sequence, pad/truncate to max_seq_len
    def merge_and_pad(article, summary):
        tokenized = tokenizer(text=article, text_pair=summary,
                              padding='max_length', truncation='only_first', max_length=max_seq_len)
        # labels are the same ids as the inputs; the shift happens inside the model
        return tokenized['input_ids'], tokenized['input_ids']

    dataset = dataset.map(read_map, 'text', ['article', 'summary'])
    dataset = dataset.map(merge_and_pad, ['article', 'summary'], ['input_ids', 'labels'])
    dataset = dataset.batch(batch_size)
    if shuffle:
        dataset = dataset.shuffle(batch_size)
    return dataset
We use BertTokenizer ('bert-base-chinese') instead of GPT2Tokenizer, since the GPT-2 tokenizer has no Chinese vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
train_dataset = process_dataset(train_dataset, tokenizer, batch_size = 4)
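As a quick sanity check (an optional sketch), one batch from the processed pipeline should yield two integer arrays of shape (batch_size, max_seq_len), i.e. (4, 1024) here:
input_ids, labels = next(train_dataset.create_tuple_iterator(output_numpy=True))
print(input_ids.shape, labels.shape)   # expect (4, 1024) (4, 1024)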
Define the model.
class GPT2ForSummarization(GPT2LMHeadModel):
    def construct(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
    ):
        outputs = super().construct(input_ids=input_ids, attention_mask=attention_mask)
        # shift so that the logits at position t predict the token at position t+1
        shift_logits = outputs.logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        # flatten the tokens and ignore the padded positions
        loss = ops.cross_entropy(shift_logits.view(-1, shift_logits.shape[-1]),
                                 shift_labels.view(-1), ignore_index=tokenizer.pad_token_id)
        return loss
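The slicing above aligns the logits at position t with the label at position t+1, and ignore_index drops the padded tail from the loss. A tiny standalone illustration of the ignore_index behaviour with made-up numbers (a sketch; nothing here comes from the real model):
from mindspore import Tensor, ops
import numpy as np

pad_id = 0                                                           # bert-base-chinese uses id 0 for [PAD]
logits = Tensor(np.random.randn(4, 10).astype(np.float32))           # 4 positions, toy vocabulary of 10
labels = Tensor(np.array([3, 7, pad_id, pad_id], dtype=np.int32))    # last two positions are padding
print(ops.cross_entropy(logits, labels, ignore_index=pad_id))        # only the first two positions contribute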
Define a dynamic learning rate schedule (linear warmup followed by linear decay):
class LinearWithWarmUp(LearningRateSchedule):
    def __init__(self, learning_rate, num_warmup_steps, num_training_steps):
        super().__init__()
        self.learning_rate = learning_rate
        self.num_warmup_steps = num_warmup_steps
        self.num_training_steps = num_training_steps

    def construct(self, global_step):
        if global_step < self.num_warmup_steps:
            return global_step / float(max(1, self.num_warmup_steps)) * self.learning_rate
        return ops.maximum(
            0.0, (self.num_training_steps - global_step) / (max(1, self.num_training_steps - self.num_warmup_steps))
        ) * self.learning_rate
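To see the shape of the schedule, it can be evaluated directly at a few steps (a sketch; during training the optimizer calls it with the global step, here we pass a tensor by hand):
import mindspore
from mindspore import Tensor

demo_schedule = LinearWithWarmUp(learning_rate=1.5e-4, num_warmup_steps=2000, num_training_steps=10000)
for step in (0, 1000, 2000, 6000, 10000):
    print(step, demo_schedule(Tensor(step, mindspore.float32)))   # ramps up to 1.5e-4, then decays to 0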
Next we train the model.
num_epochs = 1
warmup_steps = 2000
learning_rate = 1.5e-4
num_training_steps = num_epochs*train_dataset.get_dataset_size()
config = GPT2Config(vocab_size = len(tokenizer))
model = GPT2ForSummarization(config)
lr_scheduler = LinearWithWarmUp(learning_rate = learning_rate, num_warmup_steps = warmup_steps, num_training_steps = num_training_steps)
optimizer = nn.AdamWeightDecay(model.trainable_params(), learning_rate=lr_scheduler)
print('number of model parameters: {}'.format(model.num_parameters()))
ckpoint_cb = CheckpointCallback(save_path = 'checkpoint', ckpt_name = 'gpt2_summarization', epochs = 1, keep_checkpoint_max = 2)
trainer = Trainer(network=model, train_dataset=train_dataset,
                  epochs=1, optimizer=optimizer, callbacks=ckpoint_cb)
trainer.set_amp(level='O1')  # mixed precision
trainer.run(tgt_columns = 'labels')
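Once the run finishes, the CheckpointCallback should have written the weights into ./checkpoint; listing that directory (a quick sketch) shows the file name used for loading below:
import os
print(os.listdir('checkpoint'))   # expect something like gpt2_summarization_epoch_0.ckpt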
Finally, we run inference on the test set.
def process_test_dataset(dataset, tokenizer, batch_size=1, max_seq_len=1024, max_summary_len=100):
    def read_map(text):
        data = json.loads(text.tobytes())
        return np.array(data['article']), np.array(data['summarization'])

    # only tokenize the article; leave room for max_summary_len generated tokens
    def pad(article):
        tokenized = tokenizer(text=article, truncation=True, max_length=max_seq_len - max_summary_len)
        return tokenized['input_ids']

    dataset = dataset.map(read_map, 'text', ['article', 'summary'])
    dataset = dataset.map(pad, 'article', ['input_ids'])
    dataset = dataset.batch(batch_size)
    return dataset
test_dataset = process_test_dataset(test_dataset, tokenizer, batch_size = 1)
print(next(test_dataset.create_tuple_iterator(output_numpy = True)))
model = GPT2LMHeadModel.from_pretrained('./checkpoint/gpt2_summarization_epoch_0.ckpt', config = config)
model.set_train(False)
model.config.eos_token_id = model.config.sep_token_id
i = 0
for (input_ids, raw_summary) in test_dataset.create_tuple_iterator():
    output_ids = model.generate(input_ids, max_new_tokens=50, num_beams=5, no_repeat_ngram_size=2)
    output_text = tokenizer.decode(output_ids[0].tolist())
    print(output_text)
    i += 1
    if i == 1:
        break
Here we only check a single example to see whether the output looks reasonable.
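Note that generate() returns the prompt ids followed by the newly generated ids, so decoding output_ids[0] prints the article again before the summary. A small sketch (assuming the variables from the last loop iteration are still in scope) that decodes only the generated part and shows the reference summary next to it:
gen_ids = output_ids[0][len(input_ids[0]):]   # drop the prompt tokens
print('generated:', tokenizer.decode(gen_ids.tolist(), skip_special_tokens=True))
print('reference:', raw_summary)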