一、 核心代码
# input-01
# Alpaca-style prompt: the model answers the "### Response:" section that
# follows the user's "### Instruction:" text.
PROMPT_TEMPLATE = (
    "Below is an instruction that describes a task. Write a response that "
    "appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n"
    "### Response: "
)
def build_instruction_dataset(data_path: Union[List[str], str],
                              tokenizer: transformers.PreTrainedTokenizer,
                              max_seq_length: int, data_cache_dir=None,
                              preprocessing_num_workers=None,
                              ):
    """Build a supervised fine-tuning dataset from instruction/input/output records.

    Args:
        data_path: One path or a list of paths to the raw instruction data.
        tokenizer: Tokenizer used to encode prompts and answers.
        max_seq_length: Hard truncation length for each example's token ids.
        data_cache_dir: Optional cache directory for preprocessed data.
        preprocessing_num_workers: Optional worker count for preprocessing.

    NOTE(review): only the `tokenization` closure is visible in this excerpt;
    the dataset loading/mapping code that applies it (and the return value)
    appears to be truncated from the source article — confirm against the
    full file.
    """
    def tokenization(examples):
        """Encode a batch of {'instruction','input','output'} columns into
        'input_ids' and loss-masked 'labels'."""
        sources = []
        targets = []
        prompt = PROMPT_TEMPLATE
        # `user_input` avoids shadowing the builtin `input`; the column
        # itself is still named 'input'.
        for instruction, user_input, output in zip(
                examples['instruction'], examples['input'], examples['output']):
            # An optional 'input' field is folded into the instruction text.
            if user_input is not None and user_input != "":
                instruction = instruction + '\n' + user_input
            source = prompt.format_map({'instruction': instruction})
            # EOS terminates the answer so generation learns to stop.
            target = f"{output}{tokenizer.eos_token}"
            sources.append(source)
            targets.append(target)
        tokenized_sources = tokenizer(sources, return_attention_mask=False)
        # No special tokens on targets: the source encoding already carries them.
        tokenized_targets = tokenizer(targets, return_attention_mask=False,
                                      add_special_tokens=False)
        all_input_ids = []
        all_labels = []
        for s, t in zip(tokenized_sources['input_ids'], tokenized_targets['input_ids']):
            # Model input = prompt tokens + answer tokens, truncated to max_seq_length.
            input_ids = torch.LongTensor(s + t)[:max_seq_length]
            # Prompt positions are masked with IGNORE_INDEX (-100 per the
            # accompanying notes) so the loss is computed on the answer only.
            labels = torch.LongTensor([IGNORE_INDEX] * len(s) + t)[:max_seq_length]
            assert len(input_ids) == len(labels)
            all_input_ids.append(input_ids)
            all_labels.append(labels)
        results = {'input_ids': all_input_ids, 'labels': all_labels}
        return results
二、结论:
{'instruction': '创建一个完成这句话的句子。',
'input': '她收拾好了行李,然后',
'output': '踏上了她的旅程。'}
{'instruction': '创作一张对50岁以上的人有趣的模因(meme)。',
'input': '',
'output': '一张模因图片,上面是一位拿着拐杖或行走杖的老人望向远方,配有字幕:“当二十多岁的人们谈论‘好旧时光’时”。'}
1、human_input:PROMPT_TEMPLATE + instruction + input
2、model_response: output
微调模型输入:
human_input + model_response
微调模型输出:
[-100] * len(human_input) + model_response