Data Processing
1. Splitting large files
Run auto_split.sh to split up large files: any file over 1GB is automatically split into smaller files of roughly 300MB each.
It uses the split command to cut the file into 300MB chunks, saved in the original file's directory with a two-letter suffix appended to the name. It then lists the directory with ls, picks out the files carrying that two-letter suffix with grep and a regular expression, and renames them to .json files with xargs -n1 -i{} mv {} {}.json.
split -a 2 -C 300M $p $file/$name- && ls|grep -E "(-[a-zA-Z]{2})" |xargs -n1 -i{} mv {} {}.json
2. Sentence splitting
Run preprocessing.py, whose main job is splitting text into sentences. This is not embedded in collate_fn because doing it there turned out to be noticeably slower, so it is done as a separate preprocessing step; a minimal sketch follows.
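The exact rules in preprocessing.py are not shown here; the snippet below is only a minimal sketch of punctuation-based sentence splitting for Chinese text (the regex and the function name are assumptions).

import re

def split_sentences(text):
    # Split after Chinese/Western sentence-ending punctuation, keeping the punctuation.
    parts = re.split(r'(?<=[。!?!?;;])', text)
    return [p.strip() for p in parts if p.strip()]

print(split_sentences("今天天气很好。我们去公园吧!真的吗?"))
# ['今天天气很好。', '我们去公园吧!', '真的吗?']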
3. Loading data
load.py loads the dataset the fsdata way; again, simply run it. After the first run, subsequent loads of the 180GB dataset take only seconds.
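fsdata's own API is not reproduced here. As a rough analogue (an assumption, using the HuggingFace datasets library), the idea is to materialize the corpus into an Arrow cache once, after which loads are memory-mapped and near-instant:

from datasets import load_dataset, load_from_disk

cache_path = "corpus_fsdata"          # hypothetical cache directory
try:
    ds = load_from_disk(cache_path)   # fast path: memory-mapped Arrow files
except FileNotFoundError:
    ds = load_dataset("json", data_files="corpus/*.json", split="train")  # hypothetical path to the split .json files
    ds.save_to_disk(cache_path)       # first run: materialize the Arrow cache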
Token-level mask implementation
There is no NSP task; only the mask (MLM) task is used, with the following masking strategy:
- 15% of tokens are selected at random for masking; of those:
  - 80% are replaced with [MASK]
  - 10% are replaced with a random token
  - 10% are kept unchanged
- whole-word masking (wwm)
- n-gram masking
import jieba
import numpy as np
import torch


class DataCollate(object):
    def __init__(self, tokenizer, max_length, mask_rate=0.15, max_ngram=3, if_padding=True) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.word_cuter = jieba.cut                  # word segmenter
        self.vocab_length = len(tokenizer)           # vocabulary size
        self.mask_rate = mask_rate                   # proportion of tokens to mask
        self.ignore_labels = -100                    # label value ignored by the loss
        self.ngrams = np.arange(1, max_ngram + 1, dtype=np.int64)  # candidate n-gram lengths
        pvals = 1. / np.arange(1, max_ngram + 1)
        pvals /= pvals.sum(keepdims=True)            # p(n) = (1/n) / sum_k(1/k)
        self.pvals = pvals
        self.padding = if_padding

    def token_process(self, token_id):
        rand = np.random.random()
        if rand <= 0.8:    # 80%: replace with [MASK]
            return self.tokenizer.mask_token_id
        elif rand <= 0.9:  # 10%: keep unchanged
            return token_id
        else:              # 10%: replace with a random token
            return np.random.randint(1, self.vocab_length)

    def __call__(self, samples):
        input_ids = []
        attention_mask = []
        token_type_ids = []
        batch_labels = []
        # print('^-^ batch size :', len(samples))
        for sample in samples:
            word_list = list(self.word_cuter(sample['text']))  # list of words after segmentation
            mask_ids, labels = [], []
            record = []
            for i in range(len(word_list)):
                rands = np.random.random()
                if i in record:
                    continue
                word = word_list[i]
                if rands > self.mask_rate and len(word) < 4:  # not selected: keep the whole word unmasked
                    word_encode = self.tokenizer.encode(word, add_special_tokens=False)
                    for token in word_encode:
                        mask_ids.append(token)
                        labels.append(self.ignore_labels)
                    record.append(i)
                else:  # selected: mask a whole-word n-gram starting at this word
                    n = np.random.choice(self.ngrams, p=self.pvals)
                    for index in range(n):
                        ind = index + i
                        if ind in record or ind >= len(word_list):
                            continue
                        record.append(ind)
                        word = word_list[ind]
                        word_encode = self.tokenizer.encode(word, add_special_tokens=False)
                        for token in word_encode:
                            mask_ids.append(self.token_process(token))
                            labels.append(token)
            if self.padding:
                if len(mask_ids) > self.max_length:  # longer than max_length: truncate
                    input_ids.append(mask_ids[:self.max_length])
                    batch_labels.append(labels[:self.max_length])
                else:  # shorter than max_length: pad inputs with 0 and labels with -100 (excluded from the loss)
                    length = len(mask_ids)
                    mask_ids.extend([0] * (self.max_length - length))
                    labels.extend([-100] * (self.max_length - length))
                    input_ids.append(mask_ids)
                    batch_labels.append(labels)
            attention_mask.append([1] * self.max_length)
            token_type_ids.append([0] * self.max_length)
            # print('sentence:', sample['text'])
            # print('input_ids:', mask_ids)
            # print('decode inputids:', self.tokenizer.decode(mask_ids))
            # print('labels', labels)
            # print('decode labels:', self.tokenizer.decode(labels))
            # print('*' * 20)
        return {
            'input_ids': torch.tensor(input_ids),
            'labels': torch.tensor(batch_labels),
            'attention_mask': torch.tensor(attention_mask),
            'token_type_ids': torch.tensor(token_type_ids)
        }
Masked accuracy (ACC) computation
def comput_metrix(self, logits, labels):
    ones = torch.ones_like(labels)
    zero = torch.zeros_like(labels)
    mask = torch.where(labels < 0, zero, ones)
    mask = mask.view(size=(-1,)).float()
    # y_true = labels.view(size=(-1,)).float()
    y_pred = torch.argmax(logits, dim=-1)
    y_pred = y_pred.view(size=(-1,))
    y_true = labels.view(size=(-1,)).float()
    corr = torch.eq(y_pred, y_true)
    corr = torch.multiply(corr.float(), mask)
    acc = torch.sum(corr.float()) / torch.sum(mask)
    return acc
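A toy worked example of the masked accuracy above (the tensor values are illustrative); it reproduces the same steps on a tiny batch, where positions labelled -100 drop out of both the numerator and the denominator:

import torch

logits = torch.tensor([[[0.1, 2.0], [3.0, 0.2], [0.2, 1.5]]])   # (batch=1, seq=3, vocab=2)
labels = torch.tensor([[1, -100, 0]])                            # middle position is ignored
mask = torch.where(labels < 0, torch.zeros_like(labels), torch.ones_like(labels)).view(-1).float()
y_pred = torch.argmax(logits, dim=-1).view(-1)                   # -> [1, 0, 1]
corr = torch.eq(y_pred, labels.view(-1).float()).float() * mask  # -> [1., 0., 0.]
acc = corr.sum() / mask.sum()                                    # -> 0.5 (1 correct out of 2 counted)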
Differences between CP and SFT
- continue-pretrain
def training_step(self, batch, batch_idx):
    output = self.model(
        input_ids=batch['input_ids'],
        labels=batch['labels'])
    acc = self.comput_metrix(output.logits, batch['labels'])
    self.log('train_loss', output.loss, sync_dist=True)
    self.log('train_acc', acc, sync_dist=True)
    return output.loss
- sft:
def training_step(self, batch, batch_idx):
    output = self.model(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        labels=batch['labels'])
    acc = self.comput_metrix(output.logits, batch['labels'])
    self.log('train_loss', output.loss, sync_dist=True)
    self.log('train_acc', acc, sync_dist=True)
    return output.loss
- In pretraining, the attention mask is all 1s, i.e. every token in the sequence takes part in the computation.
- In finetuning, the mask has to distinguish the parts: only the response tokens get a mask of 1 (see the sketch after this list).
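A minimal sketch of the two conventions described above (the sequence lengths and the prompt/response split are illustrative). Note that the LlamaSFTCollator later in this document keeps the attention mask all ones and instead applies the prompt/response distinction to the labels, setting them to -100 on prompt positions:

import torch

# Pretrain: every token participates, so the mask is all ones.
seq_len = 8
pretrain_attention_mask = torch.ones(seq_len, dtype=torch.long)

# Finetune (as described above): only the response positions are marked with 1,
# e.g. a 5-token prompt followed by a 3-token response.
prompt_len, response_len = 5, 3
finetune_mask = torch.cat([
    torch.zeros(prompt_len, dtype=torch.long),
    torch.ones(response_len, dtype=torch.long),
])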
QA tokenizer (T5)
def regular_tokenize(self, sample):
    """
    sample.keys: question: str, context: str, answer: [], idx: int, ans_span: []
    """
    plain_text = (
        "question:"
        + sample["question"]
        + "knowledge:"
        + sample["context"][: self.max_knowledge_length]
    )
    l_text = len(plain_text)
    ctx_len = self.max_seq_length - l_text - 1
    if ctx_len > 0 and "history" in sample:
        context = "[SEP]".join(sample["history"])
        plain_text += "context:" + context

    res_prefix = self.tokenizer.encode("answer:", add_special_tokens=False)
    # res_prefix.tolist()
    l_rp = len(res_prefix)

    tokenized = self.tokenizer.encode(
        plain_text,
        add_special_tokens=False,
        truncation=True,
        max_length=self.max_seq_length - 2 - l_rp,
    )
    # tokenized.tolist()
    tokenized += res_prefix
    # add mask id
    mask_id = self.tokenizer.convert_tokens_to_ids("<extra_id_0>")
    tokenized.append(mask_id)
    tokenized.append(self.eos_token_id)
    # print(tokenized)

    target_ids = self.tokenizer.encode(
        "<extra_id_0>" + sample["answer"][0],
        add_special_tokens=True,
        truncation=True,
        max_length=self.max_target_length,
    )
    # print(target_ids)

    tokenized_sample = {}
    tokenized_sample["input_ids"] = np.array(tokenized, dtype=np.int32)
    tokenized_sample["attention_mask"] = np.ones(len(tokenized), dtype=np.int8)
    tokenized_sample["labels"] = np.array(target_ids, dtype=np.int32)
    tokenized_sample["idx"] = sample["idx"]
    # print(tokenized_sample)
    return tokenized_sample
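A hypothetical input sample for regular_tokenize (the field values below are made up); the comments summarize the encoder/decoder formats the method produces:

sample = {
    "question": "珠穆朗玛峰有多高?",
    "context": "珠穆朗玛峰的海拔为8848.86米。",
    "answer": ["8848.86米"],
    "idx": 0,
    "ans_span": [],
}
# Encoder input:  "question:" + question + "knowledge:" + context[:max_knowledge_length]
#                 (+ optional "context:" + history) + "answer:" + <extra_id_0> + eos
# Decoder target: "<extra_id_0>" + answer[0], i.e. the T5 sentinel-token format.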
Decoder mask (T5)
batch["decoder_input_ids"] = torch.tensor(
self.shift_tokens_right(
batch["labels"], self.pad_token_id, self.decoder_start_token_id
),
dtype=torch.long,
)
def shift_tokens_right(
self, input_ids: np.array, pad_token_id: int, decoder_start_token_id: int
) -> np.ndarray:
"""
Shift input ids one token to the right.
[decoder_start_token_id, input_ids[:-1] ]
example:
input_ids = [a,b,c,d]
shift_input_ids = [start_token,a,b,c]
"""
shifted_input_ids = np.zeros_like(input_ids)
shifted_input_ids[:, 1:] = input_ids[:, :-1]
shifted_input_ids[:, 0] = decoder_start_token_id
shifted_input_ids = np.where(
shifted_input_ids == -100, pad_token_id, shifted_input_ids
)
return shifted_input_ids
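A quick numpy check of the shift on a toy label batch (values are illustrative), assuming pad_token_id=0 and decoder_start_token_id=1; the -100 ignore labels become padding in the decoder inputs:

import numpy as np

labels = np.array([[5, 6, 7, -100]])
shifted = np.zeros_like(labels)
shifted[:, 1:] = labels[:, :-1]                  # shift right by one position
shifted[:, 0] = 1                                # decoder_start_token_id
shifted = np.where(shifted == -100, 0, shifted)  # -100 -> pad_token_id
# shifted == [[1, 5, 6, 7]]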
LLaMA SFT
from dataclasses import dataclass

@dataclass
class LlamaSFTCollator:
    '''
    Turns raw input samples into the final model inputs;
    the main processing logic lives in __call__.
    '''
    tokenizer: None              # tokenizer
    max_seq_length: int = 1536

    def __call__(self, samples):
        input_ids_list = []
        labels_list = []
        max_length = 0
        for s in samples:
            """
            sample: {
                "task":   str,
                "prompt": [str],
                "output": [str]
            }
            """
            prompt_cnt = min(len(s["prompt"]), len(s["output"]))
            # input_ids = self.tokenizer(prompt_prefix).input_ids
            input_ids = []
            labels_ids = [-100] * len(input_ids)
            for i in range(prompt_cnt):
                prompt_input_ids = self.tokenizer(prompt_without_output.format_map(
                    {"prompt": s["prompt"][i].strip()}), add_special_tokens=False).input_ids
                output_ids = self.tokenizer(s["output"][i].strip(), add_special_tokens=False).input_ids + [self.tokenizer.eos_token_id]

                input_ids += prompt_input_ids
                input_ids += output_ids
                labels_ids += [-100] * (len(prompt_input_ids)) + output_ids  # prompt part gets -100, i.e. excluded from the loss

            # input_ids += [self.tokenizer.eos_token_id]
            # labels_ids += [self.tokenizer.eos_token_id]
            max_length = min(max(len(input_ids), max_length), self.max_seq_length)
            input_ids_list.append(input_ids)
            labels_list.append(labels_ids)

        # PAD
        for i in range(len(input_ids_list)):
            labels_list[i] = pad(labels_list[i], -100, max_length)                               # pad labels with -100
            input_ids_list[i] = pad(input_ids_list[i], self.tokenizer.eos_token_id, max_length)  # pad inputs with the eos token

        model_inputs = {
            'input_ids': torch.tensor(input_ids_list).clone(),
            'attention_mask': torch.ones((len(input_ids_list), max_length)).clone(),
            "position_ids": torch.arange(0, max_length).unsqueeze(0).expand(len(input_ids_list), max_length).clone(),
            'labels': torch.tensor(labels_list).clone(),
        }
        return model_inputs
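The collator references prompt_without_output and pad(), which are defined elsewhere in the original code and not shown here; the stand-ins below are hypothetical sketches of what they could look like:

prompt_without_output = "<human>: {prompt}\n<bot>: "   # assumed prompt template, not the original one

def pad(ids, pad_id, max_length):
    # Truncate to max_length, or right-pad with pad_id up to max_length.
    if len(ids) > max_length:
        return ids[:max_length]
    return ids + [pad_id] * (max_length - len(ids))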