Pretraining Details


Data Processing

1. Splitting large files

Run auto_split.sh to split large files: any file over 1 GB is automatically split into 300 MB chunks.

It uses the split command to cut the file into 300 MB pieces, written to the original file's directory with a two-letter suffix; it then lists the directory with ls, greps for the files ending in that two-letter suffix, and renames them with xargs -n1 -i{} mv {} {}.json so every piece gets a .json extension.
split -a 2 -C 300M $p $file/$name- && ls|grep -E "(-[a-zA-Z]{2})" |xargs -n1 -i{} mv {} {}.json

2. Sentence splitting

Run preprocessing.py, whose main job is sentence splitting. This is not embedded in collate_fn because doing it there turned out to be slower, so it is done as a separate preprocessing step.
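
preprocessing.py itself is not reproduced here; a minimal sketch of this kind of sentence splitting, assuming the usual Chinese end-of-sentence punctuation (the function name and regex are illustrative, not the script's actual code):

import re

def split_sentences(text: str):
    # Split on Chinese sentence-ending punctuation (。!?;) and keep the delimiter
    # attached to the sentence it terminates.
    parts = re.split(r'([。!?;])', text)
    return [''.join(pair) for pair in zip(parts[0::2], parts[1::2]) if pair[0]]

print(split_sentences("今天天气很好。我们去公园吧!好不好?"))
# ['今天天气很好。', '我们去公园吧!', '好不好?']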

3. Loading the data

load.py loads the dataset in the fsdata format; again, just run it. After the first run, subsequent loads of the 180 GB dataset take only seconds.
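
The fsdata loader itself is not shown; the same "preprocess once, then memory-map" behaviour can be sketched with the Hugging Face datasets library (paths are illustrative assumptions):

from datasets import load_dataset, load_from_disk

# First run: read the raw JSON shards and cache them as Arrow files on disk.
ds = load_dataset("json", data_files="corpus/*.json", split="train")
ds.save_to_disk("corpus_arrow")

# Later runs: load_from_disk memory-maps the Arrow files,
# so even a very large corpus becomes available almost immediately.
ds = load_from_disk("corpus_arrow")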

Token-level mask implementation

No NSP task is used; only the masking (MLM) objective. The specific masking strategy is as follows:

  • 15% of tokens are randomly selected for masking
    • 80%: replaced with [MASK]
    • 10%: replaced with a random token
    • 10%: kept unchanged
  • whole word masking (wwm)
  • n-gram masking

The data collator below implements this strategy:
import jieba
import numpy as np
import torch


class DataCollate(object):

    def __init__(self, tokenizer, max_length, mask_rate=0.15, max_ngram=3, if_padding=True) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.word_cuter = jieba.cut          # word segmenter
        self.vocab_length = len(tokenizer)   # vocabulary size
        self.mask_rate = mask_rate           # proportion of words selected for masking
        self.ignore_labels = -100            # label value ignored by the loss
        self.ngrams = np.arange(1, max_ngram + 1, dtype=np.int64)  # candidate n-gram sizes 1..max_ngram
        pvals = 1. / np.arange(1, max_ngram + 1)
        pvals /= pvals.sum(keepdims=True)  # p(n) = (1/n) / sum_k(1/k)
        self.pvals = pvals
        self.padding = if_padding

    def token_process(self, token_id):
        rand = np.random.random()
        if rand <= 0.8:    # 80%: replace with [MASK]
            return self.tokenizer.mask_token_id
        elif rand <= 0.9:  # 10%: keep the token unchanged
            return token_id
        else:              # 10%: replace with a random token from the vocabulary
            return np.random.randint(1, self.vocab_length)

    def __call__(self, samples):
        input_ids = []
        attention_mask = []
        token_type_ids = []
        batch_labels = []
        for sample in samples:
            word_list = list(self.word_cuter(sample['text']))  # list of words after jieba segmentation
            mask_ids, labels = [], []

            record = []
            for i in range(len(word_list)):  # iterate over the words
                rands = np.random.random()
                if i in record:
                    continue
                word = word_list[i]
                if rands > self.mask_rate and len(word) < 4:  # not selected: keep the tokens and ignore them in the loss
                    word_encode = self.tokenizer.encode(word, add_special_tokens=False)
                    for token in word_encode:
                        mask_ids.append(token)
                        labels.append(self.ignore_labels)
                    record.append(i)
                else:  # whole-word / n-gram masking
                    n = np.random.choice(self.ngrams, p=self.pvals)  # sample the n-gram length
                    for index in range(n):
                        ind = index + i
                        if ind in record or ind >= len(word_list):
                            continue
                        record.append(ind)
                        word = word_list[ind]
                        word_encode = self.tokenizer.encode(word, add_special_tokens=False)
                        for token in word_encode:
                            mask_ids.append(self.token_process(token))
                            labels.append(token)
            if self.padding:
                if len(mask_ids) > self.max_length:  # truncate sequences longer than max_length
                    input_ids.append(mask_ids[:self.max_length])
                    batch_labels.append(labels[:self.max_length])
                else:  # pad shorter sequences with 0; padded labels are -100 so they do not contribute to the loss
                    length = len(mask_ids)
                    mask_ids.extend([0] * (self.max_length - length))
                    labels.extend([-100] * (self.max_length - length))
                    input_ids.append(mask_ids)
                    batch_labels.append(labels)
            attention_mask.append([1] * self.max_length)
            token_type_ids.append([0] * self.max_length)

        return {
            'input_ids': torch.tensor(input_ids),
            'labels': torch.tensor(batch_labels),
            'attention_mask': torch.tensor(attention_mask),
            'token_type_ids': torch.tensor(token_type_ids)
        }
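
A minimal usage sketch for the collator, assuming a BERT-style Chinese tokenizer (the checkpoint name and toy dataset are illustrative):

from torch.utils.data import DataLoader
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")
collate_fn = DataCollate(tokenizer, max_length=512)

dataset = [{"text": "今天天气很好。我们去公园吧!"}] * 8  # toy dataset
loader = DataLoader(dataset, batch_size=4, collate_fn=collate_fn)

batch = next(iter(loader))
print(batch["input_ids"].shape, batch["labels"].shape)  # torch.Size([4, 512]) torch.Size([4, 512])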

Accuracy (ACC) computation with mask

def comput_metrix(self, logits, labels):
        ones = torch.ones_like(labels)
        zero = torch.zeros_like(labels)
        mask = torch.where(labels < 0, zero, ones)  # positions labelled -100 are excluded from the metric
        mask = mask.view(size=(-1,)).float()

        y_pred = torch.argmax(logits, dim=-1)
        y_pred = y_pred.view(size=(-1,))
        y_true = labels.view(size=(-1,)).float()
        corr = torch.eq(y_pred, y_true)
        corr = torch.multiply(corr.float(), mask)  # count correctness only at supervised positions
        acc = torch.sum(corr.float()) / torch.sum(mask)
        return acc
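
A tiny worked example: only the positions whose label is not -100 enter the metric, so one correct prediction out of two counted positions gives 0.5 (the logits and labels are made up):

import torch

logits = torch.tensor([[[0.1, 0.9], [0.8, 0.2], [0.7, 0.3]]])  # (batch=1, seq=3, vocab=2)
labels = torch.tensor([[1, -100, 1]])                          # middle position is ignored

mask = torch.where(labels < 0, torch.zeros_like(labels), torch.ones_like(labels)).view(-1).float()
y_pred = torch.argmax(logits, dim=-1).view(-1)                 # -> [1, 0, 0]
y_true = labels.view(-1).float()
acc = (torch.eq(y_pred, y_true).float() * mask).sum() / mask.sum()
print(acc)  # tensor(0.5000)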

Differences between CP and SFT

  • continue-pretrain
 def training_step(self, batch, batch_idx):
        output = self.model(
            input_ids=batch['input_ids'],
            labels=batch['labels'])
        acc = self.comput_metrix(output.logits, batch['labels'])
        self.log('train_loss', output.loss, sync_dist=True)
        self.log('train_acc', acc, sync_dist=True)
        return output.loss
  • sft:
 def training_step(self, batch, batch_idx):
        output = self.model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels'])
        acc = self.comput_metrix(output.logits, batch['labels'])
        self.log('train_loss', output.loss, sync_dist=True)
        self.log('train_acc', acc, sync_dist=True)
        return output.loss
  • For pretraining, the attention mask is all 1s, i.e., every token in the sequence takes part in the computation.
  • For finetuning, the attention mask has to be distinguished: only the response part gets an attention mask of 1 (see the sketch below).
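
A schematic contrast of how the supervision differs (token ids are made up; the SFT side mirrors the LlamaSFTCollator shown later, where the prompt is excluded from the loss via labels of -100):

# Continued pretraining: every token is supervised.
input_ids = [101, 234, 567, 890, 102]
cp_labels = input_ids[:]                       # loss is computed at every position

# SFT: prompt tokens are masked out of the loss with -100; only the response is learned.
prompt_ids = [101, 234, 567]
response_ids = [890, 102]
sft_input_ids = prompt_ids + response_ids
sft_labels = [-100] * len(prompt_ids) + response_ids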

QA tokenizer (T5)

 def regular_tokenize(self, sample):
        """
        sample.keys:question:str,context:stc, answer:[],idx:int,ans_span:[]
        """
        plain_text = (
            "question:"
            + sample["question"]
            + "knowledge:"
            + sample["context"][: self.max_knowledge_length]
        )
        l_text = len(plain_text)

        ctx_len = self.max_seq_length - l_text - 1
        if ctx_len > 0 and "history" in sample:
            context = "[SEP]".join(sample["history"])
            plain_text += "context:" + context

        res_prefix = self.tokenizer.encode("answer:", add_special_tokens=False)
        l_rp = len(res_prefix)

        tokenized = self.tokenizer.encode(
            plain_text,
            add_special_tokens=False,
            truncation=True,
            max_length=self.max_seq_length - 2 - l_rp,
        )
        tokenized += res_prefix
        # append the <extra_id_0> sentinel as the answer placeholder, then EOS
        mask_id = self.tokenizer.convert_tokens_to_ids("<extra_id_0>")
        tokenized.append(mask_id)
        tokenized.append(self.eos_token_id)

        target_ids = self.tokenizer.encode(
            "<extra_id_0>" + sample["answer"][0],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_target_length,
        )

        tokenized_sample = {}
        tokenized_sample["input_ids"] = np.array(tokenized, dtype=np.int32)
        tokenized_sample["attention_mask"] = np.ones(len(tokenized), dtype=np.int8)
        tokenized_sample["labels"] = np.array(target_ids, dtype=np.int32)
        tokenized_sample["idx"] = sample["idx"]
        return tokenized_sample
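
To make the source/target layout concrete, here is a standalone sketch of the same construction for a hypothetical sample (checkpoint, field values, truncation and history handling are all simplified assumptions):

from transformers import AutoTokenizer

# Any T5-style tokenizer with <extra_id_*> sentinel tokens works; this checkpoint is illustrative.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

sample = {"question": "中国的首都是哪里?", "context": "北京是中华人民共和国的首都。", "answer": ["北京"]}

# Encoder input: question + knowledge, then the "answer:" prefix and the <extra_id_0> sentinel.
source = "question:" + sample["question"] + "knowledge:" + sample["context"] + "answer:"
input_ids = tokenizer.encode(source, add_special_tokens=False)
input_ids.append(tokenizer.convert_tokens_to_ids("<extra_id_0>"))
input_ids.append(tokenizer.eos_token_id)

# Decoder target: the sentinel followed by the gold answer.
labels = tokenizer.encode("<extra_id_0>" + sample["answer"][0], add_special_tokens=True)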

Decoder mask (T5)

   batch["decoder_input_ids"] = torch.tensor(
            self.shift_tokens_right(
                batch["labels"], self.pad_token_id, self.decoder_start_token_id
            ),
            dtype=torch.long,
        )
      
    def shift_tokens_right(
        self, input_ids: np.ndarray, pad_token_id: int, decoder_start_token_id: int
    ) -> np.ndarray:
        """
        Shift input ids one token to the right. 
        [decoder_start_token_id, input_ids[:-1] ] 
        example:
        input_ids = [a,b,c,d]
        shift_input_ids = [start_token,a,b,c] 
        """
        shifted_input_ids = np.zeros_like(input_ids)
        shifted_input_ids[:, 1:] = input_ids[:, :-1]
        shifted_input_ids[:, 0] = decoder_start_token_id

        # labels use -100 at ignored positions; replace those with the pad id in the decoder input
        shifted_input_ids = np.where(
            shifted_input_ids == -100, pad_token_id, shifted_input_ids
        )
        return shifted_input_ids
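
A quick standalone trace of the shift on a made-up batch (for T5 the decoder start token is the pad token, id 0):

import numpy as np

labels = np.array([[15, 27, 33, -100]])
pad_token_id, decoder_start_token_id = 0, 0

shifted = np.zeros_like(labels)
shifted[:, 1:] = labels[:, :-1]
shifted[:, 0] = decoder_start_token_id
shifted = np.where(shifted == -100, pad_token_id, shifted)
print(shifted)  # [[ 0 15 27 33]]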

LLaMA SFT

from dataclasses import dataclass

@dataclass
class LlamaSFTCollator:
    '''
    Builds the final model inputs from raw samples;
    the main processing logic lives in __call__.
    '''
    tokenizer: None        # tokenizer
    max_seq_length: 1536   # maximum sequence length
    def __call__(self, samples):
        input_ids_list = []
        labels_list = []
        max_length = 0
        for s in samples:
            """
            sample: {
                "task" : str,
                "prompt": [str]
                "output": [str]
                }
            """
            prompt_cnt = min(len(s["prompt"]), len(s["output"]))
            input_ids = []
            labels_ids = [-100] * len(input_ids)
            for i in range(prompt_cnt):
                # prompt_without_output is a prompt template string defined elsewhere
                prompt_input_ids = self.tokenizer(prompt_without_output.format_map(
                    {"prompt": s["prompt"][i].strip()}), add_special_tokens=False).input_ids
                output_ids = self.tokenizer(s["output"][i].strip(), add_special_tokens=False).input_ids + [self.tokenizer.eos_token_id]

                input_ids += prompt_input_ids
                input_ids += output_ids

                labels_ids += [-100] * (len(prompt_input_ids)) + output_ids  # prompt tokens get -100, i.e. excluded from the loss
            
            # input_ids += [self.tokenizer.eos_token_id]
            # labels_ids += [self.tokenizer.eos_token_id]
            max_length = min(max(len(input_ids), max_length), self.max_seq_length)
            input_ids_list.append(input_ids)
            labels_list.append(labels_ids)

        # PAD: pad every sequence in the batch to max_length
        for i in range(len(input_ids_list)):
            labels_list[i] = pad(labels_list[i], -100, max_length)  # labels padded with -100 (pad() is a helper defined elsewhere)
            input_ids_list[i] = pad(input_ids_list[i], self.tokenizer.eos_token_id, max_length)  # inputs padded with the eos token
        model_inputs = {
            'input_ids': torch.tensor(input_ids_list).clone(),
            'attention_mask': torch.ones((len(input_ids_list), max_length)).clone(),
            "position_ids": torch.arange(0, max_length).unsqueeze(0).expand(len(input_ids_list), max_length).clone(),
            'labels': torch.tensor(labels_list).clone(),
        }
        return model_inputs
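
A usage sketch, assuming everything lives in one script together with the collator above; prompt_without_output and pad are illustrative stand-ins for the helpers the collator expects, and the checkpoint name is an assumption:

import torch  # required by the collator's __call__
from torch.utils.data import DataLoader
from transformers import LlamaTokenizer

# Illustrative stand-ins for helpers defined elsewhere in the training script.
prompt_without_output = "<human>:{prompt}\n<bot>:"

def pad(ids, pad_id, max_length):
    # right-pad (or truncate) a list of token ids to max_length
    return ids[:max_length] + [pad_id] * (max_length - len(ids))

tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b")
collator = LlamaSFTCollator(tokenizer=tokenizer, max_seq_length=1536)

samples = [{"task": "qa", "prompt": ["What is 1+1?"], "output": ["2"]}]
loader = DataLoader(samples, batch_size=1, collate_fn=collator)
batch = next(iter(loader))
print(batch["input_ids"].shape, batch["labels"].shape)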