Fine-tuning a T5-based model, and an introduction to the corresponding data

When fine-tuning T5 on a low-end GPU, you can only train on one sample at a time. The training code is as follows:


# Usage: fine-tune ClueAI/ChatYuan-large-v1 (a T5 model) one sample at a time
import datetime
import json
import os
import random

import pandas as pd
import torch
import transformers
from torch.utils.tensorboard import SummaryWriter
from transformers import T5Tokenizer, T5ForConditionalGeneration

def preprocess(text):
  text = text.replace("\n", "\\n").replace("\t", "\\t")
  return text

def postprocess(text):
  return text.replace("\\n", "\n").replace("\\t", "\t")

def train():

  lr = 1.5e-4
  num_warmup_steps = 2000
  epochs = 3
  tb_writer = SummaryWriter(log_dir="t5/summary")
  output_dir = "t5/my_model/"
  batch_size = 1
  gradient_accumulation = 1
  max_grad_norm = 1
  log_step = 1
  # The training file must contain "input" and "label" columns.
  colum_data = pd.read_excel("data/rewrite_train.xlsx")

  data_json_list = json.loads(colum_data.to_json(force_ascii=False, orient="records"))

  # Total optimizer steps over the whole run: samples * epochs / (batch size * accumulation).
  total_steps = int(len(data_json_list) * epochs / batch_size / gradient_accumulation)

  if not os.path.exists(output_dir):
    os.mkdir(output_dir)

  tokenizer = T5Tokenizer.from_pretrained("ClueAI/ChatYuan-large-v1")
  model = T5ForConditionalGeneration.from_pretrained("ClueAI/ChatYuan-large-v1")
  # If running in Colab, set the notebook runtime to GPU for much faster training.
  model.train()
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)

  print('total steps: {}'.format(total_steps))

  # Note: transformers.AdamW is deprecated in newer transformers releases;
  # torch.optim.AdamW(model.parameters(), lr=lr) is the usual replacement.
  optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)

  scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                           num_warmup_steps=num_warmup_steps,
                                                           num_training_steps=total_steps)
  print('starting training')
  overall_step = 0
  running_loss = 0
  for epoch in range(epochs):
    print('epoch {}'.format(epoch + 1))
    now = datetime.datetime.now()
    print('time: {}'.format(now))
    random.shuffle(data_json_list)

    for step, each in enumerate(data_json_list):
      input_ids = tokenizer(preprocess(each.get("input")), return_tensors="pt").input_ids.long().to(device)
      labels = tokenizer(preprocess(each.get("label")), return_tensors="pt").input_ids.long().to(device)
      outputs = model(input_ids=input_ids, labels=labels)
      loss = outputs.loss

      if gradient_accumulation > 1:
        loss = loss / gradient_accumulation

      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

      #  optimizer step
      if (step + 1) % gradient_accumulation == 0:
        running_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()
        overall_step += 1
        if (overall_step + 1) % log_step == 0:
          tb_writer.add_scalar('loss', loss.item(), overall_step)
          print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
            datetime.datetime.now().hour,
            datetime.datetime.now().minute,
            step + 1,
            epoch + 1,
            running_loss / log_step))
          running_loss = 0
      if step % 10 == 0 and step >= 10:
        checkpoint_dir = output_dir + 'model_epoch{}_step{}'.format(epoch + 1, step)
        if not os.path.exists(checkpoint_dir):
          os.mkdir(checkpoint_dir)
        print('saving model for epoch {}, step {}'.format(epoch + 1, step))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(checkpoint_dir)
    print('saving model for epoch {}'.format(epoch + 1))
    epoch_dir = output_dir + 'model_epoch{}'.format(epoch + 1)
    if not os.path.exists(epoch_dir):
      os.mkdir(epoch_dir)
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(epoch_dir)
    print('epoch {} finished'.format(epoch + 1))

    then = datetime.datetime.now()
    print('time: {}'.format(then))
    print('time for one epoch: {}'.format(then - now))

  print('training finished')
  if not os.path.exists(output_dir + 'final_model'):
    os.mkdir(output_dir + 'final_model')
  model_to_save = model.module if hasattr(model, 'module') else model
  model_to_save.save_pretrained(output_dir + 'final_model')



print("begin train now")
train()
print("train end")

If you have the GPU budget for it, you can use the batched training approach instead:


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/2/27 16:39
# reference: https://github.com/Shivanandroy/T5-Finetuning-PyTorch
# Data download: https://pan.baidu.com/s/1cwKLNZD7-rsdETogacP2jw?pwd=mefc  (extraction code: mefc)
# @Author : sparkle_code_guy
import os
from torch.utils.tensorboard import SummaryWriter
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
from torch import cuda
import numpy as np
import pandas as pd

device = 'cuda' if cuda.is_available() else 'cpu'
class YourDataSetClass(Dataset):
  """
  Creating a custom dataset for reading the dataset and
  loading it into the dataloader to pass it to the
  neural network for finetuning the model

  """

  def __init__(
          self, dataframe, tokenizer, source_len, target_len, source_text, target_text
  ):
    """
    Initializes a Dataset class

    Args:
        dataframe (pandas.DataFrame): Input dataframe
        tokenizer (transformers.tokenizer): Transformers tokenizer
        source_len (int): Max length of source text
        target_len (int): Max length of target text
        source_text (str): column name of source text
        target_text (str): column name of target text
    """
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.target_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    """returns the length of dataframe"""

    return len(self.target_text)

  def __getitem__(self, index):
    """return the input ids, attention masks and target ids"""

    source_text = str(self.source_text[index])
    target_text = str(self.target_text[index])

    # cleaning data so as to ensure data is in string type
    source_text = " ".join(source_text.split())
    target_text = " ".join(target_text.split())

    source = self.tokenizer.batch_encode_plus(
      [source_text],
      max_length=self.source_len,
      truncation=True,
      padding="max_length",
      return_tensors="pt",
    )
    target = self.tokenizer.batch_encode_plus(
      [target_text],
      max_length=self.target_len,
      truncation=True,
      padding="max_length",
      return_tensors="pt",
    )

    source_ids = source["input_ids"].squeeze()
    source_mask = source["attention_mask"].squeeze()
    target_ids = target["input_ids"].squeeze()
    target_mask = target["attention_mask"].squeeze()

    return {
      "source_ids": source_ids.to(dtype=torch.long),
      "source_mask": source_mask.to(dtype=torch.long),
      "target_ids": target_ids.to(dtype=torch.long),
      "target_ids_y": target_ids.to(dtype=torch.long),
    }

def train(epoch, tokenizer, model, device, loader, optimizer, summary_writer, output_dir):

    """
    Function to be called for training with the parameters passed from main function
    """

    model.train()
    for step, data in enumerate(loader, 0):
      y = data["target_ids"].to(device, dtype=torch.long)
      y_ids = y[:, :-1].contiguous()
      lm_labels = y[:, 1:].clone().detach()
      lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100  # ignore pad positions in the loss
      ids = data["source_ids"].to(device, dtype=torch.long)
      mask = data["source_mask"].to(device, dtype=torch.long)

      outputs = model(
        input_ids=ids,
        attention_mask=mask,
        decoder_input_ids=y_ids,
        labels=lm_labels,
      )
      loss = outputs[0]
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      summary_writer.add_scalar('epoch/loss_{}'.format(epoch), loss.item(), step)

      if step % 100000 == 0 and step > 0:
        print("[Saving Model]...\n")
        # Save an intermediate checkpoint
        path = os.path.join(output_dir, 'model_epoch{}_step{}'.format(epoch + 1, step))
        if not os.path.exists(path):
          os.mkdir(path)
        model.save_pretrained(path)
        tokenizer.save_pretrained(path)

def T5Trainer(
        dataframe, source_text, target_text, model_params, output_dir="./outputs/"
):
  """
  T5 trainer
  """
  if not os.path.exists(output_dir):
    os.mkdir(output_dir)
  # Set random seeds and deterministic pytorch for reproducibility
  torch.manual_seed(model_params["SEED"])  # pytorch random seed
  np.random.seed(model_params["SEED"])  # numpy random seed

  # logging
  print(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # tokenzier for encoding the text
  tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

  # Define the model. T5ForConditionalGeneration already carries the language-model
  # head used for generation; the model is then moved to the device (GPU/CPU).
  model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
  model = model.to(device)

  # logging
  print(f"[Data]: Reading data...\n")

  # Importing the raw dataset
  dataframe = dataframe[[source_text, target_text]]

  # Creation of Dataset and Dataloader
  # Define the train split. With train_size = 1 the whole dataset is used for
  # training (no validation split); lower it (e.g. 0.8) to hold data out.
  train_size = 1
  train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
  train_dataset = train_dataset.reset_index(drop=True)

  print(f"FULL Dataset: {dataframe.shape}")
  print(f"TRAIN Dataset: {train_dataset.shape}")

  # Creating the Training and Validation dataset for further creation of Dataloader
  training_set = YourDataSetClass(
    train_dataset,
    tokenizer,
    model_params["MAX_SOURCE_TEXT_LENGTH"],
    model_params["MAX_TARGET_TEXT_LENGTH"],
    source_text,
    target_text,
  )

  # Defining the parameters for creation of dataloaders
  train_params = {
    "batch_size": model_params["TRAIN_BATCH_SIZE"],
    "shuffle": True,
    "num_workers": 0,
  }


  # Create the DataLoader that feeds batches to the training loop.
  training_loader = DataLoader(training_set, **train_params)

  # Defining the optimizer that will be used to tune the weights of the network in the training session.
  optimizer = torch.optim.Adam(
    params=model.parameters(), lr=model_params["LEARNING_RATE"]
  )

  # Training loop
  print(f"[Initiating Fine Tuning]...\n")

  summary_writer = SummaryWriter(log_dir="t5/summary_task")
  for epoch in range(model_params["TRAIN_EPOCHS"]):
    train(epoch, tokenizer, model, device, training_loader, optimizer, summary_writer, output_dir)
    print(f"[Saving Model]...\n")
    # Saving the model after training
    path = os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1))
    if not os.path.exists(path):
      os.mkdir(path)
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)
  print(
    f"[Model] Checkpoints saved under {output_dir}\n"
  )


if __name__ == '__main__':
  model_params = {
    "MODEL": "ClueAI/ChatYuan-large-v1",  # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE": 8,  # training batch size
    "TRAIN_EPOCHS": 3,  # number of training epochs
    "LEARNING_RATE": 1e-4,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 768,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 512,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
  }
  train_dataframe = pd.read_csv("data/new_data.txt", sep='\t')
  T5Trainer(train_dataframe, "input", "label", model_params)
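
The batch script reads data/new_data.txt as a tab-separated file whose header names the input and label columns (matching the T5Trainer call above). A minimal sketch for writing a compatible file, again using made-up toy rows:

# Write a toy data/new_data.txt in the tab-separated layout pd.read_csv expects:
# a header line, then one "input<TAB>label" pair per row (no raw tabs/newlines in cells).
import os
import pandas as pd

os.makedirs("data", exist_ok=True)
toy = pd.DataFrame({
    "input": ["用户:帮我改写这句话。小智:"],  # hypothetical source prompt
    "label": ["好的,这是改写后的句子。"],       # hypothetical target rewrite
})
toy.to_csv("data/new_data.txt", sep="\t", index=False)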

Training dataset:

Link: Baidu Netdisk (enter the extraction code)

Extraction code: nrb9

On the application side of the model:


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/2/27 16:39
# @Author : sparkle_code_guy
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("ClueAI/ChatYuan-large-v1")
model = T5ForConditionalGeneration.from_pretrained("ClueAI/ChatYuan-large-v1")
import torch

# If running in Colab, set the notebook runtime to GPU for faster inference.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

def preprocess(text):
    text = text.replace("\n", "\\n").replace("\t", "\\t")
    return text

def postprocess(text):
    return text.replace("\\n", "\n").replace("\\t", "\t")

def answer(text, sample=True, top_p=1, temperature=0.7):
    '''sample: whether to sample; generation tasks can set this to True.
    top_p: between 0 and 1; higher values give more diverse output.'''
    text = preprocess(text)
    print(len(text))  # debug: length of the preprocessed input
    encoding = tokenizer(text=[text], truncation=True, padding=True, max_length=768, return_tensors="pt").to(device)
    if not sample:
        out = model.generate(**encoding, return_dict_in_generate=True, output_scores=False, max_new_tokens=512,
                             num_beams=1, length_penalty=0.6)
    else:
        out = model.generate(**encoding, return_dict_in_generate=True, output_scores=False, max_new_tokens=512,
                             do_sample=True, top_p=top_p, temperature=temperature, no_repeat_ngram_size=3)
    out_text = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)
    return postprocess(out_text[0])

def rewrite_message(input):
    print("query message:",input)
    answer_message_list = []
    for i in range(4):
        answer_message_list.append("Option {0}: ".format(i) + answer_message(input))

    return "\n\n".join(answer_message_list)

def answer_message(input):
    input_format = input.replace("\n", "。")
    # "用户:"/"小智:" is the dialogue format ChatYuan-large-v1 expects, so it is kept verbatim.
    input_text = "用户:" + input_format + "\n小智:"
    output_text = answer(input_text)
    return f"{output_text}"

import gradio as gr

examples_list = [
                 "example1"]

synthesis_interface = gr.Interface(rewrite_message,
                                   inputs=gr.components.Textbox(lines=10, interactive=True, placeholder="enter your question ..."),
                                   outputs=gr.components.Textbox(lines=10, interactive=False),
                                   cache_examples=False,
                                   title="Q&A",
                                   examples_per_page=5,
                                   examples=examples_list,
                                   live=False)
synthesis_interface.launch(share=False, server_name='0.0.0.0', server_port=7860)
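
With the app running, the UI is served at http://0.0.0.0:7860. The generation helpers can also be exercised without Gradio; a minimal sketch, assuming the functions above are in scope and using a made-up query string (run this instead of launch()):

# Quick smoke test of the generation helpers defined above.
query = "帮我把这句话改写得更正式一些。"  # hypothetical user query
print(answer("用户:" + query + "\n小智:", sample=False))  # greedy decoding
print(rewrite_message(query))  # four sampled rewrites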