05-28 周二 TTFT， ITL， TGS 计算过程以及LLama2推理代码调试过程

0-21

已于 2024-05-29 20:38:30 修改

阅读量3.5k

点赞数 19

文章标签：推理

于 2024-05-29 20:37:34 首次发布

本文链接：https://blog.csdn.net/lk142500/article/details/139305261

版权

05-28 周二 LLama2推理代码调试过程

时间	版本	修改人	描述
2024年5月28日15:03:49	V0.1	宋全恒	新建文档

简介

本文主要用于求解大模型推理过程中的几个指标：

主要是 TTFT（Time To First Token，首 token 延迟）、ITL（Inter-Token Latency，token 间延迟）和 TGS。

在这里插入图片描述

代码片段

import os

# Name of the local checkpoint to benchmark, and the directory it is loaded from.
model_name = "Llama-2-7b-hf"
data_dir = "/workspace/models/" + model_name

import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import LlamaForCausalLM, LlamaTokenizer

import time
import torch
# from ixformer.inference.models.chatglm2_6b import ChatGLMForConditionalGeneration
# from torch.cuda import profiler

import argparse
import pickle
# from thop import profile
import json
from datetime import datetime



def _timed_generate(model, input_ids, max_new_tokens):
    """Run one greedy ``model.generate`` call and time it.

    Args:
        model: Model object exposing ``generate``.
        input_ids: Prompt token ids, already on the CUDA device.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        Tuple ``(elapsed_ms, generated_ids)`` where ``elapsed_ms`` is the wall
        time of the call in milliseconds and ``generated_ids`` is whatever
        ``model.generate`` returned.
    """
    # CUDA kernels are launched asynchronously; synchronize on both sides so
    # the measured window covers exactly the work of this call.
    torch.cuda.synchronize()
    start = time.perf_counter()
    generated_ids = model.generate(input_ids, max_new_tokens=max_new_tokens, max_length=None, do_sample=False)
    torch.cuda.synchronize()
    elapsed_ms = (time.perf_counter() - start) * 1000
    return elapsed_ms, generated_ids


def main(args=None):
    """Measure TTFT and ITL for the model in ``data_dir`` and write them to a file.

    Loads the request list from the pickle file ``input_request_list``; each
    entry is indexed as ``entry[0]`` (the query text) and ``entry[1]`` (the
    prompt length used for filtering). For every entry whose length is one of
    the tested sizes, the function:

    1. runs one untimed warm-up generation,
    2. averages two timed single-token generations as TTFT (ms),
    3. averages two timed 50-token generations, converted to ITL (ms) as
       ``(total_ms - TTFT) / generated_tokens``,
    4. appends the results to ``result_<model_name>_fp16_<timestamp>.txt``.

    Args:
        args: Unused. Optional so that both ``main()`` and ``main(args)`` work.
    """
    tokenizer = AutoTokenizer.from_pretrained(data_dir, trust_remote_code=True, device_map="auto")
    model = transformers.AutoModelForCausalLM.from_pretrained(data_dir, trust_remote_code=True, device_map='auto')

    tested_lens = {32, 64, 128, 256, 512, 1024, 2048}
    repeats = 2          # timed runs averaged per metric
    itl_new_tokens = 50  # generation length used for the ITL measurement

    # NOTE(review): the file name says "fp16" but no torch_dtype is passed to
    # from_pretrained above - confirm the precision the model is loaded in.
    current_time = datetime.now().strftime("%Y%m%d%H%M%S")
    res_file = "result_" + model_name + "_fp16_" + current_time + ".txt"
    print(f"res_file {res_file}")

    # Load the requests up front so the input file is closed before the
    # benchmark starts and no empty result file is left behind if it is missing.
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only use an input_request_list that comes from a trusted source.
    with open("input_request_list", "rb") as f:
        input_request_list = pickle.load(f)

    with open(res_file, "w") as f_result:
        for input_request in input_request_list:
            print(input_request)
            test_len = input_request[1]
            if test_len not in tested_lens:
                continue
            print("testing len:{}...".format(test_len))

            query = input_request[0]
            inputs = tokenizer(query, return_tensors='pt').to('cuda')
            input_ids = inputs.input_ids
            prompt_tokens = input_ids.shape[1]

            # Warm-up run: its result and duration are deliberately discarded.
            model.generate(input_ids, max_new_tokens=1, max_length=None, do_sample=False)

            print("start TTFT test...")
            TTFT_list = []
            for _ in range(repeats):
                TTFT, _ = _timed_generate(model, input_ids, 1)
                print(TTFT)
                TTFT_list.append(TTFT)
            TTFT = sum(TTFT_list) / len(TTFT_list)
            print("time to first token:{:.2f} ms".format(TTFT))

            print("start ITL test...")
            ITL_list = []
            out_tokens_num = 0
            for _ in range(repeats):
                total_ms, generate_ids = _timed_generate(model, input_ids, itl_new_tokens)
                # generate() returns prompt + continuation; count only new tokens.
                out_tokens_num = generate_ids.shape[1] - prompt_tokens
                print("out_tokens_num:{}".format(out_tokens_num))
                # NOTE(review): the divisor is the full number of new tokens even
                # though the first token's cost is already removed via TTFT; some
                # definitions divide by (tokens - 1) - confirm which is intended.
                ITL = (total_ms - TTFT) / out_tokens_num
                print(ITL)
                ITL_list.append(ITL)
            ITL = sum(ITL_list) / len(ITL_list)
            print("inter-token latency:{:.2f} ms".format(ITL))

            # out_tokens_num is the count from the last ITL run.
            f_result.write("In len:{}\n".format(test_len))
            f_result.write("Out len:{}\n".format(out_tokens_num))
            f_result.write("TTFT:{:.2f}\n".format(TTFT))
            f_result.write("ITL:{:.2f}\n".format(ITL))
            f_result.write("\n")
            # Flush per request so partial results survive an interrupted run.
            f_result.flush()



if __name__ == "__main__":
    # main() declares an `args` parameter, so it must be given one; parse the
    # command line (no options are defined yet) and pass the result through.
    main(argparse.ArgumentParser(description="LLM inference TTFT/ITL benchmark").parse_args())

调试过程

vscode配置调试代码

具体可以参见《05-16 周四 vscode 搭建远程调试环境》。

launch.json配置

{
   
    // 使用 IntelliSense 了解相关属性。 
    // 悬停以查看现有属性的描述。
    // 欲了解更多信息，请访问: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
   
            "name": "LLaMa2 推理",

最低0.47元/天解锁文章