05-28 Tuesday: TTFT, ITL, and TGS Calculation, and Debugging the LLama2 Inference Code

Date                 Version  Author   Description
2024-05-28 15:03:49  V0.1     宋全恒    Document created

Introduction

This article works through computing several inference metrics for large language models: mainly TTFT, ITL, and TGS.

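As measured by the script below: TTFT (time to first token) is the latency of a generation capped at one new token, covering prompt prefill plus the first decode step; ITL (inter-token latency) is the extra time a longer generation takes beyond TTFT, averaged over the tokens it produces; TGS (token generation speed) then falls out as the reciprocal of ITL. A minimal sketch of these relationships (compute_metrics is an illustrative helper, not part of the original script):

def compute_metrics(first_token_ms: float, total_ms: float, num_new_tokens: int):
    """Derive the three metrics from two timed generate() calls.

    first_token_ms: latency of a max_new_tokens=1 run (prefill + first token).
    total_ms:       latency of a longer run producing num_new_tokens tokens.
    """
    ttft = first_token_ms                     # time to first token, ms
    itl = (total_ms - ttft) / num_new_tokens  # inter-token latency, ms/token
    tgs = 1000.0 / itl                        # token generation speed, tokens/s
    return ttft, itl, tgs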

Code snippet

import os

data_dir = "/workspace/models/"
model_name = "Llama-2-7b-hf"
data_dir = os.path.join(data_dir, model_name)

import transformers
from transformers import AutoTokenizer

import time
import torch
# from ixformer.inference.models.chatglm2_6b import ChatGLMForConditionalGeneration
# from torch.cuda import profiler

import argparse
import pickle
# from thop import profile
from datetime import datetime



def main(args):
    # device_map is a model-loading option; the tokenizer does not need it.
    tokenizer = AutoTokenizer.from_pretrained(data_dir, trust_remote_code=True)
    # Load in fp16 to match the "fp16" tag written into the result file name.
    model = transformers.AutoModelForCausalLM.from_pretrained(
        data_dir, trust_remote_code=True, device_map="auto", torch_dtype=torch.float16
    )



    INPUT_LEN = [32,64,128,256,512,1024,2048]
    # INPUT_LEN = [1024, 2048]
    current_time = datetime.now().strftime("%Y%m%d%H%M%S")
    res_file = "result_" + model_name + "_fp16_" + current_time + ".txt"
    print(f"res_file: {res_file}")
    with open(res_file, "w") as f_result:
        with open("input_request_list","rb") as f:
            input_request_list = pickle.load(f)
            for input_request in input_request_list:

                print(input_request)
                test_len = input_request[1]
                if test_len not in INPUT_LEN:
                    continue
                print("testing len:{}...".format(test_len))

                query, prompt_len, output_len = input_request

                inputs = tokenizer(query, return_tensors='pt').to('cuda')
                # Warm-up generation so the timed runs below are not skewed by
                # one-off initialization (CUDA context, caches, etc.).
                generate_ids = model.generate(inputs.input_ids, max_new_tokens=1, max_length=None, do_sample=False)
                # response, _ = model.chat(tokenizer, query, max_new_tokens=1, do_sample=False, history=[])
                #torch.cuda.synchronize()

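                # TTFT: time a generation capped at one new token, so each
                # measurement covers prompt prefill plus the first decode step.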
                print("start TTFT test...")
                TTFT_list = []
                for _ in range(2):
                    start_time = time.time()
                    generate_ids = model.generate(inputs.input_ids, max_new_tokens=1, max_length=None, do_sample=False)
                    # response, _ = model.chat(tokenizer, query, do_sample=False, max_new_tokens=1,max_length=None, history=[])
                    #torch.cuda.synchronize()
                    end_time = time.time()

                    TTFT = (end_time - start_time) * 1000
                    print(TTFT)
                    TTFT_list.append(TTFT)
                TTFT = sum(TTFT_list)/len(TTFT_list)
                print("time to first token:{:.2f} ms".format(TTFT))

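                # ITL: time a longer generation (50 new tokens), subtract the
                # prefill cost (TTFT), and average the remainder over the
                # number of tokens actually produced.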
                print("start ITL test...")
                ITL_list = []
                out_tokens_num = 0
                for _ in range(2):
                    start_time = time.time()
                    generate_ids = model.generate(inputs.input_ids, max_new_tokens=50, max_length=None, do_sample=False)
                    outputs = generate_ids.tolist()[0][len(inputs["input_ids"][0]):]
                    # response, _ = model.chat(tokenizer, query, max_new_tokens=50, do_sample=False, history=[])
                    #torch.cuda.synchronize()
                    end_time = time.time()

                    # out_tokens_num = len(tokenizer(response).input_ids)
                    out_tokens_num = len(outputs)
                    print("out_tokens_num:{}".format(out_tokens_num))
                    ITL = ((end_time - start_time) * 1000 - TTFT) / out_tokens_num
                    print(ITL)
                    ITL_list.append(ITL)
                ITL = sum(ITL_list) / len(ITL_list)
                print("inter-token latency:{:.2f} ms".format(ITL))

                f_result.write("In len:{}\n".format(test_len))
                f_result.write("Out len:{}\n".format(out_tokens_num))
                f_result.write("TTFT:{:.2f}\n".format(TTFT))
                f_result.write("ITL:{:.2f}\n".format(ITL))
                f_result.write("\n")
                f_result.flush()



if __name__ == "__main__":
    # main(args) expects an argument; the original called main() without one.
    parser = argparse.ArgumentParser()
    main(parser.parse_args())
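The script assumes a pickle file named input_request_list in the working directory; its layout is implied by the unpacking query, prompt_len, output_len = input_request, i.e. a list of 3-tuples. A minimal sketch of producing a compatible file (the filler text and target lengths are illustrative, and re-encoding the decoded prompt can shift the token count by a token or two):

import pickle

from transformers import AutoTokenizer

# Layout inferred from main(): a list of (query, prompt_len, output_len) tuples.
tokenizer = AutoTokenizer.from_pretrained("/workspace/models/Llama-2-7b-hf")
input_request_list = []
for target_len in [32, 64, 128, 256, 512, 1024, 2048]:
    # Tile a filler phrase, then truncate to the target number of tokens.
    token_ids = tokenizer("hello world " * target_len).input_ids[:target_len]
    query = tokenizer.decode(token_ids, skip_special_tokens=True)
    input_request_list.append((query, target_len, 50))  # 50 matches max_new_tokens

with open("input_request_list", "wb") as f:
    pickle.dump(input_request_list, f)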

Debugging process

Configuring VS Code to debug the code

For details, see "05-16 Thursday: Setting up a remote debugging environment with VS Code".

launch.json configuration

{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "LLaMa2 推理",
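The snippet is truncated at this point in the original. For reference, a minimal complete configuration using the standard VS Code Python-debugger fields might look like the following (the program path and console settings are assumptions, not the author's actual values):

{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "LLaMa2 推理",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": true
        }
    ]
}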