1. Contents
- Quantization comparison: model.quantize(8)
- Quantization at inference time
- Customizing the model's generation strategy, i.e. modifying generation via LogitsProcessor; a custom strategy changes the output
2. Implementation
- Quantization comparison. The model is first loaded onto the CPU; it only moves to the GPU when cuda() (or .to("cuda")) is called. To reduce GPU memory usage, quantize first, then move the model to the GPU.
import pynvml
import torch
from transformers import AutoModel, AutoTokenizer

device = 1

def get_gpu_memory(device):
    """Print total/free/used memory of the given GPU, in GB, via NVML."""
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("GPU memory total (GB):", mem_info.total / 1024 ** 3)
    print("GPU memory free (GB): ", mem_info.free / 1024 ** 3)
    print("GPU memory used (GB): ", mem_info.used / 1024 ** 3)

def glm2_noquantize():
    get_gpu_memory(device)
    model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True)
    # Count parameters; numel() covers tensors of any rank
    total = 0
    for _, params in model.named_parameters():
        total += params.numel()
    print("glm2 params total: ", total)
    print(model)
    # Inspect one attention weight while the model is still on the CPU
    for layer in model.transformer.encoder.layers:
        weight = layer.self_attention.query_key_value.weight
        print(weight.dtype)
        print(weight)
        break
    # Move to the same GPU whose memory we are measuring
    model.to(torch.device(f"cuda:{device}"))
    for layer in model.transformer.encoder.layers:
        weight = layer.self_attention.query_key_value.weight
        print(weight.dtype)
        print(weight)
        break
    get_gpu_memory(device)
def glm2_quantize(bits=8):
    get_gpu_memory(device)
    model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True)
    total = 0
    for _, params in model.named_parameters():
        total += params.numel()
    print("glm2 params total: ", total)
    print("CPU")
    for layer in model.transformer.encoder.layers:
        weight = layer.self_attention.query_key_value.weight
        print(weight)
        break
    print("quantize")
    # Quantize the model. Combined with the .to(cuda) below, this is equivalent to:
    # model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True,
    #                                   load_in_8bit=True, device_map={"": 0})
    model = model.quantize(bits)
    # print(model)
    for layer in model.transformer.encoder.layers:
        weight = layer.self_attention.query_key_value.weight
        print(weight)
        break
    print("transfer tensor to GPU")  # move the quantized model to the GPU
    model.to(torch.device(f"cuda:{device}"))
    for layer in model.transformer.encoder.layers:
        weight = layer.self_attention.query_key_value.weight
        print(weight)
        break
    get_gpu_memory(device)

if __name__ == '__main__':
    glm2_noquantize()
    glm2_quantize()
Summary: after quantization, GPU memory usage drops and the parameter dtype changes.
model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True, load_in_8bit=True, device_map={"": 0})
is equivalent to model.quantize(8).cuda()
model.quantize(4).cuda()  # not available: this path offers no 4-bit quantization
model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True, load_in_4bit=True, device_map={"": 0})
is equivalent to
from transformers import BitsAndBytesConfig  # needed for the config below
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True, quantization_config=quantization_config)
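To verify that the two loading paths really produce the same footprint, transformers' get_memory_footprint() can be compared across variants. A minimal sketch, reusing the BitsAndBytesConfig above (the printed number is whatever your environment reports, not a figure from this article):

import torch
from transformers import AutoModel, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_4bit = AutoModel.from_pretrained(
    "../chatglm2-6b", trust_remote_code=True,
    quantization_config=quantization_config,
)
# get_memory_footprint() sums the bytes of all parameters and buffers,
# so the reduction relative to an fp16 load is visible directly.
print(model_4bit.get_memory_footprint() / 1024 ** 3, "GB")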
- Quantization at inference time
import time
import torch
from transformers import AutoModel, AutoTokenizer
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList

class InvalidScoreLogitsProcessor(LogitsProcessor):
    """From ChatGLM's modeling_chatglm.py: if the scores contain nan/inf,
    reset them and fall back to a fixed token so generation does not break."""
    def __call__(self, input_ids, scores):
        if torch.isnan(scores).any() or torch.isinf(scores).any():
            scores.zero_()
            scores[..., 5] = 5e4
        return scores

def glm2_quantize_inference():
    tokenizer = AutoTokenizer.from_pretrained("../chatglm2-6b", trust_remote_code=True)
    # Quantize to 8 bits, then move to the GPU
    model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True).quantize(8).cuda()
    max_length = 1024
    do_sample = True
    top_p = 0.9
    temperature = 0.95
    logits_processor = LogitsProcessorList()
    logits_processor.append(InvalidScoreLogitsProcessor())  # custom processor that rewrites the scores
    gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
                  "temperature": temperature, "logits_processor": logits_processor}
    text = "你好呀,请问你是谁?"
    t1 = time.time()
    input_ids = tokenizer([text], return_tensors="pt", padding=True)
    input_ids = input_ids.to("cuda")
    outputs = model.generate(**input_ids, **gen_kwargs)
    outputs = outputs.cpu().tolist()[0][len(input_ids["input_ids"][0]):]
    response = tokenizer.decode(outputs)
    total = len(outputs)  # number of generated tokens
    t2 = time.time()
    print(f"glm2_quantize_inference total time {round(t2 - t1, 4)} s "
          f"total tokens {total} each token time cost is {round(1000 * (t2 - t1) / total, 4)} ms")

if __name__ == '__main__':
    glm2_quantize_inference()
Summary: after quantization the generation loop takes longer, i.e. inference is slower, but GPU memory usage drops: speed is traded for memory.
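To quantify that trade-off, the fp16 and quantized models can be timed with the same harness. A minimal sketch; time_generate is a hypothetical helper, not part of the original script:

import time
import torch

def time_generate(model, tokenizer, text, n_runs=3, **gen_kwargs):
    """Return (average seconds per run, generated tokens of the last run)."""
    inputs = tokenizer([text], return_tensors="pt").to(next(model.parameters()).device)
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(n_runs):
        out = model.generate(**inputs, **gen_kwargs)
    torch.cuda.synchronize()
    new_tokens = out.shape[-1] - inputs["input_ids"].shape[-1]
    return (time.time() - t0) / n_runs, new_tokens

# Usage: call once on the fp16 model and once on the .quantize(8) model with
# identical gen_kwargs, then compare seconds per generated token.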
- Customizing the model's generation strategy, i.e. modifying generation via LogitsProcessor; a custom strategy changes the output.
A logits processor runs inside the generation loop: after each step's scores are computed, it post-processes them, changing the model's output probability distribution and thereby the tokens generated afterwards.
The transformers module provides a number of built-in processors that can be used directly. For a survey, see the earlier article 以beam search为例,详解transformers中generate方法(上):
https://blog.csdn.net/weixin_44826203/article/details/132108662
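As a quick illustration of one built-in processor before the custom example below: MinLengthLogitsProcessor masks the eos logit until the sequence reaches a minimum length, forcing longer answers. A minimal sketch, assuming the same ../chatglm2-6b checkpoint and that its tokenizer exposes eos_token_id:

from transformers import AutoModel, AutoTokenizer
from transformers.generation.logits_process import LogitsProcessorList, MinLengthLogitsProcessor

tokenizer = AutoTokenizer.from_pretrained("../chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True).cuda()

input_ids = tokenizer(["你好呀,请问你是谁?"], return_tensors="pt").input_ids.cuda()
# Built-in processor: sets the eos logit to -inf while the sequence is
# shorter than 50 tokens, so the model cannot stop early.
processors = LogitsProcessorList(
    [MinLengthLogitsProcessor(50, eos_token_id=tokenizer.eos_token_id)]
)
out = model.generate(input_ids=input_ids, max_length=150, logits_processor=processors)
print(tokenizer.decode(out[0]))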
Modifying the output strategy:
from transformers import AutoModel, AutoTokenizer
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList
import torch
from typing import List

tokenizer = AutoTokenizer.from_pretrained("../chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True).cuda()

class SuppressSpecificBOSTokenLogitsProcessor(LogitsProcessor):
    """
    Prevent certain tokens from being generated as the first token.
    """
    def __init__(self, bad_bos_token_id_list: List[int] = None):
        """
        :param bad_bos_token_id_list: ids of tokens that must not appear as the first generated token
        """
        self.bad_bos_token_id_list = bad_bos_token_id_list

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # input_ids.shape[-1] is the current sequence length;
        # current_token_len (the prompt length) is defined below.
        new_token_len = input_ids.shape[-1] - current_token_len
        if new_token_len == 0:  # we are choosing the first generated token
            for id_ in self.bad_bos_token_id_list:
                scores[:, id_] = -float('inf')
        return scores

# Ban the digits 0-9, "?", and token id 31514 as the first generated token
NUMBER_ID_LIST = []
for i in range(10):
    NUMBER_ID_LIST.append(tokenizer.convert_tokens_to_ids(str(i)))
NUMBER_ID_LIST.append(tokenizer.convert_tokens_to_ids("?"))
NUMBER_ID_LIST.append(31514)

logits_processor = LogitsProcessorList()
logits_processor.append(SuppressSpecificBOSTokenLogitsProcessor(NUMBER_ID_LIST))

with torch.no_grad():
    input_text = "你知道梦中情炉吗"
    ids = tokenizer.encode(input_text)
    input_ids = torch.LongTensor([ids]).cuda()
    current_token_len = input_ids.shape[1]  # prompt length, read by the processor above
    '''
    Input: [[64790, 64792, 36474, 31717, 47132, 54623, 56754, 55398]];
    the first generated token is the next character after the prompt.
    '''
    out = model.generate(
        input_ids=input_ids,
        max_length=150,
        do_sample=False,  # greedy decoding
        logits_processor=logits_processor  # custom control over the output
    )
out_text = tokenizer.decode(out[0])
answer = out_text.replace(input_text, "").replace("\nEND", "").strip()
print(answer)
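One design note on the listing above: the processor reads the module-level variable current_token_len, which couples it to the surrounding script. A variant that passes the prompt length through the constructor avoids that coupling. This is a sketch of an alternative, not the original author's code; SuppressBOSTokens is a hypothetical name, and it reuses the imports and NUMBER_ID_LIST from the listing above:

class SuppressBOSTokens(LogitsProcessor):
    """Same idea as SuppressSpecificBOSTokenLogitsProcessor, but the prompt
    length is passed in explicitly instead of read from a global."""
    def __init__(self, bad_bos_token_ids: List[int], prompt_len: int):
        self.bad_bos_token_ids = bad_bos_token_ids
        self.prompt_len = prompt_len

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        if input_ids.shape[-1] == self.prompt_len:  # choosing the first generated token
            scores[:, self.bad_bos_token_ids] = -float('inf')
        return scores

# logits_processor = LogitsProcessorList([SuppressBOSTokens(NUMBER_ID_LIST, input_ids.shape[1])])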