1. Contents
- Quantization comparison: model.quantize(8)
- Quantization at inference time
- Customizing the model's generation strategy, i.e. modifying generation via LogitsProcessor; a custom strategy changes the output
2. Implementation
- Quantization comparison. The model is first loaded onto the CPU; it only moves to the GPU when cuda() (or .to("cuda")) is called. To reduce GPU memory usage, quantize first, then move the model to the GPU.
import pynvml
import torch
from transformers import AutoModel, AutoTokenizer

device = 1

def get_gpu_memory(device):
    """Print total/free/used memory of the given GPU, in GB, via NVML."""
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("GPU memory total (GB):", mem_info.total / 1024 ** 3)
    print("GPU memory free (GB): ", mem_info.free / 1024 ** 3)
    print("GPU memory used (GB): ", mem_info.used / 1024 ** 3)

def glm2_noquantize():
    get_gpu_memory(device)
    model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True)
    # Count parameters; numel() covers tensors of any rank
    total = 0
    for _, params in model.named_parameters():
        total += params.numel()
    print("glm2 params total: ", total)
    print(model)
    # Inspect one attention weight while the model is still on the CPU
    for layer in model.transformer.encoder.layers:
        weight = layer.self_attention.query_key_value.weight
        print(weight.dtype)
        print(weight)
        break
    # Move to the same GPU whose memory we are measuring
    model.to(torch.device(f"cuda:{device}"))
    for layer in model.transformer.encoder.layers:
        weight = layer.self_attention.query_key_value.weight
        print(weight.dtype)
        print(weight)
        break
    get_gpu_memory(device)
def glm2_quantize(bits=8):
    get_gpu_memory(device)
    model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True)
    total = 0
    for _, params in model.named_parameters():
        total += params.numel()
    print("glm2 params total: ", total)
    print("CPU")
    for layer in model.transformer.encoder.layers:
        weight = layer.self_attention.query_key_value.weight
        print(weight)
        break
    print("quantize")
    # Quantize the model. Combined with the .to(cuda) below, this is equivalent to:
    # model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True,
    #                                   load_in_8bit=True, device_map={"": 0})
    model = model.quantize(bits)
    # print(model)
    for layer in model.transformer.encoder.layers:
        weight = layer.self_attention.query_key_value.weight
        print(weight)
        break
    print("transfer tensor to GPU")  # move the quantized model to the GPU
    model.to(torch.device(f"cuda:{device}"))
    for layer in model.transformer.encoder.layers:
        weight = layer.self_attention.query_key_value.weight
        print(weight)
        break
    get_gpu_memory(device)

if __name__ == '__main__':
    glm2_noquantize()
    glm2_quantize()
Summary: after quantization, GPU memory usage drops and the parameter dtype changes.
model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True, load_in_8bit=True, device_map={"": 0})
is equivalent to model.quantize(8).cuda()
model.quantize(4).cuda()  # not available: this path offers no 4-bit quantization
model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True, load_in_4bit=True, device_map={"": 0})
is equivalent to
from transformers import BitsAndBytesConfig  # needed for the config below
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True, quantization_config=quantization_config)
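To verify that the two loading paths really produce the same footprint, transformers' get_memory_footprint() can be compared across variants. A minimal sketch, reusing the BitsAndBytesConfig above (the printed number is whatever your environment reports, not a figure from this article):

import torch
from transformers import AutoModel, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_4bit = AutoModel.from_pretrained(
    "../chatglm2-6b", trust_remote_code=True,
    quantization_config=quantization_config,
)
# get_memory_footprint() sums the bytes of all parameters and buffers,
# so the reduction relative to an fp16 load is visible directly.
print(model_4bit.get_memory_footprint() / 1024 ** 3, "GB")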
- Quantization at inference time
import time
import torch
from transformers import AutoModel, AutoTokenizer
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList

class InvalidScoreLogitsProcessor(LogitsProcessor):
    """From ChatGLM's modeling_chatglm.py: if the scores contain nan/inf,
    reset them and fall back to a fixed token so generation does not break."""
    def __call__(self, input_ids, scores):
        if torch.isnan(scores).any() or torch.isinf(scores).any():
            scores.zero_()
            scores[..., 5] = 5e4
        return scores

def glm2_quantize_inference():
    tokenizer = AutoTokenizer.from_pretrained("../chatglm2-6b", trust_remote_code=True)
    # Quantize to 8 bits, then move to the GPU
    model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True).quantize(8).cuda()
    max_length = 1024
    do_sample = True
    top_p = 0.9
    temperature = 0.95
    logits_processor = LogitsProcessorList()
    logits_processor.append(InvalidScoreLogitsProcessor())  # custom processor that rewrites the scores
    gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
                  "temperature": temperature, "logits_processor": logits_processor}
    text = "你好呀,请问你是谁?"
    t1 = time.time()
    input_ids = tokenizer([text], return_tensors="pt", padding=True)
    input_ids = input_ids.to("cuda")
    outputs = model.generate(**input_ids, **gen_kwargs)
    outputs = outputs.cpu().tolist()[0][len(input_ids["input_ids"][0]):]
    response = tokenizer.decode(outputs)
    total = len(outputs)  # number of generated tokens
    t2 = time.time()
    print(f"glm2_quantize_inference total time {round(t2 - t1, 4)} s "
          f"total tokens {total} each token time cost is {round(1000 * (t2 - t1) / total, 4)} ms")

if __name__ == '__main__':
    glm2_quantize_inference()
Summary: after quantization the generation loop takes longer, i.e. inference is slower, but GPU memory usage drops: speed is traded for memory.
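To quantify that trade-off, the fp16 and quantized models can be timed with the same harness. A minimal sketch; time_generate is a hypothetical helper, not part of the original script:

import time
import torch

def time_generate(model, tokenizer, text, n_runs=3, **gen_kwargs):
    """Return (average seconds per run, generated tokens of the last run)."""
    inputs = tokenizer([text], return_tensors="pt").to(next(model.parameters()).device)
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(n_runs):
        out = model.generate(**inputs, **gen_kwargs)
    torch.cuda.synchronize()
    new_tokens = out.shape[-1] - inputs["input_ids"].shape[-1]
    return (time.time() - t0) / n_runs, new_tokens

# Usage: call once on the fp16 model and once on the .quantize(8) model with
# identical gen_kwargs, then compare seconds per generated token.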
- Customizing the model's generation strategy, i.e. modifying generation via LogitsProcessor; a custom strategy changes the output.
A logits processor runs inside the generation loop: after each step's scores are computed, it post-processes them, changing the model's output probability distribution and thereby the tokens generated afterwards.
The transformers module provides a number of built-in processors that can be used directly. For a survey, see the earlier article 以beam search为例,详解transformers中generate方法(上):
https://blog.csdn.net/weixin_44826203/article/details/132108662
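As a quick illustration of one built-in processor before the custom example below: MinLengthLogitsProcessor masks the eos logit until the sequence reaches a minimum length, forcing longer answers. A minimal sketch, assuming the same ../chatglm2-6b checkpoint and that its tokenizer exposes eos_token_id:

from transformers import AutoModel, AutoTokenizer
from transformers.generation.logits_process import LogitsProcessorList, MinLengthLogitsProcessor

tokenizer = AutoTokenizer.from_pretrained("../chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True).cuda()

input_ids = tokenizer(["你好呀,请问你是谁?"], return_tensors="pt").input_ids.cuda()
# Built-in processor: sets the eos logit to -inf while the sequence is
# shorter than 50 tokens, so the model cannot stop early.
processors = LogitsProcessorList(
    [MinLengthLogitsProcessor(50, eos_token_id=tokenizer.eos_token_id)]
)
out = model.generate(input_ids=input_ids, max_length=150, logits_processor=processors)
print(tokenizer.decode(out[0]))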
Modifying the output strategy:
from transformers import AutoModel, AutoTokenizer
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList
import torch
from typing import List

tokenizer = AutoTokenizer.from_pretrained("../chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("../chatglm2-6b", trust_remote_code=True).cuda()

class SuppressSpecificBOSTokenLogitsProcessor(LogitsProcessor):
    """
    Prevent certain tokens from being generated as the first token.
    """
    def __init__(self, bad_bos_token_id_list: List[int] = None):
        """
        :param bad_bos_token_id_list: ids of tokens that must not appear as the first generated token
        """
        self.bad_bos_token_id_list = bad_bos_token_id_list

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # input_ids.shape[-1] is the current sequence length;
        # current_token_len (the prompt length) is defined below.
        new_token_len = input_ids.shape[-1] - current_token_len
        if new_token_len == 0:  # we are choosing the first generated token
            for id_ in self.bad_bos_token_id_list:
                scores[:, id_] = -float('inf')
        return scores

# Ban the digits 0-9, "?", and token id 31514 as the first generated token
NUMBER_ID_LIST = []
for i in range(10):
    NUMBER_ID_LIST.append(tokenizer.convert_tokens_to_ids(str(i)))
NUMBER_ID_LIST.append(tokenizer.convert_tokens_to_ids("?"))
NUMBER_ID_LIST.append(31514)

logits_processor = LogitsProcessorList()
logits_processor.append(SuppressSpecificBOSTokenLogitsProcessor(NUMBER_ID_LIST))

with torch.no_grad():
    input_text = "你知道梦中情炉吗"
    ids = tokenizer.encode(input_text)
    input_ids = torch.LongTensor([ids]).cuda()
    current_token_len = input_ids.shape[1]  # prompt length, read by the processor above
    '''
    Input: [[64790, 64792, 36474, 31717, 47132, 54623, 56754, 55398]];
    the first generated token is the next character after the prompt.
    '''
    out = model.generate(
        input_ids=input_ids,
        max_length=150,
        do_sample=False,  # greedy decoding
        logits_processor=logits_processor  # custom control over the output
    )
out_text = tokenizer.decode(out[0])
answer = out_text.replace(input_text, "").replace("\nEND", "").strip()
print(answer)
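One design note on the listing above: the processor reads the module-level variable current_token_len, which couples it to the surrounding script. A variant that passes the prompt length through the constructor avoids that coupling. This is a sketch of an alternative, not the original author's code; SuppressBOSTokens is a hypothetical name, and it reuses the imports and NUMBER_ID_LIST from the listing above:

class SuppressBOSTokens(LogitsProcessor):
    """Same idea as SuppressSpecificBOSTokenLogitsProcessor, but the prompt
    length is passed in explicitly instead of read from a global."""
    def __init__(self, bad_bos_token_ids: List[int], prompt_len: int):
        self.bad_bos_token_ids = bad_bos_token_ids
        self.prompt_len = prompt_len

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        if input_ids.shape[-1] == self.prompt_len:  # choosing the first generated token
            scores[:, self.bad_bos_token_ids] = -float('inf')
        return scores

# logits_processor = LogitsProcessorList([SuppressBOSTokens(NUMBER_ID_LIST, input_ids.shape[1])])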