教育领域“知之”大模型--山东大学软件学院2024年项目实训（十）-CSDN博客

本文链接：https://blog.csdn.net/m0_62817302/article/details/139921976

# （4）模型量化

我们对微调后的模型进行awq（激活感知权重量化）量化。

AWQ（Activation-aware Weight Quantization）是一种用于大型语言模型（LLM）的权重量化技术，旨在减少模型大小和加速推理过程，同时保持模型性能。AWQ的核心思想是权重对模型性能的重要性并不相同，因此通过保护更重要的权重（通常是较小的一部分，如0.1%-1%），可以显著减少量化误差。

"""import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import AppUIArguments, ModelType, app_ui_main

app_ui_args = AppUIArguments(model_type=ModelType.deepseek_math_7b_instruct, quantization_bit=4)
app_ui_main(app_ui_args)"""

"""
CUDA_VISIBLE_DEVICES=0 swift export \
    --ckpt_dir output/qwen1half-7b-chat/vx-xxx/checkpoint-xxx \
    --quant_bits 4 --quant_method awq \
    --merge_lora true
"""

# Experimental environment: 3090
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import (
    ModelType, get_vllm_engine, get_default_template_type,
    get_template, inference_vllm, inference_stream_vllm
)
import torch

model_type = ModelType.deepseek_math_7b_instruct
model_id_or_path = 'output/qwen1half-7b-chat/vx-xxx/checkpoint-xxx-merged-awq-int4'
llm_engine = get_vllm_engine(model_type,
                             model_id_or_path=model_id_or_path,
                             max_model_len=4096)
template_type = get_default_template_type(model_type)
template = get_template(template_type, llm_engine.hf_tokenizer)
# 与`transformers.GenerationConfig`类似的接口
llm_engine.generation_config.max_new_tokens = 512

request_list = [{'query': '1+1=?'}, {'query': '2+2=?'}]
resp_list = inference_vllm(llm_engine, template, request_list)
for request, resp in zip(request_list, resp_list):
    print(f"query: {request['query']}")
    print(f"response: {resp['response']}")

# 流式
history1 = resp_list[1]['history']
query = "3+3=?"
request_list = [{'query': query, 'history': history1}]
gen = inference_stream_vllm(llm_engine, template, request_list)
print_idx = 0
print(f'query: {query}\nresponse: ', end='')
for resp_list in gen:
    request = request_list[0]
    resp = resp_list[0]
    response = resp['response']
    delta = response[print_idx:]
    print(delta, end='', flush=True)
    print_idx = len(response)
print()
print(f"history: {resp_list[0]['history']}")