Running with Python code
I'm not sure whether it's because my environment wasn't set up correctly, but vLLM always throws errors when I try to use it for model acceleration; with transformers (and diffusers) there are no problems.
transformers code:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load the quantized model onto a single GPU, using FlashAttention 2 for attention
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-72B-Instruct-AWQ",
    torch_dtype="auto",
    device_map="cuda:0",
    attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-72B-Instruct-AWQ")

# One user turn containing a local image plus a text instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "1.png"},
            {"type": "text", "text": "Describe this image in Chinese."},
        ],
    }
]

# Build the chat prompt and extract the image/video inputs
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Generate, then strip the prompt tokens so only the new text is decoded
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
vLLM code:
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info

MODEL_PATH = "Qwen/Qwen2.5-VL-72B-Instruct-AWQ"

# Load the model with vLLM; allow at most 10 images and 10 videos per prompt
llm = LLM(
    model=MODEL_PATH,
    limit_mm_per_prompt={"image": 10, "video": 10},
)
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.001,
    repetition_penalty=1.05,
    max_tokens=256,
    stop_token_ids=[],
)
image_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "1.png",
                "min_pixels": 224 * 224,
                "max_pixels": 1280 * 28 * 28,
            },
            {"type": "text", "text": "What is in this image?"},
        ],
    },
]

# Build the chat prompt and multi-modal inputs, then run offline generation
# (this follows the standard Qwen2.5-VL + vLLM offline-inference example)
processor = AutoProcessor.from_pretrained(MODEL_PATH)
prompt = processor.apply_chat_template(
    image_messages, tokenize=False, add_generation_prompt=True
)
image_inputs, _ = process_vision_info(image_messages)
outputs = llm.generate(
    [{"prompt": prompt, "multi_modal_data": {"image": image_inputs}}],
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
I simply could not get this code to run; I tried Docker and other approaches, but none of them worked, so I gave up and switched to deploying with the vLLM serving framework instead.
Deploying with the vLLM framework
The Docker command is below. Before deploying, make sure your server already has a local vLLM image, and ideally more than 40 GB of GPU memory.
docker run -it \
--name Qwen2.5-VL-7B-Instruct \
--gpus 2 \
-v /data:/data \
-p 5056:5056 \
--ipc=host \
vllm/vllm-openai:latest \
--model /data/H2413325/qianwen2.5VL/Qwen/Qwen2.5-VL-7B-Instruct \
--max-num-batched-tokens 131072 \
--max-num-seqs 4 \
--served-model-name Qwen2.5-VL-7B-Instruct \
--port 5056
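Once the container starts, the vLLM OpenAI-compatible server listens on the mapped port. Before sending any images, it can be worth checking that the endpoint is reachable. The minimal sketch below assumes the server is addressed as localhost:5056 (swap in your server's IP when calling it from another machine) and simply lists the served models.
import requests

# Assumed address of the vLLM OpenAI-compatible server started above;
# replace "localhost" with the server's IP when calling it remotely.
BASE_URL = "http://localhost:5056/v1"

# GET /v1/models lists the models the server is serving; the name set via
# --served-model-name ("Qwen2.5-VL-7B-Instruct") should appear here.
resp = requests.get(f"{BASE_URL}/models", timeout=10)
resp.raise_for_status()
for model in resp.json()["data"]:
    print(model["id"])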
Multimodal input
{
    "model": "Qwen2.5-VL-7B-Instruct",
    "messages": [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "http://10.211.74.58:8810/images/1.jpg"
                    }
                },
                {
                    "type": "text",
                    "text": "What are the coordinates of the man in this image?"
                }
            ]
        }
    ]
}
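To send this request, POST it to the server's /v1/chat/completions endpoint. Below is a minimal sketch using the requests library, assuming the server started above is reachable at localhost:5056 and that the image URL is accessible from inside the container; the payload mirrors the JSON above.
import requests

# Assumed server address; adjust the host and port to match your deployment.
URL = "http://localhost:5056/v1/chat/completions"

payload = {
    "model": "Qwen2.5-VL-7B-Instruct",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    # The image URL must be reachable from the server/container.
                    "image_url": {"url": "http://10.211.74.58:8810/images/1.jpg"},
                },
                {"type": "text", "text": "What are the coordinates of the man in this image?"},
            ],
        },
    ],
}

resp = requests.post(URL, json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])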
Response
{
    "id": "chatcmpl-f9188e1bdb574e5ebe457435892fc514",
    "object": "chat.completion",
    "created": 1741847606,
    "model": "Qwen2.5-VL-7B-Instruct",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "reasoning_content": null,
                "content": "The man in this image is at the center of the picture. Specifically, he is standing right in the middle of a platform, with a sunset and grass in the background.",
                "tool_calls": []
            },
            "logprobs": null,
            "finish_reason": "stop",
            "stop_reason": null
        }
    ],
    "usage": {
        "prompt_tokens": 3923,
        "total_tokens": 3954,
        "completion_tokens": 31,
        "prompt_tokens_details": null
    },
    "prompt_logprobs": null
}
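Because the server speaks the OpenAI chat-completions protocol, the official openai Python client can be used instead of raw HTTP. A minimal sketch follows, again assuming the localhost:5056 address; the api_key is just a placeholder since the server above was started without an --api-key.
from openai import OpenAI

# Assumed server address; any placeholder api_key works when the server
# was launched without --api-key.
client = OpenAI(base_url="http://localhost:5056/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen2.5-VL-7B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "http://10.211.74.58:8810/images/1.jpg"},
                },
                {"type": "text", "text": "Describe this image in Chinese."},
            ],
        },
    ],
    max_tokens=256,
)
print(completion.choices[0].message.content)
print(completion.usage)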