Qwen 2.5 VL: Multiple Inference Recipes
flyfish
Single-image inference
from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_path)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "output_frames/frame_0000.jpg",
            },
            {"type": "text", "text": "Describe the image."},
        ],
    }
]

text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
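The "image" field above points at a local file. According to the upstream Qwen2.5-VL examples, qwen_vl_utils also accepts other sources; the variants below are a sketch (the paths and URL are made-up placeholders), so verify them against your installed qwen_vl_utils version.

# Alternative image sources for the "image" field (sketch; placeholder values).
# Plain paths, file:// URIs, http(s) URLs and base64 data URIs are all handled
# by process_vision_info in the upstream examples.
content_variants = [
    {"type": "image", "image": "file:///media/data/frame_0000.jpg"},   # explicit file URI
    {"type": "image", "image": "https://example.com/some_image.jpg"},  # remote URL
    {"type": "image", "image": "data:image;base64,/9j/4AAQSkZJRg..."}, # inline base64 (truncated)
]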
Multi-image inference
from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# Default: load the model onto whatever devices are available
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
# )

# Local model path
model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"

# flash_attention_2 is recommended for better speed and lower memory use,
# especially in multi-image and video scenarios.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# Default processor
processor = AutoProcessor.from_pretrained(model_path)

# By default each image is mapped to between 4 and 16384 visual tokens.
# Set min_pixels and max_pixels as needed, e.g. a token range of 256-1280,
# to balance quality against cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# A single message containing several images plus a text query
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "output_frames/frame_0000.jpg"},
            {"type": "image", "image": "output_frames/frame_0001.jpg"},
            {"type": "text", "text": "Identify the similarities between these images."},
        ],
    }
]

# Prepare inputs for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
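Besides the processor-level min_pixels/max_pixels shown in the comments above, resolution can also be capped per image by adding keys to each image entry. The sketch below follows the upstream qwen_vl_utils conventions; check the exact keys (min_pixels, max_pixels, resized_height, resized_width) against your installed version.

# Per-image resolution control (sketch): these keys are read by process_vision_info,
# not by the processor, so they can differ between images in the same message.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "output_frames/frame_0000.jpg",
                "min_pixels": 256 * 28 * 28,
                "max_pixels": 1280 * 28 * 28,
            },
            {
                "type": "image",
                "image": "output_frames/frame_0001.jpg",
                "resized_height": 280,
                "resized_width": 420,
            },
            {"type": "text", "text": "Identify the similarities between these images."},
        ],
    }
]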
Video inference 1: list of frames
from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_path)

# A video can be passed as an ordered list of already-extracted frames.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": [
                    "output_frames/frame_0000.jpg",
                    "output_frames/frame_0001.jpg",
                    "output_frames/frame_0002.jpg",
                    "output_frames/frame_0003.jpg",
                ],
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
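The frame files used throughout this post are assumed to exist already. One way to produce them is to sample a video at a fixed interval with OpenCV; the helper below is an illustrative sketch (not part of the original setup), with the sampling interval as a parameter.

import os
import cv2  # pip install opencv-python

def extract_frames(video_path, out_dir="output_frames", every_n_seconds=1.0):
    """Sample one frame every `every_n_seconds` and save it as frame_%04d.jpg."""
    os.makedirs(out_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
    step = max(1, int(round(fps * every_n_seconds)))
    index, saved = 0, 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if index % step == 0:
            cv2.imwrite(os.path.join(out_dir, f"frame_{saved:04d}.jpg"), frame)
            saved += 1
        index += 1
    cap.release()
    return saved

# e.g. extract_frames("test.mp4") -> output_frames/frame_0000.jpg, frame_0001.jpg, ...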
Video inference 2: video file
from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_path)

# Simplest form: pass the video file directly.
# messages = [
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "video",
#                 "video": "test.mp4",
#             },
#             {"type": "text", "text": "Describe this video."},
#         ],
#     }
# ]

# With sampling controls: max_pixels caps the resolution of each sampled frame,
# and fps sets how many frames per second are taken from the video.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "test.mp4",
                "max_pixels": 360 * 420,
                "fps": 1.0,
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=8192)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
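If you prefer a fixed frame budget regardless of clip length, recent qwen_vl_utils releases also accept an nframes key in place of fps. The snippet below is a sketch under that assumption; confirm the key is supported by your installed qwen_vl_utils version.

# Sketch: sample a fixed number of frames instead of a fixed rate.
# "nframes" is assumed to be supported by your qwen_vl_utils version.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "test.mp4",
                "max_pixels": 360 * 420,
                "nframes": 16,
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]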
Batch inference
from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_path)
# Decoder-only models must be left-padded for batched generation,
# otherwise new tokens would be appended after the padding.
processor.tokenizer.padding_side = 'left'

# Sample messages for batch inference
messages1 = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "output_frames/frame_0000.jpg"},
            {"type": "image", "image": "output_frames/frame_0001.jpg"},
            {"type": "text", "text": "What elements do these images have in common?"},
        ],
    }
]
messages2 = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who are you?"},
]

# Combine messages for batch processing
messages = [messages1, messages2]

# Preparation for batch inference
texts = [
    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    for msg in messages
]
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=texts,
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Batch inference
generated_ids = model.generate(**inputs, max_new_tokens=8192)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_texts = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_texts)
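The decoded replies come back in the same order as texts, so each one can be paired with its source conversation directly; a minimal usage sketch:

# Replies are ordered like `texts` / `messages`.
for i, reply in enumerate(output_texts):
    print(f"--- conversation {i} ---")
    print(reply)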
72B
from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"
model_path = "/media//Qwen/Qwen25-VL-72B-Instruct-AWQ/"

# 7B
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     model_path,
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# 72B (AWQ)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
    device_map="cuda",
)
model.config.use_cache = True
model = torch.compile(
    model,
    mode="max-autotune",
    fullgraph=True,
    dynamic=False,
)
processor = AutoProcessor.from_pretrained(model_path)

# Load the prompt text
def load_prompts():
    # Read the text file and collapse it into a single line
    with open('prompt.txt', 'r', encoding='utf-8') as file:
        lines = file.readlines()
    prompt = ''.join(lines).replace('\n', ' ').strip()
    return prompt

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": [
                    "output_frames/frame_0001.jpg",
                    "output_frames/frame_0000.jpg",
                    "output_frames/frame_0002.jpg",
                    "output_frames/frame_0003.jpg",
                ],
            },
            {"type": "text", "text": load_prompts()},
        ],
    }
]

text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=8192)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
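With mode="max-autotune" and fullgraph=True, the first generate call pays the torch.compile compilation cost, which can be substantial for a 72B model; later calls reuse the compiled graph. If you time this setup, a short warm-up pass (sketch below, reusing the inputs prepared above) keeps the compile time out of the measurement.

import time

# Warm-up: the first call after torch.compile triggers compilation and is slow.
_ = model.generate(**inputs, max_new_tokens=8)

start = time.time()
generated_ids = model.generate(**inputs, max_new_tokens=8192)
print(f"generation took {time.time() - start:.1f}s")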