Qwen 2.5 VL: Multiple Inference Approaches

flyfish

Single-Image Inference

from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)


processor = AutoProcessor.from_pretrained(model_path)


messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "output_frames/frame_0000.jpg",
            },
            {"type": "text", "text": "描述图像"},
        ],
    }
]


text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
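
A practical aside: every example in this post reads pre-extracted frames from output_frames/. If you need to regenerate them, the following is a minimal sketch (an assumption, not part of the original setup) that samples a few frames from a local video with OpenCV; the filename test.mp4 matches the video used later in this post, and the sampling stride is arbitrary.

import os

import cv2  # pip install opencv-python

os.makedirs("output_frames", exist_ok=True)
cap = cv2.VideoCapture("test.mp4")  # assumed local video, same file as in the later video example
frame_idx = 0
saved = 0
while saved < 4:
    ok, frame = cap.read()
    if not ok:
        break
    if frame_idx % 30 == 0:  # keep roughly one frame per second for a ~30 fps video
        cv2.imwrite(f"output_frames/frame_{saved:04d}.jpg", frame)
        saved += 1
    frame_idx += 1
cap.release()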

Multi-Image Inference

from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# Default: load the model onto whatever devices are available
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
# )

# Use a local model path instead
model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"
# Enabling flash_attention_2 is recommended for better speed and lower memory use, especially with multiple images and video.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# Default processor
processor = AutoProcessor.from_pretrained(model_path)

# By default the model accepts 4-16384 visual tokens per image.
# Set min_pixels and max_pixels as needed, e.g. a token range of 256-1280, to balance quality and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)


# A message containing multiple images and a text query
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "output_frames/frame_0000.jpg"},
            {"type": "image", "image": "output_frames/frame_0001.jpg"},
            {"type": "text", "text": "找出这些图片之间的相似之处。"},

        ],
    }
]

# Prepare inputs for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

Video Inference 1

from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)


processor = AutoProcessor.from_pretrained(model_path)


messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": [
                    "output_frames/frame_0000.jpg",
                    "output_frames/frame_0001.jpg",
                    "output_frames/frame_0002.jpg",
                    "output_frames/frame_0003.jpg",
                ],
            },
            {"type": "text", "text": "描述这个视频。"},
        ],
    }
]

text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

Video Inference 2

from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)


processor = AutoProcessor.from_pretrained(model_path)


# Minimal form: just point "video" at a local video file
# messages = [
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "video",
#                 "video": "test.mp4",
#             },
#             {"type": "text", "text": "Describe this video."},
#         ],
#     }
# ]


# Per-message controls: cap the sampled resolution with max_pixels and the sampling rate with fps
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "test.mp4",
                "max_pixels": 360 * 420,
                "fps": 1.0,
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]



text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=8192)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
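
In the snippet above, fps only tells qwen_vl_utils how densely to sample frames. Newer versions of qwen_vl_utils can additionally return per-video kwargs (including the effective fps) so the frame rate is also passed to the processor, which Qwen2.5-VL uses to align frames with absolute time. A sketch of the input-preparation step under that assumption (the rest of the pipeline is unchanged):

# Also fetch per-video kwargs (e.g. fps) from qwen_vl_utils, assuming a version
# that supports return_video_kwargs, and forward them to the processor.
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
    **video_kwargs,
)
inputs = inputs.to("cuda")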

Batch Inference

from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)


processor = AutoProcessor.from_pretrained(model_path)

# Left padding is required for batched generation with a decoder-only model,
# so every prompt ends right where the newly generated tokens begin.
processor.tokenizer.padding_side = 'left'

# Sample messages for batch inference
messages1 = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "output_frames/frame_0000.jpg"},
            {"type": "image", "image": "output_frames/frame_0001.jpg"},
            {"type": "text", "text": "这些图片中有哪些共同的元素?"},
        ],
    }
]
messages2 = [
    {"role": "system", "content": "你是一个能提供帮助的助手。"},
    {"role": "user", "content": "你是谁?"},
]
# Combine messages for batch processing
messages = [messages1, messages2]

# Preparation for batch inference
texts = [
    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    for msg in messages
]
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=texts,
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Batch Inference
generated_ids = model.generate(**inputs, max_new_tokens=8192)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_texts = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_texts)
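
Because padding_side is 'left', prompts are right-aligned in the batch and the trimmed outputs line up one-to-one with the input conversations. A small usage sketch for pairing them back up:

# Each decoded reply corresponds to the conversation at the same batch index.
for conv, reply in zip(messages, output_texts):
    print("conversation:", conv)
    print("reply:", reply)
    print("-" * 40)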

72B (AWQ)

from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# model_path = "/media/model/Qwen/Qwen25-VL-7B-Instruct/"

model_path = "/media//Qwen/Qwen25-VL-72B-Instruct-AWQ/"

# 7B
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     model_path,
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )
# 72B
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # AWQ-quantized checkpoints are typically run in float16
    attn_implementation="flash_attention_2",
    device_map="cuda",
)
model.config.use_cache = True
model = torch.compile(
    model,
    mode="max-autotune",
    fullgraph=True,
    dynamic=False,
)
processor = AutoProcessor.from_pretrained(model_path)


# Load the prompt text
def load_prompts():
    # Open the prompt file and read all of its lines
    with open('prompt.txt', 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Collapse the multi-line content into a single line
    prompt = ''.join(lines).replace('\n', ' ').strip()
    return prompt  # no trailing comma here, otherwise a 1-tuple is returned and breaks the text field below

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": [

                    "output_frames/frame_0001.jpg",
                    "output_frames/frame_0000.jpg",
                    "output_frames/frame_0002.jpg",
                    "output_frames/frame_0003.jpg",

                ],
            },
            {"type": "text", "text": load_prompts()},
        ],
    }
]

text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=8192)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
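
One practical note on torch.compile: the first generate call pays the compilation cost, which can take several minutes with mode="max-autotune". A short warm-up generation before timing or serving real requests avoids the surprising first-call latency; a sketch reusing the inputs prepared above:

# Warm-up: the first call after torch.compile triggers (slow) compilation,
# so run one tiny generation before measuring latency or serving traffic.
with torch.inference_mode():
    _ = model.generate(**inputs, max_new_tokens=1)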