利用gradio为InternVL构建UI界面

温柔哥`

于 2024-08-23 17:26:58 发布

阅读量216

点赞数 4

本文链接：https://blog.csdn.net/lemonzjk/article/details/141470887

版权

大模型专栏收录该内容

3 篇文章 0 订阅

订阅专栏

文章目录

工具类
单轮纯文本对话
单轮多图对话

我自己写了两个界面：一个是纯文本的，一个是支持多图片的。不过两者都是单轮对话，不保留历史对话信息。

工具类

首先，为它们写了一个公共的工具模块 internvl_utils（保存为 internvl_utils.py，后面两个脚本都通过 import internvl_utils 来引用它）

import numpy as np                                                  # 用于科学计算的库，提供多维数组对象和各种操作
import torch                                                        # 用于深度学习
import torchvision.transforms as T                                  # PyTorch的子模块，提供了一些常用的图像变换操作
from decord import VideoReader, cpu, gpu                            # 用于高效视频读取和处理的库
from PIL import Image                                               # Python Imaging Library，提供了一些图像处理功能
from torchvision.transforms.functional import InterpolationMode     # 图像插值模式，通常用于图像缩放等操作

IMAGENET_MEAN = (0.485, 0.456, 0.406)       # ImageNet数据集的三个通道（RGB）的均值
IMAGENET_STD = (0.229, 0.224, 0.225)        # ImageNet数据集的三个通道（RGB）的标准差

def build_transform(input_size):
    """Build the preprocessing pipeline applied to each image (or image tile).

    Args:
        input_size: Side length, in pixels, of the square the image is
            resized to.

    Returns:
        A ``T.Compose`` that, in order: converts the image to RGB when its
        mode is not already RGB, resizes it to ``(input_size, input_size)``
        with bicubic interpolation, converts it to a tensor, and normalizes
        it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Example:
        transform = build_transform(224)
        image = Image.open('path_to_image.jpg')   # load with PIL
        transformed_image = transform(image)      # run the pipeline
    """
    def _ensure_rgb(img):
        # Guarantee three channels so Normalize's 3-element mean/std apply.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the candidate grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: Width divided by height of the source image.
        target_ratios: Iterable of ``(columns, rows)`` candidates, scanned
            in the order given.
        width: Source image width (used only for the tie-break).
        height: Source image height (used only for the tie-break).
        image_size: Side length of one tile.

    Returns:
        The chosen ``(columns, rows)`` candidate, or ``(1, 1)`` when
        ``target_ratios`` is empty.

    A strictly closer candidate always wins. When a candidate is exactly as
    close as the current best, it replaces the best only if the source
    image's area exceeds half the area of that candidate's full grid
    (``0.5 * image_size**2 * columns * rows``).
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
        elif gap == smallest_gap and image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            # Tie: prefer this grid only when the image is large enough.
            chosen = candidate
    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image to the best-fitting tile grid and cut it into tiles.

    Args:
        image: PIL image to process.
        min_num: Minimum number of tiles (columns * rows) a grid may have.
        max_num: Maximum number of tiles (columns * rows) a grid may have.
        image_size: Side length of each square tile.
        use_thumbnail: When true and more than one tile was produced, append
            a copy of the whole original image resized to one tile.

    Returns:
        List of PIL images: the tiles in row-major order, optionally followed
        by the thumbnail.
    """
    src_width, src_height = image.size
    src_ratio = src_width / src_height

    # Enumerate every (columns, rows) grid whose tile count lies in
    # [min_num, max_num]; the set removes duplicates produced by different n.
    # The enumeration order is kept as-is because it determines the set's
    # iteration order, which the stable sort below preserves among
    # equal-area grids.
    grids = set(
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if min_num <= cols * rows <= max_num)
    grids = sorted(grids, key=lambda grid: grid[0] * grid[1])   # fewest tiles first

    # Grid whose aspect ratio best matches the source image.
    best_grid = find_closest_aspect_ratio(
        src_ratio, grids, src_width, src_height, image_size)

    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    blocks = best_grid[0] * best_grid[1]

    resized = image.resize((target_width, target_height))

    # Walk the grid row by row; each box is (left, upper, right, lower).
    tiles_per_row = target_width // image_size
    processed_images = []
    for index in range(blocks):
        col = index % tiles_per_row
        row = index // tiles_per_row
        processed_images.append(resized.crop((
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )))
    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        # The thumbnail is the original image squeezed into one tile, not a
        # mosaic of the tiles above.
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    """Load an image file and turn it into a stacked tensor of tiles.

    Args:
        image_file: Path (or whatever ``Image.open`` accepts) of the image.
        input_size: Side length of each tile.
        max_num: Maximum number of tiles passed to ``dynamic_preprocess``.

    Returns:
        The result of ``torch.stack`` over the transformed tiles, so the
        first dimension is the number of tiles (including the thumbnail that
        ``dynamic_preprocess`` appends when there is more than one tile).
    """
    rgb_image = Image.open(image_file).convert('RGB')
    to_tensor = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(
        rgb_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([to_tensor(tile) for tile in tiles])

import math

# ---------------------------------------------------------------------------------------------------
def split_model(model_name, world_size=None):
    """Build a ``device_map`` that spreads an InternVL2 model over several GPUs.

    The language-model layers are dealt out to the GPUs in order. GPU 0 also
    hosts the vision model, so it is counted as half a GPU and receives about
    half as many layers as the others.

    Args:
        model_name: One of the supported InternVL2 names (see the table
            below); it selects the number of language-model layers.
        world_size: Number of GPUs to spread over. Defaults to
            ``torch.cuda.device_count()``.

    Returns:
        Dict mapping module names to GPU indices. The vision model, the
        projector (``mlp1``), the embeddings, the final norm, the output
        heads and the last language-model layer are all pinned to GPU 0.

    Raises:
        KeyError: ``model_name`` is not in the table.
        RuntimeError: fewer than one GPU is available/requested.
    """
    num_layers = {
        'InternVL2-1B': 24, 'InternVL2-2B': 24, 'InternVL2-4B': 32, 'InternVL2-8B': 32,
        'InternVL2-26B': 48, 'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]

    if world_size is None:
        world_size = torch.cuda.device_count()
    if world_size < 1:
        # Previously this surfaced as an opaque IndexError further down.
        raise RuntimeError(
            'split_model needs at least one visible CUDA device, got %d' % world_size)

    # Since the first GPU will be used for ViT, treat it as half a GPU.
    per_gpu = math.ceil(num_layers / (world_size - 0.5))
    layers_per_gpu = [per_gpu] * world_size      # list repetition: one quota per GPU
    layers_per_gpu[0] = math.ceil(per_gpu * 0.5)

    # Deal the layers out GPU by GPU. Because of the ceil() rounding the
    # quotas can add up to more than num_layers, so stop at the last real
    # layer instead of emitting entries for layers that do not exist.
    device_map = {}
    layer_cnt = 0
    for gpu_index, quota in enumerate(layers_per_gpu):
        for _ in range(min(quota, num_layers - layer_cnt)):
            device_map[f'language_model.model.layers.{layer_cnt}'] = gpu_index
            layer_cnt += 1

    # Modules that must live on the first GPU. Both embedding/head spellings
    # are listed; presumably different language backbones use different
    # attribute names.
    device_map['vision_model'] = 0
    device_map['mlp1'] = 0
    device_map['language_model.model.tok_embeddings'] = 0
    device_map['language_model.model.embed_tokens'] = 0
    device_map['language_model.output'] = 0
    device_map['language_model.model.norm'] = 0
    device_map['language_model.lm_head'] = 0
    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0

    return device_map

def invert_imgs_to_pixel_values_and_num_patches_list(img_paths):
    """Load several images into one pixel tensor plus per-image tile counts.

    Args:
        img_paths: Iterable of image file paths.

    Returns:
        ``(pixel_values, num_patches_list)``: the tiles of all images
        concatenated along dim 0, cast to bfloat16 and moved to CUDA, and a
        list with the number of tiles each image contributed, in input order.

    Raises:
        ValueError: ``img_paths`` yields no paths.
    """
    # Load and preprocess each image exactly once; the tile count is read
    # from the same tensor instead of running load_image a second time.
    tiles_per_image = [load_image(img_path, max_num=12) for img_path in img_paths]
    if not tiles_per_image:
        raise ValueError('img_paths is empty: at least one image is required')

    num_patches_list = [tiles.size(0) for tiles in tiles_per_image]
    pixel_values = torch.cat(tiles_per_image, dim=0).to(torch.bfloat16).cuda()

    return pixel_values, num_patches_list


def create_imgs_list(img_paths):
    """Build the prompt prefix that introduces each image by number.

    Args:
        img_paths: Sized collection of image paths; only its length is used.

    Returns:
        One ``Image-<k>: <image>`` line per image (k starting at 1), joined
        with newlines and terminated by a final newline.
    """
    total = len(img_paths)
    tags = [f'Image-{number}: <image>' for number in range(1, total + 1)]
    prefix = '\n'.join(tags)
    return prefix + '\n'

单轮纯文本对话

这里需要将path 改为自己实际的模型路径

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"

# Import packages
import google.generativeai as genai
from typing import List, Tuple
import gradio as gr
import json

import numpy as np                                                  # 用于科学计算的库，提供多维数组对象和各种操作
import torch                                                        # 用于深度学习
import torchvision.transforms as T                                  # PyTorch的子模块，提供了一些常用的图像变换操作
from decord import VideoReader, cpu, gpu                            # 用于高效视频读取和处理的库
from PIL import Image                                               # Python Imaging Library，提供了一些图像处理功能
from torchvision.transforms.functional import InterpolationMode     # 图像插值模式，通常用于图像缩放等操作
from transformers import AutoModel, AutoTokenizer                   # Hugging Face的Transformers库，用于处理预训练的自然语言处理模型
import internvl_utils


# Local checkpoint directory -- change this to your own model path.
path = '/data1/zjk/InternVL-2-1B/pretrained'
# Module-to-GPU placement computed by the shared helper for this model size.
device_map = internvl_utils.split_model('InternVL2-1B')
model = AutoModel.from_pretrained(
    path,
    device_map=device_map,          # place each module on the GPU chosen above
    trust_remote_code=True,         # the checkpoint ships its own modelling code
    low_cpu_mem_usage=True,         # reduce host-memory use while loading
    torch_dtype=torch.bfloat16,     # load the weights as bfloat16
)
# Switch to evaluation mode for inference.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False, trust_remote_code=True)
generation_config = {'max_new_tokens': 1024, 'do_sample': False}



# Clear the conversation.
def reset() -> List:
    """Return a fresh empty history list."""
    return list()

# Call the model to generate a reply.
def interact(question: str, temp = 1.0) -> List[Tuple[str, str]]:
    """Run one single-turn, text-only chat and return it as chat history.

    Args:
        question: The user's input.
        temp: Sampling temperature. Higher values give more varied
            ("creative") replies. A value of 0 (or None) selects
            deterministic greedy decoding.

    Returns:
        A one-element list ``[(question, response)]``.
    """
    # Temperature only influences generation when sampling is enabled, so
    # sample whenever a positive temperature is requested and fall back to
    # greedy decoding otherwise (a temperature of 0 is not a valid sampling
    # temperature for Hugging Face generation).
    sampling = temp is not None and temp > 0
    generation_config = dict(max_new_tokens=1024, do_sample=sampling)
    if sampling:
        generation_config['temperature'] = temp

    prompt = f"{question}"
    # pixel_values=None: no image is supplied; history=None: no earlier turns.
    response = model.chat(tokenizer, None, prompt, generation_config, history=None, return_history=False)
    print("response=\n"+response)

    return [(prompt, response)]


# This part constructs the Gradio UI interface
# gr.Blocks() is Gradio's layout container; every component created inside
# the `with` body below belongs to `demo`.
with gr.Blocks() as demo:
    gr.Markdown("# 单轮纯文本对话聊天 \n")  # page title
    chatbot = gr.Chatbot()  # chat window that displays what interact returns
    input_textbox = gr.Textbox(label="Question", interactive = True, value = "你好") # editable question box with a default prompt
    with gr.Column():   # vertical layout
        gr.Markdown("#  Temperature超参数\n 值越大回复越有创造力")
        temperature_slider = gr.Slider(0.0, 1.0, 0.7, step = 0.1, label="Temperature")  # slider from 0.0 to 1.0 in steps of 0.1, initial value 0.7
    with gr.Row():  # horizontal layout
        sent_button = gr.Button(value="Send")   # triggers generation
        reset_button = gr.Button(value="Reset") # clears the chat window
    
    # Send: pass the values of input_textbox and temperature_slider to
    # interact (as question and temp) and show its return value in chatbot.
    sent_button.click(interact, inputs=[input_textbox, temperature_slider], outputs=[chatbot])
    # Reset: replace the chatbot content with the empty list reset returns.
    reset_button.click(reset, outputs=[chatbot])

# Start the Gradio app with debug mode switched on.
demo.launch(debug = True)

单轮多图对话

这里需要将path 改为自己实际的模型路径，还有就是图片不是在界面上传的，而是读取文件夹的，所以你需要将imgs_root_path改为自己的图片文件夹根路径

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6,7"

# Import packages
import google.generativeai as genai
from typing import List, Tuple
import gradio as gr
import json
from gradio import Image as GImage


import numpy as np                                                  # 用于科学计算的库，提供多维数组对象和各种操作
import torch                                                        # 用于深度学习
import torchvision.transforms as T                                  # PyTorch的子模块，提供了一些常用的图像变换操作
from decord import VideoReader, cpu, gpu                            # 用于高效视频读取和处理的库
from PIL import Image                                               # Python Imaging Library，提供了一些图像处理功能
from torchvision.transforms.functional import InterpolationMode     # 图像插值模式，通常用于图像缩放等操作
from transformers import AutoModel, AutoTokenizer                   # Hugging Face的Transformers库，用于处理预训练的自然语言处理模型
import internvl_utils


# Local checkpoint directory -- change this to your own model path.
path = '/data1/zjk/InternVL-2-26B'
# Module-to-GPU placement computed by the shared helper for this model size.
device_map = internvl_utils.split_model('InternVL2-26B')
model = AutoModel.from_pretrained(
    path,
    device_map=device_map,          # place each module on the GPU chosen above
    trust_remote_code=True,         # the checkpoint ships its own modelling code
    low_cpu_mem_usage=True,         # reduce host-memory use while loading
    torch_dtype=torch.bfloat16,     # load the weights as bfloat16
)
# Switch to evaluation mode for inference.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False, trust_remote_code=True)
generation_config = {'max_new_tokens': 1024, 'do_sample': True}


# Clear the conversation.
def reset() -> List:
    """Return a fresh empty history list."""
    return list()

def process_imgs(imgs_root_path):
    """Load every image in a folder and prepare the model inputs for them.

    Args:
        imgs_root_path: Directory whose image files are to be loaded. Files
            are recognised by extension (.png, .jpg, .jpeg, .bmp, .tiff),
            ignoring case.

    Returns:
        ``(pixel_values, num_patches_list, imgs_list)`` where the first two
        come from
        ``internvl_utils.invert_imgs_to_pixel_values_and_num_patches_list``
        and ``imgs_list`` is the ``Image-k: <image>`` prompt prefix from
        ``internvl_utils.create_imgs_list``.

    Raises:
        FileNotFoundError: the directory contains no image files.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    # Compare the lower-cased name so that files such as "IMG_01.JPG" are
    # not silently skipped.
    # NOTE: the sort is a plain string sort on the full path, so "10.jpg"
    # comes before "2.jpg"; zero-pad file names if numeric order matters.
    img_paths_sorted = sorted(
        os.path.join(imgs_root_path, name)
        for name in os.listdir(imgs_root_path)
        if name.lower().endswith(image_suffixes)
    )
    if not img_paths_sorted:
        # Fail here with a clear message rather than deeper in tensor code.
        raise FileNotFoundError(f'no image files found in {imgs_root_path!r}')

    pixel_values, num_patches_list = internvl_utils.invert_imgs_to_pixel_values_and_num_patches_list(img_paths_sorted)
    imgs_list = internvl_utils.create_imgs_list(img_paths_sorted)

    return pixel_values, num_patches_list, imgs_list

# Call the model to generate a reply.
def interact(question: str, temp = 1.0,
             imgs_root_path: str = '/data1/zjk/lihongyi/3/imgs/1') -> List[Tuple[str, str]]:
    """Run one single-turn chat about all images in a folder.

    Args:
        question: The user's input.
        temp: Sampling temperature. Higher values give more varied
            ("creative") replies. A value of 0 (or None) selects
            deterministic greedy decoding.
        imgs_root_path: Folder whose images are sent with the question.
            The images are re-read from disk on every call.

    Returns:
        A one-element list ``[(question, response)]``; the image tags that
        are prepended to the prompt are not included in the returned
        question.
    """
    # The UI slider can deliver 0.0, which is not a valid sampling
    # temperature for Hugging Face generation; treat it as a request for
    # greedy decoding instead of passing it through.
    sampling = temp is not None and temp > 0
    generation_config = dict(max_new_tokens=1024, do_sample=sampling)
    if sampling:
        generation_config['temperature'] = temp

    pixel_values, num_patches_list, imgs_list = process_imgs(imgs_root_path)
    # Prefix the question with one "Image-k: <image>" line per image.
    prompt = imgs_list + f"{question}"

    response = model.chat(tokenizer, pixel_values, prompt, generation_config,
                          num_patches_list=num_patches_list,
                          history=None, return_history=False)

    print(f"question:{prompt}\nresponse:\n{response}")

    return [(question, response)]


# This part constructs the Gradio UI interface
# gr.Blocks() is Gradio's layout container; every component created inside
# the `with` body below belongs to `demo`.
with gr.Blocks() as demo:
    gr.Markdown("# 单轮文本-图像对话聊天 \n")  # page title
    chatbot = gr.Chatbot()  # chat window that displays what interact returns
    input_textbox = gr.Textbox(label="Question", interactive = True, value = "请用一句话描述图片中的内容") # editable question box with a default prompt
    with gr.Column():   # vertical layout
        gr.Markdown("#  Temperature超参数\n 值越大回复越有创造力")
        temperature_slider = gr.Slider(0.0, 1.0, 0.7, step = 0.1, label="Temperature")  # slider from 0.0 to 1.0 in steps of 0.1, initial value 0.7
    with gr.Row():  # horizontal layout
        sent_button = gr.Button(value="Send")   # triggers generation
        reset_button = gr.Button(value="Reset") # clears the chat window
        
    # Send: pass the values of input_textbox and temperature_slider to
    # interact (as question and temp) and show its return value in chatbot.
    # Only these two values are passed; no image data comes from the UI.
    sent_button.click(interact, inputs=[input_textbox, temperature_slider], outputs=[chatbot])
    # Reset: replace the chatbot content with the empty list reset returns.
    reset_button.click(reset, outputs=[chatbot])
    

# Start the Gradio app with debug mode switched on.
demo.launch(debug = True)