MINICPM-V2_6图像预处理流程-代码解读

红酒暖心也暖胃

已于 2024-09-11 11:24:51 修改

阅读量635

点赞数 12

分类专栏： AIGC 文章标签： python 人工智能 AIGC

于 2024-09-10 16:57:16 首次发布

本文链接：https://blog.csdn.net/zpp13hao1/article/details/142101990

版权

AIGC 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

目的

MINICPM-V2_6 huggingface github
因为我对大模型做多模态的数据处理流程不熟悉，所以从MINICPM-V2_6入手，了解如何从图像得到ID化的过程

基础代码

import torch
from PIL import Image
# NOTE: the original import list ended in a truncated name (`Auto`), which
# raises ImportError; `AutoProcessor` is the class the article refers to.
from transformers import AutoModel, AutoProcessor, AutoTokenizer

# Load MiniCPM-V 2.6 through its remote modeling code.
model = AutoModel.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True,
    attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
# The first run downloads the weights, which takes a long time.
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True)

# Single image, single-turn conversation only; the multi-image / multi-turn
# case is handled analogously.
image = Image.open('bicycle.png').convert('RGB')
question = 'What is in the image?'
msgs = [{'role': 'user', 'content': [image, question]}]

res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
# Next: inspect the `inputs` produced by the processor.

inputs中包含input_ids、attention_mask、pixel_values、image_sizes、image_bound、tgt_sizes
其中我能看明白的是image_sizes，对应的是原始图片尺寸，并不是14的倍数（解决）
input_ids可以明白是id化后的token，但是怎么得到的并不知道**（存疑）**
attention_mask都为True，且长度和input_ids是一样的，这是可以后期生成的（解决）
pixel_values、image_bound、tgt_sizes这三个看着莫名其妙的，确实不知道该从哪下手看了**（重点看）**

翻看 modeling_minicpmv.py 可以看到 processor 是 transformers 封装好的包，我没找到它的源码（泪目，有知道位置的，请告知），但是我翻到了 MiniCPM-Llama3-V-2_5 下的源码，看到了图片处理的流程，接下来一起看看吧

图片预处理的来源

图片切片的方法是 LLaVA-UHD 这篇论文提出的

这里主要讲的是左半部分的切片方法

会将input image按照标准切片区域得到理想的切片数量（N=6.5），为了让切片的分法可以多种多样（比如当切片数量N=7时，便只有1-7和7-1这两种分法，而这两种分法可能不是最优的），所以论文中提出要考虑[N-1,N,N+1]这几个切片数量对应的切片分法。

定义每种分法对应的分数计算公式
$S = \left|\log\left(\frac{n}{m}\right) - \log\left(\frac{W}{H}\right)\right|$
最后取S最小对应的那种分法

代码

基本变量

# 一些基本变量
import math
from torchvision import transforms
# Module-level settings shared by the preprocessing helpers below.
image_feature_size = 64  # number of placeholder (<unk>) tokens emitted per image or slice
max_slice_nums = 9  # upper bound on how many slices one image may be split into
scale_resolution = 448  # reference side length of one slice; 448 / 14 = 32 patches per side
patch_size = 14  # side length of one patch, in pixels

IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)  # timm.data.IMAGENET_INCEPTION_MEAN
IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)  # timm.data.IMAGENET_INCEPTION_STD
# Standard image preprocessing: PIL image -> float tensor, then per-channel
# normalization with the mean / std above.
transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD
                ),
            ]
        )

函数定义

def get_slice_image_placeholder(image, tokenizer):
    """Slice one image and build the placeholder string that stands for it.

    Slicing uses the module-level ``max_slice_nums``, ``scale_resolution`` and
    ``patch_size``. The resized overview image and every slice are each
    represented by ``image_feature_size`` copies of ``tokenizer.unk_token``
    wrapped in start/end markers. The ``<image_id>...</image_id>`` prefix is
    NOT added here.

    Args:
        image: A single image, as accepted by ``slice_image``.
        tokenizer: Tokenizer exposing ``im_start``, ``im_end``, ``unk_token``
            plus the attributes read by ``get_grid_placeholder``.

    Returns:
        A ``(slice_images, final_placeholder)`` tuple. ``slice_images`` is a
        flat list holding the resized overview image first, followed by the
        slices in row-major order. ``final_placeholder`` is the overview
        placeholder, followed by the grid placeholder when slices exist
        (rows of slices are separated by a newline).

    Example (values reported in the article for a 667x500 input):
        slice_images      -> sizes 518x392, 364x546, 364x546
        final_placeholder -> <image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>
    """
    # Placeholder for the overview image: <image> + <unk> * 64 + </image>.
    final_placeholder = (
        tokenizer.im_start
        + tokenizer.unk_token * image_feature_size
        + tokenizer.im_end
    )

    overview_image, patch_rows, grid = slice_image(
        image,
        max_slice_nums,
        scale_resolution,
        patch_size,
    )

    # Overview first, then every slice flattened row by row.
    slice_images = [overview_image]
    slice_images.extend(patch for row in patch_rows for patch in row)

    if patch_rows:
        # Append one <slice>...</slice> group per slice, laid out like the grid.
        final_placeholder += get_grid_placeholder(
            tokenizer, grid, image_feature_size
        )
    return slice_images, final_placeholder

def slice_image(image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False):
    """Resize an image and, if it is large enough, cut it into a grid of slices.

    Args:
        image: Image object exposing ``size``, ``resize`` and ``copy``.
        max_slice_nums: Maximum number of slices allowed.
        scale_resolution: Reference side length of one slice.
        patch_size: Every produced width/height is a multiple of this.
        never_split: When true, only the resized image is produced.

    Returns:
        ``(source_image, patches, best_grid)``.
        ``source_image`` is the whole image resized to patch-aligned
        dimensions. ``patches`` is a list of rows of slices (empty when the
        image is not split). ``best_grid`` is ``[columns, rows]`` of the chosen
        split, or ``None`` when the image is not split.

    Example (values reported in the article for a 667x500 input):
        source_image size 518x392, patches [[364x546, 364x546]], best_grid [2, 1]
    """
    original_size = image.size
    img_w, img_h = original_size
    log_ratio = math.log(img_w / img_h)
    # How many scale_resolution x scale_resolution tiles the image area covers.
    area_ratio = img_w * img_h / (scale_resolution * scale_resolution)
    multiple = min(math.ceil(area_ratio), max_slice_nums)

    if multiple <= 1 or never_split:
        # No slicing needed: just rescale (upscaling allowed) the whole image.
        whole_size = find_best_resize(
            original_size, scale_resolution, patch_size, allow_upscale=True
        )
        return image.resize(whole_size, Image.Resampling.BICUBIC), [], None

    # Consider one slice fewer / more than the ideal count, skipping the
    # "no split" case and anything above the allowed maximum.
    slice_counts = [
        count
        for count in (multiple - 1, multiple, multiple + 1)
        if count != 1 and count <= max_slice_nums
    ]

    # Overview image: down-sampled and aligned to patch_size.
    overview_size = find_best_resize(original_size, scale_resolution, patch_size)
    source_image = image.copy().resize(overview_size, Image.Resampling.BICUBIC)

    # Every factorisation of every candidate count, e.g. 6 -> 1x6, 2x3, 3x2, 6x1.
    candidate_grids = [
        [cols, count // cols]
        for count in slice_counts
        for cols in range(1, count + 1)
        if count % cols == 0
    ]

    # Keep the grid whose columns/rows ratio is closest (in log space) to the
    # image's width/height ratio; the first best candidate wins ties.
    best_grid = [1, 1]
    smallest_gap = float("inf")
    for grid in candidate_grids:
        gap = abs(log_ratio - math.log(grid[0] / grid[1]))
        if gap < smallest_gap:
            best_grid, smallest_gap = grid, gap

    # Resize so the image divides evenly into the chosen grid, then cut it.
    refine_size = get_refine_size(
        original_size, best_grid, scale_resolution, patch_size, allow_upscale=True
    )
    refine_image = image.resize(refine_size, Image.Resampling.BICUBIC)
    patches = split_to_patches(refine_image, best_grid)
    return source_image, patches, best_grid

def get_refine_size(original_size, grid, scale_resolution, patch_size, allow_upscale=False):
    """Compute the size an image must be resized to before cutting it by ``grid``.

    Args:
        original_size: ``(width, height)`` of the original image.
        grid: ``[grid_x, grid_y]``, the number of cells along width and height.
        scale_resolution: Reference side length of one cell.
        patch_size: Each cell's width/height is made a multiple of this.
        allow_upscale: Forwarded to ``find_best_resize`` for the single cell.

    Returns:
        ``(width, height)`` equal to the best single-cell size multiplied by
        the grid dimensions.

    Example (values reported in the article):
        get_refine_size((667, 500), [2, 1], 448, 14, allow_upscale=True)
        # -> (728, 546)
    """
    width, height = original_size
    grid_x, grid_y = grid

    # Round each dimension to a multiple of the cell count, e.g. 667 -> 668.
    refine_width = ensure_divide(width, grid_x)
    refine_height = ensure_divide(height, grid_y)

    # Size of a single cell before patch alignment, e.g. 334 x 500.
    grid_width = refine_width / grid_x
    grid_height = refine_height / grid_y

    # Patch-aligned size of a single cell, e.g. 364 x 546.
    best_grid_size = find_best_resize(
        (grid_width, grid_height),
        scale_resolution,
        patch_size,
        allow_upscale=allow_upscale,
    )
    return (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y)

def ensure_divide(length, patch_size):
    """Snap ``length`` to the nearest multiple of ``patch_size``.

    The result is never smaller than one ``patch_size``. Rounding uses the
    built-in ``round``.

    Args:
        length: Length to align.
        patch_size: Step the result must be a multiple of.

    Returns:
        The aligned length.

    Example:
        ensure_divide(516, 14)  # -> 518
    """
    nearest_multiple = round(length / patch_size) * patch_size
    return max(nearest_multiple, patch_size)


def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False):
    """Pick a patch-aligned size for an image, keeping its aspect ratio.

    Args:
        original_size: ``(width, height)`` of the image.
        scale_resolution: Side length of the reference square whose area the
            rescaled image should approximate.
        patch_size: Both returned dimensions are multiples of this.
        allow_upscale: When true, the image is rescaled even if its area is
            not larger than ``scale_resolution ** 2`` (i.e. it may grow).

    Returns:
        ``(best_width, best_height)``.

    Examples:
        find_best_resize((667, 500), scale_resolution=448, patch_size=14, allow_upscale=False)
        # -> (518, 392)   image larger than the reference area, so it shrinks
        find_best_resize((334, 500), scale_resolution=448, patch_size=14, allow_upscale=True)
        # -> (364, 546)   smaller image, enlarged because allow_upscale=True
    """
    width, height = original_size
    too_large = width * height > scale_resolution * scale_resolution
    if too_large or allow_upscale:
        # Rescale so that width / height stays (almost) equal to the original
        # ratio r while width * height is close to scale_resolution ** 2:
        #   height = scale_resolution / sqrt(r)
        #   width  = height * r
        # e.g. 667x500: r = 1.334 -> height 387, width 516, whose area 199692
        # is close to 448 * 448 = 200704.
        aspect = width / height
        height = int(scale_resolution / math.sqrt(aspect))
        width = int(height * aspect)
    # Align both sides to a multiple of patch_size (e.g. 516 -> 518, 387 -> 392).
    return (ensure_divide(width, patch_size), ensure_divide(height, patch_size))


def split_to_patches(image, grid):
    """Cut an image into ``grid[0]`` columns by ``grid[1]`` rows of equal cells.

    The result always contains exactly ``grid[1]`` rows of ``grid[0]`` patches.
    If the image size is not an exact multiple of the grid, the leftover
    pixels on the right/bottom edge are dropped instead of producing an extra
    partial row or column.

    Args:
        image: Image object exposing ``size`` (``(width, height)``) and
            ``crop(box)`` with ``box = (left, upper, right, lower)``.
        grid: ``[columns, rows]``.

    Returns:
        A list of rows, each row being a list of cropped patches ordered left
        to right.

    Raises:
        ValueError: If the image is smaller than the grid, so that a cell
            would have zero width or height.

    Example:
        A 728x546 image with grid [2, 1] yields [[364x546, 364x546]].
    """
    width, height = image.size
    cols, rows = grid[0], grid[1]
    cell_w = width // cols  # e.g. 728 // 2 = 364
    cell_h = height // rows  # e.g. 546 // 1 = 546
    if cell_w == 0 or cell_h == 0:
        raise ValueError(
            f"image of size {width}x{height} is too small for grid {cols}x{rows}"
        )
    return [
        [
            image.crop(
                (col * cell_w, row * cell_h, (col + 1) * cell_w, (row + 1) * cell_h)
            )
            for col in range(cols)
        ]
        for row in range(rows)
    ]

def get_grid_placeholder(tokenizer, grid, query_num):
    """Build the placeholder string for a grid of image slices.

    Every slice becomes ``slice_start + unk_token * query_num + slice_end``.
    Slices of one row are concatenated directly; rows are joined with a
    newline.

    Args:
        tokenizer: Object exposing ``slice_start``, ``slice_end`` and
            ``unk_token`` strings.
        grid: ``[columns, rows]`` of the slice layout.
        query_num: Number of ``unk_token`` repetitions per slice.

    Returns:
        The placeholder string for the whole grid.

    Example (grid [2, 2], query_num 64):
        '<slice><unk>*64</slice><slice><unk>*64</slice>\\n<slice><unk>*64</slice><slice><unk>*64</slice>'
    """
    columns, row_count = grid[0], grid[1]
    one_slice = tokenizer.slice_start + tokenizer.unk_token * query_num + tokenizer.slice_end
    one_row = one_slice * columns
    return "\n".join(one_row for _ in range(row_count))

def reshape_by_patch(image_tensor, patch_size):
    """Rearrange an image tensor so that patches are laid out along the last axis.

    The image is cut into non-overlapping ``patch_size x patch_size`` patches
    (taken in row-major order); in the result, row ``i`` of every patch is
    placed side by side along the last dimension.

    :param image_tensor: tensor of shape [3, H, W]
    :param patch_size: side length of one square patch
    :return: tensor of shape [3, patch_size, H * W / patch_size]

    Example (from the article): a [3, 392, 518] tensor with patch_size 14
    becomes [3, 14, 14504].
    """
    channels = image_tensor.size(0)
    window = (patch_size, patch_size)
    # [channels * patch_size * patch_size, num_patches]
    columns = torch.nn.functional.unfold(image_tensor, window, stride=window)
    # [channels, patch_size, patch_size, num_patches]
    per_patch = columns.reshape(channels, patch_size, patch_size, -1)
    # Move the patch index in front of the in-patch column index, then merge
    # the two: [channels, patch_size, num_patches * patch_size].
    reordered = per_patch.permute(0, 1, 3, 2)
    return reordered.reshape(channels, patch_size, -1)

函数调用

slice_images, image_placeholder = get_slice_image_placeholder(image, tokenizer)
# slice_images holds the resized overview image followed by the slices;
# image_placeholder is the matching placeholder string. Values reported in the
# article for bicycle.png:
# slice_images      -> sizes 518x392, 364x546, 364x546
# image_placeholder -> '<image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>'

images = []  # overview + slices, each reshaped to [3, patch_size, H * W / patch_size]
cur_msgs = []
tgt_sizes = []  # per image: (H / patch_size, W / patch_size)
cur_msgs.append(image_placeholder)
# NOTE: the loop variable must not be called `slice_image`; at module level
# that would overwrite the slice_image() function and break any later call to
# get_slice_image_placeholder().
for slice_img in slice_images:
    slice_tensor = transform(slice_img)  # [3, H, W]
    H, W = slice_tensor.shape[1:]
    images.append(reshape_by_patch(slice_tensor, patch_size))
    tgt_sizes.append(torch.tensor([H // patch_size, W // patch_size], dtype=torch.int32))

question = 'What is in the image?'
cur_msgs.append(question)
content = '\n'.join(cur_msgs)  # image placeholder and text are separated by a newline
copy_msgs = [{'role': 'user', 'content': '<image_id>0</image_id>' + content}]
# Token ids covering both the image placeholders and the text.
input_ids = tokenizer.apply_chat_template(copy_msgs, tokenize=True, add_generation_prompt=False)


if tgt_sizes:
    tgt_sizes = torch.vstack(tgt_sizes)  # shape [n, 2]

到此为止就找到了pixel_values（images）、tgt_sizes（✌️）

input_ids是通过将图片+问题进行token ID化，并且需要pad（pad需要指定左、右）
MINICPM-V2_6 里面的 AutoProcessor 处理完的 input_ids 和这里稍微有些不一样

# 本代码
<image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>\nWhat is in the image?
# MINICPM-V2_6
<image_id>0</image_id><image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>\nWhat is in the image? + <|im_start|>assistant\n

缺少了每张图片的编号(<image_id>0</image_id>)和后面需要模型输出的角色引导，本文不考虑这些

这里再额外的补充几句，真实代码中处理的时候每张原始图片之前都会加上编号（<image_id>0</image_id>），切块图片行与行之间会加上\n，每张图片和每张图片之间会加上\n，图片和字符串之间会加上\n。看一个真实的结果（两张图片，每张图片都是2*2分块）

<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n
<|im_start|>user\n
<image_id>0</image_id><image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>\n
<slice><unk>*64</slice><slice><unk>*64</slice>\n
<image_id>1</image_id><image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>\n
<slice><unk>*64</slice><slice><unk>*64</slice>\n
What is in the image?<|im_end|>\n
<|im_start|>assistant\n

最后，还剩 image_bound 没有找到，下面来看它是怎么得到的

from typing import List, Optional
def convert_to_tensors(tokenizer, input_ids, max_inp_length: Optional[int] = None):
    """Turn a list of token ids into a tensor and locate the image spans in it.

    Args:
        tokenizer: Object exposing the integer attributes ``im_start_id`` and
            ``im_end_id``.
        input_ids: Sequence of token ids.
        max_inp_length: If given, ``input_ids`` is truncated to this many
            tokens before anything else happens.

    Returns:
        A dict with two entries:
        ``"input_ids"``: int32 tensor of shape ``[1, seq_len]``.
        ``"image_bound"``: tensor of shape ``[n, 2]``; each row is
        ``(index right after an im_start token, index of the matching im_end
        token)``, paired in order of appearance.

    Note:
        Only as many spans as there are complete start/end pairs are returned.
        Truncation can cut off a trailing end token; pairing with ``min``
        drops that incomplete span instead of failing on mismatched lengths.
    """
    if max_inp_length is not None:
        input_ids = input_ids[:max_inp_length]
    input_ids = torch.tensor(input_ids, dtype=torch.int32)

    # Placeholder tokens begin one position after each im_start token.
    image_start_tokens = torch.where(input_ids == tokenizer.im_start_id)[0] + 1
    image_end_tokens = torch.where(input_ids == tokenizer.im_end_id)[0]

    # Number of complete (start, end) pairs.
    valid_image_nums = min(len(image_start_tokens), len(image_end_tokens))
    image_bound = torch.hstack(
        [
            image_start_tokens[:valid_image_nums].unsqueeze(-1),
            image_end_tokens[:valid_image_nums].unsqueeze(-1),
        ]
    )

    return {
        "input_ids": input_ids.unsqueeze(0),
        "image_bound": image_bound,
    }


# Truncate the token ids to at most 250 tokens and locate the image spans.
max_inp_length = 250
model_inputs = convert_to_tensors(tokenizer, input_ids, max_inp_length)  # dict with keys 'input_ids' and 'image_bound'