目的
MINICPM-V2_6 huggingface github
因为我对大模型做多模态的数据处理流程不熟悉,所以从MINICPM-V2_6入手,了解如何从图像得到ID化的过程
基础代码
import torch
from PIL import Image
# NOTE: the import list originally ended in a bare `Auto`, which is not a name
# exported by transformers and made this line raise ImportError. It is completed
# to AutoProcessor here, the Auto* class for the "processor" this walkthrough
# keeps referring to.
from transformers import AutoModel, AutoProcessor, AutoTokenizer

# Load the model through the repository's remote code.
model = AutoModel.from_pretrained(
    'OpenBMB/MiniCPM-V-2_6',
    trust_remote_code=True,
    attn_implementation='sdpa',  # sdpa or flash_attention_2, no eager
    torch_dtype=torch.bfloat16,
)
# The first run has to download the weights, which takes a long time.
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True)

# Single image, single turn. Multiple images / multi-turn chats work similarly.
image = Image.open('bicycle.png').convert('RGB')
question = 'What is in the image?'
msgs = [{'role': 'user', 'content': [image, question]}]
res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
# 输出经过processor处理后的inputs
inputs中包含input_ids、attention_mask、pixel_values、image_sizes、image_bound、tgt_sizes
其中我能看明白的是image_sizes,对应的是原始图片尺寸,并不是14的倍数(解决)
input_ids可以明白是id化后的token,但是怎么得到的并不知道**(存疑)**
attention_mask都为True,且长度和input_ids是一样的,这是可以后期生成的(解决)
pixel_values、image_bound、tgt_sizes这三个看着莫名其妙的,确实不知道该从哪下手看了**(重点看)**
翻 modeling_minicpmv.py可以看到process是transformers封装好的包,我没找到源码(泪目,有知道位置的,请告知),但是我翻到了MiniCPM-Llama3-V-2_5下的源码,看到了图片处理的流程,接下来一起看看吧
图片预处理的来源
LLaVA-UHD是这篇论文提出图片切片的方法
这里主要讲的是左半部分的切片方法
会将input image按照标准切片区域得到理想的切片数量(N=6.5),为了让切片的分法可以多种多样(比如当切片数量N=7时,便只有1-7和7-1这两种分法,而这两种分法可能不是最优的),所以论文中提出要考虑[N-1,N,N+1]这几个切片数量对应的切片分法。
定义每种分法对应的分数计算公式
$$S = \left| \log\frac{n}{m} - \log\frac{W}{H} \right|$$
其中 W、H 为原始图片的宽和高,n/m 为该分法横向块数与纵向块数之比(对应代码中的 grid[0] / grid[1])。
最后取S最小对应的那种分法
代码
基本变量
# 一些基本变量
import math
from torchvision import transforms
# Preprocessing constants shared by the helper functions of this walkthrough.
image_feature_size = 64  # number of placeholder tokens emitted per image / slice
max_slice_nums = 9  # upper bound on the number of slices
scale_resolution = 448  # target side length of one slice; 448 / 14 = 32 patches per side
patch_size = 14  # side length of one patch, in pixels
IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)  # timm.data.IMAGENET_INCEPTION_MEAN
IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)  # timm.data.IMAGENET_INCEPTION_STD

# Usual image preprocessing: convert to a tensor, then normalise each channel.
_to_tensor = transforms.ToTensor()
_normalize = transforms.Normalize(
    mean=IMAGENET_INCEPTION_MEAN,
    std=IMAGENET_INCEPTION_STD,
)
transform = transforms.Compose([_to_tensor, _normalize])
函数定义
def get_slice_image_placeholder(image, tokenizer):
    """Slice one image and build the placeholder text that stands in for it.

    Args:
        image: A single PIL image.
        tokenizer: Tokenizer exposing ``im_start``, ``im_end`` and
            ``unk_token`` (plus the attributes ``get_grid_placeholder`` reads).

    Returns:
        ``(slice_images, final_placeholder)``. ``slice_images`` starts with the
        resized source image (both sides are multiples of ``patch_size``) and
        is followed by every slice, row by row. ``final_placeholder`` is the
        placeholder of the source image followed by the slice placeholders.

    Example (values recorded by the author for ``bicycle.png``):
        image = Image.open('bicycle.png').convert('RGB')
        tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True)
        slice_images, final_placeholder = get_slice_image_placeholder(image, tokenizer)
        # slice_images: sizes 518x392, 364x546, 364x546
        # final_placeholder: <image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>
    """
    # NOTE: the placeholder built here does not contain the image number
    # (<image_id>0</image_id>); apart from that it is meant to mirror what the
    # model's processor produces.
    whole_image_placeholder = (
        tokenizer.im_start + tokenizer.unk_token * image_feature_size + tokenizer.im_end
    )  # <image> + <unk>*64 + </image>

    # For the 667x500 example: source image 518x392, patches [[364x546, 364x546]],
    # best_grid [2, 1].
    source_image, patches, best_grid = slice_image(
        image,
        max_slice_nums,  # 9
        scale_resolution,  # 448
        patch_size,  # 14
    )

    # The resized source image always comes first.
    slice_images = [source_image]
    final_placeholder = whole_image_placeholder

    if patches:
        # Flatten the rows of patches in row-major order.
        for patch_row in patches:
            slice_images.extend(patch_row)
        # e.g. '<slice><unk>*64</slice><slice><unk>*64</slice>'; rows of the
        # grid are separated by '\n'.
        final_placeholder += get_grid_placeholder(
            tokenizer, best_grid, image_feature_size
        )

    return slice_images, final_placeholder
def slice_image(image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False):
    """Resize an image and, when it is large enough, cut it into a grid of slices.

    Args:
        image: A single PIL image.
        max_slice_nums: Maximum number of slices.
        scale_resolution: Target resolution (side length) of one slice.
        patch_size: Patch side length; output sizes are multiples of it.
        never_split: When true, only the resized source image is produced.

    Returns:
        ``(source_image, patches, best_grid)``. ``patches`` is a list of rows
        of PIL images and ``best_grid`` is ``[columns, rows]``. When the image
        is not split, ``patches`` is ``[]`` and ``best_grid`` is ``None``.

    Example (values recorded by the author for a 667x500 ``bicycle.png``):
        source_image, patches, best_grid = slice_image(image, 9, 448, 14)
        # source_image: 518x392
        # patches: [[364x546, 364x546]]
        # best_grid: [2, 1]
    """
    original_size = image.size  # e.g. (667, 500)
    img_width, img_height = original_size
    log_ratio = math.log(img_width / img_height)  # e.g. 0.288...
    area_ratio = img_width * img_height / (scale_resolution * scale_resolution)  # e.g. 1.66...
    multiple = min(math.ceil(area_ratio), max_slice_nums)  # ideal slice count, e.g. 2

    if multiple <= 1 or never_split:
        # No slicing needed: just resize (upscaling allowed) to a patch-aligned size.
        upscaled_size = find_best_resize(
            original_size, scale_resolution, patch_size, allow_upscale=True
        )
        return image.resize(upscaled_size, Image.Resampling.BICUBIC), [], None

    # Consider the ideal slice count and its two neighbours, dropping "1 slice"
    # (no split) and anything above the maximum. For multiple == 2: [2, 3].
    slice_counts = [
        count
        for count in (multiple - 1, multiple, multiple + 1)
        if count != 1 and count <= max_slice_nums
    ]

    # Source image: down-sampled, with both sides divisible by patch_size (e.g. 518x392).
    source_size = find_best_resize(original_size, scale_resolution, patch_size)
    source_image = image.copy().resize(source_size, Image.Resampling.BICUBIC)

    # Every factorisation of every candidate count, e.g. 6 -> 1x6, 2x3, 3x2, 6x1.
    candidate_grids = [
        [first, count // first]
        for count in slice_counts
        for first in range(1, count + 1)
        if count % first == 0
    ]

    # Keep the grid whose aspect ratio is closest (in log space) to the image's.
    # min() returns the first minimum, i.e. the earliest candidate wins ties.
    best_grid = min(
        candidate_grids,
        key=lambda grid: abs(log_ratio - math.log(grid[0] / grid[1])),
        default=[1, 1],
    )

    refine_size = get_refine_size(
        original_size, best_grid, scale_resolution, patch_size, allow_upscale=True
    )  # e.g. (728, 546)
    refine_image = image.resize(refine_size, Image.Resampling.BICUBIC)
    patches = split_to_patches(refine_image, best_grid)

    return source_image, patches, best_grid
def get_refine_size(original_size, grid, scale_resolution, patch_size, allow_upscale=False):
    """Compute the size the whole image must be resized to before cutting it by ``grid``.

    Args:
        original_size: ``(width, height)`` of the original image.
        grid: ``[grid_x, grid_y]``, the number of cells along width and height.
        scale_resolution: Target resolution (side length) of one cell.
        patch_size: Patch side length; each cell side becomes a multiple of it.
        allow_upscale: Forwarded to ``find_best_resize`` for the single cell.

    Returns:
        ``(width, height)`` of the resized image, i.e. the best cell size
        multiplied by the grid dimensions.

    Example:
        get_refine_size((667, 500), [2, 1], 448, 14, allow_upscale=True)
        # (728, 546)
    """
    # The original implementation printed (original_size, grid) here on every
    # call; that leftover debug output has been removed.
    width, height = original_size
    grid_x, grid_y = grid

    # Snap each side to a multiple of the cell count so it divides evenly.
    refine_width = ensure_divide(width, grid_x)  # e.g. 668
    refine_height = ensure_divide(height, grid_y)  # e.g. 500

    # Size of a single cell before patch alignment.
    grid_width = refine_width / grid_x  # e.g. 334
    grid_height = refine_height / grid_y  # e.g. 500

    best_grid_size = find_best_resize(
        (grid_width, grid_height),
        scale_resolution,
        patch_size,
        allow_upscale=allow_upscale,
    )  # e.g. (364, 546) with allow_upscale=True

    refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y)  # e.g. (728, 546)
    return refine_size
def ensure_divide(length, patch_size):
    """Return the multiple of ``patch_size`` nearest to ``length``, at least one multiple.

    Args:
        length: The length to snap.
        patch_size: The step the result must be a multiple of.

    Returns:
        ``round(length / patch_size) * patch_size``, but never less than
        ``patch_size`` itself.

    Example:
        ensure_divide(516, 14)
        # 518
    """
    snapped = round(length / patch_size) * patch_size
    # Very small lengths would round down to 0; fall back to one full patch.
    return snapped if snapped >= patch_size else patch_size
def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False):
    """Pick a patch-aligned size close to ``scale_resolution**2`` pixels in area.

    Args:
        original_size: ``(width, height)`` of the image (or of one grid cell).
        scale_resolution: Target resolution (side length).
        patch_size: Patch side length; both output sides are multiples of it.
        allow_upscale: When true, images smaller than the target area are
            rescaled too (i.e. enlarged).

    Returns:
        ``(best_width, best_height)``, both multiples of ``patch_size``.

    Example:
        find_best_resize((667, 500), scale_resolution=448, patch_size=14, allow_upscale=False)
        # (518, 392)  -- case 1: the image is larger than the target area
        find_best_resize((334, 500), scale_resolution=448, patch_size=14, allow_upscale=True)
        # (364, 546)  -- case 2: a smaller image is enlarged because allow_upscale=True
    """
    width, height = original_size
    exceeds_target_area = width * height > scale_resolution * scale_resolution

    if exceeds_target_area or allow_upscale:
        # Case 1: the image is larger than the target; shrink it while keeping
        # the aspect ratio so its area is as close as possible to
        # scale_resolution * scale_resolution.
        # Case 2: allow_upscale is set; the same formula enlarges a small image.
        aspect = width / height  # e.g. 1.334
        height = int(scale_resolution / math.sqrt(aspect))  # e.g. 387
        width = int(height * aspect)  # e.g. 516
        # Why this works (ignoring int truncation):
        #   width / height = aspect
        #   width * height = (scale_resolution / sqrt(aspect))**2 * aspect
        #                  = scale_resolution * scale_resolution
        # Example: 516 * 387 = 199692 versus 448 * 448 = 200704.

    # Snap both sides to multiples of patch_size, e.g. (518, 392).
    return (ensure_divide(width, patch_size), ensure_divide(height, patch_size))
def split_to_patches(image, grid):
    """Crop an image into exactly ``grid[0]`` columns by ``grid[1]`` rows.

    Args:
        image: The image already resized for this grid; must expose ``size``
            as ``(width, height)`` and ``crop(box)``.
        grid: ``[columns, rows]``.

    Returns:
        A list with one entry per row, each a list of cropped patches from
        left to right.

    Raises:
        ValueError: If a cell would be empty (image smaller than the grid).

    Example:
        refine_image = image.resize((728, 546), Image.Resampling.BICUBIC)
        patches = split_to_patches(refine_image, [2, 1])
        # [[<364x546 image>, <364x546 image>]]
    """
    width, height = image.size
    cols, rows = grid[0], grid[1]
    cell_width = width // cols  # e.g. 728 // 2 = 364
    cell_height = height // rows  # e.g. 546 // 1 = 546
    if cell_width <= 0 or cell_height <= 0:
        raise ValueError(
            f"image of size {width}x{height} cannot be split into a {cols}x{rows} grid"
        )

    # Iterate over grid indices rather than stepping through pixel offsets:
    # stepping by the cell size produced an extra sliver row/column whenever
    # the image size was not an exact multiple of the grid, so the number of
    # patches no longer matched the grid.
    return [
        [
            image.crop(
                (
                    col * cell_width,
                    row * cell_height,
                    (col + 1) * cell_width,
                    (row + 1) * cell_height,
                )
            )
            for col in range(cols)
        ]
        for row in range(rows)
    ]
def get_grid_placeholder(tokenizer, grid, query_num):
    """Build the placeholder text for a grid of slices, one text line per grid row.

    Args:
        tokenizer: Tokenizer exposing ``slice_start``, ``slice_end`` and ``unk_token``.
        grid: ``[columns, rows]``.
        query_num: Number of ``unk_token`` placeholders per slice.

    Returns:
        The slice placeholders of each row concatenated, with rows joined by
        ``"\\n"``.

    Example:
        get_grid_placeholder(tokenizer, [2, 2], 64)
        # '<slice><unk>*64</slice><slice><unk>*64</slice>\\n<slice><unk>*64</slice><slice><unk>*64</slice>'
    """
    # One slice: '<slice><unk>*64</slice>'
    single_slice = (
        tokenizer.slice_start + tokenizer.unk_token * query_num + tokenizer.slice_end
    )
    n_cols, n_rows = grid[0], grid[1]

    # All rows are identical: n_cols slices back to back.
    one_row = single_slice * n_cols

    # Note that consecutive rows are separated by a newline.
    return "\n".join([one_row] * n_rows)
def reshape_by_patch(image_tensor, patch_size):
    """Rearrange an image tensor so its patches are laid out side by side.

    :param image_tensor: shape [3, H, W]
    :param patch_size: side length of one square patch
    :return: [3, patch_size, HW/patch_size]

    Example:
        slice_images, _ = get_slice_image_placeholder(image, tokenizer)
        slice_image = transform(slice_images[0])  # [3, 392, 518]
        reshape_image = reshape_by_patch(slice_image, 14)  # [3, 14, 14504]
    """
    channels = image_tensor.size(0)
    window = (patch_size, patch_size)

    # Non-overlapping patch extraction: [C * p * p, number_of_patches].
    unfolded = torch.nn.functional.unfold(image_tensor, window, stride=window)

    # Split the first axis back into (channel, patch row, patch column).
    per_patch = unfolded.reshape(channels, patch_size, patch_size, -1)

    # Swap the last two axes so each patch's columns stay contiguous, then
    # flatten them: [C, p, number_of_patches * p].
    side_by_side = per_patch.transpose(2, 3).reshape(channels, patch_size, -1)
    return side_by_side
函数调用
slice_images, image_placeholder = get_slice_image_placeholder(image, tokenizer)
# slice_images holds the resized source image followed by its slices, e.g.
# sizes 518x392, 364x546, 364x546.
# image_placeholder is e.g.
# '<image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>'

images = []  # source image + slices, each reshaped to [3, patch_size, HW/patch_size]
tgt_sizes = []  # per image: (H / 14, W / 14), i.e. its size counted in patches
cur_msgs = [image_placeholder]

# The loop variable must not be called `slice_image`: at module level that
# would rebind the slice_image() function to a tensor and break every later
# call to get_slice_image_placeholder().
for pil_slice in slice_images:  # 3 images in the example
    slice_tensor = transform(pil_slice)  # [3, H, W]
    H, W = slice_tensor.shape[1:]
    images.append(reshape_by_patch(slice_tensor, patch_size))  # [3, patch_size, HW/patch_size]
    tgt_sizes.append(
        torch.tensor([H // patch_size, W // patch_size], dtype=torch.int32)
    )

question = 'What is in the image?'
cur_msgs.append(question)
content = '\n'.join(cur_msgs)  # image placeholder and text are separated by '\n'

# The image number is prepended by hand because the placeholder built above
# does not contain it.
copy_msgs = [{'role': 'user', 'content': '<image_id>0</image_id>' + content}]
# Token ids covering both the image placeholders and the text.
input_ids = tokenizer.apply_chat_template(copy_msgs, tokenize=True, add_generation_prompt=False)

if tgt_sizes:
    tgt_sizes = torch.vstack(tgt_sizes)  # n x 2
到此为止就找到了pixel_values(images)、tgt_sizes(✌️)
input_ids是通过将图片+问题进行token ID化,并且需要pad(pad需要指定左、右)
MINICPM-V2_6里面的AutoProcessor处理完的input_ids和这里稍微有些不一样
# 本代码
<image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>\nWhat is in the image?
# MINICPM-V2_6
<image_id>0</image_id><image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>\nWhat is in the image? + <|im_start|>assistant\n
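get_slice_image_placeholder 得到的占位符里缺少了每张图片的编号(<image_id>0</image_id>,上面的代码是在拼接 content 时手动加在最前面的)和后面需要模型输出的角色引导(<|im_start|>assistant\n,因为 add_generation_prompt=False),本文不考虑这些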
这里再额外的补充几句,真实代码中处理的时候每张原始图片之前都会加上编号(<image_id>0</image_id>),切块图片行与行之间会加上\n,每张图片和每张图片之间会加上\n,图片和字符串之间会加上\n。看一个真实的结果(两张图片,每张图片都是2*2分块)
<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n
<|im_start|>user\n
<image_id>0</image_id><image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>\n
<slice><unk>*64</slice><slice><unk>*64</slice>\n
<image_id>1</image_id><image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>\n
<slice><unk>*64</slice><slice><unk>*64</slice>\n
What is in the image?<|im_end|>\n
<|im_start|>assistant\n
最后,但是还没找到image_bound呢
from typing import List, Optional
def convert_to_tensors(tokenizer, input_ids, max_inp_length: Optional[int] = None):
    """Turn token ids into a tensor and locate the image spans inside them.

    Args:
        tokenizer: Tokenizer exposing ``im_start_id`` and ``im_end_id``.
        input_ids: Sequence of token ids.
        max_inp_length: If given, ``input_ids`` is truncated to this length first.

    Returns:
        A dict with
        ``"input_ids"``: int32 tensor of shape ``[1, sequence_length]``;
        ``"image_bound"``: tensor of shape ``[n, 2]``. In each row, column 0 is
        the position right after an ``im_start_id`` token and column 1 is the
        position of an ``im_end_id`` token.

    Example:
        model_input = convert_to_tensors(tokenizer, input_ids, 250)
        # keys: 'input_ids', 'image_bound'
    """
    if max_inp_length is not None:
        input_ids = input_ids[:max_inp_length]
    input_ids = torch.tensor(input_ids, dtype=torch.int32)

    # Positions of the image-start tokens, shifted by one to skip the token itself.
    image_start_tokens = torch.where(input_ids == tokenizer.im_start_id)[0] + 1
    # Positions of the image-end tokens.
    image_end_tokens = torch.where(input_ids == tokenizer.im_end_id)[0]

    # Only keep as many spans as have both a start and an end. Truncation by
    # max_inp_length can cut off a closing token; the original max() made the
    # slices below no-ops and hstack then failed on the mismatched lengths.
    valid_image_nums = min(len(image_start_tokens), len(image_end_tokens))
    image_bound = torch.hstack(
        [
            image_start_tokens[:valid_image_nums].unsqueeze(-1),
            image_end_tokens[:valid_image_nums].unsqueeze(-1),
        ]
    )  # n x 2

    return {
        "input_ids": input_ids.unsqueeze(0),  # 1 x sequence_length
        "image_bound": image_bound,
    }
# Truncate to at most 250 tokens, then build the tensors.
max_inp_length = 250
model_inputs = convert_to_tensors(
    tokenizer,
    input_ids,
    max_inp_length=max_inp_length,
)  # keys: 'input_ids', 'image_bound'
到此为止就找到了全部😄
额外说一句
slice_logic LLaVA-UHD完成这一步对应的代码,基本流程是一样的,少了最后的几步