关键点模型的输出是一个形状为1×17×48×64的热图张量,表示一个具有以下特征的数据结构:
1:代表批量大小(batch size),表示在这个张量中包含了一个样本。
17:代表关键点的数量,表示在这个张量中包含了17个关键点的信息。
48:代表热图的高度(height),表示热图的垂直方向上的像素数。
64:代表热图的宽度(width),表示热图的水平方向上的像素数。
这个张量可以被理解为一个包含了17个关键点的热图数据。在这个热图中,每个关键点都用一个二维的高斯分布表示,其中关键点的位置对应于高斯分布的峰值位置,而峰值的大小则反映了该关键点的置信度(预测分数)。
该张量的形状表示了数据的维度信息,即在一个批量中包含了一个样本,每个样本中有17个关键点的热图,每个热图的尺寸为48×64。
1、从热图中得到关键点坐标
# Take the maximum of every joint heatmap and its location as the keypoint
# score and the keypoint coordinate.
def get_max_preds(batch_heatmaps):
    """Get keypoint predictions from score maps.

    Args:
        batch_heatmaps: numpy.ndarray of shape
            [batch_size, num_joints, height, width].

    Returns:
        A tuple ``(preds, maxvals)``:
        preds: float32 array of shape [batch_size, num_joints, 2] holding the
            (x, y) heatmap coordinate of each joint's maximum. Joints whose
            maximum is not greater than 0 get the coordinate (0, 0).
        maxvals: array of shape [batch_size, num_joints, 1] holding the
            maximum heatmap value of each joint.

    Raises:
        AssertionError: if the input is not a 4-dimensional numpy.ndarray.
    """
    assert isinstance(batch_heatmaps, np.ndarray), \
        'batch_heatmaps should be numpy.ndarray'
    assert batch_heatmaps.ndim == 4, 'batch_heatmaps should be 4-ndim'

    batch_size, num_joints, _, width = batch_heatmaps.shape

    # Flatten every heatmap so a single argmax/amax finds the peak.
    flat = batch_heatmaps.reshape((batch_size, num_joints, -1))
    idx = np.argmax(flat, axis=2)
    maxvals = np.amax(flat, axis=2).reshape((batch_size, num_joints, 1))

    # Decode the flat index with integer arithmetic: column = idx % width,
    # row = idx // width. Doing this before the float32 cast keeps it exact.
    preds = np.empty((batch_size, num_joints, 2), dtype=np.float32)
    preds[:, :, 0] = idx % width
    preds[:, :, 1] = idx // width

    # Zero the coordinates of joints whose peak value is not positive; the
    # (B, J, 1) mask broadcasts over both coordinate columns.
    preds *= np.greater(maxvals, 0.0).astype(np.float32)
    return preds, maxvals
2、仿射变换
将图片中的每个像素点按照一定的规律映射到新的位置。仿射变换需要一个变换矩阵,但由于仿射变换比较复杂,一般很难直接写出这个矩阵,因此OpenCV提供了根据源图像和目标图像上三组对应点自动计算变换矩阵的函数:trans = cv2.getAffineTransform(src, dst),其中src和dst分别是源图像和目标图像上三个对应点的坐标,得到的矩阵维度为2×3。随后把这个矩阵传给cv2.warpAffine(img, trans, (width, height))即可完成仿射变换并得到目标图像,其中img为源图像,(width, height)为目标图像的宽和高。
# Map the keypoint coordinates predicted by the model through the affine
# matrix `trans`.
def transform_preds(coords, center, input_size, output_size):
    """Transform predicted keypoint coordinates with the inverse affine map.

    The matrix is obtained from ``get_affine_transform`` with ``rot=0`` and
    ``inv=1``.

    Args:
        coords: numpy.ndarray with one point per row; only the first two
            columns (x, y) are transformed.
        center: passed through to ``get_affine_transform``.
        input_size: passed through to ``get_affine_transform``.
        output_size: passed through to ``get_affine_transform``.

    Returns:
        A new array with the same shape as ``coords``. Columns 0 and 1 hold
        the transformed x and y; any further columns are left at zero.
    """
    target_coords = np.zeros(coords.shape)
    trans = get_affine_transform(center, input_size, 0, output_size, inv=1)
    for row, point in enumerate(coords):
        target_coords[row, 0:2] = affine_transform(point[0:2], trans)
    return target_coords
# Build the affine matrix `trans`.
def get_affine_transform(center,
                         input_size,
                         rot,
                         output_size,
                         shift=np.array([0, 0], dtype=np.float32),
                         inv=0):
    """Compute the 2x3 affine matrix between a source box and an output size.

    Three point pairs are built (the centre, a point offset from the centre
    along the rotated "up" direction, and a third point perpendicular to
    those two) and handed to ``cv2.getAffineTransform``.

    Args:
        center: source centre point (x, y).
        input_size: source size; ``input_size[0]`` is used as the source
            width that sets the length of the direction vector.
        rot: rotation angle in degrees.
        output_size: destination size as (width, height).
        shift: offset applied to the source points, multiplied element-wise
            by ``input_size``.
        inv: when truthy, return the destination-to-source matrix instead of
            source-to-destination.

    Returns:
        The matrix returned by ``cv2.getAffineTransform``.
    """
    src_w = input_size[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    # Half-width vector pointing "up" from the centre, rotated on the source
    # side only.
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + input_size * shift
    src[1, :] = center + src_dir + input_size * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    # Derive the third point of each triple from the first two.
    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
    # The original returned the misspelled name `transrans` (NameError).
    return trans
# Apply the affine matrix to a single keypoint via matrix multiplication.
def affine_transform(pt, trans):
    """Apply an affine matrix to one 2-D point.

    Args:
        pt: indexable whose first two items are the x and y coordinates.
        trans: affine matrix with three columns (2x3 as produced by
            ``cv2.getAffineTransform``).

    Returns:
        The first two elements of ``trans @ [x, y, 1]``, i.e. the transformed
        (x, y).
    """
    # Homogeneous coordinate so the translation column takes part in the
    # product.
    homogeneous = np.array([pt[0], pt[1], 1.])
    return np.dot(trans, homogeneous)[:2]
# Rotate a direction vector by the given angle.
def get_dir(src_point, rot_rad):
    """Rotate a 2-D point about the origin.

    Args:
        src_point: indexable whose first two items are x and y.
        rot_rad: rotation angle in radians.

    Returns:
        A two-element list ``[x', y']`` with the rotated coordinates.
    """
    sn, cs = np.sin(rot_rad), np.cos(rot_rad)
    x, y = src_point[0], src_point[1]
    return [x * cs - y * sn, x * sn + y * cs]
# Compute the third point needed to define the affine matrix.
def get_3rd_point(a, b):
    """Return the point obtained by rotating ``a - b`` by 90 degrees about ``b``.

    Args:
        a: first 2-D point (array-like).
        b: second 2-D point (array-like).

    Returns:
        numpy.ndarray equal to ``b + (-(a - b).y, (a - b).x)``.
    """
    # asarray lets plain lists/tuples be subtracted element-wise; ndarray
    # inputs pass through unchanged.
    a = np.asarray(a)
    b = np.asarray(b)
    direct = a - b
    return b + np.array([-direct[1], direct[0]], dtype=np.float32)
3、关键点绘制
得到最终关键点坐标和预测分数:
# Refine the heatmap coordinates and map them through the inverse affine
# transform to obtain the final keypoint predictions.
def get_final_preds(batch_heatmaps, center, scale):
    """Turn heatmaps into final keypoint coordinates and scores.

    Args:
        batch_heatmaps: numpy.ndarray of shape
            [batch_size, num_joints, height, width].
        center: per-sample centre, indexed as ``center[i]`` and forwarded to
            ``transform_preds``.
        scale: per-sample size, indexed as ``scale[i]`` and forwarded to
            ``transform_preds`` as its ``input_size`` argument.

    Returns:
        A tuple ``(preds, maxvals)``: the transformed coordinates with the
        same shape as the output of ``get_max_preds``, and the per-joint
        maximum heatmap values.
    """
    coords, maxvals = get_max_preds(batch_heatmaps)

    heatmap_height = batch_heatmaps.shape[2]
    heatmap_width = batch_heatmaps.shape[3]

    for n in range(coords.shape[0]):        # every sample in the batch
        for p in range(coords.shape[1]):    # every joint
            hm = batch_heatmaps[n][p]
            # Round the coordinate to the nearest integer pixel index.
            px = int(math.floor(coords[n][p][0] + 0.5))
            py = int(math.floor(coords[n][p][1] + 0.5))
            if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1:
                # Difference between the neighbouring pixels on each axis.
                diff = np.array([hm[py][px + 1] - hm[py][px - 1],
                                 hm[py + 1][px] - hm[py - 1][px]])
                # Nudge the peak a quarter pixel towards the higher
                # neighbour on each axis.
                coords[n][p] += np.sign(diff) * .25

    preds = coords.copy()
    # Map every sample's coordinates through the inverse transform.
    for i in range(coords.shape[0]):
        preds[i] = transform_preds(coords[i], center[i], scale[i],
                                   [heatmap_width, heatmap_height])
    return preds, maxvals
if __name__ == "__main__":
    # NOTE(review): `output_tensor`, `center` and `scale` are not defined in
    # the visible code; presumably they come from the model inference and
    # detection steps and must be set before this line runs.
    # `center` and `scale` are wrapped in a list to add the batch dimension
    # that get_final_preds indexes with [i].
    key_point, kp_score = get_final_preds(output_tensor, np.asarray([center]), np.asarray([scale]))
绘制:
def show_result(self, image, boxes, kp_points, kp_scores, skeleton=None, waitKey=0):
    """Draw boxes and keypoints on ``image``, display it, and return it.

    Args:
        image: image to draw on; forwarded to ``self.draw_keypoints``.
        boxes: boxes forwarded to ``self.draw_keypoints``.
        kp_points: keypoints forwarded to ``self.draw_keypoints``.
        kp_scores: keypoint scores forwarded to ``self.draw_keypoints``.
        skeleton: joint-index pairs to connect; falls back to
            ``self.skeleton`` when falsy.
        waitKey: value forwarded to ``cv_show_image``.

    Returns:
        The annotated image returned by ``self.draw_keypoints``.
    """
    if not skeleton:
        skeleton = self.skeleton
    image = self.draw_keypoints(image, boxes, kp_points, kp_scores, skeleton)
    # use_rgb=False: the image is shown without a colour-channel conversion.
    cv_show_image('frame', image, use_rgb=False, waitKey=waitKey)
    return image
def draw_keypoints(self,
                   image,
                   boxes,
                   kp_points,
                   kp_scores,
                   skeleton, box_color=(255, 0, 0),
                   circle_color=(0, 255, 0), line_color=(0, 0, 255)):
    """Return a copy of ``image`` with keypoints, skeleton lines and boxes drawn.

    Args:
        image: source image; it is copied before drawing.
        boxes: boxes forwarded to ``draw_image_boxes``.
        kp_points: keypoints forwarded to ``draw_key_point_in_image``.
        kp_scores: accepted for interface symmetry; not used in this method.
        skeleton: joint-index pairs forwarded as ``pointline``.
        box_color: colour for the boxes.
        circle_color: colour for the keypoint circles.
        line_color: colour for the skeleton lines.

    Returns:
        The annotated copy of the image.
    """
    vis_image = image.copy()
    vis_image = draw_key_point_in_image(vis_image, kp_points,
                                        circle_color=circle_color,
                                        line_color=line_color,
                                        pointline=skeleton,
                                        thickness=2)
    vis_image = draw_image_boxes(vis_image, boxes, color=box_color)
    return vis_image
def draw_key_point_in_image(image,
                            key_points,
                            pointline=[],
                            vis_id=False,
                            circle_color=(0, 255, 0),
                            line_color=(0, 0, 255),
                            thickness=2):
    """Draw the keypoints of several instances on a deep copy of ``image``.

    Args:
        image: image to annotate; it is deep-copied, so the argument itself
            is not modified.
        key_points: iterable with one keypoint collection per instance;
            ``None`` entries are skipped.
        pointline: joint-index pairs to connect with lines (only read, never
            mutated here).
        vis_id: when true, ``texts=None`` is forwarded so that
            ``draw_image_points_lines`` labels every point with its index;
            otherwise empty labels are used.
        circle_color: colour for the keypoint circles.
        line_color: colour for the connecting lines.
        thickness: drawing thickness.

    Returns:
        The annotated copy of the image.
    """
    image = copy.deepcopy(image)
    for points in key_points:
        if points is None:
            continue
        text = None if vis_id else [""] * len(points)
        image = draw_image_points_lines(image, points, pointline,
                                        circle_color=circle_color,
                                        line_color=line_color,
                                        texts=text,
                                        thickness=thickness)
    return image
def draw_image_boxes(bgr_image, boxes_list, color=(0, 0, 255), thickness=1):
    """Draw rectangles on ``bgr_image`` in place and return it.

    Args:
        bgr_image: image to draw on (modified in place by ``cv2.rectangle``).
        boxes_list: iterable of boxes; the first four items of each box are
            read as ``x1, y1, x2, y2``.
        color: rectangle colour.
        thickness: rectangle line thickness.

    Returns:
        The same image object that was passed in.
    """
    for box in boxes_list:
        x1, y1, x2, y2 = box[:4]
        top_left = (int(x1), int(y1))
        bottom_right = (int(x2), int(y2))
        cv2.rectangle(bgr_image, top_left, bottom_right, color,
                      thickness=thickness)
    return bgr_image
def draw_image_points_lines(image,
                            points,
                            pointline=[],
                            texts=None,
                            circle_color=(0, 255, 0),
                            line_color=(0, 0, 255),
                            thickness=2):
    """Draw connecting lines, then points with labels, for one instance.

    Args:
        image: image to draw on.
        points: keypoints; converted to an int32 array.
        pointline: joint-index pairs to connect (only read, never mutated).
        texts: one label per point; when ``None`` the point indices are used.
        circle_color: colour for the point circles and their labels.
        line_color: colour for the connecting lines.
        thickness: drawing thickness; at least 1 is used for the points.

    Returns:
        The image returned by ``draw_points_text``.
    """
    points = np.asarray(points, dtype=np.int32)
    if texts is None:
        texts = list(range(len(points)))
    # Lines first so the circles are painted on top of them.
    draw_image_lines(image, points, pointline, color=line_color, thickness=thickness)
    point_thickness = max(int(thickness), 1)
    image = draw_points_text(image, points,
                             texts=texts,
                             color=circle_color,
                             thickness=point_thickness,
                             drawType="simple")
    return image
def draw_points_text(image, points, texts=None, color=(255, 0, 0), thickness=1, drawType="simple"):
    """Draw a filled circle and a text label at every point.

    Args:
        image: image to draw on.
        points: iterable of points; the first two items of each are x and y.
        texts: labels paired with the points via ``zip`` (extra items on
            either side are ignored); defaults to empty labels.
        color: colour for the circle and the label.
        thickness: circle radius is ``thickness * 2``; also forwarded to
            ``draw_text``.
        drawType: label style forwarded to ``draw_text``.

    Returns:
        The image that was passed in.
    """
    if texts is None:
        texts = [""] * len(points)
    for point, text in zip(points, texts):
        center = (int(point[0]), int(point[1]))
        # A thickness of -1 asks OpenCV for a filled circle.
        cv2.circle(image, center, thickness * 2, color, -1)
        draw_text(image, center, text, bg_color=color, thickness=thickness, drawType=drawType)
    return image
def draw_text(image, point, text, bg_color=(255, 0, 0), thickness=5, drawType="custom"):
    """Draw ``text`` at ``point`` in one of two styles.

    Args:
        image: image to draw on.
        point: (x, y) anchor of the text.
        text: value to render; converted with ``str``.
        bg_color: rectangle colour in "custom" mode, text colour in "simple"
            mode.
        thickness: line thickness for the rectangle ("custom") or the text
            ("simple").
        drawType: "custom" draws a rectangle with white text; "simple" draws
            plain coloured text. Any other value draws nothing.

    Returns:
        The image that was passed in.
    """
    fontScale = 0.5
    text_thickness = 1
    fontFace = cv2.FONT_HERSHEY_SIMPLEX
    if drawType == "custom":
        text_size, baseline = cv2.getTextSize(str(text), fontFace, fontScale, thickness)
        text_loc = (point[0], point[1] + text_size[1])
        # NOTE(review): `2 // 2` evaluates to 1 because of operator
        # precedence, so the left edge is text_loc[0] - 1 — confirm that this
        # is the intended margin.
        cv2.rectangle(image, (text_loc[0] - 2 // 2, text_loc[1] - 2 - baseline),
                      (text_loc[0] + text_size[0], text_loc[1] + text_size[1]),
                      color=bg_color, thickness=thickness)
        # White label text.
        cv2.putText(image, str(text), (text_loc[0], text_loc[1] + baseline), fontFace, fontScale,
                    (255, 255, 255), text_thickness, 2)
    elif drawType == "simple":
        cv2.putText(image, str(text), point, fontFace, fontScale, color=bg_color, thickness=thickness)
    return image
def draw_image_lines(image, points, pointline=[], color=(0, 0, 255), thickness=2, check=True):
    """Connect pairs of points with straight lines.

    Args:
        image: image to draw on.
        points: keypoints; converted to an int32 array.
        pointline: iterable of index pairs ``(i, j)`` into ``points`` (only
            read, never mutated).
        color: line colour.
        thickness: line thickness.
        check: when true, a pair is skipped if the coordinate sum of either
            endpoint is not positive.

    Returns:
        The image that was passed in.
    """
    points = np.asarray(points, dtype=np.int32)
    for point_index in pointline:
        # Plain Python ints, so the tuples hold no NumPy scalar types when
        # passed to cv2.line.
        point1 = tuple(int(v) for v in points[point_index[0]])
        point2 = tuple(int(v) for v in points[point_index[1]])
        # The original also tested `point is None`, which can never be true
        # for a tuple; only the coordinate-sum check has an effect.
        if check and (sum(point1) <= 0 or sum(point2) <= 0):
            continue
        cv2.line(image, point1, point2, color, thickness)
    return image
def cv_show_image(title, image, use_rgb=True, waitKey=0):
    """Show an image in an OpenCV window and wait for a key press.

    Args:
        title: window name.
        image: image to show; a shallow copy is made first.
        use_rgb: when true and the last dimension is 3, the copy is converted
            with ``cv2.COLOR_RGB2BGR`` before being shown.
        waitKey: delay forwarded to ``cv2.waitKey``.

    Returns:
        The (possibly converted) copy that was displayed.
    """
    image = copy.copy(image)
    if image.shape[-1] == 3 and use_rgb:
        # Convert RGB to BGR (the original comment described the opposite
        # direction).
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    cv2.namedWindow(title, flags=cv2.WINDOW_NORMAL)
    cv2.imshow(title, image)
    cv2.waitKey(waitKey)
    return image