前言
今天,我们继续学习EAST。在这里我还是要说明一下:我也是在学习,可能有些部分存在不足,望谅解。我写的内容大致是按照其他博主的理解,再加上一些我认为应该提到的函数;但是我要去很多地方查资料,可能无法一一列出资料出处,还望前辈们见谅。今天我们仍然学习icdar数据处理模块。众所周知,我们用到的icdar2015数据集的标签中,文本框只是普通的四边形,而我们所希望的文本框则是中规中矩的平行四边形或者长方形,以便后续和角度信息组合起来,而generate_rbox函数就完成了这个转化过程。下面让我们看一下代码。
def generate_rbox(im_size, polys, tags):
    """Build the score map, RBOX geometry map and training mask for one image.

    Each annotated quadrilateral is shrunk to label the score map, then
    replaced by the smallest-area enclosing parallelogram, which is turned
    into a rectangle; every pixel of the shrunk region stores its distance
    to the four rectangle sides plus the rotation angle.

    :param im_size: (h, w) of the image
    :param polys: iterable of text quadrilaterals; each is used as a 4x2
        numpy array of (x, y) vertices (``.copy()`` / ``.astype`` are called)
    :param tags: iterable parallel to ``polys``; a truthy tag masks the
        polygon out of training
    :return: (score_map, geo_map, training_mask) where score_map is (h, w)
        uint8, geo_map is (h, w, 5) float32 with channels
        top/right/down/left distance and angle, and training_mask is (h, w)
        uint8 with 0 on ignored regions
    """
    h, w = im_size
    # NOTE(review): uint8 limits distinct polygon ids to 255 - confirm that
    # an image never carries more polygons than that.
    poly_mask = np.zeros((h, w), dtype=np.uint8)
    score_map = np.zeros((h, w), dtype=np.uint8)
    geo_map = np.zeros((h, w, 5), dtype=np.float32)
    # mask used during training, to ignore some hard areas
    training_mask = np.ones((h, w), dtype=np.uint8)
    for poly_idx, poly_tag in enumerate(zip(polys, tags)):
        poly = poly_tag[0]
        tag = poly_tag[1]

        # for every vertex, the length of the shorter of its two edges
        r = [None, None, None, None]
        for i in range(4):
            r[i] = min(np.linalg.norm(poly[i] - poly[(i + 1) % 4]),
                       np.linalg.norm(poly[i] - poly[(i - 1) % 4]))
        # score map: 1 inside the shrunk polygon; poly_mask gets a per-polygon id
        shrinked_poly = shrink_poly(poly.copy(), r).astype(np.int32)[np.newaxis, :, :]
        cv2.fillPoly(score_map, shrinked_poly, 1)
        cv2.fillPoly(poly_mask, shrinked_poly, poly_idx + 1)
        # if the poly is too small, then ignore it during training
        poly_h = min(np.linalg.norm(poly[0] - poly[3]), np.linalg.norm(poly[1] - poly[2]))
        poly_w = min(np.linalg.norm(poly[0] - poly[1]), np.linalg.norm(poly[2] - poly[3]))
        if min(poly_h, poly_w) < FLAGS.min_text_size:
            cv2.fillPoly(training_mask, poly.astype(np.int32)[np.newaxis, :, :], 0)
        # tagged polygons are ignored during training as well
        if tag:
            cv2.fillPoly(training_mask, poly.astype(np.int32)[np.newaxis, :, :], 0)

        # pixels that belong to the polygon that was just drawn
        xy_in_poly = np.argwhere(poly_mask == (poly_idx + 1))
        # if geometry == 'RBOX':
        # generate a parallelogram for any combination of two vertices
        # (two candidates per reference edge, eight in total)
        fitted_parallelograms = []
        for i in range(4):
            p0 = poly[i]
            p1 = poly[(i + 1) % 4]
            p2 = poly[(i + 2) % 4]
            p3 = poly[(i + 3) % 4]
            # lines are stored as [a, b, c] with ax + by + c = 0
            edge = fit_line([p0[0], p1[0]], [p0[1], p1[1]])           # edge (p0, p1)
            backward_edge = fit_line([p0[0], p3[0]], [p0[1], p3[1]])  # edge (p0, p3)
            forward_edge = fit_line([p1[0], p2[0]], [p1[1], p2[1]])   # edge (p1, p2)
            # the line opposite to `edge` passes through whichever of
            # p2 / p3 lies farther from `edge`
            if point_dist_to_line(p0, p1, p2) > point_dist_to_line(p0, p1, p3):
                # parallel line through p2
                if edge[1] == 0:
                    edge_opposite = [1, 0, -p2[0]]
                else:
                    edge_opposite = [edge[0], -1, p2[1] - edge[0] * p2[0]]
            else:
                # parallel line through p3
                if edge[1] == 0:
                    edge_opposite = [1, 0, -p3[0]]
                else:
                    edge_opposite = [edge[0], -1, p3[1] - edge[0] * p3[0]]
            # move forward edge: p1 stays, the other three vertices are recomputed
            new_p1 = p1
            new_p2 = line_cross_point(forward_edge, edge_opposite)
            if point_dist_to_line(p1, new_p2, p0) > point_dist_to_line(p1, new_p2, p3):
                # across p0
                if forward_edge[1] == 0:
                    forward_opposite = [1, 0, -p0[0]]
                else:
                    forward_opposite = [forward_edge[0], -1, p0[1] - forward_edge[0] * p0[0]]
            else:
                # across p3
                if forward_edge[1] == 0:
                    forward_opposite = [1, 0, -p3[0]]
                else:
                    forward_opposite = [forward_edge[0], -1, p3[1] - forward_edge[0] * p3[0]]
            new_p0 = line_cross_point(forward_opposite, edge)
            new_p3 = line_cross_point(forward_opposite, edge_opposite)
            # the first vertex is repeated to close the ring
            fitted_parallelograms.append([new_p0, new_p1, new_p2, new_p3, new_p0])
            # or move backward edge: p0 stays, the other three vertices are recomputed
            new_p0 = p0
            new_p3 = line_cross_point(backward_edge, edge_opposite)
            if point_dist_to_line(p0, p3, p1) > point_dist_to_line(p0, p3, p2):
                # across p1
                if backward_edge[1] == 0:
                    backward_opposite = [1, 0, -p1[0]]
                else:
                    backward_opposite = [backward_edge[0], -1, p1[1] - backward_edge[0] * p1[0]]
            else:
                # across p2
                if backward_edge[1] == 0:
                    backward_opposite = [1, 0, -p2[0]]
                else:
                    backward_opposite = [backward_edge[0], -1, p2[1] - backward_edge[0] * p2[0]]
            new_p1 = line_cross_point(backward_opposite, edge)
            new_p2 = line_cross_point(backward_opposite, edge_opposite)
            fitted_parallelograms.append([new_p0, new_p1, new_p2, new_p3, new_p0])
        # keep the candidate with the smallest area (closing vertex dropped)
        areas = [Polygon(t).area for t in fitted_parallelograms]
        parallelogram = np.array(fitted_parallelograms[np.argmin(areas)][:-1], dtype=np.float32)
        # sort the polygon: start from the vertex with the smallest x + y
        parallelogram_coord_sum = np.sum(parallelogram, axis=1)
        min_coord_idx = np.argmin(parallelogram_coord_sum)
        parallelogram = parallelogram[
            [min_coord_idx, (min_coord_idx + 1) % 4, (min_coord_idx + 2) % 4, (min_coord_idx + 3) % 4]]

        # rectangle_from_parallelogram / sort_rectangle are defined elsewhere;
        # the latter yields the ordered rectangle and its rotation angle
        rectange = rectangle_from_parallelogram(parallelogram)
        rectange, rotate_angle = sort_rectangle(rectange)

        p0_rect, p1_rect, p2_rect, p3_rect = rectange
        for y, x in xy_in_poly:
            point = np.array([x, y], dtype=np.float32)
            # top
            geo_map[y, x, 0] = point_dist_to_line(p0_rect, p1_rect, point)
            # right
            geo_map[y, x, 1] = point_dist_to_line(p1_rect, p2_rect, point)
            # down
            geo_map[y, x, 2] = point_dist_to_line(p2_rect, p3_rect, point)
            # left
            geo_map[y, x, 3] = point_dist_to_line(p3_rect, p0_rect, point)
            # angle
            geo_map[y, x, 4] = rotate_angle
    return score_map, geo_map, training_mask
准备部分
h, w = im_size
poly_mask = np.zeros((h, w), dtype=np.uint8) # mask 全0
score_map = np.zeros((h, w), dtype=np.uint8) # 得分map 全0
geo_map = np.zeros((h, w, 5), dtype=np.float32) # 坐标 全0
# mask used during traning, to ignore some hard areas
training_mask = np.ones((h, w), dtype=np.uint8) # mask 全1
for poly_idx, poly_tag in enumerate(zip(polys, tags)):
poly = poly_tag[0] # [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
tag = poly_tag[1] # 文本label, bool类型,True/False
# 对每个顶点,找到经过他的两条边中较短的那条
r = [None, None, None, None] # r中每个值就是经过该点两条边中较短那条边的值
for i in range(4):
# linalg = linear(线性)+algebra(代数),norm则表示范数。默认为二范数
r[i] = min(np.linalg.norm(poly[i] - poly[(i + 1) % 4]), # 就是根据两点坐标求出两点间距离 d=sqrt((x1-x2)^2+(y1-y2)^2)
np.linalg.norm(poly[i] - poly[(i - 1) % 4]))
# 对原始标记框进行0.3倍边长的缩放,这样做可以进一步去除人工标注的误差,拿到更准确的label信息。
shrinked_poly = shrink_poly(poly.copy(), r).astype(np.int32)[np.newaxis, :, :]
# score_map是框类像素均为1,poly_mask则按文字框个数递增填充
cv2.fillPoly(score_map, shrinked_poly, 1) # 将相应部分填充为1
cv2.fillPoly(poly_mask, shrinked_poly, poly_idx + 1)
# if the poly is too small, then ignore it during training
# 如果文本框标签太小或者txt中没具体标记是什么内容,即*或者###,则加掩模,训练时忽略该部分
poly_h = min(np.linalg.norm(poly[0] - poly[3]), np.linalg.norm(poly[1] - poly[2]))
poly_w = min(np.linalg.norm(poly[0] - poly[1]), np.linalg.norm(poly[2] - poly[3]))
if min(poly_h, poly_w) < FLAGS.min_text_size: # 4条边中最小的边小于最小txt的尺寸,
cv2.fillPoly(training_mask, poly.astype(np.int32)[np.newaxis, :, :], 0) # 将标记为*或者###的文本框掩模置0
if tag:
cv2.fillPoly(training_mask, poly.astype(np.int32)[np.newaxis, :, :], 0)
# 当前新加入的文本框区域像素点
xy_in_poly = np.argwhere(poly_mask == (poly_idx + 1))
# if geometry == 'RBOX':
# 对任意两个顶点的组合生成一个平行四边形 - generate a parallelogram for any combination of two vertices
# 对于四个顶点,确定两个顶点组成的一条边,再结合剩下的两个点可以得到两个包含这四个点的平行四边形
# 这里就是遍历两个顶点的组合,生成8个平行四边形
fitted_parallelograms = []
这里是准备分数图等的全0模板和掩膜的全1模板,并且对标签太小、或者txt中没有具体标记内容(即*或者###)的文本框加以掩膜等操作。这里我们找到的博主代码的注释很到位,所以我们只说前辈自定义的几个函数,以及用到的几个库函数。
1)shrink_poly
def shrink_poly(poly, r, ratio=0.3):
    '''
    fit a poly inside the origin poly (flagged "maybe bugs here" by the
    original author); used for generate the score map.

    Both end points of every edge are moved toward each other along that
    edge. The longer pair of opposite edges is processed first, and every
    move uses the vertex positions left by the previous moves.
    ``poly`` is modified in place and is also the return value.

    :param poly: the text poly, 4 vertices indexed as poly[i][0] (x) and
        poly[i][1] (y); rows must support subtraction (e.g. a numpy array)
    :param r: r in the paper, one reference length per vertex
    :param ratio: shrink ratio; each end point moves ``ratio * r[i]``
        along the edge (default 0.3)
    :return: the shrinked poly
    '''
    def pull_in(start, end, swapped):
        # Move poly[start] toward poly[end] and vice versa along their edge.
        dx = poly[end][0] - poly[start][0]
        dy = poly[end][1] - poly[start][1]
        if swapped:
            # angle measured from the y axis, so sin/cos swap roles
            theta = np.arctan2(dx, dy)
            step_x, step_y = np.sin(theta), np.cos(theta)
        else:
            theta = np.arctan2(dy, dx)
            step_x, step_y = np.cos(theta), np.sin(theta)
        poly[start][0] += ratio * r[start] * step_x
        poly[start][1] += ratio * r[start] * step_y
        poly[end][0] -= ratio * r[end] * step_x
        poly[end][1] -= ratio * r[end] * step_y

    # edges (p0, p1) and (p3, p2)
    first_pair = [(0, 1, False), (3, 2, False)]
    # edges (p0, p3) and (p1, p2)
    second_pair = [(0, 3, True), (1, 2, True)]

    # find the longer pair
    if np.linalg.norm(poly[0] - poly[1]) + np.linalg.norm(poly[2] - poly[3]) > \
            np.linalg.norm(poly[0] - poly[3]) + np.linalg.norm(poly[1] - poly[2]):
        # first move (p0, p1), (p2, p3), then (p0, p3), (p1, p2)
        schedule = first_pair + second_pair
    else:
        # first move (p0, p3), (p1, p2), then (p0, p1), (p2, p3)
        schedule = second_pair + first_pair

    for start, end, swapped in schedule:
        pull_in(start, end, swapped)
    return poly
这个函数对文本框进行0.3倍边长的缩放,这样做可以进一步去除人工标注的误差,使label信息更加精准。
2)cv2.fillPoly
这是OpenCV自带的函数。cv2.fillPoly()可以一次填充一个或多个多边形所围成的区域(cv2.fillConvexPoly()则只用于填充单个凸多边形),只需要提供多边形的顶点即可。这里我们用它来填充分数图score_map、文本框编号图poly_mask和训练掩膜training_mask。
3)np.argwhere
numpy中的np.argwhere(a)的作用是返回数组a中非0(即条件为真)元素的索引,其中a通常是一个由条件表达式得到的布尔数组。
图形处理
这里我们假设要求的是以边(p0, p1)和边(p1, p2)作为参考边的平行四边形。要注意的是这里我们说的边(p0, p1)并不是一组坐标,而是坐标组成的边。我们把(p0, p1)设为edge,边(p1, p2)设为forward_edge,边(p0, p3)设为backward_edge。首先第一步,先求出点p2和p3到边edge的距离,选出距离比较大的那个点,图中p2距离更远,因此选择p2。然后过点p2做一条平行于边edge的直线,该边我们定义为edge_opposite。现在,我们就有了平行四边形的三条边,接下来画最后一条边。采用同样的方法对比点p0和p3到直线forward_edge的距离,选择距离更远的点,图中是p3,然后过点p3做直线平行于forward_edge,最后这条直线称为forward_opposite。到这里,四条边都画出来了,分别是edge,forward_edge,edge_opposite,和forward_opposite,最后根据直线的交点更新4个顶点位置。
然后让我们来看一下具体代码。首先是确定edge的对边edge_opposite,也就是平行四边形的第三条边:
# 拟合ax+by+c=0
edge = fit_line([p0[0], p1[0]], [p0[1], p1[1]]) # 边(p0, p1)设为edge
backward_edge = fit_line([p0[0], p3[0]], [p0[1], p3[1]]) # 边p0和p3设为backward_edge
forward_edge = fit_line([p1[0], p2[0]], [p1[1], p2[1]]) # 边(p1, p2)设为forward_edge
#首先第一步,先求出点p2和p3到边edge的距离 通过另外两个点到edge的距离大小来决定edge对应的平行线应该过p2还是p3(选距离大的)
if point_dist_to_line(p0, p1, p2) > point_dist_to_line(p0, p1, p3): # 点到直线距离
# 然后过点p2做一条平行于边edge的直线,该边我们定义为edge_opposite
if edge[1] == 0:
edge_opposite = [1, 0, -p2[0]]
else:
edge_opposite = [edge[0], -1, p2[1] - edge[0] * p2[0]]
else: # 然后过点p3做一条平行于边edge的直线,该边我们定义为edge_opposite
# 经过p3 - after p3
if edge[1] == 0:
edge_opposite = [1, 0, -p3[0]]
else:
edge_opposite = [edge[0], -1, p3[1] - edge[0] * p3[0]]
这就是在我们确定edge边以后我们判断p3和p2谁更远一点,我们过更远的一点做edge的平行线作为edge_opposite 边。这里我们要提到前辈自己定义的一个函数,fit_line:
def fit_line(p1, p2):
    """Fit the line ax + by + c = 0 through two points and return [a, b, c].

    Despite their names the arguments are NOT two points: ``p1`` holds the
    x coordinates of both end points and ``p2`` holds their y coordinates.

    :param p1: [x_start, x_end]
    :param p2: [y_start, y_end]
    :return: [1., 0., -x] when both x coordinates are equal (vertical line),
        otherwise [k, -1., b] for the fitted line y = k * x + b
    """
    # fit a line ax+by+c = 0
    if p1[0] == p1[1]:
        # equal x coordinates: the line is x = p1[0] and has no finite slope
        return [1., 0., -p1[0]]
    else:
        [k, b] = np.polyfit(p1, p2, deg=1)
        return [k, -1., b]
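这是作者自己定义的、用来确定边所在直线的函数:它拟合直线ax+by+c=0并返回系数[a, b, c]。需要注意的是,参数p1、p2并不是两个点,而是两个端点的x坐标列表和y坐标列表;当两个x坐标相等(竖直线)时返回[1, 0, -x],否则返回[k, -1, b]。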
point_dist_to_line:
def point_dist_to_line(p1, p2, p3):
    """Compute the distance from point p3 to the line through p1 and p2.

    :param p1: first point on the line (array-like)
    :param p2: second point on the line (array-like), distinct from p1
    :param p3: the point whose distance is measured (array-like)
    :return: |cross(p2 - p1, p1 - p3)| / |p2 - p1|
    """
    p1 = np.asarray(p1)
    p2 = np.asarray(p2)
    p3 = np.asarray(p3)
    along = p2 - p1
    offset = p1 - p3
    if along.shape == (2,) and offset.shape == (2,):
        # np.cross on 2-element vectors is deprecated since NumPy 2.0, so
        # the z component of the cross product is computed directly.
        cross = along[0] * offset[1] - along[1] * offset[0]
    else:
        cross = np.cross(along, offset)
    return np.linalg.norm(cross) / np.linalg.norm(along)
这个函数也是前辈自己定义的,作用是计算点p3到直线p1-p2的距离。
然后是最后一条边的定义:
new_p2 = line_cross_point(forward_edge, edge_opposite) # 求两条直线的交点
# 求p0,p3到直线p1-new_p2的距离,根据距离大的点画最后一条直线
if point_dist_to_line(p1, new_p2, p0) > point_dist_to_line(p1, new_p2, p3):
# across p0
if forward_edge[1] == 0:
forward_opposite = [1, 0, -p0[0]]
else:
forward_opposite = [forward_edge[0], -1, p0[1] - forward_edge[0] * p0[0]]
else:
# across p3
if forward_edge[1] == 0:
forward_opposite = [1, 0, -p3[0]]
else:
forward_opposite = [forward_edge[0], -1, p3[1] - forward_edge[0] * p3[0]]
new_p0 = line_cross_point(forward_opposite, edge)
new_p3 = line_cross_point(forward_opposite, edge_opposite)
fitted_parallelograms.append([new_p0, new_p1, new_p2, new_p3, new_p0])
采取相同的方法,找出p0和p3中距离较大的那个点,过它画出最后一条平行线,然后求出四条线的交点,更新文本框坐标。这里我们要提到前辈自己定义的line_cross_point函数:
def line_cross_point(line1, line2):
    """Compute the cross point of two lines given as [a, b, c] (ax+by+c=0).

    Lines are expected in the normalised forms used in this file:
    [1, 0, -x0] for the vertical line x = x0 and [k, -1, b] for
    y = k * x + b.

    :param line1: coefficients [a, b, c] of the first line
    :param line2: coefficients [a, b, c] of the second line
    :return: float32 array [x, y], or None (after printing a message) when
        the lines are parallel
    """
    line1_vertical = line1[1] == 0
    line2_vertical = line2[1] == 0
    # Verticality is decided by b == 0. Comparing only the `a` coefficients
    # would wrongly treat a vertical line [1, 0, c] and a slope-1 line
    # [1, -1, b] as parallel.
    if line1_vertical and line2_vertical:
        print('Cross point does not exist')
        return None
    if line1_vertical:
        x = -line1[2]
        y = line2[0] * x + line2[2]
    elif line2_vertical:
        x = -line2[2]
        y = line1[0] * x + line1[2]
    else:
        k1, _, b1 = line1
        k2, _, b2 = line2
        if k1 == k2:
            # equal slopes: parallel (or identical) lines
            print('Cross point does not exist')
            return None
        x = -(b1 - b2) / (k1 - k2)
        y = k1 * x + b1
    return np.array([x, y], dtype=np.float32)
作用是求出两条边的交点,以便我们更新坐标。
最后部分
# 选定面积最小的平行四边形
areas = [Polygon(t).area for t in fitted_parallelograms]
parallelogram = np.array(fitted_parallelograms[np.argmin(areas)][:-1], dtype=np.float32)
# sort the polygon
parallelogram_coord_sum = np.sum(parallelogram, axis=1)
min_coord_idx = np.argmin(parallelogram_coord_sum)
parallelogram = parallelogram[
[min_coord_idx, (min_coord_idx + 1) % 4, (min_coord_idx + 2) % 4, (min_coord_idx + 3) % 4]]
# 得到外包矩形即旋转角
rectange = rectangle_from_parallelogram(parallelogram)
rectange, rotate_angle = sort_rectangle(rectange)
p0_rect, p1_rect, p2_rect, p3_rect = rectange
# 对当前新加入的文本框区域像素点,根据其到矩形四边的距离修改geo_map
for y, x in xy_in_poly:
point = np.array([x, y], dtype=np.float32)
# top
geo_map[y, x, 0] = point_dist_to_line(p0_rect, p1_rect, point) # 点到直线的距离
# right
geo_map[y, x, 1] = point_dist_to_line(p1_rect, p2_rect, point)
# down
geo_map[y, x, 2] = point_dist_to_line(p2_rect, p3_rect, point)
# left
geo_map[y, x, 3] = point_dist_to_line(p3_rect, p0_rect, point)
# angle
geo_map[y, x, 4] = rotate_angle
return score_map, geo_map, training_mask
最后这段代码主要完成的是geo_map。众所周知,它由角度和到四条边的距离组成,而这段代码的上半部分就是在计算角度,下半部分则是在计算到四条边的距离,最后合成geo_map,并且和开头我们算的score_map、training_mask一起作为返回值返回。这里我们再回顾一下前面介绍过的作者定义的函数point_dist_to_line:
def point_dist_to_line(p1, p2, p3):
    """Compute the distance from point p3 to the line through p1 and p2.

    :param p1: first point on the line (array-like)
    :param p2: second point on the line (array-like), distinct from p1
    :param p3: the point whose distance is measured (array-like)
    :return: |cross(p2 - p1, p1 - p3)| / |p2 - p1|
    """
    p1 = np.asarray(p1)
    p2 = np.asarray(p2)
    p3 = np.asarray(p3)
    along = p2 - p1
    offset = p1 - p3
    if along.shape == (2,) and offset.shape == (2,):
        # np.cross on 2-element vectors is deprecated since NumPy 2.0, so
        # the z component of the cross product is computed directly.
        cross = along[0] * offset[1] - along[1] * offset[0]
    else:
        cross = np.cross(along, offset)
    return np.linalg.norm(cross) / np.linalg.norm(along)
作用是计算点到直线的距离。
最后
今天我们的学习就到这,最后还是要说一句,本人也是刚学可能有的地方有错误,望请指出,谢谢各位,还有就是我的学习大部分来源于一位名叫业余阻击手19的博主文章,在这里向前辈致以诚挚的敬意。祝大家学业有成,工作顺利。