模型结构构建

最新推荐文章于 2024-05-29 14:41:48 发布

华灯初上~(unique)

最新推荐文章于 2024-05-29 14:41:48 发布

阅读量387

点赞数 2

分类专栏： OCR 文章标签：深度学习神经网络 ocr

本文链接：https://blog.csdn.net/m0_37188326/article/details/120743582

版权

OCR 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

基于paddlepaddle的模型构建

卷积层构建
残差层构建
残差网络构建
FPN实现
Model Head
损失函数构建
paddle中实现的loss
数据预处理
- 注解
- - generate_tvo_and_tco

卷积层构建

class ConvBNLayer(nn.Layer):
    """Conv2D followed by BatchNorm, optionally preceded by a 2x2 average pool.

    Args:
        in_channels: input channel count of the convolution.
        out_channels: output channel count of the convolution.
        kernel_size: integer kernel size; the padding is derived from it as
            ``(kernel_size - 1) // 2``.
        stride: convolution stride.
        groups: number of groups of the (grouped) convolution.
        is_vd_mode: when true, ``forward`` average-pools the input with
            kernel 2 / stride 2 before the convolution.
        act: activation name handed to ``nn.BatchNorm``.
        name: prefix used to build the parameter names. It is concatenated
            with string suffixes, so a ``str`` is required in practice even
            though the default is ``None``.
    """

    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            groups=1,
            is_vd_mode=False,
            act=None,
            name=None, ):
        super(ConvBNLayer, self).__init__()

        self.is_vd_mode = is_vd_mode

        # The pooling layer is always created; it is only applied in
        # forward() when is_vd_mode is set.
        self._pool2d_avg = nn.AvgPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)

        # Half-kernel padding: keeps the spatial size for odd kernels at
        # stride 1.
        half_kernel = (kernel_size - 1) // 2
        self._conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=half_kernel,
            groups=groups,
            weight_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)

        # "conv1" becomes "bn_conv1"; for every other name the first three
        # characters are replaced by "bn" (e.g. "res2a_x" -> "bn2a_x").
        bn_name = "bn_" + name if name == "conv1" else "bn" + name[3:]
        self._batch_norm = nn.BatchNorm(
            out_channels,
            act=act,
            # trainable scale / offset
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            # running statistics
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def forward(self, inputs):
        """Apply (optional pool) -> conv -> batch norm to ``inputs``."""
        x = self._pool2d_avg(inputs) if self.is_vd_mode else inputs
        return self._batch_norm(self._conv(x))

残差层构建

## Bottleneck block: 1x1 conv -> 3x3 conv -> 1x1 conv that widens to 4x channels
class BottleneckBlock(nn.Layer):
    """Residual bottleneck block built from three ``ConvBNLayer`` instances.

    Args:
        in_channels: channel count of the block input.
        out_channels: channel count of the two inner convolutions; the block
            output has ``out_channels * 4`` channels.
        stride: stride of the middle 3x3 convolution.
        shortcut: when true the input is added to the result unchanged; when
            false it first goes through a 1x1 projection (``self.short``).
        if_first: disables the vd-mode average pool of the projection.
        name: prefix for the parameter names of all sub-layers (must be a
            ``str``; it is concatenated with suffixes).
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 shortcut=True,
                 if_first=False,
                 name=None):
        super().__init__()

        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=name + "_branch2a")
        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=name + "_branch2c")

        # Projection shortcut. The conv itself has stride 1; in vd mode the
        # ConvBNLayer average-pools before the 1x1 conv, and that pool is
        # skipped only when if_first is true.
        if not shortcut:
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=1,
                is_vd_mode=not if_first,
                name=name + "_branch1")

        self.shortcut = shortcut

    def forward(self, inputs):
        """Return ``relu(shortcut(inputs) + conv2(conv1(conv0(inputs))))``."""
        residual = self.conv2(self.conv1(self.conv0(inputs)))
        identity = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=identity, y=residual))

残差网络构建

class ResNet_SAST(nn.Layer):
    """ResNet-vd style backbone that exposes the feature map of every stage.

    ``forward`` returns a list holding the raw input, the output of the
    three-convolution stem, and then one tensor per residual stage.
    ``self.out_channels`` lists the channel count of those entries in the
    same order.

    Args:
        in_channels: channel count of the input image.
        layers: network depth, one of 18, 34, 50, 101, 152, 200. Depths of
            50 and above are built from ``BottleneckBlock``; smaller depths
            use ``BasicBlock``, which is not defined in this section and has
            to be available in the enclosing module.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels=3, layers=50, **kwargs):
        super(ResNet_SAST, self).__init__()

        self.layers = layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        # Number of residual blocks per stage. 34 and 50 carry a fifth stage.
        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]

        # num_channels[k] is the input width of the first block of stage k.
        # The basic-block list needs a fifth entry as well because
        # layers == 34 has five stages; stage 3 outputs num_filters[3] == 512
        # channels, so that is what stage 4 receives.
        num_channels = [64, 256, 512, 1024, 2048
                        ] if layers >= 50 else [64, 64, 128, 256, 512]
        # num_filters[k] is the out_channels argument of every block in
        # stage k.
        num_filters = [64, 128, 256, 512, 512]

        # Stem: three 3x3 convolutions (the first one with stride 2)
        # followed by a stride-2 max pool.
        self.conv1_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=3,
            stride=2,
            act='relu',
            name="conv1_1")
        self.conv1_2 = ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_2")
        self.conv1_3 = ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_3")
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)

        self.stages = []
        # Channel counts of the tensors returned by forward(): the raw input
        # first, then the stem output; one entry per stage is appended below.
        self.out_channels = [in_channels, 64]
        if layers >= 50:
            for block in range(len(depth)):
                block_list = []
                # Only the first block of a stage gets a projection shortcut
                # (shortcut=False); the flag is flipped after that block.
                shortcut = False
                for i in range(depth[block]):
                    if layers in [101, 152] and block == 2:
                        if i == 0:
                            conv_name = "res" + str(block + 2) + "a"
                        else:
                            conv_name = "res" + str(block + 2) + "b" + str(i)
                    else:
                        conv_name = "res" + str(block + 2) + chr(97 + i)
                    bottleneck_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BottleneckBlock(
                            # A bottleneck block outputs 4 * out_channels,
                            # so every block after the first one of a stage
                            # receives num_filters[block] * 4 channels.
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block] * 4,
                            out_channels=num_filters[block],
                            # Stride 2 on the first block of every stage
                            # except stage 0.
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            # True only for the very first block of the
                            # network, where the projection shortcut must
                            # not average-pool.
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(bottleneck_block)
                self.out_channels.append(num_filters[block] * 4)
                self.stages.append(nn.Sequential(*block_list))
        else:
            for block in range(len(depth)):
                block_list = []
                shortcut = False
                for i in range(depth[block]):
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                    basic_block = self.add_sublayer(
                        'bb_%d_%d' % (block, i),
                        BasicBlock(
                            in_channels=num_channels[block]
                            if i == 0 else num_filters[block],
                            out_channels=num_filters[block],
                            stride=2 if i == 0 and block != 0 else 1,
                            shortcut=shortcut,
                            if_first=block == i == 0,
                            name=conv_name))
                    shortcut = True
                    block_list.append(basic_block)
                self.out_channels.append(num_filters[block])
                self.stages.append(nn.Sequential(*block_list))

    def forward(self, inputs):
        """Return ``[inputs, stem_output, stage_0_output, ...]``."""
        out = [inputs]
        y = self.conv1_1(inputs)
        y = self.conv1_2(y)
        y = self.conv1_3(y)
        out.append(y)
        y = self.pool2d_max(y)
        for block in self.stages:
            y = block(y)
            out.append(y)
        return out

以下说明 resnet_vd 与原版 ResNet 的区别。原版 ResNet 中的模块分为两类：下采样模块与非下采样模块。其中第一个下采样模块比较特殊，一般采用 conv7x7 的大卷积核进行特征提取；paddle 在这里做了改进（如图中 vc 结构所示），使用 3 个 conv3x3 卷积替代该大卷积（注意：其后的 maxpool 为 3x3、stride=2、padding=1，偶数尺寸的特征图经过后恰好缩小一半），这一点在上面的残差网络构建代码中有体现。之后的下采样模块则修改了 shortcut 分支：将原来的 conv1x1(s=2) 替换为 avgpool(2x2, s=2) + conv1x1，在保持参数量不变的基础上提高了模型精度。
在这里插入图片描述

FPN实现

## Convolution + batch-norm building block
class ConvBNLayer(nn.Layer):
    """Conv2D (no bias) followed by BatchNorm.

    Args:
        in_channels: input channel count.
        out_channels: output channel count.
        kernel_size: integer kernel size; padding is ``(kernel_size - 1) // 2``.
        stride: convolution stride.
        groups: number of convolution groups.
        if_act: stored on the instance; not read anywhere in this class.
        act: activation name handed to ``nn.BatchNorm``.
        name: prefix of the parameter names (a ``str`` is required because
            it is concatenated with suffixes).
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.if_act = if_act
        self.act = act

        conv_padding = (kernel_size - 1) // 2
        self.conv = nn.Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=conv_padding,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)

        # All batch-norm parameter names share the "bn_<name>" prefix.
        bn_prefix = "bn_" + name
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name=bn_prefix + "_scale"),
            bias_attr=ParamAttr(name=bn_prefix + "_offset"),
            moving_mean_name=bn_prefix + "_mean",
            moving_variance_name=bn_prefix + "_variance")

    def forward(self, x):
        """Apply the convolution, then the batch norm."""
        return self.bn(self.conv(x))

## Transposed convolution + batch norm, used for upsampling
class DeConvBNLayer(nn.Layer):
    """Conv2DTranspose (no bias) followed by BatchNorm.

    Args:
        in_channels: input channel count.
        out_channels: output channel count.
        kernel_size: integer kernel size; padding is ``(kernel_size - 1) // 2``.
        stride: stride of the transposed convolution.
        groups: number of convolution groups.
        if_act: stored on the instance; not read anywhere in this class.
        act: activation name handed to ``nn.BatchNorm``.
        name: prefix of the parameter names (a ``str`` is required because
            it is concatenated with suffixes).
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 groups=1,
                 if_act=True,
                 act=None,
                 name=None):
        super().__init__()
        self.if_act = if_act
        self.act = act

        deconv_padding = (kernel_size - 1) // 2
        self.deconv = nn.Conv2DTranspose(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=deconv_padding,
            groups=groups,
            weight_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)

        # All batch-norm parameter names share the "bn_<name>" prefix.
        bn_prefix = "bn_" + name
        self.bn = nn.BatchNorm(
            num_channels=out_channels,
            act=act,
            param_attr=ParamAttr(name=bn_prefix + "_scale"),
            bias_attr=ParamAttr(name=bn_prefix + "_offset"),
            moving_mean_name=bn_prefix + "_mean",
            moving_variance_name=bn_prefix + "_variance")

    def forward(self, x):
        """Apply the transposed convolution, then the batch norm."""
        return self.bn(self.deconv(x))

## Upsampling fusion branch of the FPN
class FPN_Up_Fusion(nn.Layer):
    """Fuse five feature maps from the deepest one outwards.

    Args:
        in_channels: list of channel counts. It is reversed internally and
            the first five entries of the reversed list are used, matching
            the reversed ``x[2:]`` consumed by ``forward``.
    """

    def __init__(self, in_channels):
        super().__init__()
        # Reverse so that index 0 refers to the last (deepest) feature map.
        rev_channels = in_channels[::-1]
        widths = [256, 256, 192, 192, 128]

        # Lateral 1x1 convolutions: map every input level to its fusion
        # width.
        self.h0_conv = ConvBNLayer(rev_channels[0], widths[0], 1, 1, act=None, name='fpn_up_h0')
        self.h1_conv = ConvBNLayer(rev_channels[1], widths[1], 1, 1, act=None, name='fpn_up_h1')
        self.h2_conv = ConvBNLayer(rev_channels[2], widths[2], 1, 1, act=None, name='fpn_up_h2')
        self.h3_conv = ConvBNLayer(rev_channels[3], widths[3], 1, 1, act=None, name='fpn_up_h3')
        self.h4_conv = ConvBNLayer(rev_channels[4], widths[4], 1, 1, act=None, name='fpn_up_h4')

        # g0..g3 each end in a stride-2 transposed convolution that also
        # switches to the width of the next level.
        self.g0_conv = DeConvBNLayer(widths[0], widths[1], 4, 2, act=None, name='fpn_up_g0')

        self.g1_conv = nn.Sequential(
            ConvBNLayer(widths[1], widths[1], 3, 1, act='relu', name='fpn_up_g1_1'),
            DeConvBNLayer(widths[1], widths[2], 4, 2, act=None, name='fpn_up_g1_2')
        )
        self.g2_conv = nn.Sequential(
            ConvBNLayer(widths[2], widths[2], 3, 1, act='relu', name='fpn_up_g2_1'),
            DeConvBNLayer(widths[2], widths[3], 4, 2, act=None, name='fpn_up_g2_2')
        )
        self.g3_conv = nn.Sequential(
            ConvBNLayer(widths[3], widths[3], 3, 1, act='relu', name='fpn_up_g3_1'),
            DeConvBNLayer(widths[3], widths[4], 4, 2, act=None, name='fpn_up_g3_2')
        )
        # The last stage has no transposed convolution: two stride-1 convs.
        self.g4_conv = nn.Sequential(
            ConvBNLayer(widths[4], widths[4], 3, 1, act='relu', name='fpn_up_fusion_1'),
            ConvBNLayer(widths[4], widths[4], 1, 1, act=None, name='fpn_up_fusion_2')
        )

    def _add_relu(self, x1, x2):
        """Element-wise add, then ReLU."""
        return F.relu(paddle.add(x=x1, y=x2))

    def forward(self, x):
        """Fuse ``x[2:]`` (taken in reverse order) into one feature map."""
        # Skip the first two entries of x and start from the last one.
        feats = x[2:][::-1]
        h0 = self.h0_conv(feats[0])
        h1 = self.h1_conv(feats[1])
        h2 = self.h2_conv(feats[2])
        h3 = self.h3_conv(feats[3])
        h4 = self.h4_conv(feats[4])

        g0 = self.g0_conv(h0)
        g1 = self.g1_conv(self._add_relu(g0, h1))
        g2 = self.g2_conv(self._add_relu(g1, h2))
        g3 = self.g3_conv(self._add_relu(g2, h3))
        return self.g4_conv(self._add_relu(g3, h4))

## Downsampling fusion branch of the FPN
class FPN_Down_Fusion(nn.Layer):
    """Fuse the first three feature maps while moving to coarser levels.

    Args:
        in_channels: list of channel counts; entries 0..2 describe the three
            feature maps read by ``forward``.
    """

    def __init__(self, in_channels):
        super().__init__()
        widths = [32, 64, 128]

        # Lateral 3x3 convolutions, one per input level.
        self.h0_conv = ConvBNLayer(in_channels[0], widths[0], 3, 1, act=None, name='fpn_down_h0')
        self.h1_conv = ConvBNLayer(in_channels[1], widths[1], 3, 1, act=None, name='fpn_down_h1')
        self.h2_conv = ConvBNLayer(in_channels[2], widths[2], 3, 1, act=None, name='fpn_down_h2')

        # g0 and g1 end in a stride-2 convolution that also switches to the
        # width of the next level.
        self.g0_conv = ConvBNLayer(widths[0], widths[1], 3, 2, act=None, name='fpn_down_g0')

        self.g1_conv = nn.Sequential(
            ConvBNLayer(widths[1], widths[1], 3, 1, act='relu', name='fpn_down_g1_1'),
            ConvBNLayer(widths[1], widths[2], 3, 2, act=None, name='fpn_down_g1_2')
        )

        # Final fusion at stride 1.
        self.g2_conv = nn.Sequential(
            ConvBNLayer(widths[2], widths[2], 3, 1, act='relu', name='fpn_down_fusion_1'),
            ConvBNLayer(widths[2], widths[2], 1, 1, act=None, name='fpn_down_fusion_2')
        )

    def forward(self, x):
        """Fuse ``x[0]``, ``x[1]`` and ``x[2]`` into one feature map."""
        # Only the first three entries of x are used.
        feats = x[:3]
        h0 = self.h0_conv(feats[0])
        h1 = self.h1_conv(feats[1])
        h2 = self.h2_conv(feats[2])

        g0 = self.g0_conv(h0)
        g1 = self.g1_conv(F.relu(paddle.add(x=g0, y=h1)))
        return self.g2_conv(F.relu(paddle.add(x=g1, y=h2)))

## CAB (cross attention block)
class Cross_Attention(nn.Layer):
    """Self-attention applied separately along the two spatial axes.

    The channel count used by the reshapes and by the attention scale is
    taken from ``in_channels`` instead of being fixed at 128, so the block
    works for any width (and is unchanged for 128).

    Args:
        in_channels: channel count of the input and of the output.
    """

    def __init__(self, in_channels):
        super(Cross_Attention, self).__init__()
        # Kept for _cal_fweight, which needs the channel count.
        self.in_channels = in_channels

        self.theta_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_theta')
        self.phi_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_phi')
        self.g_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_g')

        self.fh_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_weight')
        self.fh_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_sc')

        self.fv_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_weight')
        self.fv_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_sc')

        self.f_attn_conv = ConvBNLayer(in_channels * 2, in_channels, 1, 1, act='relu', name='f_attn')

    def _cal_fweight(self, f, shape):
        """Attend along the last spatial axis of three channel-first maps.

        Args:
            f: ``[f_theta, f_phi, f_g]``; axis 1 of each is moved to the
                end and treated as the channel axis.
            shape: ``[d0, d1, d2]``, the sizes of axes 0, 2 and 3 of those
                tensors. Attention runs over ``d2`` independently for each
                of the ``d0 * d1`` rows.

        Returns:
            Tensor of shape ``[d0, d1, d2, in_channels]``.
        """
        f_theta, f_phi, f_g = f
        channels = self.in_channels

        # Move channels last and merge the first two axes.
        f_theta = paddle.transpose(f_theta, [0, 2, 3, 1])
        f_theta = paddle.reshape(f_theta, [shape[0] * shape[1], shape[2], channels])
        f_phi = paddle.transpose(f_phi, [0, 2, 3, 1])
        f_phi = paddle.reshape(f_phi, [shape[0] * shape[1], shape[2], channels])
        f_g = paddle.transpose(f_g, [0, 2, 3, 1])
        f_g = paddle.reshape(f_g, [shape[0] * shape[1], shape[2], channels])

        # Correlation: [d0 * d1, d2, d2].
        f_attn = paddle.matmul(f_theta, paddle.transpose(f_phi, [0, 2, 1]))
        # Scale by sqrt(channels), then normalise with softmax.
        f_attn = f_attn / (channels**0.5)
        f_attn = F.softmax(f_attn)

        # Weighted sum: [d0 * d1, d2, channels].
        f_weight = paddle.matmul(f_attn, f_g)
        f_weight = paddle.reshape(
            f_weight, [shape[0], shape[1], shape[2], channels])
        return f_weight

    def forward(self, f_common):
        """Return the attention-enhanced version of ``f_common``.

        ``f_common`` is indexed as a 4-D tensor with channels on axis 1
        (presumably NCHW, the Paddle default -- confirm against the caller).
        """
        f_shape = paddle.shape(f_common)

        f_theta = self.theta_conv(f_common)
        f_phi = self.phi_conv(f_common)
        f_g = self.g_conv(f_common)

        ######## horizon ########
        # Attention along axis 3, independently for every index of axis 2.
        fh_weight = self._cal_fweight([f_theta, f_phi, f_g],
                                      [f_shape[0], f_shape[2], f_shape[3]])
        # Move channels back to axis 1.
        fh_weight = paddle.transpose(fh_weight, [0, 3, 1, 2])
        fh_weight = self.fh_weight_conv(fh_weight)
        # Shortcut branch.
        fh_sc = self.fh_sc_conv(f_common)
        f_h = F.relu(fh_weight + fh_sc)

        ######## vertical ########
        # Swap the two spatial axes so the same helper attends along axis 2.
        fv_theta = paddle.transpose(f_theta, [0, 1, 3, 2])
        fv_phi = paddle.transpose(f_phi, [0, 1, 3, 2])
        fv_g = paddle.transpose(f_g, [0, 1, 3, 2])
        fv_weight = self._cal_fweight([fv_theta, fv_phi, fv_g],
                                      [f_shape[0], f_shape[3], f_shape[2]])
        # Channels back to axis 1 and the spatial axes back to their
        # original order.
        fv_weight = paddle.transpose(fv_weight, [0, 3, 2, 1])
        fv_weight = self.fv_weight_conv(fv_weight)
        # Shortcut branch.
        fv_sc = self.fv_sc_conv(f_common)
        f_v = F.relu(fv_weight + fv_sc)

        ######## merge ########
        # Concatenate both directions on the channel axis and project back.
        f_attn = paddle.concat([f_h, f_v], axis=1)
        f_attn = self.f_attn_conv(f_attn)
        return f_attn


class SASTFPN(nn.Layer):
    """FPN neck: sum of the down and up fusion branches, optionally with CAB.

    Args:
        in_channels: list of backbone channel counts, passed to both fusion
            branches.
        with_cab: when true, ``forward`` runs the cross attention block on
            the fused feature map. The block is constructed either way.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, with_cab=False, **kwargs):
        super().__init__()
        self.in_channels = in_channels
        self.with_cab = with_cab
        self.FPN_Down_Fusion = FPN_Down_Fusion(self.in_channels)
        self.FPN_Up_Fusion = FPN_Up_Fusion(self.in_channels)
        # Width of the fused feature map produced by forward().
        self.out_channels = 128
        self.cross_attention = Cross_Attention(self.out_channels)

    def forward(self, x):
        """Return the fused feature map for the list of backbone outputs."""
        down_feat = self.FPN_Down_Fusion(x)
        up_feat = self.FPN_Up_Fusion(x)

        # Element-wise sum of both branches, then ReLU.
        fused = F.relu(paddle.add(x=down_feat, y=up_feat))

        if not self.with_cab:
            return fused
        return self.cross_attention(fused)

在这里插入图片描述

Model Head

## 
class SAST_Header1(nn.Layer):
    """Head producing the score map and the border-offset map.

    Args:
        in_channels: channel count of the input feature map.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, **kwargs):
        super().__init__()
        c0, c1, c2 = 64, 64, 128

        # TCL (text center line): single-channel score branch.
        score_layers = [
            ConvBNLayer(in_channels, c0, 1, 1, act='relu', name='f_score1'),
            ConvBNLayer(c0, c1, 3, 1, act='relu', name='f_score2'),
            ConvBNLayer(c1, c2, 1, 1, act='relu', name='f_score3'),
            ConvBNLayer(c2, 1, 3, 1, act=None, name='f_score4'),
        ]
        self.score_conv = nn.Sequential(*score_layers)

        # TBO (text border offset): four-channel regression branch.
        border_layers = [
            ConvBNLayer(in_channels, c0, 1, 1, act='relu', name='f_border1'),
            ConvBNLayer(c0, c1, 3, 1, act='relu', name='f_border2'),
            ConvBNLayer(c1, c2, 1, 1, act='relu', name='f_border3'),
            ConvBNLayer(c2, 4, 3, 1, act=None, name='f_border4'),
        ]
        self.border_conv = nn.Sequential(*border_layers)

    def forward(self, x):
        """Return ``(sigmoid(score), border)`` computed from ``x``."""
        f_score = F.sigmoid(self.score_conv(x))
        f_border = self.border_conv(x)
        return f_score, f_border


class SAST_Header2(nn.Layer):
    """Head producing the vertex-offset map and the center-offset map.

    Args:
        in_channels: channel count of the input feature map.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, **kwargs):
        super().__init__()
        c0, c1, c2 = 64, 64, 128

        # TVO (text vertex offset): eight-channel regression branch.
        tvo_layers = [
            ConvBNLayer(in_channels, c0, 1, 1, act='relu', name='f_tvo1'),
            ConvBNLayer(c0, c1, 3, 1, act='relu', name='f_tvo2'),
            ConvBNLayer(c1, c2, 1, 1, act='relu', name='f_tvo3'),
            ConvBNLayer(c2, 8, 3, 1, act=None, name='f_tvo4'),
        ]
        self.tvo_conv = nn.Sequential(*tvo_layers)

        # TCO (text center offset): two-channel regression branch.
        tco_layers = [
            ConvBNLayer(in_channels, c0, 1, 1, act='relu', name='f_tco1'),
            ConvBNLayer(c0, c1, 3, 1, act='relu', name='f_tco2'),
            ConvBNLayer(c1, c2, 1, 1, act='relu', name='f_tco3'),
            ConvBNLayer(c2, 2, 3, 1, act=None, name='f_tco4'),
        ]
        self.tco_conv = nn.Sequential(*tco_layers)

    def forward(self, x):
        """Return ``(tvo, tco)`` computed from ``x``."""
        return self.tvo_conv(x), self.tco_conv(x)


class SASTHead(nn.Layer):
    """Run both SAST heads and collect their four outputs in a dict.

    Args:
        in_channels: channel count of the input feature map.
        **kwargs: accepted and ignored.
    """

    def __init__(self, in_channels, **kwargs):
        super().__init__()

        self.head1 = SAST_Header1(in_channels)
        self.head2 = SAST_Header2(in_channels)

    def forward(self, x, targets=None):
        """Return the predictions keyed by name; ``targets`` is not used."""
        f_score, f_border = self.head1(x)
        f_tvo, f_tco = self.head2(x)

        return {
            'f_score': f_score,
            'f_border': f_border,
            'f_tvo': f_tvo,
            'f_tco': f_tco,
        }

损失函数构建

class SASTLoss(nn.Layer):
    """Combined SAST loss: a dice score loss plus three smooth-L1 losses.

    Args:
        eps: forwarded to the ``DiceLoss`` sub-layer. That sub-layer is kept
            as an attribute but ``forward`` does not call it; the score loss
            is computed inline with a fixed 1e-5 term.
        **kwargs: accepted and ignored.
    """

    def __init__(self, eps=1e-6, **kwargs):
        super(SASTLoss, self).__init__()
        self.dice_loss = DiceLoss(eps=eps)

    @staticmethod
    def _masked_smooth_l1(pred, label, l_score, l_mask, channels):
        """Normalised, masked smooth-L1 loss for one regression output.

        Args:
            pred: prediction with ``channels`` entries on axis 1.
            label: target with ``channels + 1`` entries on axis 1; the last
                one is a per-position weight applied to the loss.
            l_score: score label, expanded to ``channels`` on axis 1.
            l_mask: training mask, expanded to ``channels`` on axis 1.
            channels: number of regression channels.

        Returns:
            Scalar tensor: weighted loss summed over positions selected by
            ``l_score * l_mask``, divided by the sum of that selector
            (plus 1e-5).
        """
        target, norm = paddle.split(
            label, num_or_sections=[channels, 1], axis=1)
        # Multiply the channel axis of the weight's shape by `channels` to
        # obtain the shape every single-channel tensor is expanded to.
        ex_shape = norm.shape * np.array([1, channels, 1, 1])
        norm_ex = paddle.expand(x=norm, shape=ex_shape)
        score_ex = paddle.expand(x=l_score, shape=ex_shape)
        mask_ex = paddle.expand(x=l_mask, shape=ex_shape)

        abs_diff = paddle.abs(target - pred)
        # 1.0 where |diff| < 1 (quadratic part), 0.0 elsewhere (linear part).
        sign = paddle.cast(abs_diff < 1.0, dtype='float32')
        sign.stop_gradient = True
        in_loss = 0.5 * abs_diff * abs_diff * sign + \
                    (abs_diff - 0.5) * (1.0 - sign)
        out_loss = norm_ex * in_loss
        return paddle.sum(out_loss * score_ex * mask_ex) / \
                    (paddle.sum(score_ex * mask_ex) + 1e-5)

    def forward(self, predicts, labels):
        """Compute the total loss and its four components.

        Args:
            predicts: dict with keys ``'f_score'``, ``'f_border'``,
                ``'f_tvo'`` and ``'f_tco'``.
            labels: sequence whose entries 1..5 are, in order, the score
                label, border label, mask, tvo label and tco label. Entry 0
                is ignored.

        Returns:
            dict with keys ``'loss'``, ``'score_loss'``, ``'border_loss'``,
            ``'tvo_loss'`` and ``'tco_loss'``.
        """
        f_score = predicts['f_score']
        f_border = predicts['f_border']
        f_tvo = predicts['f_tvo']
        f_tco = predicts['f_tco']

        l_score, l_border, l_mask, l_tvo, l_tco = labels[1:]

        # Score loss: dice loss restricted to the masked region.
        intersection = paddle.sum(f_score * l_score * l_mask)
        union = paddle.sum(f_score * l_mask) + paddle.sum(l_score * l_mask)
        score_loss = 1.0 - 2 * intersection / (union + 1e-5)

        # The three regression losses differ only in their channel count.
        border_loss = self._masked_smooth_l1(
            f_border, l_border, l_score, l_mask, 4)
        tvo_loss = self._masked_smooth_l1(f_tvo, l_tvo, l_score, l_mask, 8)
        tco_loss = self._masked_smooth_l1(f_tco, l_tco, l_score, l_mask, 2)

        # Weighted total.
        tvo_lw, tco_lw = 1.5, 1.5
        score_lw, border_lw = 1.0, 1.0
        total_loss = score_loss * score_lw + border_loss * border_lw + \
                    tvo_loss * tvo_lw + tco_loss * tco_lw

        losses = {'loss':total_loss, "score_loss":score_loss,\
            "border_loss":border_loss, 'tvo_loss':tvo_loss, 'tco_loss':tco_loss}
        return losses

paddle中实现的loss

class DiceLoss(nn.Layer):
    """Masked dice loss.

    Args:
        eps: constant added to the denominator.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, pred, gt, mask, weights=None):
        """Return ``1 - 2 * sum(pred * gt * m) / (sum(pred * m) + sum(gt * m) + eps)``.

        ``m`` is ``mask``, multiplied by ``weights`` when those are given.
        ``pred``, ``gt`` and ``mask`` (and ``weights``) must share one shape.
        """
        # Shape checks come first, in the same order as the arguments.
        assert pred.shape == gt.shape
        assert pred.shape == mask.shape
        if weights is None:
            effective_mask = mask
        else:
            assert weights.shape == mask.shape
            effective_mask = weights * mask

        # Overlap between prediction and ground truth inside the mask.
        overlap = paddle.sum(pred * gt * effective_mask)
        denominator = paddle.sum(pred * effective_mask) + paddle.sum(
            gt * effective_mask) + self.eps
        loss = 1 - 2.0 * overlap / denominator
        assert loss <= 1
        return loss

数据预处理


class SASTProcessTrain(object):
    def __init__(self,
                 image_shape=[512, 512],
                 min_crop_size=24,
                 min_crop_side_ratio=0.3,
                 min_text_size=10,
                 max_text_size=512,
                 **kwargs):
        self.input_size = image_shape[1]
        self.min_crop_size = min_crop_size
        self.min_crop_side_ratio = min_crop_side_ratio
        self.min_text_size = min_text_size
        self.max_text_size = max_text_size

    def quad_area(self, poly):
        """
        compute area of a polygon
        :param poly:
        :return:
        """
        edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]),
                (poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]),
                (poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]),
                (poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])]
        return np.sum(edge) / 2.

    def gen_quad_from_poly(self, poly):
        """
        Generate the minimum-area bounding quad of a polygon.

        The four box corners are rotated so that they line up as closely as
        possible with the polygon's first point, the two points around its
        middle index, and its last point.

        :param poly: array of polygon points, one (x, y) row per point.
        :return: float32 array of shape (4, 2) holding the ordered corners.
        """
        num_points = poly.shape[0]
        # rect is (center (x, y), (width, height), angle of rotation)
        rect = cv2.minAreaRect(poly.astype(np.int32))
        corners = np.array(cv2.boxPoints(rect))

        # Polygon points the four corners are matched against, in order.
        anchors = (poly[0], poly[num_points // 2 - 1], poly[num_points // 2],
                   poly[-1])

        best_start = 0
        best_cost = 1e4
        for start in range(4):
            cost = np.linalg.norm(corners[(start + 0) % 4] - anchors[0]) + \
                np.linalg.norm(corners[(start + 1) % 4] - anchors[1]) + \
                np.linalg.norm(corners[(start + 2) % 4] - anchors[2]) + \
                np.linalg.norm(corners[(start + 3) % 4] - anchors[3])
            if cost < best_cost:
                best_cost = cost
                best_start = start

        ordered_quad = np.zeros((4, 2), dtype=np.float32)
        for offset in range(4):
            ordered_quad[offset] = corners[(best_start + offset) % 4]
        return ordered_quad

    def check_and_validate_polys(self, polys, tags, xxx_todo_changeme):
        """
        check so that the text poly is in the same direction,
        and also filter some invalid polygons
        :param polys:
        :param tags:
        :return:
        """
        (h, w) = xxx_todo_changeme
        if polys.shape[0] == 0:
            return polys, np.array([]), np.array([])
        polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1)
        polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1)

        validated_polys = []
        validated_tags = []
        hv_tags = []
        for poly, tag in zip(polys, tags):
            quad = self.gen_quad_from_poly(poly)
            p_area = self.quad_area(quad)
            if abs(p_area) < 1:
                print('invalid poly')
                continue
            if p_area > 0:
                if tag == False:
                    print('poly in wrong direction')
                    tag = True  # reversed cases should be ignore
                poly = poly[(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
                             1), :]
                quad = quad[(0, 3, 2, 1), :]

            len_w = np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[3] -
                                                                       quad[2])
            len_h = np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] -
                                                                       quad[2])
            hv_tag = 1

            if len_w * 2.0 < len_h:
                hv_tag = 0

            validated_polys.append(poly)
            validated_tags.append(tag)
            hv_tags.append(hv_tag)
        return np.array(validated_polys), np.array(validated_tags), np.array(
            hv_tags)

    def crop_area(self,
                  im,
                  polys,
                  tags,
                  hv_tags,
                  crop_background=False,
                  max_tries=25):
        """
        make random crop from the input image
        :param im:
        :param polys:
        :param tags:
        :param crop_background:
        :param max_tries: 50 -> 25
        :return:
        """
        h, w, _ = im.shape
        pad_h = h // 10
        pad_w = w // 10
        h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
        w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
        for poly in polys:
            poly = np.round(poly, decimals=0).astype(np.int32)
            minx = np.min(poly[:, 0])
            maxx = np.max(poly[:, 0])
            w_array[minx + pad_w:maxx + pad_w] = 1
            miny = np.min(poly[:, 1])
            maxy = np.max(poly[:, 1])
            h_array[miny + pad_h:maxy + pad_h] = 1
        # ensure the cropped area not across a text
        h_axis = np.where(h_array == 0)[0]
        w_axis = np.where(w_array == 0)[0]
        if len(h_axis) == 0 or len(w_axis) == 0:
            return im, polys, tags, hv_tags
        for i in range(max_tries):
            xx = np.random.choice(w_axis, size=2)
            xmin = np.min(xx) - pad_w
            xmax = np.max(xx) - pad_w
            xmin = np.clip(xmin, 0, w - 1)
            xmax = np.clip(xmax, 0, w - 1)
            yy = np.random.choice(h_axis, size=2)
            ymin = np.min(yy) - pad_h
            ymax = np.max(yy) - pad_h
            ymin = np.clip(ymin, 0, h - 1)
            ymax = np.clip(ymax, 0, h - 1)
            # if xmax - xmin < ARGS.min_crop_side_ratio * w or \
            #   ymax - ymin < ARGS.min_crop_side_ratio * h:
            if xmax - xmin < self.min_crop_size or \
            ymax - ymin < self.min_crop_size:
                # area too small
                continue
            if polys.shape[0] != 0:
                poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \
                                    & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax)
                selected_polys = np.where(
                    np.sum(poly_axis_in_area, axis=1) == 4)[0]
            else:
                selected_polys = []
            if len(selected_polys) == 0:
                # no text in this area
                if crop_background:
                    return im[ymin : ymax + 1, xmin : xmax + 1, :], \
                        polys[selected_polys], tags[selected_polys], hv_tags[selected_polys]
                else:
                    continue
            im = im[ymin:ymax + 1, xmin:xmax + 1, :]
            polys = polys[selected_polys]
            tags = tags[selected_polys]
            hv_tags = hv_tags[selected_polys]
            polys[:, :, 0] -= xmin
            polys[:, :, 1] -= ymin
            return im, polys, tags, hv_tags

        return im, polys, tags, hv_tags

    def generate_direction_map(self, poly_quads, direction_map):
        """
        """
        width_list = []
        height_list = []
        for quad in poly_quads:
            quad_w = (np.linalg.norm(quad[0] - quad[1]) +
                      np.linalg.norm(quad[2] - quad[3])) / 2.0
            quad_h = (np.linalg.norm(quad[0] - quad[3]) +
                      np.linalg.norm(quad[2] - quad[1])) / 2.0
            width_list.append(quad_w)
            height_list.append(quad_h)
        norm_width = max(sum(width_list) / (len(width_list) + 1e-6), 1.0)
        average_height = max(sum(height_list) / (len(height_list) + 1e-6), 1.0)

        for quad in poly_quads:
            direct_vector_full = (
                (quad[1] + quad[2]) - (quad[0] + quad[3])) / 2.0
            direct_vector = direct_vector_full / (
                np.linalg.norm(direct_vector_full) + 1e-6) * norm_width
            direction_label = tuple(
                map(float, [
                    direct_vector[0], direct_vector[1], 1.0 / (average_height +
                                                               1e-6)
                ]))
            cv2.fillPoly(direction_map,
                         quad.round().astype(np.int32)[np.newaxis, :, :],
                         direction_label)
        return direction_map

    def calculate_average_height(self, poly_quads):
        """
        """
        height_list = []
        for quad in poly_quads:
            quad_h = (np.linalg.norm(quad[0] - quad[3]) +
                      np.linalg.norm(quad[2] - quad[1])) / 2.0
            height_list.append(quad_h)
        average_height = max(sum(height_list) / len(height_list), 1.0)
        return average_height

    def generate_tcl_label(self,
                           hw,
                           polys,
                           tags,
                           ds_ratio,
                           tcl_ratio=0.3,
                           shrink_ratio_of_width=0.15):
        """
        Generate polygon.
        """
        h, w = hw
        h, w = int(h * ds_ratio), int(w * ds_ratio)
        polys = polys * ds_ratio

        score_map = np.zeros(
            (
                h,
                w, ), dtype=np.float32)
        tbo_map = np.zeros((h, w, 5), dtype=np.float32)
        training_mask = np.ones(
            (
                h,
                w, ), dtype=np.float32)
        direction_map = np.ones((h, w, 3)) * np.array([0, 0, 1]).reshape(
            [1, 1, 3]).astype(np.float32)

        for poly_idx, poly_tag in enumerate(zip(polys, tags)):
            poly = poly_tag[0]
            tag = poly_tag[1]

            # generate min_area_quad
            min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly)
            min_area_quad_h = 0.5 * (
                np.linalg.norm(min_area_quad[0] - min_area_quad[3]) +
                np.linalg.norm(min_area_quad[1] - min_area_quad[2]))
            min_area_quad_w = 0.5 * (
                np.linalg.norm(min_area_quad[0] - min_area_quad[1]) +
                np.linalg.norm(min_area_quad[2] - min_area_quad[3]))

            if min(min_area_quad_h, min_area_quad_w) < self.min_text_size * ds_ratio \
                or min(min_area_quad_h, min_area_quad_w) > self.max_text_size * ds_ratio:
                continue

            if tag:
                # continue
                cv2.fillPoly(training_mask,
                             poly.astype(np.int32)[np.newaxis, :, :], 0.15)
            else:
                tcl_poly = self.poly2tcl(poly, tcl_ratio)
                tcl_quads = self.poly2quads(tcl_poly)
                poly_quads = self.poly2quads(poly)
                # stcl map
                stcl_quads, quad_index = self.shrink_poly_along_width(
                    tcl_quads,
                    shrink_ratio_of_width=shrink_ratio_of_width,
                    expand_height_ratio=1.0 / tcl_ratio)
                # generate tcl map
                cv2.fillPoly(score_map,
                             np.round(stcl_quads).astype(np.int32), 1.0)

                # generate tbo map
                for idx, quad in enumerate(stcl_quads):
                    quad_mask = np.zeros((h, w), dtype=np.float32)
                    quad_mask = cv2.fillPoly(
                        quad_mask,
                        np.round(quad[np.newaxis, :, :]).astype(np.int32), 1.0)
                    tbo_map = self.gen_quad_tbo(poly_quads[quad_index[idx]],
                                                quad_mask, tbo_map)
        return score_map, tbo_map, training_mask
    ## Generate the TVO map and the TCO map.
    def generate_tvo_and_tco(self,
                             hw,
                             polys,
                             tags,
                             tcl_ratio=0.3,
                             ds_ratio=0.25):
        """
        Generate tcl map, tvo map and tbo map.
        """
        h, w = hw
        h, w = int(h * ds_ratio), int(w * ds_ratio)
        polys = polys * ds_ratio
        ## 获取 poly_mask
        poly_mask = np.zeros((h, w), dtype=np.float32)
		## 对tvo map进行坐标分配
        tvo_map = np.ones((9, h, w), dtype=np.float32)
        ## 按间隔分配
        tvo_map[0:-1:2] = np.tile(np.arange(0, w), (h, 1))
        tvo_map[1:-1:2] = np.tile(np.arange(0, w), (h, 1)).T
        poly_tv_xy_map = np.zeros((8, h, w), dtype=np.float32)

        # tco map
        tco_map = np.ones((3, h, w), dtype=np.float32)
        tco_map[0] = np.tile(np.arange(0, w), (h, 1))
        tco_map[1] = np.tile(np.arange(0, w), (h, 1)).T
        ## TCO map 初始化
        poly_tc_xy_map = np.zeros((2, h, w), dtype=np.float32)

        poly_short_edge_map = np.ones((h, w), dtype=np.float32)
		## 遍历polys
        for poly, poly_tag in zip(polys, tags):

            if poly_tag == True:
                continue

            # adjust point order for vertical poly
            ## 
            poly = self.adjust_point(poly)

            # generate min_area_quad
            ## 获取poly最小外接框的坐标与中心坐标
            min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly)
            ## 计算最小外接框的w,h
            min_area_quad_h = 0.5 * (
                np.linalg.norm(min_area_quad[0] - min_area_quad[3]) +
                np.linalg.norm(min_area_quad[1] - min_area_quad[2]))
            min_area_quad_w = 0.5 * (
                np.linalg.norm(min_area_quad[0] - min_area_quad[1]) +
                np.linalg.norm(min_area_quad[2] - min_area_quad[3]))

            # generate tcl map and text, 128 * 128
            tcl_poly = self.poly2tcl(poly, tcl_ratio)

            # generate poly_tv_xy_map
            for idx in range(4): ## 将外接框min_area_quad(shape:[4,2])坐标分配到相对应的channel中
                cv2.fillPoly(
                    poly_tv_xy_map[2 * idx],
                    np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32),
                    float(min(max(min_area_quad[idx, 0], 0), w)))
                cv2.fillPoly(
                    poly_tv_xy_map[2 * idx + 1],
                    np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32),
                    float(min(max(min_area_quad[idx, 1], 0), h)))

            # generate poly_tc_xy_map
            for idx in range(2):
                cv2.fillPoly(
                    poly_tc_xy_map[idx],
                    np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32),
                    float(center_point[idx]))

            # generate poly_short_edge_map
            cv2.fillPoly(
                poly_short_edge_map,
                np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32),
                float(max(min(min_area_quad_h, min_area_quad_w), 1.0)))

            # generate poly_mask and training_mask
            cv2.fillPoly(poly_mask,
                         np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32),
                         1)

        tvo_map *= poly_mask
        tvo_map[:8] -= poly_tv_xy_map
        tvo_map[-1] /= poly_short_edge_map
        tvo_map = tvo_map.transpose((1, 2, 0))

        tco_map *= poly_mask
        tco_map[:2] -= poly_tc_xy_map
        tco_map[-1] /= poly_short_edge_map
        tco_map = tco_map.transpose((1, 2, 0))

        return tvo_map, tco_map

    def adjust_point(self, poly):
        """
        adjust point order.
        """
        point_num = poly.shape[0]
        if point_num == 4:
            len_1 = np.linalg.norm(poly[0] - poly[1])
            len_2 = np.linalg.norm(poly[1] - poly[2])
            len_3 = np.linalg.norm(poly[2] - poly[3])
            len_4 = np.linalg.norm(poly[3] - poly[0])

            if (len_1 + len_3) * 1.5 < (len_2 + len_4):
                poly = poly[[1, 2, 3, 0], :]

        elif point_num > 4:
            vector_1 = poly[0] - poly[1]
            vector_2 = poly[1] - poly[2]
            cos_theta = np.dot(vector_1, vector_2) / (
                np.linalg.norm(vector_1) * np.linalg.norm(vector_2) + 1e-6)
            theta = np.arccos(np.round(cos_theta, decimals=4))

            if abs(theta) > (70 / 180 * math.pi):
                index = list(range(1, point_num)) + [0]
                poly = poly[np.array(index), :]
        return poly

    def gen_min_area_quad_from_poly(self, poly):
        """
        Generate min area quad from poly.
        """
        point_num = poly.shape[0]
        min_area_quad = np.zeros((4, 2), dtype=np.float32)
        if point_num == 4:
            min_area_quad = poly
            center_point = np.sum(poly, axis=0) / 4
        else:
            rect = cv2.minAreaRect(poly.astype(
                np.int32))  # (center (x,y), (width, height), angle of rotation)
            center_point = rect[0]
            box = np.array(cv2.boxPoints(rect))

            first_point_idx = 0
            min_dist = 1e4
            for i in range(4):
                dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \
                    np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \
                    np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \
                    np.linalg.norm(box[(i + 3) % 4] - poly[-1])
                if dist < min_dist:
                    min_dist = dist
                    first_point_idx = i

            for i in range(4):
                min_area_quad[i] = box[(first_point_idx + i) % 4]

        return min_area_quad, center_point

    def shrink_quad_along_width(self,
                                quad,
                                begin_width_ratio=0.,
                                end_width_ratio=1.):
        """
        Generate shrink_quad_along_width.
        """
        ratio_pair = np.array(
            [[begin_width_ratio], [end_width_ratio]], dtype=np.float32)
        p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair
        p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair
        return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]])

    def shrink_poly_along_width(self,
                                quads,
                                shrink_ratio_of_width,
                                expand_height_ratio=1.0):
        """
        shrink poly with given length.
        """
        upper_edge_list = []

        def get_cut_info(edge_len_list, cut_len):
            for idx, edge_len in enumerate(edge_len_list):
                cut_len -= edge_len
                if cut_len <= 0.000001:
                    ratio = (cut_len + edge_len_list[idx]) / edge_len_list[idx]
                    return idx, ratio

        for quad in quads:
            upper_edge_len = np.linalg.norm(quad[0] - quad[1])
            upper_edge_list.append(upper_edge_len)

        # length of left edge and right edge.
        left_length = np.linalg.norm(quads[0][0] - quads[0][
            3]) * expand_height_ratio
        right_length = np.linalg.norm(quads[-1][1] - quads[-1][
            2]) * expand_height_ratio

        shrink_length = min(left_length, right_length,
                            sum(upper_edge_list)) * shrink_ratio_of_width
        # shrinking length
        upper_len_left = shrink_length
        upper_len_right = sum(upper_edge_list) - shrink_length

        left_idx, left_ratio = get_cut_info(upper_edge_list, upper_len_left)
        left_quad = self.shrink_quad_along_width(
            quads[left_idx], begin_width_ratio=left_ratio, end_width_ratio=1)
        right_idx, right_ratio = get_cut_info(upper_edge_list, upper_len_right)
        right_quad = self.shrink_quad_along_width(
            quads[right_idx], begin_width_ratio=0, end_width_ratio=right_ratio)

        out_quad_list = []
        if left_idx == right_idx:
            out_quad_list.append(
                [left_quad[0], right_quad[1], right_quad[2], left_quad[3]])
        else:
            out_quad_list.append(left_quad)
            for idx in range(left_idx + 1, right_idx):
                out_quad_list.append(quads[idx])
            out_quad_list.append(right_quad)

        return np.array(out_quad_list), list(range(left_idx, right_idx + 1))

    def vector_angle(self, A, B):
        """
        Calculate the angle between vector AB and x-axis positive direction.
        """
        AB = np.array([B[1] - A[1], B[0] - A[0]])
        return np.arctan2(*AB)

    def theta_line_cross_point(self, theta, point):
        """
        Calculate the line through given point and angle in ax + by + c =0 form.
        """
        x, y = point
        cos = np.cos(theta)
        sin = np.sin(theta)
        return [sin, -cos, cos * y - sin * x]

    def line_cross_two_point(self, A, B):
        """
        Calculate the line through given point A and B in ax + by + c =0 form.
        """
        angle = self.vector_angle(A, B)
        return self.theta_line_cross_point(angle, A)

    def average_angle(self, poly):
        """
        Calculate the average angle between left and right edge in given poly.
        """
        p0, p1, p2, p3 = poly
        angle30 = self.vector_angle(p3, p0)
        angle21 = self.vector_angle(p2, p1)
        return (angle30 + angle21) / 2

    def line_cross_point(self, line1, line2):
        """
        line1 and line2 in  0=ax+by+c form, compute the cross point of line1 and line2
        """
        a1, b1, c1 = line1
        a2, b2, c2 = line2
        d = a1 * b2 - a2 * b1

        if d == 0:
            #print("line1", line1)
            #print("line2", line2)
            print('Cross point does not exist')
            return np.array([0, 0], dtype=np.float32)
        else:
            x = (b1 * c2 - b2 * c1) / d
            y = (a2 * c1 - a1 * c2) / d

        return np.array([x, y], dtype=np.float32)

    def quad2tcl(self, poly, ratio):
        """
        Generate center line by poly clock-wise point. (4, 2)
        """
        ratio_pair = np.array(
            [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32)
        p0_3 = poly[0] + (poly[3] - poly[0]) * ratio_pair
        p1_2 = poly[1] + (poly[2] - poly[1]) * ratio_pair
        return np.array([p0_3[0], p1_2[0], p1_2[1], p0_3[1]])

    def poly2tcl(self, poly, ratio):
        """
        Generate center line by poly clock-wise point.
        """
        ratio_pair = np.array(
            [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32)
        tcl_poly = np.zeros_like(poly)
        point_num = poly.shape[0]

        for idx in range(point_num // 2):
            point_pair = poly[idx] + (poly[point_num - 1 - idx] - poly[idx]
                                      ) * ratio_pair
            tcl_poly[idx] = point_pair[0]
            tcl_poly[point_num - 1 - idx] = point_pair[1]
        return tcl_poly

    def gen_quad_tbo(self, quad, tcl_mask, tbo_map):
        """
        Generate tbo_map for give quad.
        """
        # upper and lower line function: ax + by + c = 0;
        up_line = self.line_cross_two_point(quad[0], quad[1])
        lower_line = self.line_cross_two_point(quad[3], quad[2])

        quad_h = 0.5 * (np.linalg.norm(quad[0] - quad[3]) +
                        np.linalg.norm(quad[1] - quad[2]))
        quad_w = 0.5 * (np.linalg.norm(quad[0] - quad[1]) +
                        np.linalg.norm(quad[2] - quad[3]))

        # average angle of left and right line.
        angle = self.average_angle(quad)

        xy_in_poly = np.argwhere(tcl_mask == 1)
        for y, x in xy_in_poly:
            point = (x, y)
            line = self.theta_line_cross_point(angle, point)
            cross_point_upper = self.line_cross_point(up_line, line)
            cross_point_lower = self.line_cross_point(lower_line, line)
            ##FIX, offset reverse
            upper_offset_x, upper_offset_y = cross_point_upper - point
            lower_offset_x, lower_offset_y = cross_point_lower - point
            tbo_map[y, x, 0] = upper_offset_y
            tbo_map[y, x, 1] = upper_offset_x
            tbo_map[y, x, 2] = lower_offset_y
            tbo_map[y, x, 3] = lower_offset_x
            tbo_map[y, x, 4] = 1.0 / max(min(quad_h, quad_w), 1.0) * 2
        return tbo_map

    def poly2quads(self, poly):
        """
        Split poly into quads.
        """
        quad_list = []
        point_num = poly.shape[0]

        # point pair
        point_pair_list = []
        for idx in range(point_num // 2):
            point_pair = [poly[idx], poly[point_num - 1 - idx]]
            point_pair_list.append(point_pair)

        quad_num = point_num // 2 - 1
        for idx in range(quad_num):
            # reshape and adjust to clock-wise
            quad_list.append((np.array(point_pair_list)[[idx, idx + 1]]
                              ).reshape(4, 2)[[0, 2, 3, 1]])

        return np.array(quad_list)

    def __call__(self, data):
        """
        Augment one training sample and attach the SAST label maps.

        Reads ``data['image']``, ``data['polys']`` and
        ``data['ignore_tags']``. The image goes through a random aspect
        change, a size cap, a random crop, a random rescale, optional
        blur / brightness changes and is finally pasted at a random offset
        onto a square ``input_size`` canvas and normalised.

        :param data: sample dict with the three keys listed above.
        :return: the same dict with ``image``, ``score_map``,
            ``border_map``, ``training_mask``, ``tvo_map`` and ``tco_map``
            set (all channel-first), or None when the sample is rejected
            (no image, no usable polygon, image too small, or only ignored
            text left).
        """
        im = data['image']
        text_polys = data['polys']
        text_tags = data['ignore_tags']
        if im is None:
            return None
        if text_polys.shape[0] == 0:
            return None

        h, w, _ = im.shape
        # Fix winding order and drop degenerate polygons (clips polys in place).
        text_polys, text_tags, hv_tags = self.check_and_validate_polys(
            text_polys, text_tags, (h, w))

        if text_polys.shape[0] == 0:
            return None

        # Pick a random aspect ratio change; the sqrt keeps the area fixed
        # because width is scaled by s and height by 1 / s.
        asp_scales = np.arange(1.0, 1.55, 0.1)
        asp_scale = np.random.choice(asp_scales)

        if np.random.rand() < 0.5:
            asp_scale = 1.0 / asp_scale
        asp_scale = math.sqrt(asp_scale)

        asp_wx = asp_scale
        asp_hy = 1.0 / asp_scale
        im = cv2.resize(im, dsize=None, fx=asp_wx, fy=asp_hy)
        text_polys[:, :, 0] *= asp_wx
        text_polys[:, :, 1] *= asp_hy

        # Cap the longer side at 2048 pixels.
        h, w, _ = im.shape
        if max(h, w) > 2048:
            rd_scale = 2048.0 / max(h, w)
            im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
            text_polys *= rd_scale
        h, w, _ = im.shape
        # Reject images whose shorter side is below 16 pixels.
        if min(h, w) < 16:
            return None

        # Random crop that must contain text (no background-only crops).
        im, text_polys, text_tags, hv_tags = self.crop_area(im, \
            text_polys, text_tags, hv_tags, crop_background=False)

        if text_polys.shape[0] == 0:
            return None
        # Reject the sample when every remaining polygon is ignored.
        if np.sum((text_tags * 1.0)) >= text_tags.size:
            return None
        new_h, new_w, _ = im.shape
        if (new_h is None) or (new_w is None):
            return None
        # Resize so that the longer side is at most input_size, then shrink
        # further by a random factor (1.0 is listed five times, so it is
        # the most likely choice).
        std_ratio = float(self.input_size) / max(new_w, new_h)
        rand_scales = np.array(
            [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0])
        rz_scale = std_ratio * np.random.choice(rand_scales)
        im = cv2.resize(im, dsize=None, fx=rz_scale, fy=rz_scale)
        text_polys[:, :, 0] *= rz_scale
        text_polys[:, :, 1] *= rz_scale

        # Gaussian blur with probability 0.05; kernel size is odd (1, 3 or 5).
        if np.random.rand() < 0.1 * 0.5:
            ks = np.random.permutation(5)[0] + 1
            ks = int(ks / 2) * 2 + 1
            im = cv2.GaussianBlur(im, ksize=(ks, ks), sigmaX=0, sigmaY=0)
        # Brighten by a factor in [1.0, 1.5) with probability 0.05.
        if np.random.rand() < 0.1 * 0.5:
            im = im * (1.0 + np.random.rand() * 0.5)
            im = np.clip(im, 0.0, 255.0)
        # Darken by a factor in (0.5, 1.0] with probability 0.05.
        if np.random.rand() < 0.1 * 0.5:
            im = im * (1.0 - np.random.rand() * 0.5)
            im = np.clip(im, 0.0, 255.0)

        # Padding the im to [input_size, input_size]
        new_h, new_w, _ = im.shape
        # Reject samples that would fill less than half of the canvas side.
        if min(new_w, new_h) < self.input_size * 0.5:
            return None

        # The canvas is pre-filled with the per-channel mean, so padded
        # pixels become 0 after the mean subtraction below.
        # NOTE(review): mean order suggests channel 2 is R, i.e. a BGR
        # image as loaded by cv2 - confirm against the data loader.
        im_padded = np.ones(
            (self.input_size, self.input_size, 3), dtype=np.float32)
        im_padded[:, :, 2] = 0.485 * 255
        im_padded[:, :, 1] = 0.456 * 255
        im_padded[:, :, 0] = 0.406 * 255

        # Random the start position
        del_h = self.input_size - new_h
        del_w = self.input_size - new_w
        sh, sw = 0, 0
        if del_h > 1:
            sh = int(np.random.rand() * del_h)
        if del_w > 1:
            sw = int(np.random.rand() * del_w)

        # Paste the image and shift the polygons by the same offset.
        im_padded[sh:sh + new_h, sw:sw + new_w, :] = im.copy()
        text_polys[:, :, 0] += sw
        text_polys[:, :, 1] += sh

        # Center-line score map, border-offset map and training mask at
        # 1/4 resolution.
        score_map, border_map, training_mask = self.generate_tcl_label(
            (self.input_size, self.input_size), text_polys, text_tags, 0.25)

        # SAST head
        tvo_map, tco_map = self.generate_tvo_and_tco(
            (self.input_size, self.input_size),
            text_polys,
            text_tags,
            tcl_ratio=0.3,
            ds_ratio=0.25)
        # print("test--------tvo_map shape:", tvo_map.shape)

        # Normalise: subtract the per-channel mean, divide by the std,
        # then move channels first.
        im_padded[:, :, 2] -= 0.485 * 255
        im_padded[:, :, 1] -= 0.456 * 255
        im_padded[:, :, 0] -= 0.406 * 255
        im_padded[:, :, 2] /= (255.0 * 0.229)
        im_padded[:, :, 1] /= (255.0 * 0.224)
        im_padded[:, :, 0] /= (255.0 * 0.225)
        im_padded = im_padded.transpose((2, 0, 1))

        # [::-1] reverses the channel axis (axis 0 after the transpose).
        data['image'] = im_padded[::-1, :, :]
        data['score_map'] = score_map[np.newaxis, :, :]
        data['border_map'] = border_map.transpose((2, 0, 1))
        data['training_mask'] = training_mask[np.newaxis, :, :]
        data['tvo_map'] = tvo_map.transpose((2, 0, 1))
        data['tco_map'] = tco_map.transpose((2, 0, 1))
        return data

注解

generate_tvo_and_tco

（此处原为 generate_tvo_and_tco 的示意图，图片未随正文导出。）

华灯初上~(unique)

关注

2
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
模型结构构建

基于paddlepaddle的模型构建卷积层构建残差层构建残差网络构建FPN实现卷积层构建class ConvBNLayer(nn.Layer): def __init__( self, in_channels, ## 输入channel out_channels, ## 输出channel kernel_size, ## 卷积核大小 stride=1, ## 滑动步长
复制链接

扫一扫