ctpn注释_ctpn.pth-CSDN博客

本文链接：https://blog.csdn.net/yangzheng_520/article/details/120438267
自己写的注释，可能不对，将就看

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import numpy as np
from PIL import Image
from PIL import Image
"""
GPU加载
"""
# Minimum foreground probability an anchor needs to be kept as a proposal.
prob_thresh = 0.5
# Run on the first CUDA device when one is available, otherwise on the CPU.
gpu = torch.cuda.is_available()
device = torch.device('cuda:0') if gpu else torch.device('cpu')
print("能够使用GPU"+str(gpu))


"""
模型加载
"""
class basic_conv(nn.Module):
    """Conv2d layer optionally followed by BatchNorm2d and ReLU.

    Args:
        in_planes: number of input channels.
        out_planes: number of channels produced by the convolution.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: padding added around the input.
        dilation: spacing between kernel elements.
        groups: number of blocked connections from input to output channels.
        relu: append an in-place ReLU when true.
        bn: append a BatchNorm2d (eps=1e-5, momentum=0.01, affine) when true.
        bias: give the convolution a learnable bias when true.
    """

    def __init__(self, in_planes, out_planes, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, relu=True, bn=True,
                 bias=True):
        super().__init__()
        self.out_channels = out_planes
        self.conv = nn.Conv2d(
            in_planes,
            out_planes,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        # Disabled stages are stored as None so forward() can skip them.
        if bn:
            self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True)
        else:
            self.bn = None
        if relu:
            self.relu = nn.ReLU(inplace=True)
        else:
            self.relu = None

    def forward(self, x):
        """Apply the convolution, then whichever of BN / ReLU are enabled."""
        out = self.conv(x)
        for stage in (self.bn, self.relu):
            if stage is not None:
                out = stage(out)
        return out

class CTPN_Model(nn.Module):
    """CTPN network: VGG16 features -> 3x3 conv -> bidirectional GRU -> heads.

    forward() returns two tensors, each reshaped to
    [batch, height * width * 10, 2]: the classification output and the
    regression output, with 10 entries per feature-map cell.
    """

    def __init__(self):
        super().__init__()
        backbone = models.vgg16(pretrained=False)
        # Keep every VGG16 feature layer except the last one
        # (original note: block5_conv3 output).
        self.base_layers = nn.Sequential(*list(backbone.features)[:-1])
        self.rpn = basic_conv(512, 512, 3, 1, 1, bn=False)
        # Bidirectional GRU with hidden size 128, i.e. 256 output features.
        self.brnn = nn.GRU(512, 128, bidirectional=True, batch_first=True)
        self.lstm_fc = basic_conv(256, 512, 1, 1, relu=True, bn=False)
        # Both heads emit 10 * 2 channels per feature-map cell.
        self.rpn_class = basic_conv(512, 10 * 2, 1, 1, relu=False, bn=False)
        self.rpn_regress = basic_conv(512, 10 * 2, 1, 1, relu=False, bn=False)

    def forward(self, x):
        """Run the network on a [b, c, h, w] batch and return (cls, regr)."""
        feat = self.rpn(self.base_layers(x))  # [b, c, h, w]
        batch, _, rows, cols = feat.size()

        # Channels last, then treat every feature-map row as one sequence
        # for the GRU: [b * h, w, c].
        channels_last = feat.permute(0, 2, 3, 1).contiguous()
        sequences = channels_last.view(
            batch * rows, cols, channels_last.size(3)
        )
        rnn_out, _ = self.brnn(sequences)

        # Back to a channels-first map: [b, 256, h, w].
        rnn_map = rnn_out.view(batch, rows, cols, 256)
        rnn_map = rnn_map.permute(0, 3, 1, 2).contiguous()
        fused = self.lstm_fc(rnn_map)

        cls = self.rpn_class(fused).permute(0, 2, 3, 1).contiguous()
        regr = self.rpn_regress(fused).permute(0, 2, 3, 1).contiguous()

        # Flatten to one row of 2 values per (cell, anchor) pair.
        cls = cls.view(cls.size(0), cls.size(1) * cls.size(2) * 10, 2)
        regr = regr.view(regr.size(0), regr.size(1) * regr.size(2) * 10, 2)
        return cls, regr

# Path of the trained CTPN checkpoint (hard-coded; adjust for your machine).
weights = 'E:\\ctpn-zc\\train_ctpn\\checkpoints2\\CTPN.pth'  # CTPN checkpoint path
model = CTPN_Model() # build the network
# Read the checkpoint onto `device` and restore the weights stored under its
# 'model_state_dict' key.
model.load_state_dict(torch.load(weights, map_location=device)['model_state_dict']) # load the trained weights
model.to(device)# move the model (not the image) to the selected device
model.eval()# put the model in evaluation mode for inference


"""
配置信息
"""
# Per-channel mean subtracted from the input image in get_det_boxes before it
# is fed to the model (presumably the ImageNet RGB means -- TODO confirm).
IMAGE_MEAN = [123.68, 116.779, 103.939]
def gen_anchor(featuresize, scale):
    """
        Generate the anchors for every cell of a feature map.

        featuresize: (h, w) of the feature map.
        scale: stride in pixels between neighbouring cells.

        Each cell gets 10 anchors, all 16 wide, with heights from 11 to 283,
        centred on the cell. Returns an array of shape [h * w * 10, 4] holding
        (x1, y1, x2, y2); cells are ordered row by row.
    """
    anchor_heights = np.array([11, 16, 23, 33, 48, 68, 97, 139, 198, 283]).reshape(-1, 1)
    anchor_widths = np.array([16] * 10).reshape(-1, 1)

    # Centre of the top-left 16x16 cell.
    cell = np.array([0, 0, 15, 15])
    cx = (cell[0] + cell[2]) * 0.5
    cy = (cell[1] + cell[3]) * 0.5

    # The 10 anchors of the top-left cell, one per row: x1, y1, x2, y2.
    half_w = anchor_widths * 0.5
    half_h = anchor_heights * 0.5
    template = np.hstack((cx - half_w, cy - half_h, cx + half_w, cy + half_h))

    rows, cols = featuresize
    x_offsets = np.arange(0, cols) * scale
    y_offsets = np.arange(0, rows) * scale

    # Slide the template over the grid, row by row.
    shifted = [
        template + [dx, dy, dx, dy]
        for dy in y_offsets
        for dx in x_offsets
    ]
    return np.array(shifted).reshape((-1, 4))
def bbox_transfor_inv(anchor, regr):
    """
        Apply the regression output to the anchors and return predicted boxes.

        anchor: [N, 4] array of (x1, y1, x2, y2).
        regr: array indexed as regr[0, :, 0] (vertical centre offset, in units
              of the anchor height) and regr[0, :, 1] (log height scale).

        Only the vertical centre and the height are regressed; every returned
        box keeps its anchor's horizontal centre and a fixed width of 16.
        Returns an [N, 4] array of (x1, y1, x2, y2).
    """
    top = anchor[:, 1]
    bottom = anchor[:, 3]
    anchor_cy = (top + bottom) * 0.5
    anchor_h = bottom - top + 1

    dy = regr[0, :, 0]
    dh = regr[0, :, 1]

    pred_cy = dy * anchor_h + anchor_cy
    pred_h = np.exp(dh) * anchor_h
    anchor_cx = (anchor[:, 0] + anchor[:, 2]) * 0.5

    left = anchor_cx - 16 * 0.5
    upper = pred_cy - pred_h * 0.5
    right = anchor_cx + 16 * 0.5
    lower = pred_cy + pred_h * 0.5
    return np.vstack((left, upper, right, lower)).T
def clip_box(bbox, im_shape):
    """Clamp boxes to the image in place and return the same array.

    bbox: [N, 4] array of (x1, y1, x2, y2).
    im_shape: (height, width); x is limited to [0, width - 1] and y to
              [0, height - 1].
    """
    x_limit = im_shape[1] - 1
    y_limit = im_shape[0] - 1
    # Columns 0 and 2 are x coordinates, columns 1 and 3 are y coordinates.
    for column, limit in enumerate((x_limit, y_limit, x_limit, y_limit)):
        bbox[:, column] = np.maximum(np.minimum(bbox[:, column], limit), 0)
    return bbox
def filter_bbox(bbox, minsize):
    """Return the row indices of boxes at least `minsize` wide and high.

    bbox: [N, 4] array of (x1, y1, x2, y2); sizes are measured inclusively
          (x2 - x1 + 1 and y2 - y1 + 1).
    """
    widths = bbox[:, 2] - bbox[:, 0] + 1
    heights = bbox[:, 3] - bbox[:, 1] + 1
    wide_enough = widths >= minsize
    tall_enough = heights >= minsize
    return np.where(wide_enough & tall_enough)[0]
def nms(dets, thresh):
    """Greedy non-maximum suppression.

    dets: [N, 5] array of (x1, y1, x2, y2, score).
    thresh: a box is dropped when its IoU with an already kept,
            higher-scoring box is greater than this value.

    Returns the list of kept row indices, highest score first. An empty
    `dets` yields an empty list.
    """
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    # Box areas with inclusive pixel coordinates.
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    # Candidate indices sorted by descending score.
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        rest = order[1:]

        # Corners of the intersection between box i and each remaining box.
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])

        # Non-overlapping boxes give a negative extent; clamp it to zero.
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[rest] - inter)

        # Keep only the candidates that do not overlap box i too much.
        order = rest[np.where(ovr <= thresh)[0]]
    return keep
class Graph:
    """Directed graph over text proposals, stored as a boolean adjacency matrix.

    graph[i, j] being true means proposal j follows proposal i.
    """

    def __init__(self, graph):
        self.graph = graph

    def sub_graphs_connected(self):
        """Return the chains of connected proposals.

        A chain starts at a node with no incoming edge and at least one
        outgoing edge, and repeatedly follows the first outgoing edge until a
        node without outgoing edges is reached. The result is a list of
        lists of node indices, one inner list per text line.
        """
        adjacency = self.graph
        chains = []
        for start in range(adjacency.shape[0]):
            has_incoming = adjacency[:, start].any()
            has_outgoing = adjacency[start, :].any()
            if has_incoming or not has_outgoing:
                continue
            chain = [start]
            node = start
            while adjacency[node, :].any():
                # [0][0] takes the first successor index out of the tuple
                # returned by np.where.
                node = np.where(adjacency[node, :])[0][0]
                chain.append(node)
            chains.append(chain)
        return chains
class TextLineCfg:
    """Constants used when merging text proposals into text lines."""

    # The first five and the two TEXT_PROPOSALS_MIN/NMS values below are not
    # referenced anywhere else in this file.
    SCALE = 600
    MAX_SCALE = 1200
    TEXT_PROPOSALS_WIDTH = 16
    MIN_NUM_PROPOSALS = 2
    MIN_RATIO = 0.5
    LINE_MIN_SCORE = 0.9
    # Maximum horizontal distance in pixels searched for a neighbouring
    # proposal (used by TextProposalGraphBuilder).
    MAX_HORIZONTAL_GAP = 60
    TEXT_PROPOSALS_MIN_SCORE = 0.7
    TEXT_PROPOSALS_NMS_THRESH = 0.3
    # Neighbouring proposals must have a vertical overlap ratio of at least
    # this value ...
    MIN_V_OVERLAPS = 0.6
    # ... and a height ratio (shorter / taller) of at least this value.
    MIN_SIZE_SIM = 0.6

    class Graph:
        """Directed graph over text proposals as a boolean adjacency matrix."""

        def __init__(self, graph):
            self.graph = graph

        def sub_graphs_connected(self):
            """Return the chains of connected proposals as lists of indices.

            A chain starts at a node with no incoming edge and at least one
            outgoing edge, then follows the first outgoing edge of each node
            until a node without outgoing edges is reached.
            """
            adjacency = self.graph
            chains = []
            for start in range(adjacency.shape[0]):
                has_incoming = adjacency[:, start].any()
                has_outgoing = adjacency[start, :].any()
                if has_incoming or not has_outgoing:
                    continue
                chain = [start]
                node = start
                while adjacency[node, :].any():
                    # [0][0] extracts the first successor index.
                    node = np.where(adjacency[node, :])[0][0]
                    chain.append(node)
                chains.append(chain)
            return chains
#生成候选框，候选框分组，确认每个候选框分组坐标
class TextProposalGraphBuilder:
    """
        Build Text proposals into a graph.

        build_graph() stores the proposals, scores and image size on the
        instance; the other methods read that state and must only be called
        afterwards.
    """
    def get_successions(self, index):
        """Return the nearest compatible proposals to the right of `index`.

        Columns are scanned left to right, up to MAX_HORIZONTAL_GAP pixels
        away and never past the image width. The scan stops at the first
        column containing at least one proposal that passes meet_v_iou.
        """
        box = self.text_proposals[index]
        start = int(box[0]) + 1
        stop = min(int(box[0]) + TextLineCfg.MAX_HORIZONTAL_GAP + 1, self.im_size[1])
        for left in range(start, stop):
            # Several proposals can share the same x coordinate.
            results = [adj for adj in self.boxes_table[left] if self.meet_v_iou(adj, index)]
            if results:
                return results
        return []

    def get_precursors(self, index):
        """Return the nearest compatible proposals to the left of `index`.

        Mirror image of get_successions: columns are scanned right to left,
        up to MAX_HORIZONTAL_GAP pixels away and never below column 0.
        """
        box = self.text_proposals[index]
        start = int(box[0]) - 1
        stop = max(int(box[0] - TextLineCfg.MAX_HORIZONTAL_GAP), 0) - 1
        for left in range(start, stop, -1):
            results = [adj for adj in self.boxes_table[left] if self.meet_v_iou(adj, index)]
            if results:
                return results
        return []

    def is_succession_node(self, index, succession_index):
        """Check that `index` is the best-scoring precursor of `succession_index`."""
        precursors = self.get_precursors(succession_index)
        if self.scores[index] >= np.max(self.scores[precursors]):
            return True
        return False

    def meet_v_iou(self, index1, index2):
        """Return whether two proposals overlap vertically and have similar heights."""
        def overlaps_v(index1, index2):
            # Vertical overlap divided by the SHORTER height (not by the
            # union of the two vertical extents).
            h1 = self.heights[index1]
            h2 = self.heights[index2]
            y0 = max(self.text_proposals[index2][1], self.text_proposals[index1][1])
            y1 = min(self.text_proposals[index2][3], self.text_proposals[index1][3])
            return max(0, y1 - y0 + 1) / min(h1, h2)

        def size_similarity(index1, index2):
            # Ratio of the shorter height to the taller one.
            h1 = self.heights[index1]
            h2 = self.heights[index2]
            return min(h1, h2) / max(h1, h2)

        return overlaps_v(index1, index2) >= TextLineCfg.MIN_V_OVERLAPS and \
               size_similarity(index1, index2) >= TextLineCfg.MIN_SIZE_SIM

    def build_graph(self, text_proposals, scores, im_size):
        """Build the succession graph of the proposals.

        text_proposals: [N, 4] array of (x1, y1, x2, y2).
        scores: per-proposal scores, indexable by proposal index.
        im_size: (height, width) of the image.

        Returns a Graph whose [N, N] boolean matrix has entry [i, j] set when
        j is the best-scoring successor of i and i is in turn the
        best-scoring precursor of j.
        """
        self.text_proposals = text_proposals
        self.scores = scores
        self.im_size = im_size
        self.heights = text_proposals[:, 3] - text_proposals[:, 1] + 1

        # boxes_table[x] lists the indices of all proposals whose x1 equals x,
        # so neighbours can be looked up by column.
        boxes_table = [[] for _ in range(self.im_size[1])]
        for index, box in enumerate(text_proposals):
            boxes_table[int(box[0])].append(index)
        self.boxes_table = boxes_table

        # The builtin bool is used as dtype: the np.bool alias was removed in
        # NumPy 1.24 and raises AttributeError there.
        proposal_count = text_proposals.shape[0]
        graph = np.zeros((proposal_count, proposal_count), bool)

        for index in range(proposal_count):
            successions = self.get_successions(index)
            if len(successions) == 0:
                continue
            # Pick the highest-scoring candidate to the right ...
            succession_index = successions[np.argmax(scores[successions])]
            # ... and connect only if the relation also holds backwards.
            if self.is_succession_node(index, succession_index):
                # NOTE: a box can have multiple successions(precursors) if multiple successions(precursors)
                # have equal scores.
                graph[index, succession_index] = True
        return Graph(graph)
class TextProposalConnectorOriented:
    """
        Connect text proposals into text lines
    """

    def __init__(self):
        self.graph_builder = TextProposalGraphBuilder()

    def group_text_proposals(self, text_proposals, scores, im_size):
        """Build the proposal graph and return its chains of proposal indices."""
        graph = self.graph_builder.build_graph(text_proposals, scores, im_size)
        return graph.sub_graphs_connected()

    def fit_y(self, X, Y, x1, x2):
        """Fit a line through (X, Y) and return its y values at x1 and x2.

        X must not be empty. When every X is identical no line can be
        fitted, so the horizontal line y = Y[0] is used.
        """
        if np.sum(X == X[0]) == len(X):
            return Y[0], Y[0]
        p = np.poly1d(np.polyfit(X, Y, 1))
        return p(x1), p(x2)

    def get_text_lines(self, text_proposals, scores, im_size):
        """Merge proposals into oriented text-line quadrilaterals.

        text_proposals: [N, 4] array of (x1, y1, x2, y2).
        scores: per-proposal scores, indexable by a list of indices.
        im_size: (height, width) of the image.

        Returns a float64 array of shape [num_lines, 9]; each row holds
        (x1, y1, x2, y2, x3, y3, x4, y4, score) for the top-left, top-right,
        bottom-left and bottom-right corners, where score is the mean score
        of the proposals in the line.
        """
        # tp = text proposal; each group lists the proposals of one line.
        tp_groups = self.group_text_proposals(text_proposals, scores, im_size)

        text_lines = np.zeros((len(tp_groups), 8), np.float32)

        for index, tp_indices in enumerate(tp_groups):
            text_line_boxes = text_proposals[list(tp_indices)]
            # Centre of every proposal in the line.
            X = (text_line_boxes[:, 0] + text_line_boxes[:, 2]) / 2
            Y = (text_line_boxes[:, 1] + text_line_boxes[:, 3]) / 2

            # Least-squares centre line through the proposal centres.
            z1 = np.polyfit(X, Y, 1)

            x0 = np.min(text_line_boxes[:, 0])
            x1 = np.max(text_line_boxes[:, 2])

            # Half the width of one proposal.
            offset = (text_line_boxes[0, 2] - text_line_boxes[0, 0]) * 0.5

            # Lines through the top-left and bottom-left corners, evaluated
            # near both horizontal ends of the text line.
            lt_y, rt_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 1], x0 + offset, x1 - offset)
            lb_y, rb_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 3], x0 + offset, x1 - offset)

            score = scores[list(tp_indices)].sum() / float(len(tp_indices))

            text_lines[index, 0] = x0
            text_lines[index, 1] = min(lt_y, rt_y)
            text_lines[index, 2] = x1
            text_lines[index, 3] = max(lb_y, rb_y)
            text_lines[index, 4] = score
            text_lines[index, 5] = z1[0]  # slope of the centre line
            text_lines[index, 6] = z1[1]  # intercept of the centre line
            height = np.mean((text_line_boxes[:, 3] - text_line_boxes[:, 1]))
            text_lines[index, 7] = height + 2.5

        # The builtin float is used as dtype: the np.float alias was removed
        # in NumPy 1.24 and raises AttributeError there.
        text_recs = np.zeros((len(text_lines), 9), float)
        for index, line in enumerate(text_lines):
            # Intercepts of the upper and lower edges, half a line height
            # away from the centre line.
            b1 = line[6] - line[7] / 2
            b2 = line[6] + line[7] / 2
            x1 = line[0]
            y1 = line[5] * line[0] + b1  # top-left
            x2 = line[2]
            y2 = line[5] * line[2] + b1  # top-right
            x3 = line[0]
            y3 = line[5] * line[0] + b2  # bottom-left
            x4 = line[2]
            y4 = line[5] * line[2] + b2  # bottom-right
            disX = x2 - x1
            disY = y2 - y1
            width = np.sqrt(disX * disX + disY * disY)

            # Compensation that shifts two opposite corners, depending on the
            # sign of the slope.
            fTmp0 = y3 - y1
            fTmp1 = fTmp0 * disY / width
            x = np.fabs(fTmp1 * disX / width)
            y = np.fabs(fTmp1 * disY / width)
            if line[5] < 0:
                x1 -= x
                y1 += y
                x4 += x
                y4 -= y
            else:
                x2 += x
                y2 += y
                x3 -= x
                y3 -= y
            text_recs[index] = (x1, y1, x2, y2, x3, y3, x4, y4, line[4])

        return text_recs



"""
调用
"""
def get_det_boxes(image,display = True, expand = True):
    """Detect text proposals in an image and optionally draw them.

    image: H x W x 3 array (single_pic_proc passes an RGB image).
    display: draw every kept proposal as a black rectangle on a copy of the
             image and print its corners.
    expand: widen the computed text-line quadrilaterals by 10 pixels on
            both sides, clamped to the image width.

    Returns ([pt1, pt2], image_c): the top-left and bottom-right corners of
    the LAST proposal drawn, and the annotated copy of the input. pt1 and
    pt2 are None when nothing was drawn (display is false or no proposal
    survived filtering); previously that case raised UnboundLocalError.
    """
    image_c = image.copy()
    h, w = image.shape[:2]
    image = image.astype(np.float32) - IMAGE_MEAN
    # HWC -> CHW and add the batch dimension.
    image = torch.from_numpy(image.transpose(2, 0, 1)).unsqueeze(0).float()

    with torch.no_grad():
        image = image.to(device)
        cls, regr = model(image)
        cls_prob = F.softmax(cls, dim=-1).cpu().numpy()
        regr = regr.cpu().numpy()

    # Anchors for the h/16 x w/16 feature map, refined by the regression
    # output and clamped to the image.
    anchor = gen_anchor((int(h / 16), int(w / 16)), 16)
    bbox = bbox_transfor_inv(anchor, regr)
    bbox = clip_box(bbox, [h, w])

    # Keep anchors whose foreground probability exceeds the threshold.
    fg = np.where(cls_prob[0, :, 1] > prob_thresh)[0]
    select_anchor = bbox[fg, :].astype(np.int32)
    select_score = cls_prob[0, fg, 1]

    # Drop boxes smaller than 16 pixels in either dimension.
    keep_index = filter_bbox(select_anchor, 16)
    select_anchor = select_anchor[keep_index]
    select_score = select_score[keep_index]
    select_score = np.reshape(select_score, (select_score.shape[0], 1))

    # Non-maximum suppression on (x1, y1, x2, y2, score) rows.
    nmsbox = np.hstack((select_anchor, select_score))
    keep = nms(nmsbox, 0.3)
    select_anchor = select_anchor[keep]
    select_score = select_score[keep]

    # Merge the proposals into text lines.
    # NOTE(review): `text` is computed (and expanded) but is not part of the
    # return value; kept so the function's work and return shape are unchanged.
    textConn = TextProposalConnectorOriented()
    text = textConn.get_text_lines(select_anchor, select_score, [h, w])

    if expand:
        for rec in text:
            rec[0] = max(rec[0] - 10, 0)
            rec[2] = min(rec[2] + 10, w - 1)
            rec[4] = max(rec[4] - 10, 0)
            rec[6] = min(rec[6] + 10, w - 1)

    # Defined up front so the return below is valid even when nothing is drawn.
    pt1 = pt2 = None
    if display:
        for box in select_anchor:
            # Built-in ints keep OpenCV's point parsing independent of the
            # NumPy scalar type.
            pt1 = (int(box[0]), int(box[1]))
            pt2 = (int(box[2]), int(box[3]))
            print(pt1, pt2)
            cv2.rectangle(image_c, pt1, pt2, (0, 0, 0))
    return [pt1, pt2],image_c


def single_pic_proc(image_file):
    """Open an image file as RGB, run text detection and return the annotated image."""
    rgb_image = Image.open(image_file).convert('RGB')
    pixels = np.array(rgb_image)
    # get_det_boxes returns (corner points, annotated image); only the
    # image is needed here.
    return get_det_boxes(pixels)[1]
if __name__ == '__main__':
    # Run detection on one hard-coded input image and save the annotated
    # result under the same file name in the current working directory.
    source_path = 'E:\\qichacha\\img\\zengguang\\liangdu\\091110000MA05HX6M2H.jpg'
    annotated = single_pic_proc(source_path)
    Image.fromarray(annotated).save('./091110000MA05HX6M2H.jpg')