Preface

The previous article introduced how MTCNN trains a classifier to score and refine candidate face windows. Here we implement the first stage, PNet, with NumPy, running it with the pretrained Caffe weights.

II. Implementation

1. Import the libraries
import numpy as np
import mtcnn.caffe_pb2 as pb
import cv2
from matplotlib import pyplot as plt
2. Read the input image
raw_image = cv2.imread("face.jpg")
input_image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2YCrCb)  # color-space conversion
input_image = (input_image - 127.5) * 0.0078125  # normalize so values fall in [-1, 1]
origin_h, origin_w, _ = input_image.shape
Here COLOR_BGR2YCrCb is an enum value for image color-space conversion: it tells OpenCV to convert the image from BGR (its default channel order) to the YCrCb color space.
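As a quick sanity check on the normalization: 0.0078125 is exactly 1/128, so pixel values in [0, 255] land just inside [-1, 1]. A two-line check (illustrative only):

pixels = np.array([0.0, 127.5, 255.0])
print((pixels - 127.5) * 0.0078125)  # [-0.99609375  0.  0.99609375]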
Building the scales for the image pyramid
def calculateScales(img):
    pr_scale = 1.0
    h, w, _ = img.shape
    # Pre-scale so the image's extent is close to 500 pixels
    if min(w, h) > 500:
        pr_scale = 500.0 / min(h, w)
        w = int(w * pr_scale)
        h = int(h * pr_scale)
    elif max(w, h) < 500:
        pr_scale = 500.0 / max(h, w)
        w = int(w * pr_scale)
        h = int(h * pr_scale)

    scales = []
    factor = 0.709
    factor_count = 0
    minl = min(h, w)
    # Keep shrinking until the short side drops below PNet's 12x12 input
    while minl >= 12:
        scales.append(pr_scale * pow(factor, factor_count))
        minl *= factor
        factor_count += 1
    return scales
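The factor 0.709 is approximately the square root of 0.5, so every pyramid level halves the image area, and the loop stops once the shorter side would fall below 12 pixels, PNet's minimum input size. A quick sketch of how many levels that yields, assuming a 500-pixel short side after the pre-scaling above:

import math

short_side = 500
levels = math.floor(math.log(12 / short_side, 0.709)) + 1
print(levels)  # 11: 500 * 0.709**10 is about 16.0, still >= 12; the next step is not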
#%%
scales = calculateScales(input_image)
scales
The results are as follows (the printout is truncated here; eleven scales are produced in total):

[1.0,
 0.709,
 0.5026809999999999,
 0.3564008289999999,
 0.25268818776099994,
 0.17915592512254896,
 0.12702155091188722,
 0.09005827959652803,
 0.06385132023393836,
 0.045270586045862295,
 ...]
Encapsulating the network
net = pb.NetParameter()
with open(r"D:\BaiduNetdiskDownload\第四期资料\2020\12.14课件MTCNN-BP-Conv\mtcnn\det1.caffemodel", "rb") as f:
    net.ParseFromString(f.read())  # deserialize the pretrained Caffe PNet weights

layer_mapper = {item.name: item for item in net.layer}  # look layers up by name
image = cv2.imread("my.jpg")
image.shape
image = image.transpose(2, 0, 1)[None]  # HWC -> NCHW, add a batch dimension
image.shape
layer_mapper
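To see what each mapped layer holds you can inspect its blobs; in the standard Caffe proto layout, blobs[0] stores the flattened weights and blobs[1] the biases (a quick inspection sketch):

conv1 = layer_mapper["conv1"]
print(len(conv1.blobs[0].data))  # 270 = 10 out-channels * 3 in-channels * 3 * 3
print(len(conv1.blobs[1].data))  # 10, one bias per output channel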
class Initializer:
    def __init__(self, name):
        self.name = name

    def __call__(self, *args):
        return self.apply(*args)


class GaussInitializer(Initializer):
    # where :math:`\mu` is the mean and :math:`\sigma` the standard
    # deviation. The square of the standard deviation, :math:`\sigma^2`,
    # is called the variance.
    def __init__(self, mu, sigma):
        self.mu = mu
        self.sigma = sigma

    def apply(self, value):
        value[...] = np.random.normal(self.mu, self.sigma, value.shape)


class Parameter:
    def __init__(self, value):
        self.value = value
        self.delta = np.zeros(value.shape)

    def zero_grad(self):
        self.delta[...] = 0
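A minimal usage sketch of these helpers (the shapes are illustrative): Parameter couples a value with its gradient buffer, and GaussInitializer fills an array in place with Gaussian samples.

weight = Parameter(np.zeros((10, 3, 3, 3)))  # e.g. conv1's kernel shape
init = GaussInitializer(mu=0, sigma=0.01)
init(weight.value)   # fills the value with N(0, 0.01**2) samples
weight.zero_grad()   # resets the accumulated gradient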
def conv2d_forward(x, kernel, bias, in_feature, out_feature, kernel_size, padding=0, stride=1):
    ib, ic, ih, iw = x.shape
    oh = (ih + padding * 2 - kernel_size) // stride + 1
    ow = (iw + padding * 2 - kernel_size) // stride + 1

    # im2col: unfold every receptive field into one column of `column`
    col_w = oh * ow
    col_h = kernel_size * kernel_size * in_feature
    column = np.zeros((ib, col_h, col_w))
    output = np.zeros((ib, out_feature, oh, ow))
    kcol = np.array(kernel).reshape(out_feature, -1)
    for b in range(ib):
        for c in range(ic):
            for oy in range(oh):
                for ox in range(ow):
                    for ky in range(kernel_size):
                        for kx in range(kernel_size):
                            column_y = ky * kernel_size + kx + c * kernel_size * kernel_size
                            column_x = ox + oy * ow
                            ix = ox * stride + kx - padding
                            iy = oy * stride + ky - padding
                            if 0 <= ix < iw and 0 <= iy < ih:
                                column[b, column_y, column_x] = x[b, c, iy, ix]
        # the convolution is now a single matrix multiply per batch item
        output[b] = (kcol @ column[b]).reshape(out_feature, oh, ow) + np.array(bias).reshape(out_feature, 1, 1)
    return output
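A small sanity check for conv2d_forward with made-up data: a 3x3 kernel that is 1 only at its center simply copies pixels, so the valid convolution of a 4x4 image should equal its 2x2 interior.

x = np.arange(16, dtype=float).reshape(1, 1, 4, 4)
k = np.zeros((1, 1, 3, 3))
k[0, 0, 1, 1] = 1.0  # "identity" kernel
y = conv2d_forward(x, k, bias=[0.0], in_feature=1, out_feature=1, kernel_size=3)
assert np.allclose(y, x[:, :, 1:3, 1:3])  # picks out the 2x2 interior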
def prelu(x, weight):
    # PReLU: positives pass through, negatives are scaled by a learned
    # per-channel slope
    x = x.copy()
    channels = x.shape[1]
    for c in range(channels):
        current_channel = x[:, c, :, :]
        select = current_channel < 0
        current_channel[select] *= weight[c]
    return x
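For example, with a slope of 0.25 a value of -2 becomes -0.5 while positive values pass through unchanged (a tiny illustration):

t = np.array([[[[-2.0, 3.0]]]])  # NCHW with a single channel
print(prelu(t, [0.25]))          # [[[[-0.5  3. ]]]]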
def max_pooling2d(x, kernel_size, stride):
    ib, ic, ih, iw = x.shape
    # ceil mode, as in Caffe: a partial window at the border still
    # produces an output cell
    output_height = int(np.ceil((ih - kernel_size) / stride) + 1)
    output_width = int(np.ceil((iw - kernel_size) / stride) + 1)
    output = np.zeros((ib, ic, output_height, output_width))
    minvalue = float("-inf")
    for b in range(ib):
        for c in range(ic):
            for oy in range(output_height):
                for ox in range(output_width):
                    value = minvalue
                    for kx in range(kernel_size):
                        for ky in range(kernel_size):
                            ix = ox * stride + kx
                            iy = oy * stride + ky
                            if ix < iw and iy < ih:
                                value = max(value, x[b, c, iy, ix])
                    output[b, c, oy, ox] = value
    return output
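The ceil in the output-size formula matches Caffe's pooling and keeps PNet's feature-map sizes in line with the reference model: a 5-pixel side with a 2x2 window at stride 2 gives ceil((5 - 2) / 2) + 1 = 3 outputs rather than 2. A quick check:

t = np.arange(25, dtype=float).reshape(1, 1, 5, 5)
print(max_pooling2d(t, kernel_size=2, stride=2).shape)  # (1, 1, 3, 3)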
class Conv2d:
    def __init__(self, kernel, bias, in_feature, out_feature, kernel_size, padding, stride):
        self.kernel = kernel
        self.bias = bias
        self.in_feature = in_feature
        self.out_feature = out_feature
        self.kernel_size = kernel_size
        self.padding = padding
        self.stride = stride

    def forward(self, x):
        return conv2d_forward(x, self.kernel, self.bias, self.in_feature, self.out_feature,
                              self.kernel_size, self.padding, self.stride)


class PReLU:
    def __init__(self, weight):
        self.weight = weight

    def forward(self, x):
        return prelu(x, self.weight)


class MaxPooling2d:
    def __init__(self, kernel_size, stride):
        self.kernel_size = kernel_size
        self.stride = stride

    def forward(self, x):
        return max_pooling2d(x, self.kernel_size, self.stride)
class PNet:
    def __init__(self, layer_mapper):
        conv1_weight = layer_mapper["conv1"]
        prelu1_weight = layer_mapper["PReLU1"]
        conv2_weight = layer_mapper["conv2"]
        prelu2_weight = layer_mapper["PReLU2"]
        conv3_weight = layer_mapper["conv3"]
        prelu3_weight = layer_mapper["PReLU3"]
        self.layers = [
            Conv2d(
                kernel=conv1_weight.blobs[0].data,
                bias=conv1_weight.blobs[1].data,
                in_feature=3,
                out_feature=10,
                kernel_size=3,
                padding=0,
                stride=1
            ),
            PReLU(prelu1_weight.blobs[0].data),
            MaxPooling2d(2, 2),
            Conv2d(
                kernel=conv2_weight.blobs[0].data,
                bias=conv2_weight.blobs[1].data,
                in_feature=10,
                out_feature=16,
                kernel_size=3,
                padding=0,
                stride=1
            ),
            PReLU(prelu2_weight.blobs[0].data),
            Conv2d(
                kernel=conv3_weight.blobs[0].data,
                bias=conv3_weight.blobs[1].data,
                in_feature=16,
                out_feature=32,
                kernel_size=3,
                padding=0,
                stride=1
            ),
            PReLU(prelu3_weight.blobs[0].data)
        ]

        # Two 1x1 heads: conv4-1 scores face / non-face,
        # conv4-2 regresses the bounding-box offsets
        conv41_weight = layer_mapper["conv4-1"]
        self.conv41 = Conv2d(
            kernel=conv41_weight.blobs[0].data,
            bias=conv41_weight.blobs[1].data,
            in_feature=32,
            out_feature=2,
            kernel_size=1,
            padding=0,
            stride=1
        )
        conv42_weight = layer_mapper["conv4-2"]
        self.conv42 = Conv2d(
            kernel=conv42_weight.blobs[0].data,
            bias=conv42_weight.blobs[1].data,
            in_feature=32,
            out_feature=4,
            kernel_size=1,
            padding=0,
            stride=1
        )

    def forward(self, image):
        x = image
        for layer in self.layers:
            x = layer.forward(x)
        return self.conv41.forward(x), self.conv42.forward(x)
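Tracing the shapes explains the pyramid's 12-pixel stopping size: a 12x12 input becomes 10x10 after conv1, 5x5 after the ceil-mode pool, 3x3 after conv2 and 1x1 after conv3, so every output cell scores one 12x12 window of its input. A shape check on random data (illustrative only):

pnet = PNet(layer_mapper)
probe = np.random.randn(1, 3, 12, 12)
score, box = pnet.forward(probe)
print(score.shape, box.shape)  # (1, 2, 1, 1) (1, 4, 1, 1)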
Printing the results
pnet = PNet(layer_mapper)  # build PNet from the loaded weights
out1 = []
out2 = []
for scale in scales:
    dif_w = int(origin_w * scale)
    dif_h = int(origin_h * scale)
    dif_img = cv2.resize(input_image, (dif_w, dif_h))
    input_image_tf = dif_img.transpose(2, 1, 0)[None]  # to channel-first, add batch dim
    classify, reg = pnet.forward(input_image_tf)
    out1.append(classify)  # face / non-face score maps
    out2.append(reg)       # bounding-box regressions
print("out1:", out1)
(The full printout is lengthy and is truncated here; each entry of out1 is the raw two-channel face / non-face score map for one pyramid scale, and each entry of out2 the matching four-channel regression map. The last element, from the smallest scale, is shown below.)
out1[10]
array([[[[ 1.98634708, 2.07212636, 2.64673702],
[ 0.82790078, 0.73264951, 2.55249553],
[ 1.76121664, 1.01266622, 1.9616819 ]],
[[-1.41277386, -0.87433834, -1.85184214],
[-1.38557532, -0.8822668 , -3.65898619],
[-2.251872 , -2.31059625, -3.10679517]]]])
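These values are raw logits rather than probabilities; to read a score map as face probabilities, apply a softmax across the two channels (a sketch, not part of the original listing; by the usual MTCNN convention channel 1 is the face class):

def softmax_channels(logits):
    e = np.exp(logits - logits.max(axis=1, keepdims=True))  # numerically stable softmax
    return e / e.sum(axis=1, keepdims=True)

face_prob = softmax_channels(out1[10])[:, 1]
print(face_prob)  # 3x3 map of face probabilities for the smallest scale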