YOLOv1代码注释

最新推荐文章于 2025-02-19 15:12:42 发布

@@老胡

最新推荐文章于 2025-02-19 15:12:42 发布

阅读量401

点赞数 3

分类专栏：源代码 | 文章标签：YOLO

本文链接：https://blog.csdn.net/CodePlayMe/article/details/140324166

版权

源代码专栏收录该内容

5 篇文章 0 订阅

订阅专栏

文章目录

这个代码的主网络修改为resnet50。

write_txt.py

from curses import endwin
import xml.etree.ElementTree as ET
import os
import random

"""
这个文件的主要功能是读取文件夹内所有的xml的文件以及信息，将这些信息(name,bbox,class)写入一个txt文件中，并且按照7:3划分训练集和测试集
"""

VOC_CLASSES = (  # names of all object classes; the position in this tuple is written to the txt files as the class id
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')   # must be changed when another dataset is used

# VOC_CLASSES=[]# for a custom dataset

# Switch to the project directory - adjust to the local checkout
# os.chdir('/root/workspace/YOLOV1-pytorch/')

# Module-level parameters.
# NOTE: both output files are opened (and truncated) as soon as this module is run or imported.
train_set = open('voctrain.txt', 'w')
test_set = open('voctest.txt', 'w')
# Annotations = 'VOCdevkit//VOC2007//Annotations//'
Annotations='VOCdevkit/VOC2007/Annotations/'
xml_files = os.listdir(Annotations)
random.shuffle(xml_files)  # shuffle the annotation files in place
train_num = int(len(xml_files) * 0.7)  # number of training samples (70% of all files)
train_lists = xml_files[:train_num]   # annotation files of the training split
test_lists = xml_files[train_num:]    # annotation files of the test split
# Print the size of both splits
print("train_lists:",len(train_lists))
print("test_lists:",len(test_lists))


def parse_rec(filename):  # path of one annotation XML file
    """
    Parse one VOC-style XML annotation file.

    Every <object> element is turned into a dict with the keys "name"
    (text of the <name> tag) and "bbox" ([xmin, ymin, xmax, ymax] as ints,
    fractional values are truncated). Objects whose <difficult> flag is 1
    are skipped. An object without a <difficult> tag (or with an empty one)
    is treated as not difficult instead of raising AttributeError.

    Args:
        filename: path of the XML file.

    Returns:
        A list of dicts, one per kept object, in document order.
    """
    tree = ET.parse(filename)
    objects = []
    for obj in tree.findall('object'):
        difficult_node = obj.find('difficult')
        difficult_text = ''
        if difficult_node is not None and difficult_node.text:
            difficult_text = difficult_node.text.strip()
        # A missing/empty flag counts as "not difficult"
        if difficult_text and int(difficult_text) == 1:
            continue
        bbox = obj.find('bndbox')
        objects.append({
            'name': obj.find('name').text,
            # int(float(...)) accepts coordinates written as "12.0"
            'bbox': [int(float(bbox.find(tag).text))
                     for tag in ('xmin', 'ymin', 'xmax', 'ymax')],
        })

    return objects


def _write_split(xml_names, out_file):
    """Write one annotation line per usable image of `xml_names` to `out_file`, then close it."""
    try:
        for xml_name in xml_names:
            results = parse_rec(Annotations + xml_name)
            if not results:
                # No usable object in this annotation: report the file and skip it
                print(xml_name)
                continue
            # splitext keeps file names that contain extra dots intact
            fields = [os.path.splitext(xml_name)[0] + '.jpg']
            for result in results:
                fields.extend(str(coord) for coord in result['bbox'])
                # The class id is the position of the name in VOC_CLASSES
                fields.append(str(VOC_CLASSES.index(result['name'])))
            out_file.write(' '.join(fields) + '\n')
    finally:
        out_file.close()


def write_txt():
    """
    Write the training and test list files.

    Each output line has the form
    ``image.jpg xmin ymin xmax ymax class_id [xmin ymin xmax ymax class_id ...]``.
    Annotation files without any usable object are printed and left out.
    Both module-level file objects (`train_set`, `test_set`) are closed
    afterwards, so this function can only be called once per process.

    Raises:
        ValueError: if an object name is not contained in VOC_CLASSES.
    """
    _write_split(train_lists, train_set)   # training split
    _write_split(test_lists, test_set)     # test split

"""
if __name__ == "__main__": 的作用
在Python中，每个Python文件（模块）都可以作为脚本直接运行，也可以被其他文件导入。__name__ 是一个特殊变量，
当文件被直接运行时，__name__ 的值被设置为 "__main__"。如果文件是被导入的，则 __name__ 的值会被设置为该模块的名字。
if __name__ == "__main__": 这行代码的作用是判断该文件是否作为主程序运行。如果是，则执行该条件语句块下的代码。
这种方式通常用于提供一个可执行的入口点给该文件，同时也允许该文件中的函数和类被其他文件导入而不会自动执行这些代码。
"""
# Generate both list files only when this file is executed as a script.
if __name__ == '__main__':
    write_txt()
    print(VOC_CLASSES)# class names

yoloData.py

import torch
import cv2
import os
import os.path
import random
import numpy as np
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import ToTensor
from PIL import Image
# from write_txt import VOC_CLASSES # 这个使用要谨慎，因为对应文件里面定义的两个txt是全局变量，导入的时候里面的全局变量会重新赋值

CLASS_NUM = 20  # number of object classes; must be changed when another dataset is used
# CLASS_NUM=len(VOC_CLASSES) # number of classes
# os.chdir('/root/workspace/YOLOV1-pytorch/')

class yoloDataset(Dataset):
    image_size = 448  # 输入图片大小

    def __init__(self, img_root, list_file, train, transform):   # list_file为txt文件  img_root为图片路径
        """
        逐行读取生成的文本文件的内容，然后对其进行分类，将信息保存在fnames，boxes，labels三个列表中
        """
        self.root = img_root
        self.train = train
        self.transform = transform
        # 后续要提取txt文件信息，分类后装入以下三个列表
        self.fnames = []
        self.boxes = []
        self.labels = []

        self.S = 7   # YOLOV1
        self.B = 2   # 相关
        self.C = CLASS_NUM  # 参数
        self.mean = (123, 117, 104)  # RGB
        file_txt = open(list_file,'r')
        lines = file_txt.readlines()   # 读取txt文件每一行
        for line in lines:   # 逐行开始操作
            # strip()  # 移除首位的换行符号；split()  # 以空格为分界线，将所有元素组成一个列表
            splited = line.strip().split() # 移除首位的换行符号再生成一张列表
            self.fnames.append(splited[0])  # 存储图片的名字
            num_boxes = (len(splited) - 1) // 5  # 每一幅图片里面有多少个bbox
            box = []
            label = []
            for i in range(num_boxes): # bbox四个角的坐标
                x = float(splited[1 + 5 * i])
                y = float(splited[2 + 5 * i])
                x2 = float(splited[3 + 5 * i])
                y2 = float(splited[4 + 5 * i])
                c = splited[5 + 5 * i]  # 代表物体的类别，即是20种物体里面的哪一种  值域 0-19
                box.append([x, y, x2, y2])
                label.append(int(c))
            self.boxes.append(torch.Tensor(box))
            self.labels.append(torch.LongTensor(label))
        self.num_samples = len(self.boxes)

    # 通过下标访问数据集（例如 dataset[idx]）时会自动调用这个函数
    def __getitem__(self, idx):
        """
        Load sample `idx` and return it as ``(img, target)``.

        The image is read with OpenCV (BGR), augmented when `self.train` is
        true, converted to RGB, mean-subtracted, resized to
        `image_size` x `image_size` and passed through every callable in
        `self.transform`. The boxes are normalised by the image width/height
        and encoded by `encoder` into the ground-truth grid.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        fname = self.fnames[idx]
        # os.path.join with two arguments works whether or not img_root ends with a separator
        img_path = os.path.join(self.root, fname)
        img = cv2.imread(img_path)
        if img is None:  # cv2.imread reports failure by returning None instead of raising
            raise FileNotFoundError('cannot read image: ' + img_path)
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx].clone()
        if self.train:
            # The augmentations are hand-written because flipping, scaling, shifting
            # and cropping change the box coordinates as well as the image.
            img, boxes = self.random_flip(img, boxes)  # random horizontal flip
            img, boxes = self.randomScale(img, boxes)  # random horizontal stretch
            img = self.randomBlur(img)  # random blur
            img = self.RandomBrightness(img)  # random brightness change
            # img = self.RandomHue(img)
            # img = self.RandomSaturation(img)
            img, boxes, labels = self.randomShift(img, boxes, labels)  # random translation
            # img, boxes, labels = self.randomCrop(img, boxes, labels)
        h, w, _ = img.shape
        # Normalise the corner coordinates by the (augmented) image size
        boxes /= torch.Tensor([w, h, w, h]).expand_as(boxes)
        img = self.BGR2RGB(img)  # because pytorch pretrained model use RGB
        img = self.subMean(img, self.mean)  # subtract the channel means
        # The boxes need no update for the resize below: they are already
        # normalised by the image size, and the resize only adapts the image to
        # the network input. The centre offsets relative to the grid cell are
        # computed in `encoder`.
        img = cv2.resize(img, (self.image_size, self.image_size))
        target = self.encoder(boxes, labels)  # encode the labels into the S x S grid tensor

        for t in self.transform:
            img = t(img)

        # img is the augmented and transformed image
        return img, target

    def __len__(self):
        return self.num_samples

    # def letterbox_image(self, image, size):
    #     # 对图片进行resize，使图片不失真。在空缺的地方进行padding
    #     iw, ih = image.size
    #     scale = min(size / iw, size / ih)
    #     nw = int(iw * scale)
    #     nh = int(ih * scale)
    #
    #     image = image.resize((nw, nh), Image.BICUBIC)
    #     new_image = Image.new('RGB', size, (128, 128, 128))
    #     new_image.paste(image, ((size - nw) // 2, (size - nh) // 2))
    #     return new_image

    def encoder(self, boxes, labels):  # 输入的box为归一化形式(X1,Y1,X2,Y2) , 输出ground truth  (7*7*30)
        grid_num = 7
        target = torch.zeros((grid_num, grid_num, int(CLASS_NUM + 10)))    # 7*7*30
        # cell_size 是图像宽度和高度被划分成的等分数，用于将归一化的坐标转换为网格索引。
        cell_size = 1. / grid_num  # 1/7
        # 这个是bbox的归一化后的宽高
        wh = boxes[:, 2:] - boxes[:, :2] # wh = [w, h]  1*1

        # 物体中心坐标集合
        cxcy = (boxes[:, 2:] + boxes[:, :2]) / 2  # 归一化含小数的中心坐标
        for i in range(cxcy.size()[0]):
            cxcy_sample = cxcy[i]  # 中心坐标  1*1
            """
            ij 并不是直接表示“左上角坐标（7*7)为整数，而是表示边界框中心点所在的网格的索引。ceil()表示向上取整；-1是因为python的索引是从0开始的
            ij 是一个包含两个元素的tensor，分别表示边界框中心点所在的网格的x和y索引

            下面的公式的解释可以这样理解：假设有一个图像大小为w*h，现在把图像分为7分，问坐标为（x，y）的点位于哪一个网格中，这就是小学乘法问题，
            很明显，求一下x和y占比w和h的占比，分别乘以7，最后向上取整，所以答案就是（7x/w,7y/h）.ceil(),这里再看cxcy_sample本身就是已经归一化（也就是已经除以w）
            了，所以直接乘7，也就是 / cell_size 就可以得到结果。-1是为了让索引从0开始。 
            """
            ij = (cxcy_sample / cell_size).ceil() - 1  # 左上角坐标 （7*7)为整数
            # 这里先1后0是因为坐标提取就是先行后列
            # 第一个框的置信度，4表示第一个标注框的置信度存储在下标为4的位置，下面9同理，并且这里的意义是，只有有标注框的置信度置位为1
            target[int(ij[1]), int(ij[0]), 4] = 1
            # 第二个框的置信度
            target[int(ij[1]), int(ij[0]), 9] = 1

            target[int(ij[1]), int(ij[0]), int(labels[i]) + 10] = 1  # 20个类别对应处的概率设置为1

            xy = ij * cell_size  # 归一化左上坐标  （1*1）
            # 在YOLOV1原文中，其bbox的五个参数中的x，y就是中心坐标相对于其grid cell左上角的坐标的相对值
            delta_xy = (cxcy_sample - xy) / cell_size  # 中心与左上坐标差值  （7*7）

            # 坐标w,h代表了预测的bounding box的width、height相对于整幅图像width,height的比例
            target[int(ij[1]), int(ij[0]), 2:4] = wh[i]  # w1,h1
            target[int(ij[1]), int(ij[0]), :2] = delta_xy  # x1,y1

            # 每一个网格有两个边框，在真实数据中，两个边框的值是一样的
            target[int(ij[1]), int(ij[0]), 7:9] = wh[i]  # w2,h2
            # 由此可得其实返回的中心坐标其实是相对左上角顶点的偏移，因此在进行预测的时候还需要进行解码
            target[int(ij[1]), int(ij[0]), 5:7] = delta_xy  # [5,7) 表示x2,y2
        """
        这里来解释为什么(xc,yc) = 7*7   (w,h) = 1*1
        首先解释一个简单点的，(w,h) = 1*1，是因为target保存的时候，是直接保存了前面归一化的wh，所以这里是1*1
        接下来解释(xc,yc) = 7*7，这里理一遍整个流程，首先获得了归一化的中心坐标cxcy，这个时候是1*1的，和上面wh的解释一样，
        然后取了一个cxcy作为例子，也就是cxcy_sample，那么自然cxcy_sample也是1*1的。
        然后求ij的时候是cxcy_sample*7，所以ij是7*7，
        接着求xy是ij/7，所以，xy为1*1
        最后求delta_xy的时候是(cxcy_sample - xy)*7，并且保存的也是delta_xy，那么自然也就是7*7的，这里理解很重要
        """
        return target   # (xc,yc) = 7*7   (w,h) = 1*1

    # 以下方法都是数据增强操作

    def BGR2RGB(self, img):
        """Convert an image from BGR to RGB channel order with cv2.cvtColor."""
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    def BGR2HSV(self, img):# convert BGR to HSV
        """Convert a BGR image to the HSV colour space with cv2.cvtColor."""
        return cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    def HSV2BGR(self, img):
        """Convert an HSV image back to BGR with cv2.cvtColor."""
        return cv2.cvtColor(img, cv2.COLOR_HSV2BGR)

    def RandomBrightness(self, bgr):# random brightness change
        """
        With probability 0.5 scale the brightness of a BGR image by 0.5 or 1.5.

        The change is made on the V channel in HSV space so that hue and
        saturation stay untouched; the scaled values are clipped to [0, 255]
        and cast back to the dtype of the HSV image.
        """
        if random.random() >= 0.5:
            return bgr
        hsv_img = self.BGR2HSV(bgr)
        hue, sat, val = cv2.split(hsv_img)
        factor = random.choice([0.5, 1.5])
        val = np.clip(val * factor, 0, 255).astype(hsv_img.dtype)
        return self.HSV2BGR(cv2.merge((hue, sat, val)))

    def RandomSaturation(self, bgr):
        """
        With probability 0.5 scale the saturation of a BGR image by 0.5 or 1.5.

        Only the S channel of the HSV representation is changed; the scaled
        values are clipped to [0, 255] and cast back to the HSV image dtype.
        """
        if random.random() >= 0.5:
            return bgr
        hsv_img = self.BGR2HSV(bgr)
        hue, sat, val = cv2.split(hsv_img)
        factor = random.choice([0.5, 1.5])
        sat = np.clip(sat * factor, 0, 255).astype(hsv_img.dtype)
        return self.HSV2BGR(cv2.merge((hue, sat, val)))

    def RandomHue(self, bgr):
        """
        With probability 0.5 scale the hue of a BGR image by 0.5 or 1.5.

        Only the H channel of the HSV representation is changed. The scaled
        values are clipped to [0, 179] because OpenCV stores the hue of 8-bit
        images as degrees / 2; clipping at 255 would leave values outside the
        valid hue range.
        NOTE(review): assumes an 8-bit input image - confirm if float images
        are ever passed in.
        """
        if random.random() < 0.5:
            hsv = self.BGR2HSV(bgr)
            h, s, v = cv2.split(hsv)
            adjust = random.choice([0.5, 1.5])
            h = h * adjust
            h = np.clip(h, 0, 179).astype(hsv.dtype)
            hsv = cv2.merge((h, s, v))
            bgr = self.HSV2BGR(hsv)
        return bgr

    def randomBlur(self, bgr):# random blur
        """
        With probability 0.5 return the image smoothed by cv2.blur with a fixed 5x5 kernel.

        cv2.blur is a plain mean filter: every pixel is replaced by the
        average of its kernel window, which removes noise but also detail.
        A larger kernel would blur more strongly.
        """
        apply_blur = random.random() < 0.5
        return cv2.blur(bgr, (5, 5)) if apply_blur else bgr

    def randomShift(self, bgr, boxes, labels):# 平移转换
        """
        主要是对输入的图像进行随机的平移变换，并且相应的更新图像中的目标框的位置，同时它还会处理平移后可能超过图像边界的情况，以及更新目标框的位置确保它们
        仍然位于图像的有效区域
        """
        """
        这里计算的是每一个bbox的center，首先boxes是一个二维数组，所以第一个冒号是取了二维数组里面所有的元素，
        而2：表示从每一个元素里面的第三第四列，也就是xmax和ymax，而：2表示取第一第二列，也就是xmin和ymin，如此计算得到center=（xcenter，ycenter）
        """
        center = (boxes[:, 2:] + boxes[:, :2]) / 2
        if random.random() < 0.5:
            height, width, c = bgr.shape
            # 创建一个与原图相同大小和类型的全零图像，并用特定的BGR值（104, 117, 123）填充，这个值通常用于图像预处理中的均值归一化。
            after_shift_image = np.zeros((height, width, c), dtype=bgr.dtype)
            after_shift_image[:, :, :] = (104, 117, 123)  # bgr
            # 随机生成水平或者垂直方向上的平移量
            shift_x = random.uniform(-width * 0.2, width * 0.2)
            shift_y = random.uniform(-height * 0.2, height * 0.2)
            # print(bgr.shape,shift_x,shift_y)
            # 原图像的平移
            # 根据平移量的正负，分别处理图像的不同部分，分四种情况处理
            if shift_x >= 0 and shift_y >= 0:# 右下
                # 这里要注意，这里的注释里面括号是(y，x)，不是（x，y）
                # 填充，偏移后需要填充的部分是(shift_y,shift_x)到(height,width),用的是原始图像的（0,0）到（height - int(shift_y)，width - int(shift_x)）填充
                after_shift_image[int(shift_y):,int(shift_x):,:] = bgr[:height - int(shift_y),:width - int(shift_x),:]
            elif shift_x >= 0 and shift_y < 0:# 右上
                # 填充，偏移后需要填充的部分是(0，height + int(shift_y))（这里int(shift_y)是个负数），原始图像是（-int(shift_y)，0）到（height，width - int(shift_x)）
                after_shift_image[:height + int(shift_y),int(shift_x):,:] = bgr[-int(shift_y):,:width - int(shift_x),:]
            elif shift_x < 0 and shift_y >= 0:# 左下
                after_shift_image[int(shift_y):, :width +int(shift_x), :] = bgr[:height -int(shift_y), -int(shift_x):, :]
            elif shift_x < 0 and shift_y < 0:# 左上
                after_shift_image[:height + int(shift_y), :width + int(shift_x), :] = bgr[-int(shift_y):, -int(shift_x):, :]

            # 扩展后的center
            shift_xy = torch.FloatTensor([[int(shift_x), int(shift_y)]]).expand_as(center)
            center = center + shift_xy
            # 检查中心点的坐标是否在图像的边界内
            mask1 = (center[:, 0] > 0) & (center[:, 0] < width)
            mask2 = (center[:, 1] > 0) & (center[:, 1] < height)
            """
            view是维度变换
            这里可以看到boxes是m*4的，m是box的数量，4是坐标的数量，labels是m维的，所以view将mask展平成m维的，实际上是（m，1）
            """
            mask = (mask1 & mask2).view(-1, 1)
            # mask.expand_as(boxes)的操作等同于mask.squeeze(1)，这里操作后，boxes_in只会包含到mask中维true的部分，得到一个新的张量，也就是包含了中心的bbox会被保留
            boxes_in = boxes[mask.expand_as(boxes)].view(-1, 4)
            if len(boxes_in) == 0:# 如果变换后没有包含任何的标注框，那么就不变换，返回原始的图片、bbox和labels
                return bgr, boxes, labels
            # 变换标注框的坐标
            box_shift = torch.FloatTensor([[int(shift_x), int(shift_y), int(shift_x), int(shift_y)]]).expand_as(boxes_in)
            boxes_in = boxes_in + box_shift
            # 保留还剩余的标注框的label
            labels_in = labels[mask.view(-1)]
            return after_shift_image, boxes_in, labels_in
        return bgr, boxes, labels

    def randomScale(self, bgr, boxes):# random horizontal stretch
        """
        With probability 0.5 stretch the image width by a factor in [0.8, 1.2].

        The height stays fixed. The x coordinates of the boxes are multiplied
        by the same factor, the y coordinates by 1.

        Returns:
            Tuple (image, boxes).
        """
        if random.random() >= 0.5:
            return bgr, boxes
        factor = random.uniform(0.8, 1.2)
        rows, cols, _ = bgr.shape
        resized = cv2.resize(bgr, (int(cols * factor), rows))
        # The (1, 4) factor row is broadcast over all boxes
        stretched = boxes * torch.FloatTensor([[factor, 1, factor, 1]])
        return resized, stretched

    def randomCrop(self, bgr, boxes, labels):
        if random.random() < 0.5:
            center = (boxes[:, 2:] + boxes[:, :2]) / 2
            height, width, c = bgr.shape
            h = random.uniform(0.6 * height, height)
            w = random.uniform(0.6 * width, width)
            x = random.uniform(0, width - w)
            y = random.uniform(0, height - h)
            x, y, h, w = int(x), int(y), int(h), int(w)

            center = center - torch.FloatTensor([[x, y]]).expand_as(center)
            mask1 = (center[:, 0] > 0) & (center[:, 0] < w)
            mask2 = (center[:, 1] > 0) & (center[:, 1] < h)
            mask = (mask1 & mask2).view(-1, 1)

            boxes_in = boxes[mask.expand_as(boxes)].view(-1, 4)
            if (len(boxes_in) == 0):
                return bgr, boxes, labels
            box_shift = torch.FloatTensor([[x, y, x, y]]).expand_as(boxes_in)

            boxes_in = boxes_in - box_shift
            boxes_in[:, 0] = boxes_in[:, 0].clamp_(min=0, max=w)
            boxes_in[:, 2] = boxes_in[:, 2].clamp_(min=0, max=w)
            boxes_in[:, 1] = boxes_in[:, 1].clamp_(min=0, max=h)
            boxes_in[:, 3] = boxes_in[:, 3].clamp_(min=0, max=h)

            labels_in = labels[mask.view(-1)]
            img_croped = bgr[y:y + h, x:x + w, :]
            return img_croped, boxes_in, labels_in
        return bgr, boxes, labels

    def subMean(self, bgr, mean):# 减掉均值
        mean = np.array(mean, dtype=np.float32)
        bgr = bgr - mean
        return bgr

    def random_flip(self, im, boxes):# 随机翻转
        """
        给定的图像 im 和对应的边界框坐标 boxes 进行随机水平翻转。如果随机生成的数小于0.5（即有一半的概率），
        则执行翻转操作，并相应地调整边界框的坐标以反映图像的变化。
        """
        if random.random() < 0.5:
            # 使用 np.fliplr(im).copy() 对图像进行水平翻转，并复制结果以避免修改原始图像
            im_lr = np.fliplr(im).copy()
            h, w, _ = im.shape# h w c
            xmin = w - boxes[:, 2]# w-xmax
            xmax = w - boxes[:, 0]# w-xmin
            boxes[:, 0] = xmin
            boxes[:, 2] = xmax
            return im_lr, boxes
        return im, boxes

    def random_bright(self, im, delta=16):
        alpha = random.random()
        if alpha > 0.3:
            im = im * alpha + random.randrange(-delta, delta)
            im = im.clip(min=0, max=255).astype(np.uint8)
        return im


def main():
    """
    Smoke test of the dataset: print the shape of every image batch and its target.

    The training list 'voctrain.txt' is loaded with augmentation enabled and
    wrapped in a DataLoader that yields batches of two samples in file order
    (shuffle=False), dropping an incomplete last batch.
    """
    image_dir = 'VOCdevkit/VOC2007/JPEGImages/'
    dataset = yoloDataset(
        img_root=image_dir,
        list_file='voctrain.txt',
        train=True,
        transform=[ToTensor()])
    # shuffle=False keeps the sample order of the list file; set it to True to
    # get a different order in every epoch.
    loader = DataLoader(
        dataset,
        batch_size=2,
        drop_last=True,
        shuffle=False,
        num_workers=0)

    # Alternative: wrap the loader in iter(...) and pull batches with next(...);
    # next() raises StopIteration once all batches have been consumed.
    for batch_img, batch_target in loader:
        print(batch_img.shape)
        print(batch_target)


# Script entry point.
# NOTE: the working directory below is hard-coded; adjust it to the local checkout,
# because the relative paths used in main() are resolved against it.
if __name__ == '__main__':
    os.chdir('/root/workspace/YOLOV1-pytorch/')
    main()

yoloLoss.py

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import warnings

warnings.filterwarnings('ignore')  # suppress all warning messages
CLASS_NUM = 20    # number of classes (change when using your own dataset)

class yoloLoss(nn.Module):
    def __init__(self, S, B, l_coord, l_noobj):
        # 一般而言 l_coord = 5 ， l_noobj = 0.5
        super(yoloLoss, self).__init__()
        self.S = S  # S = 7
        self.B = B  # B = 2
        self.l_coord = l_coord
        self.l_noobj = l_noobj

    def compute_iou(self, box1, box2):  # box1(2,4)  box2(1,4)
        """
        这里要注意，box1包括两个边界框，box2只包括一个边界框，所以IOU的结果是box1的边界框分别与box2的边界框进行IOU操作，所以返回的IOU时2*1的
        """
        N = box1.size(0)  # 2
        M = box2.size(0)  # 1

        lt = torch.max(  # 返回张量所有元素的最大值
            # [N,2] -> [N,1,2] -> [N,M,2]
            box1[:, :2].unsqueeze(1).expand(N, M, 2),
            # [M,2] -> [1,M,2] -> [N,M,2]
            box2[:, :2].unsqueeze(0).expand(N, M, 2),
        )

        rb = torch.min(
            # [N,2] -> [N,1,2] -> [N,M,2]
            box1[:, 2:].unsqueeze(1).expand(N, M, 2),
            # [M,2] -> [1,M,2] -> [N,M,2]
            box2[:, 2:].unsqueeze(0).expand(N, M, 2),
        )

        # 求差值
        wh = rb - lt  # [N,M,2]
        """
        wh < 0：这是一个布尔索引操作，它会生成一个与原数组wh形状相同的布尔数组。在这个布尔数组中，所有对应于原数组中值小于0的位置的元素
        都会被设置为True，其余位置为False。
        """
        wh[wh < 0] = 0  # clip at 0，去除那些可能没有交集的框
        inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]  框重叠的部分的面积

        area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])  # [N,]
        area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])  # [M,]
        area1 = area1.unsqueeze(1).expand_as(inter)  # [N,] -> [N,1] -> [N,M]
        area2 = area2.unsqueeze(0).expand_as(inter)  # [M,] -> [1,M] -> [N,M]

        iou = inter / (area1 + area2 - inter) # iou=交集/并集
        return iou  # [2,1]
    """
    传入的两个参数格式为(batch_size*7*7*30)的张量，前者将图片出入神经网络得到的输出值，
    后者就是yoloData制作的target也就是ground truth。需要提取ground truth与pred_target的bbox信息，置信度信息
    以及类别信息，求取损失函数。这里有五个损失，都是在YOLOv1原文中定义的
    """
    def forward(self, pred_tensor, target_tensor):
        '''
        pred_tensor: (tensor) size(batchsize,7,7,30)
        target_tensor: (tensor) size(batchsize,7,7,30) --- ground truth
        '''
        N = pred_tensor.size()[0]  # batchsize
        coo_mask = target_tensor[:, :, :, 4] > 0  # 具有目标标签的索引值 true ，batchsize*7*7
        noo_mask = target_tensor[:, :, :, 4] == 0  # 不具有目标的标签索引值 false ，batchsize*7*7
        # unsqueeze(-1)：在最后面增加一个维度，expand_as：将原本的张量扩充，一般是将通道数扩充，扩充的部分就是将原来的部分复制粘贴。
        coo_mask = coo_mask.unsqueeze(-1).expand_as(target_tensor)  # 得到含物体的坐标等信息,复制粘贴 batchsize*7*7*30
        noo_mask = noo_mask.unsqueeze(-1).expand_as(target_tensor)  # 得到不含物体的坐标等信息 batchsize*7*7*30

        """
        首先选择pred_tensor中coo_mask为True的元素。这通常用于从大量预测中筛选出有效的或感兴趣的预测。
        然后，使用.view(-1, int(CLASS_NUM + 10))将这些选出的元素重新塑形。-1表示该维度的大小将自动计算，以保持总元素数量不变。
        int(CLASS_NUM + 10)指定了第二维的大小，这里假设每个预测包含CLASS_NUM个类别预测加上额外的10个值（可能是边界框的坐标或其他属性）。
        """
        coo_pred = pred_tensor[coo_mask].view(-1, int(CLASS_NUM + 10))  # view类似于reshape
        # .contiguous()确保这些元素在内存中是连续的，这对于某些PyTorch操作是必需的，尤其是在重塑（reshape）或转换设备（如CPU到GPU）时。
        box_pred = coo_pred[:, :10].contiguous().view(-1, 5)  # 塑造成X行5列（-1表示自动计算），一个box包含5个值
        class_pred = coo_pred[:, 10:]  # [n_coord, 20]

        coo_target = target_tensor[coo_mask].view(-1, int(CLASS_NUM + 10))
        box_target = coo_target[:, :10].contiguous().view(-1, 5)
        class_target = coo_target[:, 10:]

        # 不包含物体grid ceil的置信度损失
        noo_pred = pred_tensor[noo_mask].view(-1, int(CLASS_NUM + 10))
        noo_target = target_tensor[noo_mask].view(-1, int(CLASS_NUM + 10))
        """
        创建了一个名为noo_pred_mask的PyTorch张量（tensor），它的数据类型是torch.cuda.ByteTensor，并且其形状（size）与另一个张量noo_pred相同。
        将这个新创建的张量转换为布尔类型（bool），以便它可以被用作掩码（mask）来索引或筛选其他张量。
        调用.zero_()方法将noo_pred_mask中的所有元素初始化为0（在布尔上下文中，0被视为False）。
        """
        noo_pred_mask = torch.cuda.ByteTensor(noo_pred.size()).bool()
        noo_pred_mask.zero_()

        # YOLOv1原文提到，如果不负责预测物体的noobj为1，负责预测物体的noobj为0
        noo_pred_mask[:, 4] = 1
        noo_pred_mask[:, 9] = 1

        # 只会留下noo_pred_mask被置位为1的数字，也就是只有第4个位置和第9个位置的数字会被留下来，其他都为0，方便使用均方损失计算置信度的损失
        noo_pred_c = noo_pred[noo_pred_mask]  # noo pred只需要计算 c 的损失 size[-1,2]
        noo_target_c = noo_target[noo_pred_mask]
        nooobj_loss = F.mse_loss(noo_pred_c, noo_target_c, size_average=False)  # 均方误差

        # compute contain obj loss
        coo_response_mask = torch.cuda.ByteTensor(box_target.size()).bool()  # ByteTensor 构建Byte类型的tensor元素全为0
        coo_response_mask.zero_()  # 全部元素置False                            bool:将其元素转变为布尔值

        no_coo_response_mask = torch.cuda.ByteTensor(box_target.size()).bool()  # ByteTensor 构建Byte类型的tensor元素全为0
        no_coo_response_mask.zero_()  # 全部元素置False                            bool:将其元素转变为布尔值

        box_target_iou = torch.zeros(box_target.size()).cuda()

        # box1 = 预测框  box2 = ground truth
        for i in range(0, box_target.size()[0], 2):  # box_target.size()[0]：有多少bbox，并且一次取两个bbox
            box1 = box_pred[i:i + 2]  # 第一个grid ceil对应的两个bbox，取的是 i 和 i+1，2*5
            box1_xyxy = Variable(torch.FloatTensor(box1.size()))
            # 这个是求bbox的左上角和右下角坐标，也就是xmin,ymin,xmax,ymax
            box1_xyxy[:, :2] = box1[:, :2] / float(self.S) - 0.5 * box1[:, 2:4]  # 原本(xc,yc)为7*7 所以要除以7
            box1_xyxy[:, 2:4] = box1[:, :2] / float(self.S) + 0.5 * box1[:, 2:4]

            box2 = box_target[i].view(-1, 5)# 因为对于真实值，两个bbox的值是完全一样的
            box2_xyxy = Variable(torch.FloatTensor(box2.size()))
            box2_xyxy[:, :2] = box2[:, :2] / float(self.S) - 0.5 * box2[:, 2:4]
            box2_xyxy[:, 2:4] = box2[:, :2] / float(self.S) + 0.5 * box2[:, 2:4]

            iou = self.compute_iou(box1_xyxy[:, :4], box2_xyxy[:, :4]) # 2*1
            max_iou, max_index = iou.max(0)
            max_index = max_index.data.cuda()
            coo_response_mask[i + max_index] = 1  # IOU最大的bbox
            no_coo_response_mask[i + 1 - max_index] = 1  # 舍去的bbox
            # confidence score = predicted box 与 the ground truth 的 IOU
            """
            torch.LongTensor([4]).cuda()：这里创建一个只包含单个元素4的Long类型张量，并将其移动到GPU上。
            """
            box_target_iou[i + max_index, torch.LongTensor([4]).cuda()] = max_iou.data.cuda()

        box_target_iou = Variable(box_target_iou).cuda()
        # 置信度误差（含物体的grid ceil的两个bbox与ground truth的IOU较大的一方）
        box_pred_response = box_pred[coo_response_mask].view(-1, 5)
        box_target_response_iou = box_target_iou[coo_response_mask].view(-1, 5)
        # IOU较小的一方
        no_box_pred_response = box_pred[no_coo_response_mask].view(-1, 5)
        no_box_target_response_iou = box_target_iou[no_coo_response_mask].view(-1, 5)
        no_box_target_response_iou[:, 4] = 0  # 保险起见置0（其实原本就是0）

        box_target_response = box_target[coo_response_mask].view(-1, 5)

        # 含物体grid ceil中IOU较大的bbox置信度损失
        contain_loss = F.mse_loss(box_pred_response[:, 4], box_target_response_iou[:, 4], size_average=False)
        # 含物体grid ceil中舍去的bbox损失
        no_contain_loss = F.mse_loss(no_box_pred_response[:, 4], no_box_target_response_iou[:, 4], size_average=False)
        # bbox坐标损失
        loc_loss = F.mse_loss(box_pred_response[:, :2], box_target_response[:, :2], size_average=False) + F.mse_loss(
            torch.sqrt(box_pred_response[:, 2:4]), torch.sqrt(box_target_response[:, 2:4]), size_average=False)

        # 类别损失
        class_loss = F.mse_loss(class_pred, class_target, size_average=False)

        return (self.l_coord * loc_loss + contain_loss + self.l_noobj * (nooobj_loss + no_contain_loss) + class_loss) / N

new_resnet.py

import torch
from torch.nn import Sequential, Conv2d, MaxPool2d, ReLU, BatchNorm2d
from torch import nn
from torch.utils import model_zoo

# model_urls = {'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth'}
# Despite the name, the active entry below is a local checkpoint path, not a URL.
model_urls = {'resnet50': '/root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth'}
CLASS_NUM = 20   # number of classes; must be changed when another dataset is used

class Bottleneck(nn.Module):  # basic residual block
    """
    Three-convolution residual block (1x1 -> 3x3 -> 1x1) with 4x channel expansion.

    Args:
        in_channel: number of input channels.
        out_channel: channels of the two inner convolutions; the block
            outputs ``out_channel * 4`` channels.
        stride: sequence of three strides, one per convolution; the first one
            is also used by the projection shortcut.
        downsample: ``False`` selects an identity shortcut, anything else a
            1x1 convolution + batch norm projection shortcut.
    """

    def __init__(self, in_channel, out_channel, stride, downsample):
        super().__init__()
        expanded = out_channel * 4
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.in_channel = in_channel
        self.out_channel = out_channel

        main_path = [
            # 1x1 reduction
            Conv2d(in_channel, out_channel, kernel_size=1, stride=stride[0], padding=0, bias=False),
            BatchNorm2d(out_channel),
            ReLU(inplace=True),
            # 3x3 convolution
            Conv2d(out_channel, out_channel, kernel_size=3, stride=stride[1], padding=1, bias=False),
            BatchNorm2d(out_channel),
            ReLU(inplace=True),
            # 1x1 expansion
            Conv2d(out_channel, expanded, kernel_size=1, stride=stride[2], padding=0, bias=False),
            BatchNorm2d(expanded),
        ]
        self.bottleneck = Sequential(*main_path)

        if self.downsample is not False:
            # Projection shortcut (conv block)
            self.shortcut = Sequential(
                Conv2d(self.in_channel, expanded, kernel_size=1, stride=stride[0], bias=False),
                BatchNorm2d(expanded)
            )
        else:
            # Identity shortcut (identity block)
            self.shortcut = Sequential()

    def forward(self, x):
        """Return relu(bottleneck(x) + shortcut(x))."""
        residual = self.bottleneck(x)
        residual += self.shortcut(x)  # residual connection
        return self.relu(residual)


class output_net(nn.Module):
    """
    Residual block of the detection head: 1x1 conv, dilated 3x3 conv, 1x1 conv.

    No channel expansion (``expansion = 1``); the 3x3 convolution uses
    dilation 2 with padding 2. The shortcut is a 1x1 convolution + batch norm
    when the stride is not 1, the channel counts differ, or `block_type` is
    'B'; otherwise it is the identity.
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, block_type='A'):
        super().__init__()
        out_planes = self.expansion * planes
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=2, bias=False, dilation=2)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        needs_projection = stride != 1 or in_planes != out_planes or block_type == 'B'
        if needs_projection:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes))
        else:
            # An empty Sequential returns its input unchanged
            self.downsample = nn.Sequential()
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return relu(main_path(x) + downsample(x))."""
        y = self.conv1(x)
        y = self.relu(self.bn1(y))
        y = self.conv2(y)
        y = self.relu(self.bn2(y))
        y = self.bn3(self.conv3(y))
        y += self.downsample(x)
        return self.relu(y)


class ResNet50(nn.Module):
    """ResNet-50 style backbone followed by a YOLOv1 prediction head.

    ``layer0`` is the stem, ``layer1``..``layer4`` are the residual stages
    built from ``block``, ``layer5`` is a stack of three ``output_net``
    blocks, and ``conv_end``/``bn_end`` produce ``CLASS_NUM + 10`` channels
    per grid cell.

    Submodules are registered in a fixed order (layer0..layer5, avgpool,
    conv_end, bn_end); keep it that way, since ``state_dict()`` ordering
    follows registration order.

    Args:
        block: residual block class used for layer1..layer4; it is called as
            ``block(in_channels, mid_channels, stride_list, downsample=...)``.
    """

    def __init__(self, block):
        super(ResNet50, self).__init__()
        self.block = block  # e.g. Bottleneck

        # Stem: 3 -> 64 channels, 7x7 stride-2 conv, BN, ReLU, 3x3 stride-2 max-pool.
        stem = (
            Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            BatchNorm2d(64),
            ReLU(inplace=True),
            MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        self.layer0 = Sequential(*stem)

        # (attribute name, [input channels, bottleneck channels],
        #  strides of the first block, number of blocks)
        stage_specs = (
            ('layer1', [64, 64], [1, 1, 1], 3),
            ('layer2', [256, 128], [2, 1, 1], 4),
            ('layer3', [512, 256], [2, 1, 1], 6),
            ('layer4', [1024, 512], [2, 1, 1], 3),
        )
        for attr_name, channel, first_stride, repeats in stage_specs:
            stage = self.make_layer(
                self.block,
                channel=channel,
                stride1=first_stride,
                stride2=[1, 1, 1],
                n_re=repeats)
            setattr(self, attr_name, stage)

        self.layer5 = self._make_output_layer(in_channels=2048)  # 2048 -> 256 channels
        self.avgpool = nn.AvgPool2d(2)  # kernel_size 2; stride defaults to the kernel size
        head_channels = int(CLASS_NUM + 10)
        self.conv_end = nn.Conv2d(
            256, head_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn_end = nn.BatchNorm2d(head_channels)

    def make_layer(self, block, channel, stride1, stride2, n_re):
        """Build one residual stage of ``n_re`` blocks.

        The first block maps ``channel[0]`` input channels to
        ``channel[1] * 4`` outputs with strides ``stride1`` and
        ``downsample=True``; the remaining blocks keep ``channel[1] * 4``
        channels, use ``stride2`` and ``downsample=False``.
        """
        blocks = [
            block(channel[0], channel[1], stride1, downsample=True)
            if position == 0
            else block(channel[1] * 4, channel[1], stride2, downsample=False)
            for position in range(n_re)
        ]
        return Sequential(*blocks)

    def _make_output_layer(self, in_channels):
        """Build the head: one type-'B' ``output_net`` then two type-'A' ones, all 256-wide."""
        blocks = [output_net(in_planes=in_channels, planes=256, block_type='B')]
        for _ in range(2):
            blocks.append(output_net(in_planes=256, planes=256, block_type='A'))
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Map an image batch to per-cell predictions laid out channel-last.

        Shapes noted by the original author for a 3x448x448 input:
        layer0 64x112x112, layer1 256x112x112, layer2 512x56x56,
        layer3 1024x28x28, layer4 2048x14x14, layer5 256x14x14,
        avgpool 256x7x7, conv_end 30x7x7 (with CLASS_NUM = 20).

        Returns:
            Tensor permuted to (batch, 7, 7, CLASS_NUM + 10) for that input
            size, with every value squashed into (0, 1) by a sigmoid.
        """
        stages = (self.layer0, self.layer1, self.layer2,
                  self.layer3, self.layer4, self.layer5)
        features = x
        for stage in stages:
            features = stage(features)
        features = self.avgpool(features)
        logits = self.bn_end(self.conv_end(features))
        # Move channels last: (batch, C, H, W) -> (batch, H, W, C).
        return torch.sigmoid(logits).permute(0, 2, 3, 1)


def resnet50(pretrained=False):
    """Build the custom ``ResNet50`` detector, optionally seeding the backbone.

    Args:
        pretrained: when true, download the reference ResNet-50 weights via
            ``model_zoo.load_url(model_urls['resnet50'])`` and copy them into
            the backbone.

    Returns:
        The constructed ``ResNet50`` model.

    The parameter names of this network differ from the reference
    checkpoint, and this network has a custom head instead of the ``fc``
    layer, so a direct strict ``load_state_dict`` of the downloaded weights
    raises on missing/unexpected keys. The tensors are therefore paired by
    position and copied until the first shape mismatch, which is where the
    two architectures diverge.
    """
    model = ResNet50(Bottleneck)
    if pretrained:
        pretrained_state = model_zoo.load_url(model_urls['resnet50'])
        # To use a local checkpoint instead: pretrained_state = torch.load('resnet50.pth')
        own_state = model.state_dict()
        for own_key, tensor in zip(list(own_state.keys()), pretrained_state.values()):
            if own_state[own_key].shape != tensor.shape:
                break  # reached the custom head; nothing further is shared
            own_state[own_key] = tensor
        model.load_state_dict(own_state)
    return model


if __name__ == '__main__':
    # Print every state_dict() key of a freshly built model: the names of all
    # parameters plus buffers such as the batch-norm running mean/variance.
    for state_key in resnet50().state_dict():
        print(state_key)

train.py

from yoloData import yoloDataset
from yoloLoss import yoloLoss
from new_resnet import resnet50
from torchvision import models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch
import os

# Switch the working directory; the relative paths below are resolved against it.
os.chdir('/root/workspace/YOLOV1-pytorch')


# Training configuration.
device = 'cuda'
file_root = 'VOCdevkit/VOC2007/JPEGImages/'
batch_size = 2   # can be raised (4, 8, 16, 32, ...) when more GPU memory is available
learning_rate = 0.001
num_epochs = 100

# Both loaders use the same settings: shuffled batches, no worker processes.
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=0)

train_dataset = yoloDataset(
    img_root=file_root,
    list_file='voctrain.txt',
    train=True,
    transform=[transforms.ToTensor()],
)
train_loader = DataLoader(train_dataset, **loader_options)
test_dataset = yoloDataset(
    img_root=file_root,
    list_file='voctest.txt',
    train=False,
    transform=[transforms.ToTensor()],
)
test_loader = DataLoader(test_dataset, **loader_options)
print('the train_dataset has %d images' % len(train_dataset))

"""
因之前定义的网络后面一部分与ResNet50的结构略有差异，所以并不能完全使用使用torchvision的models库中的resnet50导入权重参数。
需要对其权重参数进行一定的筛选。

权重参数导入方法：自己定义的网络以及models库内的网络各自创建一个对象。接着使用state_dict()导入各自的权重参数。
网络结构相同的部分将new_state_dict的值赋给op。但是如果自己定义的网络结构的键值与torch自带的库不一致的话，导入权重参数会稍微麻烦一点。
这里给出了一种解决办法，具体参考代码。

"""
net = resnet50()  # custom network from new_resnet.py
net = net.cuda()
resnet = models.resnet50(pretrained=True)  # torchvision reference network
new_state_dict = resnet.state_dict()
op = net.state_dict()

# The two networks do not share key names, so a name-based copy
# ("if k in op and not k.startswith('fc')") would transfer nothing. The
# entries are paired by position instead.
#
# Per the original author's count the torchvision state dict has 320 entries,
# the last two being the fully connected layer, which is not wanted. Only
# positions 0..317 are copied; zip() also stops early if either dict is
# shorter than that.
for op_key, new_state_dict_value in zip(list(op.keys())[:318], new_state_dict.values()):
    op[op_key] = new_state_dict_value
net.load_state_dict(op)  # push the edited state dict back into the network

# Show the current CUDA device index and how many devices are visible.
print('cuda', torch.cuda.current_device(), torch.cuda.device_count())

# NOTE(review): the arguments are presumably grid size 7, 2 boxes per cell and
# the 5 / 0.5 loss weights of YOLOv1 -- confirm against yoloLoss.
criterion = yoloLoss(7, 2, 5, 0.5).to(device)
net.train()  # put the network in training mode

# One optimizer parameter group per named parameter, each starting at the
# base learning rate; the training loop later rewrites 'lr' on every group.
params_dict = dict(net.named_parameters())
params = [
    {'params': [tensor], 'lr': learning_rate}
    for tensor in params_dict.values()
]

# SGD with momentum 0.9 and weight decay 5e-4.
sgd_options = {'lr': learning_rate, 'momentum': 0.9, 'weight_decay': 5e-4}
optimizer = torch.optim.SGD(params, **sgd_options)

# Lowest validation loss seen so far; a checkpoint is written only when an
# epoch improves on it.
best_test_loss = float('inf')

for epoch in range(num_epochs):
    net.train()
    # Step schedule: 1e-4 from (zero-based) epoch 60, 1e-5 from epoch 80.
    # torch.optim.lr_scheduler could be used for a smoother decay.
    if epoch == 60:
        learning_rate = 0.0001
    if epoch == 80:
        learning_rate = 0.00001
    # Every parameter group is a dict of optimizer options; overwrite 'lr' on all of them.
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate
    print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs))
    print('Learning Rate for this epoch: {}'.format(learning_rate))

    # ---- Training phase ----
    total_loss = 0.
    for i, (images, target) in enumerate(train_loader):
        images, target = images.cuda(), target.cuda()
        pred = net(images)              # forward pass
        loss = criterion(pred, target)  # loss for this batch
        total_loss += loss.item()       # running sum for the average

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % 5 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f, average_loss: %.4f' % (epoch +1, num_epochs,
                                                                                 i + 1, len(train_loader), loss.item(), total_loss / (i + 1)))

    # ---- Validation phase ----
    # The accumulator starts at zero so the mean is not biased, and gradients
    # are disabled because no backward pass happens here.
    validation_loss = 0.0
    net.eval()
    with torch.no_grad():
        for i, (images, target) in enumerate(test_loader):
            images, target = images.cuda(), target.cuda()
            pred = net(images)
            loss = criterion(pred, target)
            validation_loss += loss.item()
    validation_loss /= len(test_loader)  # mean loss per batch
    print('validation loss %.5f' % validation_loss)

    # Keep only the best-performing weights in yolo.pth.
    if validation_loss < best_test_loss:
        best_test_loss = validation_loss
        print('get best test loss %.5f' % best_test_loss)
        torch.save(net.state_dict(), 'yolo.pth')

predict.py

import numpy as np
import torch
import cv2
from torchvision.transforms import ToTensor
from new_resnet import resnet50
import os

# Switch the working directory; the relative paths below are resolved against it.
os.chdir('/root/workspace/YOLOV1-pytorch')

img_root = "VOCdevkit/VOC2007/test/1.jpg"   # path of the image to run detection on (edit as needed)
model = resnet50()
# The checkpoint is written by train.py from a CUDA model. map_location='cpu'
# lets it load on machines without a GPU; inference below runs on the CPU.
model.load_state_dict(torch.load("yolo.pth", map_location='cpu'))   # checkpoint path (edit as needed)
model.eval()
confident = 0.2   # minimum class confidence for a box to be kept
iou_con = 0.4     # IoU at or above which a lower-scoring box is suppressed

VOC_CLASSES = (
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')  # replace with your own class names for a custom dataset
CLASS_NUM = len(VOC_CLASSES)   # 20


# The network output is 7*7*30 with every value in the range 0-1.
class Pred():
    """Run the detector on one image, decode and filter the boxes, draw them.

    Args:
        model: network called as ``model(tensor)`` on a 1x3x448x448 input;
            ``Decode`` expects its output to squeeze to 7x7x(10 + classes).
        img_root: path of the image file to read.
    """

    def __init__(self, model, img_root):
        self.model = model
        self.img_root = img_root

    def result(self):
        """Detect objects in ``self.img_root`` and write ``result.jpg``.

        The image is resized to 448x448, converted to RGB and mean-subtracted
        before being fed to the model. Boxes that survive ``NMS`` are drawn,
        with their class name, on the resized BGR image, which is then saved
        as ``result.jpg`` in the working directory.

        Raises:
            FileNotFoundError: if the image cannot be read.
        """
        img = cv2.imread(self.img_root)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError('cannot read image: %s' % self.img_root)
        h, w, _ = img.shape
        print(h, w)
        image = cv2.resize(img, (448, 448))
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        mean = (123, 117, 104)  # RGB
        img = img - np.array(mean, dtype=np.float32)
        transform = ToTensor()
        img = transform(img)
        img = img.unsqueeze(0)  # the model expects a 4-D batch
        with torch.no_grad():   # inference only: no autograd graph needed
            Result = self.model(img)   # 1*7*7*30
        bbox = self.Decode(Result)
        bboxes = self.NMS(bbox)    # list of 6-element rows, coordinates in grid units
        if len(bboxes) == 0:
            print("未识别到任何物体")
            print("尝试减小 confident 以及 iou_con")
            print("也可能是由于训练不充分，可在训练时将epoch增大")
        for box in bboxes:
            # One grid cell is 448 / 7 = 64 pixels of the resized image.
            # Row layout from NMS is [xmin, xmax, ymin, ymax, score, class].
            x1, x2, y1, y2 = (box[:4] * 64).tolist()
            class_name = box[5].item()
            print(x1, x2, y1, y2, VOC_CLASSES[int(class_name)])

            # Box outline; the colour tuple is in BGR order.
            cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (144, 144, 255))
            # Class label in black near the top-left corner of the box.
            cv2.putText(image, VOC_CLASSES[int(class_name)],(int(x1), int(y1)+5),cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), thickness=1)

        # cv2.imshow('img', image) / cv2.waitKey(0) can be used to view interactively.
        cv2.imwrite('result.jpg',image)

    def Decode(self, result):  # result -> 1*7*7*30
        """Pick the more confident of the two boxes predicted in each cell.

        Each cell holds two boxes of five values (indices 0-4 and 5-9, the
        last of each being its confidence) followed by the class scores from
        index 10 on.

        Returns:
            A 7x7x6 tensor: the four coordinates of the chosen box, then the
            class confidence (best class score times box confidence), then
            the index of the best class.
        """
        result = result.squeeze()   # 7*7*30
        grid_ceil1 = result[:, :, 4].unsqueeze(2)  # 7*7*1
        grid_ceil2 = result[:, :, 9].unsqueeze(2)
        grid_ceil_con = torch.cat((grid_ceil1, grid_ceil2), 2)  # 7*7*2
        # Higher of the two box confidences per cell, and which box it came from.
        grid_ceil_con, grid_ceil_index = grid_ceil_con.max(2)    # 7*7
        # Best class score per cell and its index.
        class_p, class_index = result[:, :, 10:].max(2)   # 7*7
        class_confidence = class_p * grid_ceil_con   # 7*7
        bbox_info = torch.zeros(7, 7, 6)
        for i in range(0, 7):
            for j in range(0, 7):
                bbox_index = grid_ceil_index[i, j]
                # Copy the selected box: slice 0-5 for box 0, 5-10 for box 1.
                bbox_info[i, j, :5] = result[i, j, (bbox_index * 5):(bbox_index+1) * 5]
        # Slot 4 is overwritten with the class confidence, slot 5 gets the class id.
        bbox_info[:, :, 4] = class_confidence
        bbox_info[:, :, 5] = class_index
        return bbox_info  # 7*7*6

    def NMS(self, bbox, iou_con=iou_con):
        """Convert boxes to corner form and apply per-class non-maximum suppression.

        Args:
            bbox: 7x7x6 tensor from ``Decode``; it is modified in place.
            iou_con: IoU at or above which the lower-scoring box is dropped.
                Defaults to the module-level ``iou_con`` as it was when the
                class was defined.

        Returns:
            A list of 6-element tensors ``[xmin, xmax, ymin, ymax, score,
            class]`` with coordinates in grid-cell units (0-7).
        """
        for i in range(0, 7):
            for j in range(0, 7):
                # The centre is stored as an offset inside cell (row i, column j)
                # in cell units; width/height are fractions of the whole image,
                # hence the factor 7. Results may fall below 0.
                xmin = j + bbox[i, j, 0] - bbox[i, j, 2] * 7 / 2
                xmax = j + bbox[i, j, 0] + bbox[i, j, 2] * 7 / 2
                ymin = i + bbox[i, j, 1] - bbox[i, j, 3] * 7 / 2
                ymax = i + bbox[i, j, 1] + bbox[i, j, 3] * 7 / 2

                bbox[i, j, 0] = xmin
                bbox[i, j, 1] = xmax
                bbox[i, j, 2] = ymin
                bbox[i, j, 3] = ymax

        bbox = bbox.view(-1, 6)   # 49*6
        bboxes = []
        ori_class_index = bbox[:, 5]
        # Sort rows by class id so that each class forms one contiguous run.
        class_index, class_order = ori_class_index.sort(dim=0, descending=False)
        class_index = class_index.tolist()
        bbox = bbox[class_order, :]
        a = 0  # start row of the current class's run
        for i in range(0, CLASS_NUM):
            num = class_index.count(i)  # number of boxes predicted as class i
            if num == 0:
                continue
            x = bbox[a:a+num, :]
            score = x[:, 4]
            score_index, score_order = score.sort(dim=0, descending=True)
            y = x[score_order, :]   # boxes of this class, best score first
            # If even the best box is below the threshold, drop the whole class.
            if y[0, 4] >= confident:
                for k in range(0, num):
                    # Re-sort every round: suppressed boxes (score 0) sink to
                    # the end, so row k is always the next surviving box.
                    y_score = y[:, 4]
                    _, y_score_order = y_score.sort(dim=0, descending=True)
                    y = y[y_score_order, :]
                    if y[k, 4] > 0:
                        area0 = (y[k, 1] - y[k, 0]) * (y[k, 3] - y[k, 2])
                        for j in range(k+1, num):
                            area1 = (y[j, 1] - y[j, 0]) * (y[j, 3] - y[j, 2])
                            x1 = max(y[k, 0], y[j, 0])
                            x2 = min(y[k, 1], y[j, 1])
                            y1 = max(y[k, 2], y[j, 2])
                            y2 = min(y[k, 3], y[j, 3])
                            w = x2 - x1
                            h = y2 - y1
                            if w < 0 or h < 0:  # no overlap
                                w = 0
                                h = 0
                            inter = w * h
                            iou = inter / (area0 + area1 - inter)
                            # Suppress a box that overlaps the kept one too much,
                            # or whose own score is below the threshold.
                            if iou >= iou_con or y[j, 4] < confident:
                                y[j, 4] = 0
                # Whatever still has a positive score survived suppression.
                for mask in range(0, num):
                    if y[mask, 4] > 0:
                        bboxes.append(y[mask])
            a = num + a
        return bboxes


if __name__ == "__main__":
    # Bind the instance to its own name so the Pred class is not shadowed.
    predictor = Pred(model, img_root)
    predictor.result()