【计算机视觉】对图像目标做探索性数据分析_论文图像数据集训练前如何做数据分析-CSDN博客

本文链接：https://blog.csdn.net/weixin_44984705/article/details/133992164

文章目录

前言
一、数据准备
二、数据处理
总结

前言

对图像目标做探索性数据分析，有助于我们了解目标分布、明确训练过程中应该关注的重点，并为训练超参数的设置提供依据。本文适用于COCO格式的数据集分析。

一、数据准备

本文使用PCB缺陷检测数据集进行演示。
数据集下载链接
将数据整理成pandas表格形式：

from pycocotools.coco import COCO
import pandas as pd
import os.path as osp
import logging


def is_pic(img_name):
    """Return True when ``img_name`` ends in a supported image extension.

    The check is purely textual (the file is never opened) and is
    case-insensitive, so ``a.JPG``, ``a.Jpg`` and ``a.jpg`` are all accepted.
    A name that contains no ``.`` at all (e.g. ``"jpg"``) has no extension
    and is rejected.

    Args:
        img_name: File name or path of the candidate image.

    Returns:
        bool: True for jpeg/jpg/bmp/png extensions, False otherwise.
    """
    valid_suffix = {'jpeg', 'jpg', 'bmp', 'png'}
    # rpartition yields an empty separator when the name has no dot, which
    # distinguishes "jpg" (no extension) from "x.jpg".
    _, dot, suffix = img_name.rpartition('.')
    return bool(dot) and suffix.lower() in valid_suffix


if __name__ == '__main__':
    # Flatten a COCO annotation file into a CSV with one row per valid
    # bounding box. Columns: id, imagePath, imageWidth, imageHeight,
    # category (name), area (small/medium/large bucket) and bbox
    # ([x1, y1, x2, y2] clipped to the image).
    anno_file = r".\pcb\pcb_cocoanno\train.json"
    data_dir = r'.\pcb\images'
    columns = ['id', 'imagePath', 'imageWidth', 'imageHeight',
               'category', 'area', 'bbox']

    coco = COCO(anno_file)
    # Key the lookup by the COCO category id itself: annotations carry
    # 'category_id', which is not guaranteed to be a 0-based contiguous index.
    cat_id2name = {
        cat_id: coco.loadCats(cat_id)[0]['name']
        for cat_id in coco.getCatIds()
    }

    # Collect plain dicts and build the DataFrame once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    rows = []
    for img_id in sorted(coco.getImgIds()):
        img_anno = coco.loadImgs(img_id)[0]
        im_fname = osp.join(data_dir, img_anno['file_name'])
        if not is_pic(im_fname):
            continue
        im_w = float(img_anno['width'])
        im_h = float(img_anno['height'])
        ins_anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)

        for inst in coco.loadAnns(ins_anno_ids):
            # COCO boxes are [x, y, width, height]; convert to corner form
            # and clip to the image bounds.
            x, y, box_w, box_h = inst['bbox']
            x1 = max(0, x)
            y1 = max(0, y)
            x2 = min(im_w - 1, x1 + max(0, box_w))
            y2 = min(im_h - 1, y1 + max(0, box_h))

            if not (inst['area'] > 0 and x2 >= x1 and y2 >= y1):
                # Skip the row entirely so it cannot inherit 'area'/'bbox'
                # values left over from a previous annotation.
                logging.warning(
                    "Found an invalid bbox in annotations: "
                    "im_id: {}, area: {} x1: {}, y1: {}, x2: {}, y2: {}."
                    .format(img_id, float(inst['area']), x1, y1, x2, y2))
                continue

            # Size buckets use the 32*32 and 96*96 area thresholds.
            if inst['area'] < 32 * 32:
                area_bucket = "small"
            elif inst['area'] > 96 * 96:
                area_bucket = "large"
            else:
                area_bucket = "medium"

            # A fresh dict per annotation keeps rows independent.
            rows.append({
                'id': img_id,
                'imagePath': im_fname,
                'imageWidth': im_w,
                'imageHeight': im_h,
                'category': cat_id2name[inst['category_id']],
                'area': area_bucket,
                'bbox': [x1, y1, x2, y2],
            })

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv('./train_dataset.csv', index=False)

二、数据处理

1.图像尺寸分布

统计图像尺寸，并可视化出图像分布。

import os
import cv2
import numpy as np
import pandas as pd
import math

from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

def size_distribution(dataframe):
    """Scatter-plot image width against height, coloured by point density.

    One point is drawn per row of ``dataframe``. The figure is written to
    ``chart/size_distribution.png`` (the ``chart`` directory must already
    exist) and then shown.

    Args:
        dataframe: pandas DataFrame with numeric ``imageWidth`` and
            ``imageHeight`` columns.

    Raises:
        ValueError: If ``dataframe`` has no rows.
    """
    # Work on plain NumPy arrays so the positional indices returned by
    # argsort are never interpreted as index labels of a Series.
    x = dataframe['imageWidth'].to_numpy(dtype=float)
    y = dataframe['imageHeight'].to_numpy(dtype=float)
    if x.size == 0:
        raise ValueError('dataframe contains no rows to plot')

    xy = np.vstack([x, y])
    try:
        z = gaussian_kde(xy)(xy)
    except (np.linalg.LinAlgError, ValueError):
        # The KDE cannot be estimated from degenerate data (for example a
        # single row, or every image having the same size); fall back to a
        # uniform density so the plot is still produced.
        z = np.ones_like(x)

    # Sort the points by density, so that the densest points are plotted last
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]
    plt.figure(figsize=(10, 10))
    plt.scatter(x, y, c=z, s=5, cmap='Spectral_r')
    plt.tick_params(labelsize=15)

    # Use the same upper limit on both axes.
    xy_max = max(x.max(), y.max())
    plt.xlim(xmin=0, xmax=xy_max)
    plt.ylim(ymin=0, ymax=xy_max)

    plt.ylabel('height', fontsize=25)
    plt.xlabel('width', fontsize=25)

    plt.savefig('chart/size_distribution.png', dpi=120, bbox_inches='tight')
    plt.show()

if __name__ == '__main__':
    # Script entry point: make sure the output directory exists, configure
    # matplotlib fonts, load the annotation table and draw the size plot.
    if not os.path.exists('chart'):
        os.mkdir('./chart')
    plt.rcParams['font.sans-serif'] = ['SimHei']  # needed to render Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False  # needed to render the minus sign correctly
    # The two lines below were tab-indented in a space-indented block, which
    # Python rejects; they now use the same four-space indent.
    df = pd.read_csv('./train_dataset.csv')
    size_distribution(df)

在这里插入图片描述
由结果可知，图片尺寸的分布较为均匀。

2.不同类别的数量

在训练过程中，我们希望各类别的目标数量尽量均衡，使得模型不会偏向于某一类别。

import os
import cv2
import numpy as np
import pandas as pd
import math

from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


def number_of_category(dataframe):
    """Draw a bar chart of how many rows belong to each category.

    Bars are ordered from the most to the least frequent category. The
    figure is written to ``chart/number_of_category.png`` (the ``chart``
    directory must already exist) and then shown.

    Args:
        dataframe: pandas DataFrame with a ``category`` column.
    """
    # Count from the argument, not from a module-level ``df``, so the
    # function works on whatever frame the caller passes in.
    # value_counts() sorts by count in descending order and ignores missing
    # values, which would otherwise become a non-string bar label.
    counts = dataframe['category'].value_counts()

    plt.figure(figsize=(22, 10))

    plt.bar(counts.index.tolist(), counts.to_numpy(),
            facecolor='#1f77b4', edgecolor='k')

    plt.xticks(rotation=90)
    plt.tick_params(labelsize=15)
    plt.xlabel('标注类别', fontsize=20)
    plt.ylabel('目标数量', fontsize=20)

    plt.savefig('chart/number_of_category.png', dpi=120, bbox_inches='tight')

    plt.show()


if __name__ == '__main__':
    # Script entry point: create the output directory when it is missing,
    # configure matplotlib fonts, then plot the per-category counts.
    chart_dir_exists = os.path.exists('chart')
    if not chart_dir_exists:
        os.mkdir('./chart')
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],  # needed to render Chinese labels correctly
        'axes.unicode_minus': False,  # needed to render the minus sign correctly
    })
    df = pd.read_csv('./train_dataset.csv')
    number_of_category(df)

在这里插入图片描述

可以看出，不同类别的目标数量大致相等。

3.不同大小目标数量统计

目标检测常用的COCO检测指标会将目标分为小目标(<32x32)、中等目标(32x32~96x96)、大目标(>96x96)。我们可以事先统计目标大小，以便了解在训练和验证过程中应优先关注哪类目标。

# -*- coding: utf-8 -*-
import os
import cv2
import numpy as np
import pandas as pd
import math

from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


def num_of_area(dataframe):
    """Draw a bar chart of how many rows fall into each ``area`` bucket.

    Bars are ordered from the most to the least frequent bucket. The figure
    is written to ``chart/num_of_area.png`` (the ``chart`` directory must
    already exist) and then shown.

    Args:
        dataframe: pandas DataFrame with an ``area`` column holding the
            size label of each row.
    """
    # value_counts() sorts by count in descending order and ignores missing
    # values, which would otherwise become a non-string bar label.
    counts = dataframe['area'].value_counts()

    plt.figure(figsize=(11, 5))

    plt.bar(counts.index.tolist(), counts.to_numpy(),
            facecolor='#1f77b4', edgecolor='k')

    plt.xticks(rotation=90)
    plt.tick_params(labelsize=15)
    plt.xlabel('目标大小', fontsize=20)
    plt.ylabel('目标数量', fontsize=20)

    plt.savefig('chart/num_of_area.png', dpi=120, bbox_inches='tight')

    plt.show()


if __name__ == '__main__':
    # Script entry point: create the output directory when it is missing,
    # configure matplotlib fonts, then plot the per-size-bucket counts.
    chart_dir_exists = os.path.exists('chart')
    if not chart_dir_exists:
        os.mkdir('./chart')
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],  # needed to render Chinese labels correctly
        'axes.unicode_minus': False,  # needed to render the minus sign correctly
    })
    df = pd.read_csv('./train_dataset.csv')
    num_of_area(df)