前言
rcnn家族其实已经是“非常老”的算法了,最早提出是在2014年。但它是深度学习用于图像识别的开山之作,所以我打算从rcnn开始学习。网上对于网络和论文的讲解很详细,我这系列就不说理论了,主要是记录我的学习过程和写代码时踩过的坑。
一、bccd数据介绍
bccd血细胞数据集是一个比较老的数据集,也不大,可以在这下载 https://public.roboflow.com/object-detection/bccd
该数据集共有364张图像、三个类别:WBC(白细胞)、RBC(红细胞)和Platelets(血小板)。3个类别中共有4888个标签(有0个空示例)。下图是网站得到的可视化数据(三个类别细胞标注数量计数)。这个网站的教程里还有很多可视化,比如各个细胞分布的热点图等,这里就不全放了。
二、读取数据
1.读xml标签数据
xml的格式如下图所示,需要重点关注的是框起来的部分,从上往下分别是,图片名称和图片位置、图片size(416,416,3)、真实框的标签、真实框的difficult、真实框的位置。
为了读该xml文件,以下是一个相对框架式的代码:
import os
import numpy as np
import xml.etree.ElementTree as ET
CELL_NAMES = ['RBC', 'WBC', 'Platelets']


def get_cell_names():
    """Return a dict mapping each cell category name to an integer id.

    Ids follow the order of ``CELL_NAMES``, i.e.
    ``{'RBC': 0, 'WBC': 1, 'Platelets': 2}``.
    """
    return {name: idx for idx, name in enumerate(CELL_NAMES)}
# 获得数据集列表
def get_annotations(cname2cid, datadir):
    """Parse every ``.xml`` annotation file in *datadir* into a record dict.

    Args:
        cname2cid: Mapping from class name (the text of each ``<name>`` tag)
            to an integer class id.
        datadir: Directory containing the ``.xml`` files. The image path of
            each record is built as the annotation's file name with its
            ``.xml`` suffix replaced by ``.jpg``, inside the same directory.

    Returns:
        A list of dicts, one per annotation file that contains at least one
        ``<object>``. Keys: ``im_file``, ``h``, ``w``, ``is_crowd`` (all
        zeros), ``gt_class``, ``gt_bbox`` (float32, one row per object as
        centre-x, centre-y, width, height in pixels), ``gt_poly`` (empty
        list) and ``difficult``.

    Raises:
        KeyError: If an object's class name is not a key of *cname2cid*.
    """
    records = []
    # Sorted so the record order does not depend on the file system.
    for fname in sorted(os.listdir(datadir)):
        # Split on the last dot only, so names with any number of dots work
        # and non-annotation files are skipped instead of raising IndexError.
        stem, ext = os.path.splitext(fname)
        if ext != '.xml':
            continue

        fpath = os.path.join(datadir, fname)
        img_file = os.path.join(datadir, stem + '.jpg')
        tree = ET.parse(fpath)
        objs = tree.findall('object')
        size = tree.find('size')
        im_w = float(size.find('width').text)
        im_h = float(size.find('height').text)
        if not objs:
            # Annotation files without any object produce no record.
            continue

        gt_bbox = np.zeros((len(objs), 4), dtype=np.float32)
        gt_class = np.zeros((len(objs), ), dtype=np.int32)
        is_crowd = np.zeros((len(objs), ), dtype=np.int32)
        difficult = np.zeros((len(objs), ), dtype=np.int32)
        for i, obj in enumerate(objs):
            gt_class[i] = cname2cid[obj.find('name').text]
            # A missing or empty <difficult> tag is treated as 0.
            difficult[i] = int(obj.findtext('difficult') or 0)

            box = obj.find('bndbox')
            x1, y1, x2, y2 = (
                float(box.find(tag).text)
                for tag in ('xmin', 'ymin', 'xmax', 'ymax')
            )
            # Clamp the corners to the image.
            x1 = max(0, x1)
            y1 = max(0, y1)
            x2 = min(im_w - 1, x2)
            y2 = min(im_h - 1, y2)
            # Ground-truth boxes are stored in xywh (centre + size) format.
            gt_bbox[i] = [
                (x1 + x2) / 2.0,
                (y1 + y2) / 2.0,
                x2 - x1 + 1.,
                y2 - y1 + 1.,
            ]

        records.append({
            'im_file': img_file,
            'h': im_h,
            'w': im_w,
            'is_crowd': is_crowd,
            'gt_class': gt_class,
            'gt_bbox': gt_bbox,
            'gt_poly': [],
            'difficult': difficult,
        })
    return records
# Directories of the train / validation / test splits.
# NOTE(review): these look like paths on a Colab Google Drive mount —
# adjust them for your own environment.
train_path = '/content/gdrive/My Drive/bccd/train'
val_path = '/content/gdrive/My Drive/bccd/valid'
test_path = '/content/gdrive/My Drive/bccd/test'
# Build the class-name -> id mapping, then load the training annotations.
cname2cid = get_cell_names()
records = get_annotations(cname2cid,train_path)
读取后的示例
{'difficult': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
'gt_bbox': array([[188.5, 269.5, 68. , 80. ],
[142.5, 40.5, 68. , 80. ],
[277.5, 135.5, 68. , 96. ],
[364. , 152. , 81. , 107. ],
[164.5, 123.5, 74. , 88. ],
[ 37.5, 109. , 72. , 87. ],
[264. , 231.5, 67. , 100. ],
[ 88. , 195. , 75. , 109. ],
[341.5, 326. , 76. , 103. ],
[102.5, 375.5, 68. , 80. ],
[112.5, 300.5, 36. , 38. ],
[155. , 232.5, 29. , 38. ],
[235.5, 280. , 30. , 41. ],
[246.5, 360.5, 104. , 110. ]], dtype=float32),
'gt_class': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1], dtype=int32),
'gt_poly': [],
'h': 416.0,
'im_file': '/content/gdrive/My Drive/bccd/train/BloodImage_00145_jpg.rf.a265e7f4f0aab5586c6aa5258bb03966.jpg',
'is_crowd': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
'w': 416.0}
2.读图片数据
import cv2
# In a typical detection task one image usually contains several objects.
# At most 50 ground-truth boxes are kept per image; if an image has fewer
# than 50, the remaining rows of gt_bbox and gt_class are filled with 0.
def get_bbox(gt_bbox, gt_class, max_num=50):
    """Pad or truncate ground-truth boxes and classes to a fixed length.

    Args:
        gt_bbox: Array-like of shape ``(n, 4)`` holding one box per row.
        gt_class: Array-like of length ``n`` holding one class id per box.
        max_num: Number of rows in the outputs (default 50).

    Returns:
        A tuple ``(boxes, classes)`` of float arrays with shapes
        ``(max_num, 4)`` and ``(max_num,)``. The first ``min(n, max_num)``
        rows are copied from the inputs; all remaining rows are zero.
    """
    gt_bbox2 = np.zeros((max_num, 4))
    gt_class2 = np.zeros((max_num,))
    # Cap the count before writing, so inputs longer than max_num are
    # truncated instead of indexing past the end of the output arrays.
    keep = min(len(gt_bbox), max_num)
    if keep:
        gt_bbox2[:keep] = np.asarray(gt_bbox)[:keep]
        gt_class2[:keep] = np.asarray(gt_class)[:keep]
    return gt_bbox2, gt_class2
def get_img_data_from_file(record):
im_file = record['im_file']
h = record['h']
w = record['w']
is_crowd = record['is_crowd']
gt_class = record['gt_class']
gt_bbox = record['gt_bbox']
difficult = record['difficult']
img = cv2.imread(im_file)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
gt_boxes, gt_labels = get_bbox(gt_bbox, gt_class)
# gt_bbox 用相对值
gt_boxes[:, 0] = gt_boxes[:, 0] / float(w)
gt_boxes[:, 1] = gt_boxes[:,