前言
R-CNN家族其实已经是“非常老”的算法了(最早提出于2014年),但它是深度学习用于目标检测的开山之作,所以我打算从R-CNN开始学习。网上对于网络和论文的讲解已经很详细,我这个系列就不讲理论了,主要记录我的学习过程和写代码时踩过的坑。
一、bccd数据介绍
bccd血细胞数据集是一个比较老的数据集,也不大,可以在这下载 https://public.roboflow.com/object-detection/bccd
该数据集共有364张图像,分为三个类别:WBC(白细胞)、RBC(红细胞)和Platelets(血小板)。3个类别共有4888个标签(有0个空示例)。下图是网站给出的可视化数据(三个类别细胞的标注数量统计)。这个网站的教程里还有很多可视化,比如各类细胞分布的热点图等,这里就不全放了。
二、读取数据
1.读xml标签数据
xml的格式如下图所示,需要重点关注的是框起来的部分,从上往下分别是,图片名称和图片位置、图片size(416,416,3)、真实框的标签、真实框的difficult、真实框的位置。
为了读该xml文件,以下是一个相对框架式的代码:
import os
import numpy as np
import xml.etree.ElementTree as ET
CELL_NAMES = ['RBC', 'WBC', 'Platelets']


def get_cell_names():
    """Return a dict mapping each cell category name to its integer class id.

    The id of a category is its position in ``CELL_NAMES``, which gives
    ``{'RBC': 0, 'WBC': 1, 'Platelets': 2}``.
    """
    return {category: class_id for class_id, category in enumerate(CELL_NAMES)}
def get_annotations(cname2cid, datadir):
    """Parse every XML annotation file in ``datadir`` into a list of records.

    Args:
        cname2cid: dict mapping a category name (the ``<name>`` tag of an
            object) to its integer class id.
        datadir: directory that holds the ``.xml`` annotations; the matching
            image is expected next to each annotation with the same base name
            and a ``.jpg`` extension.

    Returns:
        A list with one dict per annotation file that contains at least one
        object. Each dict has the keys ``im_file``, ``h``, ``w``,
        ``is_crowd``, ``gt_class``, ``gt_bbox``, ``gt_poly`` and
        ``difficult``. ``gt_bbox`` rows are
        ``[center_x, center_y, width, height]`` in pixels.

    Raises:
        KeyError: if an object's name is not a key of ``cname2cid``.
    """
    records = []
    for fname in os.listdir(datadir):
        # Split on the LAST dot only, so names with any number of dots work
        # and unrelated files (README, csv, ...) are skipped instead of
        # raising IndexError.
        stem, ext = os.path.splitext(fname)
        if ext.lower() != '.xml':
            continue

        fpath = os.path.join(datadir, fname)
        img_file = os.path.join(datadir, stem + '.jpg')
        tree = ET.parse(fpath)
        objs = tree.findall('object')
        im_w = float(tree.find('size').find('width').text)
        im_h = float(tree.find('size').find('height').text)

        gt_bbox = np.zeros((len(objs), 4), dtype=np.float32)
        gt_class = np.zeros((len(objs), ), dtype=np.int32)
        is_crowd = np.zeros((len(objs), ), dtype=np.int32)
        difficult = np.zeros((len(objs), ), dtype=np.int32)

        for i, obj in enumerate(objs):
            gt_class[i] = cname2cid[obj.find('name').text]

            # A missing <difficult> tag is treated as "not difficult".
            difficult_node = obj.find('difficult')
            if difficult_node is not None:
                difficult[i] = int(difficult_node.text)

            bndbox = obj.find('bndbox')
            x1 = float(bndbox.find('xmin').text)
            y1 = float(bndbox.find('ymin').text)
            x2 = float(bndbox.find('xmax').text)
            y2 = float(bndbox.find('ymax').text)
            # Clip the corners to the image area.
            x1 = max(0, x1)
            y1 = max(0, y1)
            x2 = min(im_w - 1, x2)
            y2 = min(im_h - 1, y2)
            # Ground-truth boxes are stored in xywh (center + size) format.
            gt_bbox[i] = [(x1 + x2) / 2.0, (y1 + y2) / 2.0,
                          x2 - x1 + 1., y2 - y1 + 1.]

        # Annotation files without any object are dropped.
        if objs:
            records.append({
                'im_file': img_file,
                'h': im_h,
                'w': im_w,
                'is_crowd': is_crowd,
                'gt_class': gt_class,
                'gt_bbox': gt_bbox,
                'gt_poly': [],
                'difficult': difficult,
            })
    return records
# Directories of the three dataset splits (the /content/gdrive prefix is
# presumably a Google Drive mount in Colab - adjust to your environment).
train_path = '/content/gdrive/My Drive/bccd/train'
val_path = '/content/gdrive/My Drive/bccd/valid'
test_path = '/content/gdrive/My Drive/bccd/test'
# Category-name -> class-id mapping used while parsing the annotations.
cname2cid = get_cell_names()
# Parse all annotations of the training split into a list of record dicts.
records = get_annotations(cname2cid,train_path)
读取后的示例
{‘difficult’: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
‘gt_bbox’: array([[188.5, 269.5, 68. , 80. ],
[142.5, 40.5, 68. , 80. ],
[277.5, 135.5, 68. , 96. ],
[364. , 152. , 81. , 107. ],
[164.5, 123.5, 74. , 88. ],
[ 37.5, 109. , 72. , 87. ],
[264. , 231.5, 67. , 100. ],
[ 88. , 195. , 75. , 109. ],
[341.5, 326. , 76. , 103. ],
[102.5, 375.5, 68. , 80. ],
[112.5, 300.5, 36. , 38. ],
[155. , 232.5, 29. , 38. ],
[235.5, 280. , 30. , 41. ],
[246.5, 360.5, 104. , 110. ]], dtype=float32),
‘gt_class’: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1], dtype=int32),
‘gt_poly’: [],
‘h’: 416.0,
‘im_file’: ‘/content/gdrive/My Drive/bccd/train/BloodImage_00145_jpg.rf.a265e7f4f0aab5586c6aa5258bb03966.jpg’,
‘is_crowd’: array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
‘w’: 416.0}
2.读图片数据
import cv2
def get_bbox(gt_bbox, gt_class, max_num=50):
    """Pad or truncate the ground-truth boxes of one image to a fixed count.

    An image usually contains several objects. At most ``max_num`` boxes are
    kept; when there are fewer, the remaining rows of the returned arrays
    stay all-zero.

    Args:
        gt_bbox: array of shape (N, 4) with the ground-truth boxes.
        gt_class: array of shape (N,) with the matching class ids.
        max_num: number of rows in the returned arrays (default 50).

    Returns:
        ``(boxes, classes)`` with shapes ``(max_num, 4)`` and ``(max_num,)``.
    """
    gt_bbox2 = np.zeros((max_num, 4))
    gt_class2 = np.zeros((max_num,))
    # Truncate BEFORE writing: checking the limit after the assignment made
    # images with more than max_num boxes fail with IndexError.
    keep = min(len(gt_bbox), max_num)
    if keep:
        gt_bbox2[:keep] = gt_bbox[:keep]
        gt_class2[:keep] = gt_class[:keep]
    return gt_bbox2, gt_class2
def get_img_data_from_file(record):
    """Load the image of one record together with its padded, relative boxes.

    Args:
        record: dict with at least the keys ``im_file``, ``h``, ``w``,
            ``gt_class`` and ``gt_bbox``.

    Returns:
        ``(img, gt_boxes, gt_labels, (h, w))`` where ``img`` is the image
        converted from BGR to RGB, ``gt_boxes`` / ``gt_labels`` are the
        fixed-size arrays produced by ``get_bbox`` and the box values are
        divided by the image width (x, w) or height (y, h).

    Raises:
        FileNotFoundError: if the image file cannot be read.
    """
    im_file = record['im_file']
    h = record['h']
    w = record['w']

    img = cv2.imread(im_file)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the path rather than later inside cvtColor.
    if img is None:
        raise FileNotFoundError('Cannot read image file: {}'.format(im_file))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    gt_boxes, gt_labels = get_bbox(record['gt_bbox'], record['gt_class'])

    # Express the boxes relative to the image size.
    gt_boxes[:, 0] = gt_boxes[:, 0] / float(w)
    gt_boxes[:, 1] = gt_boxes[:, 1] / float(h)
    gt_boxes[:, 2] = gt_boxes[:, 2] / float(w)
    gt_boxes[:, 3] = gt_boxes[:, 3] / float(h)

    return img, gt_boxes, gt_labels, (h, w)
3.图片预处理(随机改变明暗、对比度、颜色等)
(1)随机改变亮暗、对比度和颜色等
import numpy as np
import cv2
from PIL import Image, ImageEnhance
import random
def random_distort(img):
    """Randomly jitter brightness, contrast and color of an image.

    The three enhancements are applied in a random order, each with its own
    factor drawn uniformly from [0.5, 1.5].

    Args:
        img: image array accepted by ``PIL.Image.fromarray``.

    Returns:
        The distorted image as a numpy array.
    """
    def make_jitter(enhancer_cls, lower=0.5, upper=1.5):
        # Build a callable that draws its factor only when it is applied.
        def apply(pil_img):
            factor = np.random.uniform(lower, upper)
            return enhancer_cls(pil_img).enhance(factor)
        return apply

    stages = [
        make_jitter(ImageEnhance.Brightness),
        make_jitter(ImageEnhance.Contrast),
        make_jitter(ImageEnhance.Color),
    ]
    np.random.shuffle(stages)

    pil_img = Image.fromarray(img)
    for stage in stages:
        pil_img = stage(pil_img)
    return np.asarray(pil_img)
(2)随机填充
def random_expand(img,
                  gtboxes,
                  max_ratio=4.,
                  fill=None,
                  keep_ratio=True,
                  thresh=0.5):
    """Randomly place the image on a larger canvas and remap its boxes.

    Args:
        img: image array of shape (h, w, c).
        gtboxes: array of shape (N, 4) with relative
            ``[center_x, center_y, width, height]`` boxes; updated in place.
            All-zero rows are treated as padding and left untouched.
        max_ratio: upper bound of the random enlargement factor.
        fill: optional per-channel fill value (multiplied by 255) for the new
            canvas; the canvas is zero-filled when omitted or when its length
            does not match the channel count.
        keep_ratio: use the same factor for both axes when True.
        thresh: the expansion is skipped when a random draw exceeds it.

    Returns:
        ``(image, gtboxes)``; the inputs are returned unchanged when the
        expansion is skipped or ``max_ratio < 1``.
    """
    if random.random() > thresh:
        return img, gtboxes
    if max_ratio < 1.0:
        return img, gtboxes

    h, w, c = img.shape
    ratio_x = random.uniform(1, max_ratio)
    ratio_y = ratio_x if keep_ratio else random.uniform(1, max_ratio)
    oh = int(h * ratio_y)
    ow = int(w * ratio_x)
    off_x = random.randint(0, ow - w)
    off_y = random.randint(0, oh - h)

    out_img = np.zeros((oh, ow, c))
    # "is not None" instead of truthiness so that a numpy array is accepted.
    if fill is not None and len(fill) == c:
        for i in range(c):
            out_img[:, :, i] = fill[i] * 255.0
    out_img[off_y:off_y + h, off_x:off_x + w, :] = img

    # Only real boxes get a new center; padding rows must stay all-zero.
    is_real = gtboxes.any(axis=1)
    gtboxes[is_real, 0] = (gtboxes[is_real, 0] * w + off_x) / float(ow)
    gtboxes[is_real, 1] = (gtboxes[is_real, 1] * h + off_y) / float(oh)
    # Scale the sizes by the actual (integer-truncated) canvas size so that
    # they stay consistent with the centers computed above.
    gtboxes[:, 2] = gtboxes[:, 2] * w / float(ow)
    gtboxes[:, 3] = gtboxes[:, 3] * h / float(oh)

    return out_img.astype('uint8'), gtboxes
(3)随机裁剪 (这个涉及裁剪之后会不会裁掉过多的原本图像)
#随机裁剪辅助函数
import numpy as np
def multi_box_iou_xywh(box1, box2):
    """Compute the IoU between boxes given as (center_x, center_y, w, h).

    Either both inputs have the same shape, or one of them holds exactly one
    box; other combinations of multi-box inputs are not supported.

    Args:
        box1: array whose last dimension is 4.
        box2: array whose last dimension is 4.

    Returns:
        Array with the element-wise (broadcast) IoU values.
    """
    assert box1.shape[-1] == 4, "Box1 shape[-1] should be 4."
    assert box2.shape[-1] == 4, "Box2 shape[-1] should be 4."

    def corners(box):
        # Convert center/size columns into (left, right, top, bottom).
        left, right = box[:, 0] - box[:, 2] / 2, box[:, 0] + box[:, 2] / 2
        top, bottom = box[:, 1] - box[:, 3] / 2, box[:, 1] + box[:, 3] / 2
        return left, right, top, bottom

    left1, right1, top1, bottom1 = corners(box1)
    left2, right2, top2, bottom2 = corners(box2)

    # Overlap extents, clamped at zero for disjoint boxes.
    overlap_w = np.clip(
        np.minimum(right1, right2) - np.maximum(left1, left2),
        a_min=0., a_max=None)
    overlap_h = np.clip(
        np.minimum(bottom1, bottom2) - np.maximum(top1, top2),
        a_min=0., a_max=None)
    intersection = overlap_w * overlap_h

    area1 = (right1 - left1) * (bottom1 - top1)
    area2 = (right2 - left2) * (bottom2 - top2)
    return intersection / (area1 + area2 - intersection)
def box_crop(boxes, labels, crop, img_shape):
    """Re-express relative xywh boxes inside a crop window of the image.

    Args:
        boxes: array of shape (N, 4) with relative
            ``[center_x, center_y, width, height]`` boxes (not modified).
        labels: array of shape (N,) with the class ids.
        crop: ``(x, y, w, h)`` of the crop window in pixels.
        img_shape: ``(width, height)`` of the image in pixels.

    Returns:
        ``(boxes, labels, count)``: boxes relative to the crop, labels with
        discarded entries multiplied by zero, and the number of kept boxes.
        A box is kept when its center lies inside the crop and it still has a
        positive extent after clipping; discarded rows become all-zero.
    """
    crop_x, crop_y, crop_w, crop_h = (float(v) for v in crop)
    img_w, img_h = (float(v) for v in img_shape)

    # Work on a copy and switch to absolute corner coordinates.
    corners = boxes.copy()
    left = (corners[:, 0] - corners[:, 2] / 2) * img_w
    right = (corners[:, 0] + corners[:, 2] / 2) * img_w
    top = (corners[:, 1] - corners[:, 3] / 2) * img_h
    bottom = (corners[:, 1] + corners[:, 3] / 2) * img_h
    corners[:, 0], corners[:, 2] = left, right
    corners[:, 1], corners[:, 3] = top, bottom

    window = np.array([crop_x, crop_y, crop_x + crop_w, crop_y + crop_h])
    window_min, window_max = window[:2], window[2:]

    centers = (corners[:, :2] + corners[:, 2:]) / 2.0
    center_inside = np.logical_and(
        window_min <= centers, centers <= window_max).all(axis=1)

    # Clip to the window, then shift the origin to the window's corner.
    corners[:, :2] = np.maximum(corners[:, :2], window_min)
    corners[:, 2:] = np.minimum(corners[:, 2:], window_max)
    corners[:, :2] -= window_min
    corners[:, 2:] -= window_min

    has_extent = (corners[:, :2] < corners[:, 2:]).all(axis=1)
    keep = np.logical_and(center_inside, has_extent)
    keep_f = keep.astype('float32')
    corners = corners * np.expand_dims(keep_f, axis=1)
    labels = labels * keep_f

    # Back to center/size, now relative to the crop window.
    center_x = (corners[:, 0] + corners[:, 2]) / 2 / crop_w
    rel_w = (corners[:, 2] - corners[:, 0]) / crop_w
    center_y = (corners[:, 1] + corners[:, 3]) / 2 / crop_h
    rel_h = (corners[:, 3] - corners[:, 1]) / crop_h
    corners[:, 0], corners[:, 2] = center_x, rel_w
    corners[:, 1], corners[:, 3] = center_y, rel_h

    return corners, labels, keep.sum()
def random_crop(img,
                boxes,
                labels,
                scales=(0.3, 1.0),
                max_ratio=2.0,
                constraints=None,
                max_trial=50):
    """Randomly crop the image so that at least one box survives.

    For every ``(min_iou, max_iou)`` constraint up to ``max_trial`` random
    windows are sampled; the first window whose IoU with all real boxes lies
    in that range becomes a candidate. One candidate (the full image is
    always among them) is then picked at random, and the crop is resized
    back to the original image size.

    Args:
        img: image array accepted by ``PIL.Image.fromarray``.
        boxes: array of shape (N, 4) with relative xywh boxes; all-zero rows
            are treated as padding.
        labels: array of shape (N,) with the class ids.
        scales: ``(low, high)`` range of the crop scale.
        max_ratio: bound of the crop aspect ratio.
        constraints: list of ``(min_iou, max_iou)`` pairs; a default list is
            used when empty or None.
        max_trial: sampling attempts per constraint.

    Returns:
        ``(img, boxes, labels)``, always three values. The inputs are returned
        unchanged when there is no real box to preserve.
    """
    # Return three values here as well: callers unpack
    # (img, boxes, labels) on every path.
    if len(boxes) == 0:
        return img, boxes, labels

    # Padding rows have IoU 0 with every window, which would make every
    # constraint with min_iou > 0 unsatisfiable; evaluate real boxes only.
    real_boxes = boxes[boxes.any(axis=1)]
    if len(real_boxes) == 0:
        return img, boxes, labels

    if not constraints:
        constraints = [(0.1, 1.0), (0.3, 1.0), (0.5, 1.0), (0.7, 1.0),
                       (0.9, 1.0), (0.0, 1.0)]

    img = Image.fromarray(img)
    w, h = img.size
    crops = [(0, 0, w, h)]
    for min_iou, max_iou in constraints:
        for _ in range(max_trial):
            scale = random.uniform(scales[0], scales[1])
            aspect_ratio = random.uniform(
                max(1 / max_ratio, scale * scale),
                min(max_ratio, 1 / scale / scale))
            crop_h = int(h * scale / np.sqrt(aspect_ratio))
            crop_w = int(w * scale * np.sqrt(aspect_ratio))
            # Degenerate windows (possible on tiny images) cannot be used.
            if crop_w < 1 or crop_h < 1:
                continue
            # randrange(0) raises ValueError, so a window spanning the whole
            # axis is pinned to offset 0.
            crop_x = random.randrange(w - crop_w) if crop_w < w else 0
            crop_y = random.randrange(h - crop_h) if crop_h < h else 0
            crop_box = np.array([[(crop_x + crop_w / 2.0) / w,
                                  (crop_y + crop_h / 2.0) / h,
                                  crop_w / float(w), crop_h / float(h)]])

            iou = multi_box_iou_xywh(crop_box, real_boxes)
            if min_iou <= iou.min() and max_iou >= iou.max():
                crops.append((crop_x, crop_y, crop_w, crop_h))
                break

    # Try the candidates in random order until one keeps at least one box.
    while crops:
        crop = crops.pop(np.random.randint(0, len(crops)))
        crop_boxes, crop_labels, box_num = box_crop(boxes, labels, crop, (w, h))
        if box_num < 1:
            continue
        img = img.crop((crop[0], crop[1], crop[0] + crop[2],
                        crop[1] + crop[3])).resize(img.size, Image.LANCZOS)
        return np.asarray(img), crop_boxes, crop_labels

    return np.asarray(img), boxes, labels
(4)随机缩放图片
def random_interp(img, size, interp=None):
    """Resize the image to ``size`` x ``size`` with a random interpolation.

    Args:
        img: image array of shape (h, w, c).
        size: target edge length; both axes are scaled to it.
        interp: OpenCV interpolation flag to use. When it is None or not one
            of the supported flags, a flag is chosen at random.

    Returns:
        The resized image.
    """
    interp_method = [
        cv2.INTER_NEAREST,
        cv2.INTER_LINEAR,
        cv2.INTER_AREA,
        cv2.INTER_CUBIC,
        cv2.INTER_LANCZOS4,
    ]
    # Compare against None explicitly: cv2.INTER_NEAREST has the value 0, so
    # a truthiness test would discard an explicitly requested INTER_NEAREST.
    if interp is None or interp not in interp_method:
        interp = interp_method[random.randint(0, len(interp_method) - 1)]

    h, w, _ = img.shape
    im_scale_x = size / float(w)
    im_scale_y = size / float(h)
    img = cv2.resize(
        img, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=interp)
    return img
(5)随机翻转
def random_flip(img, gtboxes, thresh=0.5):
    """Randomly flip the image horizontally together with its boxes.

    Args:
        img: image array of shape (h, w, c).
        gtboxes: array of shape (N, 4) with relative
            ``[center_x, center_y, width, height]`` boxes; updated in place.
            All-zero rows are treated as padding and left untouched.
        thresh: the flip is applied when a random draw exceeds this value.

    Returns:
        ``(img, gtboxes)``.
    """
    if random.random() > thresh:
        img = img[:, ::-1, :]
        # Mirror only real boxes; mirroring a padding row would turn it into
        # [1, 0, 0, 0] and break the "all-zero means padding" convention.
        is_real = gtboxes.any(axis=1)
        gtboxes[is_real, 0] = 1.0 - gtboxes[is_real, 0]
    return img, gtboxes
(6)随机打乱真实框排列顺序(网络会对后面的数据更加敏感)
def shuffle_gtbox(gtbox, gtlabel):
    """Shuffle boxes and labels with one shared random permutation.

    Args:
        gtbox: array of shape (N, 4) with the boxes.
        gtlabel: array of shape (N,) with the class ids.

    Returns:
        ``(boxes, labels)`` in the new order. Both are slices of one joined
        array, so they share its dtype.
    """
    # Glue the label on as a fifth column so each row moves as a unit.
    stacked = np.concatenate((gtbox, gtlabel[:, None]), axis=1)
    order = np.arange(len(stacked))
    np.random.shuffle(order)
    shuffled = stacked[order]
    boxes, labels = shuffled[:, :4], shuffled[:, 4]
    return boxes, labels
4.批处理
def image_augment(img, gtboxes, gtlabels, size, means=None):
    """Run the full augmentation pipeline on one sample.

    The steps are applied in this order: color distortion, random expansion,
    random crop, resize to ``size``, random horizontal flip, and a shuffle of
    the box order.

    Args:
        img: input image array.
        gtboxes: array of relative xywh boxes.
        gtlabels: array of class ids matching ``gtboxes``.
        size: target edge length passed to ``random_interp``.
        means: optional fill value forwarded to ``random_expand``.

    Returns:
        ``(img, gtboxes, gtlabels)`` cast to float32, float32 and int32.
    """
    distorted = random_distort(img)
    expanded, boxes = random_expand(distorted, gtboxes, fill=means)
    cropped, boxes, classes = random_crop(expanded, boxes, gtlabels)
    resized = random_interp(cropped, size)
    flipped, boxes = random_flip(resized, boxes)
    boxes, classes = shuffle_gtbox(boxes, classes)
    return (flipped.astype('float32'),
            boxes.astype('float32'),
            classes.astype('int32'))
def get_img_data(record, size=640):
    """Load one record, augment it and normalize the image.

    The image is scaled to [0, 1], normalized per channel with a fixed
    mean/std (presumably the usual ImageNet statistics) and transposed from
    (H, W, C) to (C, H, W). No batch dimension is added here.

    Args:
        record: annotation record understood by ``get_img_data_from_file``.
        size: target edge length used during augmentation.

    Returns:
        ``(img, gt_boxes, gt_labels, scales)`` where ``scales`` is the value
        returned by ``get_img_data_from_file``.
    """
    raw_img, raw_boxes, raw_labels, scales = get_img_data_from_file(record)
    img, gt_boxes, gt_labels = image_augment(raw_img, raw_boxes, raw_labels, size)

    channel_mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, -1))
    channel_std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, -1))
    normalized = (img / 255.0 - channel_mean) / channel_std

    chw_img = normalized.astype('float32').transpose((2, 0, 1))
    return chw_img, gt_boxes, gt_labels, scales
def get_img_size(mode):
    """Pick the image edge length used for one batch.

    For ``'train'`` and ``'valid'`` a size is drawn at random from
    320, 352, ..., 608 (steps of 32); any other mode uses 608. The author
    notes this is unused for the fixed 227 AlexNet input but is useful for
    networks such as SPP-net.
    """
    if mode not in ('train', 'valid'):
        return 608
    step = np.random.choice(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
    return 320 + step * 32
def make_array(batch_data):
    """Convert a list of per-sample tuples into a tuple of batch arrays.

    Args:
        batch_data: list of ``(img, gt_boxes, gt_labels, im_shape)`` items.

    Returns:
        ``(img_array, gt_box_array, gt_labels_array, img_scale)`` with dtypes
        float32, float32, int32 and int32.
    """
    def stack(position, dtype):
        # Collect the field at ``position`` of every sample into one array.
        return np.array([sample[position] for sample in batch_data], dtype=dtype)

    return (stack(0, 'float32'),
            stack(1, 'float32'),
            stack(2, 'int32'),
            stack(3, 'int32'))
def data_loader(datadir, batch_size= 10, mode='train'):
    """Create a batch reader over the annotated images in ``datadir``.

    All images inside one batch share the same size; the size is drawn anew
    for every batch by ``get_img_size``.

    Args:
        datadir: directory with the annotations and images.
        batch_size: number of samples per yielded batch.
        mode: ``'train'`` shuffles the records on every pass; it is also
            forwarded to ``get_img_size``.

    Returns:
        A zero-argument generator function yielding the tuples built by
        ``make_array``; a final, smaller batch is yielded for leftovers.
    """
    records = get_annotations(get_cell_names(), datadir)

    def reader():
        if mode == 'train':
            np.random.shuffle(records)

        pending = []
        target_size = get_img_size(mode)
        for record in records:
            img, gt_bbox, gt_labels, im_shape = get_img_data(
                record, size=target_size)
            pending.append((img, gt_bbox, gt_labels, im_shape))
            if len(pending) != batch_size:
                continue
            yield make_array(pending)
            # Start the next batch with a freshly drawn image size.
            pending = []
            target_size = get_img_size(mode)

        if pending:
            yield make_array(pending)

    return reader
以上批处理没有用面向对象的思想。之前看了一下飞桨老师的写法,我觉得也挺精简的,这里只放一个demo,需要的话可以自己拓展为面向对象的写法~
class data_load(object):
    """Minimal object-oriented batching demo.

    Calling an instance returns a generator that groups the numbers 0..9
    into batches of two.
    """

    def __init__(self):
        # Set up the data source and the batching state.
        self.list_num = list(range(10))
        self.batch_data = []
        self.idx = 0

    # Further preprocessing methods can be added here.

    def __call__(self):
        """Yield the items of ``list_num`` two at a time."""
        for value in self.list_num:
            self.batch_data.append(value)
            if len(self.batch_data) != 2:
                continue
            yield self.batch_data
            self.batch_data = []
# Usage: calling the instance returns a generator of batches.
x = data_load()
for batch in x():
    print(batch)
5.数据可视化
def draw_rectangle(currentAxis, bbox, edgecolor='k', facecolor='y', fill=False, linestyle='-', scales=(1.0, 1.0)):
    """Draw one bounding box on a matplotlib axis.

    Args:
        currentAxis: the axis to draw on, e.g. obtained from ``plt.gca()``.
        bbox: four values ``[center_x, center_y, width, height]``.
        edgecolor: color of the border line.
        facecolor: fill color.
        fill: whether the rectangle is filled.
        linestyle: style of the border line.
        scales: ``(x_scale, y_scale)`` multiplied onto the box values, e.g.
            to convert relative boxes to pixels.
    """
    scale_x, scale_y = scales[0], scales[1]
    width = bbox[2] * scale_x
    height = bbox[3] * scale_y
    # patches.Rectangle takes the anchor corner plus width and height, so
    # step back half a box from the center.
    left = bbox[0] * scale_x - width / 2.0
    top = bbox[1] * scale_y - height / 2.0
    rect = matplotlib.patches.Rectangle(
        (left, top), width, height, linewidth=1,
        edgecolor=edgecolor, facecolor=facecolor, fill=fill,
        linestyle=linestyle)
    currentAxis.add_patch(rect)
from google.colab.patches import cv2_imshow
import matplotlib.pyplot as plt
import matplotlib.patches
# Draw the ground-truth boxes of the first training record over its image.
img, gt_boxes, gt_labels, scales = get_img_data_from_file(records[0])
# get_img_data_from_file returns the image size as (h, w), while the boxes
# need an (x, y) = (w, h) scale; using (h, w) directly is only correct for
# square images.
im_h, im_w = scales
xy_scales = (im_w, im_h)
plt.figure(figsize=(10, 10))
plt.imshow(img)
currentAxis = plt.gca()
colors = ['r', 'g', 'b', 'k']
for i, gt_box in enumerate(gt_boxes):
    # Boxes are zero-padded at the end; stop at the first padding row.
    if (gt_box == 0).all():
        break
    box = gt_box
    label = int(gt_labels[i])
    name = CELL_NAMES[label]
    draw_rectangle(currentAxis, box, edgecolor=colors[label], scales=xy_scales)
    # Put the class name at the box corner used as the rectangle anchor.
    plt.text(box[0] * xy_scales[0] - box[2] * xy_scales[0] / 2.0,
             box[1] * xy_scales[1] - box[3] * xy_scales[1] / 2.0,
             name, fontsize=12, color=colors[label])
通过上述代码绘制训练集的标注框及其标签,可以直观地看到RBC类别比较多,WBC数据较少。从尺寸来看,Platelets相对较小。后期制作训练集时可以有针对性地对训练集进行调整。
通过对训练集数据进行聚类,可以大概看一下细胞尺寸大概的分布。
# Collect the (width, height) columns of every ground-truth box in the
# training records.
boxes = np.array([box for r in records for box in r['gt_bbox']])[:, 2:4]
tmp = boxes.copy()
# Width / height ratio of every box, as a column vector.
ratio = (tmp[:, 0] / tmp[:, 1]).reshape(-1, 1)
from sklearn.cluster import KMeans
# Define the K-Means model with 5 clusters.
model1 = KMeans(n_clusters=5)
# Fit the model on the boxes array.
model1.fit(boxes)
# Cluster index assigned to every sample.
C_i = model1.predict(boxes)
# Coordinates of the cluster centers.
centers = model1.cluster_centers_
# Cast the centers to int32 for printing and plotting.
centers = centers.astype('int32')
print(centers)
from matplotlib import pyplot as plt
# Scatter plot of the box sizes together with the K-Means centers.
plt.figure(figsize=(12, 8), dpi=80)
plt.scatter(boxes[:, 0], boxes[:, 1], s=1)
plt.scatter(centers[:, 0], centers[:, 1], s=50)
plt.show()

# Aspect-ratio plot. plt.scatter needs both x and y, so the ratios are
# plotted against their sample index.
plt.figure(figsize=(12, 8), dpi=80)
plt.scatter(np.arange(len(ratio)), ratio[:, 0], s=1)
# centers_r was never defined; derive it as the width/height ratio of each
# cluster center and spread the markers evenly along the x axis.
centers_r = centers[:, 0] / centers[:, 1]
plt.scatter(np.linspace(0, len(ratio) - 1, num=len(centers_r)), centers_r, s=50)
plt.show()
# Append width + height as a third column, then order the cluster centers
# by that sum (smallest first).
sums = centers[:, 0] + centers[:, 1]
centers_sorted = np.column_stack((centers, sums))
print(centers_sorted)
print('--------')
centers_sorted = centers_sorted[np.argsort(centers_sorted[:, 2])]
print(centers_sorted)
可以看到细胞的尺寸(长和宽)在[20,200]的区间范围内,其中大部分聚集在[70,100]范围内。
总结
这篇文章主要放了数据集前处理的一个比较基础的框架,包括标签读取、图片读取和预处理、将图片和标签批处理,最后放了一点点数据的可视化。