前言
本篇博客主要记录我对U-Net代码的理解。U-Net网络的代码可以参考:U-Net Tensorflow2。U-Net网络比较简单,主要是一个Encoder-Decoder结构:通过Encoder提取相关特征,Decoder部分复原特征图的尺寸并对像素进行分类。
数据载入
数据载入部分比较简单,调用的函数是:
# Build the dataset generator object and call it once; the call returns the batch generator.
gen= UnetDatasetGenerator(Batch_size, train_lines, inputs_size, num_classes, dataset_path)()
上述参数中,Batch_size表示批量的大小,train_lines表示训练图像的标注行(每行的第一个字段为图像名),inputs_size表示输入图像的尺寸,num_classes表示类别数,dataset_path表示数据集的路径。整个数据载入的代码不难理解,如下所示:
class UnetDatasetGenerator(object):
    """Endless batch generator of images and one-hot segmentation labels.

    Calling an instance returns a generator that yields ``(inputs, targets)``
    NumPy arrays holding ``batch_size`` samples each. Images are read from
    ``<dataset_path>/JPEGImages/<name>.jpg`` and label masks from
    ``<dataset_path>/labels/<name>.png``.
    """

    def __init__(self, batch_size, train_lines, image_size, num_classes, dataset_path):
        """Store the generator configuration.

        Args:
            batch_size: number of samples per yielded batch.
            train_lines: list of annotation lines; the first
                whitespace-separated field of each line is the image name.
                The list is shuffled in place at the start of every pass.
            image_size: indexable size description; elements ``[1]`` and
                ``[0]`` are passed (in that order) to ``letterbox_image``
                (presumably ``(height, width, ...)`` -- TODO confirm).
            num_classes: number of real classes; one extra channel is added
                for label values that are ``>= num_classes``.
            dataset_path: root directory of the dataset.
        """
        self.batch_size = batch_size
        self.train_lines = train_lines
        self.train_batches = len(train_lines)
        self.image_size = image_size
        self.num_classes = num_classes
        self.dataset_path = dataset_path

    def __call__(self):
        """Yield ``(inputs, targets)`` batches forever, cycling over the data."""
        i = 0
        length = len(self.train_lines)
        inputs = []
        targets = []
        while True:
            if i == 0:
                # Start of a pass over the data: reshuffle (in place).
                shuffle(self.train_lines)
            annotation_line = self.train_lines[i]
            name = annotation_line.split()[0]

            # Read the image and its label mask from disk.
            jpg = Image.open(os.path.join(self.dataset_path, "JPEGImages", name + ".jpg"))
            png = Image.open(os.path.join(self.dataset_path, "labels", name + ".png"))
            jpg, png = letterbox_image(jpg, png, (int(self.image_size[1]), int(self.image_size[0])))

            # Scale pixel values to [0, 1].
            inputs.append(np.array(jpg) / 255)

            png = np.array(png)
            # Every label value outside the real classes goes to the extra
            # channel with index num_classes.
            png[png >= self.num_classes] = self.num_classes
            seg_labels = np.eye(self.num_classes + 1)[png.reshape([-1])]
            # Shape the one-hot labels after the label array itself so rows
            # and columns stay aligned with the image even for non-square
            # sizes; a hard-coded (image_size[1], image_size[0]) order is only
            # correct when it happens to equal png.shape.
            seg_labels = seg_labels.reshape(png.shape + (self.num_classes + 1,))
            targets.append(seg_labels)

            i = (i + 1) % length
            if len(targets) == self.batch_size:
                tmp_inp = np.array(inputs)
                tmp_targets = np.array(targets)
                inputs = []
                targets = []
                yield tmp_inp, tmp_targets
模型搭建
模型的搭建以VGG16为Encoder(主干特征提取网络):
def VGG16(img_input):
    """Build the five VGG16 convolution blocks and return one feature map per block.

    Every block is a run of 3x3 'same'-padded ReLU convolutions; consecutive
    blocks are separated by a 2x2 max pooling with stride 2. The feature map
    of each block is taken after its last convolution, before pooling. For a
    512x512x3 input the five outputs are 512x512x64, 256x256x128,
    128x128x256, 64x64x512 and 32x32x512.

    Args:
        img_input: input tensor the first convolution is applied to.

    Returns:
        Tuple ``(feat1, feat2, feat3, feat4, feat5)`` of block outputs.
    """
    # (filters, number of 3x3 convolutions) for blocks 1..5.
    block_specs = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    feats = []
    x = img_input
    for block_no, (filters, conv_count) in enumerate(block_specs, start=1):
        if block_no > 1:
            # Downsample the previous block's output; the pooling layer is
            # named after the block it closes. Block 5 is not pooled.
            x = layers.MaxPooling2D((2, 2), strides=(2, 2),
                                    name='block{}_pool'.format(block_no - 1))(x)
        for conv_no in range(1, conv_count + 1):
            x = layers.Conv2D(filters, (3, 3),
                              activation='relu',
                              padding='same',
                              kernel_initializer=random_normal(stddev=0.02),
                              name='block{}_conv{}'.format(block_no, conv_no))(x)
        feats.append(x)

    feat1, feat2, feat3, feat4, feat5 = feats
    return feat1, feat2, feat3, feat4, feat5
在Decoder部分采用UpSampling2D进行上采样,并通过Concatenate与Encoder中对应尺度的特征进行拼接,逐步恢复到原图的大小。
def Unet(input_shape=(256,256,3), num_classes=21):
    """Assemble the U-Net model: VGG16 feature extractor plus upsampling path.

    Starting from the deepest VGG16 feature map, each stage upsamples by 2,
    concatenates the skip feature of matching resolution on the channel axis,
    and applies two 3x3 ReLU convolutions. A final 1x1 convolution with
    softmax produces ``num_classes`` channels per pixel.

    Args:
        input_shape: shape of the input tensor passed to ``Input``.
        num_classes: number of output channels of the final softmax layer.

    Returns:
        The Keras ``Model`` mapping the input to the per-pixel class scores.
    """
    inputs = Input(input_shape)
    feat1, feat2, feat3, feat4, feat5 = VGG16(inputs)

    # Skip feature and output width of each stage, deepest first. For a
    # 512x512 input the spatial size goes 32 -> 64 -> 128 -> 256 -> 512.
    stages = ((feat4, 512), (feat3, 256), (feat2, 128), (feat1, 64))

    x = feat5
    for skip, width in stages:
        upsampled = UpSampling2D(size=(2, 2))(x)
        # Skip connection first, upsampled features second, on the channel axis.
        x = Concatenate(axis=3)([skip, upsampled])
        # Two 3x3 convolutions bring the channel count down to `width`.
        x = Conv2D(width, 3, activation='relu', padding='same', kernel_initializer = random_normal(stddev=0.02))(x)
        x = Conv2D(width, 3, activation='relu', padding='same', kernel_initializer = random_normal(stddev=0.02))(x)

    # Per-pixel classification head.
    outputs = Conv2D(num_classes, 1, activation="softmax")(x)
    return Model(inputs=inputs, outputs=outputs)
在Tensorflow中,Conv2DTranspose(即转置卷积)也和UpSampling2D一样具有上采样、恢复特征图尺寸的能力,现在来看下两者的区别。
- UpSampling2D is just a simple scaling up of the image by using nearest neighbour or bilinear upsampling, so nothing smart. Advantage is it’s cheap.
- Conv2DTranspose is a convolution operation whose kernel is learnt (just like normal conv2d operation) while training your model. Using Conv2DTranspose will also upsample its input but the key difference is the model should learn what is the best upsampling for the job.
模型评估
使用mIoU进行评估。
from os.path import join
import numpy as np
from PIL import Image
def fast_hist(a, b, n):
    """Return the n x n confusion matrix of labels ``a`` against predictions ``b``.

    For a label image of width W and height H, ``a`` and ``b`` are the
    flattened arrays of length H*W.

    Args:
        a: 1-D array of ground-truth class ids. Entries outside ``[0, n)``
            are ignored.
        b: 1-D array of predicted class ids, same length as ``a``.
        n: number of classes.

    Returns:
        Integer array of shape ``(n, n)``; entry ``[i, j]`` is the number of
        counted pixels with label ``i`` and prediction ``j``.

    Raises:
        ValueError: if a prediction on a counted pixel is outside ``[0, n)``.
    """
    # Only pixels whose label is a real class take part in the statistics.
    k = (a >= 0) & (a < n)
    labels = a[k].astype(int)
    preds = b[k]
    # An out-of-range prediction would make n * label + pred point into a
    # different row of the matrix (or past its end), so reject it explicitly.
    if preds.size and (preds.min() < 0 or preds.max() >= n):
        raise ValueError(
            'prediction values must be in [0, {:d}), got range [{}, {}]'.format(
                n, preds.min(), preds.max()))
    # Encode each (label, pred) pair as one index and count occurrences.
    return np.bincount(n * labels + preds, minlength=n ** 2).reshape(n, n)
def per_class_iu(hist):
    """Return the per-class intersection-over-union from a confusion matrix.

    Args:
        hist: square confusion matrix (rows: labels, columns: predictions).

    Returns:
        1-D array with one IoU value per class. The denominator is clamped
        to at least 1, so a class with no pixels at all gets 0.
    """
    intersection = np.diag(hist)
    # union = pixels labelled as the class + pixels predicted as it - overlap
    union = hist.sum(1) + hist.sum(0) - intersection
    return intersection / np.maximum(union, 1)
def per_class_PA(hist):
    """Return the per-class pixel accuracy from a confusion matrix.

    Args:
        hist: square confusion matrix (rows: labels, columns: predictions).

    Returns:
        1-D array: correctly predicted pixels of each class divided by the
        number of pixels labelled as that class (clamped to at least 1).
    """
    correct = np.diag(hist)
    labelled = hist.sum(1)
    return correct / np.maximum(labelled, 1)
def compute_mIoU(gt_dir, pred_dir, png_name_list, num_classes, name_classes):
    """Accumulate a confusion matrix over image pairs and print IoU / PA.

    For every name in ``png_name_list`` the files ``<gt_dir>/<name>.png`` and
    ``<pred_dir>/<name>.png`` are loaded and compared pixel by pixel.

    Args:
        gt_dir: directory with the ground-truth label images.
        pred_dir: directory with the predicted label images.
        png_name_list: image names without the ``.png`` extension.
        num_classes: number of classes.
        name_classes: class names, indexed by class id, used for printing.

    Returns:
        1-D array with the IoU of every class.
    """
    print('Num classes', num_classes)
    hist = np.zeros((num_classes, num_classes))
    gt_imgs = [join(gt_dir, x + ".png") for x in png_name_list]
    pred_imgs = [join(pred_dir, x + ".png") for x in png_name_list]

    for ind, (gt_path, pred_path) in enumerate(zip(gt_imgs, pred_imgs)):
        # Load inside `with` so each file handle is released right away.
        with Image.open(pred_path) as pred_img:
            pred = np.array(pred_img)
        with Image.open(gt_path) as gt_img:
            label = np.array(gt_img)

        # A pair whose pixel counts differ cannot be compared; skip it.
        if label.size != pred.size:
            print(
                'Skipping: len(gt) = {:d}, len(pred) = {:d}, {:s}, {:s}'.format(
                    label.size, pred.size, gt_path, pred_path))
            continue

        hist += fast_hist(label.flatten(), pred.flatten(), num_classes)

        # Every 10 images, report the running means over all classes.
        if ind > 0 and ind % 10 == 0:
            print('{:d} / {:d}: mIou-{:0.2f}; mPA-{:0.2f}'.format(ind, len(gt_imgs),
                                                    100 * np.nanmean(per_class_iu(hist)),
                                                    100 * np.nanmean(per_class_PA(hist))))

    mIoUs = per_class_iu(hist)
    mPA = per_class_PA(hist)
    # Per-class results, then the means over all classes.
    for ind_class in range(num_classes):
        print('===>' + name_classes[ind_class] + ':\tmIou-' + str(round(mIoUs[ind_class] * 100, 2)) + '; mPA-' + str(round(mPA[ind_class] * 100, 2)))
    print('===> mIoU: ' + str(round(np.nanmean(mIoUs) * 100, 2)) + '; mPA: ' + str(round(np.nanmean(mPA) * 100, 2)))
    return mIoUs
if __name__ == "__main__":
    gt_dir = "VOCdevkit/VOC2007/SegmentationClass"
    pred_dir = "miou_pr_dir"
    # Read the validation image names; `with` closes the list file once read.
    with open("VOCdevkit/VOC2007/ImageSets/Segmentation/val.txt", 'r') as f:
        png_name_list = f.read().splitlines()
    num_classes = 21
    name_classes = ["background","aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
    compute_mIoU(gt_dir, pred_dir, png_name_list, num_classes, name_classes)  # run the mIoU computation