The model takes chest X-ray images of arbitrary size as input and performs binary normal/pneumonia classification. On a test set of 624 samples it reaches 90% accuracy. The model, the training and testing source code, and the other configuration files are available at OskajhZ/Pneumonia-Analysis-System-Based-on-CNN (github.com). Both training and testing use the COVID-19-Chest-Xray dataset.
Running the application code displays the original image and the model's input image side by side, with the predicted label as the figure title. The corresponding application code is as follows:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import PIL.Image as img

# Raw strings avoid backslash-escape issues in the Windows paths
model = tf.keras.models.load_model(r"C:\prog\X_ray_Classification\Models\F_M1_90")

test = img.open(r"C:\prog\lung\test.jpeg").convert("L")   # load as grayscale
test_orig = test
test = test.resize((1100, 1100), img.LANCZOS)
test = tf.reshape(tf.constant(test, dtype=tf.float32), (1, 1100, 1100, 1))
test = test / tf.reduce_max(test)   # normalize to [0, 1], matching the training preprocessing
pred = model.predict(test)
print(pred)
res = np.argmax(pred)
if res == 0:
    tit = "Reference result: Normal"
else:
    tit = "Reference result: Pneumonia"
plt.figure()
plt.subplot(1, 2, 1)
plt.imshow(test_orig)
plt.title("Original image")
plt.subplot(1, 2, 2)
plt.title("Model input image")
plt.imshow(tf.reshape(test, (1100, 1100)))
plt.suptitle(tit)
plt.tight_layout()
plt.show()
The model's network architecture is defined by the following code:
class ConvM(Model):
    def __init__(self):
        super(ConvM, self).__init__()
        # Note: each layer carries its own weights and biases rather than just
        # performing a computation, so every forward-pass step needs its own
        # layer instance.
        self.conv2d1 = tf.keras.layers.Conv2D(16, kernel_size=(5, 5), activation="relu")
        self.conv2d2 = tf.keras.layers.Conv2D(16, kernel_size=(5, 5), activation="relu")
        self.conv2d3 = tf.keras.layers.Conv2D(32, kernel_size=(5, 5), activation="relu")
        self.conv2d4 = tf.keras.layers.Conv2D(32, kernel_size=(5, 5), activation="relu")
        self.maxpool1 = tf.keras.layers.MaxPool2D(pool_size=(4, 4))
        self.maxpool2 = tf.keras.layers.MaxPool2D(pool_size=(4, 4))
        self.flatten = tf.keras.layers.Flatten()
        self.dropout1 = tf.keras.layers.Dropout(0.2)
        self.dropout2 = tf.keras.layers.Dropout(0.2)
        self.dense1 = tf.keras.layers.Dense(256, activation="relu")
        self.dense2 = tf.keras.layers.Dense(2, activation="softmax")

    def call(self, x):
        x = self.conv2d1(x)
        x = self.conv2d2(x)
        x = self.maxpool1(x)
        x = self.dropout1(x)
        x = self.conv2d3(x)
        x = self.conv2d4(x)
        x = self.maxpool2(x)
        x = self.dropout2(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dense2(x)
        return x
As the forward pass shows, the network consists of two convolutional units followed by one fully connected unit. Each convolutional unit contains two convolutional layers (16 or 32 kernels each, kernel size 5×5), one 4×4 max-pooling layer, and one Dropout layer with rate 0.2 (during training it randomly zeroes 20% of the neuron outputs, acting as regularization).
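As a quick sanity check on this architecture, the sketch below traces the tensor shapes through the network (these follow from Conv2D's default "valid" padding and MaxPool2D's default stride) and builds the model with a dummy forward pass. This is an illustration, not part of the original training code:

import tensorflow as tf

# Shape trace, assuming the ConvM class defined above:
#   input    (1, 1100, 1100,  1)
#   conv2d1  (1, 1096, 1096, 16)   # 1100 - 5 + 1
#   conv2d2  (1, 1092, 1092, 16)
#   maxpool1 (1,  273,  273, 16)   # floor(1092 / 4)
#   conv2d3  (1,  269,  269, 32)
#   conv2d4  (1,  265,  265, 32)
#   maxpool2 (1,   66,   66, 32)   # floor(265 / 4)
#   flatten  (1, 139392)           # 66 * 66 * 32
#   dense1   (1, 256), dense2 (1, 2)
m = ConvM()
m(tf.zeros((1, 1100, 1100, 1)))  # dummy forward pass builds the weights
m.summary()                      # prints per-layer output shapes and parameter counts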
Given the complexity of X-ray image data, preprocessing resizes each image to 1100×1100 pixels, converts it to grayscale, and normalizes it by dividing by its maximum pixel value:
def gettensor(folder_path, file_name):
    im = img.open(os.path.join(folder_path, file_name)).convert('L')   # grayscale
    im = im.resize((1100, 1100), img.LANCZOS)
    tens = tf.constant(im, dtype=tf.float32)
    tens = tf.reshape(tens, (1, 1100, 1100, 1)) / tf.reduce_max(tens)  # normalize to [0, 1]
    del im
    return tens
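A quick usage check: the returned tensor is already batched and scaled to [0, 1] (the folder and file name below are placeholders, not files from the actual dataset):

t = gettensor("some_folder", "example.jpeg")   # hypothetical paths
print(t.shape)                   # (1, 1100, 1100, 1)
print(float(tf.reduce_max(t)))   # 1.0 after the max-value normalization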
This raises a serious problem: data this large threatens both GPU memory and host memory, and once either fills up, training cannot continue. After several attempts, the following strategy emerged: use GPU memory and host memory together, and do not wrap the training and test data into Dataset objects in advance. For the training data, use GPU memory: each epoch, randomly sample 200 images by file name and concatenate them into a single 4-D tensor. For the test data, use host memory: instead of pre-concatenating into a high-dimensional tensor, read and process one image at a time, repeating 100 times per epoch, i.e., validating on 100 test samples each epoch. At the cost of some training speed, this keeps training sustainable.
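To put a number on the concern, here is a rough back-of-the-envelope estimate of the per-epoch training tensor alone (assuming float32 storage; activations and gradients during backpropagation cost considerably more on top of this):

# Rough size of one epoch's 200-image training tensor (assumption: float32).
n, h, w, c = 200, 1100, 1100, 1   # images per epoch, height, width, channels
bytes_per_float32 = 4
print(n * h * w * c * bytes_per_float32 / 1024**3)  # ~0.90 GiB before activations/gradients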
In addition, to ensure model quality, the following training policy was adopted: once test-set accuracy exceeds 87%, training pauses, the number of test samples per round is raised to 300, and five validation rounds are run. If all five rounds pass, training terminates; if any round's accuracy falls below 84%, the training phase restarts and training continues. The full training process is shown in the following code:
if __name__ == "__main__":
    os.system('featurize event create --title "Connectivity test" --content "This is the content"')
    verbose_dir = "/home/featurize/work/verbose.txt"
    file = open(verbose_dir, "w")
    file.write("New Training\n------------------\n")
    file.close()
    flag = True          # True: training phase; False: validation phase
    counter = 0          # number of validation rounds passed
    test_range = 100     # test samples drawn per epoch
    EPOCHS = 500
    for epoch in range(EPOCHS):
        train_loss.reset_states()
        test_loss.reset_states()
        train_accuracy.reset_states()
        test_accuracy.reset_states()
        train_list = np.random.randint(0, len(train_y), 200)           # sample 200 training indices
        train_tensor = prepare(train_list, train_folder, train_name)   # one (200, 1100, 1100, 1) tensor
        test_list = np.random.randint(0, len(test_y), test_range)
        if flag:
            j = 0
            for i in train_list:
                image = tf.reshape(train_tensor[j], (1, 1100, 1100, 1))
                j = j + 1
                label = train_y[i]
                train_step(image, label)
                # Periodic checkpoint (with only 200 samples per epoch this never fires)
                if j % 500 == 0 and j != 0:
                    file = open(verbose_dir, "a")
                    file.write(f"Training(epoch={epoch+1}), Accuracy=%{train_accuracy.result()*100}\n")
                    file.close()
                    model.save(model_path, save_format="tf")
        for i in test_list:
            image = gettensor(test_folder, test_name[i])
            label = test_y[i]
            test_step(image, label)
        if flag == True and test_accuracy.result() >= 0.87:
            flag = False
            test_range = 300
        elif flag == False and test_accuracy.result() >= 0.84 and counter <= 4:
            counter = counter + 1
            file = open(verbose_dir, 'a')
            file.write(f"Validating. Test accuracy = %{test_accuracy.result()*100}.\n")
            file.close()
        elif flag == False and test_accuracy.result() >= 0.84 and counter > 4:
            file = open(verbose_dir, 'a')
            file.write(f"Validating finished. Final test accuracy = %{test_accuracy.result()*100}. Training finished.\n")
            file.close()
            os.system('featurize event create --title "Training target reached" --content "This is the content"')
            break
        elif flag == False and test_accuracy.result() < 0.84:
            flag = True
            counter = 0
            test_range = 100
        else:
            pass
        model.save(model_path, save_format="tf")
        print("Model Saved.")
        file = open(verbose_dir, "a")
        file.write(f"Epoch {epoch+1}, "
                   f"Train Loss: {train_loss.result()}, "
                   f"Train Accuracy: %{train_accuracy.result()*100}, "
                   f"Test Loss: {test_loss.result()}, "
                   f"Test Accuracy: %{test_accuracy.result()*100}.\n")
        file.close()
    file = open(verbose_dir, 'a')
    file.write("All Epochs Are Done.\n")
    file.close()
    os.system('featurize event create --title "Instance released" --content "This is the content"')
    os.system("featurize instance release")
The prepare function used above concatenates the sampled images into one 4-D tensor. It is defined as follows:
def prepare(index_list, folder_path, file_name_list):
    # ("index_list" renamed from "list" to avoid shadowing the Python builtin)
    x = gettensor(folder_path, file_name_list[index_list[0]])
    for i in index_list[1:]:
        sup = gettensor(folder_path, file_name_list[i])
        x = tf.concat((x, sup), axis=0)   # stack along the batch dimension
    return x
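The training loop also calls train_step, which is not shown in the post. A minimal sketch, assuming the same loss_obj, optimizer, and metric objects that the validation script below defines, would look like this (an illustration, not the author's verbatim code):

# Assumed implementation of train_step: standard TF2 custom training step
@tf.function
def train_step(image, label):
    with tf.GradientTape() as tape:
        predict = model(image, training=True)   # forward pass with dropout active
        loss = loss_obj(label, predict)         # sparse categorical cross-entropy
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)                            # accumulate the epoch metrics
    train_accuracy(label, predict)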
After training, the model is further validated on the entire test set with the following code:
import tensorflow as tf
from tensorflow.keras import Model
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import PIL.Image as img
import os
from scipy.io import savemat, loadmat

model = tf.keras.models.load_model("/home/featurize/work/System/F1_M_90")

@tf.function
def test_step(image, label):
    predict = model(image, training=False)   # inference mode: dropout disabled
    loss = loss_obj(label, predict)
    test_loss(loss)
    test_accuracy(label, predict)

def gettensor(folder_path, file_name):
    im = img.open(os.path.join(folder_path, file_name)).convert('L')
    im = im.resize((1100, 1100), img.LANCZOS)
    tens = tf.constant(im, dtype=tf.float32)
    tens = tf.reshape(tens, (1, 1100, 1100, 1)) / tf.reduce_max(tens)
    del im
    return tens

test_folder = "/home/featurize/data/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/test"
test_name = pd.read_csv("/home/featurize/work/data/test_shuffled.csv")["X_ray_image_name"]
test_name = np.array(test_name)
l_test_y = pd.read_csv("/home/featurize/work/data/test_shuffled.csv")["Label"]

def numberize(l_y):
    # Map text labels to integers: "Normal" -> 0, anything else -> 1
    y = []
    for lab in l_y:
        if lab == "Normal":
            y.append(0)
        else:
            y.append(1)
    return y

test_y = tf.constant(numberize(l_test_y), dtype=tf.int32)
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
optimizer = tf.keras.optimizers.RMSprop(0.001, 0.9)
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")
test_loss = tf.keras.metrics.Mean(name="test_loss")
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="test_accuracy")

for i in range(len(test_y)):
    image = gettensor(test_folder, test_name[i])
    label = test_y[i]
    test_step(image, label)

print(f"Accuracy: %{test_accuracy.result()*100}")
print(f"length:{len(test_y)}")
Running this prints the accuracy and the test-set size (ahem, the percent sign got written in front of the number; no real harm done...). As can be seen, the model reaches an accuracy of 90.22% on the entire test set.