新手。。请多多指教~
数据集rsna-bone-age来自kaggle
云盘链接 提取码: n24q
因为上传限制,训练集分成了三个压缩包。解压后照着boneage-test-dataset,把图片放在boneage-training-dataset文件夹里。
参考代码:
Bone-Age-Detection-From-X-Ray
数据集解压后,boneage-training-dataset文件夹中是:
boneage-training-dataset.csv中是:
第一步,读取数据集
1
import os
import pandas as pd

print("Reading data...")
# FIX: the original line used full-width quotes (’/input/rsna-bone-age‘),
# which is a SyntaxError in Python; replaced with ASCII quotes.
base_data_dir = '/input/rsna-bone-age'
img_dir = os.path.join(base_data_dir, 'boneage-training-dataset/')
csv_dir = os.path.join(base_data_dir, 'boneage-training-dataset.csv')

# Each CSV row holds: id (image file name stem), boneage (months), male (bool).
boneage_df = pd.read_csv(csv_dir)
print(boneage_df)

column_headers = list(boneage_df.columns.values)
print(column_headers)            # column labels
print(boneage_df.sample(3))      # 3 random rows from the CSV
输出:
(12611, 3)
['id', 'boneage', 'male'] #列标签
id boneage male
11314 11314 96 True
11583 11583 228 True
4583 4583 94 False
参考:
1.pandas文档 —— pandas.read_csv
2.python pandas读取csv后,获取列标签
——————————————————————————————————
2
获得需要的标签和数据,比如性别(male或female),img文件夹中图片的路径等
# Derive the extra columns we need: full image path, a readable gender
# label, and a flag telling whether the image file really exists on disk.
boneage_df['path'] = boneage_df['id'].map(lambda x: img_dir+"{}.png".format(x))
boneage_df['gender'] = boneage_df['male'].map(lambda x: "male" if x else "female")
boneage_df['exists'] = boneage_df['path'].map(os.path.exists)  # check CSV rows match image files

# Standardize bone age to z-scores (vectorized; equivalent to the original
# per-row map but runs at C speed).
mu = boneage_df['boneage'].mean()
sigma = boneage_df['boneage'].std()
boneage_df['zscore'] = (boneage_df['boneage'] - mu) / sigma
boneage_df.dropna(inplace=True)

column_headers = list(boneage_df.columns.values)
# FIX: the printed message had a typo ("column_hander").
print("column_headers = ", column_headers)  # column labels
#print(boneage_df.sample(3))  # 3 random rows from the CSV
print("{} images found out of total {} images".format(boneage_df['exists'].sum(),boneage_df.shape[0]))
print("Reading complete !!!\n")
boneage_df[['boneage','zscore']].hist()
输出:
column_hander = ['id', 'boneage', 'male', 'path', 'exists', 'gender', 'zscore']
12611 images found out of total 12611 images
Reading complete !!!
横轴是骨龄(按月计算),纵轴是样本数量,标准化后,样本的骨龄数据转化为-3~3的分值
参考:
3.pandas中强大的绘制图表功能——DataFrame.hist()
4.3.4.2数据标准化(一) - Z-Score标准化
第二步,准备数据集(训练集/测试集/验证集)
1 划分数据集
from sklearn.model_selection import train_test_split
print("Preparing training, testing and validation datasets ...")
# Bucket bone age into 10 equal-width bins so the splits below can be stratified.
boneage_df['boneage_category'] = pd.cut(boneage_df['boneage'], 10)
# Split 60% train / 20% validation / 20% test, stratified by age bin.
raw_train_df, test_df = train_test_split(boneage_df, test_size = 0.2, random_state = 2018,stratify = boneage_df['boneage_category'])
raw_train_df, valid_df = train_test_split(raw_train_df,test_size = 0.25,random_state = 2018,stratify = raw_train_df['boneage_category'])
raw_train_df[['boneage']].hist(figsize = (10, 5)) # age histogram BEFORE balancing
# Training images: 7566 | Validation images: 2522 | Test images: 2523
# Balance the training set: sample 500 rows with replacement from each of the
# 10 age bins x 2 genders = 20 groups, giving exactly 10000 training rows.
train_df = raw_train_df.groupby(['boneage_category', 'male']).apply(lambda x: x.sample(500, replace = True)).reset_index(drop=True)
train_df[['boneage']].hist(figsize = (10, 5)) # age histogram AFTER balancing
# Training images: 10000 | Validation images: 2522 | Test images: 2523
train_size = train_df.shape[0]
valid_size = valid_df.shape[0]
test_size = test_df.shape[0]
print("# Training images: {}".format(train_size))
print("# Validation images: {}".format(valid_size))
print("# Test images: {}".format(test_size))
输出:
# Training images: 10000
# Validation images: 2522
# Test images: 2523
注:
数据均衡过程:
train_df = raw_train_df.groupby(['boneage_category', 'male']).apply(lambda x: x.sample(500, replace = True)).reset_index(drop=True)
raw_train_df中,boneage_category有10类,male有两类,排列组合共20种
每类重复采样500次,共得到 20 * 500 = 10000 个样本
参考:
5.pandas文档 ——pandas.cut
6. .sample method
7.Sample Pandas dataframe based on values in column
2 数据预处理
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.applications.inception_v3 import InceptionV3, preprocess_input
import tensorflow as tf
import numpy as np
IMG_SIZE = (224,224)   # target resolution for every generated image batch
BATCH_SIZE_TRAIN = 10
BATCH_SIZE_VAL = 16
SEED = 1234            # shared seed keeps paired image/gender generators aligned
# 将图像和年龄作为输入,构建数据生成器
def gen_2inputs(imgDatGen, df, batch_size, seed, img_size):
    """Endlessly yield ([image_batch, gender_batch], zscore_batch).

    Two flow_from_dataframe streams are driven off the same dataframe with an
    identical seed and shuffle=True, so their row order stays in lockstep:
    the first supplies images + z-score targets, the second gender labels.
    (Indentation reconstructed; the blog formatting had stripped it.)
    """
    shared = dict(dataframe=df, x_col='path',
                  batch_size=batch_size, seed=seed, shuffle=True,
                  class_mode='other', target_size=img_size,
                  color_mode='rgb', drop_duplicates=False)
    gen_img = imgDatGen.flow_from_dataframe(y_col='zscore', **shared)
    gen_gender = imgDatGen.flow_from_dataframe(y_col='gender', **shared)
    while True:
        imgs, zscores = gen_img.next()
        _, genders = gen_gender.next()
        yield [imgs, genders], zscores
def test_gen_2inputs(imgDatGen, df, batch_size, img_size):
    """Deterministic (shuffle=False, no seed) variant of gen_2inputs,
    used for evaluation so predictions line up with dataframe rows.
    (Indentation reconstructed; the blog formatting had stripped it.)
    """
    shared = dict(dataframe=df, x_col='path',
                  batch_size=batch_size, shuffle=False,
                  class_mode='other', target_size=img_size,
                  color_mode='rgb', drop_duplicates=False)
    gen_img = imgDatGen.flow_from_dataframe(y_col='zscore', **shared)
    gen_gender = imgDatGen.flow_from_dataframe(y_col='gender', **shared)
    while True:
        imgs, zscores = gen_img.next()
        _, genders = gen_gender.next()
        yield [imgs, genders], zscores
# Training-time augmentation: geometric jitter plus horizontal flips
# (hands may be left or right, so horizontal flip is meaningful;
# vertical flip is disabled).
train_idg = ImageDataGenerator(
    rotation_range=25,
    width_shift_range=0.25,
    height_shift_range=0.25,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=False,
    fill_mode='nearest',
    samplewise_center=False,
    samplewise_std_normalization=False)

# Lighter augmentation for validation; none at all for the test set.
val_idg = ImageDataGenerator(
    width_shift_range=0.25,
    height_shift_range=0.25,
    horizontal_flip=True)
test_idg = ImageDataGenerator()

# Wire each split to its two-input generator.
train_flow = gen_2inputs(train_idg, train_df, BATCH_SIZE_TRAIN, SEED, IMG_SIZE)
valid_flow = gen_2inputs(val_idg, valid_df, BATCH_SIZE_VAL, SEED, IMG_SIZE)
test_flow = test_gen_2inputs(test_idg, test_df, 500, IMG_SIZE)  # one 500-image batch
# Mean absolute error expressed in months: undo the z-score normalization
# (months = mu + sigma * z) before comparing.
# FIX: the original referenced an undefined name `boneage_div` and
# `mean_absolute_error` was not in scope at this point in the script;
# this now matches the version used when the model is compiled.
from keras.metrics import mean_absolute_error

def mae_months(in_gt, in_pred):
    return mean_absolute_error(mu + sigma * in_gt, mu + sigma * in_pred)
参考:
8.Tutorial on Keras flow_from_dataframe
9.师兄的代码 😄
第三步,构建神经网络
from keras.utils import plot_model
from keras.layers import Input, GlobalAveragePooling2D, Dense, Dropout, Flatten, Concatenate
from keras.models import Sequential,Model
from keras.metrics import mean_absolute_error
from keras.applications.inception_v3 import InceptionV3, preprocess_input
print("Compiling deep model ...")

IMG_SHAPE = (224, 224,3)

# 1. Two inputs: the X-ray image and a scalar gender flag.
image_in = Input(shape=IMG_SHAPE)
gender_in = Input(shape=(1,))

# 2-3. Image branch: ImageNet-pretrained InceptionV3 backbone,
# globally pooled into a feature vector.
backbone = InceptionV3(input_shape=IMG_SHAPE, include_top=False, weights='imagenet')
img_feat = GlobalAveragePooling2D()(backbone(image_in))
img_feat = Dropout(0.2)(img_feat)

# 4. Gender branch: a small dense embedding.
gender_feat = Dense(32, activation='relu')(gender_in)

# 5. Fuse both branches and regress the (z-scored) bone age.
merged = Concatenate(axis=-1)([img_feat, gender_feat])
hidden = Dense(1024, activation='relu')(merged)
hidden = Dropout(0.2)(hidden)
hidden = Dense(1024, activation='relu')(hidden)
hidden = Dropout(0.2)(hidden)
output_layer = Dense(1, activation='linear')(hidden)  # linear is what 16bit did

bone_age_model = Model(inputs=[image_in, gender_in], outputs=output_layer)

def mae_months(in_gt, in_pred):
    """MAE in months: map z-scores back to months via mu + sigma * z."""
    return mean_absolute_error(mu + sigma * in_gt, mu + sigma * in_pred)

bone_age_model.compile(optimizer='adam', loss='mse', metrics=[mae_months])
bone_age_model.summary()
print("Model compiled !!!\n")
第四步,训练模型
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import mean_absolute_error as sk_mae
import matplotlib.pyplot as plt
print("Training deep model ...")

# Step counts per epoch for each split.
EPOCHS = 10
BATCH_SIZE_TEST = len(test_df) // 3
STEP_SIZE_TEST = 3
STEP_SIZE_TRAIN = len(train_df) // BATCH_SIZE_TRAIN
STEP_SIZE_VALID = len(valid_df) // BATCH_SIZE_VAL

weight_path="bone_age_weights_trainable_inceptionv3_gender_50_epochs_relu_less_dropout_dense.best.hdf5" # saved_model_name

# Keep only the best weights (by validation loss); shrink the LR on plateaus
# and stop early if validation loss stalls.
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min', save_weights_only=True)
reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=10,
                                   verbose=1, mode='auto', epsilon=0.0001,
                                   cooldown=5, min_lr=0.0001)
early = EarlyStopping(monitor="val_loss", mode="min", patience=10)  # probably needs to be more patient, but kaggle time is limited
callbacks_list = [checkpoint, early, reduceLROnPlat]

# Train only if no saved weights exist yet (indentation of this `if` body was
# reconstructed; the blog formatting had stripped it).
if not os.path.exists(weight_path):
    model_history = bone_age_model.fit_generator(generator=train_flow,
                                                 steps_per_epoch=STEP_SIZE_TRAIN,
                                                 validation_data=valid_flow,
                                                 validation_steps=STEP_SIZE_VALID,
                                                 epochs=EPOCHS,
                                                 callbacks=callbacks_list)
    # FIX: save the loss history only when training actually ran; the original
    # referenced model_history unconditionally and raised NameError whenever
    # the weight file already existed.
    history_df = pd.DataFrame.from_dict(model_history.history)
    history_df.to_csv('loss_history.csv')
bone_age_model.load_weights(weight_path)
print("Training complete !!!\n")

# Evaluate model on test dataset (one big 500-image batch from test_flow).
print("Evaluating model on test data ...\n")
print("Preparing testing dataset...")
test_X, test_Y = next(test_flow)  # one big batch
print("Data prepared !!!")

# Map z-score predictions back to months. .flatten() turns the (N, 1)
# prediction array into (N,), so indexing below yields scalars instead of
# 1-element arrays when formatted with %2.1f.
pred_Y = (mu + sigma * bone_age_model.predict(x=test_X, batch_size=25, verbose=1)).flatten()
test_Y_months = mu + sigma * test_Y
print("Mean absolute error on test data: "+str(sk_mae(test_Y_months,pred_Y)))

# Scatter of predicted vs actual age (months).
fig, ax1 = plt.subplots(1, 1, figsize=(6, 6))
ax1.plot(test_Y_months, pred_Y, 'r.', label='predictions')
ax1.plot(test_Y_months, test_Y_months, 'b-', label='actual')
ax1.legend()
ax1.set_xlabel('Actual Age (Months)')
ax1.set_ylabel('Predicted Age (Months)')

# Show 8 sample hands spread evenly across the sorted age range.
ord_idx = np.argsort(test_Y)
ord_idx = ord_idx[np.linspace(0, len(ord_idx) - 1, num=8).astype(int)]  # take 8 evenly spaced ones
fig, m_axs = plt.subplots(2, 4, figsize=(16, 32))
for (idx, c_ax) in zip(ord_idx, m_axs.flatten()):
    c_ax.imshow(test_X[0][idx, :, :, 0], cmap='bone')
    title = 'Age: %2.1f\nPredicted Age: %2.1f\nGender: ' % (test_Y_months[idx], pred_Y[idx])
    # test_X[1] is the gender input batch: 0 -> female, nonzero -> male.
    if test_X[1][idx] == 0:
        title += "Female\n"
    else:
        title += "Male\n"
    c_ax.set_title(title)
    c_ax.axis('off')
plt.show()