声明:本文仅提供了如何生成k折数据集。并没有提供实现k折的具体过程。避免有人骂我,提供一个K折的例子。个人感觉吧,挺简单的,用tf.Keras实现的。和keras很像,可惜我keras太弱,虽然模型可以看懂,但是很多逻辑我有点云里雾里,不说也罢。
附链接:https://blog.csdn.net/coolyuan/article/details/104276183
(1)K折交叉验证生成TFrecords
import os
import tensorflow as tf
import numpy as np
from PIL import Image
from sklearn.model_selection import StratifiedKFold
# --------------------------------------- Parameters ---------------------------------------------------
# region 1. Parameter section
"""上次生成数据信息
0 Beach
1 Farmland
2 Mountain
3 River
4 Bridge
5 Forest
6 Meadow
样本总数:7068
训练集数量:702
测试集总量:702
验证集总量:572
"""
# Root directory that holds one sub-directory of images per class.
orig_picture = 'D:/1/XiongAnDatasets/AID_1'
# Class names to recognize.
# BUG FIX: this was a set literal, so enumerate(classes) assigned a different
# label index to each class on every run (string hashing is randomized per
# process; the author's closing note even warns labels get reshuffled).
# A list makes the class -> label mapping deterministic across runs.
classes = ['Bridge', 'Meadow', 'River', 'Mountain', 'Beach', 'Farmland', 'Forest']
# Target image size: every image is resized to this before serialization.
new_height = 200
new_width = 200
new_channels = 3
# Output path prefixes for the train / test / validation TFRecords files.
TF_train = "D:/1/tf_file/train_"
TF_test = "D:/1/tf_file/test_"
TF_valid = "D:/1/tf_file/valid_"
# Number of folds for K-fold cross-validation.
n_splits = 10
# Train vs. held-out split fraction (name "radio" is kept for compatibility;
# the intended word is "ratio").
radio = 0.9
# Running counters, filled in by the __main__ section below.
All_examples = 0
Train_examples = 0
Test_examples = 0
Valid_examples = 0
# endregion
# ---------------------------------- Functions --------------------------------------------
# region 2. Function section
"""说明
(1)get_files(): 数据读取并打乱
(2)create_record(): 制作TFRecords
"""
# region (1) Read the data (order already shuffled)
def get_files():
    """Collect every image path under `orig_picture` with its class label.

    Walks one sub-directory per entry in `classes`, labels each image with
    the class's index, shuffles paths and labels in lockstep, and returns
    (image_list, label_list).

    Returns:
        image_list: list[str] of absolute image file paths, shuffled.
        label_list: list[int] of class indices aligned with image_list.

    BUG FIX: the original stacked paths and labels into one numpy array
    (`np.array([class_train, label_train])`), which coerces the integer
    labels to strings (a mixed-type array takes a string dtype). Shuffling
    the paired list directly keeps the labels as real ints.
    """
    image_paths = []
    labels = []
    for index, name in enumerate(classes):
        print(index, name)
        class_dir = orig_picture + '/' + name
        for fname in os.listdir(class_dir):
            image_paths.append(class_dir + '/' + fname)
            labels.append(index)
    # Shuffle path/label pairs together so alignment is preserved.
    paired = list(zip(image_paths, labels))
    np.random.shuffle(paired)
    image_list = [path for path, _ in paired]
    label_list = [label for _, label in paired]
    return image_list, label_list
# endregion
# region (3) Build the TFRecords data
def create_record(img_list, lab_list, path):
    """Serialize paired images and labels into one TFRecords file.

    Args:
        img_list: sequence of image file paths.
        lab_list: sequence of labels aligned with img_list (int or
            int-convertible string).
        path: output .tfrecords file path.

    Each record stores the raw resized pixel bytes under "image" and the
    integer label under "label".
    """
    with tf.python_io.TFRecordWriter(path) as writer:
        # Paths and labels are index-aligned, so iterate them in lockstep.
        for img_path, label in zip(img_list, lab_list):
            img = Image.open(img_path)
            # BUG FIX: force 3 channels so every serialized image has the
            # same byte length (new_height * new_width * new_channels).
            # Grayscale or RGBA inputs would otherwise produce records of
            # a different size and corrupt the dataset on read-back.
            img = img.convert('RGB')
            img = img.resize((new_width, new_height), Image.ANTIALIAS)
            image_val = img.tobytes()  # raw pixel bytes
            label_val = int(label)
            # Build one Example protobuf per sample.
            example = tf.train.Example(features=tf.train.Features(feature={
                "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_val])),
                "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label_val])),
            }))
            # Serialize and append to the file.
            writer.write(example.SerializeToString())
    return None
# endregion
# endregion
# ---------------------------------------执行---------------------------------------------------
if __name__ == '__main__':
    # region (1) Split the dataset
    # region (1.1) Prepare images and labels
    images_list, labels_list = get_files()
    All_examples = len(images_list)
    # Array views so fold indices can be applied with fancy indexing.
    images_arr = np.array(images_list)
    labels_arr = np.array(labels_list)
    # endregion
    # region (1.2) Run the stratified K-fold split
    skf = StratifiedKFold(n_splits=n_splits)
    split_i = 0
    for train_index, test_index in skf.split(images_list, labels_list):
        split_i = split_i + 1
        # BUG FIX: the original sliced images_list[0:int(len(train_index)*radio)]
        # instead of indexing with train_index, so every fold's "training set"
        # was just the first 90% of the data and overlapped the test fold,
        # defeating cross-validation. Use the fold indices directly.
        fold_train_x = images_arr[train_index]
        fold_train_y = labels_arr[train_index]
        X_test, Y_test = images_arr[test_index], labels_arr[test_index]
        # Carve a disjoint validation slice off the end of the training fold.
        # BUG FIX: the original kept the validation samples inside X_train
        # (train/valid overlap) and its `:-1` slice silently dropped the
        # last sample.
        cut = int(len(fold_train_x) * radio)
        X_train, Y_train = fold_train_x[:cut], fold_train_y[:cut]
        X_valid, Y_valid = fold_train_x[cut:], fold_train_y[cut:]
        Train_examples = len(X_train)
        Test_examples = len(X_test)
        Valid_examples = len(X_valid)
        # region (1.2.1) Write this fold's three TFRecords files
        create_record(X_train, Y_train, TF_train + str(split_i) + ".tfrecords")
        create_record(X_test, Y_test, TF_test + str(split_i) + ".tfrecords")
        create_record(X_valid, Y_valid, TF_valid + str(split_i) + ".tfrecords")
        print(str(split_i) + ".tfrecords文件生成成功!")
        # endregion
    print("样本总数:\n", All_examples)
    # BUG FIX: the original printed Test_examples under the training-set label.
    print("训练集数量:\n", Train_examples)
    print("测试集总量:\n", Test_examples)
    print("验证集总量:\n", Valid_examples)
    print("(n_splits折划分的结果(训练集、验证集和测试集的数量)并不一定每次都相同(可以通过"
          "将上述打印缩进到上面的for循环中查看)。但是上下最多错5个左右,所以在训练时,"
          "建议train_batch_size>20且验证集和测试集的batch_size=1,即每次取出来一个,"
          "预测结果对比真实值,如果相同,true_num+=1,"
          "最终准确率为:true_num/len(验证集或者测试集的总数)")
    print("每次生成时,类别以及类别对应的编号会被重新打乱,我在运行时控制台的开头将该信息进行了打印输出,"
          "建议记录下来。方便后期验证。")
    # endregion
(2)非交叉验证生成TFrecords
修改下主函数就行
if __name__ == '__main__':
    # region (1) Split the dataset
    # region (1.1) Prepare images and labels
    images_list, labels_list = get_files()
    All_examples = len(images_list)
    # Array views so the splits can be taken with numpy slicing.
    images_arr = np.array(images_list)
    labels_arr = np.array(labels_list)
    # endregion
    # BUG FIX: the original took the test set from the FIRST 10% of the data
    # (`[0:int(All_examples * (1 - radio))]`), which is entirely contained in
    # the first-90% training slice — train and test overlapped completely.
    # Take the LAST 10% instead so the two sets are disjoint.
    split = int(All_examples * radio)
    train_x, train_y = images_arr[:split], labels_arr[:split]
    X_test, Y_test = images_arr[split:], labels_arr[split:]
    # Carve a disjoint validation slice off the end of the training part.
    # BUG FIX: the original kept the validation samples inside X_train and
    # its `:-1` slice silently dropped the last sample.
    cut = int(len(train_x) * radio)
    X_train, Y_train = train_x[:cut], train_y[:cut]
    X_valid, Y_valid = train_x[cut:], train_y[cut:]
    Train_examples = len(X_train)
    Test_examples = len(X_test)
    Valid_examples = len(X_valid)