Tensorflow生成Train、valid、test.TFRecords文件实例+Kfold生成

声明:本文仅提供了如何生成K折数据集,并没有给出执行K折交叉验证训练的具体过程。为避免误解,这里附一个K折的完整例子(链接见下),它用tf.Keras实现,写法和Keras很接近,整体比较简单;模型部分容易看懂,只是其中部分训练逻辑笔者也未完全吃透,此处不展开。
附链接:https://blog.csdn.net/coolyuan/article/details/104276183

(1)K折交叉验证生成TFrecords

import os
import tensorflow as tf
import numpy as np
from PIL import Image
from sklearn.model_selection import StratifiedKFold

# --------------------------------------- Parameters ---------------------------------------------------
# region 1. Parameter settings

"""上次生成数据信息
0 Beach
1 Farmland
2 Mountain
3 River
4 Bridge
5 Forest
6 Meadow
样本总数:7068
训练集数量:702
测试集总量:702
验证集总量:572
"""

# Root directory holding the original images, one subdirectory per class
orig_picture = 'D:/1/XiongAnDatasets/AID_1'

# Class names to recognize.
# NOTE(review): this is a set literal, so its iteration order is
# hash-dependent and changes between interpreter runs — the class-to-index
# mapping produced in get_files() is therefore not stable across runs.
classes = {'Bridge', 'Meadow', 'River', 'Mountain', 'Beach', 'Farmland', 'Forest'}

# Target image size every picture is resized to before serialization
new_height = 200
new_width = 200
new_channels = 3

# Output path prefixes for the train / test / valid TFRecords files
# (the fold number and ".tfrecords" suffix are appended later)
TF_train = "D:/1/tf_file/train_"
TF_test = "D:/1/tf_file/test_"
TF_valid = "D:/1/tf_file/valid_"

# Number of folds for K-fold cross validation
n_splits = 10

# Split ratio: fraction kept for training (the remainder feeds valid/test)
radio = 0.9

# Counters filled in while generating the records
All_examples = 0
Train_examples = 0
Test_examples = 0
Valid_examples = 0

# endregion
# ---------------------------------- Functions --------------------------------------------
# region 2. Function definitions

"""说明
(1)get_files(): 数据读取并打乱 
(2)create_record(): 制作TFRecords

"""


# region(1)读取数据(顺序已经打乱)
def get_files():
    """Collect image paths and integer labels for every class, shuffled in unison.

    Scans ``orig_picture/<class_name>`` for each class in ``classes``.
    Classes are enumerated in *sorted* order so the label-to-class mapping
    is deterministic across runs — iterating the raw set gives a
    hash-dependent order that changes every run (the original blog even
    warns that the class numbering gets reshuffled each time).

    Returns:
        tuple[list[str], list[int]]: parallel lists of image file paths and
        their integer class indices, shuffled together.
    """
    image_paths = []
    labels = []
    # sorted() pins the class -> index mapping between runs.
    for index, name in enumerate(sorted(classes)):
        print(index, name)
        class_dir = os.path.join(orig_picture, name)
        for pic in os.listdir(class_dir):
            image_paths.append(os.path.join(class_dir, pic))
            labels.append(index)
    # Shuffle paths and labels in lockstep WITHOUT routing them through a
    # 2-D numpy array: np.array([paths, labels]) coerced the int labels to
    # strings, forcing every consumer to call int() on them.
    pairs = list(zip(image_paths, labels))
    np.random.shuffle(pairs)
    image_list = [path for path, _ in pairs]
    label_list = [label for _, label in pairs]
    return image_list, label_list


# endregion


# region(3)制作TFRecords数据
def create_record(img_list, lab_list, path):
    """Serialize paired images and labels into one TFRecords file.

    Each image is opened with PIL, forced to RGB so every record carries
    exactly ``new_height * new_width * new_channels`` raw bytes (a
    grayscale or palette image would otherwise serialize with a different
    byte layout than the 3-channel one readers expect), resized to
    (new_width, new_height), and written as a tf.train.Example with an
    "image" bytes feature and a "label" int64 feature.

    Args:
        img_list: sequence of image file paths.
        lab_list: sequence of labels (ints or int-like strings), parallel
            to ``img_list``.
        path: output ``.tfrecords`` file path.

    Returns:
        None.
    """
    # region 1. Open the writer for the output path
    with tf.python_io.TFRecordWriter(path) as writer:
        # img_list and lab_list are parallel, so iterate them in lockstep.
        for img_path, label in zip(img_list, lab_list):
            img = Image.open(img_path)
            # Normalize the channel count to match new_channels = 3.
            img = img.convert('RGB')
            img = img.resize((new_width, new_height), Image.ANTIALIAS)
            image_val = img.tobytes()  # raw pixel bytes
            label_val = int(label)     # labels may arrive as strings
            example = tf.train.Example(features=tf.train.Features(feature={
                "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_val])),
                "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label_val])),
            }))
            # Serialize the example and append it to the file.
            writer.write(example.SerializeToString())
    return None


# endregion

# endregion
# ---------------------------------------执行---------------------------------------------------
if __name__ == '__main__':

    # region(1) Split the dataset

    # region(1.1) Gather image paths and labels
    images_list, labels_list = get_files()
    All_examples = len(images_list)
    # endregion

    # region(1.2) Stratified K-fold split
    images_arr = np.array(images_list)
    labels_arr = np.array(labels_list)
    skf = StratifiedKFold(n_splits=n_splits)
    split_i = 0
    for train_index, test_index in skf.split(images_list, labels_list):
        split_i = split_i + 1
        # BUG FIX: the original sliced [0:int(len(train_index) * radio)]
        # over the WHOLE dataset instead of indexing with train_index, so
        # the "training" set ignored the fold assignment entirely and
        # overlapped every test fold.
        fold_train_x = images_arr[train_index]
        fold_train_y = labels_arr[train_index]
        X_test, Y_test = images_arr[test_index], labels_arr[test_index]
        # Carve the validation set off the tail of this fold's training
        # data; the front part becomes the actual training set, so train
        # and valid are disjoint.  (The original reused the tail of the
        # full training split as validation without removing it, and its
        # `:-1` slice silently dropped the last sample.)
        cut = int(len(fold_train_x) * radio)
        X_train, Y_train = fold_train_x[:cut], fold_train_y[:cut]
        X_valid, Y_valid = fold_train_x[cut:], fold_train_y[cut:]

        Train_examples = len(X_train)
        Test_examples = len(X_test)
        Valid_examples = len(X_valid)

        # region(1.2.1) Write this fold's three TFRecords files
        create_record(X_train, Y_train, TF_train + str(split_i) + ".tfrecords")
        create_record(X_test, Y_test, TF_test + str(split_i) + ".tfrecords")
        create_record(X_valid, Y_valid, TF_valid + str(split_i) + ".tfrecords")
        print(str(split_i) + ".tfrecords文件生成成功!")
    # endregion

    print("样本总数:\n", All_examples)
    # BUG FIX: the original printed Test_examples for the training count.
    print("训练集数量:\n", Train_examples)
    print("测试集总量:\n", Test_examples)
    print("验证集总量:\n", Valid_examples)
    print("(n_splits折划分的结果(训练集、验证集和测试集的数量)并不一定每次都相同(可以通过"
          "将上述打印缩进到上面的for循环中查看)。但是上下最多错5个左右,所以在训练时,"
          "建议train_batch_size>20且验证集和测试集的batch_size=1,即每次取出来一个,"
          "预测结果对比真实值,如果相同,true_num+=1,"
          "最终准确率为:true_num/len(验证集或者测试集的总数)")

    print("每次生成时,类别以及类别对应的编号会被重新打乱,我在运行时控制台的开头将该信息进行了打印输出,"
          "建议记录下来。方便后期验证。")
    # endregion

(2)非交叉验证生成TFrecords

修改下主函数就行

if __name__ == '__main__':
    # region(1) Split the dataset (plain hold-out, no cross validation)

    # region(1.1) Gather image paths and labels
    images_list, labels_list = get_files()
    All_examples = len(images_list)
    # endregion

    # BUG FIX: the original took the test set as the FIRST
    # int(All_examples * (1 - radio)) samples — a strict subset of the
    # first radio-fraction used for training, so train and test overlapped
    # completely.  Take the test set from the tail instead.
    split_at = int(All_examples * radio)
    X_train = np.array(images_list)[:split_at]
    Y_train = np.array(labels_list)[:split_at]
    X_test = np.array(images_list)[split_at:]
    Y_test = np.array(labels_list)[split_at:]

    # Carve validation off the end of the training slice and shrink the
    # training set accordingly so the two are disjoint.  Use `:` rather
    # than the original `:-1`, which silently dropped the last sample.
    valid_at = int(len(X_train) * radio)
    X_valid, Y_valid = X_train[valid_at:], Y_train[valid_at:]
    X_train, Y_train = X_train[:valid_at], Y_train[:valid_at]

    Train_examples = len(X_train)
    Test_examples = len(X_test)
    Valid_examples = len(X_valid)

  

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值