TensorFlow文件读取操作

最新推荐文章于 2020-10-03 23:39:24 发布

宋建国

最新推荐文章于 2020-10-03 23:39:24 发布

阅读量491

点赞数

分类专栏：深度学习

本文链接：https://blog.csdn.net/hot7732788/article/details/92803945

版权

深度学习专栏收录该内容

6 篇文章 0 订阅

订阅专栏

所有文件类型的文件读取流程

在这里插入图片描述

1.CSV文件

csv文件读取流程

在这里插入图片描述

构造文件队列api
构造文件阅读器api并进行数据读取
对文件内容进行解码（一次一行）
读取多个文件开启批处理过程
在主线程中创建线程协调器并开启读取文件的子线程，用完后别忘记回收线程
代码

import tensorflow as tf
import os

def csv_read(file_list):
    """Build a queue-based pipeline that reads two-column CSV files in batches.

    :param file_list: paths of the CSV files to read
    :return: (col1_more, col2_more), the batched first and second columns
    """
    # Queue of file names that feeds the reader.
    filename_queue = tf.train.string_input_producer(file_list)

    # The text-line reader hands back one line per read; the key is not needed.
    line_reader = tf.TextLineReader()
    _, line = line_reader.read(filename_queue)

    # record_defaults fixes the type and the default value of every column:
    # a string first column and an integer second column.
    column_defaults = [["None"], [1]]
    first_col, second_col = tf.decode_csv(
        line,
        record_defaults=column_defaults,
        field_delim=",",
    )

    # Batch the rows so several are returned per run. Per the original note,
    # asking for more rows than exist makes the reader loop (rows repeat).
    col1_more, col2_more = tf.train.batch(
        [first_col, second_col],
        batch_size=10,
        num_threads=1,
        capacity=10,
    )

    print(col1_more, col2_more)
    return col1_more, col2_more

if __name__ == '__main__':
    # Collect every file in the data directory and join it with the directory path.
    data_dir = "./csv_data/"
    file_list = [os.path.join(data_dir, name) for name in os.listdir(data_dir)]
    example, label = csv_read(file_list)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:

        # Coordinator used to stop and join the reader threads.
        coord = tf.train.Coordinator()

        # Start the threads that fill the input queues.
        threads = tf.train.start_queue_runners(sess, coord=coord)

        try:
            # Print one batch, e.g.
            # [b'b' b'b' b'b' b'a' b'a' b'a' b'c' b'c' b'c' b'c'] [1 2 3 1 2 3 1 2 3 1]
            a, b = sess.run([example, label])
            print(a, b)

            # Training code can follow here.
        finally:
            # Always reclaim the reader threads, even if reading failed.
            coord.request_stop()
            coord.join(threads)

2.图片读取

图片读取器和图像解码api，图像解码有很多格式，这里只列举两种
为了统一图像特征值的数量（像素点数相同），需要把图片调整为统一大小
代码

import tensorflow as tf
import os


def tupian_read(file_list):
    """Build a queue-based pipeline that reads JPEG files and batches them.

    :param file_list: paths of the image files to read
    :return: a batch tensor holding 5 images, each resized to 200x200 with
        3 channels
    """
    # Queue of file names that feeds the reader.
    file_queue = tf.train.string_input_producer(file_list)

    # The whole-file reader returns one complete file (one image) per read.
    reader = tf.WholeFileReader()
    _, value = reader.read(file_queue)

    # Decode as JPEG. channels=3 forces RGB output so that a grayscale file
    # cannot contradict the 3-channel shape declared below.
    image = tf.image.decode_jpeg(value, channels=3)

    # Resize every image to the same size so all samples have the same
    # number of features.
    image_resize = tf.image.resize_images(image, [200, 200])

    # The static shape (including the channel count) must be fully defined,
    # otherwise the images cannot be batched.
    image_resize.set_shape([200, 200, 3])

    # Batch 5 images at a time.
    image_batch = tf.train.batch([image_resize], batch_size=5, num_threads=1, capacity=5)
    print(image_batch)

    # Return the batch rather than the single resized image, so the batching
    # built above is actually what the caller evaluates.
    return image_batch

if __name__ == '__main__':
    # Collect every file in the image directory and join it with the directory path.
    image_dir = "./狗/"
    file_list = [os.path.join(image_dir, name) for name in os.listdir(image_dir)]

    values = tupian_read(file_list)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Coordinator used to stop and join the reader threads.
        coord = tf.train.Coordinator()

        # Start the threads that fill the input queues.
        threads = tf.train.start_queue_runners(sess, coord=coord)

        try:
            # Print what was read.
            a = sess.run([values])
            print(a)

            # Training code can follow here.
        finally:
            # Always reclaim the reader threads, even if reading failed.
            coord.request_stop()
            coord.join(threads)

3.二进制文件

使用cifar10作为数据读取例子数据集
二进制文件阅读器
二进制文件解码器
- 源码（为了后面的转化为tfrecord方便直接做成了一个类）

import tensorflow as tf
import os


class erjinzhi_duqu:
    """Queue-based reader for fixed-length binary records (CIFAR-10 example).

    Each record is `label_bytes` label bytes followed by `image_bytes` image
    bytes.
    """

    def __init__(self, file_list):
        """
        :param file_list: paths of the binary files to read
        """
        self.file_list = file_list

        # Record layout: a 32x32x3 image preceded by a 1-byte label.
        self.height = 32
        self.width = 32
        self.channel = 3
        self.label_bytes = 1
        # Derived sizes (3072 and 3073) so they cannot drift from the dimensions.
        self.image_bytes = self.height * self.width * self.channel
        self.bytes = self.label_bytes + self.image_bytes

    def readanddecode(self):
        """Read, decode and batch the binary records.

        :return: (image_batch, label_batch) uint8 tensors holding 10 samples:
            images of shape [height, width, channel] and 1-element labels
        """
        # Queue of file names that feeds the reader.
        file_queue = tf.train.string_input_producer(self.file_list)

        # The fixed-length reader needs the byte size of one record.
        reader = tf.FixedLengthRecordReader(self.bytes)
        _, value = reader.read(file_queue)

        # Decode the raw bytes; label and image are still concatenated here.
        label_image = tf.decode_raw(value, tf.uint8)

        # Split label and image with tf.slice and constant sizes. In the
        # author's recorded output this gave static shapes (1,) and (3072,),
        # whereas Python-style slicing of the decoded tensor left the length
        # unknown (shape (?,)).
        # Cast here (e.g. label to tf.int32, image to tf.float32) if the
        # values are needed for arithmetic.
        label = tf.slice(label_image, [0], [self.label_bytes])
        image = tf.slice(label_image, [self.label_bytes], [self.image_bytes])

        # CIFAR-10 binary records store the image channel by channel
        # (all red values, then green, then blue), so reshape to
        # [channel, height, width] first and then move channels last.
        image_chw = tf.reshape(image, [self.channel, self.height, self.width])
        image_hwc = tf.transpose(image_chw, [1, 2, 0])

        # Batch 10 samples at a time.
        image_batch, label_batch = tf.train.batch(
            [image_hwc, label], batch_size=10, num_threads=1, capacity=10)

        return image_batch, label_batch



if __name__ == '__main__':
    # Collect the files in the data directory, keeping only names ending in "bin".
    data_dir = "./cifar10/cifar-10-batches-bin/"
    file_list = [os.path.join(data_dir, name) for name in os.listdir(data_dir) if name.endswith("bin")]
    br = erjinzhi_duqu(file_list)
    example, label = br.readanddecode()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:

        # Coordinator used to stop and join the reader threads.
        coord = tf.train.Coordinator()

        # Start the threads that fill the input queues.
        threads = tf.train.start_queue_runners(sess, coord=coord)

        try:
            # Print what was read.
            a, b = sess.run([example, label])
            print(a, b)

            # Training code can follow here.
        finally:
            # Always reclaim the reader threads, even if reading failed.
            coord.request_stop()
            coord.join(threads)

4.tfrecord文件的存储与读取

1.从二进制文件写入tfrecords文件

创建tfrecords存储器（注意：存储需要序列化）
example协议块填写参数
核心存储代码

    def write_to_tfrecords(self, image_batch, label_batch, save_path):
        """
        Store one batch of images and labels in a TFRecords file.

        Must be called while a default session is active, because the batch
        tensors are evaluated here.

        :param image_batch: batched image tensor
        :param label_batch: batched label tensor; each row holds one label value
        :param save_path: path of the TFRecords file to write
        :return: None
        :raises ValueError: if no default session is active
        """
        print("开始存储")

        session = tf.get_default_session()
        if session is None:
            raise ValueError("write_to_tfrecords must be called inside an active tf.Session")

        # Fetch both tensors in ONE run. Evaluating image_batch[i] and
        # label_batch[i] separately runs the graph twice, and each run pulls a
        # fresh batch, so images and labels would no longer belong together.
        images, labels = session.run([image_batch, label_batch])

        # The context manager closes the writer even if serialization fails.
        with tf.python_io.TFRecordWriter(save_path) as writer:
            for image, label in zip(images, labels):
                # Each label row looks like [9]; take the scalar out of it.
                example = tf.train.Example(features=tf.train.Features(feature={
                    "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image.tobytes()])),
                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[int(label[0])])),
                }))

                # Records must be serialized before they are written.
                writer.write(example.SerializeToString())

        print("存储完毕")

2.从tfrecords文件中读取数据（流程和前面的读取差不多，只是在解码之前多了一步解析example协议块）

在这里插入图片描述

整个从存储，到读取的代码

import tensorflow as tf
import os


class erjinzhi_duqu:
    """Read CIFAR-10 style binary records and convert them to/from TFRecords.

    Each binary record is `label_bytes` label bytes followed by `image_bytes`
    image bytes.
    """

    def __init__(self, file_list):
        """
        :param file_list: paths of the binary files to read
        """
        self.file_list = file_list

        # Record layout: a 32x32x3 image preceded by a 1-byte label.
        self.height = 32
        self.width = 32
        self.channel = 3
        self.label_bytes = 1
        # Derived sizes (3072 and 3073) so they cannot drift from the dimensions.
        self.image_bytes = self.height * self.width * self.channel
        self.bytes = self.label_bytes + self.image_bytes

    def readanddecode(self):
        """Read, decode and batch the binary records.

        :return: (image_batch, label_batch) uint8 tensors holding 10 samples:
            images of shape [height, width, channel] and 1-element labels
        """
        # Queue of file names that feeds the reader.
        file_queue = tf.train.string_input_producer(self.file_list)

        # The fixed-length reader needs the byte size of one record.
        reader = tf.FixedLengthRecordReader(self.bytes)
        _, value = reader.read(file_queue)

        # Decode the raw bytes; label and image are still concatenated here.
        label_image = tf.decode_raw(value, tf.uint8)

        # Split label and image with tf.slice and constant sizes. In the
        # author's recorded output this gave static shapes (1,) and (3072,),
        # whereas Python-style slicing of the decoded tensor left the length
        # unknown (shape (?,)).
        # Cast here (e.g. label to tf.int32, image to tf.float32) if the
        # values are needed for arithmetic.
        label = tf.slice(label_image, [0], [self.label_bytes])
        image = tf.slice(label_image, [self.label_bytes], [self.image_bytes])

        # CIFAR-10 binary records store the image channel by channel
        # (all red values, then green, then blue), so reshape to
        # [channel, height, width] first and then move channels last.
        image_chw = tf.reshape(image, [self.channel, self.height, self.width])
        image_hwc = tf.transpose(image_chw, [1, 2, 0])

        # Batch 10 samples at a time.
        image_batch, label_batch = tf.train.batch(
            [image_hwc, label], batch_size=10, num_threads=1, capacity=10)

        return image_batch, label_batch

    def write_to_tfrecords(self, image_batch, label_batch, save_path):
        """
        Store one batch of images and labels in a TFRecords file.

        Must be called while a default session is active, because the batch
        tensors are evaluated here.

        :param image_batch: batched image tensor
        :param label_batch: batched label tensor; each row holds one label value
        :param save_path: path of the TFRecords file to write
        :return: None
        :raises ValueError: if no default session is active
        """
        print("开始存储")

        session = tf.get_default_session()
        if session is None:
            raise ValueError("write_to_tfrecords must be called inside an active tf.Session")

        # Fetch both tensors in ONE run. Evaluating image_batch[i] and
        # label_batch[i] separately runs the graph twice, and each run pulls a
        # fresh batch, so images and labels would no longer belong together.
        images, labels = session.run([image_batch, label_batch])

        # The context manager closes the writer even if serialization fails.
        with tf.python_io.TFRecordWriter(save_path) as writer:
            for image, label in zip(images, labels):
                # Each label row looks like [9]; take the scalar out of it.
                example = tf.train.Example(features=tf.train.Features(feature={
                    "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image.tobytes()])),
                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[int(label[0])])),
                }))

                # Records must be serialized before they are written.
                writer.write(example.SerializeToString())

        print("存储完毕")

    def tfrecords_read(self, read_path):
        """Read, parse and batch the samples stored in a TFRecords file.

        :param read_path: path of the TFRecords file to read
        :return: (image_batch, label_batch) tensors holding 10 samples
        """
        # Queue of file names that feeds the reader.
        file_queue = tf.train.string_input_producer([read_path])

        # The TFRecord reader returns one serialized sample per read.
        reader = tf.TFRecordReader()
        _, value = reader.read(file_queue)

        # Parse the serialized Example; the result is a dict of tensors.
        features = tf.parse_single_example(value, features={
            "image": tf.FixedLenFeature([], tf.string),
            "label": tf.FixedLenFeature([], tf.int64),
        })
        print(features["image"], features["label"])

        # String features must be decoded; int and float features need no decoding.
        image = tf.decode_raw(features["image"], tf.uint8)
        label = features["label"]
        print(image, label)

        # Give the image a fixed shape so it can be batched.
        image_reshape = tf.reshape(image, [self.height, self.width, self.channel])
        print(image_reshape)

        # Batch 10 samples at a time.
        image_batch, label_batch = tf.train.batch(
            [image_reshape, label], batch_size=10, num_threads=1, capacity=10)
        print(image_batch, label_batch)
        return image_batch, label_batch







if __name__ == '__main__':
    # Collect the files in the data directory, keeping only names ending in "bin".
    data_dir = "./cifar10/cifar-10-batches-bin/"
    file_list = [os.path.join(data_dir, name) for name in os.listdir(data_dir) if name.endswith("bin")]
    br = erjinzhi_duqu(file_list)
    # Enable to read the binary files (needed for the write step below):
    # example, label = br.readanddecode()

    # Build the TFRecords reading pipeline; this must happen before the
    # session block below.
    read_path = "./写入的tfrecords文件/cifar10.tfrecords"
    image_batch, label_batch = br.tfrecords_read(read_path)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:

        # Coordinator used to stop and join the reader threads.
        coord = tf.train.Coordinator()

        # Start the threads that fill the input queues.
        threads = tf.train.start_queue_runners(sess, coord=coord)

        try:
            # Enable to print what was read from the binary files:
            # a, b = sess.run([example, label])
            # print(a, b)

            # Enable to save the binary data in TFRecords format:
            save_path = "./写入的tfrecords文件/cifar10.tfrecords"
            # br.write_to_tfrecords(example, label, save_path)

            print(sess.run([image_batch, label_batch]))
        finally:
            # Always reclaim the reader threads, even if reading failed.
            coord.request_stop()
            coord.join(threads)