TensorFlow 读取CSV文件

最新推荐文章于 2024-06-13 23:13:14 发布

廷益--飞鸟

最新推荐文章于 2024-06-13 23:13:14 发布

阅读量667

点赞数

分类专栏： TensorFlow

本文链接：https://blog.csdn.net/weixin_45875105/article/details/104329505

版权

TensorFlow 专栏收录该内容

70 篇文章 11 订阅

订阅专栏

数据文件
链接：https://pan.baidu.com/s/1igjhKnIBTx1Lqujyf1iB1Q
提取码：yuwg

TensorFlow 读取CSV文件 1.0

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time

from tensorflow import keras

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Report the TensorFlow version, the interpreter version, and then the
# name/version pair of every library this script relies on.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)
    
def csvread(file_list):
    """Build TF1 queue-based ops that read two-column CSV rows in batches.

    :param file_list: list of CSV file paths (directory + file name)
    :return: ``(example_batch, label_batch)`` tensors, one batch of 20 rows each
    """
    # 1. Queue holding the names of the files to read.
    filename_queue = tf.train.string_input_producer(file_list)

    # 2. Line-oriented reader: every read() pulls one text line off the queue.
    line_reader = tf.TextLineReader()
    _, line = line_reader.read(filename_queue)
    print(line)

    # 3. Decode the line. record_defaults fixes each column's type and the
    #    fallback value; string defaults mean both columns are read as strings.
    column_defaults = [["None"], ["None"]]
    example, label = tf.decode_csv(line, record_defaults=column_defaults)
    print(example, label)

    # 4. Group single rows into batches of 20.
    batched = tf.train.batch(
        [example, label], batch_size=20, num_threads=1, capacity=20
    )
    example_batch, label_batch = batched
    print(example_batch, label_batch)
    return example_batch, label_batch

if __name__ == "__main__":
    # Collect every entry of ./csvdata/ as a "directory + name" path.
    file_names = os.listdir("./csvdata/")
    file_list = [os.path.join("./csvdata/", file) for file in file_names]
    print(file_list)
    example_batch, label_batch = csvread(file_list)

    # Evaluate the batch tensors inside a session.
    with tf.Session() as sess:
        # Coordinator used to signal and join the reader threads.
        coord = tf.train.Coordinator()
        # Start the threads that fill the input queues.
        threads = tf.train.start_queue_runners(sess, coord)

        try:
            print(sess.run([example_batch, label_batch]))
        finally:
            # Always stop and join the reader threads, even when sess.run
            # raises; otherwise they would be left running.
            coord.request_stop()
            coord.join(threads)

TensorFlow 读取CSV文件 2.0

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Report the TensorFlow version, the interpreter version, and then the
# name/version pair of every library this script relies on.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

def parse_csv_line(line, n_fields=2):
    """Decode one CSV text line into a feature value and a target value.

    Every column is parsed as a string, with "None" as the column default.

    :param line: a single line of CSV text
    :param n_fields: number of columns in the file (sizes the defaults list)
    :return: ``(x, y)`` where x is the first column and y is the second column
    """
    # 1. One string default per column; the default also sets the column type.
    none_default = tf.constant("None")
    column_defaults = [none_default] * n_fields

    # 2. Split the line into one tensor per column.
    columns = tf.io.decode_csv(line, column_defaults)

    # Only the first two columns are used. For wider files,
    # tf.stack(columns[0:-1]) / tf.stack(columns[-1:]) would take every column
    # but the last as features and the last column as the target.
    feature_column = columns[0]
    target_column = columns[1]
    return tf.stack(feature_column), tf.stack(target_column)

def csv_reader_dataset(filenames, n_readers=5, batch_size=32,
                      n_parse_threads=5, shuffle_buffer_size=10000):
    """Build a batched tf.data pipeline over the lines of several CSV files.

    :param filenames: list of CSV file paths (or file patterns)
    :param n_readers: number of files whose lines are interleaved at a time
    :param batch_size: number of rows per batch
    :param n_parse_threads: number of parallel calls used to decode lines
    :param shuffle_buffer_size: size of the buffer used to shuffle lines
    :return: a dataset yielding ``(x, y)`` batches produced by parse_csv_line
    """
    # Dataset of file names.
    dataset = tf.data.Dataset.list_files(filenames)

    # Repeat the file names indefinitely, so the dataset never runs out.
    dataset = dataset.repeat()

    # Turn each file name into a dataset of its text lines and interleave them.
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename),  # add .skip(1) to drop a header row
        cycle_length=n_readers
    )

    # Shuffle the lines. Dataset transformations return a new dataset, so the
    # result has to be assigned back; the bare call used here before had no
    # effect and left shuffle_buffer_size unused.
    dataset = dataset.shuffle(shuffle_buffer_size)

    # Decode every line into (x, y).
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)

    # Group rows into batches.
    dataset = dataset.batch(batch_size)

    return dataset

if __name__ == "__main__":
    # 1. Collect every entry of ./csvdata/ as a "directory + name" path.
    file_names = os.listdir("./csvdata/")
    file_list = [os.path.join("./csvdata/", name) for name in file_names]
    print(file_list)

    # 2. Build the dataset, three rows per batch.
    train_set = csv_reader_dataset(file_list, batch_size=3)

    # 3. Show the first batch only.
    first_batch = train_set.take(1)
    for x_batch, y_batch in first_batch:
        print("x:", x_batch, sep="\n")
        print("y:", y_batch, sep="\n")

2、读取csv文件

链接：https://pan.baidu.com/s/1t4IoQc0D36dAJ2b8-o2vug
提取码：wbcy
在这里插入图片描述

import tensorflow as tf
import numpy as np

def read_csv_line(line, n_fields=2):
    """Decode one six-column CSV line into a feature vector and a label.

    :param line: a single line of CSV text
    :param n_fields: accepted but not used; the column layout is fixed below
    :return: ``(x, y)`` where x stacks columns 1 through 4 and y is the last column
    """
    # The defaults fix each column's type: int, four floats, int.
    int_zero, float_zero = 0, 0.0
    column_defaults = (
        [tf.constant(int_zero)]
        + [tf.constant(float_zero) for _ in range(4)]
        + [tf.constant(int_zero)]
    )

    # Split the line into one tensor per column.
    columns = tf.io.decode_csv(line, column_defaults)

    # The first column is dropped, the middle ones form the features and the
    # last one is the label.
    features = tf.stack(columns[1:-1])
    label = tf.stack(columns[-1])
    return features, label

def csv_reader_dataset(filenames, n_readers=5, batch_size=32,
                      n_parse_threads=5, shuffle_buffer_size=10000):
    """Build a shuffled, batched tf.data pipeline over the rows of CSV files.

    :param filenames: list of CSV file paths (or file patterns)
    :param n_readers: number of files whose lines are interleaved at a time
    :param batch_size: number of rows per batch
    :param n_parse_threads: number of parallel calls used to decode lines
    :param shuffle_buffer_size: size of the buffer used to shuffle rows
    :return: a dataset yielding ``(x, y)`` batches produced by read_csv_line
    """
    def lines_without_header(filename):
        # Lines of one file, minus its first (header) row.
        return tf.data.TextLineDataset(filename).skip(1)

    return (
        tf.data.Dataset.list_files(filenames)                       # file names
        .repeat()                                                   # loop forever
        .interleave(lines_without_header, cycle_length=n_readers)   # file -> lines
        .map(read_csv_line, num_parallel_calls=n_parse_threads)     # line -> (x, y)
        .shuffle(shuffle_buffer_size)                               # mix decoded rows
        .batch(batch_size)                                          # rows -> batches
    )

# 1. Build the training dataset from the training CSV, eight rows per batch.
train_set = csv_reader_dataset(["iris_training.csv"], batch_size=8)

# 2. Show the first batch only.
first_train_batch = train_set.take(1)
for x_batch, y_batch in first_train_batch:
    print("x:", x_batch, sep="\n")
    print("y:", y_batch, sep="\n")

在这里插入图片描述

# 1. Build the test dataset from the test CSV, eight rows per batch.
test_ds = csv_reader_dataset(["iris_test.csv"], batch_size=8)

# 2. Show the first batch only.
first_test_batch = test_ds.take(1)
for x_batch, y_batch in first_test_batch:
    print("x:", x_batch, sep="\n")
    print("y:", y_batch, sep="\n")

在这里插入图片描述

廷益--飞鸟

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
打赏
1
评论
TensorFlow 读取CSV文件

TensorFlow 读取CSV文件 1.0：import matplotlib as mpl；import matplotlib.pyplot as plt；%matplotlib inline；import numpy as np；import sklearn；import pandas as pd；import os；import sys；import time；from tensorfl...
复制链接

扫一扫