TensorFlow 2.0 Study Notes: Data Processing with tf.data

  1. Introduction to the tf.data API
  2. Processing CSV data with tf.data
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras
print(tf.__version__)
2.0.0

1. Introduction to the tf.data API

# Build a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
print(dataset)
<TensorSliceDataset shapes: (), types: tf.int32>
# Iterate over the dataset
for item in dataset:
    print(item)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
# 1. repeat: repeat the dataset for several epochs
# 2. batch: group elements into batches of batch_size
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)
tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)
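The last batch above has only 2 elements because 30 elements do not divide evenly into batches of 7. If every batch must have exactly the same size, batch accepts a drop_remainder flag; a minimal sketch (dataset_even is a new variable, not part of the original notes):

dataset_even = tf.data.Dataset.from_tensor_slices(np.arange(10)).repeat(3).batch(7, drop_remainder=True)
for item in dataset_even:
    print(item)  # four batches of 7; the trailing [8 9] batch is dropped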
# interleave: transform each element of the dataset into a dataset and merge the results into one new dataset
# Typical use: a dataset of filenames -> a dataset of the records inside those files

dataset2 = dataset.interleave(
    lambda v: tf.data.Dataset.from_tensor_slices(v), # wrap each element v into its own dataset
    cycle_length = 5, # cycle_length: how many inputs are processed concurrently
    block_length = 5  # block_length: how many elements to take from each input at a time
)
for item in dataset2:
    print(item)
# The first 5 elements come from the first 5 elements of the first tensor in dataset, and so on.
# A block is taken from each tensor in turn; when one runs out, the remaining tensors fill in,
# which produces an evenly mixed result.
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
# Handling tuple data
x = np.array([[1,2],[3,4],[5,6]])
y = np.array(['cat','dog','fox'])
dataset3 = tf.data.Dataset.from_tensor_slices((x,y))
for item_x, item_y in dataset3:
    print(item_x.numpy(),item_y.numpy())
[1 2] b'cat'
[3 4] b'dog'
[5 6] b'fox'
# Handling dict data
dataset4 = tf.data.Dataset.from_tensor_slices({"feature":x,"label":y})
for item in dataset4:
    print(item["feature"].numpy(),item["label"].numpy())
[1 2] b'cat'
[3 4] b'dog'
[5 6] b'fox'
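Besides tuples and dicts, two existing datasets can also be combined element-wise with tf.data.Dataset.zip; a small sketch using the same x and y as above (dataset5 is not part of the original notes):

dataset5 = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(x),
                                tf.data.Dataset.from_tensor_slices(y)))
for item_x, item_y in dataset5:
    print(item_x.numpy(), item_y.numpy())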

2. Processing CSV data with tf.data

from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)
(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)
# Create a directory for the generated CSV files
output_dir = "generate_csv" 
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
# Generate the CSV files and save them
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    path_format = os.path.join(output_dir, "{}_{:02d}.csv") # filename template: {name_prefix}_{part index}
    filenames = []
    '''
    1. np.arange(len(data)) builds an index array as long as data
    2. np.array_split() splits the indices into n_parts groups
    3. enumerate() numbers each group, which becomes the part index in the filename
    '''
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx) # per-part filename
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            # write the data rows regardless of whether a header was given
            for row_index in row_indices:
                f.write(",".join([repr(col) for col in data[row_index]]))
                f.write("\n")

    return filenames

train_data = np.c_[x_train_scaled, y_train] # stack features and target column-wise
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"] # feature names plus the target column name
header_str = ",".join(header_cols) # join with commas

train_filenames = save_to_csv(output_dir,train_data,"train",header_str,n_parts=20)
valid_filenames = save_to_csv(output_dir,valid_data,"valid",header_str,n_parts=10)
test_filenames = save_to_csv(output_dir,test_data,"test",header_str,n_parts=10)
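As a quick sanity check (not part of the original notes), one of the generated files can be read back with pandas:

pd.read_csv(train_filenames[0]).head()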
import pprint
# print("train filenames:")
# pprint.pprint(train_filenames)
# print("valid_filenames:")
# pprint.pprint(valid_filenames)
print("test_filenames:")
pprint.pprint(test_filenames)
test_filenames:
['generate_csv\\test_00.csv',
 'generate_csv\\test_01.csv',
 'generate_csv\\test_02.csv',
 'generate_csv\\test_03.csv',
 'generate_csv\\test_04.csv',
 'generate_csv\\test_05.csv',
 'generate_csv\\test_06.csv',
 'generate_csv\\test_07.csv',
 'generate_csv\\test_08.csv',
 'generate_csv\\test_09.csv']
# Steps for reading the CSV files:
# 1. filenames -> dataset
# 2. read file -> dataset -> datasets -> merge
# 3. parse csv

# 1. Build a dataset of filenames
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)
tf.Tensor(b'generate_csv\\train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_19.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_03.csv', shape=(), dtype=string)
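Note that the filenames come back in a shuffled order: list_files shuffles them by default. If a deterministic order is needed, for example when debugging, shuffle=False can be passed; a quick sketch (not in the original notes):

filename_dataset_ordered = tf.data.Dataset.list_files(train_filenames, shuffle=False)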
# 2. Read the files into per-file datasets and merge them

n_readers = 5
dataset = filename_dataset.interleave(
    # interleave(): read several files concurrently and merge the results
    lambda filename: tf.data.TextLineDataset(filename).skip(1), # read line by line; skip(1) drops the header row
    cycle_length=n_readers
)
for line in dataset.take(10):
    print(line.numpy())
b'0.42408210084996534,0.9129633171802288,-0.04437481876046234,-0.15297213746739335,-0.24727627804141977,-0.10539166599677323,0.8612674255663844,-1.3357789003702432,3.955'
b'2.51504373119231,1.0731637904355105,0.5574401201546321,-0.17273513019187772,-0.612912610473286,-0.01909156503651574,-0.5710993036045546,-0.027490309606616956,5.00001'
b'0.8115083791797953,-0.04823952235146133,0.5187339067174729,-0.029386394873127775,-0.034064024638222286,-0.05081594842905086,-0.7157356834231196,0.9162751241885168,2.147'
b'0.04971034572063198,-0.8492418886278699,-0.06214699417830008,0.17878747064657746,-0.8025354230744277,0.0005066066922077538,0.6466457006743215,-1.1060793768010604,2.286'
b'-0.8219588176953616,1.874166156711919,0.18212349433218608,-0.03170019246279883,-0.6011178900722581,-0.14337494105109344,1.0852205298015787,-0.8613994495208361,1.054'
b'0.4369234889778008,-1.9706452014148417,-0.1664210569911193,0.05486205164394496,-0.8379195842775115,-0.1323988058685803,-0.9956770637171147,0.941242463706905,1.73'
b'1.8444675088321243,0.5124621340420246,0.505783649224786,-0.20645711406004988,-0.021362018052499883,-0.05811312281214649,0.8332732875369839,-1.2658703497187516,4.513'
b'-0.6906143291679195,-0.1283397589791022,7.0201810347470595,5.624287386169439,-0.2663292879200034,-0.03662080416157129,-0.6457503383496215,1.2058962626018372,1.352'
b'-1.453851024367546,1.874166156711919,-1.1315714708271856,0.3611276016530489,-0.3978857847006997,-0.03273859332533962,-0.7390641317809511,0.646627857389904,1.875'
b'-0.46794146200516895,-0.9293421252555106,0.11909925912590703,-0.060470113038678074,0.30344643606811583,-0.021851890609536125,1.873722084296329,-1.0411642940532422,1.012'
# 3. tf.io.decode_csv(str, record_defaults): parse a CSV record; record_defaults sets the column types/defaults

sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0, dtype=tf.int32)] * 5 # parse every field as int32
parsed_fields = tf.io.decode_csv(sample_str,record_defaults=record_defaults)
print(parsed_fields)
[<tf.Tensor: id=3636, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=3637, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=3638, shape=(), dtype=int32, numpy=3>, <tf.Tensor: id=3639, shape=(), dtype=int32, numpy=4>, <tf.Tensor: id=3640, shape=(), dtype=int32, numpy=5>]
sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0,dtype=tf.int32),0,np.nan,"hello",tf.constant([])] # mixed column types; tf.constant([]) marks a required column
parsed_fields = tf.io.decode_csv(sample_str,record_defaults=record_defaults)
print(parsed_fields)
[<tf.Tensor: id=3647, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=3648, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=3649, shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: id=3650, shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: id=3651, shape=(), dtype=float32, numpy=5.0>]
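When a scalar default is supplied for a column, an empty field in the record falls back to that default instead of raising an error; a small sketch with hypothetical values (not from the original notes):

sample_str2 = '1,,3,,5'
defaults_with_fallback = [0, -1, 0.0, "missing", 0.0]  # -1 and "missing" are hypothetical defaults
parsed_fields2 = tf.io.decode_csv(sample_str2, record_defaults=defaults_with_fallback)
print(parsed_fields2)  # the empty 2nd and 4th fields fall back to -1 and b'missing'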
# An all-empty record raises an error because the last column is required
try:
    parsed_fields = tf.io.decode_csv(',,,,',record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)
Field 4 is required but missing in record 0! [Op:DecodeCSV]
# Too many fields also raises an error
try:
    parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7',record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)
Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]
# Parse one line of the dataset
def parse_csv_line(line, n_fields = 9):
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults = defs)
    x = tf.stack(parsed_fields[0:-1]) # tf.stack turns the list into a vector; the first 8 fields are x
    y = tf.stack(parsed_fields[-1:]) # the 9th field is y
    return x,y

parse_csv_line(b'1.8444675088321243,0.5124621340420246,0.505783649224786,-0.20645711406004988,-0.021362018052499883,-0.05811312281214649,0.8332732875369839,-1.2658703497187516,4.513',
              n_fields=9)
parse_csv_line(b'-0.8246762898717912,-0.04823952235146133,-0.3448658166118309,-0.08477587145199328,0.5012348243315076,-0.034699996532417135,0.5300034588851571,-0.08741192445075467,0.717',
              n_fields=9)
(<tf.Tensor: id=3684, shape=(8,), dtype=float32, numpy=
 array([-0.8246763 , -0.04823952, -0.34486583, -0.08477587,  0.5012348 ,
        -0.0347    ,  0.5300035 , -0.08741193], dtype=float32)>,
 <tf.Tensor: id=3685, shape=(1,), dtype=float32, numpy=array([0.717], dtype=float32)>)
# Convert the whole set of CSV files into a dataset
# 1. filenames -> dataset
# 2. read file -> dataset -> datasets -> merge
# 3. parse csv
def csv_reader_dataset(filenames, n_readers=5,
                       batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames) # 1. filenames -> dataset
    dataset = dataset.repeat() # repeat() with no argument repeats the dataset indefinitely
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1), # 2. read file -> dataset
        cycle_length = n_readers)
    dataset = dataset.shuffle(shuffle_buffer_size) # shuffle (the result must be reassigned)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads) # 3. parse; map is one-to-one
    dataset = dataset.batch(batch_size) # group into batches
    return dataset

train_set = csv_reader_dataset(train_filenames, batch_size=3)
print(train_set)

for x_batch,y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)
<DatasetV1Adapter shapes: ((None, 8), (None, 1)), types: (tf.float32, tf.float32)>
x:
<tf.Tensor: id=3769, shape=(3, 8), dtype=float32, numpy=
array([[-1.0591781 ,  1.3935647 , -0.02633197, -0.1100676 , -0.6138199 ,
        -0.09695935,  0.3247131 , -0.03747724],
       [ 0.63034356,  1.8741661 , -0.06713215, -0.12543367, -0.19737554,
        -0.02272263, -0.69240725,  0.72652334],
       [-0.66722274, -0.04823952,  0.34529406,  0.53826684,  1.8521839 ,
        -0.06112538, -0.8417093 ,  1.5204847 ]], dtype=float32)>
y:
<tf.Tensor: id=3770, shape=(3, 1), dtype=float32, numpy=
array([[0.672],
       [2.419],
       [1.59 ]], dtype=float32)>
x:
<tf.Tensor: id=3771, shape=(3, 8), dtype=float32, numpy=
array([[ 0.48530516, -0.8492419 , -0.06530126, -0.02337966,  1.4974351 ,
        -0.07790658, -0.90236324,  0.78145146],
       [-1.1157656 ,  0.99306357, -0.334192  , -0.06535219, -0.32893205,
         0.04343066, -0.12785879,  0.30707204],
       [-0.22235657,  1.3935647 ,  0.029913  ,  0.0801452 , -0.50948197,
        -0.06238599, -0.86503774,  0.86134696]], dtype=float32)>
y:
<tf.Tensor: id=3772, shape=(3, 1), dtype=float32, numpy=
array([[2.956],
       [0.524],
       [2.   ]], dtype=float32)>
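A common refinement (not part of the original csv_reader_dataset above) is to append a prefetch call so that the next batch is prepared while the model consumes the current one:

train_set_prefetched = csv_reader_dataset(train_filenames, batch_size=3).prefetch(1)  # keep one batch ready ahead of time
print(train_set_prefetched)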
batch_size = 32
train_set = csv_reader_dataset(train_filenames,
                               batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames,
                               batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,
                              batch_size = batch_size)
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',
                       input_shape=[8]),
    keras.layers.Dense(1),
])
model.compile(loss="mean_squared_error", optimizer="sgd")
callbacks = [keras.callbacks.EarlyStopping(
    patience=5, min_delta=1e-2)]

history = model.fit(train_set,
                    validation_data = valid_set,
                    steps_per_epoch = 11610 // batch_size, # steps per epoch (the dataset repeats indefinitely)
                    validation_steps = 3870 // batch_size,
                    epochs = 10,
                    callbacks = callbacks)
model.evaluate(test_set, steps = 5160 // batch_size)
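The history object returned by fit can be visualized with matplotlib (a sketch relying on the imports at the top of these notes; plot_learning_curves is not part of the original code):

def plot_learning_curves(history):
    pd.DataFrame(history.history).plot(figsize=(8, 5))  # one curve per metric recorded by fit
    plt.grid(True)
    plt.gca().set_ylim(0, 2)  # assumed y-axis range for the MSE curves
    plt.show()

plot_learning_curves(history)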