tf.data API 介绍 tf.data处理csv文件数据
import matplotlib as mpl
import matplotlib. pyplot as plt
% matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
print ( tf. __version__)
2.0.0
1. tf.data API 介绍
# Build a dataset whose elements are the scalars 0..9.
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
print(dataset)
<TensorSliceDataset shapes: (), types: tf.int32>
# Each element comes out as a scalar tf.Tensor.
for element in dataset:
    print(element)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
# Cycle through the data 3 times, grouped into batches of 7
# (the final batch is smaller: 30 elements don't divide evenly by 7).
dataset = dataset.repeat(3).batch(7)
for batch in dataset:
    print(batch)
tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)
# interleave: turn each batch back into scalars, pulling from up to
# 5 batches at a time (cycle_length) and emitting 5 consecutive
# elements from each before moving to the next (block_length).
dataset2 = dataset.interleave(
    lambda batch: tf.data.Dataset.from_tensor_slices(batch),
    cycle_length=5,
    block_length=5,
)
for element in dataset2:
    print(element)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
# A dataset can pair features with labels via a tuple of arrays;
# iteration yields one (feature_row, label) pair per step.
x = np.array([[1, 2], [3, 4], [5, 6]])
y = np.array(['cat', 'dog', 'fox'])
dataset3 = tf.data.Dataset.from_tensor_slices((x, y))
for item_x, item_y in dataset3:
    print(item_x.numpy(), item_y.numpy())
[1 2] b'cat'
[3 4] b'dog'
[5 6] b'fox'
# The same pairing expressed as a dict of named tensors.
dataset4 = tf.data.Dataset.from_tensor_slices({"feature": x, "label": y})
for record in dataset4:
    print(record["feature"].numpy(), record["label"].numpy())
[1 2] b'cat'
[3 4] b'dog'
[5 6] b'fox'
2. tf.data处理csv文件数据
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load the California housing data, then carve out
# train / validation / test splits with fixed seeds.
housing = fetch_california_housing()
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)
(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)
from sklearn.preprocessing import StandardScaler

# Standardize features with statistics fitted on the training set only,
# so validation/test data never leak into the scaler.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

# Directory that will hold the generated CSV shards.
output_dir = "generate_csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Split `data` row-wise into `n_parts` CSV shard files.

    Args:
        output_dir: directory the shards are written into (must exist).
        data: 2-D array-like; each row becomes one CSV line via repr().
        name_prefix: shards are named "<name_prefix>_<idx:02d>.csv".
        header: optional header line written at the top of every shard.
        n_parts: number of shard files to produce.

    Returns:
        The list of shard file paths, in shard order.
    """
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    # np.array_split divides the row indices [0, len(data)) into
    # n_parts roughly equal groups; each group becomes one shard.
    index_groups = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(index_groups):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                f.write(",".join(repr(col) for col in data[row_index]))
                f.write("\n")
    return filenames
# Bundle scaled features with their targets, one row per sample,
# then write each split out as sharded CSV files.
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]
header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir, train_data, "train", header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid", header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test", header_str, n_parts=10)

import pprint
print("test_filenames:")
pprint.pprint(test_filenames)
test_filenames:
['generate_csv\\test_00.csv',
'generate_csv\\test_01.csv',
'generate_csv\\test_02.csv',
'generate_csv\\test_03.csv',
'generate_csv\\test_04.csv',
'generate_csv\\test_05.csv',
'generate_csv\\test_06.csv',
'generate_csv\\test_07.csv',
'generate_csv\\test_08.csv',
'generate_csv\\test_09.csv']
# list_files yields the matching filenames in shuffled order by default.
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)
tf.Tensor(b'generate_csv\\train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_19.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_03.csv', shape=(), dtype=string)
# Read lines from up to 5 shard files concurrently, dropping the
# header row of each file with skip(1).
n_readers = 5
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length=n_readers,
)
for line in dataset.take(10):
    print(line.numpy())
b'0.42408210084996534,0.9129633171802288,-0.04437481876046234,-0.15297213746739335,-0.24727627804141977,-0.10539166599677323,0.8612674255663844,-1.3357789003702432,3.955'
b'2.51504373119231,1.0731637904355105,0.5574401201546321,-0.17273513019187772,-0.612912610473286,-0.01909156503651574,-0.5710993036045546,-0.027490309606616956,5.00001'
b'0.8115083791797953,-0.04823952235146133,0.5187339067174729,-0.029386394873127775,-0.034064024638222286,-0.05081594842905086,-0.7157356834231196,0.9162751241885168,2.147'
b'0.04971034572063198,-0.8492418886278699,-0.06214699417830008,0.17878747064657746,-0.8025354230744277,0.0005066066922077538,0.6466457006743215,-1.1060793768010604,2.286'
b'-0.8219588176953616,1.874166156711919,0.18212349433218608,-0.03170019246279883,-0.6011178900722581,-0.14337494105109344,1.0852205298015787,-0.8613994495208361,1.054'
b'0.4369234889778008,-1.9706452014148417,-0.1664210569911193,0.05486205164394496,-0.8379195842775115,-0.1323988058685803,-0.9956770637171147,0.941242463706905,1.73'
b'1.8444675088321243,0.5124621340420246,0.505783649224786,-0.20645711406004988,-0.021362018052499883,-0.05811312281214649,0.8332732875369839,-1.2658703497187516,4.513'
b'-0.6906143291679195,-0.1283397589791022,7.0201810347470595,5.624287386169439,-0.2663292879200034,-0.03662080416157129,-0.6457503383496215,1.2058962626018372,1.352'
b'-1.453851024367546,1.874166156711919,-1.1315714708271856,0.3611276016530489,-0.3978857847006997,-0.03273859332533962,-0.7390641317809511,0.646627857389904,1.875'
b'-0.46794146200516895,-0.9293421252555106,0.11909925912590703,-0.060470113038678074,0.30344643606811583,-0.021851890609536125,1.873722084296329,-1.0411642940532422,1.012'
# decode_csv parses a CSV line into one scalar tensor per field;
# record_defaults supplies both the fallback value and the dtype.
sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0, dtype=tf.int32)] * 5
parsed_fields = tf.io.decode_csv(sample_str, record_defaults=record_defaults)
print(parsed_fields)
[<tf.Tensor: id=3636, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=3637, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=3638, shape=(), dtype=int32, numpy=3>, <tf.Tensor: id=3639, shape=(), dtype=int32, numpy=4>, <tf.Tensor: id=3640, shape=(), dtype=int32, numpy=5>]
# Mixed defaults: each entry fixes its field's dtype
# (int32, int, float via np.nan, string, and float via empty constant).
sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0, dtype=tf.int32), 0, np.nan, "hello", tf.constant([])]
parsed_fields = tf.io.decode_csv(sample_str, record_defaults=record_defaults)
print(parsed_fields)
[<tf.Tensor: id=3647, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=3648, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=3649, shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: id=3650, shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: id=3651, shape=(), dtype=float32, numpy=5.0>]
# A field whose default is tf.constant([]) has no fallback value,
# so leaving it empty in the record raises InvalidArgumentError.
try:
    parsed_fields = tf.io.decode_csv(',,,,', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)
Field 4 is required but missing in record 0! [Op:DecodeCSV]
# Supplying more fields than record_defaults entries is also an error.
try:
    parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)
Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]
def parse_csv_line(line, n_fields=9):
    """Parse one CSV line into (features, label) float32 tensors.

    All fields default to NaN floats; the first n_fields - 1 fields
    are stacked into the feature vector and the last one is the label.
    """
    defaults = [tf.constant(np.nan)] * n_fields
    fields = tf.io.decode_csv(line, record_defaults=defaults)
    features = tf.stack(fields[:-1])
    label = tf.stack(fields[-1:])
    return features, label
# Sanity-check the parser on two raw training lines
# (8 scaled features followed by the target value).
parse_csv_line(b'1.8444675088321243,0.5124621340420246,0.505783649224786,-0.20645711406004988,-0.021362018052499883,-0.05811312281214649,0.8332732875369839,-1.2658703497187516,4.513',
               n_fields=9)
parse_csv_line(b'-0.8246762898717912,-0.04823952235146133,-0.3448658166118309,-0.08477587145199328,0.5012348243315076,-0.034699996532417135,0.5300034588851571,-0.08741192445075467,0.717',
               n_fields=9)
(<tf.Tensor: id=3684, shape=(8,), dtype=float32, numpy=
array([-0.8246763 , -0.04823952, -0.34486583, -0.08477587, 0.5012348 ,
-0.0347 , 0.5300035 , -0.08741193], dtype=float32)>,
<tf.Tensor: id=3685, shape=(1,), dtype=float32, numpy=array([0.717], dtype=float32)>)
def csv_reader_dataset(filenames, n_readers=5,
                       batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    """Build a repeating, shuffled, batched dataset from CSV shards.

    Args:
        filenames: list of CSV shard paths (each with a header row).
        n_readers: number of shard files read concurrently.
        batch_size: elements per output batch.
        n_parse_threads: parallelism for the line-parsing map.
        shuffle_buffer_size: size of the shuffle buffer.

    Returns:
        A tf.data.Dataset of (features, label) batches that repeats
        indefinitely; bound epochs via steps_per_epoch in model.fit.
    """
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers)
    # BUG FIX: shuffle() returns a new dataset; the original discarded
    # the result, so the data was never actually shuffled.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset
# Peek at two tiny batches from the training pipeline.
train_set = csv_reader_dataset(train_filenames, batch_size=3)
print(train_set)
for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)
<DatasetV1Adapter shapes: ((None, 8), (None, 1)), types: (tf.float32, tf.float32)>
x:
<tf.Tensor: id=3769, shape=(3, 8), dtype=float32, numpy=
array([[-1.0591781 , 1.3935647 , -0.02633197, -0.1100676 , -0.6138199 ,
-0.09695935, 0.3247131 , -0.03747724],
[ 0.63034356, 1.8741661 , -0.06713215, -0.12543367, -0.19737554,
-0.02272263, -0.69240725, 0.72652334],
[-0.66722274, -0.04823952, 0.34529406, 0.53826684, 1.8521839 ,
-0.06112538, -0.8417093 , 1.5204847 ]], dtype=float32)>
y:
<tf.Tensor: id=3770, shape=(3, 1), dtype=float32, numpy=
array([[0.672],
[2.419],
[1.59 ]], dtype=float32)>
x:
<tf.Tensor: id=3771, shape=(3, 8), dtype=float32, numpy=
array([[ 0.48530516, -0.8492419 , -0.06530126, -0.02337966, 1.4974351 ,
-0.07790658, -0.90236324, 0.78145146],
[-1.1157656 , 0.99306357, -0.334192 , -0.06535219, -0.32893205,
0.04343066, -0.12785879, 0.30707204],
[-0.22235657, 1.3935647 , 0.029913 , 0.0801452 , -0.50948197,
-0.06238599, -0.86503774, 0.86134696]], dtype=float32)>
y:
<tf.Tensor: id=3772, shape=(3, 1), dtype=float32, numpy=
array([[2.956],
[0.524],
[2. ]], dtype=float32)>
batch_size = 32
train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames, batch_size=batch_size)

# Simple two-layer regression model over the 8 housing features.
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',
                       input_shape=[8]),
    keras.layers.Dense(1),
])
model.compile(loss="mean_squared_error", optimizer="sgd")
callbacks = [keras.callbacks.EarlyStopping(
    patience=5, min_delta=1e-2)]

# The datasets repeat forever, so an "epoch" is defined by explicit
# step counts derived from each split's row count.
# BUG FIX: steps_per_epoch used 11160 (digit transposition); the
# training set has 11610 rows, so part of each epoch was skipped.
history = model.fit(train_set,
                    validation_data=valid_set,
                    steps_per_epoch=11610 // batch_size,
                    validation_steps=3870 // batch_size,
                    epochs=10,
                    callbacks=callbacks)
model.evaluate(test_set, steps=5160 // batch_size)