# How to convert between tf.data.Dataset and CSV data in TF 2.0
import tensorflow as tf
import numpy as np
from tensorflow import keras
import pandas as pd
import sklearn
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV
# Load the California housing regression dataset and inspect its schema.
housing = fetch_california_housing()
print(housing.feature_names)  # 8 features: MedInc, HouseAge, AveRooms, ...

# Carve out the test set first, then split the rest into train/validation.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11)

# Standardize features: fit the scaler on the training split only and
# reuse the same statistics for validation and test.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

# Directory that will hold the generated CSV shards.
output_dir = "generate_csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV shards under ``output_dir``.

    Each shard is named ``"<name_prefix>_<part:02d>.csv"``; when ``header``
    is given it is written as the first line of every shard.  Values are
    serialized with ``repr``.  Returns the list of shard file paths.
    """
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    # Split the row indices [0, len(data)) into n_parts roughly equal chunks.
    all_row_indices = np.arange(len(data))
    for part_idx, chunk in enumerate(np.array_split(all_row_indices, n_parts)):
        part_csv = path_format.format(name_prefix, part_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_idx in chunk:
                line = ",".join(repr(value) for value in data[row_idx])
                f.write(line + "\n")
    return filenames
# Append the target column to the scaled features so every CSV row is
# [8 feature values, MedianHouseValue].
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]

header_cols = housing.feature_names + ["MedianHouseValue"]
header_str = ",".join(header_cols)

# Shard each split into CSV files on disk.
train_filenames = save_to_csv(output_dir, train_data, "train",
                              header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid",
                              header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test",
                             header_str, n_parts=10)
# Pipeline sketch: filenames -> dataset of filenames -> interleaved
# dataset of text lines from all shards.
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)

n_readers = 5
# Read up to n_readers shard files concurrently; skip(1) drops each
# shard's header line.
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length=n_readers,
)
# Peek at the first 15 raw CSV lines.
for line in dataset.take(15):
    print(line.numpy())
# Demo of tf.io.decode_csv(records, record_defaults): parse one CSV line.
sample_str = '1,3,5,7,9'
# record_defaults fixes both the dtype and the default value of every field.
record_defaults = [
    tf.constant(0, dtype=tf.int32),
    0,                # int field via a plain Python default
    np.nan,           # float field
    "hello",          # string field
    tf.constant([]),  # float field with no default value
]
parse_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parse_fields)
# Parse one CSV line into (features, label) tensors.
def parse_csv_line(line, n_fields=9):
    """Parse a single numeric CSV line into an ``(x, y)`` tensor pair.

    ``x`` stacks the first ``n_fields - 1`` columns and ``y`` stacks the
    last column.  ``n_fields`` now defaults to 9 (8 California-housing
    features + 1 target) — the original had no default, which made
    ``dataset.map(parse_csv_line)`` below fail with a missing-argument
    TypeError because ``map`` passes only the line.
    """
    # Every field is decoded as float32; NaN is the per-field default.
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])  # first n_fields - 1 values -> features
    y = tf.stack(parsed_fields[-1:])   # last value -> label (shape [1])
    return x, y
# Full input pipeline:
# 1. filenames -> dataset of filenames
# 2. read files -> interleaved dataset of CSV lines
# 3. parse each line into (features, label), shuffle and batch
def csv_reader_dataset(filenames, n_readers=5, batch_size=32,
                       n_parse_threads=5, shuffle_buffer_size=10000,
                       n_fields=9):
    """Build an endlessly repeating, shuffled, batched dataset from CSV shards.

    Args:
        filenames: list of CSV shard paths (each with a one-line header).
        n_readers: how many shard files to read concurrently.
        batch_size: number of (x, y) examples per batch.
        n_parse_threads: parallelism for the CSV-parsing map step.
        shuffle_buffer_size: buffer size for Dataset.shuffle.
        n_fields: total columns per row (features + 1 label); default 9.

    Returns:
        A tf.data.Dataset yielding (features, label) batches.
    """
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()  # repeat forever; model.fit limits via steps_per_epoch
    # Turn each filename into its file contents, skipping the header line.
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers,
    )
    # BUG FIX: shuffle() returns a new dataset; the original discarded the
    # return value, so no shuffling ever happened.
    dataset = dataset.shuffle(shuffle_buffer_size)
    # Pass n_fields explicitly so the callback never relies on a default.
    dataset = dataset.map(lambda line: parse_csv_line(line, n_fields=n_fields),
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset
import pprint

# Sanity-check the pipeline by printing two tiny batches.
train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)

# Build the real train/valid/test datasets used for fitting.
batch_size = 32
train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames, batch_size=batch_size)
# A small regression MLP: 8 input features -> 30 ReLU units -> 1 output.
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu', input_shape=[8]),
    keras.layers.Dense(1),
])
# MSE loss with plain SGD.
model.compile(loss="mean_squared_error", optimizer="sgd")

# Stop early once validation loss stops improving by at least 1e-2.
callbacks = [
    keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2),
]

# The datasets repeat forever, so the epoch length must be given
# explicitly (11160 and 3870 are presumably the train/valid row counts
# — TODO confirm against the split sizes).
history = model.fit(
    train_set,
    validation_data=valid_set,
    steps_per_epoch=11160 // batch_size,
    validation_steps=3870 // batch_size,
    epochs=100,
    callbacks=callbacks,
)