How to convert between tf.data Datasets and CSV data in TensorFlow 2.0

The overall flow below: write the NumPy arrays out as sharded CSV files, then read them back with tf.data (list_files -> interleave -> decode_csv -> batch) and feed the resulting Dataset directly to model.fit.

import tensorflow as tf
import numpy as np
from tensorflow import keras
import pandas as pd
import sklearn
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV

housing = fetch_california_housing()
print(housing.feature_names)#['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
x_train_all, x_test, y_train_all, y_test = \
    train_test_split(housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = \
    train_test_split(x_train_all, y_train_all, random_state=11)
# Standardize the features (fit the scaler on the training set only, then reuse it)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

output_dir = "generate_csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    # {} holds name_prefix (the kind of data, e.g. "train"), {:02d} holds the shard index
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []

    for file_idx, row_indices in enumerate(
            # build an index array as long as data and split it into n_parts chunks
            np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)  # shard filename
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            # write one CSV row per sample
            for row_index in row_indices:
                f.write(",".join(
                    [repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames
# merge features and target into one array (the target becomes the last column of each row)
train_data = np.c_[x_train_scaled,y_train]
valid_data = np.c_[x_valid_scaled,y_valid]
test_data = np.c_[x_test_scaled,y_test]

header_cols =housing.feature_names+["MedianHouseValue"]
header_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir,train_data,"train",
                              header_str,n_parts=20)
valid_filenames = save_to_csv(output_dir,valid_data,"valid",
                              header_str,n_parts=10)
test_filenames = save_to_csv(output_dir,test_data,"test",
                             header_str,n_parts=10)
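
An optional sanity check (not part of the original walkthrough): since pandas is already imported above, you can peek at one of the generated shards to confirm the header and rows were written as expected.

# inspect the first few rows of the first training shard
print(pd.read_csv(train_filenames[0]).head())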

# read files: filename dataset -> per-file line datasets -> merged dataset
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)

n_readers = 5
dataset = filename_dataset.interleave(
    # read each file line by line into a dataset; skip(1) drops the header row
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length=n_readers
)
# peek at the first 15 lines of the merged dataset
for line in dataset.take(15):
    print(line.numpy())


# parse a CSV line
# tf.io.decode_csv(str, record_defaults)
sample_str = '1,3,5,7,9'
# record_defaults defines the type and the default value of each field
record_defaults = [tf.constant(0, dtype=tf.int32),
                   0,
                   np.nan,
                   "hello",
                   tf.constant([])]
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)
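
As a hedged aside (not in the original post): record_defaults is also what fills in missing fields. An empty field falls back to its default, while a field whose default is tf.constant([]) is treated as required, so leaving it empty raises an error. A minimal sketch using the record_defaults above:

# empty fields 0-3 fall back to their defaults, field 4 parses as 5.0
print(tf.io.decode_csv(',,,,5', record_defaults))
try:
    # field 4 has no default (tf.constant([])), so an empty value is an error
    tf.io.decode_csv(',,,,', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)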

# parse one CSV line into (features, label)
def parse_csv_line(line, n_fields=9):
    # default n_fields=9 so the function can be passed directly to dataset.map
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])  # 9 values per line: the first 8 are features
    y = tf.stack(parsed_fields[-1:])   # the last one is the label (MedianHouseValue)
    return x, y
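
A quick sanity check, using a made-up 9-field line (the values are illustrative, not taken from the generated files):

# hypothetical sample line: 8 scaled features followed by the label
x, y = parse_csv_line(b'-0.82,1.86,0.18,-0.07,-0.6,-0.07,1.6,-0.7,1.0', n_fields=9)
print(x)  # shape (8,) feature tensor
print(y)  # shape (1,) label tensor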


# 1. filenames -> dataset of filenames
# 2. read each file -> line datasets -> interleave/merge
# 3. parse each CSV line
def csv_reader_dataset(filenames, n_readers=5, batch_size=32,
                       n_parse_threads=5, shuffle_buffer_size=10000):
    # n_readers: parallelism when reading files; n_parse_threads: parallelism when parsing;
    # shuffle_buffer_size: size of the shuffle buffer
    dataset = tf.data.Dataset.list_files(filenames)  # dataset of shard filenames
    dataset = dataset.repeat()  # repeat() with no argument repeats the data indefinitely
    # turn each filename into its file contents
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),  # skip the header row
        cycle_length=n_readers,
    )
    dataset = dataset.shuffle(shuffle_buffer_size)  # shuffle (reassign: shuffle returns a new dataset)

    # parse each line into (features, label)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset
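
An optional tweak, not in the original code: the input pipeline can overlap data preparation with training by adding one line just before the return dataset in csv_reader_dataset. Note that in TF 2.0 AUTOTUNE lives under tf.data.experimental.

    # hedged sketch: prefetch lets tf.data prepare upcoming batches while the model trains
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)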

import pprint
train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)

batch_size =32
train_set = csv_reader_dataset(train_filenames,batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames,batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames,batch_size=batch_size)

# define the model architecture
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu', input_shape=[8]),
    keras.layers.Dense(1),
])
# configure loss and optimizer
model.compile(loss="mean_squared_error", optimizer="sgd")
callbacks = [
    keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)
]
history = model.fit(train_set,
                    validation_data=valid_set,
                    steps_per_epoch=11610 // batch_size,  # len(x_train) == 11610
                    validation_steps=3870 // batch_size,  # len(x_valid) == 3870
                    epochs=100,
                    callbacks=callbacks)
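
To close the loop, the test Dataset built above can be evaluated the same way. A minimal sketch, assuming the default 25% test split (5,160 rows); an explicit step count is needed because the Dataset repeats indefinitely:

model.evaluate(test_set, steps=5160 // batch_size)  # len(x_test) == 5160 under the default split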