# Generate CSV files from sklearn's fetch_california_housing data, then read them back with TensorFlow.
# Step 1: generate the CSV files.
import tensorflow as tf
import keras
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Build the California-housing train/valid/test splits, standardize the
# features, and make sure the CSV output directory exists.
housing = fetch_california_housing()
x_train_valid, x_test, y_train_valid, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(x_train_valid, y_train_valid)

# Fit the scaler on the training split only; apply it to valid/test.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_valid = scaler.transform(x_valid)
x_test = scaler.transform(x_test)

for split in (x_train, x_valid, x_test):
    print(split.shape)

output_dir = './csv_data'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
def build_csv_file(output_dir, data, num_classes, pre_name, header=None):
    """Split `data` row-wise into `num_classes` CSV shards under `output_dir`.

    Shards are named '{pre_name}_{NN}.csv'.  If `header` (an already
    comma-joined string) is given, it is written as the first line of
    every shard.  Returns the list of shard file paths in order.
    """
    path_format = os.path.join(output_dir, '{}_{:02d}.csv')
    file_names = []
    # np.array_split distributes the row indices as evenly as possible.
    row_groups = np.array_split(np.arange(len(data)), num_classes)
    for shard_idx, rows in enumerate(row_groups):
        shard_path = path_format.format(pre_name, shard_idx)
        file_names.append(shard_path)
        with open(shard_path, 'w', encoding='utf-8') as out:
            if header:
                out.write(header + '\n')
            for r in rows:
                out.write(','.join(str(v) for v in data[r]) + '\n')
    return file_names
# Append the target as the last column of each split and write the
# sharded CSV files (10 shards per split).
train_data = np.column_stack([x_train, y_train])
valid_data = np.column_stack([x_valid, y_valid])
test_data = np.column_stack([x_test, y_test])

header = ','.join(housing.feature_names + ['price'])

train_file_names = build_csv_file(output_dir, train_data, 10, 'train', header)
valid_file_names = build_csv_file(output_dir, valid_data, 10, 'valid', header)
test_file_names = build_csv_file(output_dir, test_data, 10, 'test', header)

for names in (train_file_names, valid_file_names, test_file_names):
    print(names)
# Reading the CSV files with TensorFlow:
# Step 1: create a file-name dataset with tf.data.Dataset.list_files
# Step 2: read the file contents with file_dataset.interleave() and tf.data.TextLineDataset
# Step 3: parse each line with tf.io.decode_csv
import tensorflow as tf
import pandas as pd
import numpy as np
import keras
import matplotlib.pyplot as plt
from keras.layers import Dense, Conv2D, MaxPool2D, Flatten, BatchNormalization, AlphaDropout
# Shard file names as written in step 1.  Generated with os.path.join
# instead of the original hard-coded '\\'-separated literals, which only
# matched on Windows and broke on Linux/macOS; this way the paths agree
# with build_csv_file's output on every OS.
def _shard_names(pre_name, n_shards=10):
    """Return the ./csv_data shard paths '{pre_name}_{NN}.csv'."""
    return [os.path.join('./csv_data', '{}_{:02d}.csv'.format(pre_name, i))
            for i in range(n_shards)]

train_file_names = _shard_names('train')
valid_file_names = _shard_names('valid')
test_file_names = _shard_names('test')
# # Step 1: build the file-name datasets
# train_file_dataset = tf.data.Dataset.list_files(train_file_names)
# valid_file_dataset = tf.data.Dataset.list_files(valid_file_names)
# test_file_dataset = tf.data.Dataset.list_files(test_file_names)
#
# # Step 2: use the file-name datasets to read the contents of every file
# train_dataset = train_file_dataset.interleave(
# lambda file_name: tf.data.TextLineDataset(file_name).skip(1),
# cycle_length=5
# )
# valid_dataset = train_file_dataset.interleave(
# lambda file_name: tf.data.TextLineDataset(file_name).skip(1),
# cycle_length=5
# )
# test_dataset = test_file_dataset.interleave(
# lambda file_name: tf.data.TextLineDataset(file_name).skip(1),
# cycle_length=5
# )
# Step 3: parse the text lines that were read
def parse_csv_line(line, n_fields=9):
    """Decode one CSV text line into (features, target) tensors.

    Every one of the `n_fields` columns defaults to NaN, so a missing
    value surfaces as NaN instead of raising.  The last column is the
    regression target; the first n_fields-1 columns are the features.
    """
    defaults = [tf.constant(np.nan)] * n_fields
    fields = tf.io.decode_csv(line, record_defaults=defaults)
    features = tf.stack(fields[:-1], axis=0)
    target = tf.stack(fields[-1:], axis=0)
    return features, target
# Step 4: combine the steps above into one reader function
def csv_dataset_reader(file_names, n_fields=9, epochs=20, shuffle=True,
                       batch_size=32):
    """Build a tf.data pipeline over sharded CSV files.

    Lists the files, interleaves their text lines (skipping each file's
    header row), optionally shuffles, repeats for `epochs`, parses each
    line into (features, target) tensors, then batches and prefetches.

    Parameters: file_names - list of CSV shard paths; n_fields - total
    column count (features + target); epochs - repeat count; shuffle -
    whether to shuffle lines; batch_size - examples per batch (new
    parameter, default keeps the original behavior).
    """
    file_dataset = tf.data.Dataset.list_files(file_names)
    dataset = file_dataset.interleave(
        # .skip(1) drops the per-file header line.
        lambda file_name: tf.data.TextLineDataset(file_name).skip(1),
        cycle_length=5
    )
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs)
    # BUG FIX: the original ignored its n_fields argument and always used
    # parse_csv_line's default; forward it explicitly.
    dataset = dataset.map(lambda line: parse_csv_line(line, n_fields),
                          num_parallel_calls=5)
    dataset = dataset.batch(batch_size=batch_size).prefetch(32)
    return dataset
# Build the three pipelines and peek at one batch to sanity-check shapes.
train_dataset = csv_dataset_reader(train_file_names)
valid_dataset = csv_dataset_reader(valid_file_names)
test_dataset = csv_dataset_reader(test_file_names)

sample_x, sample_y = next(iter(train_dataset))
print(sample_x.shape)
print(sample_y.shape)
def build_cnn_net():
    """Build a small dense regression model for the 8 housing features.

    NOTE(review): despite the name there are no convolution layers — it
    is a plain Dense(32)->Dense(1) regressor; the name is kept so
    existing callers keep working.
    """
    input_layer = keras.layers.Input(shape=(8,), dtype=tf.float32)
    hidden = Dense(32, activation='relu')(input_layer)
    output = Dense(1)(hidden)
    # Use keras.models.Model to match keras.layers.Input above; the
    # original mixed standalone `keras` with `tf.keras.models.Model`,
    # which can fail when the two packages are distinct.
    model = keras.models.Model(inputs=input_layer, outputs=output)
    return model
def draw_curve(history):
    """Plot the per-epoch loss/metric curves from a Keras History object,
    clamped to a 0-1 y-axis with a grid."""
    frame = pd.DataFrame(history.history)
    axes = frame.plot()
    axes.set_ylim(0, 1)
    axes.grid(True)
    plt.show()
# Train, plot the learning curves, then evaluate on the test shards.
# Step counts are split_size // batch_size for the fixed batch size of 32.
BATCH_SIZE = 32
model = build_cnn_net()
model.compile(loss='mse', optimizer='adam', metrics=['mse'])
history = model.fit(
    train_dataset,
    steps_per_epoch=11610 // BATCH_SIZE,
    validation_data=valid_dataset,
    validation_steps=3870 // BATCH_SIZE,
    epochs=10,
)
draw_curve(history)
model.evaluate(test_dataset, steps=5160 // BATCH_SIZE)