Hands-On: Generating and Reading CSV Files

Generate CSV files from the California housing price dataset (a regression problem), and then read the generated CSV files back with tf.data.

1. Generating the CSV files

A CSV file is a plain-text file in which fields are separated by commas and records are stored one per line.

output_dir = "generate_csv"  # directory that will hold the generated data files
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

# Save one dataset (train, valid, or test) to a set of CSV files.
def save_to_csv(output_dir,   # output directory
                data,         # data to store
                name_prefix,  # "train", "valid", or "test"
                header=None, 
                n_parts=10):  # split the data across n_parts files
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    
    for file_idx, row_indices in enumerate(
        np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n") 
            for row_index in row_indices:  # iterate over the row indices of this part
                f.write(",".join(
                    [repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames

# np.c_ concatenates arrays column-wise, i.e. it appends the label column
# to the scaled feature columns of each row.
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]

header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir, train_data, "train",
                              header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid",
                              header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test",
                             header_str, n_parts=10)

Note:

for file_idx, row_indices in enumerate(
        np.array_split(np.arange(len(data)), n_parts)):

The for loop above does the following (a small standalone example follows after this list):

1. np.arange(len(data)) builds an index array of the same length as data: if data contains n elements, the array contains the n values 0 to n-1, which are used as indices into data.

2. np.array_split splits this index array into n_parts parts; the indices in each part can then be used to pull that part's rows out of data.

3. enumerate attaches a counter to each part, so each part's indices are available as row_indices and its counter as file_idx; file_idx serves as the file ID that fills in part of the filename.
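A minimal sketch of what np.array_split plus enumerate produce, on a toy array of 7 indices split into 3 parts:

import numpy as np

for file_idx, row_indices in enumerate(np.array_split(np.arange(7), 3)):
    print(file_idx, row_indices)
# 0 [0 1 2]
# 1 [3 4]
# 2 [5 6]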

After running this, a generate_csv folder is created under the current directory, containing 20 train files, 10 valid files, and 10 test files (generate_csv/train_00.csv through train_19.csv, and likewise valid_00 to valid_09 and test_00 to test_09). Each file starts with one header row: MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MidianHouseValue.
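For reference, the beginning of one generated file looks roughly like this (the header comes from header_str above, and the data row is one of the rows printed by the reader below; the exact rows in each of your files will differ):

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MidianHouseValue
0.6303435674178064,1.874166156711919,-0.06713214279531016,-0.12543366804152128,-0.19737553788322462,-0.022722631725889016,-0.692407235065288,0.7265233438487496,2.419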

2. Reading the CSV files

1. Turn the filenames into a dataset: list_files

list_files is designed for handling filenames: it turns a list of filenames (or a file pattern) into a dataset. Note that it shuffles the filenames by default, which is why the output below is not in order.

filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)
tf.Tensor(b'generate_csv\\train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_19.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_06.csv', shape=(), dtype=string)
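Because list_files shuffles by default, the order above is random. If a deterministic order is needed, for example while debugging, shuffling can be turned off; a minimal sketch:

filename_dataset = tf.data.Dataset.list_files(train_filenames, shuffle=False)
for filename in filename_dataset.take(3):
    print(filename)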

2. For every element of the filename dataset (i.e. every filename), read that file into a dataset of its own; this yields multiple datasets, which are then merged into one final, combined dataset: interleave

The interleave method iterates over every element of filename_dataset, applies an operation to each element (here: reading the file), and merges the results of all those operations into a single dataset.
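To make interleave and cycle_length concrete, here is a toy sketch that has nothing to do with the housing data: every source element is expanded into a small dataset, and cycle_length controls how many of those datasets are read from in round-robin fashion:

toy = tf.data.Dataset.range(3)  # elements 0, 1, 2
toy = toy.interleave(
    lambda x: tf.data.Dataset.from_tensors(x).repeat(2),  # each element becomes a 2-element dataset
    cycle_length=2)  # read from 2 of these datasets at a time
print([v.numpy() for v in toy])  # [0, 1, 0, 1, 2, 2]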

n_readers = 5

dataset = filename_dataset.interleave(
    # TextLineDataset reads a text file line by line into a dataset
    lambda filename: tf.data.TextLineDataset(filename).skip(1),  # skip(1) drops the header row
    cycle_length = n_readers  # number of files read in parallel
)
for line in dataset.take(15):
    print(line.numpy())
b'0.6303435674178064,1.874166156711919,-0.06713214279531016,-0.12543366804152128,-0.19737553788322462,-0.022722631725889016,-0.692407235065288,0.7265233438487496,2.419'
b'-1.0591781535672364,1.393564736946074,-0.026331968874673636,-0.11006759528831847,-0.6138198966579805,-0.09695934953589447,0.3247131133362288,-0.037477245413977976,0.672'
b'-0.6672227549433569,-0.04823952235146133,0.34529405473316743,0.5382668657200925,1.8521839533415545,-0.0611253832474835,-0.8417093045554153,1.520484740533546,1.59'
b'0.8115083791797953,-0.04823952235146133,0.5187339067174729,-0.029386394873127775,-0.034064024638222286,-0.05081594842905086,-0.7157356834231196,0.9162751241885168,2.147'
b'0.6363646332204844,-1.0895425985107923,0.09260902815633619,-0.20538124656801682,1.2025670451003232,-0.03630122549633783,-0.6784101660505877,0.182235342347858,2.429'
b'1.6312258686346301,0.3522616607867429,0.04080576110152256,-0.1408895163348976,-0.4632103899987006,-0.06751623819156843,-0.8277122355407183,0.5966931783531273,3.376'
b'-0.2223565745313433,1.393564736946074,0.02991299565857307,0.0801452044790158,-0.509481985418118,-0.06238599304952824,-0.86503775291325,0.8613469772480595,2.0'
b'1.5180511450515526,-0.5288409421173064,0.8102470510967439,-0.1921416982863481,0.44135393614167334,0.027335058055345158,-0.8183808561975836,0.8563535093443789,2.898'
b'-0.6906143291679195,-0.1283397589791022,7.0201810347470595,5.624287386169439,-0.2663292879200034,-0.03662080416157129,-0.6457503383496215,1.2058962626018372,1.352'
b'-1.0635474225567902,1.874166156711919,-0.49344892844525906,-0.06962612737313081,-0.273587577397559,-0.13419514417565354,1.0338979434143465,-1.3457658361775973,1.982'
b'0.04049225382803661,-0.6890414153725881,-0.44379851741607473,0.022374585146687852,-0.22187226486997497,-0.1482850314959248,-0.8883662012710817,0.6366409215825501,2.852'
b'0.199384450496934,1.0731637904355105,-0.19840853933562783,-0.29328906965393414,-0.07852104768825069,0.018804888420646343,0.8006134598360177,-1.1510205879341566,1.99'
b'-0.3295635160799086,0.9930635538078697,-0.8771740525217612,-0.3636710820906513,-1.1164564429787098,-0.08510593365640572,1.0665577711153127,-1.38571357940702,1.563'
b'-0.9868720801669367,0.832863080552588,-0.18684708416901633,-0.14888949288707784,-0.4532302419670616,-0.11504995754593579,1.6730974284189664,-0.7465496877362412,1.138'
b'-0.9974222662636643,1.2333642636907922,-0.7577192870888144,-0.011109251557751528,-0.23003784053222506,0.05487422342718872,-0.757726890467217,0.7065494722340417,1.739'

What we have now is one big dataset whose elements are strings. Each string consists of 9 comma-separated fields, and all 9 fields are numbers; the next step is to parse these fields.

3. Parsing the CSV lines

Here we use tf.io.decode_csv(records, record_defaults) to parse each line, where records is the string (or batch of strings) to parse and record_defaults defines the type and default value of each field.
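A minimal sketch of how record_defaults controls the field types (the sample values are made up purely for illustration):

sample_line = tf.constant("1,2.5,hello")
parsed = tf.io.decode_csv(
    sample_line,
    record_defaults=[tf.constant(0),     # field 0 -> int32
                     tf.constant(0.0),   # field 1 -> float32
                     tf.constant("")])   # field 2 -> string
print(parsed)  # [<... 1>, <... 2.5>, <... b'hello'>]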

Let's try parsing a single line:

def parse_csv_line(line, n_fields = 9):
    defs = [tf.constant(np.nan)] * n_fields  # every field defaults to a float32 NaN
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])  # stack the first 8 fields into a feature vector
    y = tf.stack(parsed_fields[-1:])   # the last field becomes a length-1 label vector
    return x, y

parse_csv_line(b'-0.9868720801669367,0.832863080552588,-0.18684708416901633,-0.14888949288707784,-0.4532302419670616,-0.11504995754593579,1.6730974284189664,-0.7465496877362412,1.138',
               n_fields=9)
(<tf.Tensor: id=59, shape=(8,), dtype=float32, numpy=
 array([-0.9868721 ,  0.8328631 , -0.18684709, -0.1488895 , -0.45323023,
        -0.11504996,  1.6730974 , -0.74654967], dtype=float32)>,
 <tf.Tensor: id=60, shape=(1,), dtype=float32, numpy=array([1.138], dtype=float32)>)

4. Putting it all together

Now we connect the three steps above.

In csv_reader_dataset, the map call applies parse_csv_line to every line.

parse_csv_line splits each line into the first eight fields and the last one, i.e. x and y. As a result, every element of the dataset returned by csv_reader_dataset is a pair of batches, x_batch and y_batch.

def csv_reader_dataset(filenames, n_readers=5,
                       batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    
    # turn the filenames into a dataset
    dataset = tf.data.Dataset.list_files(filenames)
    
    # repeat the dataset indefinitely
    dataset = dataset.repeat() 
    
    # read the file contents and merge them into one big dataset
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers
    )
    # shuffle (note: shuffle returns a new dataset, so the result must be assigned back)
    dataset = dataset.shuffle(shuffle_buffer_size)
    
    # parse each line into (x, y)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    
    # group into batches
    dataset = dataset.batch(batch_size)
    return dataset

train_set = csv_reader_dataset(train_filenames, batch_size=3)  # 3 records per batch
for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)

Note:

First, why repeat the dataset indefinitely, and why shuffle?

We repeat indefinitely because training will usually iterate over the dataset many times. We shuffle because we do not want the order of the data to affect training: training proceeds batch by batch, and if certain records always appear together in the same batch, the network may learn spurious relationships between the records of a batch, which is not what we want.

1. Why do we first turn all the filenames into a dataset and then repeat that dataset indefinitely?

Because if the filenames repeat indefinitely, the final dataset built from them is effectively infinite as well. (A tiny sketch of repeat() on its own follows below.)
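A minimal sketch of what repeat() does, on a toy dataset unrelated to the housing data:

toy = tf.data.Dataset.range(3).repeat()  # 0, 1, 2, 0, 1, 2, ... without end
print([v.numpy() for v in toy.take(7)])  # [0, 1, 2, 0, 1, 2, 0]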

2. map parses each line, and batch groups the dataset into chunks of batch_size.

The map call applies parse_csv_line to every item of the dataset from the previous step; batch then packs the parsed (x, y) pairs into batches. (A toy sketch follows below.)
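Again as a toy sketch of map followed by batch, independent of the CSV parsing:

toy = tf.data.Dataset.range(4).map(lambda x: x * 2).batch(2)
for b in toy:
    print(b.numpy())
# [0 2]
# [4 6]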

Finally, does the order in which repeat(), shuffle() and batch() are applied matter, and what is the effect?

In general it makes little difference here, but note that this example really involves two datasets: the filename dataset and the file-content dataset. repeat can be applied to either of them, whereas shuffle and batch only make sense on the latter; otherwise the data is not what we want. (A small sketch of how the shuffle/batch order changes the result follows after the output below.)

The two batches printed by the train_set.take(2) loop above look like this:

x:
<tf.Tensor: id=383, shape=(3, 8), dtype=float32, numpy=
array([[ 8.0154431e-01,  2.7216142e-01, -1.1624393e-01, -2.0231152e-01,
        -5.4305160e-01, -2.1039616e-02, -5.8976209e-01, -8.2418457e-02],
       [ 8.1150836e-01, -4.8239522e-02,  5.1873392e-01, -2.9386396e-02,
        -3.4064025e-02, -5.0815947e-02, -7.1573567e-01,  9.1627514e-01],
       [ 4.9710345e-02, -8.4924191e-01, -6.2146995e-02,  1.7878747e-01,
        -8.0253541e-01,  5.0660671e-04,  6.4664572e-01, -1.1060793e+00]],
      dtype=float32)>
y:
<tf.Tensor: id=384, shape=(3, 1), dtype=float32, numpy=
array([[3.226],
       [2.147],
       [2.286]], dtype=float32)>
x:
<tf.Tensor: id=385, shape=(3, 8), dtype=float32, numpy=
array([[ 0.40127665, -0.92934215, -0.0533305 , -0.18659453,  0.65456617,
         0.02643447,  0.9312528 , -1.4406418 ],
       [ 0.15782312,  0.4323619 ,  0.3379948 , -0.01588031, -0.37338907,
        -0.05305246,  0.80061346, -1.2359096 ],
       [-0.29807281,  0.35226166, -0.10920507, -0.25055522, -0.03406402,
        -0.006034  ,  1.0805548 , -1.0611382 ]], dtype=float32)>
y:
<tf.Tensor: id=386, shape=(3, 1), dtype=float32, numpy=
array([[2.512],
       [3.169],
       [1.514]], dtype=float32)>
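To see concretely how the relative order of shuffle and batch changes the result, here is a small sketch on a toy range dataset: shuffling before batching mixes individual elements across batches, while batching first and then shuffling only reorders whole batches and keeps their contents fixed.

toy = tf.data.Dataset.range(6)

# shuffle then batch: elements are mixed before being grouped
print([b.numpy().tolist() for b in toy.shuffle(6).batch(2)])
# e.g. [[4, 0], [5, 2], [1, 3]]  (random pairs)

# batch then shuffle: the batches [0,1], [2,3], [4,5] are formed first,
# and only their order is shuffled
print([b.numpy().tolist() for b in toy.batch(2).shuffle(3)])
# e.g. [[2, 3], [0, 1], [4, 5]]  (random order, fixed contents)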

Appendix: the full code:

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

output_dir = "generate_csv"  # directory that will hold the generated data files
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

# Save one dataset (train, valid, or test) to a set of CSV files.
def save_to_csv(output_dir, 
                data, 
                name_prefix, 
                header=None, 
                n_parts=10): 
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    
    for file_idx, row_indices in enumerate(
        np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n") 
            for row_index in row_indices:  # iterate over the row indices of this part
                f.write(",".join(
                    [repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames

# np.c_ concatenates arrays column-wise, appending the label column to the feature columns
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]

header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir, train_data, "train",
                              header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid",
                              header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test",
                             header_str, n_parts=10)

def parse_csv_line(line, n_fields = 9):
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])  # stack the first 8 fields into a feature vector
    y = tf.stack(parsed_fields[-1:])   # the last field becomes a length-1 label vector
    return x, y

def csv_reader_dataset(filenames, n_readers=5,
                       batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    
    # turn the filenames into a dataset
    dataset = tf.data.Dataset.list_files(filenames)
    
    # repeat the dataset indefinitely
    dataset = dataset.repeat() 
    
    # read the file contents and merge them into one big dataset
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers
    )
    # shuffle (shuffle returns a new dataset, so the result must be assigned back)
    dataset = dataset.shuffle(shuffle_buffer_size)
    
    # parse each line into (x, y)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    
    # group into batches
    dataset = dataset.batch(batch_size)
    return dataset


batch_size = 32
train_set = csv_reader_dataset(train_filenames,
                               batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames,
                               batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,
                              batch_size = batch_size)

model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',
                       input_shape=[8]),
    keras.layers.Dense(1),
])
model.compile(loss="mean_squared_error", optimizer="sgd")
callbacks = [keras.callbacks.EarlyStopping(
    patience=5, min_delta=1e-2)]

history = model.fit(train_set,
                    validation_data = valid_set,
                    # the dataset repeats indefinitely, so Keras must be told how many
                    # steps make up one epoch (the training split has 11610 rows)
                    steps_per_epoch = 11610 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs = 100,
                    callbacks = callbacks)

model.evaluate(test_set, steps = 5160 // batch_size)

 
