Reading a CSV file
- Read directly with NumPy (suitable when the dataset is small):
import numpy
my_matrix = numpy.loadtxt(open("./train.csv", "rb"), delimiter=",", skiprows=1)
- Read with pandas:
import pandas as pd
source_data = pd.read_csv("./train.csv", header=None, skiprows=1)
data = source_data.values
Data processing
Reading from an array
The array has the form [[0, 0], [10, 1], [20, 2], ...], where each row is an (x, y) pair:
import numpy as np
import math

data = np.array([[x * 10, x] for x in range(16)])

def xs_gen(data, batch_size):
    lists = data
    num_batch = math.ceil(len(lists) / batch_size)  # number of batches per epoch
    for i in range(num_batch):
        batch_list = lists[i * batch_size : i * batch_size + batch_size]
        np.random.shuffle(batch_list)  # shuffle rows only within the current batch
        batch_x = np.array([x for x in batch_list[:, 0]])
        batch_y = np.array([y for y in batch_list[:, 1]])
        yield batch_x, batch_y
def xs_gen_pro(data, batch_size):
    lists = data
    num_batch = math.ceil(len(lists) / batch_size)  # number of batches per epoch
    np.random.shuffle(lists)  # shuffle the whole dataset first, so batch membership changes too
    for i in range(num_batch):
        batch_list = lists[i * batch_size : i * batch_size + batch_size]
        np.random.shuffle(batch_list)
        batch_x = np.array([x for x in batch_list[:, 0]])
        batch_y = np.array([y for y in batch_list[:, 1]])
        yield batch_x, batch_y
# Custom generator for Keras: it must loop forever, reshuffling once per epoch
def xs_gen_keras(data, batch_size):
    lists = data
    num_batch = math.ceil(len(lists) / batch_size)  # number of batches per epoch
    while True:
        np.random.shuffle(lists)
        for i in range(num_batch):
            batch_list = lists[i * batch_size : i * batch_size + batch_size]
            np.random.shuffle(batch_list)
            batch_x = np.array([x for x in batch_list[:, 0]])
            batch_y = np.array([y for y in batch_list[:, 1]])
            yield batch_x, batch_y
if __name__ == "__main__":
    # Running xs_gen twice: batch membership is identical on both passes,
    # because only the order inside each batch is shuffled.
    for x, y in xs_gen(data, 5):
        print("item", x, y)
    for x, y in xs_gen(data, 5):
        print("item", x, y)
Using the keras.utils.Sequence class to generate batches
(see @Xovee's article: https://blog.csdn.net/xovee/article/details/91357143)
import math
import numpy as np
import keras

class Generator(keras.utils.Sequence):
    """Sequence that serves (x, y) batches to Keras."""
    def __init__(self, x, y, batch_size):
        """Keep references to the full arrays and the batch size."""
        self.x, self.y = x, y
        self.batch_size = batch_size
    def __len__(self):
        """Number of batches per epoch."""
        return math.ceil(len(self.y) / self.batch_size)
    def __getitem__(self, idx):
        """Return batch number idx as a pair of arrays."""
        b_x = self.x[idx * self.batch_size : (idx + 1) * self.batch_size]
        b_y = self.y[idx * self.batch_size : (idx + 1) * self.batch_size]
        return np.array(b_x), np.array(b_y)
    def on_epoch_end(self):
        """Hook invoked by Keras at the end of every epoch (no-op here)."""
        pass
LENS = 640        # number of rows used for training; the rest go to validation
LONGS = 792       # not used in this snippet
batch_size = 100
input_data = pd.read_csv("./train.csv", header=None, skiprows=1).values
train_data = input_data[:LENS]
train_x = np.array([x for x in train_data[:,1:-1]])
train_y = np.array([x for x in train_data[:,-1]])
val_data = input_data[LENS:]
val_x = np.array([x for x in val_data[:,1:-1]])
val_y = np.array([x for x in val_data[:,-1]])
train_generator = Generator(train_x, train_y, batch_size)
val_generator = Generator(val_x, val_y, batch_size)
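These Sequence objects can be passed straight to Keras. The sketch below assumes a compiled model named model and an epoch count EPOCHS (neither is defined above); because the generators implement __len__, fit_generator can infer the number of steps per epoch on its own:
# Sketch: Keras reads steps_per_epoch / validation_steps from len(generator)
history = model.fit_generator(
    train_generator,
    epochs=EPOCHS,
    validation_data=val_generator
)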
Applying the generators
Using a generator in TensorFlow or PyTorch
for e in range(EPOCHS):
    for x, y in xs_gen(train_data, BATCH_SIZE):  # recreate the generator at every epoch
        train(x, y)  # placeholder for a single training step
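As a more concrete illustration, a hedged PyTorch version of that loop could look like the sketch below; net, criterion, and optimizer are hypothetical placeholders for an already-built model, loss function, and optimizer, and the NumPy batches from the generator are converted to tensors before each step:
import torch

for e in range(EPOCHS):
    for x, y in xs_gen(train_data, BATCH_SIZE):
        x_t = torch.as_tensor(x, dtype=torch.float32)  # numpy batch -> tensor
        y_t = torch.as_tensor(y, dtype=torch.float32)
        optimizer.zero_grad()
        loss = criterion(net(x_t), y_t)
        loss.backward()
        optimizer.step()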
Using a generator in Keras
total_train = len(train_data)  # number of training samples
total_val = len(val_data)      # number of validation samples

train_data_gen = xs_gen_keras(train_data, BATCH_SIZE)
val_data_gen = xs_gen_keras(val_data, BATCH_SIZE)

history = model.fit_generator(
    train_data_gen,
    steps_per_epoch=int(np.ceil(total_train / float(BATCH_SIZE))),
    epochs=EPOCHS,
    validation_data=val_data_gen,
    validation_steps=int(np.ceil(total_val / float(BATCH_SIZE)))
)
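In the Keras bundled with TensorFlow 2.x, fit_generator is deprecated and model.fit accepts Python generators (and Sequence objects) directly, so an equivalent call keeps the same arguments:
history = model.fit(
    train_data_gen,
    steps_per_epoch=int(np.ceil(total_train / float(BATCH_SIZE))),
    epochs=EPOCHS,
    validation_data=val_data_gen,
    validation_steps=int(np.ceil(total_val / float(BATCH_SIZE)))
)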