Data Preparation for Training Neural Networks

Training a neural network requires preprocessing and preparing the data first; image classification is used as the running example here.
The mainstream training frameworks are TensorFlow, PyTorch, and Keras. This post walks through preparing the data formats that each of these three frameworks expects.

Storing images as arrays

This approach reads each image into an array and appends it to a list held in memory; during training, batches are drawn directly from that list. The advantage is that it is simple and transparent; the drawback is that a large number of images consumes a correspondingly large amount of memory, so it is not recommended for big datasets.

Implementation

import numpy as np
import cv2
import os
from sklearn.model_selection import train_test_split

def read_data(filepath):
    """Walk filepath/<class_name>/<image> and load every image into memory."""
    data = []
    label = []
    class_dirs = os.listdir(filepath)
    print(class_dirs)
    for class_name in class_dirs:
        class_path = os.path.join(filepath, class_name)
        for img_name in os.listdir(class_path):
            img_path = os.path.join(class_path, img_name)
            print(img_path)
            img = cv2.imread(img_path)
            if img is None:  # skip unreadable files
                continue
            data.append(img)
            label.append(class_name)  # the folder name serves as the label
    return data, label


def batch_generator(all_data, batch_size, shuffle=True):
    """
    :param all_data: the full dataset, as a list of parallel arrays (e.g. [data, label])
    :param batch_size: number of samples per batch
    :param shuffle: whether to reshuffle the order on each pass
    :return: a generator that yields one batch per iteration
    """
    all_data = [np.array(d) for d in all_data]
    data_size = all_data[0].shape[0]
    print("data_size: ", data_size)
    if shuffle:
        p = np.random.permutation(data_size)
        all_data = [d[p] for d in all_data]

    batch_count = 0
    while True:
        if batch_count * batch_size + batch_size > data_size:
            batch_count = 0
            if shuffle:
                p = np.random.permutation(data_size)
                all_data = [d[p] for d in all_data]
        start = batch_count * batch_size
        end = start + batch_size
        batch_count += 1
        # yield both returns a value and suspends the function; a function
        # containing yield is a generator (a true iterator)
        yield [d[start: end] for d in all_data]


batch_size = 124
file_path = 'D:/Pictures'
data, label = read_data(file_path)
# train_test_split splits the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.8)
batch_gen = batch_generator([X_train, y_train], batch_size)  # batch from the training split
batch_x, batch_y = next(batch_gen)
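
One caveat: read_data returns the raw folder names as string labels, while most loss functions expect integer class indices. A minimal sketch of the mapping step, applied before the split (this helper is my addition, not part of the original code):

classes = sorted(set(label))                      # deterministic class order
class_to_idx = {name: i for i, name in enumerate(classes)}
label = np.array([class_to_idx[name] for name in label])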
Reading data with TFRecord

TFRecord is really just a storage format: you first read the raw data, convert it to TFRecord, and write it to disk. TensorFlow ships with functions designed to work with TFRecord files, which can speed up data handling. When reading, you pass the TFRecord file(s) as arguments to create an input queue with a configurable capacity (bounded by the available hardware). As records are dequeued, the remaining records in the file are prefetched into the queue, and this happens independently of the network's computation. In other words, each training iteration does not have to wait for the data queue to be refilled, the queue always holds enough data, and filling it can itself be accelerated with multiple threads.

Implementation:

import os
import tensorflow as tf
from PIL import Image

cwd = os.getcwd()

# A list (rather than a set) keeps the label indices deterministic
classes = ['cat', 'dog', 'fox']

def create_record():  # write the binary TFRecord file
    writer = tf.python_io.TFRecordWriter("train.tfrecords")
    for index, name in enumerate(classes):
        class_path = cwd + "/" + name + "/"
        for img_name in os.listdir(class_path):
            img_path = class_path + img_name
            img = Image.open(img_path)
            img = img.resize((224, 224))
            img_raw = img.tobytes()  # serialize the image to raw bytes
            print(index, img_path)
            example = tf.train.Example(
                features=tf.train.Features(feature={
                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
                    'img_raw': tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_raw]))
                }))
            writer.write(example.SerializeToString())
    writer.close()



def read_and_decode(filename):  # read back the binary data
    # Create a file-name queue; the number of reads is unbounded
    filename_queue = tf.train.string_input_producer([filename])
    # Create a reader that pulls from the file queue
    reader = tf.TFRecordReader()
    # The reader pops one serialized example off the queue
    _, serialized_example = reader.read(filename_queue)
    # Parse the serialized example back into features
    features = tf.parse_single_example(
        serialized_example,
        features={
            'label': tf.FixedLenFeature([], tf.int64),
            'img_raw': tf.FixedLenFeature([], tf.string)
        }
    )
    label = features['label']
    img = features['img_raw']
    img = tf.decode_raw(img, tf.uint8)
    img = tf.reshape(img, [224, 224, 3])
    img = tf.cast(img, tf.float32) * (1. / 255) - 0.5  # scale to [-0.5, 0.5]
    label = tf.cast(label, tf.int32)
    return img, label


if __name__ == '__main__':
    tfrecord_file = "train.tfrecords"
    if not os.path.exists(tfrecord_file):
        create_record()  # write the TFRecord file first if it does not exist yet
    img, label = read_and_decode(tfrecord_file)
    print("tengxing", img, label)

    # shuffle_batch randomly shuffles the inputs; next_batch reads sequentially.
    # Batching [img, label] together keeps features and labels in sync; used this
    # way, each image stays paired with its own label.
    # shuffle_batch builds a RandomShuffleQueue, keeps pushing individual
    # [img, label] pairs into it, and returns the result of
    # RandomShuffleQueue.dequeue_many().
    img_batch, label_batch = tf.train.shuffle_batch([img, label],
                                                    batch_size=4, capacity=2000,
                                                    min_after_dequeue=1000)

    # Initialize all variables (initialize_all_variables is deprecated)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        # start the queue-runner threads
        threads = tf.train.start_queue_runners(sess=sess)
        for i in range(5):
            print(img_batch.shape, label_batch)
            val, l = sess.run([img_batch, label_batch])
            print(val.shape, l)
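
In TensorFlow 2.x the queue-based input pipeline above has been removed; the tf.data API provides the same prefetching behavior with far less code. A minimal sketch of the equivalent pipeline (my adaptation, assuming the same train.tfrecords schema as above):

import tensorflow as tf

def _parse(serialized):
    # Same feature schema that create_record wrote above
    features = tf.io.parse_single_example(serialized, {
        'label': tf.io.FixedLenFeature([], tf.int64),
        'img_raw': tf.io.FixedLenFeature([], tf.string)})
    img = tf.io.decode_raw(features['img_raw'], tf.uint8)
    img = tf.reshape(img, [224, 224, 3])
    img = tf.cast(img, tf.float32) / 255. - 0.5
    return img, tf.cast(features['label'], tf.int32)

dataset = (tf.data.TFRecordDataset("train.tfrecords")
           .map(_parse)
           .shuffle(1000)                # analogous to min_after_dequeue
           .batch(4)
           .prefetch(tf.data.AUTOTUNE))  # overlap input with training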
Reading data for training in PyTorch

PyTorch can build training and test datasets directly from data already held in memory (a minimal sketch follows), or through the transforms and datasets modules of the torchvision package.
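
The in-memory route uses TensorDataset and DataLoader; a minimal sketch (the random arrays below are placeholders standing in for real data, not part of the original post):

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader

# Placeholder arrays: float32 images in NCHW layout, int64 class indices
images = np.random.rand(100, 3, 224, 224).astype('float32')
labels = np.random.randint(0, 3, size=100)

dataset = TensorDataset(torch.from_numpy(images), torch.from_numpy(labels))
loader = DataLoader(dataset, batch_size=16, shuffle=True)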

The torchvision version:

import os
import time
import torch
import torch.nn as nn
from torch.autograd import Variable
from torchvision import datasets, transforms, models


path = 'scene'
transform = transforms.Compose([transforms.CenterCrop(224),
                                transforms.ToTensor(),
                                transforms.Normalize([0.5,0.5,0.5], [0.5,0.5,0.5])])

'''

Resize: resize the given image to the given size
Normalize: normalize a tensor image with mean and standard deviation
ToTensor: convert a PIL image (H*W*C) in range [0, 255] to a torch.Tensor (C*H*W) in range [0.0, 1.0]
ToPILImage: convert a tensor to a PIL image
Scale: deprecated; use Resize instead
CenterCrop: crop the central region of the image
RandomCrop: crop at a random position
RandomHorizontalFlip: horizontally flip the given PIL image with probability 0.5
RandomVerticalFlip: vertically flip the given PIL image with probability 0.5
RandomResizedCrop: crop the PIL image to a random size and aspect ratio
Grayscale: convert the image to grayscale
RandomGrayscale: convert the image to grayscale with a given probability
FiveCrop: crop the image into the four corners and the center
Pad: pad the image
ColorJitter: randomly change the brightness, contrast, and saturation of the image

'''
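# A typical training-time pipeline would add random augmentation on top of the
# deterministic transform above (hypothetical alternative, my addition):
# train_transform = transforms.Compose([
#     transforms.RandomResizedCrop(224),
#     transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])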
data_image = {x:datasets.ImageFolder(root = os.path.join(path,x),
                                     transform = transform)
              for x in ["train", "val"]}

data_loader_image = {x:torch.utils.data.DataLoader(dataset=data_image[x],
                                                batch_size = 64,
                                                shuffle = True)
                     for x in ["train", "val"]}


use_gpu = torch.cuda.is_available()
print(use_gpu)

classes = data_image["train"].classes  # the class names in the training set
classes_index = data_image["train"].class_to_idx  # the integer index assigned to each class
print(classes)
print(classes_index)

print("train data set:", len(data_image["train"]))
print("val data set:", len(data_image["val"]))


# X_train,y_train = next(iter(data_loader_image["train"]))

# plot one batch of images
# mean = [0.5, 0.5, 0.5]
# std = [0.5, 0.5, 0.5]
# img = torchvision.utils.make_grid(X_train)
# img = img.numpy().transpose((1,2,0))
# img = img*std + mean
#
# print([classes[i] for i in y_train])
# plt.imshow(img)



param = 'train'
batch = 0  # the counter must be initialized before the loop
for data in data_loader_image[param]:  # data_loader_image is an iterable over the train (or val) set
    batch += 1
    x, y = data
    if use_gpu:
        x, y = Variable(x.cuda()), Variable(y.cuda())
    else:
        x, y = Variable(x), Variable(y)
Building datasets in Keras

Keras supports many ways of loading data; see the Keras documentation for the full list. Three common ones follow.

## ------------------ Loading a built-in dataset -----------------------
from keras.datasets import mnist
from keras.utils import to_categorical

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

train_images = train_images.reshape((60000, 28, 28, 1))
train_images = train_images.astype('float32') / 255

test_images = test_images.reshape((10000, 28, 28, 1))
test_images = test_images.astype('float32') / 255

train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)
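
A quick sanity check on the prepared arrays (my addition; the expected values follow directly from the reshape, scaling, and to_categorical calls above):

print(train_images.shape, train_labels.shape)  # (60000, 28, 28, 1) (60000, 10)
print(train_images.min(), train_images.max())  # 0.0 1.0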


## ------------------- Loading data from a CSV file -----------------------
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator

train_df=pd.read_csv("train.csv", dtype={'label':str})
print(type(train_df['label'][0]))
train_datagen=ImageDataGenerator(rescale=1./255)
train_generator=train_datagen.flow_from_dataframe(
    dataframe=train_df, directory="/data/SDWE/data", 
    x_col="id", y_col="label", 
    class_mode="categorical", target_size=(160,120), batch_size=40)

val_df=pd.read_csv("val.csv", dtype={'label':str})
val_datagen = ImageDataGenerator(rescale=1./255)
val_generator=val_datagen.flow_from_dataframe(
    dataframe=val_df, directory="/data/SDWE/data", 
    x_col="id", y_col="label", 
    class_mode="categorical", target_size=(160,120), batch_size=40)

test_df=pd.read_csv("test.csv", dtype={'label':str})
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator=test_datagen.flow_from_dataframe(
    dataframe=test_df, directory="/data/SDWE/data", 
    x_col="id", y_col="label", 
    class_mode="categorical", target_size=(160,120), batch_size=40)

for data_batch, labels_batch in train_generator:
    print('data batch shape:', data_batch.shape)
    print('labels batch shape:', labels_batch.shape)
    break
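
Once the generators exist, they plug straight into training. A minimal sketch with a throwaway one-layer model (my addition; any compiled Keras model with a matching input shape would do):

from keras import models, layers

# Input shape follows target_size=(160, 120) plus 3 color channels
model = models.Sequential([
    layers.Flatten(input_shape=(160, 120, 3)),
    layers.Dense(train_generator.num_classes, activation='softmax')])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

model.fit_generator(train_generator,
                    steps_per_epoch=len(train_df) // 40,  # 40 = batch_size above
                    epochs=1,
                    validation_data=val_generator,
                    validation_steps=len(val_df) // 40)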

## ------------------- Loading data from a directory -----------------------
from keras.preprocessing.image import ImageDataGenerator

train_dir = './scene/train_val'
test_dir = './scene/test'

train_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)
# train_datagen = ImageDataGenerator(
#       rescale=1./255,
#       rotation_range=40,
#       width_shift_range=0.2,
#       height_shift_range=0.2,
#       shear_range=0.2,
#       zoom_range=0.2,
#       horizontal_flip=True,
#       fill_mode='nearest')

train_generator = train_datagen.flow_from_directory(
        train_dir,
        target_size=(224,224),
        batch_size=11,
        class_mode='categorical')

val_generator = test_datagen.flow_from_directory(
        test_dir,
        target_size=(224,224),
        batch_size=11,
        class_mode='categorical')

for data_batch, labels_batch in train_generator:
    print('data batch shape:', data_batch.shape)
    print('labels batch shape:', labels_batch.shape)
    break