用Python将jpg格式文件逐个打标签之后再写入hdf5文件

最新推荐文章于 2024-05-19 09:36:09 发布

SSSLLL1118

最新推荐文章于 2024-05-19 09:36:09 发布

阅读量2.7k

点赞数 2

文章标签： python 卷积神经网络 cnn 计算机视觉分类

本文链接：https://blog.csdn.net/qwaszx3793/article/details/121940879

版权

在用CNN进行图片分类的任务时，发现很多demo都是使用已有的h5文件数据集进行训练

但是其他人的数据集毕竟不能100%契合每个项目

所以还是想自己准备数据集

可照片好拍，打标签和制作h5文件却很麻烦

试了一些第三方打标工具，包括labelme啥的，不是要花钱就是不好用

所以自己用Python写了个小程序，功能是：

从文件夹遍历所有图片，然后逐一打标

最后生成h5文件

程序代码如下：

1.导入所有需要的包：

import os
import cv2
from keras.preprocessing import image
from matplotlib.pyplot import imshow, show
import numpy as np
from keras.applications.imagenet_utils import preprocess_input
import h5py
from sklearn.model_selection import train_test_split

2.手动输入分类数量、训练集h5文件名、测试集h5文件名、训练集占比：

# Number of classes. Valid labels are the integers 0 .. class_num - 1.
# Re-prompt on bad input so a typo does not abort the whole session.
while True:
    try:
        class_num = int(input('Input the number of classes:'))
    except ValueError:
        continue
    if class_num > 0:
        break
print(str(class_num) + " classes!")

# Create the HDF5 file that will receive the training set.
# Mode "w" creates the file, truncating any existing file of the same name.
file_name = input('Input the hdf5 training dataset file name:')
f = h5py.File(file_name + ".hdf5", "w")

# Create the HDF5 file that will receive the test set.
file_name = input('Input the hdf5 test dataset file name:')
f2 = h5py.File(file_name + ".hdf5", "w")

# Fraction of the data used for training, as a number strictly between 0 and 1
# (e.g. 0.8). Validated here so that an unusable value is rejected before the
# manual labelling work is done, not afterwards when the data is split.
while True:
    try:
        train_per = float(input('Input the percentage of the training set:'))
    except ValueError:
        continue
    if 0.0 < train_per < 1.0:
        break
print('The training set accounts for ' + str(train_per * 100) + '% of the whole dataset.')

3.从当前目录（包括子目录）递归遍历所有jpg图片：

def getFileList(dir, Filelist, ext=None):
    """
    Recursively collect file paths under ``dir`` into ``Filelist``.

    dir:      a file path or the root directory to scan (sub-directories are
              scanned too).
    Filelist: list that matching paths are appended to; it is modified in
              place and also returned.
    ext:      file extension to keep, with or without a leading dot
              (e.g. 'jpg' or '.jpg'). Compared case-insensitively against the
              file's real extension. ``None`` or '' keeps every file.

    Returns ``Filelist``. Directory entries are visited in sorted order so the
    result is the same on every run.
    """
    if os.path.isfile(dir):
        if not ext:
            Filelist.append(dir)
        else:
            # Compare against the actual extension instead of the last three
            # characters, so 'jpeg' can match and 'notjpg' does not.
            wanted = ext.lower().lstrip('.')
            suffix = os.path.splitext(dir)[1].lower().lstrip('.')
            if suffix == wanted:
                Filelist.append(dir)

    elif os.path.isdir(dir):
        for s in sorted(os.listdir(dir)):
            getFileList(os.path.join(dir, s), Filelist, ext)

    return Filelist
 
org_img_folder='./'

# Scan for images. The list has to be built before it can be counted;
# counting first raises NameError because imglist does not exist yet.
imglist = getFileList(org_img_folder, [], 'jpg')
img_num = len(imglist)
print('本次执行检索到 '+str(img_num)+' 张图像\n')

4.进行图片逐一标注：

# Local import: time.sleep is used below but `time` is not in the import list.
import time

# Array holding the pixel values of every image (each resized to 64x64, 3 channels).
X_train = np.zeros((img_num, 64, 64, 3))
print(X_train.shape)
# Array holding the label entered for each image.
y_train = np.zeros((img_num, 1))

for i, img_path in enumerate(imglist):
    print(img_path)
    img = image.load_img(img_path, target_size=(64, 64))
    # Show the image so the user can decide which class it belongs to.
    imshow(img)
    show()
    time.sleep(1)

    # Keep asking until the answer is an integer in [0, class_num).
    # Non-numeric input is re-prompted instead of crashing the session, and
    # negative numbers are rejected rather than stored as labels.
    while True:
        class1 = input('which class?')
        try:
            label = int(class1)
        except ValueError:
            continue
        if 0 <= label < class_num:
            break

    # Store the pixel values and the validated integer label.
    X_train[i] = image.img_to_array(img)
    y_train[i] = label

5.分割训练集和测试集：

# train_per is the fraction of the data meant for TRAINING, so it must be
# passed as train_size. Passing it as test_size inverts the split
# (e.g. entering 0.8 would leave only 20% of the images for training).
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, train_size=train_per, random_state=42)

# Shapes of the four arrays; only displayed when run in an interactive session.
X_train.shape, X_test.shape, y_train.shape, y_test.shape
# If you only want the image data as numpy arrays, you can stop here.

6.将训练集和测试集分别写入hdf5文件中：

# Write both splits to their HDF5 files. The with-statement closes both files
# when the block exits, including when one of the writes raises.
with f, f2:
    # Training data.
    f.create_dataset("X_train", data=X_train)
    f.create_dataset("y_train", data=y_train)

    # Test data.
    f2.create_dataset("X_test", data=X_test)
    f2.create_dataset("y_test", data=y_test)

完整代码：

import os
import cv2
from keras.preprocessing import image
from matplotlib.pyplot import imshow, show
import numpy as np
from keras.applications.imagenet_utils import preprocess_input
import h5py
from sklearn.model_selection import train_test_split

# Number of classes. Valid labels are the integers 0 .. class_num - 1.
# Re-prompt on bad input so a typo does not abort the whole session.
while True:
    try:
        class_num = int(input('Input the number of classes:'))
    except ValueError:
        continue
    if class_num > 0:
        break
print(str(class_num) + " classes!")

# Create the HDF5 file that will receive the training set.
# Mode "w" creates the file, truncating any existing file of the same name.
file_name = input('Input the hdf5 training dataset file name:')
f = h5py.File(file_name + ".hdf5", "w")

# Create the HDF5 file that will receive the test set.
file_name = input('Input the hdf5 test dataset file name:')
f2 = h5py.File(file_name + ".hdf5", "w")

# Fraction of the data used for training, as a number strictly between 0 and 1
# (e.g. 0.8). Validated here so that an unusable value is rejected before the
# manual labelling work is done, not afterwards when the data is split.
while True:
    try:
        train_per = float(input('Input the percentage of the training set:'))
    except ValueError:
        continue
    if 0.0 < train_per < 1.0:
        break
print('The training set accounts for ' + str(train_per * 100) + '% of the whole dataset.')

def getFileList(dir, Filelist, ext=None):
    """
    Recursively collect file paths under ``dir`` into ``Filelist``.

    dir:      a file path or the root directory to scan (sub-directories are
              scanned too).
    Filelist: list that matching paths are appended to; it is modified in
              place and also returned.
    ext:      file extension to keep, with or without a leading dot
              (e.g. 'jpg' or '.jpg'). Compared case-insensitively against the
              file's real extension. ``None`` or '' keeps every file.

    Returns ``Filelist``. Directory entries are visited in sorted order so the
    result is the same on every run.
    """
    if os.path.isfile(dir):
        if not ext:
            Filelist.append(dir)
        else:
            # Compare against the actual extension instead of the last three
            # characters, so 'jpeg' can match and 'notjpg' does not.
            wanted = ext.lower().lstrip('.')
            suffix = os.path.splitext(dir)[1].lower().lstrip('.')
            if suffix == wanted:
                Filelist.append(dir)

    elif os.path.isdir(dir):
        for s in sorted(os.listdir(dir)):
            getFileList(os.path.join(dir, s), Filelist, ext)

    return Filelist
 
org_img_folder='./'

# Scan for images. The list has to be built before it can be counted;
# counting first raises NameError because imglist does not exist yet.
imglist = getFileList(org_img_folder, [], 'jpg')
img_num = len(imglist)
print('本次执行检索到 '+str(img_num)+' 张图像\n')


# Local import: time.sleep is used below but `time` is not in the import list.
import time

# Array holding the pixel values of every image (each resized to 64x64, 3 channels).
X_train = np.zeros((img_num, 64, 64, 3))
print(X_train.shape)
# Array holding the label entered for each image.
y_train = np.zeros((img_num, 1))

for i, img_path in enumerate(imglist):
    print(img_path)
    img = image.load_img(img_path, target_size=(64, 64))
    # Show the image so the user can decide which class it belongs to.
    imshow(img)
    show()
    time.sleep(1)

    # Keep asking until the answer is an integer in [0, class_num).
    # Non-numeric input is re-prompted instead of crashing the session, and
    # negative numbers are rejected rather than stored as labels.
    while True:
        class1 = input('which class?')
        try:
            label = int(class1)
        except ValueError:
            continue
        if 0 <= label < class_num:
            break

    # Store the pixel values and the validated integer label.
    X_train[i] = image.img_to_array(img)
    y_train[i] = label

# train_per is the fraction of the data meant for TRAINING, so it must be
# passed as train_size. Passing it as test_size inverts the split
# (e.g. entering 0.8 would leave only 20% of the images for training).
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, train_size=train_per, random_state=42)

# Shapes of the four arrays; only displayed when run in an interactive session.
X_train.shape, X_test.shape, y_train.shape, y_test.shape
# If you only want the image data as numpy arrays, you can stop here.

# Write both splits to their HDF5 files. The with-statement closes both files
# when the block exits, including when one of the writes raises.
with f, f2:
    # Training data.
    f.create_dataset("X_train", data=X_train)
    f.create_dataset("y_train", data=y_train)

    # Test data.
    f2.create_dataset("X_test", data=X_test)
    f2.create_dataset("y_test", data=y_test)

程序运行效果：