深度学习Caffe实战笔记采集数据并预处理
采集数据并进行分类
针对自己的项目,将收集到的数据分为5类,并生成清单文件:
- 使用OpenCV 采集数据;
#include <opencv2/opencv.hpp>
#include <opencv2/highgui/highgui_c.h>
#include <iostream>
using namespace std;
using namespace cv;
//工程目录下的temp文件夹里,与.sln同级的temp文件
string writePath;
int CaptureImage(int num ,string people) {
//选择路径
writePath = "****";
}
VideoCapture capture;
capture.open(1); //调取外部摄像头
//capture.open(0); //调取系统默认摄像头
//定义摄像头的宽高
float width = 320;
float height = 240;
capture.set(CAP_PROP_FRAME_WIDTH, width);
capture.set(CAP_PROP_FRAME_HEIGHT, height);
string name;
namedWindow(people, CV_WINDOW_AUTOSIZE);
int i = 0;
int j = 30000;
Mat frame;
while (1){
capture >> frame; //提前测试摄像头
imshow(people, frame);//显示当前图片
if (waitKey(0) == 32){ //按空格退出当前循环
break;
}
}
waitKey(0);
while (j--) {
if (j %60!= 0){
continue;
}
capture >> frame;
name = writePath + people+to_string(i) + ".jpg";
imwrite(name, frame);
cout << name << endl;
i++;
}
waitKey(0);
return 0;
}
- 将拍摄的5类数据按照3:1的比例分为训练集和测试集(即随机抽取25%的图片移入测试集),使用下面的Python脚本完成。
# -*- coding:UTF-8 -*- #在cmd窗口执行时添加在首行
import os, random, shutil
import sys, getopt
import string
def getDir(argv):
    """Parse the command-line options of the train/test split script.

    Args:
        argv: Argument list without the program name (e.g. ``sys.argv[1:]``).

    Returns:
        A ``(trainPath, testPath, rate)`` tuple. ``rate`` is the float
        default ``0.25`` when ``-r``/``--rate`` is absent, otherwise the raw
        option string exactly as typed; callers convert it with ``float()``.

    Exits with status 2 on an unrecognised option and with status 0 after
    printing the usage text for ``-h``.
    """
    # One usage string for both the help and the error path, so they cannot
    # drift apart (the help text used to omit the -r option).
    usage = '<cmd> -i <trainPath> -o <testPath> -r <rate>'

    trainPath = 'train_dir'  # default training-set directory
    testPath = 'val_dir'     # default test-set directory
    rate = 0.25              # default fraction moved to the test set

    try:
        opts, _ = getopt.getopt(argv, "hi:o:r:", ["ipath=", "opath=", "rate="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ipath"):
            trainPath = arg
        elif opt in ("-o", "--opath"):
            testPath = arg
        elif opt in ("-r", "--rate"):
            rate = arg
    return trainPath, testPath, rate
def moveFile(trainDir, testDir, rate):
    """Move a random fraction of the entries in *trainDir* into *testDir*.

    Args:
        trainDir: Directory whose entries are sampled.
        testDir: Existing directory that receives the sampled entries.
        rate: Fraction to move, as a float or numeric string in ``[0, 1]``.
            The number of entries moved is ``int(count * rate)``.

    Raises:
        ValueError: If *rate* is not numeric or lies outside ``[0, 1]``.
    """
    rate = float(rate)
    # Reject impossible fractions up front; otherwise random.sample() fails
    # later with a message that does not mention the rate at all.
    if not 0.0 <= rate <= 1.0:
        raise ValueError("rate must be between 0 and 1, got %r" % rate)

    pathDir = os.listdir(trainDir)
    filenumber = len(pathDir)
    print("filenumber = ", filenumber)
    picknumber = int(filenumber * rate)
    print("picknumber = ", picknumber)

    for name in random.sample(pathDir, picknumber):
        shutil.move(os.path.join(trainDir, name), os.path.join(testDir, name))
if __name__ == '__main__':
    # Script entry point: validate the directories and the rate, then split.
    train, test, rate = getDir(sys.argv[1:])

    # Both directories must already exist; report failure with a non-zero
    # exit status so calling scripts can detect it.
    if not os.path.isdir(train):
        print("输入路径不存在:", train)
        sys.exit(1)
    if not os.path.isdir(test):
        print("输出路径不存在:", test)
        sys.exit(1)

    # A rate that is not a number, is NaN, or lies outside (0, 1) falls back
    # to the default of 0.25 instead of crashing.
    try:
        tmp = float(rate)
    except ValueError:
        tmp = 0.0
    if not 0.0 < tmp < 1.0:
        rate = 0.25

    print('输入的文件为:', train)
    print('输出的文件为:', test)
    print('捡出比例为:', rate)
    moveFile(train, test, rate)
- 将拍摄的5类数据分别生成 list_file文件 ,基于python平台脚本用来生成list_file文件;
import os
def generate(dir, label):
    """Write ``list.txt`` inside *dir* with one ``<filename> <label>`` line per entry.

    Entries are listed in sorted order. Names ending in ``.txt`` are skipped,
    so a ``list.txt`` left over from an earlier run is not listed itself.

    Args:
        dir: Directory to list; ``list.txt`` is created (or overwritten) in it.
        label: Class label written after every file name; converted with
            ``int()``.
    """
    files = sorted(os.listdir(dir))
    print('****************')
    print('input :', dir)
    print('start...')

    label_text = str(int(label))
    # os.path.join keeps the script portable instead of hard-coding '\\'.
    with open(os.path.join(dir, 'list.txt'), 'w') as listText:
        for file in files:
            # splitext() yields the extension; split() would return the whole
            # file name and the check could never match.
            if os.path.splitext(file)[1] == '.txt':
                continue
            listText.write(file + ' ' + label_text + '\n')

    print('down!')
    print('****************')
if __name__ == '__main__':
    # Call generate() once for every list file that has to be produced.
    generate('normal_train', 0)
- 标签顺序打乱 功能;
# -*- coding:UTF-8 -*- #在cmd窗口执行时添加在首行
import random
def ReadFileDatas(path='./train_list.txt'):
    """Read the list file whose line order is to be shuffled.

    Args:
        path: File to read; defaults to ``./train_list.txt``.

    Returns:
        The lines of the file as a list of strings, with newline characters
        stripped from both ends of each line.
    """
    # Read-only mode is enough here; 'r+' would needlessly demand write access.
    with open(path, 'r') as file:
        FileNamelist = [line.strip('\n') for line in file]
    print('len ( FileNamelist ) = ', len(FileNamelist))
    return FileNamelist
def WriteDatasToFile(listInfo):
    """Append every entry of *listInfo* to ``./train.txt``, one per line.

    Each entry is also echoed to stdout followed by a newline character, so
    the console shows a blank line after every entry.

    Args:
        listInfo: Iterable of strings; each is written followed by ``'\\n'``.
    """
    # Append mode is kept from the original: repeated runs extend the file.
    with open('./train.txt', mode='a') as file_handle:
        for entry in listInfo:
            line = entry + '\n'
            print(line)
            file_handle.write(line)
if __name__ == "__main__":
    # Load the list entries, shuffle them in place, then write them out.
    listFileInfo = ReadFileDatas()
    random.shuffle(listFileInfo)
    WriteDatasToFile(listFileInfo)
- 生成LMDB格式的数据集;
训练集
convert_imageset.exe \
--resize_height=256 --resize_width=256 \ 改变图片大小
--shuffle --backend="lmdb" \ 打乱样本顺序,输出数据库格式为LMDB
train_image\\ 源图片所在文件夹
train_shut.txt 标签文件
trainlmdb 生成的训练集数据目录
测试集
convert_imageset.exe \
--resize_height=256 --resize_width=256 \
--shuffle --backend="lmdb" \
test_image\\ 测试集图片目录
test_shut.txt 测试集标签数据
testlmdb 生成的测试集数据目录
pause
- 生成均值文件;
compute_image_mean.exe trainlmdb image_mean.binaryproto
pause