本篇文章所有数据和代码的下载地址:点击此处
1.问题描述
已知确定类别的数据集:1934个文件,选取0_0.txt(“0”图像,第一个文件,索引0)内容示例如下
00000000000001111000000000000000
00000000000011111110000000000000
00000000001111111111000000000000
00000001111111111111100000000000
00000001111111011111100000000000
00000011111110000011110000000000
00000011111110000000111000000000
00000011111110000000111100000000
00000011111110000000011100000000
00000011111110000000011100000000
00000011111100000000011110000000
00000011111100000000001110000000
00000011111100000000001110000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000011111110000000001111000000
00000011110110000000001111000000
00000011110000000000011110000000
00000001111000000000001111000000
00000001111000000000011111000000
00000001111000000000111110000000
00000001111000000001111100000000
00000000111000000111111000000000
00000000111100011111110000000000
00000000111111111111110000000000
00000000011111111111110000000000
00000000011111111111100000000000
00000000001111111110000000000000
00000000000111110000000000000000
00000000000011000000000000000000
按文件排列顺序,整体类别划分如下
0~188 :表示数字“0”
189~386:表示数字“1”
387~581:表示数字“2”
582~780:表示数字“3”
781~966:表示数字“4”
967~1153:表示数字“5”
1154~1348:表示数字“6”
1349~1549:表示数字“7”
1550~1729:表示数字“8”
1730~1933:表示数字“9”
待识别的测试数据集:946个未知类别的txt文件。
我们现在有已知类型的数据集(trainingDigits目录中),每个.txt文件都包含了一种数字。现在给你一个未知数据集(testDigits目录中),要求把其中每个文件所包含的数字识别出来。
2.思路解析
因数据集特点,结果只会是0~9,故此题可看成多分类问题,简单感知机无法使用(仅能二分类),学过深度学习的很容易解决这类问题,但如果采用机器学习的传统方式,比如k近邻法如何解决这种问题呢?
从上图中大家应该知道kd树等算法是干什么的了吧,其实就是一种建立索引加速查找的方法。创建索引文件后存储下来,下次查找k近邻可以极大提升性能。预处理有很多方式(此处可以再仔细研究一下),我直接采用拉伸拼接的手段。
3.代码详解
(1)根据思路,我们首先对原数据集做预处理,把每一个32*32的txt文件转换成1*1024的向量,统一存储于dataset_sun.dat和testset_sun.dat中。dataConvert.cpp代码如下:
/*author:孙小五(五癫)
*create time:2018/6/30
*version:1.0
*description:0-1数字组成的简单图像格式转换
*note:此程序适用于windows,linux版本以后再补。已预留空间,大家可以自己尝试实现一下
“0”的原数据格式表示(采用.txt文件):
00000000000001111000000000000000
00000000000011111110000000000000
00000000001111111111000000000000
00000001111111111111100000000000
00000001111111011111100000000000
00000011111110000011110000000000
00000011111110000000111000000000
00000011111110000000111100000000
00000011111110000000011100000000
00000011111110000000011100000000
00000011111100000000011110000000
00000011111100000000001110000000
00000011111100000000001110000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000011111110000000001111000000
00000011110110000000001111000000
00000011110000000000011110000000
00000001111000000000001111000000
00000001111000000000011111000000
00000001111000000000111110000000
00000001111000000001111100000000
00000000111000000111111000000000
00000000111100011111110000000000
00000000111111111111110000000000
00000000011111111111110000000000
00000000011111111111100000000000
00000000001111111110000000000000
00000000000111110000000000000000
00000000000011000000000000000000
此程序的目的是:把这32*32的所有txt文件转换成1*1024的向量,并统一存储到文件中(每一行都1024列)。
*/
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#ifdef linux
#include <unistd.h>
#include <dirent.h>
#endif
#ifdef WIN32
//#include "vld.h" //内存泄露检测工具
#include <direct.h>
#include <io.h>
#endif
using namespace std;
/**
 * Recursively collects the paths of all entries found below `path`.
 *
 * Every non-directory entry and every sub-directory (except "." and "..")
 * is appended to `files` as `path\name`; sub-directories are additionally
 * descended into. Only the WIN32 branch is implemented: when WIN32 is not
 * defined the function leaves `files` untouched.
 *
 * @param path  Directory to scan, without a trailing separator.
 * @param files Output list that the discovered paths are appended to.
 */
void getAllFiles(std::string path, std::vector<std::string>& files)
{
#ifdef linux
    // Placeholder: a Linux implementation has not been written yet.
#endif
#ifdef WIN32
    struct _finddata_t fileinfo;
    const std::string pattern = path + "\\*";

    // _findfirst returns an intptr_t search handle. Keeping it in a `long`
    // truncates the value on 64-bit Windows and breaks _findnext/_findclose.
    const intptr_t hFile = _findfirst(pattern.c_str(), &fileinfo);
    if (hFile == -1)
    {
        return;
    }

    do
    {
        const std::string entry = path + "\\" + fileinfo.name;
        if (fileinfo.attrib & _A_SUBDIR)
        {
            // Skip the self/parent links, otherwise the recursion never ends.
            if (strcmp(fileinfo.name, ".") != 0 && strcmp(fileinfo.name, "..") != 0)
            {
                files.push_back(entry);
                getAllFiles(entry, files);
            }
        }
        else
        {
            files.push_back(entry);
        }
    }
    while (_findnext(hFile, &fileinfo) == 0); // 0 on success, -1 when no entries are left
    _findclose(hFile);
#endif
}
/**
 * Reads a rows x cols character image from a text file into one flat buffer.
 *
 * Line terminators ('\n' and '\r') are skipped; every other character is
 * stored in file order. The program terminates with exit(1) when the file
 * cannot be opened, memory cannot be allocated, the dimensions are negative,
 * or the file ends before rows*cols characters were read.
 *
 * @param filename Path of the text file to read.
 * @param rows     Number of image rows.
 * @param cols     Number of image columns.
 * @return malloc-allocated buffer of rows*cols characters (not
 *         NUL-terminated); the caller releases it with free().
 */
char* read_points(const char* filename, int rows, int cols)
{
    if (rows < 0 || cols < 0) {
        printf("Invalid image dimensions.\n");
        exit(1);
    }

    FILE* fin = fopen(filename, "r");
    if (!fin) {
        printf("Cannot open input file.\n");
        exit(1);
    }

    const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols);
    // Request at least one byte so an empty image still yields a valid pointer.
    char* data = static_cast<char*>(malloc(total > 0 ? total : 1));
    if (!data) {
        printf("Cannot allocate memory.\n");
        fclose(fin);
        exit(1);
    }

    size_t count = 0;
    while (count < total)
    {
        const int ch = fgetc(fin);
        if (ch == EOF) {
            // Without this check a short file would never terminate the loop.
            printf("Input file is shorter than expected.\n");
            free(data);
            fclose(fin);
            exit(1);
        }
        if (ch == '\n' || ch == '\r') {
            continue; // line terminators are not pixel data (handles CRLF files too)
        }
        data[count++] = static_cast<char>(ch);
    }

    fclose(fin);
    return data;
}
/**
 * Appends a rows x cols character matrix to a text file.
 *
 * Each matrix row becomes one output line whose elements are separated by a
 * single space. The file is opened in append mode, so repeated calls add
 * further lines to the same file. Nothing is written when the file cannot be
 * opened, `data` is null, or a dimension is not positive.
 *
 * @param filename Output file path.
 * @param data     Flat row-major buffer holding at least rows*cols characters.
 * @param rows     Number of rows to write.
 * @param cols     Number of elements per row.
 */
void write_results(const std::string& filename, char *data, int rows, int cols)
{
    if (data == nullptr || rows <= 0 || cols <= 0)
    {
        return;
    }

    std::ofstream file(filename, std::ios_base::app);
    if (!file.is_open())
    {
        std::cerr << "can not open the file~" << std::endl;
        return; // do not stream into a file that failed to open
    }

    const char* p = data;
    for (int i = 0; i < rows; ++i)
    {
        // All but the last element of a row are followed by a separator...
        for (int j = 0; j < cols - 1; ++j)
        {
            file << *p++ << ' ';
        }
        // ...and the last element closes the line.
        file << *p++ << '\n';
    }
}
/**
 * Flattens every image file found under `directory` into one line of
 * `outputFile` (rows*cols space-separated characters per line).
 */
static void convertDirectory(const std::string& directory, const std::string& outputFile,
                             int rows, int cols)
{
    std::vector<std::string> fileNames;
    getAllFiles(directory, fileNames);
    for (size_t i = 0; i < fileNames.size(); ++i)
    {
        // Read the i-th file; always reading index 0 would repeat the first image.
        char* image = read_points(fileNames[i].c_str(), rows, cols);
        write_results(outputFile, image, 1, rows * cols);
        free(image);
    }
}

/**
 * Converts the 32x32 training and test digit images into the flat
 * 1x1024-per-line files dataset_sun.dat and testset_sun.dat.
 *
 * write_results appends to its target, so these output files should not
 * exist before the program is run.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const int row = 32;
    const int col = 32;

    // Training set
    convertDirectory(".\\digits\\trainingDigits", "dataset_sun.dat", row, col);
    // Test set
    convertDirectory(".\\digits\\testDigits", "testset_sun.dat", row, col);
    return 0;
}
执行结束后,得到1934*1024的dataset_sun.dat和946*1024的testset_sun.dat文件
(2)读取预处理后的文件,建立结构索引(代码中设置具体算法),搜索k近邻,存储索引到results_wu.dat中。(sun.c),源码如下:
//author:孙小五(五癫)
//create time:2018/7/1
//version:1.0
//description:读取预处理后的文件,建立结构索引(代码中设置具体算法),搜索k近邻,存储索引到
//results_wu.dat中。
#include <flann/flann.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Reads rows*cols whitespace-separated numbers from a text file into one
 * flat, row-major float buffer.
 *
 * The program terminates with exit(1) when the file cannot be opened, the
 * dimensions are negative, memory cannot be allocated, or the file holds
 * fewer parsable values than requested.
 *
 * Returns a malloc-allocated buffer of rows*cols floats; the caller releases
 * it with free().
 */
float* read_points(const char* filename, int rows, int cols)
{
    float* data;
    FILE* fin;
    size_t total;
    size_t k;

    if (rows < 0 || cols < 0) {
        printf("Invalid matrix dimensions.\n");
        exit(1);
    }

    fin = fopen(filename,"r");
    if (!fin) {
        printf("Cannot open input file.\n");
        exit(1);
    }

    total = (size_t)rows * (size_t)cols;
    data = (float*) malloc(total * sizeof(float));
    if (!data) {
        printf("Cannot allocate memory.\n");
        fclose(fin);
        exit(1);
    }

    for (k = 0; k < total; ++k) {
        /* fscanf returns the number of converted items; anything but 1 means
           the value is missing or malformed and data[k] was not written. */
        if (fscanf(fin, "%g", &data[k]) != 1) {
            printf("Input file has fewer values than expected.\n");
            free(data);
            fclose(fin);
            exit(1);
        }
    }

    fclose(fin);
    return data;
}
/*
 * Writes a rows x cols integer matrix to a text file, one matrix row per
 * line, each value followed by a single space. An existing file is
 * overwritten. Terminates the program with exit(1) if the file cannot be
 * opened.
 */
void write_results(const char* filename, int *data, int rows, int cols)
{
    FILE* out = fopen(filename, "w");
    int r;
    int c;

    if (out == NULL) {
        printf("Cannot open output file.\n");
        exit(1);
    }

    for (r = 0; r < rows; ++r) {
        for (c = 0; c < cols; ++c) {
            /* row-major lookup instead of a running pointer */
            fprintf(out, "%d ", data[(size_t)r * (size_t)cols + (size_t)c]);
        }
        fputc('\n', out);
    }

    fclose(out);
}
/*
 * Loads the flattened training and test sets, builds a FLANN kd-tree index
 * over the training vectors, looks up the nn nearest training vectors of
 * every test vector and writes their indices to results_wu.dat (one test
 * vector per line).
 *
 * Returns 0 on success and 1 when allocation, index construction or the
 * search fails.
 */
int main(int argc, char** argv)
{
    float* dataset;
    float* testset;
    int nn = 5;
    int* result;
    float* dists;
    struct FLANNParameters p;
    float speedup;
    flann_index_t index_id;
    int rows = 1934;
    int cols = 1024;
    int tcount = 946;
    int status = 0;

    (void)argc;
    (void)argv;

    printf("Reading input data file.\n");
    dataset = read_points("/home/sunxiaowu/Downloads/flann-1.8.4-src/examples/digits/data/dataset_sun.dat", rows, cols);
    printf("Reading test data file.\n");
    testset = read_points("/home/sunxiaowu/Downloads/flann-1.8.4-src/examples/digits/data/testset_sun.dat", tcount, cols);

    result = (int*) malloc(tcount*nn*sizeof(int));
    dists = (float*) malloc(tcount*nn*sizeof(float));
    if (!result || !dists) {
        printf("Cannot allocate memory.\n");
        free(result);
        free(dists);
        free(dataset);
        free(testset);
        return 1;
    }

    p = DEFAULT_FLANN_PARAMETERS;
    p.algorithm = FLANN_INDEX_KDTREE;
    p.trees = 8;
    p.log_level = FLANN_LOG_INFO;
    p.checks = 64;

    printf("Computing index.\n");
    index_id = flann_build_index(dataset, rows, cols, &speedup, &p);
    if (!index_id) {
        /* NOTE(review): assumes a null handle signals failure - confirm against the FLANN C API docs. */
        printf("Cannot build index.\n");
        status = 1;
    } else {
        /* NOTE(review): assumes a non-zero return signals failure - confirm against the FLANN C API docs. */
        if (flann_find_nearest_neighbors_index(index_id, testset, tcount, result, dists, nn, &p) != 0) {
            printf("Nearest neighbor search failed.\n");
            status = 1;
        } else {
            write_results("results_wu.dat",result, tcount, nn);
        }
        flann_free_index(index_id, &p);
    }

    /* The original statement here was `(dataset);`, a no-op that leaked the buffer. */
    free(dataset);
    free(testset);
    free(result);
    free(dists);
    return status;
}
记录linux下sun.c的编译:
编译命令如下(以后专门写一篇介绍linux下c/c++编译过程和实践的文章,本来写的是介绍编译原理的,突然发现..好大范围...)
gcc -o flann-RecongnizeNum_sun sun.c -I /home/sunxiaowu/Downloads/flann-1.8.4-src/src/cpp/ -L/home/sunxiaowu/Downloads/flann-1.8.4-src/build/lib -lflann
上述命令执行后,生成图中
flann-RecongnizeNum_sun的可执行文件。
因使用了flann的动态库,运行程序前需要查看其依赖的动态库是否都可以搜索到,我们使用ldd命令。
动态库libflann.so.1.8搜索不到,我们要添加其搜索路径,方式有很多,我的操作:vim /etc/ld.so.conf.d/flann.conf
加入/home/sunxiaowu/Downloads/flann-1.8.4-src/build/lib,然后运行ldconfig命令即可。再次ldd,发现动态库都搜索得到,运行flann-RecongnizeNum_sun。
结果得到results_wu.dat文件。
(3)统计k近邻中出现次数最多的种类,依此做判定,recong.cpp源码如下:
//author:孙小五(五癫)
//create time:2018/7/1
//version:1.0
//description:读取results_wu.dat,对于每一行,统计k近邻出现次数最多的类别作为其判定结果
#include<iostream>
#include<fstream>
#include<vector>
#include<string>
#include<algorithm>
using namespace std;
/**
 * Reads rows*cols whitespace-separated numbers from a text file into one
 * flat, row-major float buffer.
 *
 * The program terminates with exit(1) when the file cannot be opened, the
 * dimensions are negative, memory cannot be allocated, or the file holds
 * fewer parsable values than requested.
 *
 * @param filename Path of the text file to read.
 * @param rows     Number of matrix rows.
 * @param cols     Number of matrix columns.
 * @return malloc-allocated buffer of rows*cols floats; the caller releases
 *         it with free().
 */
float* read_points(const char* filename, int rows, int cols)
{
    if (rows < 0 || cols < 0) {
        printf("Invalid matrix dimensions.\n");
        exit(1);
    }

    FILE* fin = fopen(filename,"r");
    if (!fin) {
        printf("Cannot open input file.\n");
        exit(1);
    }

    const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols);
    float* data = static_cast<float*>(malloc(total * sizeof(float)));
    if (!data) {
        printf("Cannot allocate memory.\n");
        fclose(fin);
        exit(1);
    }

    for (size_t k = 0; k < total; ++k) {
        // fscanf returns the number of converted items; anything but 1 means
        // the value is missing or malformed and data[k] was not written.
        if (fscanf(fin, "%g", &data[k]) != 1) {
            printf("Input file has fewer values than expected.\n");
            free(data);
            fclose(fin);
            exit(1);
        }
    }

    fclose(fin);
    return data;
}
// Ordering predicate for sorting (label, count) pairs by descending count:
// true when s1 has a strictly larger count than s2.
bool greater_sun(const std::pair<char, int>& s1, const std::pair<char, int>& s2)
{
    const int lhsCount = s1.second;
    const int rhsCount = s2.second;
    return rhsCount < lhsCount;
}
/**
 * Classifies every row of a nearest-neighbour index file by majority vote.
 *
 * Each row of the file holds `cols` training-set indices. An index is mapped
 * to its digit via the fixed index ranges of the training set (0-188 is "0",
 * 189-386 is "1", ..., 1730-1933 is "9"); the digit with the most votes is
 * the result for that row. On a tie the smallest digit wins. Indices outside
 * 0..1933 carry no vote.
 *
 * The program terminates with exit(1) when the file cannot be opened or
 * holds fewer than rows*cols integers.
 *
 * @param filename Text file with rows*cols whitespace-separated integers.
 * @param rows     Number of rows (test samples) to classify.
 * @param cols     Number of neighbour indices per row.
 * @return One character '0'..'9' per row, in file order.
 */
vector<char> Recongnize(const char* filename, int rows, int cols)
{
    // Last (inclusive) training-set index belonging to each digit 0..9.
    static const int kLastIndex[10] = {188, 386, 581, 780, 966, 1153, 1348, 1549, 1729, 1933};
    const int* const kEnd = kLastIndex + 10;

    std::vector<char> result;
    std::ifstream file(filename);
    if (!file.is_open())
    {
        std::cerr << "open file error" << std::endl;
        exit(1); // continuing would only read from a dead stream
    }

    for (int i = 0; i < rows; ++i)
    {
        int votes[10] = {0};
        for (int j = 0; j < cols; ++j)
        {
            int index = 0;
            if (!(file >> index))
            {
                std::cerr << "read file error" << std::endl;
                exit(1);
            }
            if (index < 0)
            {
                continue; // not a valid training index; must not count as digit 0
            }
            // The first range end that is >= index identifies the digit.
            const int* hit = std::lower_bound(kLastIndex, kEnd, index);
            if (hit != kEnd)
            {
                ++votes[hit - kLastIndex];
            }
        }
        // max_element returns the first maximum, so ties go to the smaller digit.
        const int* best = std::max_element(votes, votes + 10);
        result.push_back(static_cast<char>('0' + (best - votes)));
    }
    return result;
}
/**
 * Prints the first ten test images (32x32 values each, read from the
 * flattened test set) together with the digit recognised for each of them.
 */
int main()
{
    const int kShown = 10; // number of samples displayed
    const int kSide = 32;  // rows and columns printed per image

    float* pixels = read_points("./digits/data/testset_sun.dat",946,1024);
    vector<char> labels = Recongnize("results_wu.dat", 946, 5);

    cout << "master,the result of my identification is as follows:" << endl;
    for (int img = 0; img < kShown; ++img)
    {
        cout << "要识别的图像" << img << ":" << endl;
        // Each image occupies kSide*kSide consecutive values of the buffer.
        const float* image = pixels + img * kSide * kSide;
        for (int r = 0; r < kSide; ++r)
        {
            const float* line = image + r * kSide;
            for (int c = 0; c < kSide; ++c)
            {
                cout << line[c];
            }
            cout << endl;
        }
        cout << "识别结果:" << labels[img] << endl;
    }

    free(pixels);
    return 0;
}
4.运行结果部分截图展示
成功~~~
更多内容,可关注小五的个人计算机学习网站-趣玩极乐网(www.sunxiaowu.top)