贝叶斯分类器:
#include <string>
#include <vector>
#include <set>
#include <map>
#include <iostream>
#include <iterator>
#include <sstream>
#include <ctime>
#include <bitset>
#include <algorithm>
#include <fstream>
using namespace std;
#include <opencv/cv.h>
#include <opencv/cxcore.h>
#include <opencv/highgui.h>
#include <opencv2/legacy/legacy.hpp>
#include <boost/filesystem.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/tuple/tuple.hpp>
#include <boost/tuple/tuple_comparison.hpp>
namespace {
void GetALlFilesInDir(vector<string> &all_files_, const string &dir_) {
boost::filesystem::path p(dir_);
if (boost::filesystem::exists(p) && boost::filesystem::is_directory(p)) {
for (boost::filesystem::directory_iterator iter(p);
iter != boost::filesystem::directory_iterator(); ++iter) {
boost::filesystem::directory_entry entry = *iter;
if (boost::filesystem::is_regular(entry.path())) {
all_files_.push_back(entry.path().string());
}
}
}
}
/*!
 * Splits s on the delimiter c, appending each field to v (v is NOT cleared,
 * matching the existing append semantics relied on by BayesClassify::Load).
 * An empty input yields a single empty field; a trailing delimiter yields a
 * trailing empty field ("a," -> {"a", ""}).
 */
void Split(string const &s, char c, vector<string> &v)
{
  string::size_type start = 0;
  string::size_type pos = s.find(c);
  while (pos != string::npos)
  {
    v.push_back(s.substr(start, pos - start));
    start = pos + 1;
    pos = s.find(c, start);
  }
  // The previous revision guarded this with `if (j == string::npos)`,
  // which is always true once the loop exits; the final field is simply
  // appended unconditionally.
  v.push_back(s.substr(start));
}
}
/*!
* 贝叶斯分类器
* P(A|W) * P(W)
* 核心公式: P(W|A) = --------------- 其中A是一种特征,W是一种模式类
* P(A)
*/
/*!
 * Naive-Bayes-style classifier over discrete integer feature vectors.
 * Core formula: P(W|A) = P(A|W) * P(W) / P(A), where A is a feature value
 * and W is a pattern class.
 *
 * Raw counts are stored instead of probabilities:
 *   pw_[w]         = number of training samples of class w
 *   pa_[(i,a)]     = number of samples whose feature i had value a
 *   paw_[(w,i,a)]  = number of class-w samples whose feature i had value a
 */
class BayesClassify{
public:
explicit BayesClassify() : study_data_number_(0){}
// Serializes the learned counts to `file` as four lines:
// sample count / pw_ entries / pa_ entries / paw_ entries.
// Fields inside an entry are comma-separated; entries are space-separated.
void Save(string const &file){
ofstream f(file.c_str());
f << study_data_number_ << endl;
for (auto iter = pw_.begin(); iter != pw_.end(); ++iter) {
f << iter->first << "," << iter->second << " ";
}
f << endl;
for (auto iter = pa_.begin(); iter != pa_.end(); ++iter) {
f << iter->first.first << "," << iter->first.second << "," << iter->second << " ";
}
f << endl;
for (auto iter = paw_.begin(); iter != paw_.end(); ++iter) {
f << iter->first.get<0>() << "," << iter->first.get<1>() << "," << iter->first.get<2>() << "," << iter->second << " ";
}
}
// Restores state previously written by Save(); existing counts are
// discarded first. Entries with an unexpected field count are skipped.
void Load(string const &file){
ifstream f(file.c_str());
f >> study_data_number_;
string line;
vector<string> splited;
getline(f, line); // consume the remainder of the first line
pw_.clear();
pa_.clear();
paw_.clear();
getline(f, line); // line 2: pw_ entries "class,count"
::Split(line, ' ', splited);
for (int i = 0; i < splited.size(); ++i) {
vector<string> number_splited;
::Split(splited[i], ',', number_splited);
if (number_splited.size() == 2) {
pw_[boost::lexical_cast<int>(number_splited[0])] = boost::lexical_cast<int>(number_splited[1]);
}
}
getline(f, line); // line 3: pa_ entries "feature,value,count"
splited.clear();
::Split(line, ' ', splited);
for (int i = 0; i < splited.size(); ++i) {
vector<string> number_splited;
::Split(splited[i], ',', number_splited);
if (number_splited.size() == 3) {
pa_[make_pair(boost::lexical_cast<int>(number_splited[0]), boost::lexical_cast<int>(number_splited[1]))] = boost::lexical_cast<int>(number_splited[2]);
}
}
getline(f, line); // line 4: paw_ entries "class,feature,value,count"
splited.clear();
::Split(line, ' ', splited);
for (int i = 0; i < splited.size(); ++i) {
vector<string> number_splited;
::Split(splited[i], ',', number_splited);
if (number_splited.size() == 4) {
paw_[boost::make_tuple(boost::lexical_cast<int>(number_splited[0]), boost::lexical_cast<int>(number_splited[1]), boost::lexical_cast<int>(number_splited[2]))] = boost::lexical_cast<int>(number_splited[3]);
}
}
}
// Records one training sample: `data` is the feature vector, `type` its class.
void Study(vector<int> const &data, int type){
++study_data_number_;
++pw_[type];
for (int i = 0; i < data.size(); ++i) {
++pa_[make_pair(i, data[i])];
++paw_[boost::make_tuple(type, i, data[i])];
}
}
// Returns the highest-scoring class for `data`, or -1 when every class
// scores zero.
// NOTE(review): per-feature posteriors are SUMMED rather than multiplied,
// i.e. a voting-style approximation rather than the exact naive-Bayes
// product -- confirm this is intentional before "fixing" it.
// NOTE(review): operator[] on the count maps default-inserts zero entries
// for unseen (feature, value) keys, so Classify mutates the maps.
int Classify(vector<int> const &data){
int most_possible_type = -1;
double most_possible = 0;
for (auto iter = pw_.begin(); iter != pw_.end(); ++iter){
int class_type = iter->first;
double current_possible = 0;
for (int i = 0; i < data.size(); ++i) {
if (pa_[make_pair(i, data[i])] != 0) {
// Per-feature score: paw * pw / (pa * N), accumulated over features.
current_possible += ((paw_[boost::make_tuple(class_type, i, data[i])] * pw_[class_type])
/(pa_[make_pair(i, data[i])] * study_data_number_ * 1.0));
}
}
if (current_possible > most_possible) {
most_possible = current_possible;
most_possible_type = class_type;
}
}
return most_possible_type;
}
private:
int study_data_number_; // total number of training samples seen (N)
map<int, int> pw_; // class -> sample count
map<pair<int, int>, int> pa_; // (feature index, feature value) -> count
map<boost::tuple<int, int, int>, int> paw_; // (class, feature index, value) -> count
};
/*!
 * Computes a 7-dimensional integer feature vector for an image: the
 * log-scaled Hu moment invariants of the grayscale image loaded from `file`.
 *
 * Hu moments can be zero or negative; taking std::log on them directly
 * produced -inf/NaN, which is undefined behavior when truncated to int and
 * showed up as -2147483648 entries in the saved study data. The log is now
 * taken on the moment's absolute value, and a zero magnitude maps to 0.
 */
vector<int> GetImageData(string const &file){
  cv::Moments moment = cv::moments(cv::imread(file, 0));
  double hu[7];
  cv::HuMoments(moment, hu);
  vector<int> image_data;
  for (int i = 0; i < 7; ++i) {
    double const magnitude = std::abs(hu[i]);
    // Guard against log(0) == -inf and log(negative) == NaN.
    double const feature = (magnitude > 0) ? std::abs(std::log(magnitude)) : 0.0;
    image_data.push_back(static_cast<int>(feature)); // truncation discretizes the feature
  }
  return image_data;
}
/*!
 * Feeds every image file found in `dir` into the classifier as a training
 * sample of class `type`.
 */
void Study(BayesClassify &classifer, string const dir, int type){
  vector<string> files;
  GetALlFilesInDir(files, dir);
  for (string const &file : files) {
    classifer.Study(GetImageData(file), type);
  }
}
/*!
* 贝叶斯分类器
*/
/*!
 * Trains the Bayes classifier on three image categories, round-trips the
 * learned state through a file, then reports how often the files directly
 * under the dataset root are classified as type 2.
 */
int main(){
  string const path_for_image_to_classifier = "/home/chuanqi/ImageDataset/GoogleImages/";
  string const study_file = "/home/chuanqi/ImageDataset/study_data.txt";
  BayesClassify classifer;
  Study(classifer, path_for_image_to_classifier + "Face/", 0);
  Study(classifer, path_for_image_to_classifier + "Jianqie/", 1);
  Study(classifer, path_for_image_to_classifier + "Shumiao/", 2);
  // Exercise serialization: save the learned counts, then reload them.
  classifer.Save(study_file);
  classifer.Load(study_file);
  vector<string> files;
  GetALlFilesInDir(files, path_for_image_to_classifier);
  int correct_classify_times = 0;
  for (string const &file : files) {
    int const type = classifer.Classify(GetImageData(file));
    cout << type << endl;
    if (type == 2) {
      ++correct_classify_times;
    }
  }
  cout << correct_classify_times * 1.0 / files.size() << endl;
  return 0;
}
学习文件如下:
1041
0,287 1,342 2,412
0,5,14 0,6,484 0,7,543 1,12,1 1,13,14 1,14,56 1,15,184 1,16,335 1,17,201 1,18,73 1,19,43 1,20,42 1,21,43 1,22,22 1,23,12 1,24,7 1,25,4 1,26,2 1,27,1 1,29,1 2,19,9 2,20,13 2,21,21 2,22,55 2,23,61 2,24,71 2,25,66 2,26,94 2,27,104 2,28,117 2,29,136 2,30,109 2,31,76 2,32,51 2,33,26 2,34,8 2,35,11 2,36,6 2,37,3 2,38,1 2,39,1 2,41,1 2,42,1 3,17,1 3,18,1 3,19,1 3,20,9 3,21,16 3,22,39 3,23,63 3,24,77 3,25,93 3,26,116 3,27,100 3,28,143 3,29,103 3,30,96 3,31,62 3,32,44 3,33,42 3,34,16 3,35,8 3,36,6 3,37,2 3,38,1 3,39,1 3,41,1 4,-2147483648,433 4,36,1 4,37,1 4,39,1 4,40,5 4,41,5 4,42,6 4,43,5 4,44,7 4,45,15 4,46,21 4,47,29 4,48,19 4,49,22 4,50,23 4,51,33 4,52,33 4,53,37 4,54,29 4,55,33 4,56,20 4,57,43 4,58,39 4,59,29 4,60,28 4,61,22 4,62,23 4,63,14 4,64,19 4,65,9 4,66,8 4,67,10 4,68,5 4,69,3 4,70,3 4,71,1 4,72,1 4,73,4 4,77,1 4,79,1 5,-2147483648,394 5,26,1 5,27,4 5,28,9 5,29,15 5,30,31 5,31,31 5,32,43 5,33,42 5,34,66 5,35,48 5,36,67 5,37,55 5,38,56 5,39,59 5,40,39 5,41,23 5,42,15 5,43,16 5,44,12 5,45,9 5,46,3 5,47,1 5,51,2 6,-2147483648,542 6,38,1 6,40,1 6,41,1 6,42,5 6,43,3 6,44,5 6,45,7 6,46,12 6,47,15 6,48,17 6,49,24 6,50,22 6,51,24 6,52,23 6,53,20 6,54,24 6,55,24 6,56,24 6,57,25 6,58,38 6,59,31 6,60,21 6,61,26 6,62,22 6,63,16 6,64,12 6,65,16 6,66,12 6,67,8 6,68,8 6,69,4 6,70,3 6,71,1 6,72,1 6,74,1 6,75,1 6,83,1
0,0,5,12 0,0,6,256 0,0,7,19 0,1,12,1 0,1,13,14 0,1,14,47 0,1,15,93 0,1,16,82 0,1,17,30 0,1,18,10 0,1,19,6 0,1,20,2 0,1,21,1 0,1,22,1 0,2,19,7 0,2,20,12 0,2,21,17 0,2,22,53 0,2,23,54 0,2,24,54 0,2,25,28 0,2,26,32 0,2,27,17 0,2,28,7 0,2,29,2 0,2,30,3 0,2,32,1 0,3,18,1 0,3,20,9 0,3,21,14 0,3,22,35 0,3,23,52 0,3,24,58 0,3,25,47 0,3,26,34 0,3,27,20 0,3,28,10 0,3,29,2 0,3,30,4 0,3,31,1 0,4,-2147483648,96 0,4,37,1 0,4,40,5 0,4,41,5 0,4,42,5 0,4,43,5 0,4,44,5 0,4,45,15 0,4,46,19 0,4,47,27 0,4,48,15 0,4,49,15 0,4,50,10 0,4,51,19 0,4,52,13 0,4,53,11 0,4,54,5 0,4,55,5 0,4,56,4 0,4,57,1 0,4,58,3 0,4,59,1 0,4,60,2 0,5,-2147483648,88 0,5,26,1 0,5,27,4 0,5,28,9 0,5,29,12 0,5,30,28 0,5,31,26 0,5,32,34 0,5,33,25 0,5,34,30 0,5,35,13 0,5,36,7 0,5,37,4 0,5,38,5 0,5,42,1 0,6,-2147483648,148 0,6,40,1 0,6,41,1 0,6,42,5 0,6,43,2 0,6,44,5 0,6,45,5 0,6,46,11 0,6,47,13 0,6,48,14 0,6,49,19 0,6,50,16 0,6,51,12 0,6,52,10 0,6,53,5 0,6,54,7 0,6,55,3 0,6,56,3 0,6,57,2 0,6,58,2 0,6,60,1 0,6,61,1 0,6,63,1 1,0,5,1 1,0,6,160 1,0,7,181 1,1,14,6 1,1,15,78 1,1,16,120 1,1,17,53 1,1,18,18 1,1,19,8 1,1,20,20 1,1,21,20 1,1,22,13 1,1,23,3 1,1,24,2 1,1,25,1 1,2,21,3 1,2,22,2 1,2,23,5 1,2,24,5 1,2,25,25 1,2,26,39 1,2,27,51 1,2,28,60 1,2,29,52 1,2,30,39 1,2,31,19 1,2,32,19 1,2,33,10 1,2,34,4 1,2,35,3 1,2,36,2 1,2,37,1 1,2,39,1 1,2,41,1 1,2,42,1 1,3,17,1 1,3,21,1 1,3,22,3 1,3,23,7 1,3,24,12 1,3,25,26 1,3,26,52 1,3,27,48 1,3,28,62 1,3,29,46 1,3,30,34 1,3,31,16 1,3,32,12 1,3,33,12 1,3,34,3 1,3,35,1 1,3,36,4 1,3,39,1 1,3,41,1 1,4,-2147483648,146 1,4,36,1 1,4,44,1 1,4,46,2 1,4,47,1 1,4,48,3 1,4,49,6 1,4,50,7 1,4,51,9 1,4,52,12 1,4,53,20 1,4,54,11 1,4,55,16 1,4,56,9 1,4,57,21 1,4,58,14 1,4,59,10 1,4,60,13 1,4,61,6 1,4,62,9 1,4,63,4 1,4,64,5 1,4,65,2 1,4,66,2 1,4,67,4 1,4,68,3 1,4,70,1 1,4,72,1 1,4,73,1 1,4,77,1 1,4,79,1 1,5,-2147483648,140 1,5,29,1 1,5,30,2 1,5,31,2 1,5,32,8 1,5,33,10 1,5,34,25 1,5,35,18 1,5,36,29 1,5,37,20 1,5,38,23 1,5,39,21 1,5,40,17 1,5,41,8 1,5,42,3 1,5,43,7 1,5,44,2 1,5,45,4 1,5,47,1 1,5,51,1 
1,6,-2147483648,180 1,6,45,1 1,6,46,1 1,6,47,1 1,6,48,3 1,6,49,5 1,6,50,2 1,6,51,5 1,6,52,11 1,6,53,11 1,6,54,8 1,6,55,12 1,6,56,14 1,6,57,13 1,6,58,15 1,6,59,12 1,6,60,10 1,6,61,13 1,6,62,5 1,6,63,4 1,6,64,2 1,6,65,3 1,6,66,4 1,6,67,2 1,6,68,2 1,6,72,1 1,6,74,1 1,6,83,1 2,0,5,1 2,0,6,68 2,0,7,343 2,1,14,3 2,1,15,13 2,1,16,133 2,1,17,118 2,1,18,45 2,1,19,29 2,1,20,20 2,1,21,22 2,1,22,8 2,1,23,9 2,1,24,5 2,1,25,3 2,1,26,2 2,1,27,1 2,1,29,1 2,2,19,2 2,2,20,1 2,2,21,1 2,2,23,2 2,2,24,12 2,2,25,13 2,2,26,23 2,2,27,36 2,2,28,50 2,2,29,82 2,2,30,67 2,2,31,57 2,2,32,31 2,2,33,16 2,2,34,4 2,2,35,8 2,2,36,4 2,2,37,2 2,2,38,1 2,3,19,1 2,3,21,1 2,3,22,1 2,3,23,4 2,3,24,7 2,3,25,20 2,3,26,30 2,3,27,32 2,3,28,71 2,3,29,55 2,3,30,58 2,3,31,45 2,3,32,32 2,3,33,30 2,3,34,13 2,3,35,7 2,3,36,2 2,3,37,2 2,3,38,1 2,4,-2147483648,191 2,4,39,1 2,4,42,1 2,4,44,1 2,4,47,1 2,4,48,1 2,4,49,1 2,4,50,6 2,4,51,5 2,4,52,8 2,4,53,6 2,4,54,13 2,4,55,12 2,4,56,7 2,4,57,21 2,4,58,22 2,4,59,18 2,4,60,13 2,4,61,16 2,4,62,14 2,4,63,10 2,4,64,14 2,4,65,7 2,4,66,6 2,4,67,6 2,4,68,2 2,4,69,3 2,4,70,2 2,4,71,1 2,4,73,3 2,5,-2147483648,166 2,5,29,2 2,5,30,1 2,5,31,3 2,5,32,1 2,5,33,7 2,5,34,11 2,5,35,17 2,5,36,31 2,5,37,31 2,5,38,28 2,5,39,38 2,5,40,22 2,5,41,15 2,5,42,11 2,5,43,9 2,5,44,10 2,5,45,5 2,5,46,3 2,5,51,1 2,6,-2147483648,214 2,6,38,1 2,6,43,1 2,6,45,1 2,6,47,1 2,6,50,4 2,6,51,7 2,6,52,2 2,6,53,4 2,6,54,9 2,6,55,9 2,6,56,7 2,6,57,10 2,6,58,21 2,6,59,19 2,6,60,10 2,6,61,12 2,6,62,17 2,6,63,11 2,6,64,10 2,6,65,13 2,6,66,8 2,6,67,6 2,6,68,6 2,6,69,4 2,6,70,3 2,6,71,1 2,6,75,1
效果如下:
基于线性可分类的二分神经网络:
/*!
* 基于线性可分类的二分神经网络
*/
/*!
 * Two-class perceptron classifier for linearly separable data.
 * Learns a weight vector w_ of feature_dimension + 1 entries, the last one
 * acting as the bias, with the classic perceptron update rule.
 */
class NeuralNetwork {
public:
  explicit NeuralNetwork(int feature_dimension) {
    w_.resize(feature_dimension + 1, 0); // one extra coefficient: the bias term
  }
  /*!
   * Trains on labelled samples until every one is classified correctly.
   * datas[i].first == true  -> sample belongs to class 1
   * datas[i].first == false -> sample belongs to class 2
   * Terminates only for linearly separable data (perceptron convergence
   * theorem); otherwise the loop never ends.
   */
  void Study(vector<pair<bool, vector<double>>> const &datas) {
    auto copy_datas = datas;
    for (size_t i = 0; i < copy_datas.size(); ++i) {
      copy_datas[i].second.push_back(1); // augment each sample with the bias input
    }
    bool complete_correct = true; // did a full pass classify every sample correctly?
    do {
      complete_correct = true;
      for (size_t i = 0; i < copy_datas.size(); ++i) {
        double classify_type = Classify(copy_datas[i].second);
        if (classify_type == 0 || (classify_type > 0) != datas[i].first) {
          // Misclassified: move w_ towards class-1 samples and away from
          // class-2 samples (the latter are negated first).
          complete_correct = false;
          _VectorPlus(w_, (!copy_datas[i].first
                           ? _VectorMulty(copy_datas[i].second, -1)
                           : copy_datas[i].second));
        }
      }
    } while (!complete_correct);
  }
  /*!
   * Scores a feature vector.
   * Accepts either a raw feature vector (feature_dimension entries) or an
   * already-augmented one (feature_dimension + 1 entries, as Study passes
   * internally). Previously a raw vector caused an out-of-bounds read,
   * because _DotProduct iterates over w_, which is one element longer; the
   * bias input is now appended automatically in that case.
   * @retval >0 class 1
   * @retval <0 class 2
   * @retval =0 exactly on the decision boundary
   */
  double Classify(vector<double> const &y) {
    if (y.size() + 1 == w_.size()) {
      vector<double> augmented = y;
      augmented.push_back(1); // supply the bias input the caller omitted
      return _DotProduct(w_, augmented);
    }
    return _DotProduct(w_, y);
  }
private:
  // Dot product of two equally sized vectors (iterates over v1's size).
  double _DotProduct(vector<double> const &v1, vector<double> const &v2){
    double product = 0;
    for (size_t i = 0; i < v1.size(); ++i) {
      product += v1[i] * v2[i];
    }
    return product;
  }
  // In-place element-wise addition: v1 += v2.
  void _VectorPlus(vector<double> &v1, vector<double> const &v2){
    for (size_t i = 0; i < v1.size(); ++i) {
      v1[i] += v2[i];
    }
  }
  // Returns a copy of v1 scaled by m.
  vector<double> _VectorMulty(vector<double> const &v1, double m){
    vector<double> v;
    for (size_t i = 0; i < v1.size(); ++i) {
      v.push_back(v1[i] * m);
    }
    return v;
  }
  vector<double> w_; //!< learned weights; the last entry is the bias
};
/*!
* 二分神经网络
*/
/*!
 * Demonstrates the two-class perceptron on four points that are linearly
 * separated by the plane x = 0.5, then prints predictions for ten random
 * test points.
 */
int main(){
  vector<pair<bool, vector<double>>> datas = {
    make_pair(true,  vector<double>{0, 0}),
    make_pair(true,  vector<double>{0, 1}),
    make_pair(false, vector<double>{1, 0}),
    make_pair(false, vector<double>{1, 1}),
  };
  NeuralNetwork nn(2);
  nn.Study(datas);
  for (int i = 0; i < 10; ++i) {
    vector<double> test = {rand() % 10 / 10.0, rand() % 10 / 10.0};
    cout << "(" << test[0] << ", " << test[1] << "): " << boolalpha << (nn.Classify(test) > 0) << endl;
  }
  return 0;
}
输出如下:
(0.3, 0.6): true
(0.7, 0.5): false
(0.3, 0.5): true
(0.6, 0.2): false
(0.9, 0.1): false
(0.2, 0.7): true
(0, 0.9): true
(0.3, 0.6): true
(0, 0.6): true
(0.2, 0.6): true
可见,这种二分神经网络能够正确地对数据进行分类!
多层前馈神经网络:
/*!
* 多层前馈神经网络
*/
/*!
 * Multi-layer feed-forward neural network trained by on-line
 * back-propagation with sigmoid activations.
 *
 * Fixes over the previous revision:
 *  - layers_ was stored as a const reference member, dangling whenever the
 *    constructor argument did not outlive the network; it is now copied;
 *  - the hidden-layer delta summed h * w_[i+1][j][p] with the weight
 *    indices swapped; the weight from neuron j (layer i) to neuron p
 *    (layer i+1) is w_[i+1][p][j] — the swap only went unnoticed because
 *    the demo's layers all have equal size;
 *  - _Classify called std::distance with reversed arguments, always
 *    producing a non-positive "class index";
 *  - a stray debug print of the maximum activation was removed from the
 *    classification path.
 */
class BPNeuralNetwork{
public:
  /*!
   * Builds the network.
   * @param layers layers[i] is the neuron count of layer i; layers[0] must
   *               equal the input feature dimension.
   * @param alpha  learning rate for the back-propagation weight updates.
   */
  explicit BPNeuralNetwork(vector<int> const &layers, double alpha)
      : layers_(layers), alpha_(alpha) {
    // Random initial weights drawn from {0.0, 0.1, ..., 0.9}.
    w_.resize(layers_.size());
    for (size_t i = 0; i < w_.size(); ++i) {
      w_[i].resize(layers_[i]);
      size_t const previous_lay = (i == 0) ? 0 : i - 1; // layer 0 is fed by the raw input
      for (size_t j = 0; j < w_[i].size(); ++j) {
        w_[i][j].resize(layers_[previous_lay]);
        for (size_t k = 0; k < w_[i][j].size(); ++k) {
          w_[i][j][k] = rand() % 10 / 10.0;
        }
      }
    }
  }
  // Prints every weight, one layer per line, for debugging.
  void DisplayW(){
    for (size_t i = 0; i < w_.size(); ++i) {
      for (size_t j = 0; j < w_[i].size(); ++j) {
        cout << "(";
        for (size_t k = 0; k < w_[i][j].size(); ++k) {
          cout << w_[i][j][k] << ",";
        }
        cout << ") ";
      }
      cout << endl;
    }
    cout << "================================" << endl;
  }
  /*!
   * One on-line training pass: for every (class, feature-vector) pair run a
   * forward pass, then back-propagate the error and adjust the weights.
   * Samples with a negative class or a wrong dimension are skipped.
   */
  void Study(vector<pair<int, vector<double>>> const &datas){
    for (size_t datas_index = 0; datas_index < datas.size(); ++datas_index) {
      int const type = datas[datas_index].first;
      vector<double> const &data = datas[datas_index].second;
      if (type < 0 || data.size() != static_cast<size_t>(layers_[0])) {
        continue; // invalid training sample
      }
      vector<vector<double>> outputs;
      _Classify(data, &outputs); // forward pass, recording every layer's activations
      // h_data[i][j] is the back-propagated delta of neuron j in layer i.
      vector<vector<double>> h_data(layers_.size());
      for (int i = static_cast<int>(layers_.size()) - 1; i >= 0; --i) { // backwards, layer by layer
        for (int j = 0; j < layers_[i]; ++j) { // neuron j of layer i
          // Sigmoid derivative: o * (1 - o).
          double h = outputs[i][j] * (1 - outputs[i][j]);
          if (i == static_cast<int>(layers_.size()) - 1) { // output layer
            // Target activation: 0.95 for the true class, 0.05 otherwise.
            h *= ((type == j ? 0.95 : 0.05) - outputs[i][j]);
          } else { // hidden layer
            double delta_sum = 0;
            for (int p = 0; p < layers_[i + 1]; ++p) {
              // w_[i+1][p][j]: weight from neuron j (layer i) to neuron p (layer i+1).
              delta_sum += h_data[i + 1][p] * w_[i + 1][p][j];
            }
            // NOTE(review): layer i+1's weights were already updated in the
            // previous iteration; textbook back-propagation uses the
            // pre-update weights here.
            h *= delta_sum;
          }
          h_data[i].push_back(h);
          for (int k = 0; k < layers_[std::max(i - 1, 0)]; ++k) {
            // The input that flowed through this weight during the forward pass.
            double const output_value = (i != 0) ? outputs[i - 1][k] : data[k];
            w_[i][j][k] += alpha_ * h_data[i][j] * output_value;
          }
        }
      }
    }
  }
  /*!
   * Classifies a feature vector with the trained network.
   * @return the recognized class index, or -1 when no output reaches 0.95.
   */
  int Classify(vector<double> const &data){
    return _Classify(data, NULL);
  }
private:
  /*!
   * Forward pass; when outputs != NULL the activations of every layer are
   * appended to it (one vector per layer).
   */
  int _Classify(vector<double> const &data, vector<vector<double>> *outputs){
    vector<double> input_i = data;
    for (size_t i = 0; i < layers_.size(); ++i) {
      vector<double> output_i;
      for (int j = 0; j < layers_[i]; ++j) {
        double I = 0;
        for (size_t k = 0; k < input_i.size(); ++k) {
          I += w_[i][j][k] * input_i[k];
        }
        I = 1 / (1 + std::exp(-1 * I)); // sigmoid activation
        output_i.push_back(I);
      }
      if (outputs != NULL) {
        outputs->push_back(output_i);
      }
      input_i = output_i; // this layer's output feeds the next layer
    }
    auto max_iter = std::max_element(input_i.begin(), input_i.end());
    if (*max_iter >= 0.95) {
      // Index of the strongest output neuron (arguments were reversed before).
      return std::distance(input_i.begin(), max_iter);
    } else {
      return -1; // no output is confident enough: recognition failed
    }
  }
  vector<int> layers_;  //!< neuron count per layer (copied to avoid dangling)
  double alpha_;        //!< learning-rate parameter for back-propagation
  vector<vector<vector<double>>> w_; //!< w_[i][j][k]: weight of input k into neuron j of layer i
};
/*!
* 多层前馈神经网络
*/
/*!
 * Trains the BP network on four points separated by x = 0.5, showing the
 * weights before and after training, then prints its predictions for ten
 * random test points.
 */
int main(){
  vector<pair<int, vector<double>>> datas = {
    make_pair(0, vector<double>{0, 0}),
    make_pair(0, vector<double>{0, 1}),
    make_pair(1, vector<double>{1, 0}),
    make_pair(1, vector<double>{1, 1}),
  };
  vector<int> layers = {2, 2, 2};
  BPNeuralNetwork bp(layers, 0.5);
  bp.DisplayW();
  bp.Study(datas);
  bp.DisplayW();
  for (int i = 0; i < 10; ++i) {
    double x = rand() % 10 / 10.0, y = rand() % 10 / 10.0;
    vector<double> data = {x, y};
    printf("(%f, %f) => %d\n", x, y, bp.Classify(data));
  }
  return 0;
}