// train.txt format: label\tword word ...
/*************************************************************************
> File Name: bayesNB.cpp
> Author: test
************************************************************************/
#include<iostream>
#include<unordered_map>
#include<fstream>
#include<vector>
#include<string.h>
#include <boost/algorithm/string.hpp>
#include <assert.h>
#include<time.h>
#include<math.h>
#include<algorithm>
using namespace std;
// Reads a labelled corpus from `src_file` and appends it to `data` / `classVec`.
//
// File format, one sample per line:   class<sep>w1<sep>w2<sep>w3...
// where <sep> is a single space or tab. For every accepted line the words are
// appended to `data` and the class to `classVec`, so data[i] and classVec[i]
// always describe the same sample.
//
// Notes:
//  - Every non-empty line must hold a class and at least one word field (asserted).
//  - Each separator character ends a field, so two consecutive separators yield
//    an empty word; empty words are stored unchanged.
//  - Empty lines are skipped and a trailing '\r' (CRLF files) is dropped so it
//    does not become part of the last word.
//  - If the file cannot be opened, a warning is written to stderr and nothing
//    is appended.
void loadData(const std::string &src_file, std::vector<std::vector<std::string> > &data, std::vector<std::string> &classVec){
    // Input-only stream: a plain fstream opens for reading AND writing by
    // default, which fails on read-only files.
    std::ifstream fin(src_file.c_str());
    if (!fin.is_open()) {
        // A stream that failed to open never reaches eof(), so looping on
        // eof() would spin forever; bail out instead.
        std::cerr << "loadData: cannot open '" << src_file << "'\n";
        return;
    }

    const char *separators = " \t";
    std::string line;
    while (std::getline(fin, line)) {
        if (!line.empty() && line[line.size() - 1] == '\r') line.erase(line.size() - 1);
        if (line.empty()) continue;

        // Split at every separator character, keeping empty fields.
        std::vector<std::string> fields;
        std::string::size_type from = 0;
        for (;;) {
            const std::string::size_type cut = line.find_first_of(separators, from);
            if (cut == std::string::npos) {
                fields.push_back(line.substr(from));
                break;
            }
            fields.push_back(line.substr(from, cut - from));
            from = cut + 1;
        }

        // make sure the line has a class plus at least one word field
        assert(fields.size() >= 2);
        classVec.push_back(fields[0]);
        data.push_back(std::vector<std::string>(fields.begin() + 1, fields.end()));
    }
}
// Collects the vocabulary of `data`.
//
//  data         : tokenised samples (one vector of words per sample); taken by
//                 const reference so the corpus is not copied on every call.
//  vcab         : receives every distinct non-empty word as a key with value 0;
//                 keys already present keep their current value.
//  ordered_vcab : afterwards every key of `vcab` is appended here. The order is
//                 the iteration order of the unordered_map, i.e. arbitrary but
//                 fixed for this list, which is what gives the words an index.
void getDictFromData(const std::vector<std::vector<std::string> > &data, std::unordered_map<std::string, float> &vcab, std::vector<std::string> &ordered_vcab){
    for (const std::vector<std::string> &sample : data) {
        for (const std::string &word : sample) {
            // empty tokens come from consecutive separators in the input
            if (!word.empty()) vcab.insert({word, 0.0f});
        }
    }
    ordered_vcab.reserve(ordered_vcab.size() + vcab.size());
    for (const auto &entry : vcab) ordered_vcab.push_back(entry.first);
}
// Turns tokenised samples into presence vectors over the vocabulary.
//
//  data         : samples, one vector of words each.
//  vcab         : the known vocabulary (only its keys are used).
//  trainMat     : one row is appended per sample; row[k] is 1 when the word
//                 ordered_vcab[k] occurs in the sample and is a key of `vcab`,
//                 otherwise 0.
//  ordered_vcab : defines the column order of the rows.
//  isTest       : false -> every non-empty word must be in `vcab` (asserted);
//                 true  -> unknown words are silently ignored.
//
// Empty words are always ignored. The inputs are taken by const reference so
// the corpus and vocabulary are not copied on every call.
void getMat(const std::vector<std::vector<std::string> > &data, const std::unordered_map<std::string, float> &vcab, std::vector<std::vector<float> > &trainMat, const std::vector<std::string> &ordered_vcab, bool isTest) {
    // Scratch presence flags, one per vocabulary word, reused for every sample.
    std::unordered_map<std::string, float> present;
    present.reserve(vcab.size());
    for (const auto &entry : vcab) present.emplace(entry.first, 0.0f);

    trainMat.reserve(trainMat.size() + data.size());
    for (const std::vector<std::string> &sample : data) {
        // 1) flag the words that occur in this sample
        for (const std::string &word : sample) {
            if (word.empty()) continue;
            auto hit = present.find(word);
            if (!isTest) assert(hit != present.end());
            if (hit != present.end()) hit->second = 1.0f;
        }

        // 2) read the flags out in vocabulary order
        std::vector<float> row;
        row.reserve(ordered_vcab.size());
        for (const std::string &word : ordered_vcab) {
            // find() rather than operator[]: a lookup must never add new keys
            // to the scratch map, otherwise unknown words would start to count
            // as vocabulary for the following samples.
            auto hit = present.find(word);
            row.push_back(hit == present.end() ? 0.0f : hit->second);
        }
        trainMat.push_back(row);

        // 3) clear only the flags this sample touched
        for (const std::string &word : sample) {
            auto hit = present.find(word);
            if (hit != present.end()) hit->second = 0.0f;
        }
    }
}
// Estimates the Naive Bayes parameters from the training matrix.
//
//  classVec : class label of every training sample.
//  trainMat : feature row of every training sample (same length as classVec,
//             all rows expected to have the same width).
//  probs    : class -> per-word weights. Each class starts with a count of 1
//             for every word (so no word ends up with weight 0), the rows of
//             that class are added on top, and the vector is finally divided
//             by its own sum.
//  prob     : class -> class weight. Each class starts at 1, is incremented
//             once per sample, and all values are finally divided by their sum.
//
// Size preconditions are asserted; with asserts disabled an empty or
// mismatched input leaves `probs` and `prob` untouched.
void getClassifyVec(const std::vector<std::string> &classVec, const std::vector<std::vector<float> > &trainMat,
        std::unordered_map<std::string, std::vector<float> > &probs, std::unordered_map<std::string, float> &prob) {
    assert(classVec.size() == trainMat.size());
    assert(classVec.size() > 0);
    if (classVec.empty() || classVec.size() != trainMat.size()) return;

    const std::size_t nb_dict = trainMat[0].size();
    for (std::size_t i = 0; i < classVec.size(); ++i) {
        const std::string &label = classVec[i];
        const std::vector<float> &row = trainMat[i];
        assert(row.size() == nb_dict);

        auto seen = probs.find(label);
        if (seen == probs.end()) {
            // First sample of this class: 1 (smoothing) + 1 (this sample).
            prob.insert({label, 2.0f});
            std::vector<float> counts;
            counts.reserve(row.size());
            for (const float value : row) counts.push_back(value + 1.0f);
            probs.emplace(label, counts);
        } else {
            prob[label] += 1;
            // One lookup per sample instead of one per matrix element.
            std::vector<float> &counts = seen->second;
            const std::size_t width = std::min(std::min(counts.size(), row.size()), nb_dict);
            for (std::size_t j = 0; j < width; ++j) counts[j] += row[j];
        }
    }

    // normalize the class weights
    float total = 0.0f;
    for (const auto &entry : prob) total += entry.second;
    for (auto &entry : prob) entry.second /= total;

    // normalize the word weights of every class
    for (auto &entry : probs) {
        float sum = 0.0f;
        for (const float value : entry.second) sum += value;
        for (float &value : entry.second) value /= sum;
    }
}
// Scores every test row against every class.
//
//  testMat    : feature rows, laid out in the same word order as the vectors
//               stored in `probs`.
//  predictMat : one row is appended per test row; entry c is the score of
//               classes[c]:  log(prob[class]) + sum_j log(probs[class][j]) * row[j].
//               Logs are used to avoid underflow from multiplying many small
//               numbers.
//  classes    : the labels of `classVec` are added to it, then it is sorted
//               and de-duplicated; its order defines the columns of predictMat.
//  classVec   : class labels seen during training.
//  probs/prob : per-class word weights and class weights.
//
// A class missing from `prob` is scored with weight 0 (log gives -inf); a
// class missing from `probs` is treated as having an empty weight vector and
// trips the width assertion below.
void predict(const std::vector<std::vector<float> > &testMat, std::vector<std::vector<float> > &predictMat, std::vector<std::string> &classes, const std::vector<std::string> &classVec,
        const std::unordered_map<std::string, std::vector<float> > &probs, const std::unordered_map<std::string, float> &prob) {
    // init classes from classVec (skipped if both names refer to the same vector)
    if (&classes != &classVec) classes.insert(classes.end(), classVec.begin(), classVec.end());
    std::sort(classes.begin(), classes.end());
    classes.erase(std::unique(classes.begin(), classes.end()), classes.end());

    // The logs depend only on the class, so take them once per class instead
    // of once per (test row, class, word).
    std::vector<float> logPrior;
    std::vector<std::vector<float> > logWeights;
    logPrior.reserve(classes.size());
    logWeights.reserve(classes.size());
    for (const std::string &label : classes) {
        const auto priorIt = prob.find(label);
        const float prior = (priorIt == prob.end()) ? 0.0f : priorIt->second;
        logPrior.push_back(static_cast<float>(log(static_cast<double>(prior))));

        std::vector<float> logs;
        const auto weightIt = probs.find(label);
        if (weightIt != probs.end()) {
            logs.reserve(weightIt->second.size());
            for (const float weight : weightIt->second)
                logs.push_back(static_cast<float>(log(static_cast<double>(weight))));
        }
        logWeights.push_back(logs);
    }

    const std::size_t nb_rows = testMat.size();
    predictMat.reserve(predictMat.size() + nb_rows);
    for (std::size_t i = 0; i < nb_rows; ++i) {
        std::vector<float> scores;
        scores.reserve(classes.size());
        for (std::size_t c = 0; c < classes.size(); ++c) {
            const std::vector<float> &features = testMat[i];
            const std::vector<float> &weights = logWeights[c];
            assert(weights.size() == features.size());
            const std::size_t width = std::min(weights.size(), features.size());
            float score = 0.0f;
            for (std::size_t j = 0; j < width; ++j) {
                // A zero feature contributes nothing; skipping it also keeps
                // 0 * log(0) from turning the score into NaN.
                if (features[j] != 0.0f) score += weights[j] * features[j];
            }
            score += logPrior[c];
            scores.push_back(score);
        }
        predictMat.push_back(scores);
    }
}
// Picks the best-scoring class for every row of `predictMat`.
//
//  predictMat     : one row of scores per sample; column c belongs to classes[c].
//  classes        : class labels in column order.
//  predictClasses : one label is appended per row, namely the class of the
//                   first maximal score in that row.
//
// A row with no scores, or whose best column has no matching entry in
// `classes`, yields an empty label so that predictClasses stays aligned with
// the rows of predictMat.
void predictClass(const std::vector<std::vector<float> > &predictMat, const std::vector<std::string> &classes,
        std::vector<std::string> &predictClasses) {
    predictClasses.reserve(predictClasses.size() + predictMat.size());
    for (const std::vector<float> &scores : predictMat) {
        // max_element returns the first of several equal maxima
        const std::size_t best =
            static_cast<std::size_t>(std::max_element(scores.begin(), scores.end()) - scores.begin());
        if (scores.empty() || best >= classes.size()) {
            predictClasses.push_back(std::string());
        } else {
            predictClasses.push_back(classes[best]);
        }
    }
}
// Trains the classifier on a training file, scores a validation file and
// prints the elapsed wall-clock time (whole seconds, as reported by time()).
//
// Usage: program [train_file [test_file]]
// Without arguments the files "train1" and "val1" in the working directory
// are used.
//
// Note: the predicted labels end up in `predictClasses` and the true labels of
// the validation file in `testClassVec`; neither is printed or compared here.
int main(int argc, char *argv[]){
    time_t start, end;
    time(&start);

    const std::string src_file = (argc > 1) ? argv[1] : "train1";
    const std::string test_file = (argc > 2) ? argv[2] : "val1";

    std::vector<std::vector<std::string> > data, testData;
    std::vector<std::vector<float> > trainMat, testMat, predictMat;
    std::vector<std::string> classVec, testClassVec, classes, predictClasses;
    std::vector<std::string> ordered_vcab;
    std::unordered_map<std::string, float> vcab;
    std::unordered_map<std::string, std::vector<float> > probs;
    std::unordered_map<std::string, float> prob;

    // --- training ---
    loadData(src_file, data, classVec);
    if (data.empty()) {
        // Nothing to train on; the steps below require at least one sample.
        std::cerr << "no training samples read from '" << src_file << "'\n";
        return 1;
    }
    getDictFromData(data, vcab, ordered_vcab);
    // training mode: every word must be part of the vocabulary just built
    getMat(data, vcab, trainMat, ordered_vcab, false);
    getClassifyVec(classVec, trainMat, probs, prob);

    // --- prediction ---
    loadData(test_file, testData, testClassVec);
    // test mode: words outside the vocabulary are ignored
    getMat(testData, vcab, testMat, ordered_vcab, true);
    predict(testMat, predictMat, classes, classVec, probs, prob);
    predictClass(predictMat, classes, predictClasses);
    //for (auto iter=predictClasses.begin(); iter != predictClasses.end(); ++iter) cout<<*iter<<"\n";

    time(&end);
    std::cout<<"running time: "<<difftime(end, start)<<"\n";
    return 0;
}