贝叶斯邮件过滤的第一步是进行中文分词的提取。这里涉及到的第一个技术点就是用map结构建立字典,为后面的垃圾库和合法库做好准备。这里主要介绍字典库的建立:
#include <iostream>
#include <fstream>
#include <sstream>
#include <map>
#include <stdlib.h>
using namespace std;
/*
*Reference:http://blog.csdn.net/qll125596718/article/details/8306767
*/
/*
 * CDictionary: a word dictionary held in a std::map.
 *
 * The constructor fills the map from the text file "wordlexicon.txt";
 * FindWord() then answers whether a given word is one of the stored keys.
 */
class CDictionary
{
public:
CDictionary(); // Default constructor: loads the dictionary from "wordlexicon.txt"; exits the process if the file cannot be opened
~CDictionary(); // Destructor (empty body in its definition)
int FindWord(string w); // Public lookup: returns 1 if w is a key in wordhash, 0 otherwise
private:
string strtmp; // Scratch buffer: one line of the dictionary file while the constructor reads it
string word; // Scratch buffer: the word extracted from the current line
map<string, int> wordhash; // The dictionary itself: each word is a key, inserted with the value 1
map<string, int >::iterator worditer; // Iterator over wordhash; not referenced by the member functions defined in this file
typedef pair<string, int> sipair; // Key/value pair type used when inserting into wordhash
};
/*
 * Builds the dictionary from the file "wordlexicon.txt" (opened with a
 * relative path).
 *
 * The file is read line by line. The first whitespace-delimited token of each
 * line is stored as a key in wordhash with the value 1; anything after that
 * token on the same line is ignored. Lines that contain no token (empty or
 * whitespace-only) are skipped.
 *
 * If the file cannot be opened, a message is written to stderr and the
 * process is terminated with exit(-1).
 */
CDictionary::CDictionary()
{
    // Single source of truth for the path so the error message always names
    // the file that was actually attempted.
    const char* const lexiconPath = "wordlexicon.txt";

    ifstream infile(lexiconPath);
    if (!infile.is_open()) // open failure
    {
        cerr << "Unable to open input file: " << lexiconPath
             << " -- bailing out!" << endl;
        exit(-1);
    }

    // strtmp and word are class members used here as scratch buffers.
    while (getline(infile, strtmp))
    {
        istringstream istr(strtmp);
        if (!(istr >> word))
        {
            // No token on this line. A failed extraction leaves `word`
            // holding its previous contents (empty on the first line), so
            // inserting here would add a bogus "" key or redo the last word.
            continue;
        }
        wordhash.insert(sipair(word, 1));
    }
}
/* Destructor: the body is empty; no explicit cleanup is performed here. */
CDictionary::~CDictionary() {}
/*
 * Looks up a word in the dictionary.
 *
 * w: the word to search for; it must match a stored key exactly.
 * Returns 1 when w is a key of wordhash, 0 when it is not.
 */
int CDictionary::FindWord(string w)
{
    // For a std::map, count() is either 0 (absent) or 1 (present).
    return wordhash.count(w) > 0 ? 1 : 0;
}
字典的结构为
map<string, int> wordhash
这样可以将字典里的所有词语存放在wordhash里面。这里涉及到map的操作,将在下一篇博客里介绍。