Qt 在新版本中整合了 QtCLucene(CLucene)组件,但单位机器上的 Qt 版本不够新,又懒得升级,所以先研究一下中文切分词部分。
目的:对文档进行中文切分,并根据词频高低排序。
选择LibMMSeg 作为切分词组件。
下载源码,
./bootstrap
./configure --prefix=/opt/mmseg
make
make install
完毕,
在qtcreator中新建项目,因为mmseg只提供了客户端开发文档,并未提及切分词部分的调用方法,暂时从 source code : src/mmseg_main.cpp 顺藤摸瓜。
以下代码为实现Demo:
项目文件:
- QT += core webkit network
- SOURCES += main.cpp \
- mmseg_main.cpp
- HEADERS +=
- LIBS += -L/opt/mmseg/lib -lmmseg
- INCLUDEPATH = /opt/mmseg/include/
# Qt modules enabled for this demo project.
QT += core webkit network
# Translation units compiled by qmake.
# NOTE(review): main.cpp also does #include "mmseg_main.cpp", so compiling
# mmseg_main.cpp as a separate source here looks like it would define
# segment()/addmap()/cmp()/sortMapByValue() twice at link time - confirm and
# keep only one of the two.
SOURCES += main.cpp \
mmseg_main.cpp
HEADERS +=
# Link against libmmseg from the /opt/mmseg install prefix.
LIBS += -L/opt/mmseg/lib -lmmseg
# Header search path for the mmseg/*.h includes; plain '=' replaces any
# previously assigned INCLUDEPATH value instead of appending to it.
INCLUDEPATH = /opt/mmseg/include/
main.cpp
- #include <QApplication>
- #include "mmseg_main.cpp"
- int main(int argc, char *argv[])
- {
- QApplication app(argc, argv);
- string dictpath = "/opt/mmseg/etc/";
- string content = "全文检索是数据库的有力补充,全文检索并不能替代数据库在应用系统中的作用。当应用系统的数据以大量的文本信息为主时,采用全文检索技术可以极大的提升应用系统的价值。 ";
- map<string, int> m = segment(content, dictpath);
- vector<pair<string, int> > v;
- sortMapByValue(m, v);
- for (int i = 0; i < 50; ++i)
- cout<<v[i].second<<"\t"<<v[i].first<<endl;
- return 0;
- // return app.exec();
- }
#include <QApplication>
#include "mmseg_main.cpp"
/**
 * Demo entry point.
 *
 * Segments a fixed Chinese sample sentence with the dictionary found under
 * /opt/mmseg/etc/ and prints the most frequent tokens, one per line, as
 * "<count>\t<token>" with the highest count first.
 *
 * @return always 0; the Qt event loop is intentionally not started.
 */
int main(int argc, char *argv[])
{
    QApplication app(argc, argv);
    string dictpath = "/opt/mmseg/etc/";
    string content = "全文检索是数据库的有力补充,全文检索并不能替代数据库在应用系统中的作用。当应用系统的数据以大量的文本信息为主时,采用全文检索技术可以极大的提升应用系统的价值。 ";
    map<string, int> m = segment(content, dictpath);
    vector<pair<string, int> > v;
    sortMapByValue(m, v);
    // Show at most the top 50 entries. A short text produces fewer distinct
    // tokens than that, so the bound must also respect v.size(); indexing
    // v[i] past the end is undefined behaviour.
    const size_t shown = v.size() < 50 ? v.size() : 50;
    for (size_t i = 0; i < shown; ++i)
        cout << v[i].second << "\t" << v[i].first << endl;
    return 0;
    // return app.exec();
}
mmseg_main.cpp
- #include <fstream>
- #include <string>
- #include <iostream>
- #include <cstdio>
- #include <map>
- #include <vector>
- #include <algorithm>
- #include <stdlib.h>
- #include "mmseg/UnigramCorpusReader.h"
- #include "mmseg/UnigramDict.h"
- #include "mmseg/SynonymsDict.h"
- #include "mmseg/ThesaurusDict.h"
- #include "mmseg/SegmenterManager.h"
- #include "mmseg/Segmenter.h"
- using namespace std;
- using namespace css;
- #define SEGMENT_OUTPUT 1
- map<string, int> segment(string content, string dictpath);
- void addmap(const char* str, int len, map<string, int> &map);
- void sortMapByValue(map<string,int>& tMap, vector<pair<string,int> >& tVector);
- map<string, int> segment(string content, string dictpath)
- {
- map<string, int> map;
- //init segmenter
- const char* dict_path = dictpath.c_str();
- SegmenterManager* mgr = new SegmenterManager();
- mgr->init(dict_path);
- Segmenter* seg = mgr->getSegmenter();
- //load data.
- int length = content.length();
- const char* buffer = content.c_str();
- //begin seg
- seg->setBuffer((u1*)buffer,length);
- u2 len = 0, symlen = 0;
- u2 kwlen = 0, kwsymlen = 0;
- //check 1st token.
- char txtHead[3] = {239,187,191};
- char* tok = (char*)seg->peekToken(len, symlen);
- seg->popToken(len);
- if(seg->isSentenceEnd()){
- do {
- char* kwtok = (char*)seg->peekToken(kwlen , kwsymlen,1);
- if(kwsymlen)
- // printf("[kw]%*.*s/x ",kwsymlen,kwsymlen,kwtok);
- addmap(kwtok, kwsymlen, map);
- }while(kwsymlen);
- }
- if(len == 3 && memcmp(tok,txtHead,sizeof(char)*3) == 0){
- //check is 0xFEFF
- //do nothing
- }else{
- // printf("%*.*s/x ",symlen,symlen,tok);
- addmap(tok, symlen, map);
- }
- while(1){
- len = 0;
- char* tok = (char*)seg->peekToken(len,symlen);
- if(!tok || !*tok || !len)
- break;
- seg->popToken(len);
- if(seg->isSentenceEnd()){
- do {
- char* kwtok = (char*)seg->peekToken(kwlen , kwsymlen,1);
- if(kwsymlen)
- // printf("[kw]%*.*s/x ",kwsymlen,kwsymlen,kwtok);
- addmap(kwtok, kwsymlen, map);
- }while(kwsymlen);
- }
- if(*tok == '\r')
- continue;
- if(*tok == '\n'){
- printf("\n");
- continue;
- }
- //printf("[%d]%*.*s/x ",len,len,len,tok);
- // printf("%*.*s/x ",symlen,symlen,tok);
- // char word[20];
- // sprintf(word, "%*.*s",symlen,symlen,tok);
- // printf("%s/[%d-%d] ", word,symlen,symlen);
- addmap(tok, symlen, map);
- //check thesaurus
- {
- const char* thesaurus_ptr = seg->thesaurus(tok, symlen);
- while(thesaurus_ptr && *thesaurus_ptr) {
- len = strlen(thesaurus_ptr);
- // printf("%*.*s/s ",len,len,thesaurus_ptr);
- addmap(thesaurus_ptr, len, map);
- thesaurus_ptr += len + 1; //move next
- }
- }
- //printf("%s",tok);
- }
- delete mgr;
- return map;
- }
- void addmap(const char* str, int len, map<string, int> &map)
- {
- char word[len];
- sprintf(word, "%*.*s", len, len, str);
- map[word]++;
- }
- int cmp(const pair<string,int>& a, const pair<string,int>& b)
- {
- return a.second > b.second;
- }
- void sortMapByValue(map<string,int>& tMap, vector<pair<string,int> >& tVector)
- {
- for(map<string,int>::iterator curr = tMap.begin();curr != tMap.end(); curr++)
- {
- tVector.push_back(make_pair(curr->first, curr->second));
- }
- sort(tVector.begin(),tVector.end(),cmp);
- }
#include <stdlib.h>

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "mmseg/UnigramCorpusReader.h"
#include "mmseg/UnigramDict.h"
#include "mmseg/SynonymsDict.h"
#include "mmseg/ThesaurusDict.h"
#include "mmseg/SegmenterManager.h"
#include "mmseg/Segmenter.h"
using namespace std;
using namespace css;
#define SEGMENT_OUTPUT 1
map<string, int> segment(string content, string dictpath);
void addmap(const char* str, int len, map<string, int> &map);
void sortMapByValue(map<string,int>& tMap, vector<pair<string,int> >& tVector);
map<string, int> segment(string content, string dictpath)
{
map<string, int> map;
//init segmenter
const char* dict_path = dictpath.c_str();
SegmenterManager* mgr = new SegmenterManager();
mgr->init(dict_path);
Segmenter* seg = mgr->getSegmenter();
//load data.
int length = content.length();
const char* buffer = content.c_str();
//begin seg
seg->setBuffer((u1*)buffer,length);
u2 len = 0, symlen = 0;
u2 kwlen = 0, kwsymlen = 0;
//check 1st token.
char txtHead[3] = {239,187,191};
char* tok = (char*)seg->peekToken(len, symlen);
seg->popToken(len);
if(seg->isSentenceEnd()){
do {
char* kwtok = (char*)seg->peekToken(kwlen , kwsymlen,1);
if(kwsymlen)
// printf("[kw]%*.*s/x ",kwsymlen,kwsymlen,kwtok);
addmap(kwtok, kwsymlen, map);
}while(kwsymlen);
}
if(len == 3 && memcmp(tok,txtHead,sizeof(char)*3) == 0){
//check is 0xFEFF
//do nothing
}else{
// printf("%*.*s/x ",symlen,symlen,tok);
addmap(tok, symlen, map);
}
while(1){
len = 0;
char* tok = (char*)seg->peekToken(len,symlen);
if(!tok || !*tok || !len)
break;
seg->popToken(len);
if(seg->isSentenceEnd()){
do {
char* kwtok = (char*)seg->peekToken(kwlen , kwsymlen,1);
if(kwsymlen)
// printf("[kw]%*.*s/x ",kwsymlen,kwsymlen,kwtok);
addmap(kwtok, kwsymlen, map);
}while(kwsymlen);
}
if(*tok == '\r')
continue;
if(*tok == '\n'){
printf("\n");
continue;
}
//printf("[%d]%*.*s/x ",len,len,len,tok);
// printf("%*.*s/x ",symlen,symlen,tok);
// char word[20];
// sprintf(word, "%*.*s",symlen,symlen,tok);
// printf("%s/[%d-%d] ", word,symlen,symlen);
addmap(tok, symlen, map);
//check thesaurus
{
const char* thesaurus_ptr = seg->thesaurus(tok, symlen);
while(thesaurus_ptr && *thesaurus_ptr) {
len = strlen(thesaurus_ptr);
// printf("%*.*s/s ",len,len,thesaurus_ptr);
addmap(thesaurus_ptr, len, map);
thesaurus_ptr += len + 1; //move next
}
}
//printf("%s",tok);
}
delete mgr;
return map;
}
/**
 * Increments the occurrence count of one token.
 *
 * @param str  start of the token bytes; does not need to be NUL-terminated.
 * @param len  maximum number of bytes of str that belong to the token.
 * @param map  token -> count table to update.
 *
 * At most len bytes are read and reading stops at an embedded NUL. A null
 * pointer, a non-positive length or an empty token is ignored.
 */
void addmap(const char* str, int len, std::map<std::string, int> &map)
{
    if (!str || len <= 0)
        return;

    // Build the key directly instead of sprintf-ing into a char[len] buffer:
    // that buffer had no room for the terminating NUL and overflowed by one
    // byte on every call.
    int used = 0;
    while (used < len && str[used] != '\0')
        ++used;
    if (used == 0)
        return;

    ++map[std::string(str, static_cast<std::string::size_type>(used))];
}
/**
 * Ordering predicate for (token, count) pairs.
 *
 * @return 1 when a has a strictly larger count than b, otherwise 0.
 */
int cmp(const std::pair<std::string,int>& a, const std::pair<std::string,int>& b)
{
    return (b.second < a.second) ? 1 : 0;
}

/**
 * Appends every (key, value) entry of tMap to tVector and then sorts the
 * whole vector by value, largest first.
 *
 * Elements already present in tVector are kept and take part in the sort.
 * The relative order of entries with equal values is whatever std::sort
 * produces.
 */
void sortMapByValue(std::map<std::string,int>& tMap, std::vector<std::pair<std::string,int> >& tVector)
{
    // Bulk-copy the map entries; each pair<const string,int> converts to
    // the vector's pair<string,int> element type.
    tVector.insert(tVector.end(), tMap.begin(), tMap.end());
    std::sort(tVector.begin(), tVector.end(), cmp);
}
输出结果:
- 6 的
- 3 应用
- 3 检索
- 3 全文
- 3 系统
- 2 数据库
- 2 ,
- 2 。
- 1 能
- 1 补充
- 1 采用
- 1 极大
- 1 有力
- 1 替代
- 1 是
- 1 时
- 1 文本
- 1 提升
- 1 数据
- 1
- 1 技术
- 1 当
- 1 并不
- 1 大量
- 1 在
- 1 可以
- 1 信息
- 1 作用
- 1 价值
- 1 以
- 1 为主
- 1 中