中文分词

qt在新版本中整合了qtlucene的组件,单位的机器qt版本不够,也懒得升级。先看一下中文切分词部分。

目的:对文档进行中文切分,并根据词频高低排序。


选择LibMMSeg 作为切分词组件。

下载源码,

./bootstrap

./configure --prefix=/opt/mmseg

make

make install

完毕,

在qtcreator中新建项目,因为mmseg只提供了客户端开发文档,并未提及切分词部分的调用方法,暂时从 source code : src/mmseg_main.cpp 顺藤摸瓜。

以下代码为实现Demo:

项目文件:

  1. QT += core webkit network
  2. SOURCES += main.cpp \
  3. mmseg_main.cpp
  4. HEADERS +=
  5. LIBS += -L/opt/mmseg/lib -lmmseg
  6. INCLUDEPATH = /opt/mmseg/include/
# Qt modules used by the demo project.
QT += core webkit network

# mmseg_main.cpp is #included directly by main.cpp, so it must NOT be compiled
# as a second translation unit: doing so defines segment(), addmap(), cmp()
# and sortMapByValue() twice and the link fails with "multiple definition".
SOURCES += main.cpp

# Keep the included file visible in the Qt Creator project tree.
OTHER_FILES += mmseg_main.cpp

HEADERS +=

# Link against the LibMMSeg build installed with --prefix=/opt/mmseg.
LIBS += -L/opt/mmseg/lib -lmmseg

# Append (+=) instead of assigning (=) so include paths set elsewhere survive.
INCLUDEPATH += /opt/mmseg/include/

main.cpp

  1. #include <QApplication>
  2. #include "mmseg_main.cpp"
  3. int main(int argc, char *argv[])
  4. {
  5. QApplication app(argc, argv);
  6. string dictpath = "/opt/mmseg/etc/";
  7. string content = "全文检索是数据库的有力补充,全文检索并不能替代数据库在应用系统中的作用。当应用系统的数据以大量的文本信息为主时,采用全文检索技术可以极大的提升应用系统的价值。 ";
  8. map<string, int> m = segment(content, dictpath);
  9. vector<pair<string, int> > v;
  10. sortMapByValue(m, v);
  11. for (int i = 0; i < 50; ++i)
  12. cout<<v[i].second<<"\t"<<v[i].first<<endl;
  13. return 0;
  14. // return app.exec();
  15. }
#include <QApplication>

#include "mmseg_main.cpp"

/**
 * Demo entry point: segments a fixed sample sentence and prints the most
 * frequent words as "<count>\t<word>", highest count first.
 *
 * At most 50 rows are printed. The number of rows is additionally bounded by
 * the number of distinct words found, because the sample text yields fewer
 * than 50 entries and indexing past the end of the vector is undefined.
 */
int main(int argc, char *argv[])
{
    QApplication app(argc, argv);

    string dictpath = "/opt/mmseg/etc/";
    string content = "全文检索是数据库的有力补充,全文检索并不能替代数据库在应用系统中的作用。当应用系统的数据以大量的文本信息为主时,采用全文检索技术可以极大的提升应用系统的价值。 ";

    // word -> number of occurrences
    map<string, int> m = segment(content, dictpath);

    // Same data as (word, count) pairs ordered by descending count.
    vector<pair<string, int> > v;
    sortMapByValue(m, v);

    // Print the top entries without ever reading beyond v.size().
    for (size_t i = 0; i < v.size() && i < 50; ++i)
        cout << v[i].second << "\t" << v[i].first << "\n";

    // The demo only writes to the console, so the Qt event loop is not started.
    return 0;

//    return app.exec();
}


mmseg_main.cpp

  1. #include <fstream>
  2. #include <string>
  3. #include <iostream>
  4. #include <cstdio>
  5. #include <map>
  6. #include <vector>
  7. #include <algorithm>
  8. #include <stdlib.h>
  9. #include "mmseg/UnigramCorpusReader.h"
  10. #include "mmseg/UnigramDict.h"
  11. #include "mmseg/SynonymsDict.h"
  12. #include "mmseg/ThesaurusDict.h"
  13. #include "mmseg/SegmenterManager.h"
  14. #include "mmseg/Segmenter.h"
  15. using namespace std;
  16. using namespace css;
  17. #define SEGMENT_OUTPUT 1
  18. map<string, int> segment(string content, string dictpath);
  19. void addmap(const char* str, int len, map<string, int> &map);
  20. void sortMapByValue(map<string,int>& tMap, vector<pair<string,int> >& tVector);
  21. map<string, int> segment(string content, string dictpath)
  22. {
  23. map<string, int> map;
  24. //init segmenter
  25. const char* dict_path = dictpath.c_str();
  26. SegmenterManager* mgr = new SegmenterManager();
  27. mgr->init(dict_path);
  28. Segmenter* seg = mgr->getSegmenter();
  29. //load data.
  30. int length = content.length();
  31. const char* buffer = content.c_str();
  32. //begin seg
  33. seg->setBuffer((u1*)buffer,length);
  34. u2 len = 0, symlen = 0;
  35. u2 kwlen = 0, kwsymlen = 0;
  36. //check 1st token.
  37. char txtHead[3] = {239,187,191};
  38. char* tok = (char*)seg->peekToken(len, symlen);
  39. seg->popToken(len);
  40. if(seg->isSentenceEnd()){
  41. do {
  42. char* kwtok = (char*)seg->peekToken(kwlen , kwsymlen,1);
  43. if(kwsymlen)
  44. // printf("[kw]%*.*s/x ",kwsymlen,kwsymlen,kwtok);
  45. addmap(kwtok, kwsymlen, map);
  46. }while(kwsymlen);
  47. }
  48. if(len == 3 && memcmp(tok,txtHead,sizeof(char)*3) == 0){
  49. //check is 0xFEFF
  50. //do nothing
  51. }else{
  52. // printf("%*.*s/x ",symlen,symlen,tok);
  53. addmap(tok, symlen, map);
  54. }
  55. while(1){
  56. len = 0;
  57. char* tok = (char*)seg->peekToken(len,symlen);
  58. if(!tok || !*tok || !len)
  59. break;
  60. seg->popToken(len);
  61. if(seg->isSentenceEnd()){
  62. do {
  63. char* kwtok = (char*)seg->peekToken(kwlen , kwsymlen,1);
  64. if(kwsymlen)
  65. // printf("[kw]%*.*s/x ",kwsymlen,kwsymlen,kwtok);
  66. addmap(kwtok, kwsymlen, map);
  67. }while(kwsymlen);
  68. }
  69. if(*tok == '\r')
  70. continue;
  71. if(*tok == '\n'){
  72. printf("\n");
  73. continue;
  74. }
  75. //printf("[%d]%*.*s/x ",len,len,len,tok);
  76. // printf("%*.*s/x ",symlen,symlen,tok);
  77. // char word[20];
  78. // sprintf(word, "%*.*s",symlen,symlen,tok);
  79. // printf("%s/[%d-%d] ", word,symlen,symlen);
  80. addmap(tok, symlen, map);
  81. //check thesaurus
  82. {
  83. const char* thesaurus_ptr = seg->thesaurus(tok, symlen);
  84. while(thesaurus_ptr && *thesaurus_ptr) {
  85. len = strlen(thesaurus_ptr);
  86. // printf("%*.*s/s ",len,len,thesaurus_ptr);
  87. addmap(thesaurus_ptr, len, map);
  88. thesaurus_ptr += len + 1; //move next
  89. }
  90. }
  91. //printf("%s",tok);
  92. }
  93. delete mgr;
  94. return map;
  95. }
  96. void addmap(const char* str, int len, map<string, int> &map)
  97. {
  98. char word[len];
  99. sprintf(word, "%*.*s", len, len, str);
  100. map[word]++;
  101. }
  102. int cmp(const pair<string,int>& a, const pair<string,int>& b)
  103. {
  104. return a.second > b.second;
  105. }
  106. void sortMapByValue(map<string,int>& tMap, vector<pair<string,int> >& tVector)
  107. {
  108. for(map<string,int>::iterator curr = tMap.begin();curr != tMap.end(); curr++)
  109. {
  110. tVector.push_back(make_pair(curr->first, curr->second));
  111. }
  112. sort(tVector.begin(),tVector.end(),cmp);
  113. }
#include <fstream>
#include <string>
#include <iostream>
#include <cstdio>

#include <map>
#include <vector>
#include <algorithm>

#include <stdlib.h>

#include "mmseg/UnigramCorpusReader.h"
#include "mmseg/UnigramDict.h"
#include "mmseg/SynonymsDict.h"
#include "mmseg/ThesaurusDict.h"
#include "mmseg/SegmenterManager.h"
#include "mmseg/Segmenter.h"

using namespace std;
using namespace css;

#define SEGMENT_OUTPUT 1

map<string, int> segment(string content, string dictpath);

void addmap(const char* str, int len, map<string, int> &map);

void sortMapByValue(map<string,int>& tMap, vector<pair<string,int> >& tVector);

map<string, int> segment(string content, string dictpath)
{
    map<string, int> map;

    //init segmenter
    const char* dict_path = dictpath.c_str();
    SegmenterManager* mgr = new SegmenterManager();
    mgr->init(dict_path);
    Segmenter* seg = mgr->getSegmenter();

    //load data.
    int length = content.length();
    const char* buffer = content.c_str();

    //begin seg
    seg->setBuffer((u1*)buffer,length);
    u2 len = 0, symlen = 0;
    u2 kwlen = 0, kwsymlen = 0;

    //check 1st token.
    char txtHead[3] = {239,187,191};
    char* tok = (char*)seg->peekToken(len, symlen);
    seg->popToken(len);
    if(seg->isSentenceEnd()){
            do {
                    char* kwtok = (char*)seg->peekToken(kwlen , kwsymlen,1);
                    if(kwsymlen)
//                            printf("[kw]%*.*s/x ",kwsymlen,kwsymlen,kwtok);
                        addmap(kwtok, kwsymlen, map);
            }while(kwsymlen);
    }

    if(len == 3 && memcmp(tok,txtHead,sizeof(char)*3) == 0){
            //check is 0xFEFF
            //do nothing
    }else{
//            printf("%*.*s/x ",symlen,symlen,tok);
            addmap(tok, symlen, map);
    }

    while(1){
            len = 0;
            char* tok = (char*)seg->peekToken(len,symlen);
            if(!tok || !*tok || !len)
                    break;
            seg->popToken(len);
            if(seg->isSentenceEnd()){
                    do {
                            char* kwtok = (char*)seg->peekToken(kwlen , kwsymlen,1);
                            if(kwsymlen)
//                                    printf("[kw]%*.*s/x ",kwsymlen,kwsymlen,kwtok);
                                addmap(kwtok, kwsymlen, map);
                    }while(kwsymlen);
            }

            if(*tok == '\r')
                    continue;
            if(*tok == '\n'){
                    printf("\n");
                    continue;
            }

            //printf("[%d]%*.*s/x ",len,len,len,tok);
//                printf("%*.*s/x ",symlen,symlen,tok);

//            char word[20];
//            sprintf(word, "%*.*s",symlen,symlen,tok);
//            printf("%s/[%d-%d] ", word,symlen,symlen);
            addmap(tok, symlen, map);

            //check thesaurus
            {
                    const char* thesaurus_ptr = seg->thesaurus(tok, symlen);
                    while(thesaurus_ptr && *thesaurus_ptr) {
                            len = strlen(thesaurus_ptr);
//                            printf("%*.*s/s ",len,len,thesaurus_ptr);
                            addmap(thesaurus_ptr, len, map);
                            thesaurus_ptr += len + 1; //move next
                    }
            }
            //printf("%s",tok);
    }

    delete mgr;

    return map;

}

/**
 * Increments the occurrence count of the word formed by the first `len` bytes
 * of `str`.
 *
 * The word ends early at an embedded NUL byte. Null or empty input is ignored.
 *
 * The word is built directly as a std::string. The previous implementation
 * formatted it with sprintf into a `char word[len]` buffer, which is a
 * non-standard variable-length array and is one byte too small for the
 * terminating NUL, so every call wrote past the end of the stack buffer.
 *
 * @param str  Start of the word; need not be NUL-terminated within `len`.
 * @param len  Maximum number of bytes of `str` that belong to the word.
 * @param map  Word -> count table to update.
 */
void addmap(const char* str, int len, map<string, int> &map)
{
    if (!str || len <= 0)
        return;

    // Stop at the first NUL inside the window, like "%.*s" did.
    const char* wordEnd = std::find(str, str + len, '\0');
    if (wordEnd == str)
        return;

    map[string(str, wordEnd)]++;
}

/**
 * Ordering predicate for (word, count) pairs: descending by count.
 *
 * @return non-zero when `a` has a strictly larger count than `b`, 0 otherwise
 *         (equal counts compare as "not before" in both directions).
 */
int cmp(const pair<string,int>& a, const pair<string,int>& b)
{
    const bool aHasHigherCount = b.second < a.second;
    return aHasHigherCount;
}

/**
 * Appends every (word, count) entry of `tMap` to `tVector` and then sorts the
 * whole vector with cmp(), i.e. by descending count.
 *
 * @param tMap     Source table; it is only read.
 * @param tVector  Receives the entries. Existing elements are kept and take
 *                 part in the sort.
 */
void sortMapByValue(map<string,int>& tMap, vector<pair<string,int> >& tVector)
{
    // Bulk-append the map entries in key order.
    tVector.insert(tVector.end(), tMap.begin(), tMap.end());

    sort(tVector.begin(), tVector.end(), cmp);
}

输出结果:

  1. 6 的
  2. 3 应用
  3. 3 检索
  4. 3 全文
  5. 3 系统
  6. 2 数据库
  7. 2 ,
  8. 2 。
  9. 1 能
  10. 1 补充
  11. 1 采用
  12. 1 极大
  13. 1 有力
  14. 1 替代
  15. 1 是
  16. 1 时
  17. 1 文本
  18. 1 提升
  19. 1 数据
  20. 1
  21. 1 技术
  22. 1 当
  23. 1 并不
  24. 1 大量
  25. 1 在
  26. 1 可以
  27. 1 信息
  28. 1 作用
  29. 1 价值
  30. 1 以
  31. 1 为主
  32. 1 中
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值