mmseg4j支持单个字母、数字及组合搜索

最新推荐文章于 2021-08-13 13:29:27 发布

天冷就回家0202

最新推荐文章于 2021-08-13 13:29:27 发布

阅读量1.1k

点赞数

分类专栏： solr

solr 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

原文地址:http://blog.csdn.net/july_2/article/details/24481935

如题，看到这个题目也许会觉得这个功能有些多余：字母、数字连在一起时，分词器不会把它们单独拆开，而是作为一个整体输出，这也符合一般的搜索需求。如输入：

String txt = "IBM123二次修改"; 分词效果：

ibm123 | 二 | 次 | 修 | 改

现在，有一个需求：需要对字母、数字都分词，分词效果要达到：

i | b | m | 1 | 2 | 3 | 二 | 次 | 修 | 改

这类似于在数据库中使用 LIKE '%关键字%' 进行双向模糊查询的效果。原版的 mmseg4j 无法满足这个需求；阅读了 mmseg4j 的部分源代码后，只需稍作修改即可满足需求（暂不考虑效率）。

修改之前，通过完整单词可以查询到结果，而通过单个字母查询不到结果，如下图：

单词完全匹配搜索：

字母模糊搜索：

修改mmseg4j源代码MMSeg.java中的next部分代码，其实就是屏蔽了部分代码，很简单：

[plain]view plaincopy 
    
 public Word next() throws IOException {  
         //先从缓存中取  
         Word word = bufWord.poll();  
         if(word == null) {  
             bufSentence.setLength(0);  
   
             int data = -1;  
             boolean read = true;  
 //          while(read && (data=readNext()) != -1) {  
             while((data=readNext()) != -1) {  
                 read = false;   //默认一次可以读出同一类字符,就可以分词内容  
                 int type = Character.getType(data);  
                 String wordType = Word.TYPE_WORD;  
                 switch(type) {  
                 case Character.UPPERCASE_LETTER:  
                 case Character.LOWERCASE_LETTER:  
                 case Character.TITLECASE_LETTER:  
                 case Character.MODIFIER_LETTER:  
                     /*  
                      * 1. 0x410-0x44f -> А-я //俄文  
                      * 2. 0x391-0x3a9 -> Α-Ω //希腊大写  
                      * 3. 0x3b1-0x3c9 -> α-ω //希腊小写  
                      */  
                     data = toAscii(data);  
                     NationLetter nl = getNation(data);  
                     if(nl == NationLetter.UNKNOW) {  
                         read = true;  
                         break;  
                     }  
                     wordType = Word.TYPE_LETTER;  
                     bufSentence.appendCodePoint(data);  
                     switch(nl) {  
                     case EN:  
                         //字母后面的数字,如: VH049PA  
 //                      ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit();  
 //                      readChars(bufSentence, rcad);  
 //                      if(rcad.hasDigit()) {  
 //                          wordType = Word.TYPE_LETTER_OR_DIGIT;  
 //                      }  
                         //only english  
                         //readChars(bufSentence, new ReadCharByAscii());  
                         break;  
                     case RA:  
                         readChars(bufSentence, new ReadCharByRussia());  
                         break;  
                     case GE:  
                         readChars(bufSentence, new ReadCharByGreece());  
                         break;  
                     }  
                     bufWord.add(createWord(bufSentence, wordType));  
   
                     bufSentence.setLength(0);  
   
                     break;  
                 case Character.OTHER_LETTER:  
                     /*  
                      * 1. 0x3041-0x30f6 -> ぁ-ヶ   //日文(平|片)假名  
                      * 2. 0x3105-0x3129 -> ㄅ-ㄩ   //注意符号  
                      */  
                     bufSentence.appendCodePoint(data);  
                     readChars(bufSentence, new ReadCharByType(Character.OTHER_LETTER));  
   
                     currentSentence = createSentence(bufSentence);  
   
                     bufSentence.setLength(0);  
   
                     break;  
                 case Character.DECIMAL_DIGIT_NUMBER:  
                     bufSentence.appendCodePoint(toAscii(data));  
 //                  readChars(bufSentence, new ReadCharDigit());    //读后面的数字, AsciiLetterOr  
                     wordType = Word.TYPE_DIGIT;  
                     int d = readNext();  
                     if(d > -1) {  
                         if(seg.isUnit(d)) { //单位,如时间  
                             bufWord.add(createWord(bufSentence, startIdx(bufSentence)-1, Word.TYPE_DIGIT)); //先把数字添加(独立)  
   
                             bufSentence.setLength(0);  
   
                             bufSentence.appendCodePoint(d);  
                             wordType = Word.TYPE_WORD;  //单位是 word  
                         } else {    //后面可能是字母和数字  
                             pushBack(d);  
 //                          if(readChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0) {   //如果有字母或数字都会连在一起.  
 //                              wordType = Word.TYPE_DIGIT_OR_LETTER;  
 //                          }  
                         }  
                     }  
   
                     bufWord.add(createWord(bufSentence, wordType));  
   
   
                     bufSentence.setLength(0);   //缓存的字符清除  
   
                     break;  
                 case Character.LETTER_NUMBER:  
                     // ⅠⅡⅢ 单分  
                     bufSentence.appendCodePoint(data);  
                     readChars(bufSentence, new ReadCharByType(Character.LETTER_NUMBER));  
   
                     int startIdx = startIdx(bufSentence);  
                     for(int i=0; i<bufSentence.length(); i++) {  
                         bufWord.add(new Word(new char[] {bufSentence.charAt(i)}, startIdx++, Word.TYPE_LETTER_NUMBER));  
                     }  
   
                     bufSentence.setLength(0);   //缓存的字符清除  
   
                     break;  
                 case Character.OTHER_NUMBER:  
                     //①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇ 连着用  
                     bufSentence.appendCodePoint(data);  
                     readChars(bufSentence, new ReadCharByType(Character.OTHER_NUMBER));  
   
                     bufWord.add(createWord(bufSentence, Word.TYPE_OTHER_NUMBER));  
                     bufSentence.setLength(0);  
                     break;  
                 default :  
                     //其它认为无效字符  
                     read = true;  
                 }//switch  
             }  
                   
             // 中文分词  
             if(currentSentence != null) {  
                 do {  
                     Chunk chunk = seg.seg(currentSentence);  
                     for(int i=0; i<chunk.getCount(); i++) {  
                         bufWord.add(chunk.getWords()[i]);  
                     }  
                 } while (!currentSentence.isFinish());  
                   
                 currentSentence = null;  
             }  
               
             word = bufWord.poll();  
         }  
           
         return word;  
     }