原文地址:http://blog.csdn.net/july_2/article/details/24481935
如题,看到这个题目也许会觉得功能有些多余:字母、数字连在一块的话,分词时是连在一起的,不会单独分出来,这也符合正常的搜索需求。如输入:
String txt = "IBM12二次修改123"; 分词效果:
主要是注释了一些代码,对字母、数字不要连续处理。
i bm |123 | 二 | 次 | 修 | 改
现在,有一个需求:需要对字母、数字都分词,分词效果要达到:
i | b | m | 1 | 2 | 3 | 二 | 次 | 修 | 改
类似在数据库中使用like加百分号双向查询效果,使用最初版本的mmseg4j无法满足需求,经过阅读mmseg4j部分源代码,稍微修改了一点点,即可满足需求(暂不考虑效率)。
- 未修改前,通过完整单词可以查询到结果,通过单个字母则查询不到,如下图:
单词完全匹配搜索:
字母模糊搜索:
- 修改mmseg4j源代码MMSeg.java中的next部分代码,其实就是屏蔽了部分代码,很简单:
- public Word next() throws IOException {
- //先从缓存中取
- Word word = bufWord.poll();
- if(word == null) {
- bufSentence.setLength(0);
- int data = -1;
- boolean read = true;
- // while(read && (data=readNext()) != -1) {
- while((data=readNext()) != -1) {
- read = false; //默认一次可以读出同一类字符,就可以分词内容
- int type = Character.getType(data);
- String wordType = Word.TYPE_WORD;
- switch(type) {
- case Character.UPPERCASE_LETTER:
- case Character.LOWERCASE_LETTER:
- case Character.TITLECASE_LETTER:
- case Character.MODIFIER_LETTER:
- /*
- * 1. 0x410-0x44f -> А-я //俄文
- * 2. 0x391-0x3a9 -> Α-Ω //希腊大写
- * 3. 0x3b1-0x3c9 -> α-ω //希腊小写
- */
- data = toAscii(data);
- NationLetter nl = getNation(data);
- if(nl == NationLetter.UNKNOW) {
- read = true;
- break;
- }
- wordType = Word.TYPE_LETTER;
- bufSentence.appendCodePoint(data);
- switch(nl) {
- case EN:
- //字母后面的数字,如: VH049PA
- // ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit();
- // readChars(bufSentence, rcad);
- // if(rcad.hasDigit()) {
- // wordType = Word.TYPE_LETTER_OR_DIGIT;
- // }
- //only english
- //readChars(bufSentence, new ReadCharByAscii());
- break;
- case RA:
- readChars(bufSentence, new ReadCharByRussia());
- break;
- case GE:
- readChars(bufSentence, new ReadCharByGreece());
- break;
- }
- bufWord.add(createWord(bufSentence, wordType));
- bufSentence.setLength(0);
- break;
- case Character.OTHER_LETTER:
- /*
- * 1. 0x3041-0x30f6 -> ぁ-ヶ //日文(平|片)假名
- * 2. 0x3105-0x3129 -> ㄅ-ㄩ //注意符号
- */
- bufSentence.appendCodePoint(data);
- readChars(bufSentence, new ReadCharByType(Character.OTHER_LETTER));
- currentSentence = createSentence(bufSentence);
- bufSentence.setLength(0);
- break;
- case Character.DECIMAL_DIGIT_NUMBER:
- bufSentence.appendCodePoint(toAscii(data));
- // readChars(bufSentence, new ReadCharDigit()); //读后面的数字, AsciiLetterOr
- wordType = Word.TYPE_DIGIT;
- int d = readNext();
- if(d > -1) {
- if(seg.isUnit(d)) { //单位,如时间
- bufWord.add(createWord(bufSentence, startIdx(bufSentence)-1, Word.TYPE_DIGIT)); //先把数字添加(独立)
- bufSentence.setLength(0);
- bufSentence.appendCodePoint(d);
- wordType = Word.TYPE_WORD; //单位是 word
- } else { //后面可能是字母和数字
- pushBack(d);
- // if(readChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0) { //如果有字母或数字都会连在一起.
- // wordType = Word.TYPE_DIGIT_OR_LETTER;
- // }
- }
- }
- bufWord.add(createWord(bufSentence, wordType));
- bufSentence.setLength(0); //缓存的字符清除
- break;
- case Character.LETTER_NUMBER:
- // ⅠⅡⅢ 单分
- bufSentence.appendCodePoint(data);
- readChars(bufSentence, new ReadCharByType(Character.LETTER_NUMBER));
- int startIdx = startIdx(bufSentence);
- for(int i=0; i<bufSentence.length(); i++) {
- bufWord.add(new Word(new char[] {bufSentence.charAt(i)}, startIdx++, Word.TYPE_LETTER_NUMBER));
- }
- bufSentence.setLength(0); //缓存的字符清除
- break;
- case Character.OTHER_NUMBER:
- //①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇ 连着用
- bufSentence.appendCodePoint(data);
- readChars(bufSentence, new ReadCharByType(Character.OTHER_NUMBER));
- bufWord.add(createWord(bufSentence, Word.TYPE_OTHER_NUMBER));
- bufSentence.setLength(0);
- break;
- default :
- //其它认为无效字符
- read = true;
- }//switch
- }
- // 中文分词
- if(currentSentence != null) {
- do {
- Chunk chunk = seg.seg(currentSentence);
- for(int i=0; i<chunk.getCount(); i++) {
- bufWord.add(chunk.getWords()[i]);
- }
- } while (!currentSentence.isFinish());
- currentSentence = null;
- }
- word = bufWord.poll();
- }
- return word;
- }
- 再次搜索字母查询,效果如下:
综上,这样就简单完成了数据库中类似like和百分号双向匹配需求。