elasticsearch 1.1.0 mmseg 英文数字分词

最新推荐文章于 2024-09-11 17:08:01 发布

xiaomin_____

最新推荐文章于 2024-09-11 17:08:01 发布

阅读量240

点赞数

分类专栏：分词器　文章标签：大数据 java

本文链接：https://blog.csdn.net/xiaomin1991222/article/details/84748840

版权

分词器专栏收录该内容

24 篇文章 0 订阅

订阅专栏

elasticsearch 1.1.0 对应的 mmseg 插件版本是 1.2.2，该版本没有解决英文字母与数字混合时的分词问题。

比如 user123，分词后仍然是整个 user123，字母和数字没有被切分开。

解决1：

将 mmseg 插件升级到 elasticsearch-analysis-mmseg-1.4.0，并在 analyzer 的 filter 中加入 cut_letter_digit，配置如下。

https://github.com/medcl/elasticsearch-analysis-mmseg/commit/61b5e8199425c845a3060fe39f40e59868dd364b

# Analysis settings for the mmseg plugin: two tokenizers and three custom analyzers.
index:
  analysis: 
    # Tokenizers provided by the mmseg plugin; seg_type selects the segmentation mode.
    tokenizer:
      mmseg_maxword:
        type: mmseg
        seg_type: max_word
      mmseg_complex:
        type: mmseg
        seg_type: complex
    analyzer:
      # max_word tokenizer followed by lowercase and cut_letter_digit.
      # NOTE(review): cut_letter_digit presumably splits mixed letter/digit tokens
      # such as "user123" -- confirm against the plugin version in use.
      mmseg_maxword:
        type: custom
        filter:
        - lowercase
        - cut_letter_digit
        tokenizer: mmseg_maxword
      # "mmseg" is configured identically to mmseg_maxword above.
      mmseg:
        type: custom
        filter:
        - lowercase
        - cut_letter_digit
        tokenizer: mmseg_maxword
      # complex tokenizer with lowercase only; cut_letter_digit is not applied here.
      mmseg_complex:
        type: custom
        filter:
        - lowercase
        tokenizer: mmseg_complex
# Alternative default analyzers, left disabled:
#index.analysis.analyzer.default.type : "org.elasticsearch.index.analysis.MMsegAnalyzerProvider"
#index.analysis.analyzer.default.type : "ik"
# Use the "mmseg" analyzer defined above as the index default.
index.analysis.analyzer.default.type : "mmseg"

解决2：

修改 1.2.2 版本 jar 包中的 MMSegAnalyzer，在 createComponents 中给分词器追加 CutLetterDigitFilter，然后重新编译并替换 jar 包中对应的 class 文件：

package com.chenlb.mmseg4j.analysis;

import java.io.File;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;

/**
 * Analyzer backed by mmseg4j that uses the max-word segmenter by default.
 *
 * <p>Every token stream is an {@link MMSegTokenizer} wrapped in a
 * {@link CutLetterDigitFilter}.
 *
 * @see SimpleAnalyzer
 * @see ComplexAnalyzer
 * @see MaxWordAnalyzer
 *
 * @author chenlb
 */
public class MMSegAnalyzer extends Analyzer {

	/** Dictionary passed to each segmenter created by {@link #newSeg()}. */
	protected Dictionary dic;

	/**
	 * Creates an analyzer using the dictionary from {@link Dictionary#getInstance()}.
	 *
	 * @see Dictionary#getInstance()
	 */
	public MMSegAnalyzer() {
		super();
		this.dic = Dictionary.getInstance();
	}

	/**
	 * Creates an analyzer using the dictionary obtained for the given path.
	 *
	 * @param path dictionary path
	 * @see Dictionary#getInstance(String)
	 */
	public MMSegAnalyzer(String path) {
		super();
		this.dic = Dictionary.getInstance(path);
	}

	/**
	 * Creates an analyzer using the dictionary obtained for the given directory.
	 *
	 * @param path dictionary directory
	 * @see Dictionary#getInstance(File)
	 */
	public MMSegAnalyzer(File path) {
		super();
		this.dic = Dictionary.getInstance(path);
	}

	/**
	 * Creates an analyzer that uses the supplied dictionary as-is.
	 *
	 * @param dic dictionary to segment with
	 */
	public MMSegAnalyzer(Dictionary dic) {
		super();
		this.dic = dic;
	}

	/**
	 * Builds the segmenter used for a new tokenizer.
	 *
	 * @return a new {@link MaxWordSeg} over this analyzer's dictionary
	 */
	protected Seg newSeg() {
		return new MaxWordSeg(this.dic);
	}

	/**
	 * Returns the dictionary this analyzer segments with.
	 *
	 * @return the dictionary held by this analyzer
	 */
	public Dictionary getDict() {
		return this.dic;
	}

	/**
	 * Builds the analysis chain: an {@link MMSegTokenizer} reading from {@code reader},
	 * followed by a {@link CutLetterDigitFilter}.
	 *
	 * <p>NOTE(review): the filter presumably splits mixed letter/digit tokens such as
	 * "user123"; its implementation is not visible here -- confirm.
	 *
	 * @param fieldName name of the field being analyzed (not used by this method)
	 * @param reader source of the text to tokenize
	 * @return components whose source is the tokenizer and whose sink is the filter
	 */
	@Override
	protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
		final Tokenizer source = new MMSegTokenizer(newSeg(), reader);
		final CutLetterDigitFilter letterDigitSplit = new CutLetterDigitFilter(source);
		return new TokenStreamComponents(source, letterDigitSplit);
	}

}