Lucene学习（三）使用IK 分词器

最新推荐文章于 2024-01-10 14:08:37 发布

天才小熊猫12138584

最新推荐文章于 2024-01-10 14:08:37 发布

阅读量671

点赞数

本文链接：https://blog.csdn.net/qq_40990836/article/details/87969698

版权

Java 同时被 3 个专栏收录

36 篇文章 0 订阅

订阅专栏

Lucene

3 篇文章 0 订阅

订阅专栏

IK分词器

1 篇文章 0 订阅

订阅专栏

下载IK分词器

不用下载了，之前写错了。

下载好之后，解压缩，然后使用 IDEA 打开项目。

使用如下命令
mvn install -Dmaven.test.skip=true

这样你就可以在自己的本地 Maven 仓库中检索到这个分词器，并在项目中引入它了。

<dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
            <version>2012_u6</version>
            <exclusions>
                <!-- Exclude the old Lucene artifacts that this dependency would otherwise bring in -->
                <exclusion>
                    <groupId>org.apache.lucene</groupId>
                    <artifactId>lucene-core</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.lucene</groupId>
                    <artifactId>lucene-queryparser</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.lucene</groupId>
                    <artifactId>lucene-analyzers-common</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

因为 IK 分词器好像只支持 Lucene 4.0 版本，往上就不支持了，所以我们要自己定义它的分词器。

创建两个类 IKAnalyzer4Lucene7 和 IKTokenizer4Lucene7 。

代码如下：

IKTokenizer4Lucene7

package cn.fllday.common.lucene.analyzer;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;

/**
 * Lucene {@link Tokenizer} adapter around IK's {@link IKSegmenter}.
 *
 * <p>Each {@link Lexeme} produced by the segmenter is exposed as one Lucene token
 * through the term, offset and type attributes.
 *
 * @Author:gssznb
 * @Date:Created 2018/12/7 15:13
 */
public final class IKTokenizer4Lucene7 extends Tokenizer {

    // IK segmenter that performs the actual segmentation
    private final IKSegmenter ikSegmenter;

    // Token text attribute
    private final CharTermAttribute termAtt;

    // Token offset attribute
    private final OffsetAttribute offsetAtt;

    // Token type attribute
    private final TypeAttribute typeAtt;

    // Raw (uncorrected) end position of the last lexeme emitted for the current input
    private int endPosition;

    /**
     * Creates a tokenizer backed by a new {@link IKSegmenter}.
     *
     * @param useSmart flag passed through unchanged to the {@link IKSegmenter} constructor
     */
    public IKTokenizer4Lucene7(boolean useSmart) {
        super();
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        // The real reader is handed to the segmenter later, in reset().
        ikSegmenter = new IKSegmenter(input, useSmart);
    }

    /**
     * Advances to the next lexeme and copies it into the token attributes.
     *
     * @return {@code true} if a token was produced, {@code false} once the segmenter is exhausted
     * @throws IOException if the segmenter fails while reading the input
     */
    @Override
    public boolean incrementToken() throws IOException {
        // Clear all attributes left over from the previous token
        clearAttributes();
        Lexeme nextLexeme = ikSegmenter.next();
        if (nextLexeme == null) {
            // No more lexemes: signal end of stream
            return false;
        }
        // Token text and its length
        termAtt.append(nextLexeme.getLexemeText());
        termAtt.setLength(nextLexeme.getLength());
        // Token offsets, mapped through correctOffset() so they stay valid when the
        // input reader is wrapped by a CharFilter (end() already does the same).
        offsetAtt.setOffset(
                correctOffset(nextLexeme.getBeginPosition()),
                correctOffset(nextLexeme.getEndPosition()));
        // Remember the raw end position; end() corrects it when reporting the final offset
        endPosition = nextLexeme.getEndPosition();
        // Token type
        typeAtt.setType(nextLexeme.getLexemeTypeString());
        return true;
    }

    /**
     * Prepares the tokenizer for a new input by re-pointing the segmenter at the current reader.
     *
     * @throws IOException if the superclass reset fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        ikSegmenter.reset(input);
        // Forget the previous input's end position so that a reused tokenizer
        // does not report a stale final offset when the new input yields no lexemes.
        endPosition = 0;
    }

    /**
     * Sets the final offset to the (corrected) end position of the last emitted lexeme.
     *
     * @throws IOException if the superclass end handling fails
     */
    @Override
    public void end() throws IOException {
        super.end();
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}

IKAnalyzer4Lucene7

package cn.fllday.common.lucene.analyzer;


import org.apache.lucene.analysis.Analyzer;

/**
 * Lucene {@link Analyzer} whose token stream is produced by {@link IKTokenizer4Lucene7}.
 *
 * @Author:gssznb
 * @Date:Created 2018/12/7 15:05
 */
public class IKAnalyzer4Lucene7 extends Analyzer {

    // Flag handed to every IKTokenizer4Lucene7 this analyzer creates
    private boolean useSmart;

    /**
     * Creates an analyzer with {@code useSmart} set to {@code false}.
     */
    public IKAnalyzer4Lucene7() {
        this(false);
    }

    /**
     * Creates an analyzer with the given {@code useSmart} setting.
     *
     * @param useSmart value passed to the tokenizers created by this analyzer
     */
    public IKAnalyzer4Lucene7(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * @return the current {@code useSmart} setting
     */
    public boolean isUseSmart() {
        return this.useSmart;
    }

    /**
     * Changes the {@code useSmart} setting used for tokenizers created from now on.
     *
     * <p>NOTE(review): Lucene may reuse components it has already created, in which case
     * this change would not reach existing tokenizers - confirm against the reuse strategy.
     *
     * @param useSmart the new setting
     */
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Builds the analysis chain: a single {@link IKTokenizer4Lucene7} with no token filters.
     *
     * @param fieldName name of the field being analyzed; not used
     * @return components wrapping a new tokenizer configured with the current {@code useSmart}
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(new IKTokenizer4Lucene7(useSmart));
    }
}

这样你就可以在你的项目中使用IK分词器了。 IK分词器对中文的支持比较好！