solr 8.0对接分词组件

使用ansj、stanford coreNLP这种类型的中文分词组件,需要实现两个类。

一个是Tokenizer,一个是TokenizerFactory。

以下是 Tokenizer 的实现,这里使用的是 ansj;如果要换用其它分词组件,只需把其中分词那一部分的逻辑替换掉即可:

public class LAnsjTokenizer extends Tokenizer {
    private final CharTermAttribute termAtt = (CharTermAttribute)this.addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = (OffsetAttribute)this.addAttribute(OffsetAttribute.class);

    private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(4096);
    private int offset = 0;//当前buffer的起始偏移距
    private int dataLen = 0; //分词器当前缓存的数据长度
    private int finalOffset = 0;
    private int analysisType ;//使用的分词算法
    private Iterator<Term> tokenIterator;
    private List<Term> tokenBuffer;

    public LAnsjTokenizer(AttributeFactory factory, int analysisType){
        this.analysisType = analysisType;
    }

    public boolean incrementToken() throws IOException {
        if (tokenIterator == null || !tokenIterator.hasNext()) {//当前读取的数据处理完了,再次读取新的数据
            this.offset += this.dataLen;
            CharacterUtils.fill(this.ioBuffer, this.input);
            if (this.ioBuffer.getLength() == 0) {//没有数据了
                this.dataLen = 0;
                return false;
            }

            this.dataLen = this.ioBuffer.getLength();

            String currentSentence = new String(ioBuffer.getBuffer(), 0, ioBuffer.getLength());

            tokenBuffer = new ArrayList<Term>();
            if (analysisType == 1) {
                for (Term term : ToAnalysis.parse(currentSentence)) {
                    tokenBuffer.add(term);
                }
            } else if (analysisType == 0) {
                for (Term term : IndexAnalysis.parse(currentSentence)) {
                    tokenBuffer.add(term);
                }
            } else if (analysisType == 2) {
                for (Term term : NlpAnalysis.parse(currentSentence)) {
                    tokenBuffer.add(term);
                }
            }
            tokenIterator = tokenBuffer.iterator();
            if (!tokenIterator.hasNext()) {
                return false;
            }

        }

        this.clearAttributes();
        Term term = tokenIterator.next();
        termAtt.append(term.getName());
        termAtt.setLength(term.getName().length());
        int currentStart = this.offset + term.getOffe();
        int currentEnd = this.offset + term.toValue();

        this.offsetAtt.setOffset(currentStart, this.finalOffset = this.correctOffset(currentEnd));

        return true;
    }
    public final void end() throws IOException {
        super.end();
        System.out.println(this.finalOffset);
        this.offsetAtt.setOffset(this.finalOffset, this.finalOffset);
    }

    public void reset() throws IOException {
        super.reset();
        this.tokenIterator = null;
        this.tokenBuffer =null;
        this.offset = 0;
        this.dataLen = 0;
        this.finalOffset = 0;
        this.ioBuffer.reset();
    }
    public CharTermAttribute getTermAtt() {
        return termAtt;
    }
}


  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值