How to use the Paoding (庖丁) Chinese analyzer with Solr 1.4

Solr 1.4 uses Lucene 2.9.1, so the Paoding source file net.paoding.analysis.analyzer.PaodingTokenizer needs to be modified.

Two points need attention.

1. The superclass changes from TokenStream to Tokenizer, so the following field must be removed (Tokenizer already declares a protected input field):

private final Reader input;

and the corresponding close method must be removed as well:

public void close() throws IOException {
    super.close();
    input.close();
}
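
For reference, Lucene 2.9's Tokenizer base class already owns the Reader and closes it itself, which is why the subclass's own copies become redundant. A paraphrased, simplified sketch of the base class (not the verbatim Lucene source):

public abstract class Tokenizer extends TokenStream {
    // the Reader is owned by the base class
    protected Reader input;

    protected Tokenizer(Reader input) {
        this.input = input;
    }

    // the base class already closes the underlying Reader
    public void close() throws IOException {
        input.close();
    }

    // the default reset(Reader) merely swaps in the new Reader
    public void reset(Reader input) throws IOException {
        this.input = input;
    }
}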

2. Because the underlying highlighting implementation changed, the reset method must be rewritten. Previously reset only swapped in the new input; now several internal fields must be reset as well.

public void reset(Reader input) throws IOException {
    this.input = input;
    this.inputLength = 0;
    this.offset = 0;
    this.dissected = 0;
    this.tokenIteractor = null;
    this.beef.set(0, 0);
}
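
Every one of these fields matters because Solr reuses tokenizer instances across documents through Lucene 2.9's analyzer-reuse hooks. A hypothetical sketch of that reuse pattern (getPreviousTokenStream/setPreviousTokenStream are real Analyzer methods in Lucene 2.9; the knife field and createTokenCollector() helper are assumed members of the enclosing analyzer):

public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    PaodingTokenizer tokenizer = (PaodingTokenizer) getPreviousTokenStream();
    if (tokenizer == null) {
        tokenizer = new PaodingTokenizer(reader, knife, createTokenCollector());
        setPreviousTokenStream(tokenizer);
    } else {
        // any state left over from the previous document would corrupt offsets
        // (and therefore highlighting) if reset() did not clear it
        tokenizer.reset(reader);
    }
    return tokenizer;
}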

 

The complete adjusted code follows:

package net.paoding.analysis.analyzer;

/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;

import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.Beef;
import net.paoding.analysis.knife.Collector;
import net.paoding.analysis.knife.Knife;
import net.paoding.analysis.knife.Paoding;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;


/**
 * PaodingTokenizer is the TokenStream implementation built on the Paoding
 * ("庖丁解牛") framework, used by PaodingAnalyzer.
 * <p>
 *
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 * @see Beef
 * @see Knife
 * @see Paoding
 * @see Tokenizer
 * @see PaodingAnalyzer
 *
 * @see Collector
 * @see TokenCollector
 * @see MaxWordLengthTokenCollector
 * @see MostWordsTokenCollector
 *
 * @since 1.0
 */
public final class PaodingTokenizer extends Tokenizer implements Collector {

// -------------------------------------------------

/**
 * Total number of characters read from {@link #input} so far.
 */
private int inputLength;

/**
 * Size of the internal read buffer.
 */
private static final int bufferLength = 128;

/**
 * Receives the text characters read from {@link #input}.
 *
 * @see #next()
 */
private final char[] buffer = new char[bufferLength];

/**
 * Offset of {@link #buffer}[0] within {@link #input}.
 *
 * @see #collect(String, int, int)
 * @see #next()
 */
private int offset;

/**
 * The "beef" view over {@link #buffer} that is handed to the knife for dissection.
 */
private final Beef beef = new Beef(buffer, 0, 0);

/**
 * Dissection progress within beef; a negative value means the tail of the
 * buffer has not been fully dissected yet.
 */
private int dissected;

/**
 * Dissects the text characters held in beef; supplied by PaodingAnalyzer.
 *
 * @see #next()
 */
private Knife knife;

/**
 * Collects the tokens produced while the knife dissects the beef.
 */
private TokenCollector tokenCollector;

/**
 * Iterator over the collected tokens; {@link #next()} reads Token objects
 * from it in order.
 *
 * @see #next()
 */
private Iterator/* <Token> */ tokenIteractor;

// -------------------------------------------------

/**
 * @param input the character stream to tokenize
 * @param knife the knife used to dissect the text, supplied by PaodingAnalyzer
 * @param tokenCollector the collector that gathers the resulting tokens
 */
public PaodingTokenizer(Reader input, Knife knife, TokenCollector tokenCollector) {
    this.input = input;
    this.knife = knife;
    this.tokenCollector = tokenCollector;
}

// -------------------------------------------------

public TokenCollector getTokenCollector() {
   return tokenCollector;
}

public void setTokenCollector(TokenCollector tokenCollector) {
   this.tokenCollector = tokenCollector;
}

// -------------------------------------------------


// Collector callback: records a word with its absolute offsets in the input
public void collect(String word, int offset, int end) {
    tokenCollector.collect(word, this.offset + offset, this.offset + end);
}

// -------------------------------------------------
public Token next() throws IOException {
    // when tokenIteractor is exhausted, pull more data from the reader
    while (tokenIteractor == null || !tokenIteractor.hasNext()) {
        int read = 0;
        // number of characters still left in the buffer before the next read;
        // a negative value means no read from the reader is needed right now
        int remainning = -1;
        if (dissected >= beef.length()) {
            remainning = 0;
        } else if (dissected < 0) {
            remainning = bufferLength + dissected;
        }
        if (remainning >= 0) {
            if (remainning > 0) {
                System.arraycopy(buffer, -dissected, buffer, 0, remainning);
            }
            read = this.input.read(buffer, remainning, bufferLength - remainning);
            inputLength += read;
            int charCount = remainning + read;
            if (charCount < 0) {
                // the reader is exhausted; return null as the next() contract requires
                return null;
            }
            if (charCount < bufferLength) {
                buffer[charCount++] = 0;
            }
            // build the "beef" and let the knife dissect it
            beef.set(0, charCount);
            offset += Math.abs(dissected);
            dissected = 0;
        }
        dissected = knife.dissect((Collector) this, beef, dissected);
        tokenIteractor = tokenCollector.iterator();
    }
    // return the next Token from tokenIteractor
    return (Token) tokenIteractor.next();
}

public int getInputLength() {
   return inputLength;
}

// Re-implemented reset(input); note that it must declare throws IOException.
public void reset(Reader input) throws IOException {
    this.input = input;
    this.inputLength = 0;
    this.offset = 0;
    this.dissected = 0;
    this.tokenIteractor = null;
    this.beef.set(0, 0);
}
}
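
A quick smoke test of the adjusted tokenizer can be run outside Solr. This is a minimal sketch: it assumes the Paoding dictionaries are reachable (for example via the PAODING_DIC_HOME environment variable), and the sample sentence is arbitrary:

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingTokenizer;
import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.PaodingMaker;

import org.apache.lucene.analysis.Token;

public class PaodingSmokeTest {
    public static void main(String[] args) throws Exception {
        PaodingTokenizer tokenizer = new PaodingTokenizer(
                new StringReader("中华人民共和国"),
                PaodingMaker.make(),
                new MostWordsTokenCollector());
        Token token;
        while ((token = tokenizer.next()) != null) {
            // print each term with its character offsets in the input
            System.out.println(token.term()
                    + " [" + token.startOffset() + "," + token.endOffset() + ")");
        }
        tokenizer.close();
    }
}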

 

 

In addition, the create method of the existing Chinese tokenizer factory must have its return type changed from TokenStream to Tokenizer, matching Solr 1.4's TokenizerFactory interface. The code is as follows:

package net.paoding;

import java.io.Reader;
import java.util.Map;

import net.paoding.analysis.analyzer.PaodingTokenizer;
import net.paoding.analysis.analyzer.TokenCollector;
import net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector;
import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.PaodingMaker;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.solr.analysis.BaseTokenizerFactory;


/**
* Solr 1.4 paoding tokenizer factory
*
*/
public class ChineseTokenizerFactory extends BaseTokenizerFactory {

/**
 * Most-words segmentation; the default.
 */
public static final String MOST_WORDS_MODE = "most-words";

/**
 * Max-word-length segmentation.
 */
public static final String MAX_WORD_LENGTH_MODE = "max-word-length";

private String mode = null;

public void setMode(String mode) {
    if (mode == null || MOST_WORDS_MODE.equalsIgnoreCase(mode)
            || "default".equalsIgnoreCase(mode)) {
        this.mode = MOST_WORDS_MODE;
    } else if (MAX_WORD_LENGTH_MODE.equalsIgnoreCase(mode)) {
        this.mode = MAX_WORD_LENGTH_MODE;
    } else {
        throw new IllegalArgumentException("Illegal analyzer mode parameter: " + mode);
    }
}

public void init(Map<String, String> args) {
    super.init(args);
    setMode(args.get("mode"));
}

public Tokenizer create(Reader input) {
    return new PaodingTokenizer(input, PaodingMaker.make(), createTokenCollector());
}

private TokenCollector createTokenCollector() {
    if (MOST_WORDS_MODE.equals(mode)) {
        return new MostWordsTokenCollector();
    }
    if (MAX_WORD_LENGTH_MODE.equals(mode)) {
        return new MaxWordLengthTokenCollector();
    }
    throw new Error("never happened");
}
}

 

 

To use this tokenizer, set the tokenizer of the relevant fieldType in schema.xml to the form below. Fill in the class attribute according to where your ChineseTokenizerFactory actually lives.

<tokenizer class="net.paoding.ChineseTokenizerFactory" mode="most-words"/>
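
In context, a complete fieldType plus a field using it might look like this (a sketch; the names text_paoding and content are arbitrary examples):

<fieldType name="text_paoding" class="solr.TextField">
  <analyzer>
    <tokenizer class="net.paoding.ChineseTokenizerFactory" mode="most-words"/>
  </analyzer>
</fieldType>

<field name="content" type="text_paoding" indexed="true" stored="true"/>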

The above covers configuring Paoding for Solr 1.4. If you are upgrading from Solr 1.3, be sure to note the following:

When upgrading, check that the type of each field defined in schema.xml still matches what you intend. Pay particular attention to int fields: Solr 1.4 redefines int as a trie-based type (tint behavior), so for backward compatibility change int fields to pint, and likewise change long to plong, float to pfloat, and so on. The exact mapping can be worked out by comparing the fieldType definitions in the Solr 1.3 schema against those in Solr 1.4, as sketched below.
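
For illustration, the relevant declarations in the Solr 1.4 example schema look roughly like this (reproduced from memory; verify against the schema.xml that ships with your Solr 1.4 distribution):

<!-- Solr 1.4: the plain names are now trie-based -->
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>

<!-- the old Solr 1.3 plain types survive under the p* names -->
<fieldType name="pint" class="solr.IntField" omitNorms="true"/>
<fieldType name="plong" class="solr.LongField" omitNorms="true"/>
<fieldType name="pfloat" class="solr.FloatField" omitNorms="true"/>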

 

This article comes from 幸福的小小仙
(xian0617@qq.com / xian0617@163.com)
