package com.sunzc.analysis.analyzer;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.Modifier;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.CloseableThreadLocal;
/**
* An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
* <p>
* Typical implementations first build a Tokenizer, which breaks the stream of
* characters from the Reader into raw Tokens. One or more TokenFilters may then
* be applied to the output of the Tokenizer.
* <p>
* The {@code Analyzer}-API in Lucene is based on the decorator pattern.
* Therefore all non-abstract subclasses must be final or their
* {@link #tokenStream} and {@link #reusableTokenStream} implementations must be
* final! This is checked when Java assertions are enabled.
*
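* <p>
* A minimal sketch of a concrete subclass (the class name used here is
* illustrative, and it assumes org.apache.lucene.analysis.WhitespaceTokenizer
* with its single-argument {@code WhitespaceTokenizer(Reader)} constructor is
* available in this Lucene version):
* <pre>
* public final class SimpleWhitespaceAnalyzer extends Analyzer {
*   // final class, so the assertion in Analyzer's constructor is satisfied
*   public TokenStream tokenStream(String fieldName, Reader reader) {
*     // break the Reader's characters into whitespace-separated raw tokens
*     return new WhitespaceTokenizer(reader);
*   }
* }
* </pre>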
*/
public abstract class Analyzer implements Closeable {
protected Analyzer() {
super();
assert assertFinal();
}
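// Runs only when assertions are enabled: checks that the concrete class, or at
// least its tokenStream() and reusableTokenStream() methods, is final, so the
// decorator contract described in the class comment cannot be broken by further
// subclassing.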
private boolean assertFinal() {
try {
final Class<?> clazz = getClass();
assert clazz.isAnonymousClass()
|| (clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0
|| (Modifier.isFinal(clazz.getMethod("tokenStream",
String.class, Reader.class).getModifiers()) && Modifier
.isFinal(clazz.getMethod("reusableTokenStream",
String.class, Reader.class).getModifiers())) : "Analyzer implementation classes or at least their tokenStream() and reusableTokenStream() implementations must be final";
return true;
} catch (NoSuchMethodException nsme) {
return false;
}
}
/**
* Creates a TokenStream which tokenizes all the text in the provided
* Reader. Must be able to handle null field name for backward
* compatibility.
*/
public abstract TokenStream tokenStream(String fieldName, Reader reader);
/**
* Creates a TokenStream that is allowed to be re-used from the previous
* time that the same thread called this method. Callers that do not need to
* use more than one TokenStream at the same time from this analyzer should
* use this method for better performance.
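* <p>
* A minimal sketch of the usual reuse pattern, built on
* {@link #getPreviousTokenStream} and {@link #setPreviousTokenStream}
* (the Tokenizer subclass chosen here, WhitespaceTokenizer, is an
* illustrative assumption, not a requirement of this class):
* <pre>
* public TokenStream reusableTokenStream(String fieldName, Reader reader)
*     throws IOException {
*   Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
*   if (tokenizer == null) {
*     // first call on this thread: create a tokenizer and remember it
*     tokenizer = new WhitespaceTokenizer(reader);
*     setPreviousTokenStream(tokenizer);
*   } else {
*     // later calls: point the saved tokenizer at the new Reader
*     tokenizer.reset(reader);
*   }
*   return tokenizer;
* }
* </pre>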
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
return tokenStream(fieldName, reader);
}
private CloseableThreadLocal<Object> tokenStreams = new CloseableThreadLocal<Object>();
/**
* Used by Analyzers that implement reusableTokenStream to retrieve
* previously saved TokenStreams for re-use by the same thread.
*/
protected Object getPreviousTokenStream() {
try {
return tokenStreams.get();
} catch (NullPointerException npe) {
if (tokenStreams == null) {
throw new AlreadyClosedException("this Analyzer is closed");
} else {
throw npe;
}
}
}
/**
* Used by Analyzers that implement reusableTokenStream to save a
* TokenStream for later re-use by the same thread.
*/
protected void setPreviousTokenStream(Object obj) {
try {
tokenStreams.set(obj);
} catch (NullPointerException npe) {
if (tokenStreams == null) {
throw new AlreadyClosedException("this Analyzer is closed");
} else {
throw npe;
}
}
}
/**
* Invoked before indexing a Fieldable instance if terms have already been
* added to that field. This allows custom analyzers to place an automatic
* position increment gap between Fieldable instances using the same field
* name. The default position increment gap is 0. With a 0 position
* increment gap and the typical default token position increment of 1, all
* terms in a field, including across Fieldable instances, are in successive
* positions, allowing exact PhraseQuery matches, for instance, across
* Fieldable instance boundaries.
*
* @param fieldName
* Fieldable name being indexed.
* @return position increment gap, added to the next token emitted from
* {@link #tokenStream(String,Reader)}
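* <p>
* A minimal sketch of an override that keeps a PhraseQuery from matching
* across Fieldable instances of the same field (the gap of 100 is an
* arbitrary illustrative value):
* <pre>
* public int getPositionIncrementGap(String fieldName) {
*   // leave 100 unused positions between values of the same field,
*   // so phrases cannot span two Fieldable instances
*   return 100;
* }
* </pre>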
*/
public int getPositionIncrementGap(String fieldName) {
return 0;
}
/**
* Just like {@link #getPositionIncrementGap}, except for Token offsets
* instead. By default this returns 1 for tokenized fields, as if the
* fields were joined with an extra space character, and 0 for un-tokenized
* fields. This method is only called if the field produced at least one
* token for indexing.
*
* @param field
* the field just indexed
* @return offset gap, added to the next token emitted from
* {@link #tokenStream(String,Reader)}
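* <p>
* A minimal sketch of an override that widens the gap for tokenized fields
* (the value 10 is an arbitrary illustrative choice):
* <pre>
* public int getOffsetGap(Fieldable field) {
*   // separate the character offsets of successive Fieldable instances
*   // by 10 instead of the default 1; un-tokenized fields keep no gap
*   return field.isTokenized() ? 10 : 0;
* }
* </pre>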
*/
public int getOffsetGap(Fieldable field) {
if (field.isTokenized())
return 1;
else
return 0;
}
/** Frees persistent resources used by this Analyzer */
public void close() {
tokenStreams.close();
tokenStreams = null;
}
}