package com.sunzc.analysis.analyzer;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.Modifier;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.CloseableThreadLocal;
/**
* An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
* <p>
* Typical implementations first build a Tokenizer, which breaks the stream of
* characters from the Reader into raw Tokens. One or more TokenFilters may then
* be applied to the output of the Tokenizer.
* <p>
* The {@code Analyzer}-API in Lucene is based on the decorator pattern.
* Therefore all non-abstract subclasses must be final or their
* {@link #tokenStream} and {@link #reusableTokenStream} implementations must be
* final! This is checked when Java assertions are enabled.
*
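* <p>
* A minimal sketch of a concrete subclass (the class name used here is
* illustrative, and it assumes org.apache.lucene.analysis.WhitespaceTokenizer
* with its single-argument {@code WhitespaceTokenizer(Reader)} constructor is
* available in this Lucene version):
* <pre>
* public final class SimpleWhitespaceAnalyzer extends Analyzer {
*   // final class, so the assertion in Analyzer's constructor is satisfied
*   public TokenStream tokenStream(String fieldName, Reader reader) {
*     // break the Reader's characters into whitespace-separated raw tokens
*     return new WhitespaceTokenizer(reader);
*   }
* }
* </pre>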
*/
public abstract class Analyzer implements Closeable {
protected Analyzer() {
super();
assert assertFinal();
}
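// Runs only when assertions are enabled: checks that the concrete class, or at
// least its tokenStream() and reusableTokenStream() methods, is final, so the
// decorator contract described in the class comment cannot be broken by further
// subclassing.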
private boolean assertFinal() {
try {
final Class<?> clazz = getClass();
assert clazz.isAnonymousClass()
|| (clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0
|| (Modifier.isFinal(clazz.getMethod("tokenStream",
String.class, Reader.class).getModifiers()) && Modifier
.isFinal(clazz.getMethod("reusableTokenStream",
String.class, Reader.class).getModifiers())) : "Analyzer implementation classes or at least their tokenStream() and reusableTokenStream() implementations must be final";
return true;
} catch (NoSuchMethodException nsme) {
return false;
}
}
/**
* Creates a TokenStream which tokenizes all the text in the provided
* Reader. Must be able to handle null field name for backward
* compatibility.
*/
public abstract TokenStream tokenStream(String fieldName, Reader reader);
/**
* Creates a TokenStream that is allowed to be re-used from the previous
* time that the same thread called this method. Callers that do not need to
* use more than one TokenStream at the same time from this analyzer should
* use this method for better performance.
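* <p>
* A minimal sketch of the usual reuse pattern, built on
* {@link #getPreviousTokenStream} and {@link #setPreviousTokenStream}
* (the Tokenizer subclass chosen here, WhitespaceTokenizer, is an
* illustrative assumption, not a requirement of this class):
* <pre>
* public TokenStream reusableTokenStream(String fieldName, Reader reader)
*     throws IOException {
*   Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
*   if (tokenizer == null) {
*     // first call on this thread: create a tokenizer and remember it
*     tokenizer = new WhitespaceTokenizer(reader);
*     setPreviousTokenStream(tokenizer);
*   } else {
*     // later calls: point the saved tokenizer at the new Reader
*     tokenizer.reset(reader);
*   }
*   return tokenizer;
* }
* </pre>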
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader)
throws IOException {
return tokenStream(fieldName, reader);
}
private CloseableThreadLocal<Object> tokenStreams = new CloseableThreadLocal<Object>();
/**
* Used by Analyzers that implement reusableTokenStream to retrieve
* previously saved TokenStreams for re-use by the same thread.
*/
protected Object getPreviousTokenStream() {
try {
return tokenStreams.get();
} catch (NullPointerException npe) {
if (tokenStreams == null) {
throw new AlreadyClosedException("this Analyzer is closed");
} else {
throw npe;
}
}
}
/**
* Used by Analyzers that implement reusableTokenStream to save a
* TokenStream for later re-use by the same thread.
*/
protected void setPreviousTokenStream(Object obj) {
try {
tokenStreams.set(obj);
} catch (NullPointerException npe) {
if (tokenStreams == null) {
throw new AlreadyClosedException("this Analyzer is closed");
} else {
throw npe;
}
}
}
/**
* Invoked before indexing a Fieldable instance if terms have already been
* added to that field. This allows custom analyzers to place an automatic
* position increment gap between Fieldable instances using the same field
* name. The default position increment gap is 0. With a 0 position
* increment gap and the typical default token position increment of 1, all
* terms in a field, including across Fieldable instances, are in successive
* positions, allowing exact PhraseQuery matches, for instance, across
* Fieldable instance boundaries.
*
* @param fieldName
* Fieldable name being indexed.
* @return position increment gap, added to the next token emitted from
* {@link #tokenStream(String,Reader)}
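* <p>
* A minimal sketch of an override that keeps a PhraseQuery from matching
* across Fieldable instances of the same field (the gap of 100 is an
* arbitrary illustrative value):
* <pre>
* public int getPositionIncrementGap(String fieldName) {
*   // leave 100 unused positions between values of the same field,
*   // so phrases cannot span two Fieldable instances
*   return 100;
* }
* </pre>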
*/
public int getPositionIncrementGap(String fieldName) {
return 0;
}
/**
* Just like {@link #getPositionIncrementGap}, except for Token offsets
* instead. By default this returns 1 for tokenized fields, as if the
* fields were joined with an extra space character, and 0 for un-tokenized
* fields. This method is only called if the field produced at least one
* token for indexing.
*
* @param field
* the field just indexed
* @return offset gap, added to the next token emitted from
* {@link #tokenStream(String,Reader)}
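* <p>
* A minimal sketch of an override that widens the gap for tokenized fields
* (the value 10 is an arbitrary illustrative choice):
* <pre>
* public int getOffsetGap(Fieldable field) {
*   // separate the character offsets of successive Fieldable instances
*   // by 10 instead of the default 1; un-tokenized fields keep no gap
*   return field.isTokenized() ? 10 : 0;
* }
* </pre>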
*/
public int getOffsetGap(Fieldable field) {
if (field.isTokenized())
return 1;
else
return 0;
}
/** Frees persistent resources used by this Analyzer */
public void close() {
tokenStreams.close();
tokenStreams = null;
}
}