测试代码
// Test driver: tokenize a Chinese sentence with StandardAnalyzer and print
// each token's text and type.
public static void main(String[] args)
{
    // Text to analyze (string literal used directly; new String(...) was redundant).
    String str = "我爱武大但我更爱中国";
    // Fixed: original had fused tokens "newStandardAnalyzer" / "newStringReader".
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    TokenStream ts = analyzer.tokenStream(null, new StringReader(str));
    // Grab the term-text and token-type attributes from the stream; the same
    // attribute instances are updated in place on every incrementToken() call.
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
    try
    {
        // incrementToken() returns false once the stream is exhausted.
        while (ts.incrementToken())
        {
            System.out.println(termAtt.term());
            System.out.println(typeAtt.type());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
先分析AttributeSource类:
此类中定义了两个内部类:AttributeFactory 与 DefaultAttributeFactory,后者继承前者。
AttributeFactory有:抽象方法:createAttributeInstance
类成员:DefaultAttributeFactory类实例对象:
DEFAULT_ATTRIBUTE_FACTORY。
DefaultAttributeFactory有方法:createAttributeInstance
getClassForInterface
类成员:attClassImplMap
类型:WeakHashMap&lt;Class&lt;? extends Attribute&gt;, WeakReference&lt;Class&lt;? extends AttributeImpl&gt;&gt;&gt;
此weakhashmap存放的键值对形如:
(A.class,Aimpl.class)
(B.class,Bimpl.class)
getClassForInterface:参数为CLASS类型
函数功能:根据入参 查找weakhashmap,返回对应的Class类型的对象;
若weakhashmap中无此类型的键值对,则创建并返回
createAttributeInstance:参数为CLASS类型:
函数功能:调用上方法,增加了异常处理部分的代码。
下面继续分析AttributeSource类其他部分:
成员变量:
Map&lt;Class&lt;? extends Attribute&gt;, AttributeImpl&gt; attributes;
Map&lt;Class&lt;? extends AttributeImpl&gt;, AttributeImpl&gt; attributeImpls;
WeakHashMap&lt;Class&lt;? extends AttributeImpl&gt;,
LinkedList&lt;WeakReference&lt;Class&lt;? extends Attribute&gt;&gt;&gt;&gt;
knownImplClasses
AttributeFactory类型对象:factory
构造方法:AttributeSource()
它调用下个构造方法
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
AttributeSource(AttributeFactory factory)
此方法利用factory来初始化AttributeSource对象:
实例化 attributes; attributeImpls;
AttributeSource(AttributeSource input)
其他部分方法:addAttribute
getAttribute 入参:Class类型的一个实例
返回该类型实例的一个实例
StandardTokenizer类分析:继承自Tokenizer
(继承细节:Tokenizer → TokenStream → AttributeSource)
成员变量:StandardTokenizerImpl 类型对象scanner
String[] 类型的数组:记录各种token types。
Token 最大长度。
构造方法:1根据继承关系由上而下调用相应构造方法
2创建一个StandardTokenizerImpl对象
其构造方法:StandardTokenizerImpl(Reader in)
{ this.zzReader = in; }
3初始方法init(Reader input, Version matchVersion)
方法内容如下:
if (matchVersion.onOrAfter(Version.LUCENE_24)) {
    replaceInvalidAcronym = true;
} else {
    replaceInvalidAcronym = false;
}
this.input = input;
termAtt = addAttribute(TermAttribute.class);
offsetAtt = addAttribute(OffsetAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
特注:input成员由Tokenizer继承而来
addAttribute方法由AttributeSource继承而来
返回该类的一个实例,如果不存在,则创建一个实例并返回之
下面分析: StandardFilter类
继承自TokenFilter TokenFilter 继承自 TokenStream
成员变量: TypeAttribute typeAtt;
TermAttribute termAtt;
构造方法:根据继承关系由上而下调用相应的构造方法:
AttributeSource
TokenStream
TokenFilter 初始化其成员变量input
(特注: TokenFilter 有成员变量:TokenStream input
termAtt = addAttribute(TermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
LowerCaseFilter类分析
继承于:TokenFilter
成员变量:private TermAttribute termAtt;
构造方法:根据继承关系由上而下调用相应的构造方法:
AttributeSource
TokenStream
TokenFilter 初始化其成员变量input
(特注: TokenFilter 有成员变量:TokenStream input
termAtt = addAttribute(TermAttribute.class);
下面分析incrementToken方法:返回第一个非停词
首先分析其调用流程:
1.main方法中 while(ts.incrementToken())
此处将调用StopFilter 的incrementToken
2.StopFilter 的incrementToken: while(ts.incrementToken())
此处将调用.LowerCaseFilter 的 incrementToken:
3.LowerCaseFilter 的 incrementToken: while(ts.incrementToken())
此处将调用StandardFilter 的incrementToken
4.StandardFilter 的incrementToken : while(ts.incrementToken()){
此处将调用StandardTokenizer 的incrementToken()方法。
5.StandardTokenizer的incrementToken : int tokenType=scanner.getNextToken();
将调用scanner的getNextToken方法。
注:StandardTokenizer有个成员变量:StandardTokenizerImpl scanner
在StandardTokenizer的构造方法中为此引用scanner 实例化一个StandardTokenizerImpl对象
StandardTokenizerImpl(java.io.Reader in) {
this.zzReader = in;
}//StandardTokenizerImpl 有Reader类型的成员变量。
回顾tokenStream方法:
public TokenStream tokenStream(StringfieldName, Reader reader) {
StandardTokenizer tokenStream = new StandardTokenizer(matchVersion,reader);
tokenStream.setMaxTokenLength(maxTokenLength);
TokenStream result = new StandardFilter(tokenStream);
result = new LowerCaseFilter(result);
result = new StopFilter(enableStopPositionIncrements,result, stopSet);
return result;
}
不难发现 incrementToken方法的调用方向与tokenStream中过滤器的包装方向恰好相反。
首先分析StandardTokenizer的incrementToken方法
public final boolean incrementToken() throws IOException
{
clearAttributes(); //清除所有Attribure
int posIncr = 1;
while(true)
{
int tokenType = scanner.getNextToken();//获得匹配的表达式
if(tokenType == StandardTokenizerImpl.YYEOF) {//否到达文件末尾
returnfalse;
}
//当前匹配的单词长达不大于最大长度
if(scanner.yylength() <= maxTokenLength){
//获得当前匹配的单词的位置,并保存在PositionIncrementAttribute类型变
//量postIncrAtt中
posIncrAtt.setPositionIncrement(posIncr)
//获得当前单词放在TermAttribute类型变量termAtt中
scanner.getText(termAtt);
//获得当前token的开始offset和和结束处offset
final int start = scanner.yychar();
//将当前token开始和结束处的偏移量存在OffsetAttribute型变量offsetAtt中
offsetAtt.setOffset(correctOffset(start), correctOffset
(start+termAtt.termLength()));
if (tokenType ==StandardTokenizerImpl.ACRONYM_DEP)
{
if (replaceInvalidAcronym) {
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES
[StandardTokenizerImpl.HOST]);
termAtt.setTermLength(termAtt.termLength()- 1); // remove extra '.'
} else {
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES
[StandardTokenizerImpl.ACRONYM]);
}
} else {
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
}
return true;
} else
// When we skip a too-long term, westill increment the
// position increment
posIncr++;//遇到过长的token将直接skip调,再次进入循环
}
}
StandardFilter的incrementToken方法:
publicfinalboolean incrementToken() throws java.io.IOException
{
if (!input.incrementToken()) {//此input从tokenFilter中继承而来,是TokenStream类型
returnfalse;
}
//当前token在termAtt的buffer中
char[] buffer = termAtt.termBuffer();
finalint bufferLength = termAtt.termLength();//获得当前token长度
final String type = typeAtt.type();
//以下if如果此token以‘s结尾,则去掉最后两个字符,remove ’s
if (type == APOSTROPHE_TYPE&&bufferLength >= 2&&
buffer[bufferLength-2] == '\'' &&
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
termAtt.setTermLength(bufferLength - 2);
}
除掉此token中所有点好(removes all dots)
elseif (type == ACRONYM_TYPE)
{
int upto = 0;
for(int i=0;i<bufferLength;i++)
{
char c = buffer[i];
if (c != '.')
buffer[upto++] =c;
}
termAtt.setTermLength(upto);
}
returntrue;
}
}
LowerCaseFilter的incrementToken的方法:将token中字母转小写
publicfinalboolean incrementToken() throws IOException {
if (input.incrementToken()) {
finalchar[] buffer = termAtt.termBuffer();
finalint length = termAtt.termLength();
for(int i=0;i<length;i++)
buffer[i] = Character.toLowerCase(buffer[i]);
returntrue;
} else
returnfalse;
}
}
StopFilter 的incrementToken方法 //跳过停词
publicfinalboolean incrementToken() throws IOException
{
int skippedPositions = 0;
while (input.incrementToken())
{
if (!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength()))
{
if (enablePositionIncrements)
{
posIncrAtt.setPositionIncrement(
posIncrAtt.getPositionIncrement() + skippedPositions);
}
returntrue;
}
skippedPositions += posIncrAtt.getPositionIncrement();
}
// reached EOS -- return false
注:StandardTokenizer有个成员变量:StandardTokenizerImpl scanner
在StandardTokenizer的构造方法中为此引用scanner 实例化一个StandardTokenizerImpl对象
StandardTokenizerImpl(java.io.Reader in) { this.zzReader = in; }
首先分析StandardTokenizer 的incrementToken方法
public final booleanincrementToken() throws IOException
{ clearAttributes(); //清除所有Attribure
int posIncr = 1;
while(true){
int tokenType = scanner.getNextToken();//获得匹配的表达式
if (tokenType == StandardTokenizerImpl.YYEOF) {//否到达文件末尾
return false;
}
if (scanner.yylength() <= maxTokenLength){//当前匹配的单词长达不大于最大长度
posIncrAtt.setPositionIncrement(posIncr) //获得当前匹配的单词的位置,并保存在
sitionIncrementAttribute类型变量postIncrAtt中
scanner.getText(termAtt);//获得当前单词放在TermAttribute类型变量termAtt中
final int start = scanner.yychar();//获得当前token的开始和结束处offset
offsetAtt.setOffset(correctOffset(start),correctOffset
(start+termAtt.termLength()));//将当前token的开始处和结束处的偏移量保存在
OffsetAttribute 类型变量offsetAtt中
// This 'if' should be removed in the next release. For now,it converts
// invalid acronyms to HOST. When removed, only the 'else' partshould
// remain.
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
if (replaceInvalidAcronym) {
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES
[StandardTokenizerImpl.HOST]);
termAtt.setTermLength(termAtt.termLength() - 1); // remove extra'.'
} else {
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES
[StandardTokenizerImpl.ACRONYM]);
}
} else {
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
}
return true;
} else
// When we skip a too-long term, westill increment the
// position increment
posIncr++;//遇到过长的token将直接skip调,再次进入循环
}
}
StandardFilter的incrementToken方法:
public final booleanincrementToken() throws java.io.IOException {
if (!input.incrementToken()) {//此input从tokenFilter中继承而来,是TokenStream类型
return false;
}
//当前token在termAtt的buffer中
char[] buffer = termAtt.termBuffer();
final int bufferLength = termAtt.termLength();//获得当前token长度
final String type= typeAtt.type();
//以下if如果此token以‘s结尾,则去掉最后两个字符,remove ’s
if (type ==APOSTROPHE_TYPE&&bufferLength >= 2 &&
buffer[bufferLength-2] == '\''&&
(buffer[bufferLength-1] == 's' ||buffer[bufferLength-1] == 'S')) {
termAtt.setTermLength(bufferLength - 2);
}
除掉此token中所有点好(removes all dots)
else if (type ==ACRONYM_TYPE)
{
int upto = 0;
for(int i=0;i<bufferLength;i++)
{
char c = buffer[i];
if (c != '.')
buffer[upto++] = c;
}
termAtt.setTermLength(upto);
}
return true;
}
}
LowerCaseFilter 的incrementToken的 方法:将token中字母转小写
public final booleanincrementToken() throws IOException {
if (input.incrementToken()) {
final char[] buffer =termAtt.termBuffer();
final int length = termAtt.termLength();
for(int i=0;i<length;i++)
buffer[i] =Character.toLowerCase(buffer[i]);
return true;
} else
return false;
}
}
StopFilter 的incrementToken方法 //跳过停词
public final booleanincrementToken() throws IOException
{
int skippedPositions = 0;
while(input.incrementToken())
{
if(!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength()))
{
if (enablePositionIncrements)
{
posIncrAtt.setPositionIncrement(
posIncrAtt.getPositionIncrement()+ skippedPositions);
}
return true;
}
skippedPositions += posIncrAtt.getPositionIncrement();
}
// reached EOS -- return false
return false;//到达文件尾部
}
public TokenStream tokenStream(StringfieldName, Reader reader) {
StandardTokenizer tokenStream = new StandardTokenizer(matchVersion,reader);
tokenStream.setMaxTokenLength(maxTokenLength);
TokenStream result = new StandardFilter(tokenStream);
result = newLowerCaseFilter(result);
result = newStopFilter(enableStopPositionIncrements, result, stopSet);
returnresult;
}
|
|
|
|
|
| ||
|