The implementation of Lucene's analyzers.
This article looks at Lucene's tokenizer class, Tokenizer, and the implementations of its subclasses.
The inheritance hierarchy of the Tokenizer class: both ChineseTokenizer (package org.apache.lucene.analysis.cn) and CJKTokenizer (package org.apache.lucene.analysis.cjk), discussed below, are concrete subclasses of the abstract class Tokenizer, which in turn extends TokenStream.
The ChineseTokenizer class implements Chinese tokenization
Chinese tokenization in Lucene is handled very simply: the text is split into individual characters. The implementing class is ChineseTokenizer, in the package org.apache.lucene.analysis.cn; its source code is as follows:
package org.apache.lucene.analysis.cn;

import java.io.Reader;
import org.apache.lucene.analysis.*;

public final class ChineseTokenizer extends Tokenizer {

    public ChineseTokenizer(Reader in) {
        input = in;
    }

    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;
    private final char[] buffer = new char[MAX_WORD_LEN];
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private int length;
    private int start;

    private final void push(char c) { // append one character of the current token to the buffer
        if (length == 0)
            start = offset - 1; // first character of the token: record its start offset
        buffer[length++] = Character.toLowerCase(c); // normalize to lowercase (only affects cased letters) and store
    }

    private final Token flush() { // build a Token from the characters accumulated in the buffer
        if (length > 0) {
            return new Token(new String(buffer, 0, length), start, start + length);
        } else
            return null;
    }

    public final Token next() throws java.io.IOException { // return the next token, or null at end of input
        length = 0;
        start = offset;

        while (true) {
            final char c;
            offset++;

            if (bufferIndex >= dataLen) { // the I/O buffer is exhausted: refill it from the input Reader
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            if (dataLen == -1)
                return flush();
            else
                c = ioBuffer[bufferIndex++];

            switch (Character.getType(c)) {

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                push(c); // digits and Latin letters are accumulated into a single token
                if (length == MAX_WORD_LEN)
                    return flush();
                break;

            case Character.OTHER_LETTER: // e.g. a Chinese character: each one becomes a token of its own
                if (length > 0) {
                    bufferIndex--;
                    offset--;
                    return flush();
                }
                push(c);
                return flush();

            default:
                if (length > 0)
                    return flush();
                break;
            }
        }
    }
}
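To make the behavior of next() concrete, here is a small driver; it is only a sketch, assuming the Lucene 2.x-era Token API used throughout this article (next() returning a Token, termText() and offset accessors), and the class name and sample string are illustrative. Runs of digits and Latin letters come out as one token, while each character of type OTHER_LETTER (for example a Chinese character) becomes a token of its own.

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.ChineseTokenizer;

public class ChineseTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Mixed input: an ASCII word followed by Chinese characters.
        ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader("lucene全文检索"));

        // Expected tokens: "lucene" as one token, then one token per Chinese character:
        // 全, 文, 检, 索
        for (Token t = tokenizer.next(); t != null; t = tokenizer.next()) {
            System.out.println(t.termText() + " (" + t.startOffset() + "," + t.endOffset() + ")");
        }
    }
}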
Another tokenizer class, CJKTokenizer, is also worth mentioning here. It handles tokenization somewhat better than ChineseTokenizer, but it has drawbacks of its own. The Lucene source code gives an example:
If the Chinese word C1C2C3C4 is indexed, ChineseTokenizer returns the tokens C1, C2, C3, C4, whereas CJKTokenizer returns the tokens C1C2, C2C3, C3C4.
The problem is this: when the search keyword is C1, C1C2, C1C3, C4C2, C1C2C3, and so on, an index built with ChineseTokenizer can still find the document (every single character is indexed), while an index built with CJKTokenizer cannot match some of these queries (for example C1 alone, or the non-adjacent pair C1C3), because only adjacent character bigrams were indexed.
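The difference is easy to observe by running both tokenizers over the same string. The following is a minimal sketch against the Lucene 2.x-era API shown in this article (Tokenizer constructed from a Reader, Token returned by next()); the class name, the printTokens helper, and the sample string are illustrative only.

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKTokenizer;
import org.apache.lucene.analysis.cn.ChineseTokenizer;

public class TokenizerComparison {

    // Drain a tokenizer and print each token with its offsets.
    static void printTokens(String label, Tokenizer tokenizer) throws Exception {
        System.out.print(label + ": ");
        for (Token t = tokenizer.next(); t != null; t = tokenizer.next()) {
            System.out.print("[" + t.termText() + " " + t.startOffset() + "," + t.endOffset() + "] ");
        }
        System.out.println();
    }

    public static void main(String[] args) throws Exception {
        String text = "中华人民共和国"; // plays the role of C1C2C3C4... in the example above

        // ChineseTokenizer: one token per Chinese character (unigrams)
        printTokens("ChineseTokenizer", new ChineseTokenizer(new StringReader(text)));

        // CJKTokenizer: overlapping two-character tokens (bigrams)
        printTokens("CJKTokenizer", new CJKTokenizer(new StringReader(text)));
    }
}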
The CJKTokenizer class implements Chinese tokenization
The source code of the CJKTokenizer class is shown below:
package org.apache.lucene.analysis.cjk;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

import java.io.Reader;

public final class CJKTokenizer extends Tokenizer {

    /** Max word length */
    private static final int MAX_WORD_LEN = 255;

    /** buffer size */
    private static final int IO_BUFFER_SIZE = 256;

    /** word offset, used to track which character of the input is being parsed */
    private int offset = 0;

    /** the index used only for ioBuffer */
    private int bufferIndex = 0;

    /** data length */
    private int dataLen = 0;

    /** character buffer, stores the characters of the token currently being built */
    private final char[] buffer = new char[MAX_WORD_LEN];

    /** I/O buffer, used to store the content of the input (one of the members of Tokenizer) */
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    /** word type: single=>ASCII double=>non-ASCII word=>default */
    private String tokenType = "word";

    private boolean preIsTokened = false;

    public CJKTokenizer(Reader in) {
        input = in;
    }

    public final Token next() throws java.io.IOException {
        int length = 0;

        /** the position used to create the Token */
        int start = offset;

        while (true) {
            /** current character */
            char c;

            /** Unicode block of the current character */
            Character.UnicodeBlock ub;

            offset++;

            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            if (dataLen == -1) {
                if (length > 0) {
                    if (preIsTokened == true) {
                        // the remaining character was already emitted with the previous bigram
                        length = 0;
                        preIsTokened = false;
                    }
                    break;
                } else {
                    return null;
                }
            } else {
                // get current character
                c = ioBuffer[bufferIndex++];

                // get the UnicodeBlock of the current character
                ub = Character.UnicodeBlock.of(c);
            }

            // if the current character is ASCII or Extended ASCII
            if ((ub == Character.UnicodeBlock.BASIC_LATIN)
                    || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)) {
                if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
                    // convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                    int i = (int) c;
                    i = i - 65248;
                    c = (char) i;
                }

                // if the current character is a letter, a digit, or "_" "+" "#"
                if (Character.isLetterOrDigit(c)
                        || ((c == '_') || (c == '+') || (c == '#'))) {
                    if (length == 0) {
                        // "javaC1C2C3C4linux"
                        //  ^--: the current character begins an ASCII token
                        start = offset - 1;
                    } else if (tokenType == "double") {
                        // the buffer holds non-ASCII characters: push the current character back
                        offset--;
                        bufferIndex--;
                        tokenType = "single";

                        if (preIsTokened == true) {
                            // there is only one non-ASCII character stored, already covered by the previous token
                            length = 0;
                            preIsTokened = false;
                            break;
                        } else {
                            break;
                        }
                    }

                    // store the lowercased character in the buffer
                    buffer[length++] = Character.toLowerCase(c);
                    tokenType = "single";

                    // break the procedure if the buffer is full
                    if (length == MAX_WORD_LEN) {
                        break;
                    }
                } else if (length > 0) {
                    if (preIsTokened == true) {
                        length = 0;
                        preIsTokened = false;
                    } else {
                        break;
                    }
                }
            } else {
                // non-ASCII letter, e.g. "C1C2C3C4"
                if (Character.isLetter(c)) {
                    if (length == 0) {
                        start = offset - 1;
                        buffer[length++] = c;
                        tokenType = "double";
                    } else {
                        if (tokenType == "single") {
                            // return the previous ASCII characters first
                            offset--;
                            bufferIndex--;
                            break;
                        } else {
                            buffer[length++] = c;
                            tokenType = "double";

                            if (length == 2) {
                                // a bigram is complete; step back so the second character
                                // also starts the next bigram (overlapping bigrams)
                                offset--;
                                bufferIndex--;
                                preIsTokened = true;
                                break;
                            }
                        }
                    }
                } else if (length > 0) {
                    if (preIsTokened == true) {
                        // empty the buffer
                        length = 0;
                        preIsTokened = false;
                    } else {
                        break;
                    }
                }
            }
        }

        return new Token(new String(buffer, 0, length), start, start + length, tokenType);
    }
}
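Stripped of the buffering and ASCII handling, the core of next() for a run of CJK characters is an overlapping bigram window: after emitting a bigram, the tokenizer steps back one character (offset--, bufferIndex--, preIsTokened = true) so that the second character also starts the next bigram. The helper below is not part of Lucene; it is only a sketch of that sliding-window idea, with an illustrative class name and sample string.

import java.util.ArrayList;
import java.util.List;

public class BigramSketch {

    // Produce overlapping two-character tokens from a run of CJK characters,
    // mirroring what CJKTokenizer emits for "C1C2C3C4": C1C2, C2C3, C3C4.
    static List<String> bigrams(String cjkRun) {
        List<String> tokens = new ArrayList<String>();
        if (cjkRun.length() == 1) {
            tokens.add(cjkRun); // a lone CJK character with no neighbor is emitted by itself
            return tokens;
        }
        for (int i = 0; i + 1 < cjkRun.length(); i++) {
            tokens.add(cjkRun.substring(i, i + 2));
        }
        return tokens;
    }

    public static void main(String[] args) {
        System.out.println(bigrams("中华人民")); // [中华, 华人, 人民]
    }
}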