LUCENE3.0 自学吧 6 中文分词

最新推荐文章于 2024-07-25 17:09:29 发布

sustbeckham

最新推荐文章于 2024-07-25 17:09:29 发布

阅读量568

点赞数

分类专栏： Lucene 文章标签： lucene character token buffer c forms

本文链接：https://blog.csdn.net/sustbeckham/article/details/5809861

版权

Lucene 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

ChineseTokenizer 是 Lucene 自带的中文分词器，缺点是只能按单字切分：每个汉字单独成为一个词条（连续的英文字母和数字仍会合并成一个词条，并转为小写）。

package org.apache.lucene.analysis.cn; import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.AttributeSource; public final class ChineseTokenizer extends Tokenizer { public ChineseTokenizer(Reader in) { super(in); init(); } public ChineseTokenizer(AttributeSource source, Reader in) { super(source, in); init(); } public ChineseTokenizer(AttributeFactory factory, Reader in) { super(factory, in); init(); } private void init() { termAtt = addAttribute(TermAttribute.class); offsetAtt = addAttribute(OffsetAttribute.class); } private int offset = 0, bufferIndex=0, dataLen=0; private final static int MAX_WORD_LEN = 255; //分词最大长度 private final static int IO_BUFFER_SIZE = 1024; private final char[] buffer = new char[MAX_WORD_LEN]; private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; private int length; private int start; private TermAttribute termAtt; private OffsetAttribute offsetAtt; private final void push(char c) { // 对待分词的文本进行预处理，输入到缓冲区buffer中 if (length == 0) start = offset-1; // 根据词条长度，设置起始位置索引 buffer[length++] = Character.toLowerCase(c); // 预处理：将中文Unicode码转化成小写 } private final boolean flush() { // 根据缓冲区预处理后的文本，构造词条 if (length>0) { termAtt.setTermBuffer(buffer, 0, length); offsetAtt.setOffset(correctOffset(start), correctOffset(start+length)); return true; } else return false; } @Override public boolean incrementToken() throws IOException { clearAttributes(); length = 0; start = offset; while (true) { final char c; offset++; if (bufferIndex >= dataLen) { dataLen = input.read(ioBuffer); bufferIndex = 0; } if (dataLen == -1) { offset--; return flush(); } else c = ioBuffer[bufferIndex++]; switch(Character.getType(c)) { case Character.DECIMAL_DIGIT_NUMBER: case Character.LOWERCASE_LETTER: case Character.UPPERCASE_LETTER: push(c); if (length == MAX_WORD_LEN) return 
flush(); break; case Character.OTHER_LETTER: if (length>0) { bufferIndex--; offset--; return flush(); } push(c); return flush(); default: if (length>0) return flush(); break; } } } @Override public final void end() { // set final offset final int finalOffset = correctOffset(offset); this.offsetAtt.setOffset(finalOffset, finalOffset); } @Override public void reset() throws IOException { super.reset(); offset = bufferIndex = dataLen = 0; } @Override public void reset(Reader input) throws IOException { super.reset(input); reset(); } }

CJKTokenizer 也好不到哪里去：它只是把相邻的两个汉字两两组合成词条（二元切分，相邻词条之间重叠一个字），并不做真正的词语切分。

CJKTokenizer 源码 :

package org.apache.lucene.analysis.cjk;

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

/**
 * Tokenizer that emits runs of ASCII letters/digits (plus '_', '+', '#') as
 * lower-cased tokens of type "single", and emits non-ASCII letters as
 * overlapping two-character tokens of type "double": the input "C1C2C3C4"
 * yields "C1C2", "C2C3", "C3C4". Every other character is a separator.
 */
public final class CJKTokenizer extends Tokenizer {
  /** Word token type; the initial value of {@code tokenType} and the value restored by reset(). */
  static final int WORD_TYPE = 0;

  /** Token type assigned while collecting ASCII characters. */
  static final int SINGLE_TOKEN_TYPE = 1;

  /** Token type assigned while collecting non-ASCII letters. */
  static final int DOUBLE_TOKEN_TYPE = 2;

  /** Names for token types, indexed by the *_TYPE constants above. */
  static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };

  /** Max word length; an ASCII run is cut into a token once it reaches this size. */
  private static final int MAX_WORD_LEN = 255;

  /** Size of the buffer passed to the reader on each read. */
  private static final int IO_BUFFER_SIZE = 256;

  //~ Instance fields --------------------------------------------------------

  /** Word offset: how many input characters have been consumed so far. */
  private int offset = 0;

  /** Index of the next unread character in ioBuffer. */
  private int bufferIndex = 0;

  /** Number of valid characters in ioBuffer; -1 once the reader reports end of input. */
  private int dataLen = 0;

  /**
   * Character buffer holding the characters that make up the token being
   * assembled.
   */
  private final char[] buffer = new char[MAX_WORD_LEN];

  /**
   * I/O buffer holding the characters most recently read from the input
   * reader.
   */
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  /** Type of the current token: single => ASCII, double => non-ASCII, word => default. */
  private int tokenType = WORD_TYPE;

  /**
   * Set when a two-character token has just been completed and its second
   * character was pushed back so that it can start the next pair:
   * "C1C2C3C4" -> C1C2, then "C2C3C4" -> C2C3, then "C3C4" -> C3C4.
   * While it is set, a lone buffered character has already been emitted as
   * the tail of the previous pair, so it is dropped rather than emitted
   * again when a separator, an ASCII character or end of input follows.
   */
  private boolean preIsTokened = false;

  private TermAttribute termAtt;
  private OffsetAttribute offsetAtt;
  private TypeAttribute typeAtt;

  //~ Constructors -----------------------------------------------------------

  /**
   * Construct a token stream processing the given input.
   *
   * @param in I/O reader
   */
  public CJKTokenizer(Reader in) {
    super(in);
    init();
  }

  /**
   * Construct a token stream over the given input using the supplied attribute source.
   *
   * @param source attribute source handed to the superclass constructor
   * @param in I/O reader
   */
  public CJKTokenizer(AttributeSource source, Reader in) {
    super(source, in);
    init();
  }

  /**
   * Construct a token stream over the given input using the supplied attribute factory.
   *
   * @param factory attribute factory handed to the superclass constructor
   * @param in I/O reader
   */
  public CJKTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
    init();
  }

  /** Registers the term, offset and type attributes populated for each token. */
  private void init() {
    termAtt = addAttribute(TermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
  }

  //~ Methods ----------------------------------------------------------------

  /**
   * Advances to the next token and fills in its term text, offsets and type.
   *
   * @return true if a token was produced, false once the input is exhausted
   * @throws IOException if reading from the input fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();
    while(true) { // loop until we find a non-empty token
      /* how many character(s) have been stored in buffer */
      int length = 0;

      /* the position used to create Token */
      int start = offset;

      while (true) { // loop until we've found a full token
        /* current character */
        char c;

        /* unicode block of the current character */
        Character.UnicodeBlock ub;

        // advance speculatively; every path that does not consume a
        // character undoes this with offset--
        offset++;

        // refill the I/O buffer once it has been fully consumed
        if (bufferIndex >= dataLen) {
          dataLen = input.read(ioBuffer);
          bufferIndex = 0;
        }

        if (dataLen == -1) {
          // end of input
          if (length > 0) {
            if (preIsTokened == true) {
              // the single buffered character was already emitted as the
              // second half of the previous pair: drop it
              length = 0;
              preIsTokened = false;
            }
            else{
              offset--;
            }

            break;
          } else {
            offset--;
            return false;
          }
        } else {
          //get current character
          c = ioBuffer[bufferIndex++];

          //get the UnicodeBlock of the current character
          ub = Character.UnicodeBlock.of(c);
        }

        //if the current character is ASCII or Extend ASCII
        if ((ub == Character.UnicodeBlock.BASIC_LATIN)
            || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
        ) {
          if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
            int i = (int) c;
            if (i >= 65281 && i <= 65374) {
              // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN:
              // code points 65281..65374 minus 65248 land on 33..126
              i = i - 65248;
              c = (char) i;
            }
          }

          // if the current character is a letter or digit, or "_" "+" "#"
          if (Character.isLetterOrDigit(c)
              || ((c == '_') || (c == '+') || (c == '#'))
          ) {
            if (length == 0) {
              // "javaC1C2C3C4linux"
              //  ^--: the current character begins an ASCII token
              start = offset - 1;
            } else if (tokenType == DOUBLE_TOKEN_TYPE) {
              // "javaC1C2C3C4linux"
              //              ^--: the previous character is non-ASCII,
              //                   the current character is ASCII
              // push the ASCII character back so the next pass re-reads it
              offset--;
              bufferIndex--;

              if (preIsTokened == true) {
                // only one non-ASCII character is buffered and it was
                // already emitted with the previous pair: drop it
                length = 0;
                preIsTokened = false;
                break;
              } else {
                break;
              }
            }

            // store the LowerCase(c) in the buffer
            buffer[length++] = Character.toLowerCase(c);
            tokenType = SINGLE_TOKEN_TYPE;

            // stop collecting once the token buffer is full
            if (length == MAX_WORD_LEN) {
              break;
            }
          } else if (length > 0) {
            // an ASCII separator ends the token being collected
            if (preIsTokened == true) {
              // leftover second half of an emitted pair: drop it and keep scanning
              length = 0;
              preIsTokened = false;
            } else {
              break;
            }
          }
        } else {
          // non-ASCII letter, e.g."C1C2C3C4"
          if (Character.isLetter(c)) {
            if (length == 0) {
              start = offset - 1;
              buffer[length++] = c;
              tokenType = DOUBLE_TOKEN_TYPE;
            } else {
              if (tokenType == SINGLE_TOKEN_TYPE) {
                // push the non-ASCII letter back for the next pass
                offset--;
                bufferIndex--;

                //return the previous ASCII characters
                break;
              } else {
                buffer[length++] = c;
                tokenType = DOUBLE_TOKEN_TYPE;

                if (length == 2) {
                  // pair complete: push the second character back so it
                  // also starts the next pair
                  offset--;
                  bufferIndex--;
                  preIsTokened = true;

                  break;
                }
              }
            }
          } else if (length > 0) {
            // a non-ASCII, non-letter character ends the token being collected
            if (preIsTokened == true) {
              // empty the buffer
              length = 0;
              preIsTokened = false;
            } else {
              break;
            }
          }
        }
      }

      if (length > 0) {
        termAtt.setTermBuffer(buffer, 0, length);
        offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
        typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
        return true;
      } else if (dataLen == -1) {
        // the buffered character was dropped at end of input: undo the
        // speculative offset++ before reporting exhaustion
        offset--;
        return false;
      }

      // Cycle back and try for the next token (don't
      // return an empty string)
    }
  }

  /** Sets both the start and end offset to the corrected final offset. */
  @Override
  public final void end() {
    // set final offset
    final int finalOffset = correctOffset(offset);
    this.offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /** Resets the read position and the token state to their initial values. */
  @Override
  public void reset() throws IOException {
    super.reset();
    offset = bufferIndex = dataLen = 0;
    preIsTokened = false;
    tokenType = WORD_TYPE;
  }

  /**
   * Switches to a new input reader, then resets all tokenizer state.
   *
   * @param reader the new input
   */
  @Override
  public void reset(Reader reader) throws IOException {
    super.reset(reader);
    reset();
  }
}

ChineseTokenizer 例子：

public class JustTest { public static void main(String[] args) { Reader read = new StringReader("我是ha,ve中国人"); ChineseTokenizer token3 = new ChineseTokenizer(read); try { while(token3.incrementToken()){ System.out.println(token3.toString()); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }

输出：

(term=我,startOffset=0,endOffset=1)

(term=是,startOffset=1,endOffset=2)

(term=ha,startOffset=2,endOffset=4)

(term=ve,startOffset=5,endOffset=7)

(term=中,startOffset=7,endOffset=8)

(term=国,startOffset=8,endOffset=9)

(term=人,startOffset=9,endOffset=10)

CJKTokenizer 例子：

public class JustTest { public static void main(String[] args) { Reader read = new StringReader("我是一個ha,ve頂天立地的中國人"); CJKTokenizer token4 = new CJKTokenizer(read); try { while(token4.incrementToken()){ System.out.println(token4.toString()); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }

输出：

(term=我是,startOffset=0,endOffset=2,type=double)

(term=是一,startOffset=1,endOffset=3,type=double)

(term=一個,startOffset=2,endOffset=4,type=double)

(term=ha,startOffset=4,endOffset=6,type=single)

(term=ve,startOffset=7,endOffset=9,type=single)

(term=頂天,startOffset=9,endOffset=11,type=double)

(term=天立,startOffset=10,endOffset=12,type=double)

(term=立地,startOffset=11,endOffset=13,type=double)

(term=地的,startOffset=12,endOffset=14,type=double)

(term=的中,startOffset=13,endOffset=15,type=double)

(term=中國,startOffset=14,endOffset=16,type=double)

(term=國人,startOffset=15,endOffset=17,type=double)

sustbeckham

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
LUCENE3.0 自学吧 6 中文分词

 ChineseTokenizer中文分词器，不过坏处是只能把词一个一个字的分开。 package org.apache.lucene.analysis.cn; import java.io.IOException;import java.io.Reader; import org.apache.lucene.analysis.Tokenizer;import org.apache.lucene.analysis.tokenattributes.Off
复制链接

扫一扫

专栏目录