java 分词代码_中文分词源代码 - java

最新推荐文章于 2024-09-23 19:22:31 发布

Jerry大王

最新推荐文章于 2024-09-23 19:22:31 发布

阅读量194

点赞数

文章标签： java 分词代码

本文链接：https://blog.csdn.net/weixin_34070493/article/details/114455340

版权

package org.apache.lucene.analysis.cn;

/**

* Licensed to the Apache Software Foundation (ASF) under one or more

* contributor license agreements. See the NOTICE file distributed with

* this work for additional information regarding copyright ownership.

* The ASF licenses this file to You under the Apache License, Version 2.0

* (the "License"); you may not use this file except in compliance with

* the License. You may obtain a copy of the License at

* http://www.apache.org/licenses/LICENSE-2.0

* Unless required by applicable law or agreed to in writing, software

* distributed under the License is distributed on an "AS IS" BASIS,

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

* See the License for the specific language governing permissions and

* limitations under the License.

*/

import java.io.Reader;

import org.apache.lucene.analysis.*;

/**

* Title: ChineseTokenizer

* Description: Extract tokens from the Stream using Character.getType()

* Rule: A Chinese character as a single token

* Company:

* The difference between the ChineseTokenizer and the

* CJKTokenizer (id=23545) is that they have different

* token parsing logic.

* Let me use an example. If having a Chinese text

* "C1C2C3C4" to be indexed, the tokens returned from the

* ChineseTokenizer are C1, C2, C3, C4. And the tokens

* returned from the CJKTokenizer are C1C2, C2C3, C3C4.

* Therefore the index the CJKTokenizer created is much

* larger.

* The problem is that when searching for C1, C1C2, C1C3,

* C4C2, C1C2C3 ... the ChineseTokenizer works, but the

* CJKTokenizer will not work.

* @author Yiyi Sun

* @version 1.0

*/

public final class ChineseTokenizer extends Tokenizer {

public ChineseTokenizer(Reader in) {

input = in;

}

private int offset = 0, bufferIndex=0, dataLen=0;

private final static int MAX_WORD_LEN = 255;

private final static int IO_BUFFER_SIZE = 1024;

private final char[] buffer = new char[MAX_WORD_LEN];

private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

private int length;

private int start;

private final void push(char c) {

if (length == 0) start = offset-1; // start of token

buffer[length++] = Character.toLowerCase(c); // buffer it

}

private final Token flush() {

if (length>0) {

//System.out.println(new String(buffer, 0, length));

return new Token(new String(buffer, 0, length), start, start+length);

}

else

return null;

}

public final Token next() throws java.io.IOException {

length = 0;

start = offset;

while (true) {

final char c;

offset++;

if (bufferIndex >= dataLen) {

dataLen = input.read(ioBuffer);

bufferIndex = 0;

}

if (dataLen == -1) return flush();

else

c = ioBuffer[bufferIndex++];

switch(Character.getType(c)) {

case Character.DECIMAL_DIGIT_NUMBER:

case Character.LOWERCASE_LETTER:

case Character.UPPERCASE_LETTER:

push(c);

if (length == MAX_WORD_LEN) return flush();

break;

case Character.OTHER_LETTER:

if (length>0) {

bufferIndex--;

offset--;

return flush();

}

push(c);

return flush();

default:

if (length>0) return flush();

break;

}

package org.apache.lucene.analysis.cn;

/**

* Licensed to the Apache Software Foundation (ASF) under one or more

* contributor license agreements. See the NOTICE file distributed with

* this work for additional information regarding copyright ownership.

* The ASF licenses this file to You under the Apache License, Version 2.0

* (the "License"); you may not use this file except in compliance with

* the License. You may obtain a copy of the License at

* http://www.apache.org/licenses/LICENSE-2.0

* Unless required by applicable law or agreed to in writing, software

* distributed under the License is distributed on an "AS IS" BASIS,

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

* See the License for the specific language governing permissions and

* limitations under the License.

*/

import java.util.Hashtable;

import org.apache.lucene.analysis.*;

/**

* Title: ChineseFilter

* Description: Filter with a stop word table

* Rule: No digits are allowed.

* An English word/token should be longer than 1 character.

* One Chinese character as one Chinese word.

* TO DO:

* 1. Add Chinese stop words, such as /ue400

* 2. Dictionary based Chinese word extraction

* 3. Intelligent Chinese word extraction

* Company:

* @author Yiyi Sun

* @version 1.0

*/

public final class ChineseFilter extends TokenFilter {

// Only English now, Chinese to be added later.

public static final String[] STOP_WORDS = {

"and", "are", "as", "at", "be", "but", "by",

"for", "if", "in", "into", "is", "it",

"no", "not", "of", "on", "or", "such",

"that", "the", "their", "then", "there", "these",

"they", "this", "to", "was", "will", "with"

};

private Hashtable stopTable;

public ChineseFilter(TokenStream in) {

super(in);

stopTable = new Hashtable(STOP_WORDS.length);

for (int i = 0; i < STOP_WORDS.length; i++)

stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);

}

public final Token next() throws java.io.IOException {

for (Token token = input.next(); token != null; token = input.next()) {

String text = token.termText();

// why not key off token type here assuming ChineseTokenizer comes first?

if (stopTable.get(text) == null) {

switch (Character.getType(text.charAt(0))) {

case Character.LOWERCASE_LETTER:

case Character.UPPERCASE_LETTER:

// English word/token should larger than 1 character.

if (text.length()>1) {

return token;

}

break;

case Character.OTHER_LETTER:

// One Chinese character as one Chinese word.

// Chinese word extraction to be added later here.

return token;

}

return null;

}

package org.apache.lucene.analysis.cn;

/**

* Licensed to the Apache Software Foundation (ASF) under one or more

* contributor license agreements. See the NOTICE file distributed with

* this work for additional information regarding copyright ownership.

* The ASF licenses this file to You under the Apache License, Version 2.0

* (the "License"); you may not use this file except in compliance with

* the License. You may obtain a copy of the License at

* http://www.apache.org/licenses/LICENSE-2.0

* Unless required by applicable law or agreed to in writing, software

* distributed under the License is distributed on an "AS IS" BASIS,

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

* See the License for the specific language governing permissions and

* limitations under the License.

*/

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.TokenStream;

/**

* Title: ChineseAnalyzer

* Description:

* Subclass of org.apache.lucene.analysis.Analyzer

* build from a ChineseTokenizer, filtered with ChineseFilter.

* Company:

* @author Yiyi Sun

* @version 1.0

*/

public class ChineseAnalyzer extends Analyzer {

public ChineseAnalyzer() {

}

/**

* Creates a TokenStream which tokenizes all the text in the provided Reader.

* @return A TokenStream build from a ChineseTokenizer filtered with ChineseFilter.

public final TokenStream tokenStream(String fieldName, Reader reader) {

TokenStream result = new ChineseTokenizer(reader);

result = new ChineseFilter(result);

return result;

}

java 分词 代码_中文分词源代码 - java

java 分词代码_中文分词源代码 - java