一个简单的中文分词

最新推荐文章于 2022-02-18 20:48:07 发布
iteye_10741
最新推荐文章于 2022-02-18 20:48:07 发布
阅读量85
点赞数
分类专栏： lucene 文章标签： Java lucene Apache
lucene 专栏收录该内容
1 篇文章 0 订阅
订阅专栏

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;


/**
 * Dictionary-based Chinese word segmenter using maximum matching.
 *
 * <p>The whole input is read and segmented eagerly in the constructor (reverse
 * maximum matching by default); {@link #next()} then simply hands out the
 * pre-computed tokens one by one.
 *
 * <p>Notes on behavior:
 * <ul>
 *   <li>The input is read line by line and the lines are concatenated without
 *       their line terminators, so token offsets refer to positions in that
 *       concatenated text.</li>
 *   <li>Characters that are not part of any dictionary word produce no token.</li>
 *   <li>The dictionary is read from the file {@code sDict.txt} (one word per
 *       line, UTF-8) resolved against the current working directory.</li>
 * </ul>
 */
public class XiaoMaTokenizer extends Tokenizer{

	/**
	 * Caches every word of the dictionary (only the keys are significant).
	 */
	private TreeMap<String, String> allWordsMap ;

	/**
	 * Buffer holding the complete text to segment.
	 */
	StringBuffer textBuffer = null ;

	/**
	 * The complete text to segment, as an immutable string snapshot of
	 * {@link #textBuffer} taken at the end of {@link #init()}.
	 */
	private String text = null ;

	/**
	 * Length (in chars) of the text to segment.
	 */
	private int textLength ;


	/**
	 * All tokens produced by segmentation.
	 */
	private List<Token> allTokenList = null ;

	/**
	 * Iterator over {@link #allTokenList}, consumed by {@link #next()}.
	 */
	private Iterator<Token> allTokenIter = null ;

	/**
	 * Current scan offset into the text.
	 */
	private int curIndex = 0 ;

	/**
	 * Length of the longest word that is ever looked up in the dictionary.
	 */
	private static final int MAX_WORD_LENGTH = 10 ;

	/**
	 * Length of the candidate currently being tried at {@link #curIndex}.
	 */
	private int cutLength = 0  ;


	/**
	 * Creates the tokenizer and immediately segments everything readable from
	 * {@code input} using reverse maximum matching.
	 *
	 * <p>Failures during initialization or segmentation are reported on
	 * standard error and otherwise ignored; in that case the tokenizer yields
	 * whatever tokens were collected (possibly none) instead of failing later
	 * in {@link #next()}.
	 *
	 * @param input source of the text to segment
	 */
	public XiaoMaTokenizer(Reader input){
		this.input = input ;
		try{
			// load the dictionary and read the text
			init() ;
			// forward maximum matching would be: doCutFirst()
			// reverse maximum matching
			doCutLast() ;
		}catch(Exception e){
			e.printStackTrace() ;
		}
		// Always leave a usable iterator behind, even if init() failed before
		// the token list was created; otherwise next() would throw an NPE.
		if(allTokenList == null){
			allTokenList = new ArrayList<Token>() ;
		}
		allTokenIter = allTokenList.iterator() ;
	}

	/**
	 * Initializes the tokenizer: loads the dictionary, reads the complete text
	 * to segment from {@code input} and creates an empty token list.
	 *
	 * <p>The text is read line by line; line terminators are not kept.
	 *
	 * @throws IOException if reading the input fails
	 */
	public void init() throws IOException{
		// load the dictionary
		loadWords() ;

		// read the text to segment
		textBuffer = new StringBuffer() ;
		BufferedReader br = new BufferedReader(input) ;
		String line ;
		while((line = br.readLine()) != null){
			textBuffer.append(line) ;
		}
		// Snapshot once after reading instead of copying the buffer per line.
		text = textBuffer.toString() ;
		textLength = text.length() ;
		allTokenList = new ArrayList<Token>() ;
	}

	/**
	 * Forward maximum matching: scans the text from the beginning and, at each
	 * position, emits the longest dictionary word starting there (at most
	 * {@code MAX_WORD_LENGTH} chars). If no word starts at the position, one
	 * character is skipped without producing a token.
	 *
	 * <p>Tokens are appended to the token list in ascending offset order.
	 */
	public void doCutFirst(){
		// start at the beginning of the text
		curIndex = 0 ;
		while(curIndex < textLength){
			// longest candidate that fits at this position
			setCutFirstLength() ;
			// if nothing matches, skip a single character
			int step = 1 ;
			for( ; cutLength > 0 ; cutLength--){
				int endIndex = curIndex + cutLength ;
				String candidate = text.substring(curIndex , endIndex) ;
				if(allWordsMap.containsKey(candidate)){
					allTokenList.add(new Token(candidate , curIndex , endIndex)) ;
					// advance past the matched word
					step = cutLength ;
					break ;
				}
			}
			// reset so the next position starts from the maximum length again
			cutLength = 0 ;
			curIndex += step ;
		}
	}

	/**
	 * Reverse maximum matching: scans the text from the end and, at each
	 * position, emits the longest dictionary word ending there (at most
	 * {@code MAX_WORD_LENGTH} chars). If no word ends at the position, one
	 * character is skipped without producing a token.
	 *
	 * <p>Words are discovered back to front, but they are appended to the
	 * token list in ascending offset order so that the resulting token stream
	 * follows the order of the text.
	 */
	public void doCutLast(){
		// start at the end of the text
		curIndex = textLength ;
		// tokens in discovery order, i.e. last word of the text first
		List<Token> backToFront = new ArrayList<Token>() ;
		while(curIndex > 0){
			// longest candidate that fits before this position
			setCutLastLength() ;
			// if nothing matches, skip a single character
			int step = 1 ;
			for( ; cutLength > 0 ; cutLength--){
				int startIndex = curIndex - cutLength ;
				String candidate = text.substring(startIndex , curIndex) ;
				if(allWordsMap.containsKey(candidate)){
					backToFront.add(new Token(candidate , startIndex , curIndex)) ;
					// move back past the matched word
					step = cutLength ;
					break ;
				}
			}
			// reset so the next position starts from the maximum length again
			cutLength = 0 ;
			curIndex -= step ;
		}
		// publish the tokens in text order
		for(int i = backToFront.size() - 1 ; i >= 0 ; i--){
			allTokenList.add(backToFront.get(i)) ;
		}
	}

	/**
	 * Sets the initial candidate length for forward matching: the maximum word
	 * length, capped by the number of characters left after the current offset.
	 */
	private void setCutFirstLength(){
		cutLength = Math.min(MAX_WORD_LENGTH , textLength - curIndex) ;
	}

	/**
	 * Sets the initial candidate length for reverse matching: the maximum word
	 * length, capped by the number of characters before the current offset.
	 */
	private void setCutLastLength(){
		cutLength = Math.min(MAX_WORD_LENGTH , curIndex) ;
	}


	/**
	 * Returns the next pre-computed token.
	 *
	 * @return the next token, or {@code null} when all tokens have been returned
	 * @throws IOException never thrown by this implementation; declared by the
	 *         overridden method
	 */
	@Override
	public Token next() throws IOException{
		if(allTokenIter != null && allTokenIter.hasNext()){
			return allTokenIter.next() ;
		}
		return null ;
	}

	/**
	 * Loads the dictionary from {@code sDict.txt} (UTF-8, one word per line)
	 * unless it has already been loaded for this instance.
	 *
	 * <p>The absolute path of the dictionary file is printed to standard
	 * output. I/O failures are reported on standard error and leave the
	 * dictionary with whatever words were read up to that point.
	 */
	public void loadWords(){
		if(allWordsMap != null){
			return ;
		}
		allWordsMap = new TreeMap<String,String>() ;
		InputStream words = null ;
		BufferedReader in = null ;
		try{
			File file = new File("sDict.txt") ;
			System.out.println(file.getAbsolutePath()) ;
			words = new FileInputStream(file) ;
			in = new BufferedReader(new InputStreamReader(words,"UTF-8")) ;
			String word = null ;
			while((word = in.readLine())!=null){
				allWordsMap.put(word, "key")  ;
			}
			// NOTE(review): hard-coded extra entry, looks like a leftover test
			// word; kept to preserve existing segmentation results.
			allWordsMap.put("123", "1") ;
		}catch(IOException e){
			e.printStackTrace() ;
		}finally{
			// Release the dictionary file handle; closing the reader also
			// closes the underlying stream.
			try{
				if(in != null){
					in.close() ;
				}else if(words != null){
					words.close() ;
				}
			}catch(IOException ignored){
				// nothing useful can be done if close fails
			}
		}
	}


	/**
	 * Small demo: segments a sample sentence, prints every token and the
	 * elapsed time in seconds.
	 *
	 * @param args ignored
	 */
	public static void main(String [] args){
		long stime = System.currentTimeMillis() ;
		StringReader sr = new StringReader("测试中文分词，这里填写要分词的句子") ;
		Tokenizer xt = new XiaoMaTokenizer(sr) ;
		Token t = null ;
		try{
			while((t=xt.next())!=null){
				System.out.println(t) ;
			}
		}catch(Exception e){
			e.printStackTrace() ;
		}
		long etime = System.currentTimeMillis() ;
		System.out.println("用时:" + (double)(etime-stime)/1000);
	}


}