一种中文文本的快速分词方法（二）

最新推荐文章于 2019-05-23 11:36:57 发布
mayakovsky
最新推荐文章于 2019-05-23 11:36:57 发布
阅读量582
点赞数
分类专栏：中文分词　文章标签：java、map、索引、string
本文链接：https://blog.csdn.net/zhukova/article/details/18940245
版权
中文分词专栏收录该内容
2 篇文章 0 订阅
订阅专栏
package org.zhukovasky.chineseSeg;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Reader;

import org.zhukovasky.HashBinaryClass.HashBinaryContainer;
import org.zhukovasky.HashBinaryClass.Maps;
import org.zhukovasky.fileutil.WordCount;
import org.zhukovasky.fileutil.WordDictUtil;
import org.zhukovasky.invertedindex.MapWords;

/**
 * Word segmentation tool for Chinese text; the input text is expected to be
 * encoded as UTF-8.
 *
 * <p>Each entry point reads the first line of its input file(s), splits that
 * line into tokens with the help of a dictionary, records every token together
 * with the file name and its character offset in a {@code MapWords} index, and
 * finally serializes that index to a file.
 *
 * <p>NOTE(review): the text is read through {@link FileReader}, which decodes
 * with the platform default charset on older JDKs. That only agrees with the
 * UTF-8 expectation above when the platform default is UTF-8 — confirm, and if
 * an explicit charset is introduced keep it consistent with however
 * {@code WordCount.getDict} decodes the dictionary.
 *
 * @author zhukovasky (zhukovasky@163.com)
 * @version 1.0
 * @since 2013.12
 */
public class chineseSeg {
	/** Look-ahead limit (in characters) used by {@link #FileSeg(File, File, File)}. */
	public final static int MAXLENGTH=10;

	/**
	 * Segments the first line of one pre-processed text file and writes the
	 * resulting index to {@code invertedIndex}.
	 *
	 * <p>If the file cannot be read or is empty, the failure is printed and an
	 * index without any tokens is still written.
	 *
	 * @param afterprocess  the pre-processed text file to segment
	 * @param invertedIndex the file that receives the serialized index
	 * @param dict          the dictionary file, loaded via {@code WordCount.getDict}
	 */
	public static void FileSeg(File afterprocess,File invertedIndex,File dict){
		MapWords mapwords=new MapWords();
		Maps map=WordCount.getDict(dict);
		String line=readFirstLine(afterprocess);
		if(line!=null){
			segmentLine(line, afterprocess.getName(), map, mapwords, MAXLENGTH);
		}
		writeIndex(mapwords, invertedIndex);
	}

	/**
	 * Segments the first line of every file in {@code afterprocess} into one
	 * shared index and writes that index to {@code invertedIndex}.
	 *
	 * <p>Token offsets are counted from the start of each file's own line. A
	 * file that cannot be read or is empty is skipped after its failure has
	 * been printed; the remaining files are still processed.
	 *
	 * @param afterprocess  the pre-processed text files to segment
	 * @param invertedIndex the file that receives the serialized index
	 * @param dict          the dictionary file, loaded via {@code WordCount.getDict}
	 */
	public static void FileArraysSeg(File[] afterprocess,File invertedIndex,File dict){
		MapWords mapwords=new MapWords();
		Maps map=WordCount.getDict(dict);
		// Depends on the longest entry in the dictionary.
		// NOTE(review): FileSeg uses MAXLENGTH (10) instead — confirm which value is intended.
		final int maxLength=9;
		for(int k=0;k<afterprocess.length;k++){
			String line=readFirstLine(afterprocess[k]);
			if(line==null){
				continue;
			}
			// The scan position lives inside segmentLine, so each file starts at offset 0.
			segmentLine(line, afterprocess[k].getName(), map, mapwords, maxLength);
		}
		writeIndex(mapwords, invertedIndex);
	}

	/**
	 * Reads the first line of {@code source}.
	 *
	 * <p>NOTE(review): only the first line is consumed, exactly as before;
	 * presumably the pre-processing step emits single-line files — confirm.
	 *
	 * @return the first line, or {@code null} when the file is empty or could
	 *         not be read (the I/O failure is printed, not rethrown)
	 */
	private static String readFirstLine(File source){
		BufferedReader reader=null;
		try {
			reader=new BufferedReader(new FileReader(source));
			return reader.readLine();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}finally{
			// The reader is still null when opening failed; closing it also closes the FileReader.
			if(reader!=null){
				try {
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return null;
	}

	/**
	 * Splits {@code line} into tokens from left to right and records each token
	 * in {@code mapwords} under {@code fileName} with the offset at which the
	 * token starts.
	 *
	 * <p>At every position the rule is:
	 * <ul>
	 * <li>the final character of the line, or a character for which the
	 * dictionary reports no entry / no matching second character, becomes a
	 * one-character token;</li>
	 * <li>otherwise the longest candidate returned for the character pair is
	 * tried against the text, and on success the whole word becomes the
	 * token;</li>
	 * <li>if that candidate does not match or does not fit into the remaining
	 * text, the two-character pair becomes the token.</li>
	 * </ul>
	 *
	 * @param line      the text to segment
	 * @param fileName  the name recorded with every token
	 * @param map       the dictionary
	 * @param mapwords  the index that receives the tokens
	 * @param maxLength look-ahead limit; a word may extend at most this many
	 *                  characters beyond its first character
	 */
	private static void segmentLine(String line,String fileName,Maps map,MapWords mapwords,int maxLength){
		final int lineLength=line.length();
		int pos=0;
		while(pos<lineLength){
			String first=line.substring(pos, pos+1);
			if(pos==lineLength-1){
				// No second character is left to pair with, so index the last one on its own.
				mapwords.addNewNodeElement(first, fileName, pos);
				break;
			}
			String second=line.substring(pos+1, pos+2);
			if(!map.isCwordExist(first)||!map.getHBC(first).isSecondWordExist(second)){
				mapwords.addNewNodeElement(first, fileName, pos);
				pos++;
				continue;
			}
			HashBinaryContainer hbc=map.getHBC(first);
			String[] candidates=hbc.getMatchArray(second);
			// Presumably ordered longest-first, since only element 0 is consulted — TODO confirm.
			String[] byLength=WordDictUtil.getStringLengthArray(candidates);
			int longest=byLength[0].length();
			String segword=first+second;
			if(longest>1){
				int windowEnd=Math.min(lineLength, pos+maxLength+1);
				int tailEnd=pos+1+longest;
				// Only try the long candidate when it fits; otherwise keep the two-character
				// word rather than reading past the text or abandoning the rest of the line.
				if(tailEnd<=windowEnd){
					String tail=line.substring(pos+1, tailEnd);
					if(WordDictUtil.isWordMatched(tail, byLength)){
						segword=first+tail;
					}
				}
			}
			// Record the offset where the token starts, then step over it.
			mapwords.addNewNodeElement(segword, fileName, pos);
			pos=pos+segword.length();
		}
	}

	/**
	 * Serializes {@code mapwords} into {@code invertedIndex}. I/O failures are
	 * printed, not rethrown.
	 */
	private static void writeIndex(MapWords mapwords,File invertedIndex){
		OutputStream output=null;
		ObjectOutputStream oos=null;
		try {
			output=new FileOutputStream(invertedIndex);
			oos=new ObjectOutputStream(output);
			oos.writeObject(mapwords);
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}finally{
			try {
				if(oos!=null){
					oos.close();
				}else if(output!=null){
					// The object stream was never created, so release the raw file stream directly.
					output.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
}
mayakovsky
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
一种中文文本的快速分词方法（二）

package org.zhukovasky.chineseSeg;import java.io.BufferedReader;import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.FileReader;import java.io
复制链接

扫一扫
专栏目录