一种中文文本的快速分词方法（三）

最新推荐文章于 2020-12-16 17:12:14 发布

mayakovsky

最新推荐文章于 2020-12-16 17:12:14 发布

阅读量1.2k

点赞数

分类专栏：中文分词 | 文章标签：map 类索引技术

本文链接：https://blog.csdn.net/zhukova/article/details/18940281

版权

中文分词专栏收录该内容

2 篇文章 0 订阅

订阅专栏

package org.zhukovasky.fileutil;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
/**
 * Helper methods for the string arrays that make up a Chinese word dictionary.
 * Text is expected to be encoded as UTF-8.
 *
 * @author zhukovasky
 * @version 1.0
 * @since 2013.12
 * @email zhukovasky@163.com
 */
public class WordDictUtil {
	/** Orders strings by their first UTF-16 code unit only; an empty string sorts before everything else. */
	private static final Comparator<String> FIRST_CHAR_ORDER=new Comparator<String>(){
		@Override
		public int compare(String o1, String o2) {
			return compareInts(firstChar(o1),firstChar(o2));
		}
	};
	/** Orders strings by their length only. */
	private static final Comparator<String> LENGTH_ORDER=new Comparator<String>(){
		@Override
		public int compare(String o1, String o2) {
			return compareInts(o1.length(),o2.length());
		}
	};
	/** Three-way comparison of two ints that cannot overflow. */
	private static int compareInts(int a,int b){
		return a<b?-1:(a>b?1:0);
	}
	/** Returns the first code unit of {@code s}, or -1 when {@code s} is empty. */
	private static int firstChar(String s){
		return s.length()==0?-1:s.charAt(0);
	}
	/**
	 * Returns the longest strings of {@code s}, ordered by first character.
	 * <p>
	 * Side effect: {@code s} itself is reordered by ascending length.
	 *
	 * @param s the strings to inspect
	 * @return every element whose length equals the maximum length; an empty
	 *         array when {@code s} is empty
	 */
	public static String[] getStringLengthArray(String[] s){
		if(s.length==0){
			// Nothing to pick from; the original code indexed s[-1] here.
			return new String[0];
		}
		int count=getMAXSIZE(s);
		sortByLength(s);
		// After the length sort the longest entries occupy the tail of s;
		// copy them tail-first, then order them by first character.
		String[] sarr=new String[count];
		for(int k=0;k<count;k++){
			sarr[k]=s[s.length-1-k];
		}
		sort(sarr);
		return sarr;
	}
	/**
	 * Returns the length of every element of {@code s}, position by position.
	 *
	 * @param s the strings to measure
	 * @return an array where element {@code i} is {@code s[i].length()}
	 */
	public static int[] getStringMARKArray(String[] s){
		int[] lengths=new int[s.length];
		for(int i=0;i<lengths.length;i++){
			lengths[i]=s[i].length();
		}
		return lengths;
	}
	/**
	 * Counts how many elements of {@code s} have the maximum length.
	 *
	 * @param s the strings to inspect
	 * @return the number of longest elements; 0 for an empty array
	 */
	public static int getMAXSIZE(String[] s){
		int max=getMAXLENGTH(s);
		int count=0;
		for(int i=0;i<s.length;i++){
			if(s[i].length()==max){
				count++;
			}
		}
		return count;
	}
	/**
	 * Returns the largest string length found in {@code s}.
	 *
	 * @param s the strings to inspect
	 * @return the maximum length, or -1 when {@code s} is empty
	 */
	public static int getMAXLENGTH(String[] s){
		int MAXLENGTH=-1;
		for(int i=0;i<s.length;i++){
			if(s[i].length()>MAXLENGTH){
				MAXLENGTH=s[i].length();
			}
		}
		return MAXLENGTH;
	}
	/**
	 * Collects every element of {@code arr} that starts with the same character
	 * as {@code word}, given the index of one such element.
	 * <p>
	 * {@code arr} is first sorted in place by first character, so {@code i} is
	 * expected to be an index into that sorted order, typically the value
	 * returned by {@link #BinaraySearch(String[], String)}.
	 *
	 * @param i    index of an element sharing the first character of {@code word}
	 * @param arr  the dictionary entries; reordered in place by first character
	 * @param word the word whose first character is matched
	 * @return {@code null} when {@code i} is negative or not a valid index; an
	 *         empty array when {@code arr[i]} does not start with the first
	 *         character of {@code word}; otherwise the contiguous run of
	 *         matching elements in array order
	 */
	public static String[] getString(int i,String[] arr,String word){
		sort(arr);
		if(i<0||i>=arr.length){
			return null;
		}
		int key=firstChar(word);
		if(firstChar(arr[i])!=key){
			return new String[0];
		}
		// Widen [first,last] around i while the neighbours still match,
		// checking the bounds before every array access.
		int first=i;
		while(first>0&&firstChar(arr[first-1])==key){
			first--;
		}
		int last=i;
		while(last<arr.length-1&&firstChar(arr[last+1])==key){
			last++;
		}
		return Arrays.copyOfRange(arr,first,last+1);
	}
	/**
	 * Binary-searches {@code arr} for an element with the same first character
	 * as {@code word}.
	 * <p>
	 * Only the first character is compared, so when several elements share it
	 * the returned index may point at any of them. {@code arr} must already be
	 * ordered by first character (see {@link #sort(String[])}).
	 *
	 * @param arr  the entries to search
	 * @param word the string whose first character is looked up
	 * @return the index of a matching element, or a negative value when there
	 *         is none
	 */
	public static int BinaraySearch(String[] arr,String word){
		return Arrays.binarySearch(arr,word,FIRST_CHAR_ORDER);
	}
	/**
	 * Sorts {@code arr} in place by the numeric value of each element's first
	 * character.
	 *
	 * @param arr the array to sort
	 */
	public static void sort(String[] arr){
		Arrays.sort(arr,FIRST_CHAR_ORDER);
	}
	/**
	 * Sorts {@code arr} in place by ascending string length.
	 *
	 * @param arr the array to sort
	 */
	public static void sortByLength(String[] arr){
		Arrays.sort(arr,LENGTH_ORDER);
	}
	/**
	 * Returns the largest string length found in {@code s}.
	 *
	 * @param s the strings to inspect
	 * @return the maximum length, or 0 when {@code s} is empty
	 */
	public static int MaxStringArray(String[] s){
		int MaxLength=0;
		for(int i=0;i<s.length;i++){
			if(s[i].length()>MaxLength){
				MaxLength=s[i].length();
			}
		}
		return MaxLength;
	}
	/**
	 * Tells whether {@code word} occurs in {@code s}.
	 *
	 * @param word the string to look for
	 * @param s    the array to scan
	 * @return {@code true} when some element equals {@code word}
	 */
	public static boolean isWordMatched(String word,String[] s){
		for(int i=0;i<s.length;i++){
			if(word.equals(s[i])){
				return true;
			}
		}
		return false;
	}
}

package org.zhukovasky.HashBinaryClass;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Set;
import java.util.TreeSet;

import org.zhukovasky.fileutil.WordDictUtil;

/**
 * Holds one leading character of the dictionary together with the sorted set
 * of remainders ("left words") that may follow it.
 */
public class HashBinaryContainer  implements Serializable{
	/**
	 * Serialization version id.
	 */
	private static final long serialVersionUID = -600162346743456357L;
	public String CapitalWord;
	public Set<String> LeftWordArray;
	/**
	 * Creates a container for the given leading character.
	 *
	 * @param CapitalWord the leading character this container belongs to
	 */
	public HashBinaryContainer(String CapitalWord){
		this.CapitalWord=CapitalWord;
		initSet();
	}
	/**
	 * Adds a remainder to this container.
	 * <p>
	 * The set created by this class is a {@link TreeSet}, which keeps its
	 * elements in natural string order; that order already groups entries by
	 * first character, so no additional sorting pass is needed after the insert.
	 *
	 * @param leftword the remainder to add
	 * @return the set of remainders, including the new element
	 */
	public Set<String> addString(String leftword){
		this.LeftWordArray.add(leftword);
		return this.LeftWordArray;
	}
	/** Creates the remainder set and seeds it with a placeholder entry consisting of the single character U+0000. */
	private void initSet(){
		this.LeftWordArray=new TreeSet<String>();
		this.LeftWordArray.add(String.valueOf((char)0));
	}
	/**
	 * Binary-searches the remainders for an entry whose first character equals
	 * the first character of {@code word}.
	 *
	 * @param word the string whose first character is looked up
	 * @return the index reported by {@link WordDictUtil#BinaraySearch}
	 */
	private int BinarySearch(String word){
		return WordDictUtil.BinaraySearch(this.toStringArray(), word);
	}
	/**
	 * Tells whether some remainder starts with the first character of
	 * {@code word}. Only that first character is taken into account.
	 *
	 * @param word the character to look up
	 * @return {@code true} when the search returns a non-negative index
	 */
	public boolean isSecondWordExist(String word){
		return BinarySearch(word)>=0;
	}
	/**
	 * Returns every remainder that starts with the first character of
	 * {@code word}, ordered by ascending length.
	 *
	 * @param word the character to look up
	 * @return the matching remainders; an empty array when there is none
	 *         (the previous implementation failed with a
	 *         {@link NullPointerException} in that case)
	 */
	public String[] getMatchArray(String word){
		int i=this.BinarySearch(word);
		if(i<0){
			return new String[0];
		}
		String[] temp=this.getString(i, word);
		if(temp==null){
			return new String[0];
		}
		WordDictUtil.sortByLength(temp);
		return temp;
	}
	/**
	 * Returns the largest string length found in {@code s}.
	 *
	 * @param s the strings to inspect
	 * @return the value computed by {@link WordDictUtil#MaxStringArray}
	 */
	public int MaxStringArray(String[] s){
		// The delegate's result used to be discarded and 0 returned instead.
		return WordDictUtil.MaxStringArray(s);
	}
	/**
	 * Returns all remainders that share the first character of {@code word}.
	 * For example, with the remainders 国, 国人, 国人民 and 国人大 stored for the
	 * leading character 中, the argument 国 yields those four entries.
	 *
	 * @param i    index of one matching remainder
	 * @param word the character to match
	 * @return the result of {@link WordDictUtil#getString}
	 */
	private  String[] getString(int i,String word){
		return WordDictUtil.getString(i, this.toStringArray(), word);
	}
	/**
	 * Returns the remainder at the given position of the sorted set.
	 *
	 * @param index zero-based position
	 * @return the remainder at {@code index}
	 * @throws ArrayIndexOutOfBoundsException when {@code index} is negative or
	 *         not smaller than the number of stored remainders
	 */
	public String getIndex(int index){
		String[] temp=this.toStringArray();
		if(index<0||index>=temp.length){
			// Fail with the explanatory message instead of printing a stack
			// trace and then running into the raw array access.
			throw new ArrayIndexOutOfBoundsException("不好意思，请检查你的输入值");
		}
		return temp[index];
	}
	/** Copies the remainders into a new array, in the set's iteration order. */
	private  String[] toStringArray(){
		return this.LeftWordArray.toArray(new String[this.LeftWordArray.size()]);
	}
	/**
	 * Returns the remainders formatted by {@link Arrays#toString(Object[])}.
	 */
	public String toString(){
		return Arrays.toString(this.toStringArray());
	}
	public String getCapitalWord() {
		return CapitalWord;
	}
	public void setCapitalWord(String capitalWord) {
		CapitalWord = capitalWord;
	}
}
package org.zhukovasky.HashBinaryClass;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

/**
 * Dictionary storage: maps a leading character to the container holding the
 * remainders that may follow it.
 */
public class Maps implements Serializable{
	private static final long serialVersionUID = -7564844288821688761L;
	/**
	 * Hash-map backing store, keyed by leading character.
	 */
	private Map<String,HashBinaryContainer> maps;
	/**
	 * Creates the dictionary and registers a placeholder container under the
	 * empty-string key.
	 */
	public Maps(){
		this.maps=new HashMap<String,HashBinaryContainer>();
		this.maps.put("",new HashBinaryContainer(""));
	}
	/**
	 * Adds a word, split into its leading character and its remainder.
	 *
	 * @param cword the leading character
	 * @param lword the remaining characters
	 * @return the backing map after the insertion
	 */
	public Map<String,HashBinaryContainer> addElements(String cword,String lword){
		HashBinaryContainer container=this.maps.get(cword);
		if(container==null){
			// First word seen for this leading character.
			container=new HashBinaryContainer(cword);
		}
		container.addString(lword);
		this.maps.put(cword, container);
		return this.maps;
	}
	/**
	 * Looks up the container stored for a leading character.
	 *
	 * @param cword the leading character
	 * @return the container, or {@code null} when the character is unknown
	 */
	public	HashBinaryContainer getHBC(String cword){
		if(!isCwordExist(cword)){
			return null;
		}
		return this.maps.get(cword);
	} 
	/**
	 * Tells whether a leading character has an entry.
	 *
	 * @param cword the leading character
	 * @return {@code true} when the backing map contains the key
	 */
	public boolean isCwordExist(String cword){
		return this.maps.containsKey(cword);
	}
	/**
	 * Returns the number of entries, not counting the placeholder entry the
	 * constructor inserts.
	 *
	 * @return the map size minus one
	 */
	public int getSize(){
		int withPlaceholder=this.maps.size();
		return withPlaceholder-1;
	}
	/**
	 * Returns the string form of the backing map.
	 */
	public String toString(){
		return this.maps.toString();
	}
}
package org.zhukovasky.invertedindex;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Basic storage unit of the inverted index: a document name together with the
 * locations recorded for it.
 * <p>
 * The first element of {@link #locations} is a placeholder inserted by the
 * constructor; every accessor that reports locations or their count skips it.
 */
public class DocInfo implements Serializable{
	private static final long serialVersionUID = 1L;
	public List<Location> locations;
	public String DOCNAMES;
	/**
	 * Creates the entry for a document and seeds the location list with its
	 * placeholder element.
	 *
	 * @param docName the document name
	 */
	public DocInfo(String docName){
		this.DOCNAMES=docName;
		initLocations();
	}
	/**
	 * Records one more location for this document.
	 *
	 * @param location the location value to wrap and append
	 */
	public void addElements(Integer location){
		this.locations.add(new Location(location));
	}
	/**
	 * Returns the document name followed by the recorded locations.
	 */
	public String toString(){
		return this.DOCNAMES+Arrays.toString(this.toLocationArray());
	}
	/**
	 * Copies the recorded locations, without the leading placeholder, into a
	 * new array. Returns an empty array when the list holds nothing beyond the
	 * placeholder, including the case of an empty list, which previously
	 * caused a {@link NegativeArraySizeException}.
	 */
	private Location[] toLocationArray(){
		int size=this.locations.size();
		if(size<=1){
			return new Location[0];
		}
		return this.locations.subList(1, size).toArray(new Location[size-1]);
	}
	/**
	 * Returns the locations recorded for this document.
	 *
	 * @return a new array without the placeholder element
	 */
	public Location[] locationFindByThisDoc(){
		return this.toLocationArray();
	}
	/**
	 * Returns how many locations are recorded, not counting the placeholder.
	 *
	 * @return the count, never negative
	 */
	public int getSize(){
		return Math.max(0, this.locations.size()-1);
	}
	/** Creates the location list with a placeholder {@code Location(0)} at index 0. */
	private List<Location> initLocations(){
		this.locations=new ArrayList<Location>();
		this.locations.add(0, new Location(0));
		return locations;
	}

	public String getDOCNAMES() {
		return DOCNAMES;
	}
	public void setDOCNAMES(String dOCNAMES) {
		DOCNAMES = dOCNAMES;
	}
	/**
	 * Returns the locations recorded for this document.
	 *
	 * @return a new array without the placeholder element
	 */
	public Location[] getLocations() {
		return this.toLocationArray();
	}
	/**
	 * Replaces the location list. The accessors of this class treat the first
	 * element of the list as the placeholder and do not report it.
	 *
	 * @param locations the new list
	 */
	public void setLocations(List<Location> locations) {
		this.locations = locations;
	}
}

package org.zhukovasky.invertedindex;

import java.io.Serializable;


public class Location implements Serializable {
	/**
	 * 
	 */
	private static final long serialVersionUID = 8848845062259025823L;
	public Integer location;
	public Location(Integer location){
		this.location=location;
	}
	@Override
	public String toString() {
		return "Location ["+"location=" + location + "]";
	}
	public Integer getLine() {
		return location;
	}
	public Integer getLocation() {
		return location;
	}
	public void setLocation(Integer location) {
		this.location = location;
	}
	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result
				+ ((location == null) ? 0 : location.hashCode());
		return result;
	}
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (obj == null) {
			return false;
		}
		if (!(obj instanceof Location)) {
			return false;
		}
		Location other = (Location) obj;
		if (location == null) {
			if (other.location != null) {
				return false;
			}
		} else if (!location.equals(other.location)) {
			return false;
		}
		return true;
	}
	
}
package org.zhukovasky.invertedindex;

import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;


/**
 * Inverted index: maps a keyword to the node holding the documents and
 * locations recorded for it. The constructor registers a placeholder node
 * under the empty-string key.
 */
public class MapWords implements Serializable {
	private static final long serialVersionUID = -2717650942562522560L;
	private Map<String,Nodes> mapStore;
	/**
	 * Creates the index with its placeholder entry.
	 */
	public MapWords(){
		mapStore=new HashMap<String,Nodes>();
		initMap();
	}
	/** Inserts the placeholder node under the empty-string key. */
	private Map<String,Nodes> initMap(){
		this.mapStore.put("", new Nodes(""));
		return this.mapStore;
	}
	/**
	 * Records one occurrence of a keyword.
	 *
	 * @param Keyword  the keyword
	 * @param docname  the document the keyword occurs in
	 * @param location the location of the occurrence
	 * @return the backing map after the insertion
	 */
	public Map<String,Nodes> addNewNodeElement(String Keyword,String docname,Integer location){
		Nodes node=this.mapStore.get(Keyword);
		if(node==null){
			node=new Nodes(Keyword);
		}
		// Nodes.addNewDocInfos(docname, location) registers the document
		// itself when it is not known yet.
		node.addNewDocInfos(docname, location);
		this.mapStore.put(Keyword, node);
		return this.mapStore;
	}
	/**
	 * Looks up the node stored for a keyword.
	 *
	 * @param KeyWord the keyword
	 * @return the node, or {@code null} when the keyword is unknown
	 */
	public Nodes getNode(String KeyWord){
		return this.mapStore.get(KeyWord);
	}
	/**
	 * Counts the occurrences of a keyword inside one document.
	 *
	 * @param Keyword the keyword
	 * @param docname the document name
	 * @return the count, or 0 when the keyword is unknown
	 */
	public int getWordCountByDoc(String Keyword,String docname){
		if(!isExistNode(Keyword)){
			return 0;
		}
		return this.mapStore.get(Keyword).getWordCountByDoc(docname);
	}
	/**
	 * Counts the occurrences of a keyword across all documents.
	 *
	 * @param Keyword the keyword
	 * @return the count, or 0 when the keyword is unknown
	 */
	public int getWordCountAll(String Keyword){
		if(!isExistNode(Keyword)){
			return 0;
		}
		return this.mapStore.get(Keyword).getWordCountAll();
	}
	/**
	 * Returns a copy of the keys, including the empty-string placeholder key.
	 *
	 * @return a new set holding every key of the index
	 */
	public Set<String> toGetKeySet(){
		Set<String> sets=new HashSet<String>(); 
		for(Entry<String,Nodes> iter: toSetArrays()){
			sets.add(iter.getKey());
		}
		return sets;
	}
	/**
	 * Returns a copy of the nodes, including the placeholder node.
	 *
	 * @return a new set holding every node of the index
	 */
	public Set<Nodes> toGetValueSet(){
		Set<Nodes> sets=new HashSet<Nodes>();
		for(Entry<String,Nodes> iter:toSetArrays()){
			sets.add(iter.getValue());
		}
		return sets;
	}
	/**
	 * Returns the key at a zero-based position of the key set's iteration
	 * order. The keys come from a {@link HashSet}, so that order is not
	 * specified, and the placeholder key is part of it.
	 *
	 * @param index zero-based position
	 * @return the key at {@code index}
	 * @throws ArrayIndexOutOfBoundsException when {@code index} is negative or
	 *         not smaller than the number of keys
	 */
	public String toGetKeyByIndex(int index){
		Set<String> sets=toGetKeySet();
		String[] setArray=sets.toArray(new String[sets.size()]);
		if(index<0||index>=setArray.length){
			// Report the bad index directly instead of throwing and swallowing
			// an exception and then failing on the raw array access.
			throw new ArrayIndexOutOfBoundsException("~请检查输入是否有误");
		}
		return setArray[index];
	}
	/** Returns the entry view of the backing map. */
	private Set<Entry<String,Nodes>> toSetArrays(){
		return this.mapStore.entrySet();
	}
	/**
	 * Tells whether a keyword has an entry.
	 *
	 * @param Keyword the keyword
	 * @return {@code true} when the backing map contains the key
	 */
	public boolean isExistNode(String Keyword){
		return this.mapStore.containsKey(Keyword);
	}
	/**
	 * Returns the number of entries, not counting the placeholder entry.
	 *
	 * @return the map size minus one
	 */
	public int getMapSize(){
		return this.mapStore.size()-1;
	}
	/**
	 * Returns the string form of the backing map.
	 */
	public String toString(){
		return this.mapStore.toString();
	}
	/**
	 * Small demo: indexes a few occurrences and prints the index and one count.
	 */
	public static void main(String[] args){
		MapWords map=new MapWords();
		map.addNewNodeElement("你好", "文章1", 1);
		map.addNewNodeElement("你好", "文章1", 2);
		map.addNewNodeElement("你好", "文章1", 3);
		map.addNewNodeElement("你好", "文章2", 3);
		map.addNewNodeElement("你好", "文章3", 3);
		map.addNewNodeElement("你", "文章2", 3);
		System.out.println(map.toString());
		System.out.println(map.getWordCountAll("你"));
		
	}
}
package org.zhukovasky.invertedindex;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;


/**
 * One keyword of the inverted index together with the documents it occurs in.
 * <p>
 * The first element of {@link #listdocs} is a placeholder document with an
 * empty name, inserted by the constructor; array views built by this class
 * skip it.
 */
public class Nodes implements Serializable {
	private static final long serialVersionUID = 1L;
	public String KeyWord;
	public List<DocInfo> listdocs;
	/**
	 * Creates the node for a keyword and seeds the document list with its
	 * placeholder.
	 *
	 * @param KeyWord the keyword
	 */
	public Nodes(String KeyWord){
		this.KeyWord=KeyWord;
		initList();
	}
	/** Creates the document list with a placeholder {@code DocInfo("")} at index 0. */
	private List<DocInfo> initList(){
		listdocs=new ArrayList<DocInfo>();
		listdocs.add(0, new DocInfo(""));
		return this.listdocs;
	}
	/**
	 * Registers a document unless one with the same name is already present.
	 *
	 * @param docname the document name
	 */
	public void addNewDocInfos(String docname){
		if(!isExist(docname)){
			this.listdocs.add(new DocInfo(docname));
		}
	}
	/**
	 * Records a location for a document, registering the document first when
	 * it is not known yet.
	 *
	 * @param docname  the document name
	 * @param location the location to record
	 */
	public void addNewDocInfos(String docname,Integer location){
		this.addNewDocInfos(docname);
		for(DocInfo iter:this.listdocs){
			if(iter.getDOCNAMES().equals(docname)){
				iter.addElements(location);
			}
		}
	}
	/**
	 * Returns the names of all documents in list order, including the
	 * placeholder's empty name.
	 *
	 * @return a new list of document names
	 */
	public List<String> getDocNameAll(){
		List<String> list=new ArrayList<String>();
		for(DocInfo iter:listdocs){
			list.add(iter.getDOCNAMES());
		}
		return list;
	}
	/**
	 * Returns the document at a one-based position, not counting the
	 * placeholder.
	 *
	 * @param i one-based position
	 * @return the document at that position
	 * @throws ArrayIndexOutOfBoundsException when {@code i} is smaller than 1
	 *         or larger than the number of registered documents
	 */
	public DocInfo getIndex(int i){
		DocInfo[] dd=this.toDocInfoArray();
		if(i<1||i>dd.length){
			// Fail with the explanatory message instead of printing a stack
			// trace and then running into the raw array access.
			throw new ArrayIndexOutOfBoundsException("请假查你的输入");
		}
		return dd[i-1];
	}
	/**
	 * Finds the first document with the given name.
	 *
	 * @param docname the document name
	 * @return the document, or {@code null} when there is none
	 */
	public DocInfo getDocInfoName(String docname){
		for(DocInfo iter:listdocs){
			if(iter.getDOCNAMES().equals(docname)){
				return iter;
			}
		}
		return null;
	}
	/**
	 * Counts the locations recorded for one document.
	 *
	 * @param docname the document name
	 * @return the count, or 0 when the document is unknown
	 */
	public int getWordCountByDoc(String docname){
		DocInfo dd=this.getDocInfoName(docname);
		return dd==null?0:dd.getSize();
	}
	/**
	 * Counts the locations recorded across all documents of this keyword.
	 *
	 * @return the sum of the per-document counts
	 */
	public int getWordCountAll(){
		int count=0;
		for(DocInfo iter:this.listdocs){
			count=count+iter.getSize();
		}
		return count;
	}
	/**
	 * Returns the zero-based position of a document, not counting the
	 * placeholder.
	 *
	 * @param docname the document name
	 * @return the position, or -1 when the document is unknown
	 */
	public Integer getDocInfo(String docname){
		DocInfo[] docinfo=this.toDocInfoArray();
		for(int i=0;i<docinfo.length;i++){
			if(docinfo[i].getDOCNAMES().equals(docname)){
				return i;
			}
		}
		return -1;
	}
	/**
	 * Copies the documents, without the leading placeholder, into a new array.
	 * Returns an empty array when the list holds nothing beyond the
	 * placeholder, including the case of an empty list, which previously
	 * caused a {@link NegativeArraySizeException}.
	 */
	private DocInfo[] toDocInfoArray(){
		int size=this.listdocs.size();
		if(size<=1){
			return new DocInfo[0];
		}
		return this.listdocs.subList(1, size).toArray(new DocInfo[size-1]);
	}
	public String getKeyword(){
		return this.KeyWord;
	}
	/**
	 * Returns the keyword followed by its documents, without the placeholder.
	 */
	public String toString(){
		return this.KeyWord+Arrays.toString(this.toDocInfoArray());
	}
	/**
	 * Tells whether a document with the given name is present in the list.
	 *
	 * @param docname the document name
	 * @return {@code true} when some list element carries that name
	 */
	public boolean isExist(String docname){
		return this.getDocInfoName(docname)!=null;
	}
	/**
	 * Small demo: registers a few occurrences and prints a count and the node.
	 */
	public static void main(String[] args){
		Nodes no=new Nodes("你好");
		no.addNewDocInfos("文章1");
		no.addNewDocInfos("文章1",1);
		no.addNewDocInfos("文章1",1);
		no.addNewDocInfos("文章2",1);
		System.out.println(no.getWordCountByDoc("文章2"));
		System.out.println(no.toString());
	}
}

mayakovsky

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
一种中文文本的快速分词方法（三）

ackage org.zhukovasky.fileutil;import java.util.ArrayList;import java.util.Arrays;import java.util.Comparator;import java.util.List;/** * 以下是中文文本用来处理文本数组的各种方法 * 文本的编码为UTF-8 * @author zhukovas
复制链接

扫一扫