ackage org.zhukovasky.fileutil;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
/**
 * Helper methods for processing arrays of Chinese text (dictionary words).
 * The text is expected to be encoded as UTF-8.
 *
 * <p>Every method that orders or searches by "first character" reads
 * {@code charAt(0)} of each string, so arrays handed to those methods must not
 * contain empty strings.
 *
 * @author zhukovasky
 * @version 1.0
 * @since 2013.12
 * @email zhukovasky@163.com
 */
public class WordDictUtil {

    /** Orders strings by the UTF-16 code unit of their first character only. */
    private static final Comparator<String> BY_FIRST_CHAR = new Comparator<String>() {
        @Override
        public int compare(String o1, String o2) {
            char c1 = o1.charAt(0);
            char c2 = o2.charAt(0);
            return c1 < c2 ? -1 : (c1 > c2 ? 1 : 0);
        }
    };

    /** Orders strings by length, shortest first. */
    private static final Comparator<String> BY_LENGTH = new Comparator<String>() {
        @Override
        public int compare(String o1, String o2) {
            int l1 = o1.length();
            int l2 = o2.length();
            return l1 < l2 ? -1 : (l1 > l2 ? 1 : 0);
        }
    };

    /**
     * Returns the strings of {@code s} that have the maximum length, ordered by
     * their first character.
     *
     * <p>Side effect: {@code s} itself is sorted by length, in place.
     *
     * @param s the strings to inspect; an empty array yields an empty result
     * @return a new array holding only the longest strings
     */
    public static String[] getStringLengthArray(String[] s) {
        int maxlength = getMAXLENGTH(s);
        sortByLength(s);
        // After the length sort the longest strings sit at the tail; count them.
        // The i >= 0 guard also covers the empty-array case.
        int count = 0;
        for (int i = s.length - 1; i >= 0 && s[i].length() == maxlength; i--) {
            count++;
        }
        // Copy the tail back to front, as the original implementation did.
        String[] longest = new String[count];
        for (int k = 0; k < count; k++) {
            longest[k] = s[s.length - 1 - k];
        }
        sort(longest);
        return longest;
    }

    /**
     * Returns the length of every element of the array.
     *
     * @param s the strings to measure
     * @return an array where element {@code i} is {@code s[i].length()}
     */
    public static int[] getStringMARKArray(String[] s) {
        int[] lengths = new int[s.length];
        for (int i = 0; i < lengths.length; i++) {
            lengths[i] = s[i].length();
        }
        return lengths;
    }

    /**
     * Counts how many strings have the maximum length.
     *
     * @param s the strings to inspect
     * @return the number of elements whose length equals {@link #getMAXLENGTH};
     *         0 for an empty array
     */
    public static int getMAXSIZE(String[] s) {
        int max = getMAXLENGTH(s);
        int count = 0;
        for (int i = 0; i < s.length; i++) {
            if (s[i].length() == max) {
                count++;
            }
        }
        return count;
    }

    /**
     * Returns the greatest string length found in the array.
     *
     * @param s the strings to inspect
     * @return the maximum length, or -1 when the array is empty
     */
    public static int getMAXLENGTH(String[] s) {
        int maxLength = -1;
        for (int i = 0; i < s.length; i++) {
            if (s[i].length() > maxLength) {
                maxLength = s[i].length();
            }
        }
        return maxLength;
    }

    /**
     * Collects every element that shares its first character with {@code word},
     * starting from an index previously obtained from {@link #BinaraySearch}.
     *
     * <p>Side effect: {@code arr} is sorted by first character, in place, before
     * it is inspected.
     *
     * @param i    index of one matching element in the sorted array; a negative
     *             value (a failed search) or an index past the end yields
     *             {@code null}
     * @param arr  the candidate strings
     * @param word the string whose first character is matched
     * @return the contiguous run of elements around {@code i} whose first
     *         character equals that of {@code word}, in array order; an empty
     *         array when {@code arr[i]} itself does not match; {@code null} when
     *         {@code i} is out of range
     */
    public static String[] getString(int i, String[] arr, String word) {
        sort(arr);
        if (i < 0 || i >= arr.length) {
            return null;
        }
        char key = word.charAt(0);
        if (arr[i].charAt(0) != key) {
            return new String[0];
        }
        // Grow the window in both directions while the first character matches,
        // never stepping outside the array and never including a non-match.
        int lo = i;
        while (lo > 0 && arr[lo - 1].charAt(0) == key) {
            lo--;
        }
        int hi = i;
        while (hi < arr.length - 1 && arr[hi + 1].charAt(0) == key) {
            hi++;
        }
        return Arrays.copyOfRange(arr, lo, hi + 1);
    }

    /**
     * Binary search over a string array that compares first characters only.
     *
     * <p>The array must already be ordered by first character (see
     * {@link #sort}). When several elements share the first character of
     * {@code word}, the returned index may point at any one of them.
     *
     * @param arr  the string array to search
     * @param word the string to match
     * @return the index of an element with the same first character, or a
     *         negative value when there is none
     */
    public static int BinaraySearch(String[] arr, String word) {
        return Arrays.binarySearch(arr, word, BY_FIRST_CHAR);
    }

    /**
     * Sorts the array in place by the character code of each element's first
     * character. Elements with the same first character keep their relative
     * order.
     *
     * @param arr the array to sort
     */
    public static void sort(String[] arr) {
        Arrays.sort(arr, BY_FIRST_CHAR);
    }

    /**
     * Sorts the array in place by string length, shortest first. Elements of
     * equal length keep their relative order.
     *
     * @param arr the array to sort
     */
    public static void sortByLength(String[] arr) {
        Arrays.sort(arr, BY_LENGTH);
    }

    /**
     * Returns the greatest string length found in the array.
     *
     * @param s the strings to inspect
     * @return the maximum length, or 0 when the array is empty (unlike
     *         {@link #getMAXLENGTH}, which returns -1)
     */
    public static int MaxStringArray(String[] s) {
        int maxLength = 0;
        for (int i = 0; i < s.length; i++) {
            if (s[i].length() > maxLength) {
                maxLength = s[i].length();
            }
        }
        return maxLength;
    }

    /**
     * Tells whether {@code word} occurs in the array.
     *
     * @param word the string to look for
     * @param s    the array to scan
     * @return {@code true} when some element equals {@code word}
     */
    public static boolean isWordMatched(String word, String[] s) {
        for (int i = 0; i < s.length; i++) {
            if (word.equals(s[i])) {
                return true;
            }
        }
        return false;
    }
}
package org.zhukovasky.HashBinaryClass;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Set;
import java.util.TreeSet;
import org.zhukovasky.fileutil.WordDictUtil;
public class HashBinaryContainer implements Serializable {
    /** Serialization version identifier. */
    private static final long serialVersionUID = -600162346743456357L;

    /** The key string this container was created for. */
    public String CapitalWord;

    /** The strings stored under {@link #CapitalWord}, kept in a sorted set. */
    public Set<String> LeftWordArray;

    /**
     * Creates a container for the given key. The set starts out holding one
     * placeholder entry (see {@link #initSet()}).
     *
     * @param CapitalWord the key string
     */
    public HashBinaryContainer(String CapitalWord) {
        this.CapitalWord = CapitalWord;
        initSet();
    }

    /**
     * Adds an entry to the set.
     *
     * <p>The set created by {@link #initSet()} is a {@link TreeSet}, which keeps
     * its elements in natural string order on every insert; that order is also
     * ascending by first character, so no re-sort is needed afterwards.
     *
     * @param leftword the string to store; must not be empty, because searches
     *                 compare the first character of every stored entry
     * @return the set, after the insertion
     * @throws StringIndexOutOfBoundsException if {@code leftword} is empty (the
     *         same exception type the previous implementation raised, but now
     *         thrown before the set is modified)
     */
    public Set<String> addString(String leftword) {
        if (leftword.isEmpty()) {
            throw new StringIndexOutOfBoundsException("leftword must not be empty");
        }
        this.LeftWordArray.add(leftword);
        return this.LeftWordArray;
    }

    /**
     * Creates the backing set and seeds it with a placeholder entry: the
     * one-character string whose only character has code 0.
     */
    private void initSet() {
        this.LeftWordArray = new TreeSet<String>();
        this.LeftWordArray.add(String.valueOf((char) 0));
    }

    /**
     * Binary-searches the stored entries for one whose first character equals
     * the first character of {@code word}.
     *
     * @param word the string to match; only its first character is compared
     * @return the index of a matching entry, or a negative value if none
     */
    private int BinarySearch(String word) {
        return WordDictUtil.BinaraySearch(this.toStringArray(), word);
    }

    /**
     * Tells whether some stored entry starts with the first character of
     * {@code word}.
     *
     * @param word a single character, as a string
     * @return {@code true} when a matching entry exists
     */
    public boolean isSecondWordExist(String word) {
        return BinarySearch(word) >= 0;
    }

    /**
     * Returns every stored entry that starts with the first character of
     * {@code word}, sorted by length (shortest first).
     *
     * @param word the string whose first character is matched
     * @return the matching entries; an empty array when there is no match
     *         (previously this case failed with a NullPointerException)
     */
    public String[] getMatchArray(String word) {
        // Search once and reuse the index.
        int i = this.BinarySearch(word);
        if (i < 0) {
            return new String[0];
        }
        String[] matches = this.getString(i, word);
        if (matches != null) {
            WordDictUtil.sortByLength(matches);
        }
        return matches;
    }

    /**
     * Returns the greatest string length found in {@code s}.
     *
     * @param s the strings to inspect
     * @return the maximum length, as computed by
     *         {@code WordDictUtil.MaxStringArray} (previously the computed value
     *         was discarded and 0 was always returned)
     */
    public int MaxStringArray(String[] s) {
        return WordDictUtil.MaxStringArray(s);
    }

    /**
     * Returns all stored entries that share their first character with
     * {@code word}. Example from the original documentation: for the word
     * "中" with entries 国, 国人, 国人民, 国人大, the argument 国 yields
     * {国, 国人, 国人民, 国人大}.
     *
     * @param i    index of one matching entry, as returned by the binary search
     * @param word the string whose first character is matched
     * @return the matching entries
     */
    private String[] getString(int i, String word) {
        return WordDictUtil.getString(i, this.toStringArray(), word);
    }

    /**
     * Returns the stored entry at the given position of the sorted set.
     *
     * @param index zero-based position; with the set built by this class,
     *              position 0 is normally the placeholder entry
     * @return the entry at that position
     * @throws ArrayIndexOutOfBoundsException if {@code index} is negative or not
     *         smaller than the number of entries
     */
    public String getIndex(int index) {
        String[] entries = this.toStringArray();
        if (index < 0 || index >= entries.length) {
            throw new ArrayIndexOutOfBoundsException("不好意思,请检查你的输入值");
        }
        return entries[index];
    }

    /** Copies the set into a new array, in the set's iteration order. */
    private String[] toStringArray() {
        return this.LeftWordArray.toArray(new String[this.LeftWordArray.size()]);
    }

    /**
     * Returns the stored entries formatted by {@link Arrays#toString(Object[])}.
     */
    @Override
    public String toString() {
        return Arrays.toString(this.toStringArray());
    }

    /** Returns the key string of this container. */
    public String getCapitalWord() {
        return CapitalWord;
    }

    /** Replaces the key string of this container. */
    public void setCapitalWord(String capitalWord) {
        CapitalWord = capitalWord;
    }
}
package org.zhukovasky.HashBinaryClass;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
public class Maps implements Serializable {
    /**
     * A data structure for storing a dictionary.
     */
    private static final long serialVersionUID = -7564844288821688761L;

    /**
     * Hash-map storage: key string to its container.
     */
    private Map<String,HashBinaryContainer> maps;

    /**
     * Builds the dictionary and registers one placeholder container under the
     * empty-string key.
     */
    public Maps() {
        this.maps = new HashMap<String,HashBinaryContainer>();
        this.maps.put("", this.init());
    }

    /** Creates the placeholder container keyed by the empty string. */
    private HashBinaryContainer init() {
        return new HashBinaryContainer("");
    }

    /**
     * Adds an element to the dictionary.
     *
     * @param cword the first character (map key)
     * @param lword the remaining characters, stored in the key's container
     * @return the backing map
     */
    public Map<String,HashBinaryContainer> addElements(String cword, String lword) {
        // Reuse the container already registered for the key, or start a new one.
        HashBinaryContainer container = isCwordExist(cword)
                ? this.maps.get(cword)
                : new HashBinaryContainer(cword);
        container.addString(lword);
        this.maps.put(cword, container);
        return this.maps;
    }

    /**
     * Looks up the container stored for a first character.
     *
     * @param cword the first character (map key)
     * @return the container, or {@code null} when the key is not present
     */
    public HashBinaryContainer getHBC(String cword) {
        if (!isCwordExist(cword)) {
            return null;
        }
        return this.maps.get(cword);
    }

    /**
     * Tells whether a first character is present as a key.
     *
     * @param cword the first character (map key)
     * @return {@code true} when the map contains the key
     */
    public boolean isCwordExist(String cword) {
        return this.maps.containsKey(cword);
    }

    /**
     * Returns the number of entries, computed as the map size minus one.
     */
    public int getSize() {
        int rawSize = this.maps.size();
        return rawSize - 1;
    }

    /** Returns the string form of the backing map. */
    @Override
    public String toString() {
        return this.maps.toString();
    }
}
package org.zhukovasky.invertedindex;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Basic storage structure of the inverted index: a document name together
 * with a list of locations.
 */
public class DocInfo implements Serializable {
    private static final long serialVersionUID = 1L;

    /** Location list; the constructor puts a placeholder at index 0. */
    public List<Location> locations;

    /** Name of the document. */
    public String DOCNAMES;

    /**
     * Creates the record for a document and initialises its location list.
     *
     * @param docName name of the document
     */
    public DocInfo(String docName) {
        this.DOCNAMES = docName;
        initLocations();
    }

    /**
     * Appends a location to the list.
     *
     * @param location the location value to record
     */
    public void addElements(Integer location) {
        this.locations.add(new Location(location));
    }

    /** Returns the document name followed by the locations after index 0. */
    @Override
    public String toString() {
        Location[] visible = this.toLocationArray();
        return this.DOCNAMES + Arrays.toString(visible);
    }

    /** Copies the list into an array, leaving out the element at index 0. */
    private Location[] toLocationArray() {
        Location[] all = this.locations.toArray(new Location[this.locations.size()]);
        Location[] tail = new Location[all.length - 1];
        System.arraycopy(all, 1, tail, 0, tail.length);
        return tail;
    }

    /** Returns the locations after index 0 as a new array. */
    public Location[] locationFindByThisDoc() {
        return this.toLocationArray();
    }

    /** Returns the list size minus one. */
    public int getSize() {
        int rawSize = this.locations.size();
        return rawSize - 1;
    }

    /**
     * Creates the list and seeds it with a placeholder {@code Location(0)}.
     *
     * @return the freshly created list
     */
    private List<Location> initLocations() {
        Location placeholder = new Location(0);
        this.locations = new ArrayList<Location>();
        this.locations.add(placeholder);
        return this.locations;
    }

    /** Returns the document name. */
    public String getDOCNAMES() {
        return this.DOCNAMES;
    }

    /** Replaces the document name. */
    public void setDOCNAMES(String dOCNAMES) {
        this.DOCNAMES = dOCNAMES;
    }

    /** Returns the locations after index 0 as a new array. */
    public Location[] getLocations() {
        return this.toLocationArray();
    }

    /**
     * Replaces the whole location list. The element at index 0 of the new list
     * is skipped by the array and size accessors of this class.
     */
    public void setLocations(List<Location> locations) {
        this.locations = locations;
    }
}
package org.zhukovasky.invertedindex;
import java.io.Serializable;
public class Location implements Serializable {
/**
*
*/
private static final long serialVersionUID = 8848845062259025823L;
public Integer location;
public Location(Integer location){
this.location=location;
}
@Override
public String toString() {
return "Location ["+"location=" + location + "]";
}
public Integer getLine() {
return location;
}
public Integer getLocation() {
return location;
}
public void setLocation(Integer location) {
this.location = location;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result
+ ((location == null) ? 0 : location.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (!(obj instanceof Location)) {
return false;
}
Location other = (Location) obj;
if (location == null) {
if (other.location != null) {
return false;
}
} else if (!location.equals(other.location)) {
return false;
}
return true;
}
}
package org.zhukovasky.invertedindex;
import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
public class MapWords implements Serializable {
    private static final long serialVersionUID = -2717650942562522560L;

    /** Keyword to node storage; also holds one placeholder under the key "". */
    private Map<String,Nodes> mapStore;

    /** Creates the store and registers the placeholder node. */
    public MapWords() {
        mapStore = new HashMap<String,Nodes>();
        initMap();
    }

    /**
     * Registers a placeholder node under the empty-string key.
     *
     * @return the backing map
     */
    private Map<String,Nodes> initMap() {
        this.mapStore.put("", new Nodes(""));
        return this.mapStore;
    }

    /**
     * Inserts one occurrence of a keyword.
     *
     * @param Keyword  the keyword
     * @param docname  name of the document the keyword occurs in
     * @param location location of the occurrence
     * @return the backing map
     */
    public Map<String,Nodes> addNewNodeElement(String Keyword, String docname, Integer location) {
        if (this.mapStore.containsKey(Keyword)) {
            this.mapStore.get(Keyword).addNewDocInfos(docname, location);
        } else {
            Nodes newNode = new Nodes(Keyword);
            // The two-argument overload creates the document entry itself when
            // it is missing, so no separate addNewDocInfos(docname) call is needed.
            newNode.addNewDocInfos(docname, location);
            this.mapStore.put(Keyword, newNode);
        }
        return this.mapStore;
    }

    /**
     * Looks up the node stored for a keyword.
     *
     * @param KeyWord the keyword
     * @return the node, or {@code null} when the keyword is not present
     */
    public Nodes getNode(String KeyWord) {
        return this.mapStore.get(KeyWord);
    }

    /**
     * Counts the occurrences of a keyword within one document.
     *
     * @param Keyword the keyword
     * @param docname the document name
     * @return the count reported by the keyword's node, or 0 when the keyword
     *         is not present
     */
    public int getWordCountByDoc(String Keyword, String docname) {
        Nodes node = this.mapStore.get(Keyword);
        return node == null ? 0 : node.getWordCountByDoc(docname);
    }

    /**
     * Counts all occurrences of a keyword.
     *
     * @param Keyword the keyword
     * @return the count reported by the keyword's node, or 0 when the keyword
     *         is not present
     */
    public int getWordCountAll(String Keyword) {
        Nodes node = this.mapStore.get(Keyword);
        return node == null ? 0 : node.getWordCountAll();
    }

    /**
     * Returns a copy of the stored keys. The copy includes the placeholder
     * key "".
     */
    public Set<String> toGetKeySet() {
        return new HashSet<String>(this.mapStore.keySet());
    }

    /**
     * Returns a copy of the stored nodes. The copy includes the placeholder
     * node.
     */
    public Set<Nodes> toGetValueSet() {
        return new HashSet<Nodes>(this.mapStore.values());
    }

    /**
     * Returns the key at a position of the key set's iteration order. That
     * order comes from a {@link HashSet} and is therefore unspecified.
     *
     * <p>Earlier versions printed a warning for index 0 and still returned a
     * key; out-of-range indices printed a warning (or not, for
     * {@code index == size}) and then failed on the array access. Now every
     * index in {@code [0, size)} returns silently and every other index fails
     * immediately with the same exception type.
     *
     * @param index zero-based position
     * @return the key at that position
     * @throws ArrayIndexOutOfBoundsException if {@code index} is negative or not
     *         smaller than the number of keys
     */
    public String toGetKeyByIndex(int index) {
        Set<String> keys = toGetKeySet();
        String[] keyArray = keys.toArray(new String[keys.size()]);
        if (index < 0 || index >= keyArray.length) {
            throw new ArrayIndexOutOfBoundsException("~请检查输入是否有误");
        }
        return keyArray[index];
    }

    /**
     * Tells whether a keyword is present.
     *
     * @param Keyword the keyword
     * @return {@code true} when the map contains the key
     */
    public boolean isExistNode(String Keyword) {
        return this.mapStore.containsKey(Keyword);
    }

    /** Returns the map size minus one (the placeholder entry is not counted). */
    public int getMapSize() {
        return this.mapStore.size() - 1;
    }

    /** Returns the string form of the backing map. */
    @Override
    public String toString() {
        return this.mapStore.toString();
    }

    /** Small manual demo: inserts a few occurrences and prints the result. */
    public static void main(String[] args) {
        MapWords map = new MapWords();
        map.addNewNodeElement("你好", "文章1", 1);
        map.addNewNodeElement("你好", "文章1", 2);
        map.addNewNodeElement("你好", "文章1", 3);
        map.addNewNodeElement("你好", "文章2", 3);
        map.addNewNodeElement("你好", "文章3", 3);
        map.addNewNodeElement("你", "文章2", 3);
        System.out.println(map.toString());
        System.out.println(map.getWordCountAll("你"));
    }
}
package org.zhukovasky.invertedindex;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class Nodes implements Serializable {
    private static final long serialVersionUID = 1L;

    /** The keyword this node belongs to. */
    public String KeyWord;

    /** Per-document records; the constructor puts a placeholder at index 0. */
    public List<DocInfo> listdocs;

    /**
     * Creates the node for a keyword and initialises its document list.
     *
     * @param KeyWord the keyword
     */
    public Nodes(String KeyWord) {
        this.KeyWord = KeyWord;
        initList();
    }

    /**
     * Creates the list and seeds it with a placeholder document named "".
     *
     * @return the freshly created list
     */
    private List<DocInfo> initList() {
        listdocs = new ArrayList<DocInfo>();
        listdocs.add(new DocInfo(""));
        return this.listdocs;
    }

    /**
     * Registers a document if no record with that name exists yet.
     *
     * @param docname the document name
     */
    public void addNewDocInfos(String docname) {
        if (!isExist(docname)) {
            this.listdocs.add(new DocInfo(docname));
        }
    }

    /**
     * Records a location for a document, registering the document first when it
     * is not known yet.
     *
     * @param docname  the document name
     * @param location the location to record
     */
    public void addNewDocInfos(String docname, Integer location) {
        if (!isExist(docname)) {
            this.addNewDocInfos(docname);
        }
        for (DocInfo doc : this.listdocs) {
            if (doc.getDOCNAMES().equals(docname)) {
                doc.addElements(location);
            }
        }
    }

    /**
     * Returns the names of all records in the list, in list order. The
     * placeholder's name "" is part of the result.
     */
    public List<String> getDocNameAll() {
        List<String> names = new ArrayList<String>();
        for (DocInfo doc : listdocs) {
            names.add(doc.getDOCNAMES());
        }
        return names;
    }

    /**
     * Returns the i-th document record, counting from 1 and not counting the
     * placeholder.
     *
     * @param i one-based position
     * @return the record at that position
     * @throws ArrayIndexOutOfBoundsException if {@code i} is smaller than 1 or
     *         greater than the number of documents (previously a stack trace
     *         was printed first and the array access then failed with this
     *         same exception type)
     */
    public DocInfo getIndex(int i) {
        DocInfo[] docs = this.toDocInfoArray();
        if (i < 1 || i > docs.length) {
            throw new ArrayIndexOutOfBoundsException("请假查你的输入");
        }
        return docs[i - 1];
    }

    /**
     * Finds the first record with the given document name.
     *
     * @param docname the document name
     * @return the record, or {@code null} when there is none
     */
    public DocInfo getDocInfoName(String docname) {
        for (DocInfo doc : listdocs) {
            if (doc.getDOCNAMES().equals(docname)) {
                return doc;
            }
        }
        return null;
    }

    /**
     * Returns the size reported by the record of one document.
     *
     * @param docname the document name
     * @return that record's {@code getSize()}, or 0 when the document is unknown
     */
    public int getWordCountByDoc(String docname) {
        DocInfo doc = this.getDocInfoName(docname);
        return doc == null ? 0 : doc.getSize();
    }

    /** Returns the sum of {@code getSize()} over every record in the list. */
    public int getWordCountAll() {
        int count = 0;
        for (DocInfo doc : this.listdocs) {
            count += doc.getSize();
        }
        return count;
    }

    /**
     * Returns the zero-based position of a document among the records after
     * the placeholder.
     *
     * @param docname the document name
     * @return the position, or -1 when the document is unknown
     */
    public Integer getDocInfo(String docname) {
        DocInfo[] docs = this.toDocInfoArray();
        for (int i = 0; i < docs.length; i++) {
            if (docs[i].getDOCNAMES().equals(docname)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Copies the list into an array, leaving out the element at index 0.
     * A list with fewer than two elements yields an empty array.
     */
    private DocInfo[] toDocInfoArray() {
        int total = this.listdocs.size();
        if (total <= 1) {
            return new DocInfo[0];
        }
        DocInfo[] all = this.listdocs.toArray(new DocInfo[total]);
        return Arrays.copyOfRange(all, 1, total);
    }

    /** Returns the keyword of this node. */
    public String getKeyword() {
        return this.KeyWord;
    }

    /** Returns the keyword followed by the records after the placeholder. */
    @Override
    public String toString() {
        return this.KeyWord + Arrays.toString(this.toDocInfoArray());
    }

    /**
     * Tells whether a record with the given document name exists. The
     * placeholder counts, so the name "" is always reported as existing.
     *
     * @param docname the document name
     * @return {@code true} when some record has that name
     */
    public boolean isExist(String docname) {
        for (DocInfo doc : this.listdocs) {
            if (doc.getDOCNAMES().equals(docname)) {
                return true;
            }
        }
        return false;
    }

    /** Small manual demo: registers a few occurrences and prints the result. */
    public static void main(String[] args) {
        Nodes no = new Nodes("你好");
        no.addNewDocInfos("文章1");
        no.addNewDocInfos("文章1", 1);
        no.addNewDocInfos("文章1", 1);
        no.addNewDocInfos("文章2", 1);
        System.out.println(no.getWordCountByDoc("文章2"));
        System.out.println(no.toString());
    }
}