/**
 * Accumulates occurrence counts of tagged words read from a set of files.
 *
 * <p>Each file's text is split on single spaces into tokens, and each token is split on
 * {@code "_"} into a word (first part) and a tag (second part). Only words whose tag is one
 * of {@code IN, WP, PP, DT, CC, MD, RP, TO} are counted.
 */
public class FuncWordToMap {

    /**
     * Adds one to {@code map}'s count for every token in every file whose tag is in the
     * accepted set. Counts already present in {@code map} are incremented, so the same map
     * can be passed in repeatedly to keep accumulating.
     *
     * <p>Tokens that do not contain a non-empty tag after the first {@code "_"} (for example
     * the empty tokens produced by consecutive spaces) are skipped instead of raising
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param filelist files to read; each is loaded through
     *                 {@code ReadFileByBuffer.ReadFileByBufferdeReader} using its absolute path
     * @param map      word-to-count map that is updated in place; keys are {@code String}
     *                 words and values are {@code Integer} counts
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static void toMap(File[] filelist, Map map) {
        for (File file : filelist) {
            String source = ReadFileByBuffer.ReadFileByBufferdeReader(file.getAbsolutePath());
            // NOTE(review): whether the reader can return null is not visible from here;
            // this guard is purely defensive so one unreadable file does not abort the run.
            if (source == null) {
                continue;
            }

            for (String token : source.split(" ")) {
                // String.split drops trailing empty strings, so "word", "word_", "_" and ""
                // all yield fewer than two parts and therefore carry no usable tag.
                String[] parts = token.split("_");
                if (parts.length < 2) {
                    continue;
                }

                String word = parts[0];
                String tag = parts[1];
                if (!isCountedTag(tag)) {
                    continue;
                }

                // A single lookup serves both the "first occurrence" and "seen before" cases.
                Integer seen = (Integer) map.get(word);
                int updated = (seen == null) ? 1 : seen.intValue() + 1;
                map.put(word, Integer.valueOf(updated));
            }
        }
    }

    /** Returns true when {@code tag} is one of the tags whose words are counted. */
    private static boolean isCountedTag(String tag) {
        return "IN".equals(tag)
                || "WP".equals(tag)
                || "PP".equals(tag)
                || "DT".equals(tag)
                || "CC".equals(tag)
                || "MD".equals(tag)
                || "RP".equals(tag)
                || "TO".equals(tag);
    }
}
/**
 * Orders word-count map entries from the highest count to the lowest.
 */
public class FuncWordMapSort {

    /**
     * Copies the entries of {@code map1} into a list and sorts that list by value, largest
     * value first. The sort is stable, so entries with equal values keep the relative order
     * in which the map's entry set supplied them.
     *
     * @param map1 map whose entries are sorted; an {@code Integer} value is compared
     *             directly, any other value is compared by parsing its {@code toString()}
     *             text as an int
     * @param out  not used by the code in this method; kept in the signature so existing
     *             callers continue to compile
     * @return a new list holding the entries of {@code map1} in descending order of value
     * @throws IOException           declared for existing callers; nothing in this method
     *                               raises it
     * @throws NumberFormatException if a non-{@code Integer} value's text is not a valid int
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static List<Map.Entry<String, Integer>> wordSort(HashMap map1, OutputStream out) throws IOException {
        List<Map.Entry<String, Integer>> list = new LinkedList<Map.Entry<String, Integer>>();
        list.addAll(map1.entrySet());

        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> obj1, Map.Entry<String, Integer> obj2) {
                // Sort from high to low: the entry with the larger count comes first.
                int first = countOf(obj1);
                int second = countOf(obj2);
                if (first < second) {
                    return 1;
                }
                if (first > second) {
                    return -1;
                }
                return 0;
            }
        });

        return list;
    }

    /**
     * Reads an entry's value as an int. The entry is taken as a raw type on purpose: the
     * input map is raw, so a value is not guaranteed to be an {@code Integer} at run time
     * and must not be cast implicitly.
     */
    @SuppressWarnings("rawtypes")
    private static int countOf(Map.Entry entry) {
        Object value = entry.getValue();
        if (value instanceof Integer) {
            return ((Integer) value).intValue();
        }
        return Integer.parseInt(value.toString());
    }
}
将统计结果输出到 ARFF 文件后，内容如下：
@relation amazonReview
@attribute "the" numeric
@attribute "a" numeric
@attribute "and" numeric
@attribute "to" numeric
@attribute "of" numeric
@attribute "is" numeric
@attribute "I" numeric
@attribute "in" numeric
@attribute "it" numeric
@attribute "that" numeric
@attribute "for" numeric
@attribute "you" numeric
@attribute "this" numeric
@attribute "with" numeric
@attribute "are" numeric
@attribute "on" numeric............
@data
20,13,7,7,8,6,14,2,9,5,6,8,5,5,0,6,5,1,2,1,3,2,1,1,2,2,1,0,1,7,0,3,0,0,1,1,0,1,0,1,2,2,0,1,0,1,0,0,0,0,0,0,1,3,1,0,0,5,2,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Chandler
15,8,8,5,5,1,13,14,1,5,5,4,0,2,5,1,0,1,4,0,2,0,5,2,1,3,0,1,0,1,0,1,0,0,1,0,0,0,3,5,4,2,0,0,1,2,1,0,0,0,1,2,0,0,2,1,3,3,0,2,0,0,1,0,0,2,0,0,0,2,0,1,0,0,0,0,3,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Chandler
9,4,8,3,4,2,10,2,2,3,5,1,3,0,0,2,1,2,3,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,2,0,1,0,2,0,2,0,1,0,0,0,0,0,0,0,0,0,1,2,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Chandler
7,11,11,6,1
……（其余数据省略）以上每一行对应一篇文章，行中各数值依次是上面各属性词在该文章中出现的次数。