使用词表遍历并替换文章中的单词(合并词形、去除无用词),然后再进行词频统计
要求
- 去除介词等无用单词
- 单词单复数,动词不同时态等合并
代码
import java.awt.print.Printable;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
/* 使用两重循环,分别遍历时态替换表 lemmas.txt 和介词表 disablewords.txt 替换文章重复时态和介词
* 之后使用 treeMap 统计对应词的数量
*/
public class WordCount {
public static void main(String[] args) {
try {
RandomAccessFile txtPoint = new RandomAccessFile("E:\\tt.txt", "r");
RandomAccessFile lemmasPoint = new RandomAccessFile("E:\\lemmas.txt", "r");
RandomAccessFile prePoint = new RandomAccessFile("E:\\disablewords.txt", "r");
long txtLen = txtPoint.length();
long lemmasLen = lemmasPoint.length();
long preLen = prePoint.length();
int lineLen;
String regx = "[\\s\\pP\\d]+";
String txt = null;
String lemmas = null;
String pre = null;
String[] lemmasChange;
while (txtPoint.getFilePointer() < txtLen) {
txt = txt + txtPoint.readLine();
}
String txtChange[] = txt.toLowerCase().split(regx);
System.out.println("txt结束");
while (lemmasPoint.getFilePointer() < lemmasLen) {
lemmas = lemmasPoint.readLine();
lemmasChange = lemmas.split("\\s");
lineLen = lemmasChange.length;
for (int i=1;i<lineLen;i++) {
for (int j=0;j <txtChange.length;j++) {
if (lemmasChange[i].matches(txtChange[j])) {
txtChange[j] = lemmasChange[0];
}
}
}
}
System.out.println("lemmas结束");
while (prePoint.getFilePointer() < preLen) {
pre = prePoint.readLine();
for (int j=0;j <txtChange.length;j++) {
if (pre.matches(txtChange[j])) {
txtChange[j] = "";
}
}
}
System.out.println("pre结束");
Map<String,Integer> map = new TreeMap<String,Integer>();
for (String sss : txtChange) {
if (!sss.matches("")) {
if(map.get(sss) != null) {
int value = ((Integer)map.get(sss)).intValue();
value++;
map.put(sss, new Integer(value));
}
else {
map.put(sss, new Integer(1));
}
}
}
List<Entry<String, Integer>> list = new ArrayList<Entry<String, Integer>>(map.entrySet());
Collections.sort(list,new Comparator<Map.Entry<String,Integer>>() {
//升序排序
public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
return o2.getValue().compareTo(o1.getValue());
}
});
int i = 0;
for (Entry<String, Integer> e: list) {
i++;
if (i >= 10) {
break;
}
System.out.println(e.getKey()+":"+e.getValue());
}
} catch (Exception e) {
System.out.println(e);
}
}
}