要求:写一个程序,分析一个文本文件中各个词出现的频率,并且把频率最高的10个词打印出来。文本文件大约是30KB~300KB大小。
1.算法思想
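从英文文本文件(如 english.txt;程序中实际读取的路径为 ./src/dictionary.txt)按行读取英文文本,用正则表达式 [a-zA-Z]+ 提取出其中的每个单词,并将每个单词(String)封装成 Word 对象放入一个 WordList 中。使用 HashMap 以 String 作为 key、rate 作为 value,这样在 update 方法中就可以直接用 map 的 get 方法判断某个 key(String)是否已经存在;统计出各个单词的出现频率 rate 后,按频率从高到低排序,并打印出频率最高的 10 个单词(排序使用 Collections 工具类的 sort() 方法)。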
注:单词String和频率rate封装在Word类中,以key(String)是否相同来衡量Word对象是否相同
2.程序
package Searchword;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Counts how often each word occurs in a text file and prints the most
 * frequent ones.
 *
 * <p>A word is a maximal run of ASCII letters ({@code [a-zA-Z]+}). Matching is
 * case-sensitive, so {@code "The"} and {@code "the"} are counted separately.
 */
public class CountOfWordsTest {
    /** Maximum number of entries printed by {@link #printList1}. */
    private static final int TOP_COUNT = 10;

    /** Path of the text file read by {@link #readFromFile1()}. */
    private static final String INPUT_PATH = "./src/dictionary.txt";

    /** Word matcher; compiled once instead of on every call. */
    private static final Pattern WORD_PATTERN = Pattern.compile("[a-zA-Z]+");

    public static void main(String[] args) {
        test1();
    }

    /**
     * Runs the whole pipeline: read the words from the input file, count them,
     * and print the most frequent ones.
     */
    public static void test1() {
        ArrayList<String> list = readFromFile1();
        ArrayList<Word<String, Integer>> wordList = countOfWords1(list);
        printList1(wordList);
    }

    /**
     * Prints the first {@link #TOP_COUNT} entries of {@code wordList}, one per
     * line, as {@code key..value}.
     *
     * @param wordList entries to print, expected to be sorted already
     */
    private static void printList1(ArrayList<Word<String, Integer>> wordList) {
        // Clamp to the list size: a text with fewer than TOP_COUNT distinct
        // words must not cause an IndexOutOfBoundsException.
        int limit = Math.min(TOP_COUNT, wordList.size());
        for (int i = 0; i < limit; i++) {
            Word<String, Integer> word = wordList.get(i);
            System.out.println(word.getKey() + ".." + word.getValue());
        }
    }

    /**
     * Counts the occurrences of every distinct string in {@code list}.
     *
     * @param list the words to count
     * @return one {@code Word} per distinct string, sorted by descending
     *         count; words with equal counts stay in first-occurrence order
     */
    public static ArrayList<Word<String, Integer>> countOfWords1(
            ArrayList<String> list) {
        ArrayList<Word<String, Integer>> wordList = new ArrayList<Word<String, Integer>>();
        // Index from word text to its entry in wordList. This replaces a
        // linear indexOf scan per word (quadratic overall) with a hash lookup,
        // while wordList itself keeps the first-occurrence order.
        Map<String, Word<String, Integer>> index = new HashMap<String, Word<String, Integer>>();
        for (String token : list) {
            Word<String, Integer> word = index.get(token);
            if (word == null) {
                word = new Word<String, Integer>(token, 1);
                index.put(token, word);
                wordList.add(word);
            } else {
                word.setValue(word.getValue() + 1);
            }
        }
        // Collections.sort is stable, so ties keep their insertion order.
        Collections.sort(wordList, new Comparator<Word<String, Integer>>() {
            @Override
            public int compare(Word<String, Integer> o1,
                    Word<String, Integer> o2) {
                // Reversed operands give descending order.
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        return wordList;
    }

    /**
     * Records one occurrence of {@code s} in {@code WordList}: increments the
     * count of the existing entry with that key, or appends a new entry with
     * count 1.
     *
     * <p>This does a linear search of the list; {@link #countOfWords1} uses a
     * hash index instead and no longer calls this method.
     *
     * @param WordList the list of counted words, modified in place
     * @param s the word to record
     */
    public static void update1(ArrayList<Word<String, Integer>> WordList,
            String s) {
        Word<String, Integer> candidate = new Word<String, Integer>(s, 1);
        int idx = WordList.indexOf(candidate);
        if (idx == -1) {
            WordList.add(candidate);
        } else {
            Word<String, Integer> existing = WordList.get(idx);
            existing.setValue(existing.getValue() + 1);
        }
    }

    /**
     * Reads {@code ./src/dictionary.txt} and splits it into words.
     *
     * <p>If the file cannot be opened or read, the stack trace is printed and
     * the words collected so far (possibly none) are returned.
     *
     * @return every match of {@code [a-zA-Z]+} in the file, in file order;
     *         never {@code null}
     */
    public static ArrayList<String> readFromFile1() {
        ArrayList<String> list = new ArrayList<String>();
        // Created before the file is opened so that a failed open leaves an
        // empty buffer rather than a null one.
        StringBuilder sb = new StringBuilder();
        BufferedReader br = null;
        // try/finally rather than try-with-resources: the rest of this file
        // uses no Java 7+ syntax.
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(
                    INPUT_PATH)));
            String line;
            while ((line = br.readLine()) != null) {
                // The space keeps words on adjacent lines from being joined.
                sb.append(line).append(' ');
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        Matcher m = WORD_PATTERN.matcher(sb);
        while (m.find()) {
            list.add(m.group());
        }
        return list;
    }

    /**
     * A key together with a mutable value; used here as a word and its count.
     *
     * <p>Two {@code Word} objects are equal when their keys are equal; the
     * value takes no part in {@code equals} or {@code hashCode}.
     */
    private static class Word<KeyType, ValueType extends Comparable<? super ValueType>> {
        private KeyType key;
        private ValueType value;

        public Word(KeyType key, ValueType value) {
            this.key = key;
            this.value = value;
        }

        public KeyType getKey() {
            return key;
        }

        public void setKey(KeyType key) {
            this.key = key;
        }

        public ValueType getValue() {
            return value;
        }

        public void setValue(ValueType value) {
            this.value = value;
        }

        /** Compares by key only; a null key equals only another null key. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || obj.getClass() != Word.class) {
                return false;
            }
            Word<?, ?> other = (Word<?, ?>) obj;
            return key == null ? other.key == null : key.equals(other.key);
        }

        /**
         * Hashes the key only, consistent with {@link #equals(Object)}.
         * The hash changes if {@link #setKey} is called.
         */
        @Override
        public int hashCode() {
            return key == null ? 0 : key.hashCode();
        }

        /** Returns {@code key..value}, the format used when printing. */
        @Override
        public String toString() {
            return key + ".." + value;
        }
    }
}
4.运行结果
由上图可见,计算300KB的文本用了280毫秒