import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Counts how often each word occurs in a text file and writes a frequency
 * report, most frequent word first.
 *
 * <p>A "word" is a maximal run of ASCII letters; matching is case-insensitive
 * (the text is lower-cased before matching). One-letter words are included in
 * the overall total but are left out of the frequency table.
 */
public class Analyser {
    /** A word is a maximal run of ASCII letters. Compiled once and reused. */
    private static final Pattern WORD_PATTERN = Pattern.compile("[a-zA-Z]+");

    private final String inputFile;
    private final String outputFile;
    private final boolean showCount;
    /** Stored for API compatibility; not consulted anywhere in this class. */
    private final boolean spliteEvery500Words;
    /** Lower-cased word -> number of occurrences, rebuilt on every {@link #analysis()} call. */
    private final Map<String, Integer> map = new HashMap<String, Integer>();
    /** Number of pattern matches in the input, including one-letter words. */
    private int totalWordsInFile;
    /** Number of distinct words (two letters or longer). */
    private int totalWords;

    /**
     * Creates an analyser for one input file.
     *
     * @param inputFile           path of the text file to read
     * @param outputFile          path of the report to write; when {@code null}
     *                            the report goes to {@code inputFile + ".Done.txt"}
     * @param showCount           when {@code true} each report line is
     *                            {@code word<TAB>count}, otherwise just the word
     * @param spliteEvery500Words currently unused
     */
    public Analyser(String inputFile, String outputFile, boolean showCount, boolean spliteEvery500Words) {
        this.inputFile = inputFile;
        this.outputFile = (outputFile == null) ? inputFile + ".Done.txt" : outputFile;
        this.showCount = showCount;
        this.spliteEvery500Words = spliteEvery500Words;
    }

    /**
     * Reads the input file, counts its words and writes the report.
     *
     * <p>The report starts with two summary lines (total matches and number of
     * distinct words) followed by a blank line, then one line per distinct
     * word in descending order of frequency. Counters are reset at the start
     * of every call, so invoking this method repeatedly on the same instance
     * produces the same report each time.
     *
     * @throws Exception if the input cannot be read or the output cannot be written
     */
    public void analysis() throws Exception {
        // Start from a clean slate so a second call does not add to the first.
        map.clear();
        totalWordsInFile = 0;
        totalWords = 0;

        BufferedReader reader = new BufferedReader(new FileReader(inputFile));
        try {
            // The pattern cannot match across a line break, so counting line
            // by line is equivalent to matching the whole text at once and
            // avoids holding the entire file in memory.
            String line;
            while ((line = reader.readLine()) != null) {
                countWords(line);
            }
        } finally {
            reader.close();
        }

        Map<String, Integer> sorted = sortByValue(map);

        FileWriter writer = new FileWriter(new File(outputFile));
        try {
            writer.append("Total words in file: " + totalWordsInFile + "\n");
            writer.append("Total words [remove duplicated]: " + totalWords + "\n\n");
            for (Map.Entry<String, Integer> entry : sorted.entrySet()) {
                if (showCount) {
                    writer.append(entry.getKey() + "\t" + entry.getValue() + "\n");
                } else {
                    writer.append(entry.getKey() + "\n");
                }
            }
            writer.flush();
        } finally {
            writer.close();
        }
    }

    /** Adds every word found in {@code line} to the running totals. */
    private void countWords(String line) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default
        // locale (e.g. under Turkish rules "I" would not become "i").
        Matcher matcher = WORD_PATTERN.matcher(line.toLowerCase(Locale.ROOT));
        while (matcher.find()) {
            totalWordsInFile++;
            String word = matcher.group();
            if (word.length() < 2) {
                // One-letter words count towards the total only.
                continue;
            }
            Integer seen = map.get(word);
            if (seen == null) {
                map.put(word, 1);
                totalWords++;
            } else {
                map.put(word, seen + 1);
            }
        }
    }

    /**
     * Returns a copy of {@code map} whose iteration order is by descending value.
     *
     * <p>The sort is stable, so entries with equal values keep the relative
     * order in which {@code map} iterates them. The argument is not modified.
     *
     * @param map the map to order
     * @param <K> key type
     * @param <V> value type; must be comparable
     * @return a new {@link LinkedHashMap} ordered from largest to smallest value
     */
    public <K, V extends Comparable<? super V>> Map<K, V> sortByValue(
            Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new ArrayList<Map.Entry<K, V>>(map.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                // Reversed operands give descending order.
                return o2.getValue().compareTo(o1.getValue());
            }
        });
        Map<K, V> result = new LinkedHashMap<K, V>();
        for (Map.Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }
}
/**
 * Command-line entry point that runs an {@link Analyser} over one text file.
 *
 * <p>Usage: {@code VocabularyAnalysis [inputFile [outputFile]]}. With no
 * arguments the built-in default input path is used; with no output path the
 * {@code Analyser} picks its own default.
 */
public class VocabularyAnalysis {
    /** Input analysed when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT =
            "C:\\Users\\dl44003\\workspace\\NovelWords\\src\\EnglishPatient.txt";

    /**
     * Runs the analysis with per-word counts enabled.
     *
     * @param args optional: {@code args[0]} is the input file,
     *             {@code args[1]} the output file
     */
    public static void main(String[] args) {
        String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        String output = (args != null && args.length > 1) ? args[1] : null;
        try {
            Analyser ana = new Analyser(input, output, true, false);
            ana.analysis();
        } catch (Exception e) {
            // Top-level boundary: report which file failed, then the cause.
            System.err.println("Failed to analyse " + input);
            e.printStackTrace();
        }
    }
}