Vocabulary analysis


import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Counts how often each word occurs in a text file and writes a frequency
 * report, most frequent word first.
 *
 * <p>A "word" is a maximal run of ASCII letters, compared case-insensitively.
 * Every such run counts towards the file total, but runs shorter than two
 * letters are left out of the per-word table and the distinct-word count.
 */
public class Analyser {
    /** Maximal runs of ASCII letters; compiled once because it never changes. */
    private static final Pattern WORD_PATTERN = Pattern.compile("[a-zA-Z]+");

    private final String inputFile;
    private final String outputFile;
    private final boolean showCount;
    /** Accepted by the constructor but not read anywhere in this class yet. */
    private final boolean spliteEvery500Words;
    /** Lower-cased word to number of occurrences. */
    private final Map<String, Integer> map = new HashMap<String, Integer>();
    /** Every letter run found, including the single-letter ones. */
    private int totalWordsInFile;
    /** Number of distinct words that made it into {@link #map}. */
    private int totalWords;

    /**
     * Creates an analyser for one input file.
     *
     * @param inputFile path of the text file to read
     * @param outputFile path of the report to write; when {@code null} the
     *        report goes to {@code inputFile + ".Done.txt"}
     * @param showCount whether each report line carries the word's count
     * @param spliteEvery500Words stored but currently unused
     */
    public Analyser(String inputFile, String outputFile, boolean showCount, boolean spliteEvery500Words) {
        super();
        this.inputFile = inputFile;
        this.outputFile = (null == outputFile) ? inputFile + ".Done.txt" : outputFile;
        this.showCount = showCount;
        this.spliteEvery500Words = spliteEvery500Words;
    }

    /**
     * Reads the input file, tallies its words and writes the report.
     *
     * @throws Exception if the input cannot be read or the report cannot be written
     */
    public void analysis() throws Exception {
        // Start from a clean slate so a repeated call does not add to earlier counts.
        map.clear();
        totalWordsInFile = 0;
        totalWords = 0;

        countWords();
        writeReport(sortByValue(map));
    }

    /** Streams the input line by line and fills the word table and both totals. */
    private void countWords() throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(inputFile));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                // Locale.ROOT keeps "I" -> "i" regardless of the platform's default locale.
                Matcher matcher = WORD_PATTERN.matcher(line.toLowerCase(Locale.ROOT));
                while (matcher.find()) {
                    totalWordsInFile++;
                    String word = matcher.group();
                    if (word.length() < 2) {
                        continue;
                    }
                    Integer seen = map.get(word);
                    if (seen == null) {
                        map.put(word, 1);
                        totalWords++;
                    } else {
                        map.put(word, seen + 1);
                    }
                }
            }
        } finally {
            reader.close();
        }
    }

    /** Writes the two summary lines followed by one line per word in the given order. */
    private void writeReport(Map<String, Integer> sorted) throws IOException {
        FileWriter writer = new FileWriter(new File(outputFile));
        try {
            writer.append("Total words in file: " + totalWordsInFile + "\n");
            writer.append("Total words [remove duplicated]: " + totalWords + "\n\n");
            for (Map.Entry<String, Integer> entry : sorted.entrySet()) {
                if (showCount) {
                    writer.append(entry.getKey() + "\t" + entry.getValue() + "\n");
                } else {
                    writer.append(entry.getKey() + "\n");
                }
            }
            writer.flush();
        } finally {
            // Release the file handle even when a write above fails.
            writer.close();
        }
    }

    /**
     * Returns a copy of {@code map} whose iteration order is by descending value.
     *
     * @param map the entries to order; not modified
     * @return a new {@link LinkedHashMap} holding the same entries, largest value first
     */
    public <K, V extends Comparable<? super V>> Map<K, V> sortByValue(
            Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new ArrayList<Map.Entry<K, V>>(map.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                // Arguments swapped on purpose: larger values sort first.
                return (o2.getValue()).compareTo(o1.getValue());
            }
        });

        // LinkedHashMap preserves the sorted insertion order for callers that iterate.
        Map<K, V> result = new LinkedHashMap<K, V>();
        for (Map.Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }
}




public class VocabularyAnalysis {
public static void main(String[] args) {
try {  
Analyser ana=new Analyser("C:\\Users\\dl44003\\workspace\\NovelWords\\src\\EnglishPatient.txt",null,true,false);
ana.analysis();
} catch (Exception e) {
e.printStackTrace();
}
}
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值