主方法
package EnglishProcessing;
import EnglishProcessing.util.TextProcessing;
import EnglishProcessing.util.WordConunt;
import java.io.IOException;
import java.util.List;
import java.util.*;
import EnglishProcessing.util.getTextContent;
/**
 * Entry point of the English text processing demo.
 *
 * <p>Lets the user choose an English txt document, splits it into words, reduces each word
 * to its lemma, counts how often each lemma occurs and prints the counts from the most
 * frequent to the least frequent.
 *
 * <p>Author: Gent1er
 * <br>Date: 2023/07/06
 */
public class EnglishProcess {
    /**
     * Runs the whole pipeline: choose a file, read it, lemmatize, count, sort, print.
     *
     * @param args command-line arguments (not used)
     * @throws IOException kept in the signature for existing callers; none of the calls
     *     made here declares it
     */
    public static void main(String[] args) throws IOException {
        getTextContent reader = new getTextContent();

        // chooseText() returns null when no file was selected. Stop here instead of
        // passing a null path on to the file reader.
        String filePath = reader.chooseText();
        if (filePath == null) {
            return;
        }

        // A null result means nothing could be read from the file.
        String fileContent = reader.getTextContent(filePath);
        if (fileContent == null) {
            System.out.println("空文件无法统计");
            return;
        }

        // The helpers below are static, so they are called on their classes rather
        // than on throw-away instances.
        List<String> wordList = TextProcessing.cutAnDrestar(fileContent);
        TreeMap<String, Integer> frequencies = WordConunt.wordCount(wordList);
        List<Map.Entry<String, Integer>> ranked = WordConunt.shotCount(frequencies);

        // Each entry prints as "word=count".
        for (Map.Entry<String, Integer> entry : ranked) {
            System.out.println(entry);
        }
    }
}
工具类
获取文件信息工具类
package EnglishProcessing.util;
import java.awt.*;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
/**
 * Utility for choosing a text file and loading its content.
 *
 * <p>The class name and the method {@link #getTextContent(String)} share the same
 * identifier; both are kept unchanged because callers depend on them.
 *
 * <p>Author: Gentle
 * <br>Date: 2023/07/06
 */
public class getTextContent {
    /**
     * Opens an AWT file-selection dialog and returns the path of the chosen file.
     *
     * @return the dialog's directory and file name joined into one path, or {@code null}
     *     when no file was chosen
     */
    public String chooseText() {
        FileDialog fileDialog = new FileDialog((Frame) null, "选择文件", FileDialog.LOAD);
        String directory;
        String filename;
        try {
            // FileDialog is modal: this call returns once the user has closed the dialog.
            fileDialog.setVisible(true);
            directory = fileDialog.getDirectory();
            filename = fileDialog.getFile();
        } finally {
            // Release the native window. A dialog that is never disposed stays displayable
            // and can keep the AWT event thread (and so the JVM) alive after main returns.
            fileDialog.dispose();
        }

        if (directory == null || filename == null) {
            System.out.println("没有选择文件");
            return null;
        }
        String filePath = directory + filename;
        System.out.println("选中的文件路径为:" + filePath);
        return filePath;
    }

    /**
     * Reads the file at the given path into a single string.
     *
     * <p>Every line is followed by a {@code '\n'} so that the last word of one line and the
     * first word of the next stay separate. The file is decoded with the platform default
     * charset used by {@link FileReader}.
     *
     * @param fileURL path of the file to read; may be {@code null}
     * @return the file content, or {@code null} when the path is {@code null}, the file
     *     cannot be found, or no line could be read (for example an empty file)
     */
    public String getTextContent(String fileURL) {
        if (fileURL == null) {
            return null;
        }

        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader on every path, including failures.
        try (BufferedReader br = new BufferedReader(new FileReader(fileURL))) {
            String line;
            while ((line = br.readLine()) != null) {
                // readLine() strips the line terminator, so put a separator back.
                content.append(line).append('\n');
            }
        } catch (FileNotFoundException e) {
            System.out.println("文件未找到");
        } catch (IOException e) {
            // Best effort: report the failure and return whatever was read before it.
            e.printStackTrace();
        }

        // Callers treat null as "nothing to process".
        return content.length() == 0 ? null : content.toString();
    }
}
package EnglishProcessing.util;
import java.awt.*;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
/**
 * Utility for choosing a text file and loading its content.
 *
 * <p>The class name and the method {@link #getTextContent(String)} share the same
 * identifier; both are kept unchanged because callers depend on them.
 *
 * <p>Author: Gentle
 * <br>Date: 2023/07/06
 */
public class getTextContent {
    /**
     * Opens an AWT file-selection dialog and returns the path of the chosen file.
     *
     * @return the dialog's directory and file name joined into one path, or {@code null}
     *     when no file was chosen
     */
    public String chooseText() {
        FileDialog fileDialog = new FileDialog((Frame) null, "选择文件", FileDialog.LOAD);
        String directory;
        String filename;
        try {
            // FileDialog is modal: this call returns once the user has closed the dialog.
            fileDialog.setVisible(true);
            directory = fileDialog.getDirectory();
            filename = fileDialog.getFile();
        } finally {
            // Release the native window. A dialog that is never disposed stays displayable
            // and can keep the AWT event thread (and so the JVM) alive after main returns.
            fileDialog.dispose();
        }

        if (directory == null || filename == null) {
            System.out.println("没有选择文件");
            return null;
        }
        String filePath = directory + filename;
        System.out.println("选中的文件路径为:" + filePath);
        return filePath;
    }

    /**
     * Reads the file at the given path into a single string.
     *
     * <p>Every line is followed by a {@code '\n'} so that the last word of one line and the
     * first word of the next stay separate. The file is decoded with the platform default
     * charset used by {@link FileReader}.
     *
     * @param fileURL path of the file to read; may be {@code null}
     * @return the file content, or {@code null} when the path is {@code null}, the file
     *     cannot be found, or no line could be read (for example an empty file)
     */
    public String getTextContent(String fileURL) {
        if (fileURL == null) {
            return null;
        }

        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader on every path, including failures.
        try (BufferedReader br = new BufferedReader(new FileReader(fileURL))) {
            String line;
            while ((line = br.readLine()) != null) {
                // readLine() strips the line terminator, so put a separator back.
                content.append(line).append('\n');
            }
        } catch (FileNotFoundException e) {
            System.out.println("文件未找到");
        } catch (IOException e) {
            // Best effort: report the failure and return whatever was read before it.
            e.printStackTrace();
        }

        // Callers treat null as "nothing to process".
        return content.length() == 0 ? null : content.toString();
    }
}
单词处理工具类
package EnglishProcessing.util;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
 * Word-processing utility built on Stanford CoreNLP.
 *
 * <p>Author: Gentle
 * <br>Date: 2023/07/06
 */
public class TextProcessing {
    /**
     * Splits the text into tokens and returns the lemma of every token.
     *
     * <p>A new CoreNLP pipeline is built on each call.
     *
     * <p>NOTE(review): the lemma of every token is added to the result, which presumably
     * includes punctuation tokens. Confirm whether those should be counted as words.
     *
     * @param fileContent text to process; may be {@code null}
     * @return the lemmas in document order; an empty list when the input is {@code null}
     *     or empty
     */
    public static List<String> cutAnDrestar(String fileContent) {
        List<String> wordList = new ArrayList<>();
        // Nothing to annotate: return before building the pipeline at all.
        if (fileContent == null || fileContent.isEmpty()) {
            return wordList;
        }

        Properties properties = new Properties();
        // Annotators, in order: tokenization, sentence splitting, part-of-speech tagging,
        // lemmatization.
        properties.setProperty("annotators", "tokenize,ssplit,pos,lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);

        Annotation document = new Annotation(fileContent);
        pipeline.annotate(document);

        // Walk the sentences, then the tokens of each sentence, collecting each lemma.
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
                wordList.add(lemma);
            }
        }
        return wordList;
    }
}
单词统计工具类
package EnglishProcessing.util;
import java.util.*;
/**
 * Word-frequency helpers.
 *
 * <p>Author: Gentle
 * <br>Date: 2023/07/06
 */
public class WordConunt {
    /**
     * Counts how many times each word occurs in the list.
     *
     * @param wordList the words to count
     * @return a {@link TreeMap} from word to occurrence count, ordered by the natural
     *     ordering of the words
     */
    public static TreeMap<String, Integer> wordCount(List<String> wordList) {
        TreeMap<String, Integer> frequencies = new TreeMap<>();
        for (String word : wordList) {
            // First occurrence stores 1; later occurrences add 1 to the stored count.
            frequencies.merge(word, 1, Integer::sum);
        }
        return frequencies;
    }

    /**
     * Orders the entries of the map by count, highest first.
     *
     * <p>The sort is stable, so entries with equal counts stay in the key order of the map.
     *
     * @param treeMap word counts to rank
     * @return a new list holding the map's entries in descending count order
     */
    public static List<Map.Entry<String, Integer>> shotCount(TreeMap<String, Integer> treeMap) {
        List<Map.Entry<String, Integer>> ranked = new ArrayList<>(treeMap.entrySet());
        Comparator<Map.Entry<String, Integer>> ascendingByCount = Map.Entry.comparingByValue();
        ranked.sort(Collections.reverseOrder(ascendingByCount));
        return ranked;
    }
}