软件工程-关于词频统计程序设计实现分析

最新推荐文章于 2019-11-24 18:57:34 发布

置顶寒风中的呐喊

最新推荐文章于 2019-11-24 18:57:34 发布

阅读量1.1k

点赞数

文章标签：测试 java

本文链接：https://blog.csdn.net/u010908743/article/details/40302429

版权

需求：

写一个程序，读取一个30kb－300kb文本文件，统计其中词语的个数，并对其中高频词汇进行统计分析。

需求分析：

如果只简单实现程序功能，程序比较小，采用java语言，windows8开发环境，eclipse开发工具，VisualVM测试软件进行程序分析即可。由需求可知，程序需要的功能有，读取文件，统计单词，筛选出无意义的词语，排序，展示结果。

概要设计：

设计一个WordStatistics类，其中包含读取文件、统计单词、去除无意义的词、排序和展示结果的功能。再用一个测试类，直接调用该类。

详细设计：

WordStatistics需要读取文本文件。在Java中有字节流和字符流两种读取方式，由于统计的对象是英文单词文本，所以选择字符流方式读取：用FileInputStream打开文件，经InputStreamReader转换为字符流，再用BufferedReader包装后按行读取，然后用Java中的StringTokenizer工具进行分词。用Map集合的key-value键值对容器存放分出的单词，key代表单词，value为单词出现的次数。在存放的同时，过滤掉无意义的词语。最后用Arrays.sort按照value值的大小进行排序。

关键算法：

关键代码：

package com.coffee.statistics;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

/**
 * Counts the words of a text file and prints the most frequent ones.
 *
 * <p>The file is split into tokens on whitespace and a small set of
 * punctuation characters. Tokens that appear in a fixed stop-word list are
 * excluded from the frequency ranking (but still included in the total word
 * count that is printed first).
 *
 * <p>Every call to {@link #statistics()} starts from a clean state, so the
 * same instance can be run repeatedly without counts accumulating.
 */
public class WordStatistics {
	/** Maximum number of entries printed in the frequency ranking. */
	private static final int TOP_COUNT = 20;

	/** Characters treated as word separators by the tokenizer. */
	private static final String DELIMITERS = "\n\r\t .,\"?!:'";

	/**
	 * Words excluded from the ranking. Matching is exact and case-sensitive,
	 * which is why several words are listed in two capitalizations.
	 * NOTE(review): "thiers" looks like a typo for "theirs" and "form" may be
	 * an accidental variant of "from" - confirm before changing the list.
	 */
	private static final Set<String> STOP_WORDS = new HashSet<String>(
			Arrays.asList("the", "I", "of", "and", "to", "gave", "So",
					"And", "was", "you", "our", "You", "your", "he", "He",
					"me", "we", "We", "their", "thiers", "not", "when",
					"over", "what", "then", "where", "there", "here", "who",
					"a", "an", "his", "in", "her", "if", "If", "Do", "do",
					"said", "did", "day", "took", "on", "or", "had", "for",
					"by", "at", "that", "she", "She", "as", "it", "they",
					"one", "them", "go", "because", "saw", "give", "him",
					"with", "into", "men", "t", "human", "be", "is", "are",
					"am", "were", "from", "came", "Then", "must", "told",
					"but", "no", "form", "up", "all", "so", "down", "after",
					"before", "off", "out", "got", "have", "may", "any",
					"asked", "himself", "my", "became", "herself", "back",
					"s", "been", "one", "two", "will", "would", "gave",
					"made", "make", "let", "Let"));

	/** Location of the text file to analyse. */
	private String path;

	/** Creates an instance without a file; call {@link #setPath(String)} before use. */
	public WordStatistics() {

	}

	/**
	 * Creates an instance for the given file.
	 *
	 * @param path location of the text file to analyse
	 */
	public WordStatistics(String path) {
		this.path = path;
	}

	/**
	 * Changes the file analysed by subsequent calls to {@link #statistics()}.
	 *
	 * @param path location of the text file to analyse
	 */
	public void setPath(String path) {
		this.path = path;
	}

	/**
	 * Reads the configured file, prints the total number of tokens and then
	 * up to {@value #TOP_COUNT} non-stop-words ordered by descending
	 * frequency (ties are ordered alphabetically so the output is stable).
	 *
	 * <p>If the file cannot be opened or read, the stack trace is printed
	 * and nothing else is written.
	 */
	public void statistics() {
		Map<String, Integer> counts = new HashMap<String, Integer>();
		int total;
		try {
			total = countWords(counts);
		} catch (IOException e) {
			// Also covers FileNotFoundException; the method stays best-effort.
			e.printStackTrace();
			return;
		}

		System.out.println("单词数量" + total);
		printTopWords(counts);
	}

	/**
	 * Tokenizes the file line by line and records the frequency of every
	 * meaningful word.
	 *
	 * @param counts receives word -> number of occurrences
	 * @return the total number of tokens in the file, stop words included
	 * @throws IOException if the file cannot be opened or read
	 */
	private int countWords(Map<String, Integer> counts) throws IOException {
		// No charset is given, so the platform default charset is used.
		BufferedReader reader = new BufferedReader(new InputStreamReader(
				new FileInputStream(new File(path))));
		try {
			int total = 0;
			String line;
			while ((line = reader.readLine()) != null) {
				// Tokenizing per line keeps words on adjacent lines apart;
				// readLine() strips the line terminator.
				StringTokenizer tokenizer = new StringTokenizer(line,
						DELIMITERS);
				total += tokenizer.countTokens();
				while (tokenizer.hasMoreTokens()) {
					String word = tokenizer.nextToken();
					if (!isMeaning(word)) {
						continue;
					}
					Integer seen = counts.get(word);
					counts.put(word, seen == null ? 1 : seen + 1);
				}
			}
			return total;
		} finally {
			reader.close();
		}
	}

	/**
	 * Prints the ranking header followed by the most frequent words as
	 * {@code word:count} lines.
	 *
	 * @param counts word -> number of occurrences
	 */
	private void printTopWords(Map<String, Integer> counts) {
		List<Map.Entry<String, Integer>> ranked =
				new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
		Collections.sort(ranked, new Comparator<Map.Entry<String, Integer>>() {
			@Override
			public int compare(Map.Entry<String, Integer> left,
					Map.Entry<String, Integer> right) {
				int byCount = right.getValue().compareTo(left.getValue());
				if (byCount != 0) {
					return byCount;
				}
				return left.getKey().compareTo(right.getKey());
			}
		});

		System.out.println("高频词汇前二十");
		// The text may contain fewer distinct words than TOP_COUNT.
		int shown = Math.min(TOP_COUNT, ranked.size());
		for (int i = 0; i < shown; i++) {
			Map.Entry<String, Integer> entry = ranked.get(i);
			System.out.println(entry.getKey() + ":" + entry.getValue());
		}
	}

	/**
	 * Tells whether a token should take part in the frequency ranking.
	 *
	 * @param key the token to check
	 * @return {@code false} for {@code null} and for stop words,
	 *         {@code true} otherwise
	 */
	private boolean isMeaning(String key) {
		return key != null && !STOP_WORDS.contains(key);
	}
}

程序运行结果截图：

选取的文本内容是圣经的英文版

程序测试：

<span style="font-family:Microsoft YaHei;font-size:14px;">package com.coffee.statistics;
/**
 * Driver for {@link WordStatistics}: runs the word-frequency analysis of one
 * fixed text file 100 times in a row, each time with a fresh
 * {@code WordStatistics} instance.
 *
 * @author apple
 */
public class StatisticTest {
	/** Text file analysed on every run. */
	private static final String TEXT_PATH = "C:/Users/coffee/Desktop/englishText.txt";

	/** Number of times the analysis is repeated. */
	private static final int RUNS = 100;

	public static void main(String[] args) {
		for (int remaining = RUNS; remaining > 0; remaining--) {
			WordStatistics analysis = new WordStatistics(TEXT_PATH);
			analysis.statistics();
		}
	}
}
</span>

循环100次测试其结果：

CPU使用情况：堆内存的使用：