软件工程作业1

最新推荐文章于 2024-04-26 21:51:50 发布

置顶轩轩老油条不油

最新推荐文章于 2024-04-26 21:51:50 发布

阅读量324

点赞数

分类专栏：原创。java 原创。android 原创。javaweb 文章标签： Top10

本文链接：https://blog.csdn.net/zzx520110/article/details/39862021

版权

原创。android 同时被 3 个专栏收录

3 篇文章 0 订阅

订阅专栏

原创。javaweb

2 篇文章 0 订阅

订阅专栏

原创。java

1 篇文章 0 订阅

订阅专栏

标题：选词

要求：

读取一篇文本，筛选出其中出现频率最高的前十个有意义的词

思路：先使用DataInputStream读取文本，得到一个数据流，并将其内容读取为一个字符串；然后用正则表达式把该字符串分割成由单个单词组成的字符串数组；接着使用ArrayList统计单词的种类（过程中用if语句去除没有意义的单词，如the、is、are等）；再对照原字符串数组，统计每一种单词的出现频率；最后按出现频率从高到低进行排序，输出频率最高的前十个词。

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import javax.swing.JFileChooser;

/**
 * Reports the most frequent meaningful words of a text file.
 *
 * <p>The text is split into words, a fixed list of common English function
 * words is discarded, the remaining words are counted case-insensitively and
 * the ten most frequent ones are printed to standard output.
 */
public class Read {

	/** Maximum number of words printed by {@link #select(String)}. */
	private static final int TOP_COUNT = 10;

	/**
	 * Word separators: any run of whitespace, commas, periods, exclamation
	 * marks or question marks.
	 */
	private static final String WORD_DELIMITERS = "[\\s,.!?]+";

	/** Lower-case words that carry no meaning of their own and are never counted. */
	private static final Set<String> STOP_WORDS = new HashSet<String>(Arrays.asList(
			"the", "am", "is", "are", "was", "were", "been", "that", "this",
			"these", "those", "to", "in", "on", "at", "of"));

	/**
	 * Reads the whole file as text.
	 *
	 * <p>Lines are joined with a single {@code '\n'} so that the last word of
	 * one line and the first word of the next stay separate words. The file
	 * is decoded as UTF-8.
	 *
	 * @param path path of the file to read
	 * @return the file content with line breaks normalised to {@code '\n'}
	 *         and no trailing line break
	 * @throws IOException if the file cannot be opened or read
	 */
	public static String read(String path) throws IOException {
		FileInputStream stream = new FileInputStream(path);
		try {
			BufferedReader in = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
			StringBuilder txt = new StringBuilder();
			boolean first = true;
			String line;
			while ((line = in.readLine()) != null) {
				if (!first) {
					txt.append('\n');
				}
				txt.append(line);
				first = false;
			}
			return txt.toString();
		} finally {
			// Closing the underlying stream releases the file even when reading fails.
			stream.close();
		}
	}

	/**
	 * Prints the most frequent meaningful words of a file, most frequent first.
	 *
	 * <p>At most {@value #TOP_COUNT} words are printed; a text with fewer
	 * distinct words prints only those. Each word produces two lines: its
	 * rank and the word, then its number of occurrences.
	 *
	 * @param path path of the file to analyse
	 * @throws IOException if the file cannot be opened or read
	 */
	public static void select(String path) throws IOException {
		List<Map.Entry<String, Integer>> ranked = rankWords(read(path));
		int shown = Math.min(TOP_COUNT, ranked.size());
		for (int i = 0; i < shown; i++) {
			Map.Entry<String, Integer> entry = ranked.get(i);
			System.out.println("这是第" + (i + 1) + "个: " + entry.getKey());
			System.out.println("数量为" + entry.getValue() + "个");
		}
	}

	/**
	 * Counts every meaningful word of the text and orders the result by
	 * descending count.
	 *
	 * @param txt text to analyse
	 * @return one entry per distinct lower-cased word; words with equal
	 *         counts keep the order of their first appearance
	 */
	private static List<Map.Entry<String, Integer>> rankWords(String txt) {
		// LinkedHashMap remembers first-appearance order, which the stable
		// sort below then preserves among words with the same count.
		Map<String, Integer> counts = new LinkedHashMap<String, Integer>();
		for (String token : txt.split(WORD_DELIMITERS)) {
			String word = token.toLowerCase(Locale.ROOT);
			// split() yields an empty first token when the text starts with a delimiter.
			if (word.length() == 0 || STOP_WORDS.contains(word)) {
				continue;
			}
			Integer seen = counts.get(word);
			counts.put(word, seen == null ? 1 : seen + 1);
		}

		List<Map.Entry<String, Integer>> ranked =
				new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
		Collections.sort(ranked, new Comparator<Map.Entry<String, Integer>>() {
			@Override
			public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
				return right.getValue().compareTo(left.getValue());
			}
		});
		return ranked;
	}

	/**
	 * Manual test driver: lets the user pick a file in a dialog and prints
	 * its most frequent words. Exits silently when the dialog is cancelled.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		JFileChooser f = new JFileChooser();
		if (f.showOpenDialog(null) != JFileChooser.APPROVE_OPTION) {
			System.exit(0);
		}
		String path = f.getSelectedFile().getAbsolutePath();
		try {
			select(path);
		} catch (IOException e) {
			System.err.println("Failed to read " + path + ": " + e.getMessage());
			System.exit(1);
		}
	}

}

运行结果：