统计文件词频和词量并显示消耗时间_0004_build200402

最新推荐文章于 2024-07-21 22:06:29 发布

置顶蓝飒飒

最新推荐文章于 2024-07-21 22:06:29 发布

阅读量157

点赞数 1

本文链接：https://blog.csdn.net/LanSasa_00/article/details/105345153

版权

编程小白的1000份作业专栏收录该内容

4 篇文章 0 订阅

订阅专栏

C++文件处理高级版作业

分别用RTK in Wiki.txt和Hamlet-25.txt两个文件作为输入

①统计其中每个单词的数量②然后把数量最多的前20个单词显示出来

③然后分别显示读取和排序所消耗的时间

参考运行结果如下图所示

#include<time.h>
#include<algorithm>
#include<cctype>
#include<cstdlib>
#include<fstream>
#include<iostream>
#include<map>
#include<string>
#include<vector>
using namespace std;

// A word paired with the number of times it occurs in the text.
typedef std::pair<std::string, int> PAIR;

// Comparator that orders (word, count) pairs by count, largest first.
// Pairs with equal counts compare as equivalent, so their relative order
// after an unstable sort is unspecified.
struct CmpByValue
{
	// Returns true when lhs has a strictly larger count than rhs.
	// The call operator is const so the comparator can also be invoked
	// through a const object or a const reference.
	bool operator()(const PAIR& lhs, const PAIR& rhs) const
	{
		return lhs.second > rhs.second;
	}
};

int main()
{
	// 记录main开始的时间
	clock_t tMain = clock();

//Part1  生成纯净的文件副本

	// 读文件
	ifstream fin ("Hamlet-25.txt");

	// 准备一个文件副本 Copy.txt
	ofstream fout ("Copy.txt");

	// 如果打开文件失败报告错误
	if (!fin.is_open() || !fout.is_open())
	{
		cout << "Fail to open the file！" << endl;
		return 1;
	}

	cout << "Input text file name: RTK in Wiki.txt" << endl;

	// 生成纯净的文件副本
        string fetchW;
	string newfetchW;

	while ( fin.peek() != EOF )
	{
		fin >> fetchW;

		//全部化为小写
		string::iterator j;

		for (j = fetchW.begin(); j != fetchW.end(); j++)
		{
			*j = tolower(*j);
		}
		
		// 在捕获的字符里来个循环，逐个查找，用空格替换不是字母的字符

		for (unsigned int i = 0; i < fetchW.length(); i++)
		{
			string str = ("abcdefghijklmnopqrstuvwxyz");

			int nPos = str.find(fetchW[i]);

			if ( nPos==string::npos )
			{
				newfetchW += ' ';
			}
			else
				newfetchW += fetchW[i];
		}

		fout << newfetchW << ' ';

		// 清空字符
		fetchW.clear();
		newfetchW.clear();

		}

	// 关闭文件和副本
        fin.close();
	fout.close();


//Part2  计算总词量和词频

	// 读副本，开始时间存入tRead
	
	fin.open("Copy.txt");
	clock_t tRead = clock();

	// 如果打开副本失败，报告错误
	if (!fin.is_open())
	{
		cout << "Fail to open the file！" << endl;
		return 1;
	}

	int nSum = 0;

	string fWord;

	map<string, int> map_fWord;

	map<string, int>::iterator it;

	// while语句将捕获的每个单词fWord插入map_fWord，并对相关计数器加1	
	while (fin >> fWord)
	{
		if (fWord.length() != 0)
		{
			map_fWord[fWord]++;
			nSum++;
		}
	}
	
	cout << "Total number of words:" << nSum << endl;

	// 关闭文档
	fin.close();

	// 读取结束时间存入tRead2
	clock_t tRead2 = clock();

	cout << "--------Word Frequency--------" << endl;

	// 将map_fWord中的值转存到vce_fWord中去
	vector<PAIR> vec_fWord(map_fWord.begin(), map_fWord.end());

	// 根据CmpByValue对vec_fetchW进行排序,开始时间存入tSort

	clock_t tSort = clock();
	sort(vec_fWord.begin(), vec_fWord.end(), CmpByValue());
	
	// 输出数量最多的前20个单词
	cout << "Top 20 most frequent words are" << endl;

	for (int i = 0; i < 20; i++)
	{
		cout << vec_fWord[i].first<< "(" << vec_fWord[i].second << ")" << endl;

	}
	
	// 结束时间存入tSort2
	clock_t tSort2 = clock();

	cout << "-------------------------------" << endl;
	
//Part3  显示运行时间

	cout << "Time to store words : " << tRead2-tRead << "(sec.)" << endl;

	cout << "Time for sorting : " << tSort2-tSort << "(sec.)" << endl;

	//记录main结束的时间
	clock_t tMain2 = clock();

	cout << "Total elapsed time : " << tMain2-tMain << "(sec.)" << endl;
	

	system("pause");

	return 0;
}

思路大概就是：

生成纯净的文件副本：因为RTK in Wiki.txt 和 Hamlet-25.txt 两个文件里有汉字，符号等干扰因素，无法准确统计出单词个数。（这两个文本文档我就不上传了哈）
计算总词量和词频：利用 std::map 容器统计每个单词出现的次数，再把结果转存到 std::vector 中，用 std::sort 按次数从大到小排序，用到头文件#include<map>，#include<algorithm> 和 #include<vector>。
显示运行时间：利用clock()函数，用到头文件#include<time.h>

美中不足嘛，就是有些慢，可能是生成纯净版文件副本的代价......

下面奉上我的运行结果~

哪位大佬能告诉我，这博客底下的空行要怎么删掉hahahahaha~

蓝飒飒

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
统计文件词频和词量并显示消耗时间_0004_build200402

C++文件处理高级版作业分别用RTKinWiki.txt和Hamlet-25.txt两个文件作为输入①统计其中每个单词的数量②然后把数量最多的前20个单词显示出来③然后分别显示读取和排序所消耗的时间参考运行结果如下图所示#include<iostream>#include<fstream>#include<string>#in...
复制链接

扫一扫