7-4 词频统计 (30分) 状态机统计单词个数

最新推荐文章于 2023-05-25 00:12:23 发布

马角的逆袭

最新推荐文章于 2023-05-25 00:12:23 发布

阅读量1.6k

点赞数 1

文章标签：状态机统计单词个数

本文链接：https://blog.csdn.net/qq_31036127/article/details/107357069

版权

请编写程序，对一段英文文本，统计其中所有不同单词的个数，以及词频最大的前10%的单词。

所谓“单词”，是指由不超过80个单词字符组成的连续字符串，但长度超过15的单词将只截取保留前15个单词字符。而合法的“单词字符”为大小写字母、数字和下划线，其它字符均认为是单词分隔符。
输入格式:

输入给出一段非空文本，最后以符号#结尾。输入保证存在至少10个不同的单词。
输出格式:

在第一行中输出文本中所有不同单词的个数。注意“单词”不区分英文大小写，例如“PAT”和“pat”被认为是同一个单词。

随后按照词频递减的顺序，按照词频:单词的格式输出词频最大的前10%的单词。若有并列，则按递增字典序输出。
输入样例：

This is a test.

The word “this” is the word with the highest frequency.

Longlonglonglongword should be cut off, so is considered as the same as longlonglonglonee. But this_8 is different than this, and this, and this…#
this line should be ignored.

输出样例：（注意：虽然单词the也出现了4次，但因为我们只要输出前10%（即23个单词中的前2个）单词，而按照字母序，the排第3位，所以不输出。）

23
5:this
4:is

感谢武汉理工大学的郭小兵老师修正测试数据！

这题很明显可以用状态机来解决。
状态机可以处理不少（但并非所有）字符串问题。
这里有两种状态

在单词内
在单词外
扫描整个文本的过程中，两种状态来回切换，在切换状态时做相应的动作(单词计数)即可

// Local debugging switch. While `debug` is defined, the two headers below are
// included and main() redirects stdin to a file named "test" and prints the
// elapsed run time after the answer.
// NOTE(review): the second header is an absolute path on the author's machine,
// so this define presumably has to be removed before building elsewhere or
// submitting to a judge — confirm.
#define debug
#ifdef debug
#include <time.h>
#include "/home/majiao/mb.h"
#endif

#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <iostream>
#include <map>
#include <queue>
#include <set>
#include <stack>
#include <string>
#include <utility>
#include <vector>

// Size bound (1e5 + 7, converted to int); not referenced in the visible code.
#define MAXN ((int)1e5+7)
// Shorthand for the `long long` type.
#define ll long long 
// Large int sentinel whose four bytes are all 0x7f; not referenced in the visible code.
#define INF (0x7f7f7f7f)
// Loop shorthands: run i / j / k over the inclusive range [lef, rig].
// The arguments are pasted unparenthesised and `rig` is re-evaluated every iteration.
#define fori(lef, rig) for(int i=lef; i<=rig; i++)
#define forj(lef, rig) for(int j=lef; j<=rig; j++)
#define fork(lef, rig) for(int k=lef; k<=rig; k++)
// Expands to 0; not referenced in the visible code.
#define QAQ (0)

using namespace std;

// Debug print helper: writes the argument expressions exactly as spelled at the
// call site, then " -> ", then their values separated by spaces. The output is
// wrapped in ANSI colour escapes (red/bold on, then reset by err()).
// Written with the standard __VA_ARGS__ form; the previous `x...` spelling is a
// GNU-only extension.
#define show(...)                                                  \
    do {                                                           \
        std::cout << "\033[31;1m " << #__VA_ARGS__ << " -> ";      \
        err(__VA_ARGS__);                                          \
    } while (0)

// End of the argument list: reset the colour and finish the line.
// std::endl is kept deliberately so debug output is flushed immediately.
void err() { std::cout << "\033[39;0m" << std::endl; }

// Prints the first value followed by a space, then recurses on the rest.
// Arguments are taken by const reference so nothing is copied just to be printed.
template<typename T, typename... A>
void err(const T& a, const A&... x) { std::cout << a << ' '; err(x...); }

// Minimal getchar/putchar based integer I/O helpers.
//   read(a, b, ...)  - parses one decimal integer from stdin into each argument.
//   print(a, b, ...) - writes each integer followed by a space, then a newline.
namespace FastIO {

	// Digit scratch buffer used by print(); digits are stored least-significant first.
	char print_f[105];
	// Recursion terminator for read(): nothing left to parse.
	void read() { }
	// Recursion terminator for print(): finish the line.
	void print() { putchar('\n'); }

	// Reads the next integer from stdin into x, then continues with the
	// remaining arguments. Leading non-digit characters are skipped; a '-'
	// counts as a sign only when it is the character immediately before the
	// first digit. If input ends before any digit is found, x is left as 0.
	template <typename T, typename... T2>
		inline void read(T &x, T2 &... oth) {
			x = 0;
			// getchar() returns int; a char cannot hold EOF distinctly, and
			// without the EOF test this loop would never terminate at end of input.
			int ch = getchar();
			bool negative = false;
			while (ch != EOF && !isdigit(ch)) {
				negative = (ch == '-');
				ch = getchar();
			}
			while (ch != EOF && isdigit(ch)) {
				x = x * 10 + (ch - '0');
				ch = getchar();
			}
			if (negative) x = -x;
			read(oth...);
		}

	// Writes x in decimal followed by a single space, then continues with the
	// remaining arguments (the terminating overload emits the newline).
	template <typename T, typename... T2>
		inline void print(T x, T2... oth) {
			const bool negative = x < 0;
			int top = -1;
			do {
				// Take the digit from the (possibly negative) remainder rather
				// than negating x first: -x overflows for the minimum value of
				// a signed type.
				const int digit = static_cast<int>(x % 10);
				print_f[++top] = static_cast<char>('0' + (digit < 0 ? -digit : digit));
				x /= 10;
			} while (x != 0);
			if (negative) putchar('-');
			while (top >= 0) putchar(print_f[top--]);
			putchar(' ');
			print(oth...);
		}
} // namespace FastIO
// Make the FastIO helpers callable without the namespace prefix.
using FastIO::print;
using FastIO::read;

// File-scope scratch variables. In the visible code only m is used: main()
// stores the number of words to print in it.
int n, m, Q, K;

// Scanner state: currently inside a word.
#define INWORD (1)

// Scanner state: currently outside any word.
#define OUTWORD (0)

// Maps an upper-case ASCII letter to its lower-case form; any other value is
// returned unchanged. The argument is parenthesised so expression arguments
// expand correctly, but it is still evaluated more than once - do not pass
// expressions with side effects.
#define CH(ch) (((ch)>='A'&&(ch)<='Z') ? 'a'+(ch)-'A' : (ch))

// True when ch is a "word character" as defined by the problem statement:
// a digit, an underscore, or an ASCII letter of either case. Same
// multiple-evaluation caveat as CH.
#define ISWORDCHAR(ch) (((ch)>='0'&&(ch)<='9') || ((ch)=='_') || ((ch)>='A'&&(ch)<='Z') || ((ch)>='a'&&(ch)<='z'))

// Ranking order for (frequency, word) pairs: higher frequency first; equal
// frequencies are ordered by ascending word. Returns true when x must come
// before y.
// The parameters are const references: a comparator handed to std::sort must
// not modify its arguments and has to be callable on const objects.
bool cmp(const std::pair<int, std::string>& x, const std::pair<int, std::string>& y) {
	if(x.first != y.first)
		return x.first > y.first;
	return x.second < y.second;
}

// Reads text from stdin up to the first '#' (or end of input), counts words
// case-insensitively, then prints the number of distinct words followed by the
// top 10% (rounded down) most frequent ones, one per line, as "count:word".
// Words are maximal runs of word characters (see ISWORDCHAR); only the first
// 15 characters of a word are kept.
int main() {
#ifdef debug
	freopen("test", "r", stdin);
	// freopen("out_main", "w", stdout);
	clock_t stime = clock();
#endif
	// getchar() returns int. Holding it in an int is what allows EOF to be
	// distinguished from real characters; with a char the loop below would
	// never end on input that lacks the '#' terminator.
	int ch;
	string word;
	int status = OUTWORD;
	map<string, int> mp;
	/*
	 * State machine; the initial state (nothing read yet) is "outside a word".
	 *    Outside a word {
	 *       on a word character: append it and switch to "inside a word"
	 *    }
	 *
	 *    Inside a word {
	 *       on a non-word character {
	 *          switch to "outside a word", count the current word,
	 *          and clear the word buffer
	 *       }
	 *       otherwise {
	 *          append the character (up to the 15-character limit)
	 *       }
	 *    }
	 */

	while((ch = getchar()) != EOF && ch != '#') {
		if(status == OUTWORD) {
			if(ISWORDCHAR(ch)) {
				word.push_back(static_cast<char>(CH(ch)));
				status = INWORD;
			}
		} else {
			if(ISWORDCHAR(ch)) {
				// Only the first 15 characters are significant; drop the rest.
				if(word.length() < 15)
					word.push_back(static_cast<char>(CH(ch)));
			} else {
				status = OUTWORD;
				mp[word] ++;
				word.clear();
			}
		}
	}
	// The terminator (or end of input) may directly follow a word, e.g.
	// "...end#". No separator was seen for that word, so count it here.
	if(status == INWORD)
		mp[word] ++;

	vector<pair<int,string> > vec;
	vec.reserve(mp.size());
	for(const auto& entry : mp)         // bind by reference: no per-entry string copy
		vec.push_back({entry.second, entry.first});
	sort(vec.begin(), vec.end(), cmp);  // frequency descending, then word ascending
	printf("%d\n", (int)vec.size());
	m = int(vec.size()) / 10;           // top 10%, rounded down
	for(int i=0; i<m; i++)
		printf("%d:%s\n", vec[i].first, vec[i].second.data());

#ifdef debug
	clock_t etime = clock();
	printf("rum time: %lf 秒\n",(double) (etime-stime)/CLOCKS_PER_SEC);
#endif 
	return 0;
}