浙江大学陈越教授数据结构PTA 题目——7-1 词频统计

最新推荐文章于 2024-06-16 22:24:07 发布

小吴同学·

最新推荐文章于 2024-06-16 22:24:07 发布

阅读量4.4k

点赞数 4

分类专栏：浙江大学陈越教授数据结构PTA 题目　文章标签：开发语言、后端、数据结构

本文链接：https://blog.csdn.net/weixin_54966200/article/details/121669559

版权

浙江大学陈越教授数据结构PTA 题目专栏收录该内容

15 篇文章 14 订阅

订阅专栏

这段代码实现了一个英文单词词频统计程序，通过哈希表（分离链接法）存储单词并统计其出现次数。程序逐行读取英文文本（遇到“#”时结束输入），统计每个单词的出现频率，并展示词频前10%的单词。主函数依次完成创建散列表、读取单词、插入并计数、显示词频以及销毁散列表。

摘要由CSDN通过智能技术生成

？？？？

#include <stdio.h>
#include <malloc.h>
#include <math.h>
#include <stdbool.h>
#include <string.h>

#define KEYLENGTH 15// words longer than 15 characters keep only their first 15 word characters
#define MAXTABLESIZE 111111// largest hash table length that may be allocated
#define MAXWORDLEN 80 // maximum length of a word in the input
typedef char ElementType[KEYLENGTH+1];// list payload: a string of up to KEYLENGTH characters plus the terminator
typedef int Index;// type of a hash address (bucket index)

typedef struct LNode *PtrToLNode;// pointer to one node of a singly linked list
/* One node of a bucket's chain; the same layout is used for the dummy head nodes. */
struct LNode{
	ElementType Data;// the word stored in this node
	int Count;// occurrences of this word; in a dummy head node, the number of nodes in that chain
	PtrToLNode Next;// next node in the chain, NULL at the end
};
typedef PtrToLNode Position;
typedef PtrToLNode List;

typedef struct TblNode *HashTable;
/* Hash table descriptor: an array of TableSize dummy head nodes, one per bucket. */
struct TblNode{
	int TableSize;// number of buckets
	List Heads;// array of dummy head nodes, indexed by hash address
};

/*
 * Returns the smallest prime p with n < p < MAXTABLESIZE.
 *
 * Candidates below 2 are never considered, so every n < 2 yields 2.
 * If no prime exists in that range, MAXTABLESIZE itself is returned (it is
 * only a cap and need not be prime) so that the function always returns a
 * well-defined value.
 */
int NextPrime(int n)
{
	int i, p;

	/* Nothing left to search; also keeps n + 1 below from overflowing. */
	if (n >= MAXTABLESIZE - 1) return MAXTABLESIZE;

	for (p = (n < 1) ? 2 : n + 1; p < MAXTABLESIZE; p++) {
		/* Trial division: a composite p has a divisor i with i*i <= p. */
		for (i = 2; i <= p / i; i++) {
			if (p % i == 0) break;
		}
		if (i > p / i) return p; /* loop ran to completion: p is prime */
	}
	return MAXTABLESIZE;
}

HashTable CreateTable(int TableSize)
{
	HashTable H;
	int i;
	H = (HashTable)malloc(sizeof(struct TblNode));
	H->TableSize =NextPrime(TableSize);//保证散列表最大长度是素数
	H->Heads =(List)malloc(H->TableSize *sizeof(struct LNode));
	for(i=0; i<H->TableSize ;i++){//初始链表头结点 
		H->Heads[i].Data[0] = '\0';
		H->Heads[i].Next =NULL;
	} 
	return H;
}

/*
 * Tells whether c may appear inside a word: an ASCII letter, a decimal
 * digit or an underscore. Every other character is a separator.
 */
bool IsWordChar(char c)
{
	bool is_lower = (c >= 'a' && c <= 'z');
	bool is_upper = (c >= 'A' && c <= 'Z');
	bool is_digit = (c >= '0' && c <= '9');

	return is_lower || is_upper || is_digit || c == '_';
}

/*
 * zongchang[k] receives the number of characters consumed by the GetAWord
 * call that was given index k: leading separators, the word itself and the
 * single separator that ended it (if any).
 */
int zongchang[111]={0};

/*
 * Extracts the next word from shuru, starting at offset j.
 *
 * Leading separator characters are skipped; the word ends at the first
 * separator after it or at the end of the string. At most KEYLENGTH
 * characters are stored in word (always NUL-terminated); any further word
 * characters are consumed but dropped, so an arbitrarily long run of word
 * characters cannot overflow a buffer.
 *
 * shuru  NUL-terminated input line
 * word   receives the (possibly truncated) word; empty if none was found
 * j      offset in shuru at which scanning starts
 * k      slot of zongchang that receives the consumed character count;
 *        must be a valid index of zongchang (not checked here)
 *
 * Returns the length of the string stored in word (0 if no word was found).
 */
int GetAWord(char *shuru,ElementType word,int j,int k)
{
	int i, len = 0;
	char c;

	zongchang[k] = 0; /* start from zero so a slot may safely be reused */
	for (i = j; shuru[i]; i++) {
		c = shuru[i];
		zongchang[k]++; /* every examined character counts as consumed */
		if (IsWordChar(c)) {
			if (len < KEYLENGTH) word[len] = c; /* keep only the first KEYLENGTH */
			len++;
		} else if (len) {
			break; /* separator after a word: the word is complete */
		}
		/* otherwise: separator before any word character, keep skipping */
	}
	if (len > KEYLENGTH) len = KEYLENGTH;
	word[len] = '\0';
	return len;
}

/*
 * Maps a string key to a bucket index in [0, TableSize).
 *
 * The key is folded with a shift-by-5-and-add scheme. The accumulator is
 * unsigned so that long keys wrap around in a well-defined way; with a
 * signed accumulator the shift could overflow and produce a negative
 * remainder, i.e. an out-of-range bucket index.
 *
 * Key        NUL-terminated key string
 * TableSize  number of buckets; must be positive
 */
int Hash(const char *Key, int TableSize)
{
	unsigned int H = 0;

	while (*Key != '\0') H = (H << 5) + (unsigned char)*Key++;
	return (int)(H % (unsigned int)TableSize);
}

/*
 * Looks Key up in H.
 * Returns the chain node whose Data equals Key, or NULL when the key is not
 * stored in the table.
 */
Position Find(HashTable H, ElementType Key)
{
	Index slot = Hash(Key, H->TableSize); /* bucket the key belongs to */
	Position node;

	/* Walk the bucket's chain, skipping the dummy head node. */
	for (node = H->Heads[slot].Next; node != NULL; node = node->Next) {
		if (strcmp(node->Data, Key) == 0) {
			return node;
		}
	}
	return NULL;
}

/*
 * Records one occurrence of Key in H.
 *
 * If the key is already stored, its Count is incremented. Otherwise a new
 * node with Count 1 is pushed onto the front of the key's bucket chain and
 * the bucket's head-node Count (the chain length) is incremented.
 *
 * If memory for a new node cannot be allocated, a message is written to
 * stderr and the table is left unchanged.
 */
void InsertAndCount(HashTable H, ElementType Key)
{
	Position P, NewCell;
	Index Pos;

	P = Find(H, Key);
	if (P) { /* key already present: just count it */
		P->Count++;
		return;
	}

	NewCell = malloc(sizeof *NewCell);
	if (NewCell == NULL) {
		fprintf(stderr, "InsertAndCount: out of memory\n");
		return;
	}
	strcpy(NewCell->Data, Key);
	NewCell->Count = 1; /* first occurrence of this word */

	Pos = Hash(Key, H->TableSize);
	NewCell->Next = H->Heads[Pos].Next; /* insert as the first node of the chain */
	H->Heads[Pos].Next = NewCell;
	H->Heads[Pos].Count++; /* one more distinct word in this chain */
}

/*
 * Prints the statistics gathered in H.
 *
 * First line: the number of distinct words. Then, in order of decreasing
 * frequency, one line per selected word in the format "%-15s:%d".
 *
 * percent  fraction (e.g. 0.10) of the distinct words used to pick the
 *          lowest frequency that is still printed
 *
 * The distinct-word total is obtained by walking the chains, so it does not
 * depend on the head-node Count fields being maintained.
 */
void Show(HashTable H, double percent)
{
	int diffwordcount = 0; /* number of distinct words */
	int maxf = 0;          /* highest word frequency */
	int *diffwords;        /* diffwords[f] = number of words with frequency f */
	Position L;
	int i, j, k, lowerbound, count = 0;

	/* Pass 1: count the distinct words and find the highest frequency. */
	for (i = 0; i < H->TableSize; i++) {
		for (L = H->Heads[i].Next; L; L = L->Next) {
			diffwordcount++;
			if (maxf < L->Count) maxf = L->Count;
		}
	}
	printf("%d\n", diffwordcount);

	diffwords = malloc(((size_t)maxf + 1) * sizeof *diffwords);
	if (diffwords == NULL) {
		fprintf(stderr, "Show: out of memory\n");
		return;
	}
	for (i = 0; i <= maxf; i++) diffwords[i] = 0;

	/* Pass 2: histogram of frequencies 1..maxf. */
	for (i = 0; i < H->TableSize; i++) {
		for (L = H->Heads[i].Next; L; L = L->Next) {
			diffwords[L->Count]++;
		}
	}

	/* Walk down from the highest frequency until lowerbound words are covered. */
	lowerbound = (int)(diffwordcount * percent);
	for (i = maxf; i >= 1 && count < lowerbound; i--) count += diffwords[i];
	/*
	 * NOTE(review): when the loop above runs, it leaves i one below the last
	 * frequency it added, so the output loop below also prints the words of
	 * that next-lower frequency. Confirm that this is intended.
	 */

	/* Output: for each frequency from maxf down to i, print every word having it. */
	for (j = maxf; j >= i; j--) {
		for (k = 0; k < H->TableSize; k++) {
			for (L = H->Heads[k].Next; L; L = L->Next) {
				if (j == L->Count) printf("%-15s:%d\n", L->Data, L->Count);
			}
		}
	}
	free(diffwords);
}

/*
 * Releases everything owned by H: every chain node of every bucket, the
 * array of head nodes and finally the table descriptor itself.
 */
void DestroyTable(HashTable H)
{
	int slot = 0;

	while (slot < H->TableSize) {
		Position node = H->Heads[slot].Next;
		while (node != NULL) {
			Position next = node->Next; /* remember the successor before freeing */
			free(node);
			node = next;
		}
		slot++;
	}
	free(H->Heads);
	free(H);
}

char shuru[1111]; /* buffer for one line (or one chunk of a longer line) of input */

/*
 * Reads English text from stdin, counts every word in a hash table and
 * prints the statistics.
 *
 * Input ends at end-of-file or at the first '#'; text after the '#' is
 * ignored. Lines longer than the buffer are processed in chunks, so a word
 * that straddles a chunk boundary is split in two.
 */
int main()
{
	HashTable H;
	ElementType word;
	int TableSize = 100;
	int finished = 0; /* set once the '#' end marker has been seen */

	H = CreateTable(TableSize);
	if (!H) {
		fprintf(stderr, "main: cannot create hash table\n");
		return 1;
	}

	/* fgets bounds the read to the buffer size, unlike gets (removed in C11). */
	while (!finished && fgets(shuru, sizeof shuru, stdin)) {
		char *marker = strchr(shuru, '#');
		int linelen, i;

		if (marker) { /* cut the line at the end marker, wherever it appears */
			*marker = '\0';
			finished = 1;
		}

		linelen = (int)strlen(shuru);
		for (i = 0; i < linelen; ) {
			int length;

			/* Reuse slot 0 for every call so the index never grows past the array. */
			zongchang[0] = 0;
			length = GetAWord(shuru, word, i, 0); /* read one word */
			if (zongchang[0] <= 0) break; /* defensive: always make progress */
			i += zongchang[0]; /* continue after the consumed characters */

			if (length > 0) { /* trailing separators yield no word: skip */
				InsertAndCount(H, word); /* count this occurrence */
				/* NOTE(review): looks like a leftover debug echo; confirm before removing. */
				printf("%s\t", word);
			}
		}
	}

	Show(H, 10.0/100); /* show the words in the top 10% by frequency */
	DestroyTable(H);
	return 0;
}

小吴同学·

关注

4
点赞
踩
9

收藏

觉得还不错? 一键收藏
打赏
0
评论
浙江大学陈越教授数据结构PTA 题目——7-1 词频统计

？？？？#include <stdio.h>#include <malloc.h>#include <math.h>#include <stdbool.h>#include <string.h>#define KEYLENGTH 15//长度超过15的单词将只截取保留前15个单词字符 #define MAXTABLESIZE 111111//允许开辟的最大散列表长度 #define MAXWORDLEN 80 //单词输入的.
复制链接

扫一扫