C语言统计单词出现的频次并排序输出

1 ) 读入与 Linux 相关的英文文章(每个同学自己从网上下载,
文件名称作为参数,统一转换为文本文件),文章不少于 2000
个单词,保留文章的段落结构,可以去掉空行,但读入前不得全部
转化为一行;
2 ) 统计不同单词在文章中出现的频次,不统计虚词(例如 a、the、
this、of 等);
3 ) 根据 参数 N 按照从高到低顺序格式化输出前 N 个单词出现的
频次及比例,默认前 50 个单词。
源代码:
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <list.h>

/* Size in bytes of the buffers that hold one word (Wnode.word and the
 * scratch buffer in Extractwords), terminating NUL included. */
#define LENGTHMAX 50
/* Size in bytes of the buffer main() reserves for the whole article text. */
#define MAX 2000000

/*
 * One entry of the word list. Extractwords appends one entry per extracted
 * word occurrence; duplicates are later marked and removed.
 */
typedef struct Wnode{
	char word[LENGTHMAX];	/* NUL-terminated word text */
	int count;		/* occurrence count; set to 1 on creation, overwritten in main */
	char st;		/* 'N' = still to be kept, 'Y' = duplicate marked by CountWordnum, removed by delword */
	struct list_head list;	/* linkage into the word list */
}Wnode;

/* Number of Wnode entries currently linked in the word list: incremented by
 * Extractwords for every node it appends, decremented by delword for every
 * node it removes. */
int wtotal = 0;

#ifndef MAX
#define MAX 2000000	/* fallback only; normally defined near the top of this file */
#endif

/*
 * Load "linux.txt" into `array` and normalize it for word extraction.
 *
 * Normalization is done while reading, one character at a time:
 *   - '-' is dropped, so hyphenated words are joined ("well-known" -> "wellknown");
 *   - 'A'..'Z' are folded to lower case;
 *   - 'a'..'z' are kept;
 *   - every other character (punctuation, digits, newlines, apostrophes, ...)
 *     becomes a single space.
 *
 * `array` must have room for MAX bytes; at most MAX - 1 characters are stored
 * and the result is always NUL-terminated. Input beyond that is ignored.
 *
 * Returns `array` on success. If the file cannot be opened, an error is
 * reported on stderr, `array` is left as the empty string and NULL is
 * returned.
 */
char *Init_array(char array[])
{
	FILE *fp;
	int ch;			/* int, so EOF stays distinct from every character value */
	size_t length = 0;

	array[0] = '\0';

	fp = fopen("linux.txt", "r");
	if (fp == NULL) {
		perror("linux.txt");
		return NULL;
	}

	while (length < (size_t)MAX - 1 && (ch = fgetc(fp)) != EOF) {
		if (ch == '-')
			continue;	/* join the two halves of a hyphenated word */

		if (ch >= 'A' && ch <= 'Z')
			ch += 'a' - 'A';
		else if (ch < 'a' || ch > 'z')
			ch = ' ';	/* anything that is not a letter separates words */

		array[length++] = (char)ch;
	}
	array[length] = '\0';

	fclose(fp);
	return array;
}

/*
 * Tell whether `str` is one of the function words (articles, prepositions,
 * conjunctions, ...) that are excluded from the statistics.
 *
 * The comparison is exact and case-sensitive; the table holds lower-case
 * words only.
 *
 * Returns 1 when `str` is in the table, 0 otherwise.
 */
int isFunctionwords(char *str)
{
	static const char *const function_words[] = {
		"of", "to", "in", "and", "as", "from", "for", "with", "that", "have",
		"by", "on", "upon", "about", "above", "across", "among", "ahead", "after", "a",
		"an", "although", "at", "also", "along", "around", "always", "away", "any", "up",
		"under", "until", "before", "between", "beyond", "behind", "because", "what", "when", "would",
		"could", "who", "whom", "whose", "which", "where", "why", "without", "whether", "down",
		"during", "despite", "over", "off", "only", "other", "out", "than", "the", "then",
		"through", "throughout", "that", "these", "this", "those", "there", "therefore", "till", "some",
		"such", "since", "so", "can", "many", "much", "more", "may", "might", "must",
		"ever", "even", "every", "each"
	};
	const size_t total = sizeof function_words / sizeof function_words[0];
	size_t idx = 0;

	while (idx < total) {
		if (strcmp(str, function_words[idx]) == 0)
			return 1;
		idx++;
	}
	return 0;
}


/*
 * Strip single-quote characters from both ends of `word`, in place.
 *
 * All leading and all trailing apostrophes are removed; apostrophes inside
 * the word ("don't") are left alone. Empty strings and strings consisting
 * only of apostrophes are handled and yield the empty string.
 *
 * Returns `word`.
 */
char *delmarks(char word[])
{
	size_t length = strlen(word);
	size_t lead = 0;

	/* Trailing quotes first, so `length` stays the length of what is left. */
	while (length > 0 && word[length - 1] == '\'')
		word[--length] = '\0';

	while (word[lead] == '\'')
		lead++;

	/* Source and destination overlap, hence memmove; +1 carries the NUL along. */
	if (lead > 0)
		memmove(word, word + lead, length - lead + 1);

	return word;
}



/*
 * Split the normalized text in `array` into space-separated words and append
 * one Wnode per word to the list `head`.
 *
 * For every token:
 *   - it is copied into a LENGTHMAX-byte buffer; characters that do not fit
 *     are consumed but dropped, so an over-long token is truncated;
 *   - quotes at either end are stripped with delmarks();
 *   - it is skipped if nothing is left or if it is a function word;
 *   - otherwise a node with count 1 and state 'N' is appended and the global
 *     wtotal is incremented.
 *
 * If a node cannot be allocated, the error is reported on stderr and
 * extraction stops with the words collected so far. The number of collected
 * words is printed at the end. The nodes are heap-allocated; whoever unlinks
 * them is responsible for freeing them.
 */
void Extractwords(char array[],struct list_head *head)
{
	char word[LENGTHMAX];
	const char *cursor = array;

	while (*cursor != '\0') {
		size_t num = 0;

		if (*cursor == ' ') {
			cursor++;
			continue;
		}

		/* Consume the whole token but store only what fits in word[]. */
		while (*cursor != '\0' && *cursor != ' ') {
			if (num < sizeof word - 1)
				word[num++] = *cursor;
			cursor++;
		}
		word[num] = '\0';
		/* cursor now rests on the separator or on the terminating NUL; it
		 * is not advanced again here, so the scan never steps past the NUL. */

		/* Strip quotes before the function-word test so that a quoted
		 * function word is filtered as well. delmarks works in place. */
		if (word[0] == '\'' || word[num - 1] == '\'')
			delmarks(word);

		if (word[0] == '\0' || isFunctionwords(word))
			continue;

		Wnode *my_word_node = malloc(sizeof *my_word_node);
		if (my_word_node == NULL) {
			perror("malloc");
			break;
		}
		my_word_node->count = 1;
		my_word_node->st = 'N';
		strcpy(my_word_node->word, word);	/* fits: both buffers are LENGTHMAX bytes */
		list_add_tail(&(my_word_node->list), head);
		wtotal++;
	}

	printf("           提取所有单词后的单词数(不包括虚词): %d\n",wtotal);
	printf("*****************************************************\n");
}




/*
 * Count the entries reachable from `head` whose state is still 'N' and whose
 * text equals `word`, marking each of them 'Y' as it is counted.
 *
 * The walk is done with list_for_each(), so the entry that embeds `head`
 * itself is presumably not examined (usual list.h semantics - confirm against
 * the list.h in use). Entries already marked 'Y' are ignored.
 *
 * Returns the number of entries that were newly marked.
 */
int CountWordnum(struct list_head *head,char word[])
{
	struct list_head *cursor;
	int matches = 0;

	list_for_each(cursor, head) {
		Wnode *node = list_entry(cursor, Wnode, list);

		if (node->st != 'N')
			continue;	/* already accounted for */
		if (strcmp(node->word, word) != 0)
			continue;

		node->st = 'Y';
		matches++;
	}

	return matches;
}



void delword(struct list_head *head)  //删除状态为Y的结点
{
	Wnode *tmpword;
	struct list_head *pos;
	list_for_each(pos,head){
	   	   tmpword = list_entry(pos,Wnode,list);
	   	   	    if(tmpword->st=='Y'){
	   	   	    	list_del(pos);
	   	   	    	wtotal--;
					  }

			  } 
}


void sort(struct list_head *head)  //对单词出现的次数进行排序
{
	 int max,tmpcount;
	 char word[50];
	 Wnode *tmpword,*tmp,*lword,*kword;
	 struct list_head *p,*q,*l,*k;
	 for(p=head->next;p!=head;p=p->next){
	 	tmpword=list_entry(p,Wnode,list);
	 	max=tmpword->count;
	 	    for(q=p->next;q!=head;q=q->next){
	 	    	tmp=list_entry(q,Wnode,list);
	 	    	if(max<tmp->count){
	 	    		max = tmp->count;
	 	    		l = q;
				 }
			 }
			if(max!=tmpword->count){//当前的max不是最大值,最大值要进行交换 
				kword = list_entry(p,Wnode,list);
				lword = list_entry(l,Wnode,list);
				strcpy(word,kword->word);
				tmpcount = kword->count;
				strcpy(kword->word,lword->word);
				kword->count = lword->count;
				strcpy(lword->word,word);
				lword->count = tmpcount;
			}
	 }
	   
} 


/*
 * Ask for N on standard input and print the first N entries of the list
 * `head` with their count and their share of all counted occurrences.
 *
 * N defaults to 50: the default is used when the input line is empty, is not
 * a number, is not positive, does not fit in an int, or cannot be read.
 *
 * The share is computed against the sum of the counts of all entries in the
 * list rather than against the global wtotal, because delword() decrements
 * wtotal for every removed duplicate, which leaves it holding the number of
 * distinct words instead of the number of occurrences.
 */
void showword(struct list_head *head)
{
	Wnode *tmpword;
	struct list_head *p;
	char line[32];
	long total = 0;
	int i = 0, n = 50;

	printf("请选择前N个单词的出现次数,默认前50个\n");
	if (fgets(line, sizeof line, stdin) != NULL) {
		char *end;
		long requested = strtol(line, &end, 10);

		if (end != line && requested > 0 && requested <= INT_MAX)
			n = (int)requested;
	}

	/* Denominator for the ratio: every occurrence represented in the list. */
	for (p = head->next; p != head; p = p->next) {
		tmpword = list_entry(p, Wnode, list);
		total += tmpword->count;
	}

	printf("\n前%d个单词出现的次数及频率->  \n",n);
	printf("\n<---------------------------------------------->\n");
	printf("\n单词                  次数               频率     ");
	printf("\n<---------------------------------------------->\n");
	for (p = head->next; p != head && i < n; p = p->next, i++) {
		double ratio;

		tmpword = list_entry(p, Wnode, list);
		ratio = (total > 0) ? (double)tmpword->count / (double)total : 0.0;
		printf("%-15s        %-5d              %-6.2f",tmpword->word,tmpword->count,ratio);
		printf("\n<---------------------------------------------->\n");
	}
}





int main()
{   
	  
	  Wnode wordlisthead;
	  INIT_LIST_HEAD(&wordlisthead.list);
	  Wnode *tmp;
	  struct list_head *pos;
	  
	  char array[MAX];
	  int count;
	  Init_array(array); //提取文件中的所有字符 
	  	     
	   printf("                                   文章全文  \n%s\n",array); 
	   printf("*****************************************************\n");
	   
	  
	  //提取单词
	 Extractwords(array,&wordlisthead.list);
	   
	   //统计单词出现次数
	  list_for_each(pos,&wordlisthead.list){
	  	    count = 0;
	  	    tmp = list_entry(pos,Wnode,list);
	  	    if(tmp->st=='N'){
	  	    	    count = CountWordnum(pos,tmp->word);
	  	           	tmp->count = count;
			     
			  }
	  	    
	  } 
	  
	     delword(&wordlisthead.list);  //删除结点状态为Y的结点
		 sort(&wordlisthead.list);
		 showword(&wordlisthead.list); 
}

  • 2
    点赞
  • 53
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值