【C语言】数据结构与算法-统计《白鲸记》（英文版）词频

九六四一居士

已于 2024-03-24 21:44:13 修改

阅读量421

点赞数

文章标签： c语言 哈希算法 开发语言 数据结构

于 2022-10-29 09:24:43 首次发布

本文链接：https://blog.csdn.net/m0_50580625/article/details/127582985

版权

我们在大二学习数据结构时，课程大作业是对英文版的《白鲸记》进行词频统计，要求运行时间在30秒以内。我记得我的程序统计其中最大的 Bulk 那章文档时，最快大约18秒出结果，性能不算好，欢迎大家批评指正，共同进步。

采用算法：

采用哈希表的方式对单词进行存储

然后对表中的单词按出现次数从大到小排序输出：每一轮扫描整张哈希表，选出当前出现次数最多的单词并输出（做法类似选择排序，而不是严格意义上的冒泡排序），重复100轮即可得到出现次数最多的100个单词

统计的是《白鲸记》中 big.txt 这一章的内容，统计时排除标点符号（撇号 ' 除外，程序把它当作单词的一部分）

输出结果：

Output for text file from 

http://norvig.com/big.txt

Total number of words = 1097929
Number of different words = 30326
The 100 most common words:
WORD            NUMBER OF OCCURRENCES
THE             80003
OF              40025
AND             38290
TO              28760
IN              22048
A               21142
HE              12249
THAT            12195
WAS             11410
IT              10251
HIS             10033
IS              9764
WITH            9740
AS              8060
HAD             7383
I               7217
FOR             6936
AT              6789
BY              6736
ON              6639
NOT             6623
BE              6154
FROM            5700
BUT             5640
YOU             5359
OR              5349
HER             5284
HIM             5230
WHICH           4840
WERE            4288
ALL             4141
THIS            4058
SHE             3875
THEY            3855
ARE             3627
HAVE            3491
SAID            3464
AN              3422
ONE             3302
WHO             3031
SO              3014
THEIR           2955
WHAT            2921
WHEN            2921
THERE           2863
BEEN            2599
MAY             2549
IF              2368
NO              2332
UP              2284
MY              2240
THEM            2240
INTO            2124
MORE            1996
OUT             1987
WOULD           1952
ME              1920
PRINCE          1897
DID             1873
ONLY            1872
PIERRE          1797
WE              1781
COULD           1700
NOW             1696
ITS             1635
HAS             1603
WILL            1576
THEN            1552
SOME            1534
TIME            1529
MAN             1524
AFTER           1504
DO              1500
ABOUT           1496
OTHER           1488
SUCH            1435
BEFORE          1363
VERY            1335
HOW             1301
SHOULD          1297
OVER            1282
YOUR            1276
THESE           1231
NEW             1211
THAN            1206
ANY             1204
THOSE           1201
WELL            1188
OLD             1180
FIRST           1175
HIMSELF         1158
TWO             1138
DOWN            1128
FACE            1125
MEN             1118
UPON            1111
SEE             1101
NATASHA         1097
LIKE            1080
ANDREW          1074

代码如下：

#include<stdio.h>
#include<ctype.h>
#include<string.h>
#define HASHTABLE_SIZE 55000  //define HASHTABLE_SIZE 55000 
#include<stdlib.h>
#include <stdio.h>
#include <stdlib.h>
/*#include<fsrm.h>*/


/* One hash-table entry: a stored word together with its occurrence count. */
typedef struct word {
    char Word[50];   /* NUL-terminated text of the word */
    int  num;        /* how many times the word has been counted */
} word;
/* Hash table slots: each one points to a heap-allocated word entry, or is NULL while the slot is empty. */
word *hashtable[HASHTABLE_SIZE];

/*
 * Map an integer key onto a slot index of the hash table (division method).
 *
 * num: integer key; the caller in this file passes the sum of a word's
 *      character codes.
 * Returns an index in the range [0, HASHTABLE_SIZE).
 *
 * In C the % operator yields a negative remainder for a negative left
 * operand, which would index before the start of the table, so a negative
 * remainder is folded back into range.
 */
int hash(int num){
    int slot = num % HASHTABLE_SIZE;

    if (slot < 0) {
        slot += HASHTABLE_SIZE;
    }
    return slot;
}

/*unsigned int BKDRHash(char *str)
{
    unsigned int seed = 131; // 31 131 1313 13131 131313 etc..
    unsigned int hash = 0;
 
    while (*str)
    {
        hash = hash * seed + (*str++);
    }
 
    return (hash & 0x7FFFFFFF);
}*/ 
 

/* Global counters, updated by inserthash() and reported by print(). */
int Different = 0;   /* number of distinct words stored in the table */
int Total = 0;       /* number of words passed to inserthash() */

void inserthash(int count,char *w)  //Insert Date 
{
	int s=hash(count);
		
	word *newWord;
  Total++;//The total number is set before judgment
	
	while (hashtable[s])
	{
        if(!strcmp(hashtable[s]->Word,w))
		{hashtable[s]->num++;
		return;} 
		else
		s++;
	}
	newWord=(word*)malloc(sizeof(word));
	newWord->num = 1;
	strcpy(newWord->Word,w);  //Save w characters in structure Word
	hashtable[s] = newWord;
   Different++;
}


/*
 * Read "big.txt" from the current directory, count every word into the
 * global hash table via inserthash(), then print the most frequent words.
 *
 * A word is a maximal run of letters and apostrophes; letters are converted
 * to upper case before counting. The key passed to inserthash() is the sum of
 * the word's (upper-cased) character codes. Words longer than the Word field
 * of a table entry are cut off at that length.
 *
 * Up to 100 words are printed, most frequent first; entries with equal counts
 * appear in table-slot order. Side effect: the num field of every printed
 * entry is reset to 0, which is how an entry is excluded from later passes.
 *
 * Terminates the program with EXIT_FAILURE if the file cannot be opened.
 */
void readfile()
{
    enum { WORD_CAP = sizeof(((word *)0)->Word), TOP_N = 100 };
    char temp[WORD_CAP];    /* word currently being assembled */
    int len = 0;            /* number of characters in temp */
    int count = 0;          /* running sum of the characters in temp */
    int c;                  /* int, not char, so EOF is distinguishable */
    int i, j;
    FILE *fp;

    /* Clear the whole pointer array, not just HASHTABLE_SIZE bytes of it. */
    memset(hashtable, 0, sizeof hashtable);

    fp = fopen("big.txt", "r");
    if (fp == NULL) {
        fprintf(stderr, "Failed to open the file!\n");
        exit(EXIT_FAILURE);
    }

    while ((c = fgetc(fp)) != EOF) {
        if (isalpha(c) || c == '\'') {
            /* Characters beyond the buffer capacity are dropped. */
            if (len < WORD_CAP - 1) {
                c = toupper(c);
                temp[len++] = (char)c;
                count += c;
            }
        } else if (len > 0) {
            /* A separator ends the current word. */
            temp[len] = '\0';
            inserthash(count, temp);
            len = 0;
            count = 0;
        }
    }
    if (len > 0) {
        /* The file ended in the middle of a word: flush it. */
        temp[len] = '\0';
        inserthash(count, temp);
    }
    fclose(fp);

    printf("The 100 most common words:\n");
    printf("            WORD            NUMBER OF OCCURRENCES\n");
    for (i = 0; i < TOP_N; i++) {
        int best = -1;      /* slot of the largest remaining count, -1 if none */
        int best_num = 0;

        for (j = 0; j < HASHTABLE_SIZE; j++) {
            if (hashtable[j] != NULL && hashtable[j]->num > best_num) {
                best_num = hashtable[j]->num;
                best = j;
            }
        }
        if (best < 0) {
            break;          /* fewer than TOP_N distinct words in the table */
        }
        printf("%15s %15d\n", hashtable[best]->Word, hashtable[best]->num);
        hashtable[best]->num = 0;   /* exclude this entry from later passes */
    }
}
/*
 * Print the two global counters: the total number of words processed and the
 * number of distinct words stored.
 */
void  print()
{
    printf("Number of total_words = %d\n", Total);
    printf("Number of different words = %d\n", Different);
}

 /* Program entry point: count the words and list the most frequent ones, then print the summary counters. */
 int main()
{
    /* Reads the text, fills the hash table and prints the most common words. */
    readfile();

    /* Reports the total word count and the number of distinct words. */
    print();

    return 0;
}

欢迎读者们向我提出问题，一起交流一起进步：）