利用树统计单词出现的频率

最新推荐文章于 2023-08-20 12:10:46 发布

roma823

最新推荐文章于 2023-08-20 12:10:46 发布

阅读量2.7k

点赞数 1

分类专栏： C 文章标签： struct null 数据结构 c

本文链接：https://blog.csdn.net/roma823/article/details/6669925

版权

C 专栏收录该内容

21 篇文章 0 订阅

订阅专栏

这篇博客介绍如何利用二叉树数据结构来统计《The C Programming Language》书中输入单词的出现次数。通过递归函数addtree构建单词树，并使用treeprint按字典顺序打印单词及其频率。此外，文章还详细解释了getword函数中关于lim的调整以及自定义getch和ungetch函数的必要性，以确保正确处理输入字符。

摘要由CSDN通过智能技术生成

源自《The C Programming Language》P122 ex6.5 代码位于ex6.5：

统计输入中所有单词的出现次数

代码：
#include <ctype.h>				/* isalpha, isspace and other character classification */
#include <stdio.h>
#include <stdlib.h>				/* malloc, free */
#include <string.h>				/* strcmp, strcpy and other string operations */

#define		MAXWORD		100		/* size of the word buffer passed to getword, terminating '\0' included */

struct tnode					/* one node of the word tree */
{
	char* word;					/* points to the text of the word */
	int count;					/* number of occurrences of the word */
	struct tnode* left;			/* left child */
	struct tnode* right;		/* right child */
};

/* Forward declarations of functions that are defined later in this file. */
struct tnode* addtree(struct tnode*, char* );
void treeprint(struct tnode* );
int getword(char*, int);

/*
 * Word-frequency counter: reads tokens with getword() until EOF, records
 * every token that starts with a letter in a binary tree, then prints the
 * tree (count followed by word) with treeprint().
 */
int main()
{
	struct tnode* root = NULL;
	char word[MAXWORD];

	while(getword(word, MAXWORD) != EOF)
	{
		/* <ctype.h> functions require a value representable as unsigned
		   char (or EOF); a plain char can be negative for bytes >= 0x80. */
		if(isalpha((unsigned char)word[0]))
			root = addtree(root, word);
	}
	treeprint(root);

	return 0;
}

/* Helpers called by addtree; declared here because their definitions come later. */
struct tnode* talloc();
char* strdup(char* );

/*
 * addtree: record word w in the tree rooted at p.
 *
 *   p  root of the (sub)tree to search; NULL means the subtree is empty
 *   w  NUL-terminated word to add
 *
 * If w is already present its count is incremented; otherwise a new node
 * holding a private copy of w is attached at the position found by strcmp()
 * ordering. Returns the root of the (sub)tree after the update.
 *
 * When the node or the copy of the word cannot be allocated, a message is
 * written to stderr and the (sub)tree is returned unchanged.
 */
struct tnode* addtree(struct tnode* p, char* w)
{
	int cond;

	if(p == NULL)				/* w is a new word */
	{
		struct tnode* node = talloc();
		char* copy = (node != NULL) ? strdup(w) : NULL;

		if(node == NULL || copy == NULL)
		{
			fprintf(stderr, "addtree: out of memory\n");
			free(node);			/* free(NULL) is a no-op */
			return NULL;		/* p was NULL, so nothing changes */
		}
		node->word = copy;
		node->count = 1;
		node->left = node->right = NULL;
		p = node;
	}
	else if((cond = strcmp(p->word, w)) == 0)
		++p->count;				/* w matches the word stored in this node */
	else if(cond > 0)			/* w sorts before this node: go left */
		p->left  = addtree(p->left, w);
	else						/* w sorts after this node: go right */
		p->right = addtree(p->right, w);

	return p;
}

/*
 * treeprint: print the tree rooted at p in order (left subtree, node,
 * right subtree), one "count word" line per node. A NULL tree prints nothing.
 */
void treeprint(struct tnode* p)
{
	/* Recurse only into the left subtree; walk the right spine in a loop. */
	for(; p != NULL; p = p->right)
	{
		treeprint(p->left);
		printf("%4d %s\n", p->count, p->word);
	}
}

/*
 * talloc: obtain storage for one struct tnode from malloc().
 * The result is passed back as-is, so it is NULL when malloc() fails;
 * the node's fields are not initialised here.
 */
struct tnode* talloc()
{
	struct tnode* node;

	node = (struct tnode*)malloc(sizeof *node);
	return node;
}

/*
 * strdup: return a malloc()-allocated copy of the NUL-terminated string s,
 * or NULL when the allocation fails.
 */
char* strdup(char* s)
{
	size_t size = strlen(s) + 1;		/* characters plus the terminating '\0' */
	char* copy = (char*)malloc(size);

	if(copy == NULL)
		return NULL;

	memcpy(copy, s, size);
	return copy;
}

/* Pushback stack shared by getch() and ungetch(). */
#define		BUFSIZE		10000
static int buf[BUFSIZE];		/* pushed-back values, most recent on top */
static int bufp = 0;			/* index of the next free slot in buf */

/*
 * getch: return the most recently pushed-back value if there is one,
 * otherwise read the next character from standard input with getchar().
 */
int getch()
{
	if(bufp > 0)
		return buf[--bufp];

	return getchar();
}

/*
 * ungetch: push val back so that the next getch() returns it.
 * When the stack is already full the value is dropped and a message is printed.
 */
void ungetch(int val)
{
	if(bufp < BUFSIZE)
	{
		buf[bufp++] = val;
		return;
	}
	printf("error: buf full!\n");
}

/*
 * getword: fetch the next token from the input supplied by getch().
 *
 *   word  destination buffer
 *   lim   size of the buffer in chars; must be at least 2
 *
 * Leading whitespace is skipped. If the first remaining character is a
 * letter, it and the letters that follow are stored in word (at most
 * lim - 1 characters plus the terminating '\0'); the character that ended
 * the word is pushed back with ungetch(). Any other character is stored as
 * a one-character string. At end of input word becomes the empty string.
 *
 * Returns the first character of the word, the non-letter character itself,
 * or EOF at end of input.
 */
int getword(char* word, int lim)
{
	char* w = word;
	int c;

	/* Skip the whole run of leading whitespace, not just one character. */
	while(isspace(c = getch()))
		;
	if(c != EOF)
		*w++ = c;
	if(!isalpha(c))
	{
		*w = '\0';
		return c;
	}
	--lim;						/* one slot is already used by the first letter */
	for(; --lim > 0; ++w)
	{
		/* Keep the value in an int until it is classified: storing it in a
		   char first would lose EOF and could hand isalpha()/ungetch() a
		   negative value. */
		c = getch();
		if(!isalpha(c))
		{
			ungetch(c);
			break;
		}
		*w = c;
	}
	*w = '\0';

	return word[0];
}
分析：

1，因为预先不知道出现的单词列表，无法方便地排序并使用折半查找；也不能分别对输入中的每个单词都执行一次线性查找，开销太大。

        使用二叉树的数据结构来组织这些单词：对节点的所有操作要保证，任何节点的左子树只包含按字典顺序小于该节点中单词的那些节点，右子树只包含按字典顺序大于该节点中单词的那些节点。

2，函数的设计用到直接递归，通过对输入中获取的每个单词调用一次递归addtree函数来构建整个单词二叉树；通过调用递归函数treeprint

        函数来按序打印单词及出现频率。

3， L124：getword中的--lim; 此语句是确保单词不会越界，例如：宏定义了MAXWORD为100，表示单词最大长度为99，故在进入L125的

        for循环之前应该将lim减一变成99，这是因为前面已经在w指向的空间的首位放入了一个字符c（L118：*w++ = c ;），这样在极限情况下

        for循环最多只会执行98次，故而保证单词的最大长度不超过99。

4，关于ungetch getch函数，不直接使用getchar而使用getch的原因在于：需要将getword中最后一次取出的不合条件的字符重新压回输入

        缓冲区中，供下次getword时输入之用，而不能简单地将其抛弃，这样会出现逻辑错误。

        而stdio.h中提供了库函数getchar却没有提供与之对应的ungetchar函数，故而需要自定义getch与ungetch函数。