数据结构——哈夫曼树

哈夫曼树

给定 n 个带权值的叶子结点构造二叉树,若该树的带权路径长度(WPL,即各叶子结点的权值与其到根的路径长度的乘积之和)达到最小,则称这样的二叉树为最优二叉树,也称为哈夫曼树(Huffman Tree)。哈夫曼树是带权路径长度最短的树,权值较大的结点离根较近。

任务描述:

  1. 从文件中读文本;
  2. 统计字符频次, 并建立字母表;
  3. 构造 Huffman 树;
  4. 将文本编码;
  5. 解码.

版本一

分析:

  1. parent, lchild, rchild 都用的是整数, 即下标. 也就是说, 先建立节点, 存放于一个数组 (线性表) 中, 再进行链接. 避免使用指针, 有静态链表的味道, good!

代码

#include <iostream>
#include <fstream>
#include <string.h>
using namespace std; 

#define MaxSize 1024  // upper bound on the amount of text read from the input file
#define OK 1  // Status value meaning success
#define ERROR 0  // Status value meaning failure
typedef int Status;  // return type of the functions below (OK or ERROR)

typedef struct wordcnt{  // one alphabet entry: a character and how many times it occurs
	char ch;  // the character
	int cnt = 0;  // occurrence count; the default member initializer starts it at zero
}Count;

typedef struct NumCount{  // wrapper around the counts: the table plus the number of entries in use
	Count count[MaxSize];  // entries 0 .. length-1 are in use
	int length = 0;  // number of distinct characters recorded so far
}NumCount;

// One node of the Huffman tree. Nodes are stored in an array and refer to
// each other by array index instead of by pointer; index 0 means "none".
typedef struct HTree{
	char data;    // character carried by a leaf
	int weight;   // occurrence count of a leaf, or the sum of both children
	int parent;   // index of the parent node, 0 when there is none
	int lchild;   // index of the left child, 0 when there is none
	int rchild;   // index of the right child, 0 when there is none
}HTNode,*HuffmanTree;

typedef struct HCode{ // code-table entry: a character and its Huffman code
	char data;  // the character
	char* str;  // its code, a NUL-terminated string of '0' and '1' characters
}*HuffmanCode;


Status ReadData(char *source);  // read the input file into source
Status WordCount(char *data,NumCount *paraCnt); // count how often each character occurs
Status Show(NumCount *paraCnt);   // print the per-character counts
Status CreateHuffmanTree(HuffmanTree &HT,int length,NumCount cntarray);  // build the Huffman tree
Status select(HuffmanTree HT,int top,int *s1,int *s2);  // pick the two parentless nodes of least weight
Status CreateHuffmanCode(HuffmanTree HT,HuffmanCode &HC,int length);  // build the code table
Status Encode(char *data,HuffmanCode HC,int length);  // encode the text that was read and write it to a txt file
Status Decode(HuffmanTree HT,int length);  // read the encoded file back and decode it

int main(int argc, char** argv) {
	char data[MaxSize];  
	NumCount Cntarray;
	ReadData(data);  // 读入数据 
	WordCount(data,&Cntarray);  // 统计次数 
//	Show(&Cntarray); //可以查看每个单词出现的对应次数 
	HuffmanTree tree;
	CreateHuffmanTree(tree,Cntarray.length,Cntarray);  // 建树 
	HuffmanCode code;  
	CreateHuffmanCode(tree,code,Cntarray.length);  // 创建编码 
	Encode(data,code,Cntarray.length);  // 生成编码文件 
	Decode(tree,Cntarray.length);  // 解码 
	cout<<"Please view the generated TXT file to check the result"<<endl; 
	return 0;
}

// Reads the first line of "in.txt" into source and echoes it to stdout.
//
// source must have room for MaxSize bytes; at most MaxSize-1 characters of
// the line are stored, followed by a NUL. Only the first line of the file is
// read.
// Returns OK on success, ERROR when the file cannot be opened (source is then
// an empty string).
Status ReadData(char *source)
{
	source[0] = '\0';  // callers run strlen on source, so never leave it unset
	ifstream infile;
	infile.open("in.txt");
	if(!infile.is_open())
	{
		cout<<"Cannot open in.txt"<<endl;
		return ERROR;
	}
	cout<<"Reading..."<<endl;
	cout<<"the input file is:"<<endl;
	infile.getline(source,MaxSize);
	cout<<source<<endl;
	infile.close();
	cout<<endl;
	return OK;
}

// Tallies every character of the NUL-terminated string data into paraCnt.
// A character seen for the first time is appended to the table; in both
// cases its counter is incremented. Always returns OK.
Status WordCount(char *data,NumCount *paraCnt)
{
	const int total = strlen(data);
	for(int pos = 0;pos < total;++pos)
	{
		const char cur = data[pos];

		// Linear search for an existing entry; slot ends at length if none.
		int slot = 0;
		while(slot < paraCnt->length && paraCnt->count[slot].ch != cur)
			++slot;

		if(slot == paraCnt->length)  // not recorded yet: open a new entry
		{
			paraCnt->count[slot].ch = cur;
			++paraCnt->length;
		}
		++paraCnt->count[slot].cnt;
	}
	return OK;
}

// Prints the number of table entries and then one line per recorded
// character with its occurrence count. Always returns OK.
Status Show(NumCount *paraCnt)
{
	const int total = paraCnt->length;
	cout<<"the length is "<<total<<endl;

	int idx = 0;
	while(idx < total)
	{
		const Count &entry = paraCnt->count[idx];
		cout<<"The character "<<entry.ch<<"  appears  "<<entry.cnt<<endl;
		++idx;
	}

	cout<<endl;
	return OK;
}

// Builds the Huffman tree for the alphabet in cntarray.
//
// HT       receives a new[]-allocated array of 2*length nodes; index 0 is
//          unused, 1..length are the leaves, length+1..2*length-1 the
//          internal nodes, and 2*length-1 is the root. The caller releases it
//          with delete[].
// length   number of distinct characters (leaves).
// cntarray the per-character counts; entry i-1 becomes leaf i.
//
// Returns OK on success. Returns ERROR and sets HT to NULL when length <= 1,
// because a tree needs at least two leaves.
Status CreateHuffmanTree(HuffmanTree &HT,int length,NumCount cntarray)
{
	HT = NULL;  // give the caller a defined value on the failure path
	if(length <= 1) return ERROR;
	int s1 = 0,s2 = 0;
	int m = length*2-1;  // no node has exactly one child, so nodes = 2*leaves-1
	HT = new HTNode[m+1];
	for(int i = 0;i <= m;++i)  // clear every field, including the unused slot 0
	{
		HT[i].data = '\0';
		HT[i].weight = 0;
		HT[i].parent = 0;
		HT[i].lchild = 0;
		HT[i].rchild = 0;
	}
	
	for(int i = 1;i <= length;++i)  // fill in the leaves
	{
		HT[i].data = cntarray.count[i-1].ch;
		HT[i].weight = cntarray.count[i-1].cnt;
	}
	
	for(int i = length + 1;i <= m;++i)
	{
		select(HT,i-1,&s1,&s2);  // the two lightest parentless nodes among 1..i-1
		HT[s1].parent = i;
		HT[s2].parent = i;
		HT[i].lchild = s1;
		HT[i].rchild = s2;
		HT[i].weight = HT[s1].weight + HT[s2].weight;  // weight of the merged node
	}
	return OK;
}

// Finds, among nodes 1..top that have no parent yet, the one with the
// smallest weight (*s1) and the smallest of the remaining ones (*s2).
// Ties go to the lower index.
//
// Both outputs are always written; an output is 0 when no such node exists.
// Returns OK when two nodes were found, ERROR otherwise.
Status select(HuffmanTree HT,int top,int *s1,int *s2)
{
	// Index 0 is never a real node, so it doubles as "nothing found yet";
	// this also removes the need for an INT_MAX sentinel.
	int first = 0;
	for(int i = 1;i <= top;++i)
	{
		if(HT[i].parent != 0) continue;
		if(first == 0 || HT[i].weight < HT[first].weight)
			first = i;
	}
	
	int second = 0;
	for(int i = 1;i <= top;++i)
	{
		if(HT[i].parent != 0 || i == first) continue;
		if(second == 0 || HT[i].weight < HT[second].weight)
			second = i;
	}
	
	*s1 = first;
	*s2 = second;
	return (first != 0 && second != 0) ? OK : ERROR;
}

// Builds the code table for the leaves of HT.
//
// HT     tree produced by CreateHuffmanTree (leaves at 1..length).
// HC     receives a new[]-allocated array of length+1 entries; entry 0 is
//        unused, entry i holds the character of leaf i and its code in a
//        new[]-allocated string. The caller releases both with delete[].
// length number of leaves.
//
// Returns OK on success; returns ERROR and sets HC to NULL when HT is NULL or
// length < 1.
Status CreateHuffmanCode(HuffmanTree HT,HuffmanCode &HC,int length)
{
	HC = NULL;
	if(HT == NULL || length < 1) return ERROR;

	HC = new HCode[length+1];
	HC[0].data = '\0';  // slot 0 is unused; keep it well defined
	HC[0].str = NULL;
	char *cd = new char[length];  // scratch buffer: a code has at most length-1 bits
	cd[length-1] = '\0';  // terminator, so strcpy can be used below
	for(int i = 1;i <= length;++i)
	{
		// The code is produced leaf-to-root, so it is written backwards from
		// the end of the scratch buffer; start is the index of its first bit.
		int start = length-1;
		int c = i;
		int f = HT[c].parent;
		while(f != 0)
		{
			--start;
			if(HT[f].lchild == c)
				cd[start] = '0';
			else 
				cd[start] = '1';
			c = f;
			f = HT[c].parent;
		}
		HC[i].str = new char[length-start];  // bits plus terminator
		HC[i].data = HT[i].data;
		strcpy(HC[i].str,&cd[start]);
	}
	delete[] cd;  // allocated with new[], so it must be released with delete[]
	return OK;
}

// Encodes the NUL-terminated text in data with the code table HC (entries
// 1..length) and writes the resulting '0'/'1' string to "code.txt".
// Characters without a table entry produce no output.
// Returns OK on success, ERROR when code.txt cannot be opened for writing.
Status Encode(char *data,HuffmanCode HC,int length)
{
	ofstream outfile;
	outfile.open("code.txt");
	if(!outfile.is_open())
	{
		cout<<"Cannot open code.txt for writing"<<endl;
		return ERROR;
	}
	const size_t total = strlen(data);  // measured once, not on every iteration
	for(size_t i = 0;i < total;++i)
	{
		for(int j = 1;j <= length;++j)
		{
			if(data[i] == HC[j].data)
			{
				outfile<<HC[j].str;
				break;  // table characters are distinct, so there is only one match
			}
		}
	}
	outfile.close();
	cout<<"the code txt has been written"<<endl;
	cout<<endl;
	return OK;
}

// Reads the first line of "code.txt", decodes it by walking HT from the root
// (node 2*length-1), and writes the recovered characters to "out.txt".
// Characters other than '0' and '1' in the input are skipped.
// Returns OK on success; ERROR when HT is NULL, length <= 1, or one of the
// files cannot be opened.
Status Decode(HuffmanTree HT,int length)
{
	if(HT == NULL || length <= 1) return ERROR;

	// Heap buffer instead of a variable-length array: VLAs are not standard
	// C++ and MaxSize*length bytes can be too much for the stack.
	const int bufSize = MaxSize*length;
	char *codetxt = new char[bufSize];
	codetxt[0] = '\0';

	ifstream infile;
	infile.open("code.txt");
	if(!infile.is_open())
	{
		cout<<"Cannot open code.txt"<<endl;
		delete[] codetxt;
		return ERROR;
	}
	infile.getline(codetxt,bufSize);
	infile.close();
	
	ofstream outfile;
	outfile.open("out.txt");
	if(!outfile.is_open())
	{
		cout<<"Cannot open out.txt for writing"<<endl;
		delete[] codetxt;
		return ERROR;
	}
	
	const int rootIdx = 2*length-1;
	int cur = rootIdx;  // current position in the tree
	const size_t total = strlen(codetxt);
	for(size_t i = 0;i < total;++i)
	{
		if(codetxt[i] == '0') cur = HT[cur].lchild;  // 0: go left
		else if(codetxt[i] == '1') cur = HT[cur].rchild;  // 1: go right
		if(HT[cur].lchild == 0 && HT[cur].rchild == 0)  // leaf: emit it and restart at the root
		{
			outfile<<HT[cur].data;
			cur = rootIdx;
		}
	}
	outfile.close();
	delete[] codetxt;
	cout<<"the output txt has been written"<<endl;
	cout<<endl;
	return OK;
}

文本文件内容

in.txt

Life is full of confusing and disordering Particular time,a particular location,Do the arranged thing of ten million time in the brain,Step by step ,the life is hard to avoid delicacy and stiffness No enthusiasm forever,No unexpected happening of surprising and pleasing So,only silently ask myself in mind Next happiness,when will come?

版本二

  1. source.txt 这么短, 怎么编码 😦
  2. 用的是正宗的链式结构. 难度增加.
#include <stdio.h>
#include <stdlib.h>

// Maximum number of characters read from the text file; also used as the
// capacity of the leaf array and of STACK.
#define MAXSIZE 100

// A tree node linked to its relatives by pointers.
typedef struct NODE
{
    char c;      // character held by a leaf
    int weight;  // weight of the node
    int binary;  // bit this node contributes to a code: 0 for a left child, 1 for a right child
    int flag;    // 1 once the node has been placed into the tree, 0 before that

    struct NODE *lchild;  // left child, NULL if none
    struct NODE *rchild;  // right child, NULL if none
    struct NODE *parent;  // parent node, NULL if none

} NODE, *PNODE;

// Fixed-capacity stack of ints; charEncode fills it with the bits of one code.
typedef struct
{
    int valuse[MAXSIZE]; // element storage (spelled "valuse" wherever it is used)
    int top;             // number of stored elements, i.e. index of the next free slot
} STACK;

/**
 * Stack operations
 **/
// Makes stack empty by resetting its element count to zero.
void initStack(STACK &stack)
{
    stack.top = 0;
}

// Pushes value onto stack. The stack holds at most MAXSIZE elements; a push
// onto a full stack is ignored instead of writing past the end of the array.
void push(STACK &stack, int value)
{
    if (stack.top >= MAXSIZE)
        return;
    stack.valuse[stack.top++] = value;
}

// Removes and returns the most recently pushed value, or returns -1 without
// touching the stack when it is empty.
int pop(STACK &stack)
{
    return stack.top == 0 ? -1 : stack.valuse[--stack.top];
}

// Puts a tree node into its initial state: no relatives, not yet placed,
// weight 0, bit 0, and -1 as the "no character" marker.
void initNode(PNODE node)
{
    node->c = -1;
    node->weight = 0;
    node->binary = 0;
    node->flag = 0;
    node->lchild = node->rchild = node->parent = NULL;
}

/**
 * Allocates a node on the heap, initialises it with initNode and sets its
 * weight.
 *
 * param weight weight of the new node
 * return the new node, or NULL when the allocation fails
 **/
PNODE createNode(int weight)
{
    PNODE fresh = (PNODE)malloc(sizeof(NODE));
    if (fresh == NULL)
        return NULL;

    initNode(fresh);
    fresh->weight = weight;
    return fresh;
}

/**
 * Encodes one character.
 * Looks up the leaf that holds the character, then walks up towards the
 * root, pushing the bit of every node on the way. Popping the stack
 * therefore yields the code from the root downwards.
 *
 * param c the character to encode
 * param childrenNodes array of leaf nodes
 * param lenOfNodes number of leaf nodes
 * return the stack holding the path bits; empty when no leaf holds c
 **/
STACK charEncode(char c, PNODE childrenNodes, int lenOfNodes)
{
    STACK path;
    initStack(path);

    // Locate the leaf for c; idx ends at lenOfNodes when there is none.
    int idx = 0;
    while (idx < lenOfNodes && childrenNodes[idx].c != c)
        idx++;

    if (idx < lenOfNodes)
    {
        for (PNODE cur = &childrenNodes[idx]; cur->parent != NULL; cur = cur->parent)
            push(path, cur->binary);
    }
    return path;
}

/**
 * Encodes a string.
 *
 * param str the NUL-terminated string to encode
 * param childrenNodes array of the tree's leaf nodes
 * param lenOfNodes number of leaf nodes
 * return a malloc'ed, NUL-terminated string of '0'/'1' characters that the
 *        caller must free; NULL when str is NULL, lenOfNodes is negative, or
 *        the allocation fails
 **/
char *strEncode(char *str, PNODE childrenNodes, int lenOfNodes)
{
    if (str == NULL || lenOfNodes < 0)
        return NULL;

    // One extra byte for the terminator, so an empty alphabet
    // (lenOfNodes == 0) still has room for the empty string.
    size_t capacity = (size_t)MAXSIZE * (size_t)lenOfNodes + 1;
    char *result = (char *)malloc(capacity);
    if (result == NULL)
        return NULL;

    size_t len = 0;
    for (; *str != '\0'; str++)
    {
        STACK stack = charEncode(*str, childrenNodes, lenOfNodes);
        // Never write past the buffer, whatever the input length is.
        while (stack.top > 0 && len + 1 < capacity)
        {
            result[len++] = (char)(pop(stack) + '0');
        }
    }
    result[len] = '\0';
    return result;
}

/**
 * Picks the lightest leaf that has not been taken yet and marks it as taken
 * (flag = 1). Among equal weights the leaf with the lowest index wins.
 *
 * param nodes array of leaf nodes
 * param lenOfNodes number of leaf nodes (expected to be non-negative)
 * return address of the chosen leaf, or NULL when every leaf is already taken
 **/
PNODE getMinWeightNode(PNODE nodes, int lenOfNodes)
{
    PNODE best = NULL;

    for (int idx = 0; idx < lenOfNodes; idx++)
    {
        PNODE cand = &nodes[idx];
        if (cand->flag != 0)
            continue; // already part of the tree
        if (best == NULL || cand->weight < best->weight)
            best = cand;
    }

    if (best != NULL)
        best->flag = 1;
    return best;
}

/**
 * Builds the coding tree from the leaf nodes.
 *
 * Repeatedly takes the lightest leaf not yet placed (getMinWeightNode) and
 * joins it with the tree built so far under a new parent whose weight is the
 * sum of the two. The lighter of the two becomes the left child (bit 0), the
 * other one the right child (bit 1).
 *
 * NOTE(review): each step merges the running tree with one more leaf, so the
 * tree has one leaf per level. A textbook Huffman build would instead merge
 * the two lightest roots among all subtrees. The scheme is left unchanged
 * because main() decodes a hard-coded bit string that presumably was
 * produced with it - confirm before changing.
 *
 * param nodes array of leaf nodes
 * param lenOfNodes number of leaf nodes
 * param childNode the tree built so far, NULL to start from scratch
 * return the root of the tree; NULL when there is nothing to build from, or
 *        when a parent node cannot be allocated
 **/
PNODE createHuffmanTree(PNODE nodes, int lenOfNodes, PNODE childNode)
{
    PNODE root = childNode;

    // Loop form of the original tail recursion; unlike that version, every
    // path returns a value.
    for (;;)
    {
        PNODE minWeightNode = getMinWeightNode(nodes, lenOfNodes);
        if (!minWeightNode)
            return root;

        if (!root)
        {
            // The very first leaf is the whole tree so far.
            root = minWeightNode;
            continue;
        }

        PNODE parentNode = createNode(root->weight + minWeightNode->weight);
        if (!parentNode)
            return NULL;

        if (root->weight < minWeightNode->weight)
        {
            parentNode->lchild = root;
            parentNode->rchild = minWeightNode;
        }
        else
        {
            parentNode->rchild = root;
            parentNode->lchild = minWeightNode;
        }
        parentNode->lchild->binary = 0;
        parentNode->rchild->binary = 1;

        root->parent = minWeightNode->parent = parentNode;
        root = parentNode;
    }
}

/**
 * Folds an upper-case ASCII letter to lower case.
 * Without folding, the two cases of a letter would be separate symbols,
 * which enlarges the tree and makes the coding less efficient.
 *
 * param c the input character
 * return the lower-case letter when c is 'A'..'Z', otherwise c unchanged
 **/
char charTolowercase(char c)
{
    const int isUpper = (c >= 'A') && (c <= 'Z');
    return isUpper ? (char)(c + ('a' - 'A')) : c;
}

/**
 * Reads the source file, counts how often each character occurs and builds
 * the coding tree from those counts.
 *
 * Upper-case letters are folded to lower case first. At most MAXSIZE - 1
 * characters are read, so buff and childrenNodes must each have room for
 * MAXSIZE elements; anything beyond that is ignored.
 * The process exits with a failure status when the file cannot be opened.
 *
 * param filePath path of the file to read
 * param buff receives the (case-folded) text as a NUL-terminated string
 * param childrenNodes receives one leaf node per distinct character
 * param lenOfNodes in: number of leaves already present (normally 0);
 *                  out: number of leaves after reading
 * return the root of the tree, as returned by createHuffmanTree
 **/
PNODE readFromSource(const char *filePath, char *buff, PNODE childrenNodes, int &lenOfNodes)
{

    // Length of the text read so far
    int lenOfStr = 0, i;

    // fgetc returns an int so that EOF can be told apart from every byte value
    int ch;

    FILE *file = fopen(filePath, "rb");

    if (file == NULL)
    {
        puts("Can't find source file!");
        exit(EXIT_FAILURE);
    }

    // Read byte by byte. Testing the fgetc result (rather than feof) also
    // ends the loop on a read error.
    while (lenOfStr < MAXSIZE - 1 && (ch = fgetc(file)) != EOF)
    {
        char c = charTolowercase((char)ch);

        for (i = 0; i < lenOfNodes; i++)
        {
            if (childrenNodes[i].c == c)
            {
                break;
            }
        }

        if (i == lenOfNodes)
        {
            // New character: it needs a fresh leaf. Stop reading if the leaf
            // array is full, so text and alphabet stay consistent.
            if (lenOfNodes >= MAXSIZE)
            {
                break;
            }
            initNode(&childrenNodes[lenOfNodes]);
            childrenNodes[lenOfNodes].c = c;
            lenOfNodes++;
        }

        childrenNodes[i].weight++;
        buff[lenOfStr++] = c;
    }
    buff[lenOfStr] = '\0';

    fclose(file);
    return createHuffmanTree(childrenNodes, lenOfNodes, NULL);
}

/**
 * Stores the encoded string in a file.
 * A success message is printed once the data has been written and the file
 * closed; a diagnostic is printed when there is nothing to write, the file
 * cannot be opened, or writing fails.
 *
 * param filePath path of the file to create or overwrite
 * param result the encoded '0'/'1' string; NULL is reported and ignored
 **/
void writeResult(const char *filePath, char *result)
{
    if (result == NULL)
    {
        puts("No result to write!");
        return;
    }

    FILE *fp = fopen(filePath, "wb");
    if (fp == NULL)
    {
        puts("Can't open result file!");
        return;
    }

    int ok = fputs(result, fp) >= 0;
    // fclose flushes buffered data, so a failure here is a write failure too.
    if (fclose(fp) != 0)
    {
        ok = 0;
    }

    if (ok)
    {
        printf("生成结果成功\r\n");
    }
    else
    {
        puts("Failed to write result file!");
    }
}

/**
 * 将编码后的二进制字符串进行解码
 * param str 欲解码的二进制字符串
 * param 哈夫曼树根结点
 * return 返回解码后的字符串
 * 
 * tips:哈夫曼树是根据原字符串构造而来,一棵树对应一种编码方式
 **/
char *strDecode(const char *str, PNODE TreeRoot)
{
    const char *tmp = str;
    char *result = (char *)malloc(sizeof(char) * MAXSIZE);
    //用来记录结果字符串的长度
    int len = 0;
    while (*tmp != '\0')
    {
        PNODE tmpNode = TreeRoot;
        while (tmpNode->lchild && tmpNode->rchild)
        {
            tmpNode = *tmp == '0' ? tmpNode->lchild : tmpNode->rchild;
            tmp++;
        }
        result[len++] = tmpNode->c;
    }
    result[len] = '\0';
    return result;
}

int main()
{
    char buff[MAXSIZE];

    NODE childrenNodes[MAXSIZE];

    int len = 0;

    //确保source.txt和exe处于同目录
    PNODE root = readFromSource("source.txt", buff, childrenNodes, len);

    writeResult("result.txt", strEncode(buff, childrenNodes, len));

    printf("%s", strDecode("11111111111110111111111110011101111111111011001110011111111101111111101111111011111101111101100111101010", root));

    return 0;
}

source.txt

you are a pig
me too

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值