c++ 数据结构 *** 哈夫曼树的应用——压缩软件

最新推荐文章于 2023-10-11 16:33:15 发布

treeshy

最新推荐文章于 2023-10-11 16:33:15 发布

阅读量4.8k

点赞数 6

分类专栏： [c++]数据结构

本文链接：https://blog.csdn.net/treeshy/article/details/50392535

版权

[c++]数据结构专栏收录该内容

5 篇文章 0 订阅

订阅专栏

数据结构的作业，压缩软件用的，具体写的过程中有哪些问题在程序里说吧。

头文件与常量部分：

利用char的8位来存储文件里的元素：每次从文件中取出8位（一个字节），并记录该字节值出现的次数，用来建立哈夫曼树。

#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<iostream>
#include<iterator>
#include<list>
#include<stack>
#include<string>

using namespace std;

// Input is consumed one 8-bit char at a time, so plain text, multi-byte
// characters and any other file format can all be compressed the same way.
const unsigned int N = 1u << 8;  // a char has 8 bits, so at most 256 distinct symbols (maximum leaf count)
const unsigned int M = N + N;    // a Huffman tree with n leaves has 2*n-1 nodes; the last slot (M-1) stores no tree node and is used by select() as a sentinel
const unsigned long long MAX = (1ULL << 48) - 1;  // 0xffffffffffff, weight given to the sentinel slot

三个类的声明部分：

Buffer类用来从文件中读取8位字符或者写入8位字符。因为每次都只能从文件中读取或者写入一个字符，所以利用Buffer类进行缓冲。

treeNode类是哈夫曼树的结点类，其中保存了树中每个结点出现的频率与其左右子结点与父节点。

HuffmanTree类是哈夫曼树类，可以进行压缩或解压。

// Bit buffer used for file I/O: the file can only be read or written one whole
// char at a time, so single bits are staged here first.
struct Buffer {
	unsigned int bits;  // how many bits of `ch` are currently in use
	char ch;            // the byte being assembled (writing) or consumed (reading)
};

class HuffmanTree;  // forward declaration of the Huffman tree class

// One node of the Huffman tree. All fields are private; only HuffmanTree
// (declared a friend) reads or writes them.
class treeNode {
	friend class HuffmanTree;

	unsigned long long weight;  // how many times this node occurs in total
	unsigned int right;         // index of the right child in the node array
	unsigned int left;          // index of the left child in the node array
	unsigned int parents;       // index of the parent in the node array
};

// Huffman coder: Encode() compresses a file, Decode() restores one.
class HuffmanTree {
public:
	void Encode();  // compress a file
	void Decode();  // decompress a file

private:
	treeNode nodes[M];       // every node of the tree, at most M of them
	unsigned int leaf[N];    // per byte value: occurrence count while scanning, afterwards the node index of that byte
	unsigned int index[N];   // per leaf node: the byte value it stands for
	char* leafCode[N];       // per leaf node: its code as a NUL-terminated string of '0'/'1'
	FILE* input;
	FILE* output;
	unsigned int num;        // number of leaves actually in use
	unsigned long long size; // total number of bytes in the uncompressed data
	Buffer buf;              // bit buffer shared by the read/write helpers

	// Bit-level I/O helpers
	void write(unsigned int i);                          // append one bit to the output
	void write(unsigned int target, unsigned int bits);  // append the low `bits` bits of `target`
	void writerest();                                    // flush a partially filled byte left in buf
	void read(unsigned int &i);                          // fetch one bit from the input into i
	void read(unsigned int &target, unsigned int bits);  // fetch a `bits`-bit number into target

	// Encode helpers
	void enSieve();    // open the files and count how often each byte occurs
	void select(unsigned int pos, unsigned int &t1, unsigned int &t2);  // pick the two lightest parentless nodes in [0, pos)
	void enSetTree();  // build the Huffman tree and the code string of every leaf

	// Decode helper
	void deSieve();    // open the files and rebuild the tree stored in the compressed file
};

HuffmanTree类的实现：

读取/写入辅助函数：

主要是利用Buffer类中的bits来记录ch中实际有效的位数。在write的时候，如果ch中实际位数已经凑满8位，那么将ch写入文件，然后再将bits和ch都置0；在read的时候，如果ch的实际位数已经等于0，那么从文件中用fgetc()取一个8位数给ch，同时bits置8。另外，写到最后的时候bits有可能并不等于0，这时用writerest在低位补0凑满8位，把剩下的ch写入文件。

//辅助函数块
// Appends one bit to the output bit buffer; once 8 bits have accumulated the
// byte is written to `output` and the buffer is reset.
//   i: the bit to append; only its lowest bit is used.
void HuffmanTree::write(unsigned int i) {
	// buf.ch is a plain char, which may be signed. Left-shifting a negative
	// value is undefined behaviour before C++20, so shift it as unsigned.
	const unsigned int shifted = static_cast<unsigned int>(static_cast<unsigned char>(buf.ch)) << 1;
	buf.ch = static_cast<char>(static_cast<unsigned char>(shifted | (i & 1u)));
	buf.bits++;  // one more bit of ch is in use
	if (buf.bits == 8) {
		// The byte is full: emit it and start collecting the next one.
		fputc(static_cast<unsigned char>(buf.ch), output);
		buf.bits = 0;
		buf.ch = 0;
	}
}
// Writes the low `bits` bits of `target` to the output, most significant of
// those bits first, by handing them one at a time to write(unsigned int).
void HuffmanTree::write(unsigned int target, unsigned int bits) {
	for (unsigned int remaining = bits; remaining > 0; --remaining) {
		const unsigned int shift = remaining - 1;
		// Positions beyond the width of unsigned int can only ever hold zero.
		const unsigned int bit = (shift < sizeof(target) * 8) ? ((target >> shift) & 1u) : 0u;
		write(bit);
	}
}
// Flushes a partially filled byte: pads it with zero bits until write()
// emits it. Does nothing when the buffer is empty.
void HuffmanTree::writerest() {
	if (buf.bits == 0)
		return;
	unsigned int padding = 8 - buf.bits;  // zero bits still needed to complete the byte
	while (padding-- > 0)
		write(0);
}
// Reads one bit from the input, most significant bit of each byte first.
//   i: receives the bit (0 or 1).
// When the buffer is empty a new byte is fetched with fgetc(). At end of file
// fgetc() returns EOF, which is stored as an all-ones byte; this function
// does not report the condition itself.
void HuffmanTree::read(unsigned int &i) {
	if (buf.bits == 0) {
		buf.bits = 8;
		buf.ch = static_cast<char>(fgetc(input));
	}
	// buf.ch is a plain char, which may be signed. Left-shifting a negative
	// value is undefined behaviour before C++20, so work on it as unsigned.
	const unsigned char current = static_cast<unsigned char>(buf.ch);
	i = (current >> 7) & 1u;
	buf.bits--;
	buf.ch = static_cast<char>(static_cast<unsigned char>(current << 1));
}
// Reads a `bits`-bit number from the input, first bit read being the most
// significant, and stores it in `target`.
void HuffmanTree::read(unsigned int &target, unsigned int bits) {
	target = 0;
	unsigned int bit;
	for (unsigned int remaining = bits; remaining != 0; --remaining) {
		read(bit);
		target = (target << 1) | bit;
	}
}

EnCode()函数及其辅助函数的实现部分：

昨晚在这部分出现了一个错误点，在enSetTree函数的实现部分，如下代码：

	for (int i = 0; i < N; ++i)
		leafCode[i] = NULL;

被我写成了：

	for (int i = 0; i <= N; ++i)
		leafCode[i] = NULL;

然后就导致了FILE* input莫名其妙被置成了NULL：leafCode只有N个元素，leafCode[N]已经越界，而类里紧跟在leafCode后面声明的正是input，所以这次越界写入把input覆盖成了NULL。找到错误之后感慨了一下不要乱置NULL。。下标一定要看清。。。

//Encode辅助函数块
// Prompts for the source and target file names, opens both files and counts
// how often every byte value occurs in the source.
// On return:
//   - input/output are open, size holds the number of source bytes,
//   - nodes[0..num) are the leaves (one per byte value that occurs),
//   - leaf[byte] is the node index of that byte, index[node] is its byte value,
//   - nodes[M-1] carries weight MAX so select() can use it as a sentinel.
// Terminates the program with exit(1) if a file cannot be opened or the
// source is empty.
void HuffmanTree::enSieve() {
	// std::string grows as needed; a fixed char array could be overrun by a
	// long name typed at the prompt.
	std::string inName, outName;
	cout << "Input file name that you want to code:";
	cin >> inName;
	cout << "Input target file name:";
	cin >> outName;
	if ((input = fopen(inName.c_str(), "rb")) == NULL) {
		cout << "Can not open file." << endl;
		system("pause");
		exit(1);
	}
	// feof() only becomes true after a read has hit the end of the file, so
	// probe one byte to detect an empty source before the target is created.
	if (fgetc(input) == EOF) {
		cout << "Empty source file" << endl;
		system("pause");
		exit(1);
	}
	if ((output = fopen(outName.c_str(), "wb")) == NULL) {
		cout << "Can not open file." << endl;
		system("pause");
		exit(1);
	}

	// Read the source byte by byte and count the occurrences of each value.
	rewind(input);
	size = 0;
	for (unsigned int i = 0; i < N; ++i) {
		leaf[i] = 0;
		index[i] = 0;
	}
	for (unsigned int i = 0; i < M; ++i) {
		nodes[i].weight = 0;
		nodes[i].left = nodes[i].right = nodes[i].parents = M - 1;  // M-1 means "none"
	}
	// Stop on EOF itself (end of file or read error); testing feof() alone
	// would index leaf[] with EOF and never terminate after a read error.
	int c;
	while ((c = fgetc(input)) != EOF) {
		leaf[c]++;
		size++;
	}

	// Give the sentinel slot nodes[M-1] the largest weight.
	nodes[M - 1].weight = MAX;

	// Drop byte values that never occur; fill nodes/index and remap leaf[]
	// from "count" to "node index", counting the leaves in num.
	num = 0;
	for (unsigned int i = 0; i < N; ++i)
		if (leaf[i]) {
			nodes[num].weight = leaf[i];
			leaf[i] = num;
			index[num] = i;
			num++;
		}
	if (!num) {
		cout << "doesn't have a word" << endl;
		system("pause");
		exit(1);
	}
}


// Finds, among nodes[0..pos) that have no parent yet, the two with the
// smallest weight. t1 receives the lightest, t2 the second lightest; ties go
// to the lower index. A slot stays at M-1 (the sentinel) when no candidate
// exists for it.
void HuffmanTree::select(unsigned int pos, unsigned int &t1, unsigned int &t2) {
	const unsigned int none = M - 1;
	t1 = none;
	t2 = none;
	for (unsigned int cur = 0; cur < pos; ++cur) {
		if (nodes[cur].parents != none)
			continue;  // already attached to a parent
		const unsigned long long w = nodes[cur].weight;
		if (w < nodes[t1].weight) {
			// New lightest node: the previous lightest becomes the runner-up.
			t2 = t1;
			t1 = cur;
		} else if (w < nodes[t2].weight) {
			t2 = cur;
		}
	}
}


// Builds the Huffman tree over the num leaves prepared by enSieve() and
// derives the code string of every leaf.
// On return nodes[num..2*num-1) are the internal nodes (the last one is the
// root) and leafCode[i] is a newly allocated NUL-terminated string of
// '0'/'1' for leaf i; unused leafCode slots are NULL.
void HuffmanTree::enSetTree() {
	// Build the tree: each new node joins the two lightest parentless nodes.
	// (i + 1 < num * 2 is i < num * 2 - 1 without wrapping when num is 0.)
	for (unsigned int i = num; i + 1 < num * 2; ++i) {
		unsigned int t1, t2;
		select(i, t1, t2);
		nodes[i].weight = nodes[t1].weight + nodes[t2].weight;
		nodes[i].left = t1;
		nodes[i].right = t2;
		nodes[t1].parents = nodes[t2].parents = i;
	}

	for (unsigned int i = 0; i < N; ++i)
		leafCode[i] = NULL;

	// Scratch buffer for one code. A code has at most num-1 symbols, so num
	// chars (all initialised to '\0') always leave room for the terminator.
	// std::string releases itself, replacing the new[]/delete mismatch.
	std::string scratch(num, '\0');
	for (unsigned int i = 0; i < num; ++i) {
		unsigned int start = num - 1;  // position of the terminating '\0'
		// Walk from the leaf up to the root, filling the code back to front:
		// '0' when the node is its parent's left child, otherwise '1'.
		for (unsigned int c = i, f = nodes[i].parents; f != M - 1; c = f, f = nodes[c].parents)
			scratch[--start] = (nodes[f].left == c) ? '0' : '1';
		leafCode[i] = new char[num - start];  // code length plus terminator
		strcpy(leafCode[i], scratch.c_str() + start);
	}
}


void HuffmanTree::Encode() {//对文件编码
	enSieve();//初始化input，output；统计文件中字符
	enSetTree();//根据enSieve完成huffman树的建立与对字符进行编码

	rewind(output);
	rewind(input);
	//向output的开头中写入树结构
	buf.bits = 0;
	buf.ch = 0;
	fwrite(&size,sizeof(unsigned long long),1,output);//写入size
	write(num, 8);//将树结构中的叶结点个数写入
	for (unsigned int i = 0; i < num; ++i)//将树节点中的叶节点写入
		fwrite(&index[i], sizeof(char), 1, output);
	//选择num最大需要多少位来存储
	unsigned maxbit = 1;
	unsigned int tmp = num * 2 - 1;
	while (tmp) {
		maxbit++;
		tmp >>= 1;
	}
	for (unsigned int i = num; i < num * 2 - 1; ++i) {//写入左右孩子信息
		write(nodes[i].left, maxbit);
		write(nodes[i].right, maxbit);
	}

	//写入编码信息
	unsigned int ch;
	ch = fgetc(input);
	while (!feof(input)) {
		unsigned int start = 0;//判断对ch的编码leafCode[loc]的起始位置
		while (leafCode[leaf[ch]][start] != '\0') {
			if (leafCode[leaf[ch]][start] == '1')write(1);
			else write(0);
			++start;
		}
		ch = fgetc(input);
	}
	writerest();//写入剩下的字符
	cout << "Done!\n\n";
	fclose(input);
	fclose(output);
}

DeCode()函数及其辅助函数的实现部分：

//Decode辅助函数块
// Reports a fatal problem while preparing to decode and terminates the
// program, in the same way as the other error paths of deSieve().
static void deSieveFail(const char *message) {
	cout << message << endl;
	system("pause");
	exit(1);
}

// Prompts for the compressed file and the target file, opens both and
// rebuilds the Huffman tree stored at the start of the compressed file.
// On return size, num, index[0..num) and the left/right/parents links of
// nodes[] are filled in; unset links hold N-1, which Decode() tests for.
// Terminates the program with exit(1) if a file cannot be opened, the input
// is empty, or the stored tree is not valid.
void HuffmanTree::deSieve() {
	// std::string grows as needed; a fixed char array could be overrun by a
	// long name typed at the prompt.
	std::string inName, outName;
	cout << "Input file name that you want to decode:";
	cin >> inName;
	cout << "Input target file name:";
	cin >> outName;
	if ((input = fopen(inName.c_str(), "rb")) == NULL)
		deSieveFail("Can not open file.");
	// feof() only becomes true after a read has hit the end of the file, so
	// probe one byte to detect an empty input before the target is created.
	if (fgetc(input) == EOF)
		deSieveFail("Empty source file");
	if ((output = fopen(outName.c_str(), "wb")) == NULL)
		deSieveFail("Can not open file.");

	// Start reading the tree description.
	rewind(input);
	for (unsigned int i = 0; i < M; ++i) {
		nodes[i].parents = nodes[i].right = nodes[i].left = N - 1;
	}

	buf.bits = 0;  // empty the bit buffer
	buf.ch = 0;
	if (fread(&size, sizeof(unsigned long long), 1, input) != 1)
		deSieveFail("Invalid compressed file.");
	read(num, 8);  // leaf count
	if (num == 0) num = 256;  // 256 does not fit in 8 bits and is stored as 0
	// Read each leaf's byte value as a value. Reading a single byte into the
	// unsigned int with fread would leave its other bytes uninitialised and
	// only land in the low byte on little-endian hosts.
	for (unsigned int i = 0; i < num; ++i) {
		const int c = fgetc(input);
		if (c == EOF)
			deSieveFail("Invalid compressed file.");
		index[i] = static_cast<unsigned int>(c);
	}
	// Number of bits per child index; must match the computation in Encode.
	unsigned maxbit = 1;
	unsigned int tmp = num * 2 - 1;
	while (tmp) {
		maxbit++;
		tmp >>= 1;
	}

	for (unsigned int i = num; i < num * 2 - 1; ++i) {  // children of every internal node
		unsigned int left, right;
		read(left, maxbit);
		read(right, maxbit);
		// The encoder only ever joins two distinct, earlier nodes. Anything
		// else comes from a damaged file and must not be used as an index.
		if (left >= i || right >= i || left == right)
			deSieveFail("Invalid compressed file.");
		nodes[i].left = left;
		nodes[i].right = right;
		nodes[left].parents = nodes[right].parents = i;
	}
}


void HuffmanTree::Decode() {
	deSieve();

	//开始译码
	rewind(output);
	unsigned int tmp;
	read(tmp);
	for (int i = 0; i < size; ++i) {
		unsigned int loc = 2 * num - 2;
		while ((nodes[loc].left != N-1 || nodes[loc].right != N-1) && !feof(input)) {
			if (tmp == 0)loc = nodes[loc].left;
			else loc = nodes[loc].right;
			read(tmp);
		}
		fputc(index[loc], output);
	}
	cout << "Done!\n\n";
	fclose(input);
	fclose(output);
}

写下来大概感受就是注意二进制的长短，以及不要写的头晕了。。。长度各种乱。。。奇葩。。

测试部分：

#pragma warning(disable:4996)
#include<iostream>
#include<cstdio>
#include<cmath>
#include<stack>
#include<queue>
#include<cstring>
#include<sstream>
#include<set>
#include<string>
#include<iterator>
#include<vector>
#include<map>
#include<algorithm>
#include"HuffmanTree.h"
using namespace std;

// Interactive driver: shows a menu until the user picks '3'.
// '1' compresses a file, '2' decompresses one; any other input just shows
// the menu again. A fresh HuffmanTree is used for every round.
int main(void) {
	cout << sizeof(char) << endl;
	char choose = '1';
	while (choose != '3') {
		HuffmanTree tree;
		cout << "1.Huffman Encode" << endl;
		cout << "2.Huffman Decode" << endl;
		cout << "3.exit" << endl;
		// If standard input is closed or fails, `choose` keeps its old value;
		// leave instead of showing the menu forever.
		if (!(cin >> choose))
			break;
		switch (choose) {
		case'1':tree.Encode(); break;
		case'2':tree.Decode(); break;
		default:break;
		}
	}
//	system("pause");
	return 0;
}

测试效果：

第一波（纯文字）：