数据压缩（八）——LZW词典编解码

本文链接：https://blog.csdn.net/weixin_41926958/article/details/105600090

掌握词典编码的基本原理，用C/C++/Python等语言编程实现LZW解码器并分析编解码算法。

（一）背景知识

1.1 树的构成

代码中定义了字典树，结构为：
在这里插入图片描述

1.2 LZW的编码实现过程

在这里插入图片描述

1.3 LZW的解码实现过程

在这里插入图片描述

1.4 编解码的初始词典

以下为初始的词典
在这里插入图片描述

（二）实现代码

生成测试文件"1.dat"代码

#include <cstdio>
#include <iostream>
#include <fstream>
using namespace std;

// Generates the test input "1.dat": the 14 bytes "abbababacabacd" written in
// binary mode. Returns 0 on success and 1 if the file cannot be created or
// the write fails.
int main()
{
	std::ofstream outfile("1.dat", std::ios::binary);
	if (!outfile) {
		std::cout << "error to create file!" << std::endl;
		return 1;	// nothing can be written without an open stream
	}

	const unsigned char sample[] = { 'a','b','b','a','b','a','b','a','c','a','b','a','c','d' };
	outfile.write(reinterpret_cast<const char*>(sample), sizeof(sample));
	outfile.close();
	// close() sets failbit if flushing failed, so the stream state reflects the whole write.
	return outfile ? 0 : 1;
}

bitio.h

#pragma once
/*
 * Declaration for bitwise IO
 *
 * vim: ts=4 sw=4 cindent
 */
#ifndef __BITIO__
#define __BITIO__

#include <stdio.h>

// State needed to read or write a stdio stream one bit at a time.
typedef struct {
    FILE* fp;// underlying stdio stream (not a file name)
    unsigned char mask;// single-bit mask for the current bit position in rack; starts at 0x80 and is shifted right after each bit
    int rack;// byte currently being consumed (input) or assembled (output)
}BITFILE;

// Open functions return NULL on failure; a NULL filename selects stdin (input) or stdout (output).
BITFILE* OpenBitFileInput(char* filename);// opens filename in "rb" mode
BITFILE* OpenBitFileOutput(char* filename);// opens filename in "wb" mode
// Close the stream and free the BITFILE; the output variant first writes a partially filled byte.
void CloseBitFileInput(BITFILE* bf);
void CloseBitFileOutput(BITFILE* bf);
// Read one bit (returns 0 or 1); terminates the process when the stream is at end of file.
int BitInput(BITFILE* bf);
// Read `count` bits, most significant bit first, into the low bits of the result.
unsigned long BitsInput(BITFILE* bf, int count);
// Write one bit; any non-zero `bit` is written as 1.
void BitOutput(BITFILE* bf, int bit);
// Write the low `count` bits of `code`, most significant bit first.
void BitsOutput(BITFILE* bf, unsigned long code, int count);
#endif	// __BITIO__

bitio.cpp

/*
 * Definitions for bitwise IO
 *
 * vim: ts=4 sw=4 cindent
 */

#include <stdlib.h>
#include <stdio.h>
#include "bitio.h"
/*
 * Open a stream for bit-wise reading.
 *
 * filename: path opened in "rb" mode, or NULL to read from stdin.
 * Returns a heap-allocated BITFILE positioned at the first bit, or NULL if
 * the allocation or the open fails. Release it with CloseBitFileInput().
 */
BITFILE* OpenBitFileInput(char* filename) {
	BITFILE* bf = (BITFILE*)malloc(sizeof(BITFILE));
	if (NULL == bf) {
		return NULL;
	}
	bf->fp = NULL;
	if (NULL == filename) {
		bf->fp = stdin;
	}
	else if (0 != fopen_s(&(bf->fp), filename, "rb")) {
		bf->fp = NULL;
	}
	if (NULL == bf->fp) {
		free(bf);	// do not leak the descriptor when no stream could be opened
		return NULL;
	}
	bf->mask = 0x80;	// 0x80 tells BitInput to fetch a fresh byte first
	bf->rack = 0;
	return bf;
}

/*
 * Open a stream for bit-wise writing.
 *
 * filename: path opened in "wb" mode, or NULL to write to stdout.
 * Returns a heap-allocated BITFILE with an empty bit buffer, or NULL if the
 * allocation or the open fails. Release it with CloseBitFileOutput(), which
 * also writes any partially filled byte.
 */
BITFILE* OpenBitFileOutput(char* filename) {
	BITFILE* bf = (BITFILE*)malloc(sizeof(BITFILE));
	if (NULL == bf) {
		return NULL;
	}
	bf->fp = NULL;
	if (NULL == filename) {
		bf->fp = stdout;
	}
	else if (0 != fopen_s(&(bf->fp), filename, "wb")) {
		bf->fp = NULL;
	}
	if (NULL == bf->fp) {
		free(bf);	// do not leak the descriptor when no stream could be opened
		return NULL;
	}
	bf->mask = 0x80;	// next bit goes into the most significant position
	bf->rack = 0;		// no bits collected yet
	return bf;
}

/*
 * Close the stream of an input BITFILE and free the descriptor.
 * A NULL handle is ignored, so the result of a failed OpenBitFileInput()
 * can be passed in safely.
 */
void CloseBitFileInput(BITFILE* bf) {
	if (NULL == bf) return;
	fclose(bf->fp);
	free(bf);
}

/*
 * Write any pending bits, close the stream of an output BITFILE and free
 * the descriptor. A NULL handle is ignored, so the result of a failed
 * OpenBitFileOutput() can be passed in safely.
 */
void CloseBitFileOutput(BITFILE* bf) {
	if (NULL == bf) return;
	// A mask other than 0x80 means rack holds a partially filled byte;
	// its unused low bits are still zero.
	if (0x80 != bf->mask) fputc(bf->rack, bf->fp);
	fclose(bf->fp);
	free(bf);
}

/*
 * Read the next bit from bf, most significant bit of each byte first.
 * Returns 0 or 1. If a new byte is needed and the stream is at end of
 * file, prints a message to stderr and terminates the process.
 */
int BitInput(BITFILE* bf) {
	// A mask of 0x80 means the previous byte is used up (or nothing has been read yet).
	if (bf->mask == 0x80) {
		const int next_byte = fgetc(bf->fp);
		bf->rack = next_byte;
		if (next_byte == EOF) {
			fprintf(stderr, "Read after the end of file reached\n");
			exit(-1);
		}
	}

	const int bit = ((bf->rack & bf->mask) != 0) ? 1 : 0;

	// Advance to the next lower bit, wrapping back to the top bit after the last one.
	bf->mask >>= 1;
	if (bf->mask == 0) {
		bf->mask = 0x80;
	}
	return bit;
}

/*
 * Read `count` bits from bf, most significant bit first, and return them
 * in the low `count` bits of the result.
 *
 * count must not exceed the width of unsigned long; count <= 0 reads
 * nothing and returns 0.
 */
unsigned long BitsInput(BITFILE* bf, int count) {
	unsigned long value = 0UL;
	if (count <= 0) return value;	// shifting by a negative amount would be undefined
	// Unsigned literal: with a 32-bit long, 1L << 31 would shift into the sign bit.
	for (unsigned long mask = 1UL << (count - 1); 0 != mask; mask >>= 1) {
		if (1 == BitInput(bf))
			value |= mask;
	}
	return value;
}

/*
 * Append one bit to bf; any non-zero `bit` is stored as 1.
 * Bits fill rack from the most significant position downwards, and the
 * byte is written to the stream once eight bits have been collected.
 */
void BitOutput(BITFILE* bf, int bit) {
	if (bit != 0) {
		bf->rack |= bf->mask;
	}
	bf->mask >>= 1;
	if (bf->mask != 0) {
		return;	// byte not complete yet
	}
	// Eight bits collected: emit the byte and start a fresh one.
	fputc(bf->rack, bf->fp);
	bf->rack = 0;
	bf->mask = 0x80;
}

/*
 * Write the low `count` bits of `code` to bf, most significant bit first.
 *
 * count must not exceed the width of unsigned long; count <= 0 writes
 * nothing.
 */
void BitsOutput(BITFILE* bf, unsigned long code, int count) {
	if (count <= 0) return;	// shifting by a negative amount would be undefined
	// Unsigned literal: with a 32-bit long, 1L << 31 would shift into the sign bit.
	for (unsigned long mask = 1UL << (count - 1); 0 != mask; mask >>= 1) {
		BitOutput(bf, (0 == (code & mask)) ? 0 : 1);
	}
}
#if 0
/*
 * Disabled self-test for the bit I/O layer: copies input to output one bit
 * at a time and echoes the bits to stderr in groups of eight.
 *
 * argv[1]: input file (stdin when absent); argv[2]: output file (stdout
 * when absent). The copy loop has no exit of its own: BitInput() terminates
 * the process when the input is exhausted.
 */
int main(int argc, char** argv) {
	BITFILE* bfi, * bfo;
	int bit;
	int count = 0;

	// The Open functions take only the file name and return the handle.
	if (1 < argc) {
		bfi = OpenBitFileInput(argv[1]);
		if (NULL == bfi) {
			fprintf(stderr, "fail open the file\n");
			return -1;
		}
	}
	else {
		bfi = OpenBitFileInput(NULL);
		if (NULL == bfi) {
			fprintf(stderr, "fail open stdin\n");
			return -2;
		}
	}
	if (2 < argc) {
		bfo = OpenBitFileOutput(argv[2]);
		if (NULL == bfo) {
			fprintf(stderr, "fail open file for output\n");
			CloseBitFileInput(bfi);
			return -3;
		}
	}
	else {
		bfo = OpenBitFileOutput(NULL);
		if (NULL == bfo) {
			fprintf(stderr, "fail open stdout\n");
			CloseBitFileInput(bfi);
			return -4;
		}
	}
	while (1) {
		bit = BitInput(bfi);
		fprintf(stderr, "%d", bit);
		count++;
		if (0 == (count & 7))fprintf(stderr, " ");
		BitOutput(bfo, bit);
	}
	return 0;
}
#endif

lzw_E.cpp

/*
 * Definition for LZW coding
 *
 * vim: ts=4 sw=4 cindent nowrap
 */
#include <stdio.h>
#include <stdlib.h>

#include <cstdio>
#include <iostream>
#include <vector>

#include "bitio.h"
// Largest dictionary index; the dictionary array below has MAX_CODE + 1 entries.
#define MAX_CODE 65535

using namespace std;

// Dictionary stored as a tree in a flat array, indexed by code.
// A value of -1 in any field means "none".
struct {
	int suffix = -1;// last character of the phrase this entry stands for
	int parent = -1, firstchild = -1, nextsibling = -1;// prefix entry, first entry extending this one, next entry with the same parent
} dictionary[MAX_CODE + 1];
int next_code;	// index where AddToDictionary stores the next new entry
int d_stack[MAX_CODE]; // stack for decoding a phrase

// Read / write one code as a fixed 16-bit field.
#define input(f) ((int)BitsInput( f, 16))
#define output(f, x) BitsOutput( f, (unsigned long)(x), 16)

int DecodeString(int start, int code);
void InitDictionary(void);

// Print every entry added after the 256 single-character entries to stdout,
// one per line, as "<code>-><phrase>".
void PrintDictionary(void) {
	for (int code = 256; code < next_code; ++code) {
		// DecodeString stores the phrase back to front in d_stack.
		const int length = DecodeString(0, code);
		printf("%4d->", code);
		for (int pos = length - 1; pos >= 0; --pos) {
			printf("%c", (char)(d_stack[pos]));
		}
		printf("\n");
	}
}

// Expand `code` into d_stack, beginning at index `start`, by following the
// parent links up to a root entry. Characters are stored last-to-first.
// Returns the index one past the last element written, so with start == 0
// the result is the phrase length.
int DecodeString(int start, int code) {
	int top = start;
	for (int node = code; node >= 0; node = dictionary[node].parent) {
		d_stack[top] = dictionary[node].suffix;
		++top;
	}
	return top;
}

// Walk from `new_code` up to its root entry. On return `character` holds the
// suffix of the root, i.e. the first character of the phrase, and the return
// value is the number of entries visited (the phrase length).
// `character` is left untouched when new_code is negative.
int FindCharacter(int new_code,int &character)
{
	int depth = 0;
	for (int node = new_code; node >= 0; node = dictionary[node].parent) {
		character = dictionary[node].suffix;
		++depth;
	}
	return depth;
}

// Write the phrase represented by `new_code` to fp in reading order.
//
// phrase_length: number of characters to emit, counted from the end of the
//                phrase; callers pass the full phrase length.
// A non-positive length writes nothing. If the parent chain is shorter than
// phrase_length, only the characters that actually exist are written.
void PrintCW(int phrase_length,int new_code,FILE* fp)
{
	if (phrase_length <= 0) return;

	// The chain yields characters last-to-first, so fill the buffer from the back.
	// std::vector releases the buffer automatically (the raw new[] was never freed).
	std::vector<unsigned char> phrase(static_cast<std::size_t>(phrase_length));
	std::size_t first = phrase.size();
	for (int code = new_code; first > 0 && code >= 0; code = dictionary[code].parent) {
		--first;
		phrase[first] = static_cast<unsigned char>(dictionary[code].suffix);
	}
	fwrite(phrase.data() + first, 1, phrase.size() - first, fp);
}

//初始化字典
void InitDictionary(void) {
	int i;

	/*初始字典中每一个节点的根节点都是自身，兄弟姐妹就是自己+1，
	只有第256个没有兄弟姐妹*/
	for (i = 0; i < 256; i++) {
		dictionary[i].suffix = i;
		dictionary[i].parent = -1;
		dictionary[i].firstchild = -1;
		dictionary[i].nextsibling = i + 1;
	}
	dictionary[255].nextsibling = -1;
	next_code = 256;	//下一个要进行插入的suffix就是256了
}
/*
 * Look up the phrase "string + character" in the dictionary.
 * Input:  string_code - code of the prefix string (negative: no prefix)
 *         character   - character appended to that prefix
 * Output: the code of string+character, or -1 if it is not present.
 *         With no prefix the character itself is returned as the code.
 */
int InDictionary(int character, int string_code) {
	if (string_code < 0) {
		return character;
	}
	// Scan the prefix's children for one whose suffix matches.
	for (int child = dictionary[string_code].firstchild; child > -1; child = dictionary[child].nextsibling) {
		if (dictionary[child].suffix == character) {
			return child;
		}
	}
	return -1;
}

void AddToDictionary(int character, int string_code) {
	int firstsibling, nextsibling;
	if (0 > string_code) return;
	dictionary[next_code].suffix = character;			//尾缀字符添加为character
	dictionary[next_code].parent = string_code;			//前缀为string_code
	dictionary[next_code].nextsibling = -1;				//没有nextsibling
	dictionary[next_code].firstchild = -1;				//没有firstchild
	firstsibling = dictionary[string_code].firstchild;	//firstsibling是查找string_code得到的firstchild
	if (-1 < firstsibling) {	// the parent has child			
		nextsibling = firstsibling;								
		while (-1 < dictionary[nextsibling].nextsibling)		
			nextsibling = dictionary[nextsibling].nextsibling;	
		dictionary[nextsibling].nextsibling = next_code;
	}
	else {// no child before, modify it to be the first
		dictionary[string_code].firstchild = next_code;
	}
	next_code++;
}

// LZW-encode the whole of fp into bf.
// Output layout: the input length as a 32-bit field, followed by one 16-bit
// code per emitted phrase. The final (possibly negative, i.e. empty) prefix
// is always emitted as the last code. The dictionary built during encoding
// is printed to stdout at the end.
void LZWEncode(FILE* fp, BITFILE* bf) {
	// Measure the input, then rewind to the start for the actual pass.
	fseek(fp, 0, SEEK_END);
	const unsigned long total_bytes = ftell(fp);
	fseek(fp, 0, SEEK_SET);
	BitsOutput(bf, total_bytes, 4 * 8);

	InitDictionary();
	int prefix = -1;	// code of the longest match so far; -1 = nothing matched yet
	for (int symbol = fgetc(fp); symbol != EOF; symbol = fgetc(fp)) {
		const int match = InDictionary(symbol, prefix);
		if (match >= 0) {
			// prefix+symbol is known: keep extending the match.
			prefix = match;
			continue;
		}
		// prefix+symbol is new: emit the prefix and, while there is room,
		// record the new phrase; then restart matching from this symbol.
		output(bf, prefix);
		if (next_code < MAX_CODE) {
			AddToDictionary(symbol, prefix);
		}
		prefix = symbol;
	}
	output(bf, prefix);
	PrintDictionary();
}

// Decode an LZW stream produced by LZWEncode from bf and write the original
// bytes to fp.
// Stream layout: a 32-bit original length followed by 16-bit codes.
// Decoding stops once that many bytes have been produced, or when an invalid
// code is met (a message is then printed to stderr). The dictionary rebuilt
// during decoding is printed to stdout at the end.
void LZWDecode(BITFILE* bf, FILE* fp) {
	int character = -1;			// first character of the phrase being handled
	int new_code, last_code;	// current code word (CW) and previous code word (PW)
	int phrase_length;			// number of bytes produced by the current code
	unsigned long file_length;
	unsigned long decode_length;	// bytes produced so far

	file_length = BitsInput(bf, 4 * 8);
	// An all-ones 32-bit length is what the encoder writes when ftell() fails;
	// compare against the 32-bit pattern so this also works with a 64-bit unsigned long.
	if (0xFFFFFFFFUL == file_length) file_length = 0;
	InitDictionary();
	if (0 == file_length) {
		// Empty original: the encoder still wrote one placeholder code, which
		// must not be turned into an output byte.
		PrintDictionary();
		return;
	}

	new_code = input(bf);
	if (255 < new_code) {	// the first code can only be a single character
		fprintf(stderr, "corrupt LZW stream: unexpected code %d\n", new_code);
		return;
	}
	last_code = new_code;
	fprintf(fp, "%c", new_code);
	decode_length = 1;
	while (decode_length < file_length)
	{
		new_code = input(bf);
		if (new_code < next_code)	// code already in the dictionary
		{
			phrase_length = FindCharacter(new_code, character);
			PrintCW(phrase_length, new_code, fp);
			// New entry: previous phrase + first character of this phrase.
			// Stop growing at the same limit the encoder uses.
			if (MAX_CODE > next_code) AddToDictionary(character, last_code);
		}
		else if (new_code == next_code && MAX_CODE > next_code)
		{
			// The code is the very entry about to be created:
			// previous phrase + its own first character.
			phrase_length = FindCharacter(last_code, character) + 1;
			AddToDictionary(character, last_code);
			PrintCW(phrase_length, new_code, fp);
		}
		else	// a code beyond the next free entry cannot come from the encoder
		{
			fprintf(stderr, "corrupt LZW stream: unexpected code %d\n", new_code);
			break;
		}
		last_code = new_code;
		decode_length += (unsigned long)phrase_length;
	}
	PrintDictionary();
}

// Command-line entry point.
//   argv[1]: "E" to encode or "D" to decode (only the first character is checked)
//   argv[2]: input file name
//   argv[3]: output file name
// Returns 0 on success and -1 on a usage error, an unsupported operation or
// a file that cannot be opened.
int main(int argc, char** argv) {
	FILE* fp = NULL;	// plain byte stream: encoder input / decoder output
	BITFILE* bf = NULL;	// bit stream: encoder output / decoder input

	if (4 > argc) {
		fprintf(stdout, "usage: \n%s <o> <ifile> <ofile>\n", argv[0]);
		fprintf(stdout, "\t<o>: E or D reffers encode or decode\n");
		fprintf(stdout, "\t<ifile>: input file name\n");
		fprintf(stdout, "\t<ofile>: output file name\n");
		return -1;
	}
	if ('E' == argv[1][0]) { // do encoding
		// Open the input first so a bad input path does not create or truncate the output.
		if (0 != fopen_s(&fp, argv[2], "rb") || NULL == fp) {
			fprintf(stderr, "cannot open input file %s\n", argv[2]);
			return -1;
		}
		bf = OpenBitFileOutput(argv[3]);
		if (NULL == bf) {
			fprintf(stderr, "cannot open output file %s\n", argv[3]);
			fclose(fp);
			return -1;
		}
		LZWEncode(fp, bf);
		fclose(fp);
		CloseBitFileOutput(bf);
		fprintf(stdout, "encoding done\n");
	}
	else if ('D' == argv[1][0]) {	// do decoding
		bf = OpenBitFileInput(argv[2]);
		if (NULL == bf) {
			fprintf(stderr, "cannot open input file %s\n", argv[2]);
			return -1;
		}
		if (0 != fopen_s(&fp, argv[3], "wb") || NULL == fp) {
			fprintf(stderr, "cannot open output file %s\n", argv[3]);
			CloseBitFileInput(bf);
			return -1;
		}
		LZWDecode(bf, fp);
		fclose(fp);
		CloseBitFileInput(bf);
		fprintf(stdout, "decoding done\n");
	}
	else {	// otherwise
		fprintf(stderr, "not supported operation\n");
		return -1;
	}
	return 0;
}

（三）关键代码分析

3.1 调试属性设置

在这里插入图片描述

3.2 main()函数分析

在这里插入图片描述

3.3 LZWEncode()函数分析

在这里插入图片描述

3.4 LZWDecode()函数分析

在这里插入图片描述

3.5 各辅助函数分析

OpenBitFileOutput()函数分析

作用就是打开需要进行输出的编码二进制文件。

OpenBitFileInput()函数分析

作用就是打开需要进行解码的编码二进制文件。

BitsOutput()函数分析

在这里插入图片描述

InDictionary()函数分析

在这里插入图片描述

AddToDictionary()函数分析

在这里插入图片描述

PrintDictionary()函数分析

在这里插入图片描述

FindCharacter()函数分析

在这里插入图片描述

PrintCW()函数分析

作用是在解码时，将码字new_code所对应的字符串（短语）按正确的字符顺序写入输出的二进制文件中。

（四）实验结果

4.1 测试代码是否可以正确编解码

阶段	文件1二进制查看
编码前
编码后
解码后

文件1编码程序运行	文件1解码程序运行

文件2压缩解压缩前 txt查看	文件2压缩解压缩后 txt查看

可以发现，编解码的代码是正确的，可以进行下一步LZW压缩效率的分析。

4.2 对10种不同格式的文件进行压缩

4.2.1 原始文件

在这里插入图片描述

4.2.2 压缩文件

在这里插入图片描述

4.2.3 压缩效率分析

在这里插入图片描述
可以看出，LZW算法对一部分文件有非常好的压缩效果，比如xls文件，压缩效率高达74.98%；但对另一部分文件则起到相反的效果，不仅没有压缩，反而使数据量变大，比如caj文件，压缩效率为-26.41%。
猜测原因：受文件自身编码格式的影响，一部分文件中字符串重复出现的概率很高，而另一部分文件中重复出现的概率很低，造成了LZW在压缩不同文件时截然相反的效果。另外，本程序中每个码字固定用16比特输出，如果文件中几乎没有重复的字符串，那么大部分码字只代表一两个字节，输出的数据量就会大于原始数据量。