LZW基本概念
LZW压缩编码是一种先进的数据压缩技术,属于无损压缩编码,该编码主要用于图像数据的压缩。对于简单图像和平滑且噪声小的信号源具有较高的压缩比,并且有较高的压缩和解压缩速度。
1977年,两位以色列教授Lempel和Ziv提出了查找冗余字符和用较短的符号标记替代冗余字符的概念。1985年,由Welch加以充实而形成Lempel-Ziv-Welch算法,简称“LZW”技术。
LZW算法采用动态的建立字典的方法,依次读入原文件的字符序列,每次碰到新的连续的字符串,就在字典中加入标示,当下次再次遇到这种字符串时,就可以用字典索引序号直接代替字符串,写入压缩文件中。在这里引入两个名词: “string”,“char”;string表示前缀,char 表示新读入的字符,每个字典索引对应一对(string,char);
LZW编码原理
如上流程图:
1:将词典初始化为包含所有可能的单字符,当前前缀P初始化为空。
2:当前字符C=字符流中的下一个字符。
3:判断P+C是否在词典中
(1)如果“是”,则用C扩展P,即让P=P+C,返回到步骤2。
(2)如果“否”,则输出与当前前缀P相对应的码字W;将P+C添加到词典中;令P=C,并返回到步骤2
解码流程
1:在开始译码时词典包含所有可能的前缀根。
2:令CW:=码字流中的第一个码字。
3:输出当前缀-符串string.CW到字符流。
4:先前码字PW:=当前码字CW。
5:当前码字CW:=码字流的下一个码字。
6:判断当前缀-符串string.CW 是否在词典中。
(1)如果”是”,则把当前缀-符串string.CW输出到字符流。
当前前缀P:=先前缀-符串string.PW。
当前字符C:=当前前缀-符串string.CW的第一个字符。
把缀-符串P+C添加到词典。
(2)如果”否”,则当前前缀P:=先前缀-符串string.PW。
当前字符C:=当前缀-符串string.CW的第一个字符(此时string.CW尚不在词典中,它等于string.PW加上string.PW的第一个字符,因此C也就是string.PW的第一个字符)。
输出缀-符串P+C到字符流,然后把它添加到词典中。
7:判断码字流中是否还有码字要译。
(1)如果”是”,就返回步骤4。
(2)如果”否”,结束。
代码实现
bitio.h
//
/*
* Declaration for bitwise IO
*
* vim: ts=4 sw=4 cindent
*/
#ifndef __BITIO__
#define __BITIO__
#define MAX_CODE 65535
#include <iostream>
#include <stdio.h>
#include <cstdio>
/*
 * State for bit-granular access to a stdio stream.
 * Bits are consumed and produced most-significant bit first within each byte.
 */
typedef struct { // bit-file descriptor shared by the input and output routines
FILE* fp; // underlying stdio stream
unsigned char mask; // single-bit mask selecting the current bit of rack; starts at 0x80
int rack; // byte currently being read from, or assembled for, fp
}BITFILE;
/* Open filename for binary reading (NULL selects stdin); returns NULL on failure. */
BITFILE* OpenBitFileInput(char* filename);
/* Open filename for binary writing (NULL selects stdout); returns NULL on failure. */
BITFILE* OpenBitFileOutput(char* filename);
/* Close the stream of an input bit file and free the descriptor. */
void CloseBitFileInput(BITFILE* bf);
/* Write any partially filled byte, close the stream and free the descriptor. */
void CloseBitFileOutput(BITFILE* bf);
/* Read one bit (0 or 1); terminates the process if the stream is exhausted. */
int BitInput(BITFILE* bf);
/* Read count bits, most significant first, and return them as an integer. */
unsigned long BitsInput(BITFILE* bf, int count);
/* Append one bit (any non-zero value counts as 1). */
void BitOutput(BITFILE* bf, int bit);
/* Write the low count bits of code, most significant first. */
void BitsOutput(BITFILE* bf, unsigned long code, int count);
#endif // __BITIO__
bitio.cpp
/*
* Definitions for bitwise IO
*
* vim: ts=4 sw=4 cindent
*/
#include <stdlib.h>
#include <stdio.h>
#include "bitio.h"
/*
 * Open a stream for bit-by-bit reading.
 *
 * filename: path opened in binary read mode ("rb"); NULL selects stdin.
 *
 * Returns a heap-allocated BITFILE positioned before the first bit (release
 * it with CloseBitFileInput), or NULL if the allocation or the open fails.
 * On failure nothing is leaked: the descriptor is freed before returning.
 */
BITFILE* OpenBitFileInput(char* filename) {
    BITFILE* bf = (BITFILE*)malloc(sizeof(BITFILE));
    if (NULL == bf) {
        return NULL;
    }
    bf->fp = NULL;
    if (NULL == filename) {
        bf->fp = stdin;
    }
    else {
        // Keep the real error code; the previous "err = fopen_s(...) != 0"
        // stored only the result of the comparison.
        errno_t err = fopen_s(&bf->fp, filename, "rb");
        if (0 != err) {
            bf->fp = NULL;
        }
    }
    if (NULL == bf->fp) {
        free(bf); // the descriptor was leaked on this path before
        return NULL;
    }
    bf->mask = 0x80; // start at the most significant bit of the first byte
    bf->rack = 0;
    return bf;
}
/*
 * Open a stream for bit-by-bit writing.
 *
 * filename: path opened in binary write mode ("wb"); NULL selects stdout.
 *
 * Returns a heap-allocated BITFILE with an empty bit buffer (release it with
 * CloseBitFileOutput so the last partial byte is written), or NULL if the
 * allocation or the open fails. On failure the descriptor is freed.
 */
BITFILE* OpenBitFileOutput(char* filename) {
    BITFILE* bf = (BITFILE*)malloc(sizeof(BITFILE));
    if (NULL == bf) {
        return NULL;
    }
    bf->fp = NULL;
    if (NULL == filename) {
        bf->fp = stdout;
    }
    else {
        // Keep the real error code; the previous "err = fopen_s(...) != 0"
        // stored only the result of the comparison.
        errno_t err = fopen_s(&bf->fp, filename, "wb");
        if (0 != err) {
            bf->fp = NULL;
        }
    }
    if (NULL == bf->fp) {
        free(bf); // the descriptor was leaked on this path before
        return NULL;
    }
    bf->mask = 0x80; // next bit written goes to the most significant position
    bf->rack = 0;    // no bits accumulated yet
    return bf;
}
/*
 * Close an input bit file: closes the underlying stream and frees the
 * descriptor. Passing NULL is a no-op, mirroring free(NULL).
 */
void CloseBitFileInput(BITFILE* bf) {
    if (NULL == bf) {
        return;
    }
    fclose(bf->fp);
    free(bf);
}
/*
 * Close an output bit file: writes the partially filled last byte (unused
 * low bits are zero), closes the underlying stream and frees the descriptor.
 * Passing NULL is a no-op, mirroring free(NULL).
 */
void CloseBitFileOutput(BITFILE* bf) {
    if (NULL == bf) {
        return;
    }
    // mask != 0x80 means at least one bit is pending in rack
    if (0x80 != bf->mask) {
        fputc(bf->rack, bf->fp);
    }
    fclose(bf->fp);
    free(bf);
}
/*
 * Read the next bit from bf, most significant bit of each byte first.
 *
 * Returns 0 or 1. If a new byte is needed and the stream is exhausted, a
 * message is printed to stderr and the process exits with status -1.
 */
int BitInput(BITFILE* bf) {
    int bit_is_set;
    // A mask of 0x80 means the previous byte is used up: fetch the next one.
    if (bf->mask == 0x80) {
        int next_byte = fgetc(bf->fp);
        bf->rack = next_byte;
        if (next_byte == EOF) {
            fprintf(stderr, "Read after the end of file reached\n");
            exit(-1);
        }
    }
    bit_is_set = (bf->rack & bf->mask) != 0;
    // Advance to the next lower bit, wrapping back to the top bit.
    bf->mask = (unsigned char)((bf->mask > 1) ? (bf->mask >> 1) : 0x80);
    return bit_is_set ? 1 : 0;
}
/*
 * Read count bits from bf, most significant first, and return them as an
 * unsigned integer.
 *
 * count <= 0 reads nothing and returns 0. If count exceeds the width of
 * unsigned long, all count bits are still consumed and only the low-order
 * bits that fit are returned.
 *
 * The value is built by shifting an unsigned accumulator one position per
 * bit. The former "1L << (count - 1)" was undefined for count <= 0 and
 * overflowed signed long for count == 32 where long is 32 bits wide.
 */
unsigned long BitsInput(BITFILE* bf, int count) {
    unsigned long value = 0UL;
    int i;
    for (i = 0; i < count; i++) {
        value = (value << 1) | (unsigned long)BitInput(bf);
    }
    return value;
}
/*
 * Append one bit to bf. Any non-zero value of bit is written as 1.
 * Bits are packed most significant first; each completed byte is written
 * to the underlying stream with fputc.
 */
void BitOutput(BITFILE* bf, int bit) {
    if (bit) {
        bf->rack |= bf->mask;
    }
    if (bf->mask > 1) {
        // room left in the current byte: just move to the next lower bit
        bf->mask >>= 1;
        return;
    }
    // the lowest bit was just filled: emit the byte and start a new one
    fputc(bf->rack, bf->fp);
    bf->rack = 0;
    bf->mask = 0x80;
}
/*
 * Write the low count bits of code to bf, most significant first.
 *
 * count <= 0 writes nothing. If count exceeds the width of unsigned long,
 * the positions above that width are written as 0 bits.
 *
 * Each bit is extracted with an unsigned right shift. The former
 * "1L << (count - 1)" was undefined for count <= 0 and overflowed signed
 * long for count == 32 where long is 32 bits wide.
 */
void BitsOutput(BITFILE* bf, unsigned long code, int count) {
    const int width = (int)(sizeof(unsigned long) * 8); // assumes 8-bit bytes
    int pos;
    for (pos = count - 1; pos >= 0; pos--) {
        int bit = (pos < width) ? (int)((code >> pos) & 1UL) : 0;
        BitOutput(bf, bit);
    }
}
#if 0
/*
 * Disabled self-test for the bit I/O layer: copies the input to the output
 * one bit at a time and echoes the bits to stderr in groups of eight.
 *
 * usage: prog [ifile [ofile]]   (stdin / stdout are used when omitted)
 *
 * The open calls now match the one-argument prototypes in bitio.h and their
 * results are stored; the old two-argument form left bfi/bfo unassigned.
 */
int main(int argc, char** argv) {
    BITFILE* bfi;
    BITFILE* bfo;
    int bit;
    int count = 0;
    if (1 < argc) {
        bfi = OpenBitFileInput(argv[1]);
        if (NULL == bfi) {
            fprintf(stderr, "fail open the file\n");
            return -1;
        }
    }
    else {
        bfi = OpenBitFileInput(NULL);
        if (NULL == bfi) {
            fprintf(stderr, "fail open stdin\n");
            return -2;
        }
    }
    if (2 < argc) {
        bfo = OpenBitFileOutput(argv[2]);
        if (NULL == bfo) {
            fprintf(stderr, "fail open file for output\n");
            CloseBitFileInput(bfi);
            return -3;
        }
    }
    else {
        bfo = OpenBitFileOutput(NULL);
        if (NULL == bfo) {
            fprintf(stderr, "fail open stdout\n");
            CloseBitFileInput(bfi);
            return -4;
        }
    }
    // The loop has no exit of its own: BitInput terminates the process when
    // the input is exhausted, so the return below is never reached.
    while (1) {
        bit = BitInput(bfi);
        fprintf(stderr, "%d", bit);
        count++;
        if (0 == (count & 7))fprintf(stderr, " ");
        BitOutput(bfo, bit);
    }
    return 0;
}
#endif
lzw_E.cpp
/*
* Definition for LZW coding
*
* vim: ts=4 sw=4 cindent nowrap
*/
#include <cstdio>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "bitio.h"
using namespace std;
/*
 * LZW dictionary, indexed by code. Each entry is one phrase, stored as a
 * node of a tree linked through firstchild / nextsibling.
 * Codes 0..255 are the single-character roots set up by InitDictionary.
 */
struct {
int suffix; // last character of the phrase
int parent; // code of the phrase without its last character; -1 for roots
int firstchild, nextsibling; // tree links; -1 means "none"
} dictionary[MAX_CODE + 1];
int next_code; // next unused dictionary index
int d_stack[MAX_CODE]; // stack for decoding a phrase
#define input(f) ((int)BitsInput( f, 16))// read one 16-bit code from bit file f
#define output(f, x) BitsOutput( f, (unsigned long)(x), 16)// write x to bit file f as one 16-bit code
int DecodeString(int start, int code);
void InitDictionary(void);
/*
 * Print the dictionary built so far: every entry from code 256 up to
 * next_code - 1 is written to stdout as "<code>-><phrase>", one per line.
 */
void PrintDictionary(void)
{
    int code = 256;
    while (code < next_code) {
        // DecodeString leaves the phrase reversed in d_stack[0..length-1]
        int pos = DecodeString(0, code) - 1;
        printf("%4d->", code);
        for (; pos >= 0; pos--) {
            printf("%c", (char)(d_stack[pos]));
        }
        printf("\n");
        code++;
    }
}
/*
 * Expand the phrase for code into d_stack, starting at index start.
 *
 * The parent chain is followed from code up to a root, storing one suffix
 * per step, so the phrase ends up in reverse order (last character first).
 * A negative code stores nothing.
 *
 * Returns the index one past the last element written.
 */
int DecodeString(int start, int code) {
    int top = start;
    for (; code >= 0; code = dictionary[code].parent) {
        d_stack[top++] = dictionary[code].suffix;
    }
    return top;
}
/*
 * Reset the dictionary to its initial state: codes 0..255 become
 * single-character roots and next_code is set to 256.
 */
void InitDictionary(void) {
    int code;
    for (code = 255; code >= 0; code--) {
        dictionary[code].suffix = code;      // the character itself
        dictionary[code].parent = -1;        // roots have no parent
        dictionary[code].firstchild = -1;    // no longer phrases yet
        // roots are chained in ascending order; the last one ends the chain
        dictionary[code].nextsibling = (255 == code) ? -1 : code + 1;
    }
    next_code = 256;
}
/*
 * Look up the phrase "string followed by character" in the dictionary.
 *
 * character:   the character appended to the string
 * string_code: code of the string; a negative value means the empty string,
 *              in which case the character is its own code
 *
 * Returns the code of string+character, or -1 if it is not in the dictionary.
 */
int InDictionary(int character, int string_code) {
    int node;
    if (string_code < 0) {
        return character;
    }
    // scan the children of string_code for one ending in character
    for (node = dictionary[string_code].firstchild; node >= 0; node = dictionary[node].nextsibling) {
        if (dictionary[node].suffix == character) {
            return node;
        }
    }
    return -1;
}
/*
 * Add the phrase "string followed by character" to the dictionary under the
 * code next_code, then advance next_code.
 *
 * character:   last character of the new phrase
 * string_code: code of the prefix string; nothing is added when it is negative
 *
 * The new entry is appended at the end of the prefix's child list. The
 * function does not check next_code against the dictionary size.
 */
void AddToDictionary(int character, int string_code)
{
    int* link;
    if (string_code < 0) {
        return;
    }
    dictionary[next_code].suffix = character;
    dictionary[next_code].parent = string_code;
    dictionary[next_code].nextsibling = -1;
    dictionary[next_code].firstchild = -1;
    // Find the first empty link: the parent's firstchild if it has no
    // children, otherwise the nextsibling of its last child.
    link = &dictionary[string_code].firstchild;
    while (*link >= 0) {
        link = &dictionary[*link].nextsibling;
    }
    *link = next_code;
    next_code++;
}
/*
 * LZW-compress the whole of fp into bf.
 *
 * fp: seekable input stream opened in binary mode; it is rewound and read
 *     to the end
 * bf: output bit file
 *
 * Output layout: a 32-bit length of the original data, then one 16-bit code
 * per emitted phrase. Lengths that do not fit in 32 bits cannot be
 * represented by this header.
 *
 * If the length cannot be determined (fseek/ftell fail, e.g. on a pipe), a
 * message is printed to stderr and nothing is written. An empty input
 * produces the header only; previously it was followed by the bogus code -1.
 */
void LZWEncode(FILE* fp, BITFILE* bf) {
    int character;
    int string_code = -1; // -1: no prefix yet, the next character starts a phrase
    long end_pos;

    if (0 != fseek(fp, 0, SEEK_END)) {
        fprintf(stderr, "cannot determine input file length\n");
        return;
    }
    end_pos = ftell(fp);
    if (0 > end_pos || 0 != fseek(fp, 0, SEEK_SET)) {
        fprintf(stderr, "cannot determine input file length\n");
        return;
    }
    BitsOutput(bf, (unsigned long)end_pos, 4 * 8);
    InitDictionary();
    while (EOF != (character = fgetc(fp))) {
        int index = InDictionary(character, string_code);
        if (0 <= index) {
            // string+character is already known: keep extending the prefix
            string_code = index;
        }
        else {
            // string+character is new: emit the prefix and remember the phrase
            output(bf, string_code);
            if (MAX_CODE > next_code) { // free space in the dictionary
                AddToDictionary(character, string_code);
            }
            string_code = character;
        }
    }
    // flush the last pending prefix; there is none when the input was empty
    if (0 <= string_code) {
        output(bf, string_code);
    }
}
/*
 * Decompress an LZW stream from bf into fp.
 *
 * bf: input bit file: a 32-bit original length followed by 16-bit codes
 * fp: output stream opened in binary mode
 *
 * Decoding stops once the announced number of bytes has been written. A
 * header of 0xFFFFFFFF is treated as length 0. The former "-1 ==
 * file_length" test had this effect only where long is 32 bits wide.
 *
 * A code that cannot occur in a valid stream makes the function print a
 * message to stderr and return early. Invalid codes are: a code above
 * next_code, a code >= MAX_CODE, or the not-yet-defined code next_code as
 * the very first code. The output is capped at the announced length so a
 * damaged stream cannot make the unsigned byte counter wrap around.
 */
void LZWDecode(BITFILE* bf, FILE* fp) {
    int character = -1;  // first character of the most recently decoded phrase
    int new_code;
    int last_code = -1;  // previously decoded code; -1 before the first one
    int phrase_length;
    unsigned long file_length; // bytes still to be written

    file_length = BitsInput(bf, 4 * 8);
    if (0xFFFFFFFFUL == file_length) {
        file_length = 0;
    }
    InitDictionary();
    while (0 < file_length) {
        new_code = input(bf);
        if (new_code > next_code || new_code >= MAX_CODE
            || (new_code == next_code && 0 > last_code)) {
            fprintf(stderr, "invalid code %d in compressed stream\n", new_code);
            return;
        }
        if (new_code == next_code) {
            // The code is the entry about to be created: the phrase is the
            // previous phrase followed by its own first character.
            d_stack[0] = character;
            phrase_length = DecodeString(1, last_code);
        }
        else {
            phrase_length = DecodeString(0, new_code);
        }
        // d_stack holds the phrase reversed, so its top is the first character
        character = d_stack[phrase_length - 1];
        while (0 < phrase_length && 0 < file_length) {
            phrase_length--;
            fputc(d_stack[phrase_length], fp);
            file_length--;
        }
        if (MAX_CODE > next_code) { // add previous phrase + first character
            AddToDictionary(character, last_code);
        }
        last_code = new_code;
    }
}
/*
 * Command-line driver.
 *
 * usage: prog <o> <ifile> <ofile>
 *   <o>      operation, selected by its first character: 'E' encodes, 'D' decodes
 *   <ifile>  input file name
 *   <ofile>  output file name
 *
 * Returns 0 on success and -1 on a usage error, an unsupported operation or
 * a file that cannot be opened. Open failures are reported on stderr, and
 * whichever file was already open is closed before returning.
 */
int main(int argc, char** argv) {
    FILE* fp = NULL;
    BITFILE* bf = NULL;
    if (4 > argc) { // program name plus three arguments are required
        fprintf(stdout, "usage: \n%s <o> <ifile> <ofile>\n", argv[0]);
        fprintf(stdout, "\t<o>: E or D reffers encode or decode\n");
        fprintf(stdout, "\t<ifile>: input file name\n");
        fprintf(stdout, "\t<ofile>: output file name\n");
        return -1;
    }
    if ('E' == argv[1][0]) { // encode: plain file in, bit file out
        errno_t err = fopen_s(&fp, argv[2], "rb");
        if (0 != err || NULL == fp) {
            fprintf(stderr, "cannot open input file %s\n", argv[2]);
            return -1;
        }
        bf = OpenBitFileOutput(argv[3]);
        if (NULL == bf) {
            fprintf(stderr, "cannot open output file %s\n", argv[3]);
            fclose(fp);
            return -1;
        }
        LZWEncode(fp, bf);
        fclose(fp);
        CloseBitFileOutput(bf); // also writes the last partial byte
        fprintf(stdout, "encoding done\n");
    }
    else if ('D' == argv[1][0]) { // decode: bit file in, plain file out
        errno_t err;
        bf = OpenBitFileInput(argv[2]);
        if (NULL == bf) {
            fprintf(stderr, "cannot open input file %s\n", argv[2]);
            return -1;
        }
        err = fopen_s(&fp, argv[3], "wb");
        if (0 != err || NULL == fp) {
            fprintf(stderr, "cannot open output file %s\n", argv[3]);
            CloseBitFileInput(bf);
            return -1;
        }
        LZWDecode(bf, fp);
        fclose(fp);
        CloseBitFileInput(bf);
        fprintf(stdout, "decoding done\n");
    }
    else { // otherwise
        fprintf(stderr, "not supported operation\n");
        return -1;
    }
    return 0;
}
验证结果
选择几种不同格式类型的文件,使用LZW编码器进行压缩得到输出的压缩比特流文件。对各种不同格式的文件进行压缩效率的分析。
原始文件格式 | 原始文件大小 | 编码后文件大小 | 压缩比 |
---|---|---|---|
pptx | 3431kb | 4172kb | 0.822 |
jpg | 12kb | 14kb | 0.857 |
yuv | 768kb | 108kb | 7.111 |
txt | 1kb | 1kb | 1.000 |
(格式未注明) | 1206kb | 1585kb | 0.761 |
docx | 367kb | 370kb | 0.992 |
通过观察压缩比可以看到,除yuv文件被明显压缩、txt文件大小基本不变之外,其余文件格式经过编码之后数据量反而增大。
对于这个问题,考虑原因是pptx、jpg、docx等格式本身已经过压缩,字符重复率低,而本实现的每个码字固定占16比特,导致编码后数据量变大。