lz77算法的实现

最近想写一个把txt文本转成mobi文件的小程序,看了一下mobi的格式,里面最重要的就是要用到一个palmdoc的压缩算法,这个压缩算法是一个lz77的变形。今天花了一个上午实现了这个压缩算法,当然还写了解压缩的程序。现在把程序贴上来。

lz.h

/* LZ77 is a compression technique used to compress text.
 */

#ifndef _LZ_H_
#define _LZ_H_

/* Raw 8-bit unit used for both plain and compressed data. */
typedef unsigned char byte;

/*
 * State of one compression run.  Created by lz_ctxt_new()/lz_ctxt_new_full(),
 * filled in by compress() and released by lz_ctxt_delete().
 */
typedef struct lz_ctxt{
	/* Start of the already-encoded history that is searched for matches;
	 * points into plain_data. */
	byte* scroll_window;
	/* Number of history bytes currently in the scroll window. */
	int   scroll_window_size;
	/* Upper limit for scroll_window_size (set from MAX_SCROLL_WINDOW_SIZE). */
	int   max_scroll_window_size;
	/* Configured minimum window size (set from MIN_SCROLL_WINDOW_SIZE). */
	int   min_scroll_window_size; 

	/* Next not-yet-encoded byte; points into plain_data. */
	byte* forward_window;
	/* Number of look-ahead bytes a match may cover right now. */
	int   forward_window_size;
	/* Longest match length considered (set from MAX_FORWARD_WINDOW_SIZE). */
	int   max_forward_window_size;
	/* Shortest match length accepted (set from MIN_FORWARD_WINDOW_SIZE). */
	int   min_forward_window_size;

	/* Caller-supplied input; lz_ctxt_delete() does not free it. */
	byte* plain_data;
	/* Number of input bytes to compress. */
	int   plain_data_size;
	/* Configured bounds on the input size (set from MAX_/MIN_PLAIN_DATA_SIZE). */
	int   max_plain_data_size;
	int   min_plain_data_size;

	/* Output buffer allocated by lz_ctxt_new_full(), freed by lz_ctxt_delete(). */
	byte* compressed_data;
	/* Number of valid bytes in compressed_data; written at the end of compress(). */
	int   compressed_data_size;

	/* Number of input bytes consumed so far. */
	int   byte_count;
}lz_ctxt;

/* Kinds of token the decoder distinguishes by looking at one compressed byte. */
enum {
	/* 0x00 and 0x09..0x7f: the byte stands for itself. */
	LZ_LITERAL = 0,
	/* 0x80..0xbf: first byte of a two-byte length/distance back-reference. */
	LZ_LEN_DIS_PAIR,
	/* 0xc0..0xff: a space followed by (byte ^ 0x80). */
	LZ_BYTE_PAIR,
	/* 0x01..0x08: a count of raw bytes that follow. */
	LZ_LITERAL_COUNT 
};
/*
 * Decode `len` bytes of compressed data.
 * Returns a heap-allocated buffer with the decoded bytes; the output is
 * limited to DATA_LENGTH bytes (see lz.c).  No length is returned.
 * NOTE(review): nothing in this header frees the result, so presumably the
 * caller must free() it.
 */
byte* decompress(byte* compressed_data, int len);

/*
 * Compress lz->plain_data into lz->compressed_data and store the output
 * length in lz->compressed_data_size.
 * Returns 1 on success, 0 when lz is NULL.
 */
int compress(lz_ctxt* lz);

/*
 * lz_ctxt_new_full() builds a context for `plain_data_size` bytes of input and
 * returns NULL when the input is NULL, empty or larger than the supported
 * maximum.  lz_ctxt_new() is the same call with the maximum size passed as
 * plain_data_size.
 */
lz_ctxt* lz_ctxt_new(byte* plain_data);
lz_ctxt* lz_ctxt_new_full(byte* plain_data, int plain_data_size);

/* Free the context and its output buffer; plain_data is left untouched. */
void lz_ctxt_delete(lz_ctxt* lz);
#endif

lz.c

#include "lz.h"
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <stdio.h>


/* Size in bytes of the output buffer allocated by decompress(). */
#define DATA_LENGTH 4096 
/* Largest history window; 2047 is the biggest value an 11-bit distance holds. */
#define MAX_SCROLL_WINDOW_SIZE 2047
/* Minimum history window size stored in the context. */
#define MIN_SCROLL_WINDOW_SIZE 3

/* Match lengths are encoded as (length - 3) in 3 bits, i.e. 3..10 bytes. */
#define MAX_FORWARD_WINDOW_SIZE 10
#define MIN_FORWARD_WINDOW_SIZE 3

/* Largest input accepted by lz_ctxt_new_full(); also used to size the output buffer. */
#define MAX_PLAIN_DATA_SIZE 4096
#define MIN_PLAIN_DATA_SIZE 0

/* Longest run of raw bytes that one count byte can introduce. */
#define MAX_COUNT 0x08

/* Classify a compressed byte as one of the LZ_* token kinds. */
static int get_byte_type(byte b);
/* Extract the 11-bit distance from a two-byte length/distance pair. */
static int get_distance(byte* b);
/* Extract the match length (3..10) from the second byte of a pair. */
static int get_length(byte* b);
/* Count leading bytes >= 0x80 at b, up to MAX_COUNT. */
static int get_count(byte* b);
/* Search the scroll window for the best match of the forward window. */
static int get_distance_length(lz_ctxt* lz, int* p_dis, int* p_len);

/*
 * Create the context that carries all state of one compression run.
 *
 * plain_data      - input bytes; borrowed, never freed by this module.
 * plain_data_size - number of input bytes, 1..MAX_PLAIN_DATA_SIZE.
 *
 * Returns a new context to be released with lz_ctxt_delete(), or NULL when
 * the arguments are invalid or memory cannot be allocated.
 */
lz_ctxt*
lz_ctxt_new_full(byte* plain_data, int plain_data_size) {
	/* "<= 0" also rejects negative sizes, which the old "0 ==" test let through */
	if(NULL == plain_data || plain_data_size <= 0
			|| plain_data_size > MAX_PLAIN_DATA_SIZE){
		fprintf(stderr, "plain data error!\n");
		return NULL;
	}

	/* calloc zeroes the struct, so every counter and pointer starts at 0/NULL */
	lz_ctxt* lz = calloc(1, sizeof *lz);
	if(NULL == lz) {
		fprintf(stderr, "out of memory!\n");
		return NULL;
	}

	/* A byte that has to be escaped costs two output bytes (count + byte),
	 * so the compressed form can be up to twice as large as the input. */
	lz->compressed_data = calloc(2 * MAX_PLAIN_DATA_SIZE, sizeof(byte));
	if(NULL == lz->compressed_data) {
		fprintf(stderr, "out of memory!\n");
		free(lz);
		return NULL;
	}

	lz->max_scroll_window_size = MAX_SCROLL_WINDOW_SIZE;
	lz->min_scroll_window_size = MIN_SCROLL_WINDOW_SIZE;
	lz->max_forward_window_size = MAX_FORWARD_WINDOW_SIZE;
	lz->min_forward_window_size = MIN_FORWARD_WINDOW_SIZE;

	lz->max_plain_data_size = MAX_PLAIN_DATA_SIZE;
	lz->min_plain_data_size = MIN_PLAIN_DATA_SIZE;

	lz->plain_data = plain_data;
	lz->plain_data_size = plain_data_size;

	/* both windows start at the first input byte; the history is still empty */
	lz->scroll_window = plain_data;
	lz->forward_window = plain_data;

	return lz;
}

/*
 * Convenience constructor: builds a context that treats plain_data as a
 * buffer of exactly MAX_PLAIN_DATA_SIZE bytes.  Use lz_ctxt_new_full() when
 * the input has a different length.
 */
lz_ctxt*
lz_ctxt_new(byte* plain_data) {
	const int assumed_size = MAX_PLAIN_DATA_SIZE;

	return lz_ctxt_new_full(plain_data, assumed_size);
}

int
compress(lz_ctxt* lz) {
	if(NULL == lz) {
		fprintf(stderr, "Lz context is null!\n");
		return 0;
	}
	int i = 0;
	byte b;

	while(lz->byte_count < lz->plain_data_size) {
		if(lz->scroll_window_size < 3){      //滑动窗口的大小没有到要求的最小值
			if(*(lz->forward_window) < 0x80){
				lz->compressed_data[i++] = *(lz->forward_window);
				lz->scroll_window_size ++;
				lz->forward_window ++;
				lz->byte_count ++;
			}else{
				int count= get_count(lz->forward_window);
				lz->compressed_data[i++] = count;
				memcpy(lz->compressed_data+i,\
					lz->forward_window, count);
				lz->byte_count += count;
				i += count;
				lz->scroll_window_size += count;
				lz->forward_window +=count;
			}
		}else{
			int distance = 0;
			int length = 0;
			if(get_distance_length(lz, &distance, &length)) {  //查看滑动窗口里是否能找到前向窗口的匹配
				byte pair0 = 0x80;
				pair0 |= (distance >> 5);
				byte pair1 = 0;
				pair1 = length-MIN_FORWARD_WINDOW_SIZE;
				pair1 = ((distance & 0x1f) << 3) | pair1;
				lz->compressed_data[i++] = pair0;
				lz->compressed_data[i++] = pair1;

			}else{
				if(*(lz->forward_window) < 0x80 && *(lz->forward_window) != 0x20) {
					lz->compressed_data[i++] = *(lz->forward_window);
					length = 1;
				}else if(*(lz->forward_window) == 0x20){
					if(*(lz->forward_window+1) >= 0x80) {
						lz->compressed_data[i++] = *(lz->forward_window);
						length = 1;
					}else{
						byte b = *(lz->forward_window+1)^0x80;
						lz->compressed_data[i++] = b;
						length = 2;
					}
				}else{
					length = get_count(lz->forward_window);
					lz->compressed_data[i++] = length;
					memcpy(lz->compressed_data+i, lz->forward_window, length);
					i += length;
				}
			}
			lz->byte_count += length;
			if(lz->scroll_window_size == lz->max_scroll_window_size){
				lz->scroll_window += length;
			}else if(lz->scroll_window_size + length <= lz->max_scroll_window_size){
				lz->scroll_window_size += length;
			}else{
				lz->scroll_window_size == lz->max_scroll_window_size;
				lz->scroll_window += lz->scroll_window_size + length - lz->max_scroll_window_size;
			}
			lz->forward_window += length;
			lz->forward_window_size =\
				lz->byte_count + lz->max_forward_window_size < lz->plain_data_size?\
				lz->max_forward_window_size:lz->plain_data_size - lz->byte_count;
		}
	}
	lz->compressed_data_size = i;
	return 1;
}
/*
 * Decode `len` bytes of compressed data.
 *
 * Returns a newly allocated buffer that the caller must free().  At most
 * DATA_LENGTH bytes are produced; the buffer has one extra zero byte so the
 * result is always NUL-terminated.  Returns NULL when the arguments are
 * invalid, memory is exhausted, or the stream is truncated, refers to data
 * before the start of the output, or would exceed DATA_LENGTH bytes.
 */
byte*
decompress(byte* compressed_data, int len){
	if(NULL == compressed_data || len <= 0) {
		fprintf(stderr, "compressed data error!\n");
		return NULL;
	}

	/* calloc zero-fills; the original memset had value and size swapped
	 * and therefore cleared nothing */
	byte *uncompressed_data = calloc(DATA_LENGTH + 1, sizeof(byte));
	if(NULL == uncompressed_data) {
		fprintf(stderr, "out of memory!\n");
		return NULL;
	}

	int i = 0;	/* read position in compressed_data */
	int j = 0;	/* write position in uncompressed_data */
	int k;
	int distance;
	int length;

	while(i < len) {
		byte b = compressed_data[i];
		switch(get_byte_type(b)){
			case LZ_LITERAL:
				if(j + 1 > DATA_LENGTH)
					goto fail;
				uncompressed_data[j++] = b;
				i ++;
				break;

			case LZ_LEN_DIS_PAIR:
				if(i + 1 >= len)	/* second byte of the pair is missing */
					goto fail;
				distance = get_distance(compressed_data+i);
				length = get_length(compressed_data+i+1);
				if(distance <= 0 || distance > j
						|| j + length > DATA_LENGTH)
					goto fail;
				/* copy byte by byte: when distance < length the source
				 * overlaps the bytes being written, which memcpy does
				 * not support */
				for(k = 0; k < length; k ++, j ++)
					uncompressed_data[j] = uncompressed_data[j - distance];
				i += 2;
				break;

			case LZ_BYTE_PAIR:
				if(j + 2 > DATA_LENGTH)
					goto fail;
				uncompressed_data[j++] = ' ';
				uncompressed_data[j++] = b ^ 0x80;
				i ++;
				break;

			case LZ_LITERAL_COUNT:
				length = b;
				/* the count byte plus `length` raw bytes must be present */
				if(i + length >= len || j + length > DATA_LENGTH)
					goto fail;
				memcpy(uncompressed_data+j, compressed_data+i+1, length);
				i += length + 1;
				j += length;
				break;

			default:
				goto fail;
		}
	}
	return uncompressed_data;

fail:
	fprintf(stderr, "compressed data error!\n");
	free(uncompressed_data);
	return NULL;
}

/*
 * Release a context: frees the output buffer and the context itself.
 * The input buffer (plain_data) belongs to the caller and is not touched.
 * Passing NULL is allowed and does nothing.
 */
void 
lz_ctxt_delete(lz_ctxt* lz) {
	if(NULL == lz)
		return;

	/* free(NULL) is a no-op, so no separate check is needed */
	free(lz->compressed_data);
	free(lz);
}

/*
 * Classify one compressed byte:
 *   0xc0..0xff  LZ_BYTE_PAIR
 *   0x80..0xbf  LZ_LEN_DIS_PAIR
 *   0x01..0x08  LZ_LITERAL_COUNT
 *   otherwise   LZ_LITERAL
 */
static int
get_byte_type(byte b) {
	if(b >= 0xc0)
		return LZ_BYTE_PAIR;
	if(b >= 0x80)
		return LZ_LEN_DIS_PAIR;
	if(b >= 0x01 && b <= 0x08)
		return LZ_LITERAL_COUNT;
	return LZ_LITERAL;
}

/*
 * Extract the distance of a length/distance pair starting at b.
 * The low 6 bits of b[0] are the upper part and the top 5 bits of b[1]
 * the lower part of the 11-bit result.
 */
static int
get_distance(byte* b) {
	int upper = b[0] & 0x3f;
	int lower = (b[1] & 0xf8) >> 3;

	return (upper << 5) + lower;
}

/*
 * Extract the match length from the byte b points at: the low 3 bits hold
 * (length - 3), so the result is in the range 3..10.
 */
static int
get_length(byte* b) {
	return (b[0] & 0x07) + 3;
}

/*
 * Count how many consecutive bytes starting at b are >= 0x80, stopping at
 * MAX_COUNT.  The function has no knowledge of where the buffer ends: it
 * may read up to MAX_COUNT bytes from b, so the caller has to guarantee
 * that they are readable.
 */
static int
get_count(byte* b) {
	int run = 0;

	while(run < MAX_COUNT && b[run] >= 0x80)
		run ++;
	return run;
}

/*
 * Search the scroll window for the longest match of the forward window.
 *
 * On success stores the match length in *p_len and the distance (counted
 * backwards from lz->forward_window) in *p_dis and returns 1.  Among
 * matches of equal length the one closest to the forward window wins.
 * Returns 0, leaving *p_len and *p_dis untouched, when no match of at least
 * lz->min_forward_window_size bytes exists.
 */
static int
get_distance_length(lz_ctxt* lz, int* p_dis, int* p_len) {
	/* with a look-ahead shorter than the minimum no match can qualify */
	if(lz->forward_window_size < lz->min_forward_window_size)
		return 0;

	int best_len = lz->min_forward_window_size - 1;
	int index;
	byte* p = lz->scroll_window;

	for(index = 0; index < lz->scroll_window_size; index ++, p ++) {
		if(*p != *(lz->forward_window))
			continue;

		int len = 0;
		while(len < lz->forward_window_size
				&& p[len] == lz->forward_window[len])
			len ++;

		if(len < lz->min_forward_window_size || len < best_len)
			continue;

		/* record every accepted candidate, including one that covers the
		 * whole look-ahead; the old code skipped this bookkeeping in that
		 * case and then reported the best possible match as "not found" */
		best_len = len;
		*p_len = len;
		*p_dis = lz->scroll_window_size - index;

		if(len == lz->forward_window_size)
			break;	/* cannot do better than the full look-ahead */
	}
	return best_len >= lz->min_forward_window_size;
}

关于lz77的算法,可以在 http://jpkc.zust.edu.cn/2007/dmt/course/Mmt03_05_2.htm找到。

关于palmdoc的算法,我摘取wiki上面的一段话,英文的,不过很简单啦。

http://wiki.mobileread.com/wiki/PalmDOC

PalmDOC uses LZ77 compression techniques. DOC files can contain only compressed text. The format does not allow for any text formatting. This keeps files small, in keeping with the Palm philosophy. However, extensions to the format can use tags, such as HTML or PML, to include formatting within text. These extensions to PalmDoc are not interchangeable and are the basis for most eBook Reader formats on Palm devices.

LZ77 algorithms achieve compression by replacing portions of the data with references to matching data that has already passed through both encoder and decoder. A match is encoded by a pair of numbers called a length-distance pair, which is equivalent to the statement "each of the next length characters is equal to the character exactly distance characters behind it in the uncompressed stream." (The "distance" is sometimes called the "offset" instead.)

In the PalmDoc format, a length-distance pair is always encoded by a two-byte sequence. Of the 16 bits that make up these two bytes, 11 bits go to encoding the distance, 3 go to encoding the length, and the remaining two are used to make sure the decoder can identify the first byte as the beginning of such a two-byte sequence.

PalmDoc combines LZ77 with a simple kind of byte pair compression.

PalmDoc Algorithm: [3] PalmDoc files are decoded as follows:

Read a byte from the compressed stream. If the byte is

  • 0x00: "1 literal": copy that byte unmodified to the decompressed stream.
  • 0x09 to 0x7f: "1 literal": copy that byte unmodified to the decompressed stream.
  • 0x01 to 0x08: "literals": the byte is interpreted as a count from 1 to 8, and that many literals are copied unmodified from the compressed stream to the decompressed stream.
  • 0x80 to 0xbf: "length, distance" pair: the 2 leftmost bits of this byte ('10') are discarded, and the following 6 bits are combined with the 8 bits of the next byte to make a 14-bit "distance, length" item. Those 14 bits are broken into 11 bits of distance backwards from the current location in the uncompressed text, and 3 bits of length to copy from that point (copying n+3 bytes, 3 to 10 bytes).
  • 0xc0 to 0xff: "byte pair": this byte is decoded into 2 characters: a space character, and a letter formed from this byte XORed with 0x80.

Repeat from the beginning until there are no more bytes in the compressed file.

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值