LZW的原理:
初始化字典为256个单字节字符(0-255),随着压缩的进行,字典逐步扩充,最终把整个文本转换成字典的id序列,再用变长编码的方法把id序列编码成二进制。
编码方法:0-254用8位表示,255到510用9位表示,......
当第一次需要编码9位的时候,插入255表示升位,此后0-510都用9位表示,......
因为代码是用string来操作的,所以额外加了对于\0(空字符)的编码,保证算法的通用性。
压缩率:大概在1024:420左右
代码:
#include "compress.h"
#include <map>
#include <stdexcept>
// Nominal capacity in bytes of the shared compressed-data buffer (1 MiB).
// It is also the maximum number of bytes the path-based decompress() reads
// from a compressed file.
#define BUF_SIZE 1048576
// Number of single-byte dictionary entries (codes 0..255); the first code
// handed out to a learned phrase is therefore ASCII.
#define ASCII 256
// Width in bits of a code at the start of a stream.
#define ASCIILEN 8
using namespace std;
// Compressed byte stream: filled by writeBit() while compressing, loaded from
// the input file and consumed by readBit() while decompressing.
char g_buff[BUF_SIZE + 5];
// Byte currently being assembled (writer) or consumed (reader), bit by bit.
char g_curByte;
// Number of valid bytes in g_buff when decompressing.
int g_bufSize;
// Position inside g_curByte of the bit handled last; it counts down 7..0.
// The writer starts at 8 ("nothing written yet"), the reader starts at 0
// ("fetch a byte before the next bit").
int g_bits;
// Index in g_buff of the next byte to store (writer) or fetch (reader).
int g_byte;
// Set by readBit() when it needs a byte beyond the g_bufSize valid ones.
bool g_EOF;
/**
 * Appends the low `length` bits of `data` to the output bit stream, most
 * significant of those bits first.
 *
 * Bits fill g_curByte from bit 7 down to bit 0; g_bits is the position of the
 * bit written last (the path-based compress() starts it at 8 for "empty").
 * A completed byte is moved into g_buff lazily, i.e. only when the next bit
 * arrives, so the caller has to store the final g_curByte itself.
 *
 * @param data   value whose low bits are emitted
 * @param length number of bits to emit; values <= 0 emit nothing
 * @throws std::length_error if the compressed output no longer fits in
 *         g_buff (previously this wrote past the end of the array)
 */
void writeBit(unsigned int data, int length)
{
    for (int bit = length - 1; bit >= 0; --bit) {
        if (!g_bits) {
            // Keep one slot free for the final partial byte that the caller
            // stores after the last code has been written.
            if (g_byte + 1 >= static_cast<int>(sizeof(g_buff))) {
                throw std::length_error("writeBit: compressed output exceeds buffer capacity");
            }
            g_buff[g_byte++] = g_curByte;
            g_curByte = 0;
        }
        g_bits = (g_bits + 7) % 8;  // step down one position: 8/0 -> 7 -> ... -> 0
        const unsigned int mask = ((data >> bit) & 1u) << g_bits;
        // OR on the unsigned representation so a set bit 7 never goes through
        // a negative char value.
        g_curByte = static_cast<char>(static_cast<unsigned char>(g_curByte) | mask);
    }
}
/**
 * Emits one dictionary code using the current code width.
 *
 * The all-ones pattern of the current width is reserved as an escape meaning
 * "codes are one bit wider from now on". When `data` is that pattern or
 * larger, the escape is written first, `length` is incremented, and `data`
 * follows at the new width.
 *
 * NOTE(review): the width grows by at most one bit per call, and writeBit()
 * only emits the low `length` bits, so a code that needs more than the
 * widened width would lose its high bits. Confirm callers never jump that far.
 *
 * @param data   code to write
 * @param length current code width in bits; updated in place when widened
 */
void writeInt(unsigned int data, int &length)
{
    const unsigned int escape = unsigned((1 << length) - 1);
    const bool mustWiden = (data >= escape);
    if (mustWiden) {
        writeBit(escape, length);
        ++length;
    }
    writeBit(data, length);
}
/**
 * Reads the next bit of the compressed stream and shifts it into `val`
 * (val becomes val * 2 + bit).
 *
 * When the current byte is used up (g_bits == 0) the next byte is fetched from
 * g_buff. g_EOF is set if that fetch is at index g_bufSize, i.e. past the
 * valid data; the fetch and the shift still happen in that case, so callers
 * must check g_EOF before trusting the value.
 *
 * @param val accumulator that receives the bit as its new lowest bit
 */
void readBit(unsigned int &val)
{
    const bool byteExhausted = (g_bits == 0);
    if (byteExhausted) {
        g_EOF = (g_byte == g_bufSize);
        g_curByte = g_buff[g_byte];
        ++g_byte;
    }
    g_bits = (g_bits + 7) % 8;  // 0 -> 7 -> 6 -> ... -> 0
    const unsigned int bit = (g_curByte >> g_bits) & 1;
    val = val * 2 + bit;
}
/**
 * Reads up to `length` bits, most significant first, and returns them as an
 * unsigned value.
 *
 * Reading stops early once g_EOF is set; the returned value is then
 * incomplete and the caller is expected to check g_EOF.
 *
 * @param length number of bits to read
 * @return the bits read so far, assembled MSB-first
 */
unsigned int readInt(int length)
{
    unsigned int value = 0;
    int remaining = length;
    while (remaining != 0 && !g_EOF) {
        --remaining;
        readBit(value);
    }
    return value;
}
/**
 * LZW-encodes every byte of `fin` into the global bit stream via writeInt().
 *
 * The dictionary starts with the 256 single-byte strings and learns one new
 * phrase (code `index`, counting up from ASCII) each time a code is emitted
 * from the main branch. A stored value of 0 doubles as "phrase not known",
 * which is why a '\0' input byte is handled on its own: the pending phrase is
 * flushed, code 0 is written, and matching restarts from an empty phrase.
 *
 * @param fin  input stream to read raw bytes from
 * @param fout not used here; the encoded bytes are collected in g_buff
 */
void compress(ifstream &fin, ofstream &fout)
{
    map<string, unsigned int> dict;
    for (int c = 0; c < ASCII; ++c) {
        dict[string(1, char(c))] = c;
    }

    int bitLen = ASCIILEN;
    unsigned int index = ASCII;
    string phrase;
    char ch;

    while (fin.get(ch)) {
        if (ch == 0) {
            // Flush whatever has been matched so far, then emit the NUL marker.
            const unsigned int pending = dict[phrase];
            if (pending != 0) {
                writeInt(pending, bitLen);
                phrase.clear();
            }
            writeInt(0, bitLen);
            continue;
        }

        const string extended = phrase + ch;
        unsigned int &slot = dict[extended];  // 0 when the phrase is new
        if (slot != 0) {
            phrase = extended;
            continue;
        }

        // Longest known phrase ends here: emit it and learn the extension.
        writeInt(dict[phrase], bitLen);
        slot = index++;
        phrase = ch;
    }

    if (!phrase.empty()) {
        writeInt(dict[phrase], bitLen);
    }
}
/**
 * Decodes the LZW code stream held in g_buff and writes the restored bytes to
 * `fout`.
 *
 * Stream layout, mirroring the encoder in this file:
 *  - codes start ASCIILEN bits wide; the all-ones code of the current width is
 *    an escape meaning "one bit wider from now on", and the real code follows
 *    at the new width;
 *  - code 0 is a literal '\0' byte and resets the phrase context, so no
 *    dictionary entry is learned across it;
 *  - the first code is taken directly from g_buff[0]; the bit reader is
 *    expected to be positioned right behind that byte (the path-based
 *    overload sets g_byte = 1 and g_bits = 0).
 *
 * @param fout destination for the decoded bytes
 * @param fin  not used here; the compressed data is read from g_buff
 */
void decompress(ofstream &fout, ifstream &fin)
{
    map<int, string> m;
    for (int i = 0; i < ASCII; i++) {
        m[i] = string(1, char(i));
    }
    int bitLen = ASCIILEN;
    unsigned int index = ASCII;
    const char zero = 0;
    string oldstr, newstr;

    g_curByte = g_buff[0];
    // Convert through unsigned char: where plain char is signed, a first byte
    // >= 0x80 would otherwise become a negative code, so the 0xFF escape was
    // never recognised and the first phrase was looked up under a bogus key.
    int code = static_cast<unsigned char>(g_curByte);
    if (code == ASCII - 1) {  // stream opens with the 8-bit widen escape
        bitLen++;
        code = readInt(bitLen);
        if (g_EOF) {
            return;  // stream ended inside the escape sequence
        }
    }
    if (code == 0) {
        fout << zero;
    } else {
        oldstr = m[code];
        fout << oldstr;
    }

    while (true) {
        code = readInt(bitLen);
        if (g_EOF) {
            break;
        }
        if (code == (1 << bitLen) - 1) {
            bitLen++;
            code = readInt(bitLen);
            if (g_EOF) {
                break;  // truncated stream: do not decode a partial code
            }
        }
        if (code == 0) {
            fout << zero;
            oldstr = "";
            continue;
        }
        if (m[code] != "") {
            newstr = m[code];
        } else {
            // Code not learned yet: it can only be the phrase the encoder
            // created in its previous step, i.e. oldstr plus its own first byte.
            newstr = oldstr + oldstr[0];
        }
        if (oldstr != "") {
            m[index++] = oldstr + newstr[0];
        }
        oldstr = newstr;
        fout << oldstr;
    }
}
void compress(const string input, const string output)
{
ifstream fin(input.c_str(), ios::binary);
ofstream fout(output.c_str(), ios::binary);
g_bits = 8;
g_byte = 0;
g_curByte = 0;
compress(fin, fout);
fin.close();
g_buff[g_byte++] = g_curByte;
fout.write(g_buff, sizeof(char) * g_byte);
fout.close();
}
void decompress(const string input, const string output)
{
ifstream fin(input.c_str(), ios::binary);
ofstream fout(output.c_str(), ios::binary);
g_EOF = false;
g_bits = 0;
g_byte = 1;
fin.read(g_buff, sizeof(char) * BUF_SIZE);
g_bufSize = int(fin.gcount());
decompress(fout, fin);
fout.close();
fin.close();
}