哈夫曼编码
预先统计出一组数据中所有的元素的出现概率。再通过构造树生成各元素表达式
如:一组数据中包含:a,b,c,d,e,f (6种元素)。各元素的出现概率为:
a:45%
b:13%
c:12%
d:16%
e:9%
f:5%
则可生成构造树:
6种元素采用定长编码时至少需要3bit来表示(即:2^2 = 4 < 6 ≤ 2^3 = 8),n个元素的数据总长度为 n*3 = 3n;
采用哈夫曼编码后,元素可做以下表示(生成表达式):
a = 0;
b = 101;
c = 100;
d = 111;
e = 1101;
f = 1100;
数据总和为 n * (45%*1+13%*3+12%*3+16%*3+9%*4+5%*4) = 2.24n
相比于定长编码(3n),压缩了约25%(即 (3 - 2.24) / 3 ≈ 25.3%);
实现过程
java的实现代码:
1.构造一个便于计算的树Mod
/**
 * Node of the Huffman construction tree.
 *
 * <p>A node created for a counted byte carries that byte in {@link #bt}; a node
 * created by merging two subtrees is built with a {@code null} byte and links
 * to its children through {@link #left} and {@link #right}.
 */
public static class Bc{
    /** Byte value represented by this node, or {@code null} when the node carries no symbol. */
    public Byte bt;
    /** Number of occurrences counted for this node. */
    public long ct;
    /** Left subtree, or {@code null}. */
    public Bc left;
    /** Right subtree, or {@code null}. */
    public Bc right;

    /**
     * Creates a node without children.
     *
     * @param bt byte value of the node, or {@code null} for a node without a symbol
     * @param ct initial occurrence count; declared {@code long} to match the
     *           {@link #ct} field so large counts are not narrowed to {@code int}
     */
    public Bc(Byte bt, long ct) {
        this.bt = bt;
        this.ct = ct;
    }
}
2.统计所有元素
/**
 * Counts how often each byte value occurs in {@code bytes}.
 *
 * @param bytes data to analyse; may be {@code null} or empty
 * @return one {@code Bc} per distinct byte value, ordered by {@code sort(List)};
 *         an empty list when {@code bytes} is {@code null} or empty
 */
public static List<Bc> getBcs(byte[] bytes){
    ArrayList<Bc> list = new ArrayList<>();
    if (bytes == null || bytes.length == 0){
        return list;
    }
    // One slot per possible byte value: finds the counter for a byte directly
    // instead of scanning the whole list for every input byte.
    Bc[] seen = new Bc[256];
    for (byte b : bytes) {
        int slot = b & 0xFF;
        Bc entry = seen[slot];
        if (entry == null) {
            // First occurrence: entries are appended in order of first appearance.
            entry = new Bc(b, 1);
            seen[slot] = entry;
            list.add(entry);
        } else {
            entry.ct++;
        }
    }
    return sort(list);
}
3.通用排序方法
/**
 * Sorts {@code list} in place and returns the same list.
 *
 * <p>Order: higher count first; for equal counts, nodes that carry a byte come
 * before nodes whose byte is {@code null}, and nodes that both carry a byte are
 * ordered by ascending byte value. Two nodes with equal counts and no byte
 * compare as equal, so the (stable) sort keeps their existing relative order.
 *
 * @param list nodes to sort; modified in place
 * @return {@code list}
 */
public static List<Bc> sort(List<Bc> list){
    list.sort((o1, o2) -> {
        // Long.compare avoids truncating the long difference to int, which can
        // flip the sign or collapse distinct counts to "equal".
        int byCount = Long.compare(o2.ct, o1.ct);
        if (byCount != 0) {
            return byCount;
        }
        if (o1.bt == null && o2.bt == null) {
            // Must be 0: answering -1 in both directions breaks the Comparator contract.
            return 0;
        }
        if (o2.bt == null) {
            return -1;
        }
        if (o1.bt == null) {
            return 1;
        }
        return Byte.compare(o1.bt, o2.bt);
    });
    return list;
}
4.将统计元素构造成树:
**public static Bc tree(List<Bc> list){
if (list.size() == 1) {
return list.get(0);
}
Bc root = new Bc(null, 0);
Bc left = list.get(list.size() - 1);
root.ct += left.ct;
Bc right = list.get(list.size() - 2);
root.ct += right.ct;
root.left = left;
root.right = right;
list = list.subList(0, list.size() - 2);
list.add(root);
sort(list);
return tree(list);
}
**
5.通过构造树。获取所有元素的表达式
/**
 * Code-table entry: associates one byte value with its bit code.
 */
public static class CompressNode{
// Byte value this entry encodes.
public Byte bt;
// Bit pattern of the code, stored in the low-order bits of an int.
public int trans;
// Length of the code in bits; needed to tell apart codes such as 0 and 00,
// which have the same numeric value.
public int offset;
// Stores the three values as given; no validation is performed.
public CompressNode(Byte bt, int trans, int offset) {
this.bt = bt;
this.trans = trans;
this.offset = offset;
}
}
/**
 * Collects the code of every symbol-carrying node in the tree below {@code root}.
 *
 * <p>A root that itself carries a byte is recorded with code {@code 0} of
 * length 1. The left branch starts with bit 0 and the right branch with bit 1,
 * both at length 1; deeper levels are handled by {@code compressNodes}.
 *
 * @param root root of the construction tree; must not be {@code null}
 * @return the collected code entries, root first, then left subtree, then right subtree
 */
public static List<CompressNode> getCompressNodes(Bc root){
    final List<CompressNode> codes = new ArrayList<>();
    final Byte rootSymbol = root.bt;
    if (rootSymbol != null) {
        codes.add(new CompressNode(rootSymbol, 0, 1));
    }
    final Bc zeroSide = root.left;
    if (zeroSide != null) {
        compressNodes(zeroSide, 0, 1, codes);
    }
    final Bc oneSide = root.right;
    if (oneSide != null) {
        compressNodes(oneSide, 1, 1, codes);
    }
    return codes;
}
/**
 * Walks the subtree at {@code root} in pre-order and appends a code entry for
 * every node that carries a byte.
 *
 * @param root      subtree to walk
 * @param value     bit pattern accumulated on the path to {@code root}
 * @param hierarchy number of bits in {@code value}
 * @param nodeList  receives the entries
 */
private static void compressNodes(Bc root,int value, int hierarchy,List<CompressNode> nodeList){
    final Byte symbol = root.bt;
    if (symbol != null) {
        nodeList.add(new CompressNode(symbol, value, hierarchy));
    }
    final int childDepth = hierarchy + 1;
    // Descending appends one bit: 0 for the left child, 1 for the right child.
    final int zeroCode = value << 1;
    final Bc zeroChild = root.left;
    if (zeroChild != null) {
        compressNodes(zeroChild, zeroCode, childDepth, nodeList);
    }
    final Bc oneChild = root.right;
    if (oneChild != null) {
        compressNodes(oneChild, zeroCode | 1, childDepth, nodeList);
    }
}
6 :将数据通过表达式进行压缩
/**
 * Encodes {@code res} by replacing every byte with its bit code from
 * {@code compressNodes} and packing the codes most-significant-bit first.
 *
 * <p>The result consists of the packed data bytes followed by one trailer byte
 * holding the number of bits used in the last data byte; a trailer of 0 means
 * the last data byte is completely filled.
 *
 * <p>Codes are held in an {@code int}, so this method cannot represent codes
 * longer than the width of an {@code int}.
 *
 * @param res           data to encode; returned unchanged when {@code null} or empty
 * @param compressNodes code table; when several entries share a byte value the
 *                      last one is used
 * @return the packed data plus trailer byte
 * @throws IllegalArgumentException if a byte of {@code res} has no entry in
 *                                  {@code compressNodes}
 */
public static byte[] compress(byte[] res,List<CompressNode> compressNodes){
    if (res == null||res.length==0){
        return res;
    }
    // Index the table once by unsigned byte value instead of scanning it per input byte.
    CompressNode[] table = new CompressNode[256];
    for (CompressNode compressNode : compressNodes) {
        if (compressNode.bt != null) {
            table[compressNode.bt & 0xFF] = compressNode;
        }
    }
    List<Byte> byteList = new ArrayList<>();
    int c = 0;
    for (byte b : res) {
        CompressNode node = table[b & 0xFF];
        if (node == null) {
            throw new IllegalArgumentException("no code defined for byte value " + b);
        }
        if (c == 0){
            // Previous byte is full (or nothing written yet): open a fresh one.
            byteList.add((byte) 0);
        }
        c = setInList(byteList, c, node.trans, node.offset);
    }
    byteList.add((byte)c);
    byte[] bytes = new byte[byteList.size()];
    for (int i = 0; i < byteList.size(); i++) {
        bytes[i] = byteList.get(i);
    }
    return bytes;
}

/**
 * Appends the low {@code offset} bits of {@code bt} to the bit stream held in
 * {@code byteList}, starting {@code c} bits into its last byte and opening new
 * bytes as needed.
 *
 * @param byteList output bytes; its last element is the byte currently being filled
 * @param c        number of bits already used in the last byte (0..7)
 * @param bt       code bits, right-aligned
 * @param offset   number of code bits to write
 * @return number of bits used in the last byte afterwards; 0 when it is exactly full
 */
private static int setInList(List<Byte> byteList, int c, Integer bt, int offset) {
    int code = bt;
    int remaining = offset;
    int used = c;
    while (true) {
        int lastPos = byteList.size() - 1;
        int free = 8 - used;
        byte current = byteList.get(lastPos);
        if (remaining < free) {
            byteList.set(lastPos, (byte) (current | (code << (free - remaining))));
            return used + remaining;
        }
        if (remaining == free) {
            byteList.set(lastPos, (byte) (current | code));
            return 0;
        }
        // Code does not fit: write its leading bits, keep the rest for the next byte.
        int overflow = remaining - free;
        byteList.set(lastPos, (byte) (current | (code >>> overflow)));
        code &= (1 << overflow) - 1;
        remaining = overflow;
        used = 0;
        byteList.add((byte) 0);
    }
}
7: 解压缩由哈夫曼编码的数据
/**
 * Decodes data produced by {@code compress}.
 *
 * <p>The last byte of {@code cals} is the trailer: the number of valid bits in
 * the last data byte, with 0 meaning all 8. Starting at the first bit, the
 * entries of {@code compressNodes} are tried longest code first (ties: larger
 * code value first); the first entry whose code equals the upcoming bits
 * yields one output byte.
 *
 * <p>The caller's {@code compressNodes} list is left in its original order.
 *
 * @param cals          packed data plus trailer byte; returned unchanged when
 *                      {@code null} or empty
 * @param compressNodes code table used for encoding
 * @return the decoded bytes; an empty array when {@code cals} holds only the trailer
 * @throws IllegalArgumentException if the trailer is outside 0..8 or no code
 *                                  matches at some position (corrupt data or a
 *                                  table that does not belong to the data)
 */
public static byte[] unCompress(byte[] cals,List<CompressNode> compressNodes){
    if (cals == null||cals.length==0){
        return cals;
    }
    int length = cals.length - 1;
    if (length == 0) {
        return new byte[0];
    }
    int lastIndex = cals[length];
    if (lastIndex < 0 || lastIndex > 8) {
        throw new IllegalArgumentException("invalid trailer byte: " + lastIndex);
    }
    lastIndex = lastIndex==0?8:lastIndex;

    // Sort a copy so the caller's table is not reordered as a side effect.
    List<CompressNode> candidates = new ArrayList<>(compressNodes);
    candidates.sort((o1,o2)->{
        int byLength = Integer.compare(o2.offset, o1.offset);
        return byLength != 0 ? byLength : Integer.compare(o2.trans, o1.trans);
    });

    // long: eight bits per data byte can exceed the int range for large arrays.
    long totalBits = (length - 1) * 8L + lastIndex;
    List<Byte> byteList = new ArrayList<>();
    long pos = 0;
    while (pos < totalBits) {
        CompressNode hit = null;
        for (CompressNode node : candidates) {
            // Skip codes that could never advance or would read past the valid bits.
            if (node.offset <= 0 || pos + node.offset > totalBits) {
                continue;
            }
            if (readCode(cals, pos, node.offset) == node.trans) {
                hit = node;
                break;
            }
        }
        if (hit == null) {
            // Without this the loop would spin forever on undecodable input.
            throw new IllegalArgumentException("no code matches at bit position " + pos);
        }
        byteList.add(hit.bt);
        pos += hit.offset;
    }
    byte[] bytes = new byte[byteList.size()];
    for (int k = 0; k < byteList.size(); k++) {
        bytes[k] = byteList.get(k);
    }
    return bytes;
}

/**
 * Reads {@code bitCount} bits of {@code cals} starting at absolute bit position
 * {@code bitPos}, using {@code getIndexByte} when they lie in one byte and
 * {@code getIndexBytes} on a copy of the touched bytes otherwise.
 */
private static int readCode(byte[] cals, long bitPos, int bitCount) {
    int first = (int) (bitPos / 8);
    int bitInFirst = (int) (bitPos % 8);
    int last = (int) ((bitPos + bitCount - 1) / 8);
    if (first == last) {
        return getIndexByte(cals[first], bitInFirst, bitCount);
    }
    byte[] vs = new byte[last - first + 1];
    System.arraycopy(cals, first, vs, 0, vs.length);
    return getIndexBytes(vs, bitInFirst, bitCount);
}
/**
 * Reads {@code length} consecutive bits of {@code val} as an unsigned number.
 * Bit position 0 is the most significant bit of the byte.
 *
 * @param val    byte to read from
 * @param index  position of the first bit to read, counted from the most significant bit
 * @param length number of bits to read
 * @return the selected bits, right-aligned
 * @throws RuntimeException if {@code index + length} exceeds 8
 */
public static int getIndexByte(byte val,int index,int length){
    final int end = index + length;
    if (end > 8) {
        throw new RuntimeException("byte bit max 8");
    }
    int result = 0;
    for (int pos = index; pos < end; pos++) {
        // Positions below 0 never contribute; position p maps to bit (7 - p) of the byte.
        final boolean set = pos >= 0 && ((val >> (7 - pos)) & 1) == 1;
        if (set) {
            result += 1 << (end - pos - 1);
        }
    }
    return result;
}
/**
 * Reads {@code length} consecutive bits from {@code vals} as one number,
 * starting {@code index} bits into the first byte. Bits are taken most
 * significant first and bytes in array order.
 *
 * @param vals   bytes holding the bits; the requested bits must reach into the
 *               last byte of the array
 * @param index  position of the first bit inside {@code vals[0]}, counted from
 *               the most significant bit
 * @param length number of bits to read
 * @return the selected bits, right-aligned; for more than 32 bits only the
 *         last 32 read are retained
 * @throws IllegalArgumentException if {@code index} or {@code length} is
 *         negative, the range runs past the end of {@code vals}, or
 *         {@code length} is too small to span all bytes of {@code vals}
 */
public static int getIndexBytes(byte[] vals,int index,int length){
    if (index < 0 || length < 0
            || index+length > (8*vals.length)
            || length < (8*(vals.length-2))+2) {
        throw new IllegalArgumentException("bit range (index=" + index + ", length=" + length
                + ") does not match an array of " + vals.length + " byte(s)");
    }
    int e = 0;
    // Walk the bits in stream order so every byte lands at the correct weight,
    // however many bytes the range spans.
    for (int bit = index; bit < index + length; bit++) {
        int current = vals[bit >> 3];
        e = (e << 1) | ((current >> (7 - (bit & 7))) & 1);
    }
    return e;
}
至此!
本章完结