赫夫曼编码
简介
- 赫夫曼编码也称哈夫曼编码,是一种编码方式,属于一种程序算法
- 赫夫曼编码是赫夫曼树在电讯通信中的经典应用之一
- 赫夫曼编码广泛地用于数据文件压缩,其压缩率通常在20%~90%之间
- 赫夫曼编码是可变长编码的一种。Huffman于1952年提出一种编码方法,称之为最佳编码
原理及图解
- **定长编码:** 将字符串中的每个字符转成对应的ASCII码,每个字符固定占8位,然后用其对应的二进制来传递信息
- **变长编码:** 按各字符出现的次数分配编码,出现次数越多的字符编码越短,从而缩短总长度;但如果某个字符的编码是另一个字符编码的前缀,解码时会产生歧义
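- **赫夫曼编码:** 以各字符出现的次数为权值构建赫夫曼树,规定向左的路径为0、向右的路径为1,从根节点到叶子节点的路径即为该字符的编码;字符只出现在叶子节点上,因此任何编码都不是其他编码的前缀,解码不会产生歧义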
代码实现:
数据的压缩和解压
package com.tmo.huffman;
/**
* 赫夫曼编码
* @author tmo
*
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Huffman compression and decompression of byte arrays.
 *
 * <p>Pipeline: {@link #getList} counts byte frequencies, {@link #createHuffmanTree} builds the
 * tree, {@link #getCodes(Node)} derives the code table, {@link #zip} packs the codes into bytes
 * and {@link #decode} reverses the packing.
 *
 * <p>The class keeps its working state in static fields ({@link #huffmanCodes} and the bit length
 * of the last {@link #zip} call), so it is not safe for concurrent use and {@link #decode} is
 * only fully reliable for the array returned by the most recent {@link #zip} call.
 */
public class HuffmanCode {

    /** Code table of the most recently processed tree: byte value -> its bit string ("0"/"1"). */
    static Map<Byte, String> huffmanCodes = new HashMap<Byte, String>();

    /** Empty path prefix used as the starting point when walking down from the root. */
    static StringBuilder stringBuilder = new StringBuilder();

    /**
     * Total number of Huffman bits produced by the most recent {@link #zip} call, or -1 if
     * {@link #zip} has not run yet. The packed byte array does not record how many bits of its
     * final byte are meaningful, so {@link #decode} needs this to restore leading zeros there.
     */
    private static int lastZipBitLength = -1;

    /** Demo: compresses a sample sentence, prints the packed bytes and decodes them again. */
    public static void main(String[] args) {
        String str = "i like like like java do you like a java";
        byte[] bytes = str.getBytes();
        System.out.println(bytes.length);//40
        byte[] huffmanZip = huffmanZip(bytes);
        System.out.println("压缩后的:"+Arrays.toString(huffmanZip));
        String byteToBitString = byteToBitString((byte) -88, true);
        System.out.println(byteToBitString);
        byte[] decode = decode(huffmanCodes, huffmanZip);
        System.out.println(new String(decode));
        // The individual steps (getList -> createHuffmanTree -> getCodes -> zip) can also be
        // called one at a time; huffmanZip simply chains them.
    }

    /**
     * Converts a byte to its binary string, treating the byte as an unsigned 8-bit value
     * (i.e. the two's-complement bit pattern of {@code b}).
     *
     * @param b    the byte to convert
     * @param flag {@code true} to left-pad with zeros to exactly 8 characters;
     *             {@code false} to return the shortest form without leading zeros
     * @return the bit string of {@code b}; never longer than 8 characters
     */
    public static String byteToBitString(byte b, boolean flag) {
        // Mask off the sign extension so a negative byte yields its 8 low bits, not 32 bits.
        int unsigned = b & 0xFF;
        if (flag) {
            // Setting bit 8 guarantees a 9-character result; dropping the first character
            // leaves the 8 zero-padded bits.
            return Integer.toBinaryString(unsigned | 0x100).substring(1);
        }
        return Integer.toBinaryString(unsigned);
    }

    /**
     * Decodes Huffman-packed bytes back into the original byte sequence.
     *
     * <p>When {@code huffmanBytes} has the length produced by the most recent {@link #zip} call,
     * the bit count recorded by that call is used to restore the exact width of the final byte.
     * Otherwise the final byte is expanded without leading zeros, which is only correct if its
     * first meaningful bit was 1.
     *
     * @param huffmanCodes Huffman code table (byte value -> bit string)
     * @param huffmanBytes bytes produced by {@link #zip}
     * @return the decoded bytes
     * @throws IllegalArgumentException if the trailing bits do not match any code in the table
     */
    public static byte[] decode(Map<Byte, String> huffmanCodes, byte[] huffmanBytes) {
        // 1. Rebuild the bit string. Every byte except the last one carries exactly 8 bits.
        StringBuilder bits = new StringBuilder();
        int lastIndex = huffmanBytes.length - 1;
        for (int i = 0; i < lastIndex; i++) {
            bits.append(byteToBitString(huffmanBytes[i], true));
        }
        if (lastIndex >= 0) {
            byte last = huffmanBytes[lastIndex];
            // The recorded bit length applies only if it maps to this many packed bytes.
            boolean bitLengthKnown = (lastZipBitLength + 7) / 8 == huffmanBytes.length;
            if (bitLengthKnown) {
                int lastBits = lastZipBitLength - 8 * lastIndex; // 1..8 meaningful bits
                bits.append(byteToBitString(last, true).substring(8 - lastBits));
            } else {
                bits.append(byteToBitString(last, false));
            }
        }
        System.out.println(bits);

        // 2. Invert the code table so bit strings can be looked up directly.
        Map<String, Byte> byCode = new HashMap<String, Byte>();
        for (Map.Entry<Byte, String> entry : huffmanCodes.entrySet()) {
            byCode.put(entry.getValue(), entry.getKey());
        }

        // 3. Scan left to right, growing the candidate until it matches a code. Huffman codes
        //    are prefix-free, so the first match is the right one.
        List<Byte> decoded = new ArrayList<Byte>();
        int total = bits.length();
        int start = 0;
        while (start < total) {
            int end = start + 1;
            Byte symbol = byCode.get(bits.substring(start, end));
            while (symbol == null && end < total) {
                end++;
                symbol = byCode.get(bits.substring(start, end));
            }
            if (symbol == null) {
                throw new IllegalArgumentException(
                        "Bits from offset " + start + " do not match any Huffman code");
            }
            decoded.add(symbol);
            start = end;
        }

        // 4. Unbox into a plain byte[].
        byte[] result = new byte[decoded.size()];
        for (int i = 0; i < result.length; i++) {
            result[i] = decoded.get(i);
        }
        return result;
    }

    /**
     * Compresses {@code bytes} with a Huffman code built from their own frequencies.
     * The code table used is left in {@link #huffmanCodes} for later decoding.
     *
     * @param bytes the data to compress; an empty array yields an empty result
     * @return the packed Huffman bytes
     */
    public static byte[] huffmanZip(byte[] bytes) {
        // 1. One leaf node per distinct byte, weighted by frequency.
        List<Node> list = getList(bytes);
        // 2. Build the Huffman tree.
        Node node = createHuffmanTree(list);
        // 3. Derive the code table.
        Map<Byte, String> huffmanCodes = getCodes(node);
        // 4. Pack the codes into bytes.
        return zip(bytes, huffmanCodes);
    }

    /**
     * Encodes {@code bytes} with the given code table and packs the resulting bits, 8 per byte,
     * into a new array. The final byte holds the remaining 1..8 bits as a plain binary number,
     * e.g. bits "10101000" become the byte -88 (two's complement).
     *
     * <p>Also records the total bit count so that {@link #decode} can restore the exact width of
     * the final byte.
     *
     * @param bytes        the original data
     * @param HuffmanCodes the Huffman code table (byte value -> bit string)
     * @return the packed bytes
     * @throws IllegalArgumentException if a byte in {@code bytes} has no code in the table
     */
    public static byte[] zip(byte[] bytes, Map<Byte, String> HuffmanCodes) {
        // 1. Concatenate the code of every input byte.
        StringBuilder bits = new StringBuilder();
        for (byte b : bytes) {
            String code = HuffmanCodes.get(b);
            if (code == null) {
                // Without this check the literal text "null" would be appended to the bits.
                throw new IllegalArgumentException("No Huffman code for byte value " + b);
            }
            bits.append(code);
        }
        System.out.println(bits);

        // 2. Pack the bit string into bytes, rounding the byte count up.
        int bitCount = bits.length();
        lastZipBitLength = bitCount;
        byte[] huffmanByteCodes = new byte[(bitCount + 7) / 8];
        for (int i = 0, index = 0; i < bitCount; i += 8, index++) {
            // The last chunk may be shorter than 8 bits.
            String chunk = bits.substring(i, Math.min(i + 8, bitCount));
            huffmanByteCodes[index] = (byte) Integer.parseInt(chunk, 2);
        }
        return huffmanByteCodes;
    }

    /**
     * Builds the code table for the tree rooted at {@code node} and stores it in
     * {@link #huffmanCodes}, replacing whatever an earlier call left there.
     *
     * <p>A tree consisting of a single leaf (input with only one distinct byte) gets the
     * one-bit code "0", since a root leaf has no path of its own.
     *
     * @param node root of the Huffman tree, may be {@code null}
     * @return the shared code table, or {@code null} if {@code node} is {@code null}
     */
    public static Map<Byte, String> getCodes(Node node) {
        // Drop codes from any previously processed tree.
        huffmanCodes.clear();
        if (node == null) {
            return null;
        }
        if (node.left == null && node.right == null && node.data != null) {
            huffmanCodes.put(node.data, "0");
            return huffmanCodes;
        }
        // Left subtree paths start with 0, right subtree paths with 1.
        getCodes(node.left, "0", stringBuilder);
        getCodes(node.right, "1", stringBuilder);
        return huffmanCodes;
    }

    /**
     * Walks the subtree rooted at {@code node} and records the code of every leaf in
     * {@link #huffmanCodes}.
     *
     * @param node          the node to process; {@code null} is ignored
     * @param code          the edge leading to this node: "0" for a left child, "1" for a right child
     * @param stringBuilder the path from the root to this node's parent; it is not modified
     */
    public static void getCodes(Node node, String code, StringBuilder stringBuilder) {
        if (node == null) {
            return;
        }
        // Work on a copy so sibling branches do not see each other's path.
        StringBuilder path = new StringBuilder(stringBuilder).append(code);
        if (node.data == null) {
            // Internal node: keep descending.
            getCodes(node.left, "0", path);
            getCodes(node.right, "1", path);
        } else {
            // Leaf: the accumulated path is this byte's code.
            huffmanCodes.put(node.data, path.toString());
            System.out.println(huffmanCodes);
        }
    }

    /** Prints the tree in pre-order, or a notice if the tree is empty. */
    public static void preOrder(Node node) {
        if (node != null) {
            node.preOrder();
        } else {
            System.out.println("树空");
        }
    }

    /**
     * Builds a Huffman tree by repeatedly merging the two lightest nodes.
     * The list is consumed in the process: on return it holds only the root.
     *
     * @param list the leaf nodes; modified by this method
     * @return the root of the tree, or {@code null} if {@code list} is empty
     */
    public static Node createHuffmanTree(List<Node> list) {
        if (list.isEmpty()) {
            return null;
        }
        // A single remaining node is the finished tree.
        while (list.size() > 1) {
            // Ascending by weight, so the two lightest nodes come first.
            Collections.sort(list);
            Node leftNode = list.remove(0);
            Node rightNode = list.remove(0);
            // Internal nodes carry no data, only the combined weight.
            Node parent = new Node(null, leftNode.weight + rightNode.weight);
            parent.left = leftNode;
            parent.right = rightNode;
            list.add(parent);
        }
        return list.get(0);
    }

    /**
     * Counts how often each byte value occurs and returns one leaf node per distinct value.
     *
     * @param bytes the input data
     * @return leaf nodes whose weight is the occurrence count of their byte
     */
    public static List<Node> getList(byte[] bytes) {
        Map<Byte, Integer> counts = new HashMap<Byte, Integer>();
        for (byte b : bytes) {
            Integer seen = counts.get(b);
            counts.put(b, seen == null ? 1 : seen + 1);
        }
        List<Node> nodes = new ArrayList<Node>();
        for (Map.Entry<Byte, Integer> entry : counts.entrySet()) {
            nodes.add(new Node(entry.getKey(), entry.getValue()));
        }
        return nodes;
    }
}
/**
 * A node of the Huffman tree. Leaves carry a byte value; internal nodes have {@code data == null}
 * and a weight equal to the sum of their children's weights. Nodes order by ascending weight.
 */
class Node implements Comparable<Node> {
    Byte data;   // the byte value itself, e.g. 'a' => 97; null for internal nodes
    int weight;  // number of occurrences of the byte (or combined weight of the subtree)
    Node left;
    Node right;

    /**
     * @param data   the byte value, or {@code null} for an internal node
     * @param weight the occurrence count / combined weight
     */
    public Node(Byte data, int weight) {
        this.data = data;
        this.weight = weight;
    }

    @Override
    public String toString() {
        return "Node [data=" + data + ", weight=" + weight + "]";
    }

    /**
     * Orders nodes by ascending weight.
     * Uses {@link Integer#compare} rather than subtraction, which overflows for weights that are
     * far apart and would then report the wrong order.
     */
    @Override
    public int compareTo(Node o) {
        return Integer.compare(this.weight, o.weight);
    }

    /** Prints this node, then its left subtree, then its right subtree. */
    public void preOrder() {
        System.out.println(this);
        if (this.left != null) {
            this.left.preOrder();
        }
        if (this.right != null) {
            this.right.preOrder();
        }
    }
}