哈夫曼树与编码的创建过程及发展由来
这里基础知识就不再叙述了,请参考博客
https://www.cnblogs.com/alomsc/p/12736502.html
写得非常详细,初学者阅读一遍即可理解
具体代码实现
首先展示下待编码的文本
// Sample English text (an excerpt describing Huffman coding) that the article introduces as the text to encode.
String data = "In computer science and information theory, "
+ "a Huffman code is a particular type of optimal prefix code that is commonly used for lossless data compression. "
+ "The process of finding and/or using such a code proceeds by means of Huffman coding, "
+ "an algorithm developed by David A. Huffman while he was a Ph.D. student at MIT, and published in the 1952 paper "
+ "\"A Method for the Construction of Minimum-Redundancy Codes\".[1] "
+ "The output from Huffman's algorithm can be viewed as a variable-length code table for encoding a source symbol "
+ "(such as a character in a file). The algorithm derives this table from the estimated probability or frequency of occurrence"
+ " (weight) for each possible value of the source symbol. As in other entropy encoding methods, more common symbols are generally "
+ "represented using fewer bits than less common symbols. Huffman's method can be efficiently implemented, "
+ "finding a code in linear time to the number of input weights if these weights are sorted.[2] However, "
+ "although optimal among methods encoding symbols separately, Huffman coding is not always optimal among all compression methods.";
用HashMap统计词频
HashMap可以存储键值对,即<Key,Value>,用这种数据结构大大降低了程序设计的复杂性。
/**
 * Counts how many times each character occurs in the given string.
 *
 * @param text the string to analyse
 * @return a new map from each character found in {@code text} to its number of occurrences
 */
public Map<Character,Integer> computeCharCount(String text){
    Map<Character,Integer> counts = new HashMap<Character,Integer>();
    for (int i = 0; i < text.length(); i++) {
        // merge() stores 1 for a first occurrence and otherwise adds 1 to the stored count.
        counts.merge(text.charAt(i), 1, Integer::sum);
    }
    return counts;
}
将HashMap中的数据取出,为每个字符创建一个哈夫曼树结点(字符作为数据,出现次数作为权值),再将结点存入ArrayList集合当中,利用集合可以被排序的特点,构建哈夫曼树
创建集合并添加数据进入结点
// Wrap every (character, count) entry of chars in a childless HTNode and collect the nodes.
ArrayList<HTNode> nodes = new ArrayList<>();
for (Map.Entry<Character, Integer> entry : chars.entrySet()) {
    HTNode leaf = new HTNode();
    leaf.setData(entry.getKey());
    // The occurrence count becomes the node's weight.
    leaf.setWeight(entry.getValue());
    leaf.setLchild(null);
    leaf.setRchild(null);
    nodes.add(leaf);
}
构建哈夫曼树
/**
 * Builds a Huffman tree from the given nodes and returns its root.
 *
 * <p>While more than one node remains, the two lightest nodes are removed from the list and
 * replaced by a new parent whose weight is their sum. The list is modified in place; on return
 * it contains only the root.
 *
 * @param nodes the initial nodes, one per symbol; must be a modifiable, non-empty list
 * @return the root of the Huffman tree
 * @throws IllegalArgumentException if {@code nodes} is {@code null} or empty
 */
public HTNode buildTree(List<HTNode> nodes){
    if (nodes == null || nodes.isEmpty()) {
        throw new IllegalArgumentException("nodes must contain at least one node");
    }
    while (nodes.size() > 1) {
        // Order by weight with a comparator that reports ties as 0, so the sort is
        // well-defined and stable regardless of HTNode's natural ordering.
        Collections.sort(nodes, (a, b) -> Double.compare(a.getWeight(), b.getWeight()));
        // After sorting, the two lightest nodes sit at the front of the list.
        HTNode left = nodes.remove(0);
        HTNode right = nodes.remove(0);
        HTNode parent = new HTNode(left.getWeight() + right.getWeight());
        parent.setLchild(left);
        parent.setRchild(right);
        nodes.add(parent);
    }
    return nodes.get(0);
}
根据生成的哈夫曼树构建哈夫曼编码:字符只位于叶子结点,从根到叶子的路径(向左记0,向右记1)即为该字符的编码;权值越大的叶子离树根越近,编码越短,权值越小的叶子离树根越远,编码越长
/**
 * Derives the code of every character stored in the given Huffman tree and records it in this
 * object's {@code code} table.
 *
 * <p>Each child's code is its parent's code followed by "0" (left child) or "1" (right child);
 * the computed code is also written to the node via {@code setCode}. A node counts as a
 * character node when it has no children, so any character value, including {@code '\0'},
 * can be encoded. If the tree consists of a single leaf with an empty code, that leaf is given
 * the code "0" so that encoded text is never empty for non-empty input. Entries left over from
 * a previous call are discarded first.
 *
 * @param tree the root of the Huffman tree
 * @return the code table of this object (the same map instance on every call)
 * @throws NullPointerException if {@code tree} is {@code null}
 */
public Map<Character, String> getCode(HTNode tree) {
    if (tree == null) {
        throw new NullPointerException("tree");
    }
    // Drop codes from an earlier tree; stale entries could be prefixes of the new codes.
    code.clear();
    if (isLeaf(tree) && tree.getCode().isEmpty()) {
        // A one-symbol alphabet still needs a non-empty code to be decodable.
        tree.setCode(Code.ZERO.getCode());
    }
    assignCodes(tree);
    return code;
}

/** Recursively assigns codes below {@code node} and records the code of every leaf. */
private void assignCodes(HTNode node) {
    if (isLeaf(node)) {
        code.put(node.getData(), node.getCode());
        return;
    }
    HTNode left = node.getLchild();
    if (left != null) {
        // Going left appends "0" to the parent's code.
        left.setCode(node.getCode() + Code.ZERO.getCode());
        assignCodes(left);
    }
    HTNode right = node.getRchild();
    if (right != null) {
        // Going right appends "1" to the parent's code.
        right.setCode(node.getCode() + Code.ONE.getCode());
        assignCodes(right);
    }
}

/** Returns whether {@code node} has neither a left nor a right child. */
private static boolean isLeaf(HTNode node) {
    return node.getLchild() == null && node.getRchild() == null;
}
存储字符对应的编码之后,新建一个字符串文本,对其进行编码解码的验证
// Example: a new sentence used to verify encoding and decoding.
String text = "This paper first presents a new array data structure to represent the Huffman tree.";
实现哈夫曼编码
/**
 * Encodes text with the code table currently held by this object.
 *
 * @param text the text to encode
 * @return the concatenation of the codes of all characters of {@code text}, as a string of
 *         '0' and '1' characters
 * @throws IllegalArgumentException if {@code text} contains a character that has no code in
 *         the table
 */
public String encode(String text) {
    StringBuilder bits = new StringBuilder();
    for (int i = 0; i < text.length(); i++) {
        char symbol = text.charAt(i);
        String symbolCode = code.get(symbol);
        if (symbolCode == null) {
            // Appending the missing code would silently write the literal "null" into the output.
            throw new IllegalArgumentException(
                    "No Huffman code for character '" + symbol + "' at index " + i);
        }
        bits.append(symbolCode);
    }
    return bits.toString();
}
实现哈夫曼解码
不得不说Java API真的强大,可以用字符串的startsWith()方法,进行查找哈夫曼编码对应的字符值。这里说明一下,哈夫曼编码的任何一个字符编码,都不是其他字符编码的前缀,所以查找编码是唯一的!
/**
 * Decodes a string of '0'/'1' characters with the code table currently held by this object.
 *
 * <p>At each position the table is searched for a code that the remaining input starts with;
 * the matching character is emitted and the position advances by the length of that code.
 *
 * @param text the encoded text
 * @return the decoded characters
 * @throws IllegalArgumentException if no code in the table matches the input at some position
 */
public String decode(String text) {
    StringBuilder decoded = new StringBuilder();
    int pos = 0;
    while (pos < text.length()) {
        boolean matched = false;
        for (Map.Entry<Character, String> e : code.entrySet()) {
            String candidate = e.getValue();
            // An empty code would match everywhere without consuming input, so skip it.
            if (!candidate.isEmpty() && text.startsWith(candidate, pos)) {
                decoded.append(e.getKey());
                pos += candidate.length();
                matched = true;
                break;
            }
        }
        if (!matched) {
            // Without this check an undecodable input would loop forever.
            throw new IllegalArgumentException("No Huffman code matches the input at index " + pos);
        }
    }
    return decoded.toString();
}
计算压缩比
System.out.println("编码后二进制字节长度:" + coded.length());
System.out.println("原字符串长度" + text.length()*8);
System.out.println("压缩率"+coded.length()*1.0/text.length()/8);
全部代码
哈夫曼树类
package com.edu.bym;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.edu.bym.HTNode.Code;
/**
 * Huffman tree builder and text codec.
 *
 * <p>Typical use, as shown in {@link #main(String[])}: count character frequencies with
 * {@link #computeCharCount(String)}, wrap each character in a childless {@link HTNode}, build
 * the tree with {@link #buildTree(List)}, derive the code table with {@link #getCode(HTNode)},
 * then call {@link #encode(String)} and {@link #decode(String)}.
 *
 * @author piaow
 */
public class HuffmanTree{
    /** Code table filled by {@link #getCode(HTNode)}: character to its string of '0'/'1'. */
    private final Map<Character, String> code = new HashMap<Character,String>();

    /**
     * Builds a Huffman tree from the given nodes and returns its root.
     *
     * <p>While more than one node remains, the two lightest nodes are removed from the list and
     * replaced by a new parent whose weight is their sum. The list is modified in place; on
     * return it contains only the root.
     *
     * @param nodes the initial nodes, one per symbol; must be a modifiable, non-empty list
     * @return the root of the Huffman tree
     * @throws IllegalArgumentException if {@code nodes} is {@code null} or empty
     */
    public HTNode buildTree(List<HTNode> nodes){
        if (nodes == null || nodes.isEmpty()) {
            throw new IllegalArgumentException("nodes must contain at least one node");
        }
        while (nodes.size() > 1) {
            // Order by weight with a comparator that reports ties as 0, so the sort is
            // well-defined and stable regardless of HTNode's natural ordering.
            Collections.sort(nodes, (a, b) -> Double.compare(a.getWeight(), b.getWeight()));
            // After sorting, the two lightest nodes sit at the front of the list.
            HTNode left = nodes.remove(0);
            HTNode right = nodes.remove(0);
            HTNode parent = new HTNode(left.getWeight() + right.getWeight());
            parent.setLchild(left);
            parent.setRchild(right);
            nodes.add(parent);
        }
        return nodes.get(0);
    }

    /**
     * Derives the code of every character stored in the given Huffman tree and records it in
     * this object's code table.
     *
     * <p>Each child's code is its parent's code followed by "0" (left child) or "1" (right
     * child); the computed code is also written to the node via {@code setCode}. A node counts
     * as a character node when it has no children, so any character value, including
     * {@code '\0'}, can be encoded. If the tree consists of a single leaf with an empty code,
     * that leaf is given the code "0" so that encoded text is never empty for non-empty input.
     * Entries left over from a previous call are discarded first.
     *
     * @param tree the root of the Huffman tree
     * @return the code table of this object (the same map instance on every call)
     * @throws NullPointerException if {@code tree} is {@code null}
     */
    public Map<Character, String> getCode(HTNode tree) {
        if (tree == null) {
            throw new NullPointerException("tree");
        }
        // Drop codes from an earlier tree; stale entries could be prefixes of the new codes.
        code.clear();
        if (isLeaf(tree) && tree.getCode().isEmpty()) {
            // A one-symbol alphabet still needs a non-empty code to be decodable.
            tree.setCode(Code.ZERO.getCode());
        }
        assignCodes(tree);
        return code;
    }

    /** Recursively assigns codes below {@code node} and records the code of every leaf. */
    private void assignCodes(HTNode node) {
        if (isLeaf(node)) {
            code.put(node.getData(), node.getCode());
            return;
        }
        HTNode left = node.getLchild();
        if (left != null) {
            // Going left appends "0" to the parent's code.
            left.setCode(node.getCode() + Code.ZERO.getCode());
            assignCodes(left);
        }
        HTNode right = node.getRchild();
        if (right != null) {
            // Going right appends "1" to the parent's code.
            right.setCode(node.getCode() + Code.ONE.getCode());
            assignCodes(right);
        }
    }

    /** Returns whether {@code node} has neither a left nor a right child. */
    private static boolean isLeaf(HTNode node) {
        return node.getLchild() == null && node.getRchild() == null;
    }

    /**
     * Returns the code table produced by the most recent {@link #getCode(HTNode)} call.
     *
     * @return the code table of this object; empty if no tree has been processed yet
     */
    public Map<Character, String> getCode(){
        return this.code;
    }

    /**
     * Counts how many times each character occurs in the given string.
     *
     * @param text the string to analyse
     * @return a new map from each character found in {@code text} to its number of occurrences
     */
    public Map<Character,Integer> computeCharCount(String text){
        Map<Character,Integer> counts = new HashMap<Character,Integer>();
        for (int i = 0; i < text.length(); i++) {
            // merge() stores 1 for a first occurrence and otherwise adds 1 to the stored count.
            counts.merge(text.charAt(i), 1, Integer::sum);
        }
        return counts;
    }

    /**
     * Encodes text with the code table currently held by this object.
     *
     * @param text the text to encode
     * @return the concatenation of the codes of all characters of {@code text}, as a string of
     *         '0' and '1' characters
     * @throws IllegalArgumentException if {@code text} contains a character that has no code in
     *         the table
     */
    public String encode(String text) {
        StringBuilder bits = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char symbol = text.charAt(i);
            String symbolCode = code.get(symbol);
            if (symbolCode == null) {
                // Appending the missing code would silently write the literal "null" into the output.
                throw new IllegalArgumentException(
                        "No Huffman code for character '" + symbol + "' at index " + i);
            }
            bits.append(symbolCode);
        }
        return bits.toString();
    }

    /**
     * Decodes a string of '0'/'1' characters with the code table currently held by this object.
     *
     * <p>At each position the table is searched for a code that the remaining input starts
     * with; the matching character is emitted and the position advances by the length of that
     * code.
     *
     * @param text the encoded text
     * @return the decoded characters
     * @throws IllegalArgumentException if no code in the table matches the input at some
     *         position
     */
    public String decode(String text) {
        StringBuilder decoded = new StringBuilder();
        int pos = 0;
        while (pos < text.length()) {
            boolean matched = false;
            for (Map.Entry<Character, String> e : code.entrySet()) {
                String candidate = e.getValue();
                // An empty code would match everywhere without consuming input, so skip it.
                if (!candidate.isEmpty() && text.startsWith(candidate, pos)) {
                    decoded.append(e.getKey());
                    pos += candidate.length();
                    matched = true;
                    break;
                }
            }
            if (!matched) {
                // Without this check an undecodable input would loop forever.
                throw new IllegalArgumentException(
                        "No Huffman code matches the input at index " + pos);
            }
        }
        return decoded.toString();
    }

    /**
     * Demo: builds a code table from a sample paragraph, prints it, then encodes and decodes a
     * sentence and prints the size comparison.
     *
     * @param args unused
     */
    public static void main(String[] args){
        HuffmanTree htree = new HuffmanTree();
        // Text whose character frequencies define the Huffman tree.
        String data = "This paper first presents a new array data structure to represent the Huffman tree. "
                +"The memory required in the proposed data structure is less than the previous methods (Huffman, 1952; Roman, 1992), "
                +"which also use array data structure to store the corresponding Huffman tree, and is the lower bound of the one in (Hashemian, 1995)."
                +" We then present an efficient Huffman decoding algorithm based on the proposed data structure; given a Huffman code, "
                +"the search time for finding the source symbol is O(d), where d denotes the depth of the Huffman tree. "
                +"This time bound is equal to the ones in (Hashemian, 1995; Huffman, 1952; Roman, 1992). "
                + "Some experimentations on real images are carried out to demonstrate the performance of space and search time among our method and the previous ones.";
        // chars holds the number of occurrences of every character.
        Map<Character, Integer> chars = htree.computeCharCount(data);
        List<HTNode> nodes = new ArrayList<>();
        for (Map.Entry<Character, Integer> entry : chars.entrySet()) {
            HTNode node = new HTNode();
            node.setData(entry.getKey());
            node.setWeight(entry.getValue());
            node.setLchild(null);
            node.setRchild(null);
            nodes.add(node);
        }
        HTNode tree = htree.buildTree(nodes);
        Map<Character, String> code = htree.getCode(tree);
        for (Map.Entry<Character, String> entry : code.entrySet()) {
            System.out.println("字符'"+entry.getKey()+"'的哈夫曼编码:"+entry.getValue());
        }
        String text = "This paper first presents a new array data structure to represent the Huffman tree.";
        String coded = htree.encode(text);
        System.out.println("字符串:" + text);
        System.out.println("被编码为:"+coded);
        String oriText = htree.decode(coded);
        System.out.println("被解码为:"+oriText);
        // Round-trip check: prints true when decoding restores the original sentence.
        System.out.println(oriText.equals(text));
        // Sizes are compared in bits: one '0'/'1' character per bit versus 8 bits per source character.
        System.out.println("编码后二进制字节长度:" + coded.length());
        System.out.println("原字符串长度" + text.length()*8);
        System.out.println("压缩率"+coded.length()*1.0/text.length()/8);
    }
}
结点类
package com.edu.bym;
/**
 * A node of a Huffman tree.
 *
 * <p>A node holds a character, a weight, the code assigned to it and references to its left and
 * right children. Nodes created with {@link #HTNode(double)} keep the default character
 * {@code '\0'}. The natural ordering compares nodes by weight only; it is therefore not
 * consistent with {@code equals}, which is inherited from {@code Object}.
 */
public class HTNode implements Comparable<HTNode>{
    /**
     * The two code symbols. They are exposed as strings because node codes are stored and
     * concatenated as {@code String} values.
     */
    public enum Code{
        ZERO("0"), ONE("1");
        private final String code;
        private Code(String c){
            this.code = c;
        }
        /** Returns the textual form of this symbol, "0" or "1". */
        public String getCode(){
            return code;
        }
    }

    /** The character stored in this node; {@code '\0'} unless set. */
    private char data ;
    /** The code assigned to this node as a string; empty until {@link #setCode(String)} is called. */
    private String code="";
    /** The weight used to order nodes. */
    private double weight;
    /** Left child, or {@code null}. */
    private HTNode lchild;
    /** Right child, or {@code null}. */
    private HTNode rchild;

    /**
     * Creates a node with the given weight and no character or children.
     *
     * @param weight the weight of the node
     */
    public HTNode(double weight) {
        this.weight=weight;
    }

    /** Creates a node with weight 0, no character and no children. */
    public HTNode() {
    }

    public char getData() {
        return data;
    }

    public void setData(char data) {
        this.data = data;
    }

    public double getWeight() {
        return weight;
    }

    public void setWeight(double weight) {
        this.weight = weight;
    }

    public HTNode getLchild() {
        return lchild;
    }

    public void setLchild(HTNode lchild) {
        this.lchild = lchild;
    }

    public HTNode getRchild() {
        return rchild;
    }

    public void setRchild(HTNode rchild) {
        this.rchild = rchild;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    /**
     * Orders nodes by ascending weight.
     *
     * @param o the node to compare with
     * @return a negative value, zero or a positive value when this node's weight is less than,
     *         equal to or greater than the weight of {@code o}
     */
    @Override
    public int compareTo(HTNode o) {
        // Double.compare returns 0 for equal weights and is antisymmetric, as the Comparable
        // contract requires; sorting algorithms rely on both properties.
        return Double.compare(this.weight, o.weight);
    }
}