用java实现对字符串文本的哈夫曼编码与解码

最新推荐文章于 2024-06-21 22:09:33 发布

piaow_

最新推荐文章于 2024-06-21 22:09:33 发布

阅读量923

点赞数 3

分类专栏：数据结构文章标签： java 数据结构开发语言

本文链接：https://blog.csdn.net/weixin_52323239/article/details/124447294

版权

数据结构专栏收录该内容

2 篇文章 0 订阅

订阅专栏

哈夫曼树与编码的创建过程及发展由来

这里基础知识就不再叙述了，请参考博客

https://www.cnblogs.com/alomsc/p/12736502.html

写的非常详细，初学者阅读一遍即可理解

具体代码实现

首先展示下待编码的文本

// Sample English text that the article uses as the input to be Huffman-encoded.
// It is assembled from adjacent string literals purely to keep source lines short.
String data = "In computer science and information theory, "
				+ "a Huffman code is a particular type of optimal prefix code that is commonly used for lossless data compression. "
				+ "The process of finding and/or using such a code proceeds by means of Huffman coding, "
				+ "an algorithm developed by David A. Huffman while he was a Ph.D. student at MIT, and published in the 1952 paper "
				+ "\"A Method for the Construction of Minimum-Redundancy Codes\".[1] "
				+ "The output from Huffman's algorithm can be viewed as a variable-length code table for encoding a source symbol "
				+ "(such as a character in a file). The algorithm derives this table from the estimated probability or frequency of occurrence"
				+ " (weight) for each possible value of the source symbol. As in other entropy encoding methods, more common symbols are generally "
				+ "represented using fewer bits than less common symbols. Huffman's method can be efficiently implemented, "
				+ "finding a code in linear time to the number of input weights if these weights are sorted.[2] However, "
				+ "although optimal among methods encoding symbols separately, Huffman coding is not always optimal among all compression methods.";

用HashMap统计词频
HashMap可以存储键值对，即<Key,Value>，使用这种数据结构可以大大降低程序设计的复杂性。

  //统计字符串中字符出现的频率
	/**
	 * Counts how many times each character occurs in the given text.
	 *
	 * @param text the text to analyse; must not be {@code null}
	 * @return a new mutable map from each distinct character to its number of occurrences;
	 *         empty when {@code text} is empty
	 */
	public  Map<Character,Integer> computeCharCount(String text){
		Map<Character,Integer> map = new HashMap<Character,Integer>();
		for (char ch : text.toCharArray()) {
			// merge() inserts 1 for a new character and adds 1 to an existing count
			// with a single map lookup.
			map.merge(ch, 1, Integer::sum);
		}
		return map;
	}

将hashmap中的数据取出，放到哈夫曼树当中的每个结点处，再将结点存入ArrayList集合当中，利用集合可以被排序的特点，构建哈夫曼树

创建集合并添加数据进入结点

ArrayList<HTNode> nodes = new ArrayList<>();
		// Create one childless node per distinct character, weighted by its occurrence count.
		for (Map.Entry<Character, Integer> entry : chars.entrySet()) {
			HTNode leaf = new HTNode();
			leaf.setData(entry.getKey());
			leaf.setWeight(entry.getValue());
			leaf.setLchild(null);
			leaf.setRchild(null);
			nodes.add(leaf);
		}

构建哈夫曼树

/**
	 * Builds a Huffman tree from the given nodes and returns its root.
	 *
	 * <p>While more than one node remains, the two lightest nodes are removed from the list and
	 * replaced by a new parent whose weight is the sum of theirs. The list is modified in place;
	 * when the method returns it contains only the root.
	 *
	 * @param nodes the initial nodes; must be a modifiable, non-empty list
	 * @return the root of the tree (the sole input node when the list has one element)
	 * @throws IllegalArgumentException if {@code nodes} is {@code null} or empty
	 */
	public HTNode buildTree(List<HTNode> nodes){
		if (nodes == null || nodes.isEmpty()) {
			throw new IllegalArgumentException("nodes must contain at least one node");
		}
		while (nodes.size() > 1) {
			// Sort with an explicit weight comparator that returns 0 for equal weights,
			// as the sort's comparison contract requires.
			Collections.sort(nodes, (a, b) -> Double.compare(a.getWeight(), b.getWeight()));
			// After sorting, the two lightest nodes are at the front; remove them by index.
			HTNode left = nodes.remove(0);
			HTNode right = nodes.remove(0);
			HTNode parent = new HTNode(left.getWeight() + right.getWeight());
			parent.setLchild(left);
			parent.setRchild(right);
			nodes.add(parent);
		}
		return nodes.get(0);
	}

根据生成的哈夫曼树，构建哈夫曼编码，越向树根编码越短，越向叶子结点编码越长

/**
	 * Generates the code table for the tree rooted at {@code tree}.
	 *
	 * <p>Codes are strings of '0' and '1': each step to a left child appends "0" and each step
	 * to a right child appends "1" to the parent's code. The instance's {@code code} map is
	 * cleared first, so entries from a previously processed tree do not linger.
	 *
	 * @param tree the root of the tree; must not be {@code null}
	 * @return the instance's code map, from each leaf character to its code string
	 */
	public  Map<Character, String> getCode(HTNode tree) {
		code.clear();
		boolean loneLeaf = tree.getLchild() == null && tree.getRchild() == null;
		if (loneLeaf && tree.getCode().isEmpty()) {
			// A tree with a single symbol has no edges, which would give that symbol an
			// empty code and make encoded text impossible to decode. Assign it "0".
			tree.setCode(Code.ZERO.getCode());
		}
		collectCodes(tree);
		return code;
	}

	/**
	 * Recursively assigns codes to the children of {@code node} and records every leaf in the
	 * {@code code} map. A node counts as a leaf when it has no children, so any character
	 * value, including '\0', can be stored in a leaf.
	 */
	private void collectCodes(HTNode node) {
		HTNode left = node.getLchild();
		HTNode right = node.getRchild();
		if (left == null && right == null) {
			code.put(node.getData(), node.getCode());
			return;
		}
		if (left != null) {
			left.setCode(node.getCode() + Code.ZERO.getCode());
			collectCodes(left);
		}
		if (right != null) {
			right.setCode(node.getCode() + Code.ONE.getCode());
			collectCodes(right);
		}
	}

存储字符对应的编码之后，新建一个字符串文本，对其进行编码解码的验证

// Example: a new piece of text used to verify encoding and decoding.
String text = "This paper first presents a new array data structure to represent the Huffman tree.";

实现哈夫曼编码

/**
	 * Encodes the text with the code table currently held by this instance.
	 *
	 * @param text the text to encode; must not be {@code null}
	 * @return the concatenation of the code strings of all characters, in order
	 * @throws IllegalArgumentException if a character of {@code text} has no entry in the
	 *         code table
	 */
	public String encode(String text) {
		StringBuilder bits = new StringBuilder();
		for (char ch : text.toCharArray()) {
			String symbolCode = code.get(ch);
			if (symbolCode == null) {
				// Fail loudly instead of appending the text "null" to the output.
				throw new IllegalArgumentException(
						"No Huffman code for character '" + ch + "' (code unit " + (int) ch + ")");
			}
			bits.append(symbolCode);
		}
		return bits.toString();
	}

实现哈夫曼解码
不得不说java API真的强大，可以用字符串的startsWith()方法，进行查找哈夫曼编码对应的字符值。这里说明一下，哈夫曼编码的任何一个字符编码，都不是其他字符编码的前缀，所以查找编码是唯一的！

		/**
	 * Decodes a string produced by {@code encode} using the code table currently held by
	 * this instance.
	 *
	 * <p>At each position the table is searched for a code that the remaining input starts
	 * with; the matching character is emitted and the position advances by the code's length.
	 *
	 * @param text the encoded string; must not be {@code null}
	 * @return the decoded text; empty when {@code text} is empty
	 * @throws IllegalArgumentException if no code matches the input at some position
	 */
	public String decode(String text) {
		StringBuilder decoded = new StringBuilder();
		int pos = 0;
		while (pos < text.length()) {
			boolean matched = false;
			for (Map.Entry<Character, String> e : code.entrySet()) {
				String candidate = e.getValue();
				// Skip empty codes: they match everywhere without consuming input.
				if (candidate != null && !candidate.isEmpty() && text.startsWith(candidate, pos)) {
					decoded.append(e.getKey());
					pos += candidate.length();
					matched = true;
					break;
				}
			}
			if (!matched) {
				// Without this check the loop would never terminate on malformed input.
				throw new IllegalArgumentException(
						"Input is not a valid encoding: no code matches at index " + pos);
			}
		}
		return decoded.toString();
	}

计算压缩比

	// Length of the coded string; presumably one '0'/'1' character per bit, so this is a bit count.
	System.out.println("编码后二进制字节长度:" + coded.length());
		// Size of the original text counted as 8 bits per character.
		System.out.println("原字符串长度" + text.length()*8);
		// Ratio of the two values above (coded length / original length in bits).
		System.out.println("压缩率"+coded.length()*1.0/text.length()/8);

全部代码

哈夫曼树类

package com.edu.bym;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.edu.bym.HTNode.Code;

/**
 * Huffman coding for the characters of a string.
 *
 * <p>Typical use: count characters with {@link #computeCharCount(String)}, wrap each count in an
 * {@link HTNode}, build the tree with {@link #buildTree(List)}, derive the code table with
 * {@link #getCode(HTNode)}, then call {@link #encode(String)} and {@link #decode(String)}.
 * The code table is stored per instance.
 *
 * @author piaow
 */
public class HuffmanTree{

	/** Character-to-code table, filled by {@link #getCode(HTNode)}. */
	private final Map<Character, String> code = new HashMap<Character,String>();

	/**
	 * Builds a Huffman tree from the given nodes and returns its root.
	 *
	 * <p>While more than one node remains, the two lightest nodes are removed from the list and
	 * replaced by a new parent whose weight is the sum of theirs. The list is modified in place;
	 * when the method returns it contains only the root.
	 *
	 * @param nodes the initial nodes; must be a modifiable, non-empty list
	 * @return the root of the tree (the sole input node when the list has one element)
	 * @throws IllegalArgumentException if {@code nodes} is {@code null} or empty
	 */
	public HTNode buildTree(List<HTNode> nodes){
		if (nodes == null || nodes.isEmpty()) {
			throw new IllegalArgumentException("nodes must contain at least one node");
		}
		while (nodes.size() > 1) {
			// Sort with an explicit weight comparator that returns 0 for equal weights,
			// as the sort's comparison contract requires.
			Collections.sort(nodes, (a, b) -> Double.compare(a.getWeight(), b.getWeight()));
			// After sorting, the two lightest nodes are at the front; remove them by index.
			HTNode left = nodes.remove(0);
			HTNode right = nodes.remove(0);
			HTNode parent = new HTNode(left.getWeight() + right.getWeight());
			parent.setLchild(left);
			parent.setRchild(right);
			nodes.add(parent);
		}
		return nodes.get(0);
	}

	/**
	 * Generates the code table for the tree rooted at {@code tree}.
	 *
	 * <p>Codes are strings of '0' and '1': each step to a left child appends "0" and each step
	 * to a right child appends "1" to the parent's code. The table is cleared first, so entries
	 * from a previously processed tree do not linger.
	 *
	 * @param tree the root of the tree; must not be {@code null}
	 * @return this instance's code map, from each leaf character to its code string
	 */
	public  Map<Character, String> getCode(HTNode tree) {
		code.clear();
		boolean loneLeaf = tree.getLchild() == null && tree.getRchild() == null;
		if (loneLeaf && tree.getCode().isEmpty()) {
			// A tree with a single symbol has no edges, which would give that symbol an
			// empty code and make encoded text impossible to decode. Assign it "0".
			tree.setCode(Code.ZERO.getCode());
		}
		collectCodes(tree);
		return code;
	}

	/**
	 * Recursively assigns codes to the children of {@code node} and records every leaf in the
	 * code table. A node counts as a leaf when it has no children, so any character value,
	 * including '\0', can be stored in a leaf.
	 */
	private void collectCodes(HTNode node) {
		HTNode left = node.getLchild();
		HTNode right = node.getRchild();
		if (left == null && right == null) {
			code.put(node.getData(), node.getCode());
			return;
		}
		if (left != null) {
			left.setCode(node.getCode() + Code.ZERO.getCode());
			collectCodes(left);
		}
		if (right != null) {
			right.setCode(node.getCode() + Code.ONE.getCode());
			collectCodes(right);
		}
	}

	/**
	 * Returns the code table produced by the most recent {@link #getCode(HTNode)} call.
	 *
	 * @return this instance's code map (the live map, not a copy); empty if no tree has been
	 *         processed yet
	 */
	public Map<Character, String> getCode(){
		return this.code;
	}

	/**
	 * Counts how many times each character occurs in the given text.
	 *
	 * @param text the text to analyse; must not be {@code null}
	 * @return a new mutable map from each distinct character to its number of occurrences;
	 *         empty when {@code text} is empty
	 */
	public  Map<Character,Integer> computeCharCount(String text){
		Map<Character,Integer> map = new HashMap<Character,Integer>();
		for (char ch : text.toCharArray()) {
			// merge() inserts 1 for a new character and adds 1 to an existing count
			// with a single map lookup.
			map.merge(ch, 1, Integer::sum);
		}
		return map;
	}

	/**
	 * Encodes the text with the code table currently held by this instance.
	 *
	 * @param text the text to encode; must not be {@code null}
	 * @return the concatenation of the code strings of all characters, in order
	 * @throws IllegalArgumentException if a character of {@code text} has no entry in the
	 *         code table
	 */
	public String encode(String text) {
		StringBuilder bits = new StringBuilder();
		for (char ch : text.toCharArray()) {
			String symbolCode = code.get(ch);
			if (symbolCode == null) {
				// Fail loudly instead of appending the text "null" to the output.
				throw new IllegalArgumentException(
						"No Huffman code for character '" + ch + "' (code unit " + (int) ch + ")");
			}
			bits.append(symbolCode);
		}
		return bits.toString();
	}

	/**
	 * Decodes a string produced by {@link #encode(String)} using the code table currently
	 * held by this instance.
	 *
	 * <p>At each position the table is searched for a code that the remaining input starts
	 * with; the matching character is emitted and the position advances by the code's length.
	 *
	 * @param text the encoded string; must not be {@code null}
	 * @return the decoded text; empty when {@code text} is empty
	 * @throws IllegalArgumentException if no code matches the input at some position
	 */
	public String decode(String text) {
		StringBuilder decoded = new StringBuilder();
		int pos = 0;
		while (pos < text.length()) {
			boolean matched = false;
			for (Map.Entry<Character, String> e : code.entrySet()) {
				String candidate = e.getValue();
				// Skip empty codes: they match everywhere without consuming input.
				if (candidate != null && !candidate.isEmpty() && text.startsWith(candidate, pos)) {
					decoded.append(e.getKey());
					pos += candidate.length();
					matched = true;
					break;
				}
			}
			if (!matched) {
				// Without this check the loop would never terminate on malformed input.
				throw new IllegalArgumentException(
						"Input is not a valid encoding: no code matches at index " + pos);
			}
		}
		return decoded.toString();
	}

	/**
	 * Demo: derives a code table from a sample paragraph, prints it, then encodes and decodes
	 * a sentence and prints the result together with the size comparison.
	 *
	 * @param args unused
	 */
	public static void main(String[] args){
		HuffmanTree htree = new HuffmanTree();
		// Training text from which the character frequencies are taken.
		String data = "This paper first presents a new array data structure to represent the Huffman tree. "
				+ "The memory required in the proposed data structure is less than the previous methods (Huffman, 1952; Roman, 1992), "
				+ "which also use array data structure to store the corresponding Huffman tree, and is the lower bound of the one in (Hashemian, 1995)."
				+ " We then present an efficient Huffman decoding algorithm based on the proposed data structure; given a Huffman code, "
				+ "the search time for finding the source symbol is O(d), where d denotes the depth of the Huffman tree. "
				+ "This time bound is equal to the ones in (Hashemian, 1995; Huffman, 1952; Roman, 1992). "
				+ "Some experimentations on real images are carried out to demonstrate the performance of space and search time among our method and the previous ones.";
		// chars holds the number of occurrences of every character.
		Map<Character, Integer> chars = htree.computeCharCount(data);
		List<HTNode> nodes = new ArrayList<>();
		for (Map.Entry<Character, Integer> entry : chars.entrySet()) {
			HTNode node = new HTNode();
			node.setData(entry.getKey());
			node.setWeight(entry.getValue());
			nodes.add(node);
		}
		HTNode tree = htree.buildTree(nodes);
		Map<Character, String> code = htree.getCode(tree);

		for (Map.Entry<Character, String> entry : code.entrySet()) {
			System.out.println("字符'"+entry.getKey()+"'的哈夫曼编码："+entry.getValue());
		}
		String text = "This paper first presents a new array data structure to represent the Huffman tree.";
		String coded = htree.encode(text);
		System.out.println("字符串：" + text);
		System.out.println("被编码为："+coded);
		String oriText = htree.decode(coded);
		System.out.println("被解码为："+oriText);
		System.out.println(oriText.equals(text));
		// coded holds one '0'/'1' character per bit, so its length is a bit count; the
		// original size is likewise counted in bits at 8 bits per character.
		System.out.println("编码后二进制字节长度:" + coded.length());
		System.out.println("原字符串长度" + text.length()*8);
		System.out.println("压缩率"+coded.length()*1.0/text.length()/8);
	}
}

结点类

package com.edu.bym;

public class HTNode implements Comparable<HTNode>{ 
	
	//这里做了修改，因为在全局变量code里，编码是采用String类型的，所以这里做修改
	public enum Code{
		ZERO("0"), ONE("1");
		private String code;
		private Code(String c){
			this.code = c;
		}
		public String getCode(){
			return code;
		}
	}
	
	/**
	 *  哈夫曼树的叶子结点数据
	 */
	private char data ;
	
	/**
	 * 结点的编码，只有0和1两种可能
	 */
	private String code="";
	
	private double weight;
	private HTNode lchild;
	private HTNode rchild;
	
	public HTNode(double weight) {
		this.weight=weight;
	}
	public HTNode() {
	}
	public char getData() {
		return data;
	}
	public void setData(char data) {
		this.data = data;
	}
	public double getWeight() {
		return weight;
	}
	public void setWeight(double weight) {
		this.weight = weight;
	}
	public HTNode getLchild() {
		return lchild;
	}
	public void setLchild(HTNode lchild) {
		this.lchild = lchild;
	}
	public HTNode getRchild() {
		return rchild;
	}
	public void setRchild(HTNode rchild) {
		this.rchild = rchild;
	}
	public String getCode() {
		return code;
	}
	public void setCode(String code) {
		this.code = code;
	}
	@Override
	public int compareTo(HTNode o) {
		if(this.weight<o.weight){
			return -1;
		}else{
			return 1;
		}
	}
}

piaow_

关注

3
点赞
踩
17

收藏

觉得还不错? 一键收藏
1
评论
用java实现对字符串文本的哈夫曼编码与解码

哈夫曼树与编码的创建过程及发展由来这里基础知识就不再叙述了，请参考博客https://www.cnblogs.com/alomsc/p/12736502.html#:~:text写的非常详细，初学者阅读一遍即可理解具体代码实现首先展示下待编码的文本String data = "In computer science and information theory, " + "a Huffman code is a particular type of optimal prefix co
复制链接

扫一扫

专栏目录