Day15——Huffman编码之构建Huffman树

颜妮儿

已于 2022-04-16 11:49:09 修改

阅读量1.1k

点赞数

分类专栏： Java 文章标签： java eclipse

于 2022-04-14 19:44:42 首次发布

本文链接：https://blog.csdn.net/Z__XY_/article/details/124179736

版权

Java 专栏收录该内容

42 篇文章 0 订阅

订阅专栏

计算机A想要给计算机B发送一串字符信息，需要先将字符转成二进制编码才能发送。在数据传输过程中，二进制数据越长，不仅传输效率越低，出错的概率也越高，所以我们希望用尽可能短的二进制串来表示相同的字符信息。由于各个字符出现的频率不一定相同，我们应该让出现频率越高的字符编码越短，这样才能使发送的二进制数据总长度最短，这就用到了我们今天要学的Huffman编码。
要实现Huffman编码，我们首先得构建哈夫曼树，先了解几个相关概念：

路径：在一棵树中，一个结点到另一个结点之间所经过的结点序列；
路径长度：一条路径中所经过的边数。如路径包含 i 个结点，则路径长度为 i-1，用 $l$ 表示；
结点的权：给每个结点赋予的值，用 $w$ 表示
结点的带权路径长度：从根结点到当前结点的路径长度*该结点的权值。
树的带权路径长度：每个叶子结点的带权路径长度和： $WPL=\sum\limits_{i=1}^nw_il_i$

哈夫曼树：在含有n个带权叶结点的二叉树中，其中带权路径长度（WPL）最小的二叉树。

根据WPL的定义，我们要构建一棵哈夫曼树，就应该让权值越小的结点离根结点越远。

1. 从待合并结点中找到两个权值最小的结点，生成一个新结点，使找到的两个结点成为该新结点的孩子结点，并使新结点的权值为这两个结点的权值之和，然后将新结点纳入待合并结点（被选中的两个结点不再参与后续合并）；
2. 重复执行步骤 1，直到待合并结点只剩一个，该结点即为哈夫曼树的根。

在这里插入图片描述
对应我们刚才提到的需求，要发送的字符就是待合并的结点，每个字符在消息中出现的次数（频率）就是该结点的权值，处理步骤：

读入信息，并记录每个字符在信息中出现的次数；
根据次数，建立哈夫曼树。

代码：

package day15;

import java.io.BufferedReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.stream.Collectors;

/**
 * Builds a Huffman tree from the character frequencies of a text file.
 *
 * <p>
 * Typical usage: construct the object (which reads the file), call
 * {@link #constructAlphabet()} to count the characters, then
 * {@link #constructTree()} to build the tree, and finally {@link #getRoot()} to
 * obtain the root node.
 *
 * <p>
 * Only characters whose code is below {@link #NUM_CHARS} are supported.
 */
public class Huffman {

	/**
	 * A node of the Huffman tree.
	 */
	class HuffmanNode {

		/**
		 * The char. Only valid for leaf nodes; internal nodes are created with '*'.
		 */
		char character;

		/**
		 * Weight. It can also be double.
		 */
		int weight;

		/**
		 * The left child, or null for a leaf.
		 */
		HuffmanNode leftChild;

		/**
		 * The right child, or null for a leaf.
		 */
		HuffmanNode rightChild;

		/**
		 * Creates a node.
		 *
		 * @param paraCharacter  The character represented by the node.
		 * @param paraWeight     The weight (occurrence count) of the node.
		 * @param paraLeftChild  The left child, or null.
		 * @param paraRightChild The right child, or null.
		 */
		public HuffmanNode(char paraCharacter, int paraWeight, HuffmanNode paraLeftChild, HuffmanNode paraRightChild) {
			character = paraCharacter;
			weight = paraWeight;
			leftChild = paraLeftChild;
			rightChild = paraRightChild;
		}// Of HuffmanNode

		/**
		 * Formats the node as "(character, weight)", the same format used by
		 * {@link Huffman#preOrderVisit(HuffmanNode)}.
		 *
		 * @return The textual form of this node (children are not included).
		 */
		@Override
		public String toString() {
			return "(" + character + ", " + weight + ")";
		}// Of toString

	}// Of class HuffmanNode

	/**
	 * The number of supported characters. 256 covers the 8-bit character codes.
	 */
	public static final int NUM_CHARS = 256;

	/**
	 * The input text. It is stored in a string for simplicity.
	 */
	String inputText;

	/**
	 * The length of alphabet, also the number of leaves.
	 */
	int alphabetLength;

	/**
	 * The alphabet: the distinct characters of the text in ascending code order.
	 */
	char[] alphabet;

	/**
	 * The count of chars. The length is 2 * alphabetLength - 1 to include non-leaf
	 * nodes (0 when the text is empty).
	 */
	int[] charCounts;

	/**
	 * The mapping of chars to the indices in the alphabet; -1 for chars that do not
	 * occur in the text.
	 */
	int[] charMapping;

	/**
	 * Codes for each char in the alphabet. It should have the same length as
	 * alphabet. Not filled in by this class yet.
	 */
	String[] huffmanCodes;

	/**
	 * All nodes. The last node is the root.
	 */
	HuffmanNode[] nodes;

	/**
	 * Creates the object and reads the given text file.
	 *
	 * @param paraFilename The text filename.
	 */
	public Huffman(String paraFilename) {
		charMapping = new int[NUM_CHARS];
		readText(paraFilename);
	}// Of the first constructor

	/**
	 * Reads the whole file (decoded as UTF-8) into {@link #inputText}, joining the
	 * lines with "\n", and prints the text. If the file cannot be read, the error
	 * is printed to standard error and the program exits with status 1.
	 *
	 * @param paraFilename The filename.
	 */
	public void readText(String paraFilename) {
		// try-with-resources guarantees that the reader is closed, even on failure.
		try (BufferedReader tempReader = Files.newBufferedReader(Paths.get(paraFilename), StandardCharsets.UTF_8)) {
			inputText = tempReader.lines().collect(Collectors.joining("\n"));
		} catch (Exception ee) {
			// A non-zero status tells the caller/shell that reading failed.
			System.err.println(ee);
			System.exit(1);
		} // Of try
		System.out.println("The text is:\r\n" + inputText);
	}// Of readText

	/**
	 * Constructs the alphabet. The results are stored in the member variables
	 * alphabetLength, alphabet, charCounts and charMapping.
	 *
	 * @throws IllegalArgumentException If the text contains a character whose code
	 *                                  is not below {@link #NUM_CHARS}.
	 */
	public void constructAlphabet() {
		// Initialize.
		Arrays.fill(charMapping, -1);

		// The count for each char. At most NUM_CHARS chars.
		int[] tempCharCounts = new int[NUM_CHARS];

		// Step 1. Scan the string to obtain the counts.
		for (int i = 0; i < inputText.length(); i++) {
			char tempChar = inputText.charAt(i);
			// The index of the char in the charset.
			int tempCharIndex = (int) tempChar;

			System.out.println("" + tempCharIndex + " ");

			// The file is decoded as UTF-8, so codes beyond the table are possible.
			if (tempCharIndex >= NUM_CHARS) {
				throw new IllegalArgumentException("Unsupported character '" + tempChar + "' (code " + tempCharIndex
						+ ") at position " + i + "; only codes below " + NUM_CHARS + " are supported.");
			} // Of if

			tempCharCounts[tempCharIndex]++;
		} // Of for i

		// Step 2. Scan to determine the size of the alphabet.
		// The whole table must be scanned, including the last entry.
		alphabetLength = 0;
		for (int i = 0; i < NUM_CHARS; i++) {
			if (tempCharCounts[i] > 0) {
				alphabetLength++;
			} // Of if
		} // Of for i

		// Step 3. Compress to the alphabet
		alphabet = new char[alphabetLength];
		charCounts = new int[treeSize(alphabetLength)];

		int tempCounter = 0;
		for (int i = 0; i < NUM_CHARS; i++) {
			if (tempCharCounts[i] > 0) {
				alphabet[tempCounter] = (char) i;
				charCounts[tempCounter] = tempCharCounts[i];
				charMapping[i] = tempCounter;
				tempCounter++;
			} // Of if
		} // Of for i

		System.out.println("The alphabet is: " + Arrays.toString(alphabet));
		System.out.println("Their counts are: " + Arrays.toString(charCounts));
		System.out.println("The char mappings are: " + Arrays.toString(charMapping));
	}// Of constructAlphabet

	/**
	 * Constructs the Huffman tree from alphabet and charCounts, so
	 * {@link #constructAlphabet()} must be called first. The nodes are stored in
	 * {@link #nodes}: leaves first, then the merged nodes in creation order, so the
	 * last node is the root. For an empty text no node is created.
	 */
	public void constructTree() {
		// Step 1. Allocate space.
		int tempNodeCount = treeSize(alphabetLength);
		nodes = new HuffmanNode[tempNodeCount];
		boolean[] tempProcessed = new boolean[tempNodeCount];

		// Step 2. Initialize leaves.
		for (int i = 0; i < alphabetLength; i++) {
			nodes[i] = new HuffmanNode(alphabet[i], charCounts[i], null, null);
		} // Of for i

		// Step 3. Construct the tree. Each round merges the two lightest nodes.
		for (int i = alphabetLength; i < tempNodeCount; i++) {
			// Step 3.1 Select the first minimal as the left child.
			int tempLeft = selectMinimal(tempProcessed, i);
			tempProcessed[tempLeft] = true;

			// Step 3.2 Select the second minimal as the right child.
			int tempRight = selectMinimal(tempProcessed, i);
			tempProcessed[tempRight] = true;
			System.out.println("Selecting " + i + " are " + tempLeft + " and " + tempRight);

			// Step 3.3 Construct the new node.
			charCounts[i] = charCounts[tempLeft] + charCounts[tempRight];
			nodes[i] = new HuffmanNode('*', charCounts[i], nodes[tempLeft], nodes[tempRight]);
		} // Of for i

	}// Of constructTree

	/**
	 * Computes the number of nodes of a Huffman tree with the given leaf count.
	 *
	 * @param paraLeafCount The number of leaves.
	 * @return 2 * paraLeafCount - 1, or 0 if there is no leaf.
	 */
	private static int treeSize(int paraLeafCount) {
		return paraLeafCount == 0 ? 0 : 2 * paraLeafCount - 1;
	}// Of treeSize

	/**
	 * Finds the unprocessed node with the smallest count among the first paraLimit
	 * nodes. Ties are resolved in favor of the smallest index.
	 *
	 * @param paraProcessed Flags of the nodes that have already been merged.
	 * @param paraLimit     Only indices below this value are considered.
	 * @return The index of the selected node, or -1 if all are processed.
	 */
	private int selectMinimal(boolean[] paraProcessed, int paraLimit) {
		int resultIndex = -1;
		for (int j = 0; j < paraLimit; j++) {
			if (paraProcessed[j]) {
				continue;
			} // Of if

			if (resultIndex == -1 || charCounts[j] < charCounts[resultIndex]) {
				resultIndex = j;
			} // Of if
		} // Of for j
		return resultIndex;
	}// Of selectMinimal

	/**
	 * Gets the root of the tree.
	 *
	 * @return The root, or null if the tree has not been constructed or is empty.
	 */
	public HuffmanNode getRoot() {
		if (nodes == null || nodes.length == 0) {
			return null;
		} // Of if
		return nodes[nodes.length - 1];
	}// Of getRoot

	/**
	 * Prints the subtree rooted at the given node in pre-order, one node per line.
	 *
	 * @param paraNode The root of the subtree; nothing is printed if it is null.
	 */
	public void preOrderVisit(HuffmanNode paraNode) {
		if (paraNode == null) {
			return;
		} // Of if

		System.out.println(paraNode);

		preOrderVisit(paraNode.leftChild);
		preOrderVisit(paraNode.rightChild);
	}// Of preOrderVisit

	/**
	 * The entrance of program.
	 *
	 * @param args Optionally, args[0] is the text filename. If absent, the default
	 *             file "F:/huffmantext-small.txt" is used.
	 */
	public static void main(String args[]) {
		String tempFilename = args.length > 0 ? args[0] : "F:/huffmantext-small.txt";
		Huffman tempHuffman = new Huffman(tempFilename);
		tempHuffman.constructAlphabet();

		tempHuffman.constructTree();

		HuffmanNode tempRoot = tempHuffman.getRoot();
		System.out.println("The root is: " + tempRoot);
		System.out.println("Preorder visit:");
		tempHuffman.preOrderVisit(tempRoot);
	}// Of main
}// Of class Huffman

运行结果：
在这里插入图片描述
小结：
思路还是好理解的，难点在于代码中用了几个数组来表示字符与频数、结点与索引之间的映射关系，数组多了处理的时候容易把自己绕进去，所以写代码的时候可以在旁边记录一下某个函数需要处理哪些数组，包括数组对应的语义是什么，方便思路混乱的时候查阅，当然如果用字典的话，思路会更清晰一些。今天在写这个算法的时候，又重新思考了一遍，浮现了许多奇奇怪怪的想法，对哈夫曼树有了新的理解，哈哈，不错。

颜妮儿

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
Day15——Huffman编码之构建Huffman树

计算机A想要给计算机B发送一串字符信息，需要将字符转成二进制编码才能发送，我们要知道在数据传输过程中，二进制数据越长不仅影响传输效率，而且出错率更高，所以我们需要用更短的二进制字符串来表示相同的字符信息。由于每个字符出现的频率是不一定相同的，所以我们应该要让字符出现频率越高的字符的编码越短，这样才能保证我们在发送的二进制数据最短，这就用到了我们今天要学的Huffman编码。要实现Huffman编码，我们首先得构建哈夫曼树，先了解几个相关概念：路径：在一棵树中，一个结点到另一个结点之间所经过的结点序列；
复制链接

扫一扫