Day30 —— encoding and decoding
1. Background
今天是学习java的第30天了。在这个特殊的日子,我学习了java数据结构里面的一个特殊的章节,赫夫曼编码和解码。
2. Description
Huffman Coding的主要用途在于对资料进行编码、压缩。流程大致如下:
- 依照即将编码的內容中,各个字符出现的频率建立Huffman Tree。
- 依照Huffman Tree对资料內容进行编码。
假设我们有一棵Huffman Tree(如上图所示),以及一段神秘数字「11100100100111000110010110100111101110010010011」。现在我们来解码。
首先从图中的root(写着18的节点)出发。由于神秘数字的第一个数字为1,所以我们沿写着1的箭头走,抵达右侧的子节点。接着,第二个数字为1,故我们同样走向右侧的子节点。第三个数字为1,因此我们最后抵达了「t」。所以「111」解码后的内容为「t」,或者说「t」所对应的编码为「111」。
重复以上步骤,最后得到字符串为「to be or not to be」。
以上,就是赫夫曼编码的一个简单的介绍了。这也是我对它的理解,下面进入代码。
这里说一句,之前创建的Huffman.txt文件里面最好放纯英文的文本,我试过空格和符号,都导致了代码超出范围。
3. Code
package datastructure;
import java.io.BufferedReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.stream.Collectors;
/**
 * Huffman tree, encoding, and decoding.
 *
 * Typical usage: construct the object with a text file, then call
 * {@link #constructAlphabet()}, {@link #constructTree()} and
 * {@link #generateCode()} in that order. After that {@link #encoding(String)}
 * and {@link #decoding(String)} can be used.
 *
 * Only characters whose code is below {@link #NUM_CHARS} are supported.
 *
 * @author Leo liu.
 */
public class Huffman {
    /**
     * Inner Huffman node. The first alphabetLength entries of {@link #nodes}
     * are leaves (one per distinct character); the rest are internal nodes.
     */
    class HuffmanNode {
        // The char. Internal nodes are created with the placeholder '*'.
        char Character;
        // Weight: the occurrence count of a leaf, or the sum of both children.
        int Weight;
        // The left child (reached by bit '0').
        HuffmanNode LeftChild;
        // The right child (reached by bit '1').
        HuffmanNode RightChild;
        // The parent; null for the root.
        HuffmanNode Parent;

        /**
         * The first constructor.
         *
         * @param paraCharacter  The character stored in the node.
         * @param paraWeight     The weight of the node.
         * @param paraLeftChild  The left child, or null.
         * @param paraRightChild The right child, or null.
         * @param paraParent     The parent, or null.
         */
        public HuffmanNode(char paraCharacter, int paraWeight, HuffmanNode paraLeftChild,
                HuffmanNode paraRightChild, HuffmanNode paraParent) {
            Character = paraCharacter;
            Weight = paraWeight;
            LeftChild = paraLeftChild;
            RightChild = paraRightChild;
            Parent = paraParent;
        } // Of HuffmanNode

        /**
         * To string.
         *
         * @return The node in the form "(character, weight)".
         */
        @Override
        public String toString() {
            return "(" + Character + ", " + Weight + ")";
        } // Of toString
    } // Of HuffmanNode

    /**
     * The number of supported character codes (0 to 255).
     */
    public static final int NUM_CHARS = 256;

    /**
     * The input text.
     */
    String inputText;

    /**
     * The length of the alphabet, i.e., the number of distinct characters.
     */
    int alphabetLength;

    /**
     * The alphabet.
     */
    char[] alphabet;

    // The counts of the characters. The array also holds the weights of the
    // internal nodes, so its length is 2 * alphabetLength - 1.
    int[] charCounts;

    // The mapping from a character code to its index in the alphabet; -1 if absent.
    int[] charMapping;

    // The Huffman code of each character in the alphabet.
    String[] huffmanCodes;

    /**
     * All nodes. Leaves first, the root is the last element.
     */
    HuffmanNode[] nodes;

    /**
     *********************
     * The first constructor.
     *
     * @param paraFilename The text filename.
     *********************
     */
    public Huffman(String paraFilename) {
        charMapping = new int[NUM_CHARS];
        readText(paraFilename);
    }// Of the first constructor

    /**
     *********************
     * Read text. The lines of the file are joined with "\n". If the file cannot
     * be read, the exception is printed and the program exits with status 1.
     *
     * @param paraFilename The text filename.
     *********************
     */
    public void readText(String paraFilename) {
        // try-with-resources guarantees that the reader is closed.
        try (BufferedReader tempReader = Files.newBufferedReader(Paths.get(paraFilename),
                StandardCharsets.UTF_8)) {
            inputText = tempReader.lines().collect(Collectors.joining("\n"));
        } catch (Exception ee) {
            System.out.println(ee);
            // A non-zero status signals the failure to the caller of the program.
            System.exit(1);
        } // Of try

        System.out.println("The text is:\r\n" + inputText);
    }// Of readText

    /**
     *********************
     * Construct the alphabet: count every character of the input text, collect
     * the distinct ones, and build the character-to-alphabet mapping.
     *
     * @throws IllegalStateException    If the input text is empty.
     * @throws IllegalArgumentException If the text contains a character whose
     *                                  code is not below NUM_CHARS.
     *********************
     */
    public void constructAlphabet() {
        if (inputText == null || inputText.isEmpty()) {
            throw new IllegalStateException("Cannot construct an alphabet: the input text is empty.");
        } // Of if

        // All mappings from characters to the alphabet are initialized to -1.
        Arrays.fill(charMapping, -1);

        // The occurrence counts, indexed by character code.
        int[] tempCharCounts = new int[NUM_CHARS];

        // Step 1. Count the occurrences of each character.
        for (int i = 0; i < inputText.length(); i++) {
            char tempChar = inputText.charAt(i);
            int tempCharIndex = (int) tempChar;
            if (tempCharIndex >= NUM_CHARS) {
                throw new IllegalArgumentException("Unsupported character '" + tempChar + "' (code "
                        + tempCharIndex + ") at position " + i + "; only codes below " + NUM_CHARS
                        + " are supported.");
            } // Of if
            System.out.print("" + tempCharIndex + " ");
            tempCharCounts[tempCharIndex]++;
        } // Of for i

        // Step 2. Count the distinct characters. The bound must be NUM_CHARS so
        // that it agrees with Step 3; otherwise code 255 overflows the alphabet.
        alphabetLength = 0;
        for (int i = 0; i < NUM_CHARS; i++) {
            if (tempCharCounts[i] > 0) {
                alphabetLength++;
            } // Of if
        } // Of for i

        // Step 3. Fill the alphabet, the counts, and the mapping.
        alphabet = new char[alphabetLength];
        charCounts = new int[2 * alphabetLength - 1];
        int tempCounter = 0;
        for (int i = 0; i < NUM_CHARS; i++) {
            if (tempCharCounts[i] > 0) {
                alphabet[tempCounter] = (char) i;
                charCounts[tempCounter] = tempCharCounts[i];
                charMapping[i] = tempCounter;
                tempCounter++;
            } // Of if
        } // Of for i

        System.out.println("The alphabet is: " + Arrays.toString(alphabet));
        System.out.println("Their counts are: " + Arrays.toString(charCounts));
        System.out.println("The char mappings are: " + Arrays.toString(charMapping));
    }// Of constructAlphabet

    /**
     *********************
     * Construct the tree. Repeatedly merges the two unprocessed nodes with the
     * smallest weights until only the root remains.
     *
     * @throws IllegalStateException If constructAlphabet() has not been called.
     *********************
     */
    public void constructTree() {
        if (alphabet == null || charCounts == null) {
            throw new IllegalStateException("constructAlphabet() must be called before constructTree().");
        } // Of if

        // Step 1. Allocate space for the nodes.
        int tempNumNodes = alphabetLength * 2 - 1;
        nodes = new HuffmanNode[tempNumNodes];
        boolean[] tempProcessed = new boolean[tempNumNodes];

        // Step 2. Initialize the leaves.
        for (int i = 0; i < alphabetLength; i++) {
            nodes[i] = new HuffmanNode(alphabet[i], charCounts[i], null, null, null);
        } // Of for i

        // Step 3. Construct the tree.
        for (int i = alphabetLength; i < tempNumNodes; i++) {
            // Step 3.1 Select the first minimal as the left child.
            int tempLeft = selectMinimal(tempProcessed, i);
            tempProcessed[tempLeft] = true;

            // Step 3.2 Select the second minimal as the right child.
            int tempRight = selectMinimal(tempProcessed, i);
            tempProcessed[tempRight] = true;
            System.out.println("Selecting " + tempLeft + " and " + tempRight);

            // Step 3.3 Construct the new node.
            charCounts[i] = charCounts[tempLeft] + charCounts[tempRight];
            nodes[i] = new HuffmanNode('*', charCounts[i], nodes[tempLeft], nodes[tempRight], null);

            // Step 3.4 Link the children to the new node.
            nodes[tempLeft].Parent = nodes[i];
            nodes[tempRight].Parent = nodes[i];
            System.out.println("The children of " + i + " are " + tempLeft + " and " + tempRight);
        } // Of for i
    }// Of constructTree

    /**
     * Find the unprocessed node with the smallest weight among the first
     * paraLimit nodes. Ties are resolved in favor of the smallest index.
     *
     * @param paraProcessed Flags of the nodes that already have a parent.
     * @param paraLimit     The number of nodes to inspect.
     * @return The index of the selected node, or -1 if all are processed.
     */
    private int selectMinimal(boolean[] paraProcessed, int paraLimit) {
        int tempBest = -1;
        for (int j = 0; j < paraLimit; j++) {
            if (paraProcessed[j]) {
                continue;
            } // Of if
            if (tempBest == -1 || charCounts[j] < charCounts[tempBest]) {
                tempBest = j;
            } // Of if
        } // Of for j
        return tempBest;
    }// Of selectMinimal

    /**
     *********************
     * Get the root of the binary tree.
     *
     * @return The root.
     * @throws IllegalStateException If the tree has not been constructed.
     *********************
     */
    public HuffmanNode getRoot() {
        if (nodes == null || nodes.length == 0) {
            throw new IllegalStateException("constructTree() must be called before the root is available.");
        } // Of if
        return nodes[nodes.length - 1];
    }// Of getRoot

    /**
     *********************
     * A plain pre-order visit that prints every node.
     *
     * @param paraNode The given node.
     *********************
     */
    public void preOrderVisit(HuffmanNode paraNode) {
        System.out.print("(" + paraNode.Character + ", " + paraNode.Weight + ") ");

        if (paraNode.LeftChild != null) {
            preOrderVisit(paraNode.LeftChild);
        } // Of if

        if (paraNode.RightChild != null) {
            preOrderVisit(paraNode.RightChild);
        } // Of if
    } // Of preOrderVisit

    /**
     *********************
     * Generate the code of each character in the alphabet by walking from its
     * leaf up to the root. A left edge contributes '0', a right edge '1'.
     *
     * @throws IllegalStateException If the tree has not been constructed.
     *********************
     */
    public void generateCode() {
        if (nodes == null) {
            throw new IllegalStateException("constructTree() must be called before generateCode().");
        } // Of if

        huffmanCodes = new String[alphabetLength];
        for (int i = 0; i < alphabetLength; i++) {
            // The bits are collected from leaf to root, hence in reverse order.
            StringBuilder tempReversedCode = new StringBuilder();
            for (HuffmanNode tempNode = nodes[i]; tempNode.Parent != null; tempNode = tempNode.Parent) {
                tempReversedCode.append(tempNode == tempNode.Parent.LeftChild ? '0' : '1');
            } // Of for tempNode

            String tempCharCode = tempReversedCode.reverse().toString();
            if (tempCharCode.isEmpty()) {
                // The tree consists of a single leaf. An empty code would lose
                // the number of occurrences, so one bit is used instead.
                tempCharCode = "0";
            } // Of if

            huffmanCodes[i] = tempCharCode;
            System.out.println("The code of " + alphabet[i] + " is " + tempCharCode);
        } // Of for i
    } // Of generateCode

    /**
     *********************
     * The encoding step of Huffman coding.
     *
     * @param paraString The given string.
     * @return The concatenated codes of all characters of the string.
     * @throws IllegalStateException    If generateCode() has not been called.
     * @throws IllegalArgumentException If a character is not in the alphabet.
     *********************
     */
    public String encoding(String paraString) {
        if (huffmanCodes == null) {
            throw new IllegalStateException("generateCode() must be called before encoding().");
        } // Of if

        StringBuilder resultCodeBuilder = new StringBuilder();
        for (int i = 0; i < paraString.length(); i++) {
            char tempChar = paraString.charAt(i);
            // The position of the original character in the alphabet.
            int tempIndex = (tempChar < NUM_CHARS) ? charMapping[tempChar] : -1;
            if (tempIndex < 0) {
                throw new IllegalArgumentException("Character '" + tempChar + "' (code " + (int) tempChar
                        + ") at position " + i + " is not in the alphabet.");
            } // Of if
            resultCodeBuilder.append(huffmanCodes[tempIndex]);
        } // Of for i
        return resultCodeBuilder.toString();
    }// Of encoding

    /**
     *********************
     * The decoding step of Huffman coding.
     *
     * @param paraString The given string, consisting of '0' and '1' only.
     * @return The decoded text.
     * @throws IllegalStateException    If the tree has not been constructed.
     * @throws IllegalArgumentException If the string contains a character other
     *                                  than '0' and '1', or ends in the middle
     *                                  of a code.
     *********************
     */
    public String decoding(String paraString) {
        HuffmanNode tempRoot = getRoot();
        // With one distinct character the root itself is the only leaf.
        boolean tempSingleLeaf = (tempRoot.LeftChild == null);

        StringBuilder resultBuilder = new StringBuilder();
        HuffmanNode tempNode = tempRoot;
        for (int i = 0; i < paraString.length(); i++) {
            char tempBit = paraString.charAt(i);
            if (tempBit != '0' && tempBit != '1') {
                throw new IllegalArgumentException(
                        "Invalid character '" + tempBit + "' at position " + i + "; expected '0' or '1'.");
            } // Of if

            if (tempSingleLeaf) {
                // The only valid code is "0", see generateCode().
                if (tempBit != '0') {
                    throw new IllegalArgumentException("Invalid code bit '1' at position " + i + ".");
                } // Of if
            } else if (tempBit == '0') {
                tempNode = tempNode.LeftChild;
            } else {
                tempNode = tempNode.RightChild;
            } // Of if
            System.out.println(tempNode);

            if (tempNode.LeftChild == null) {
                System.out.println("Decode one:" + tempNode);
                // Decode one char.
                resultBuilder.append(tempNode.Character);
                // Return to the root.
                tempNode = tempRoot;
            } // Of if
        } // Of for i

        if (tempNode != tempRoot) {
            throw new IllegalArgumentException("The input ends in the middle of a code.");
        } // Of if
        return resultBuilder.toString();
    }// Of decoding

    /**
     * The entrance of the program.
     *
     * @param args Optional. args[0] is the text filename; a default path is
     *             used when it is absent.
     */
    public static void main(String[] args) {
        String tempFilename = "D:/java/vstest/src/datastructure/Huffman.txt";
        if (args != null && args.length > 0) {
            tempFilename = args[0];
        } // Of if

        Huffman tempHuffman = new Huffman(tempFilename);
        tempHuffman.constructAlphabet();
        tempHuffman.constructTree();

        HuffmanNode tempRoot = tempHuffman.getRoot();
        System.out.println("The root is: " + tempRoot);
        System.out.println("Preorder visit:");
        tempHuffman.preOrderVisit(tempHuffman.getRoot());

        tempHuffman.generateCode();

        String tempCoded = tempHuffman.encoding("abcdb");
        System.out.println("Coded: " + tempCoded);
        String tempDecoded = tempHuffman.decoding(tempCoded);
        System.out.println("Decoded: " + tempDecoded);
    } // Of main
}
运行结果