[Data Structures & Communication Principles] Computing the Information Entropy While Performing Huffman Coding

Execution result

aaaaaaabbbbbccdddd
Hello world!

Code

package com.huffman.spd.www;

import java.io.*;
import java.util.*;

public class Huffman {

	private final Map<Character, List<Boolean>> encodeMap;
	private final Map<List<Boolean>, Character> decodeMap;
	private final String fileName;

	private final List<Boolean> res;

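	/** A node of the Huffman tree: a leaf carries a character and its frequency, an internal node only the combined weight. */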
	private static class TreeNode {

		char ch;
		int weigh;
		TreeNode left;
		TreeNode right;

		public TreeNode(TreeNode left, TreeNode right) {
			weigh = left.weigh + right.weigh;
			this.left = left;
			this.right = right;
		}

		public TreeNode(Map.Entry<Character, Integer> entry) {
			ch = entry.getKey();
			weigh = entry.getValue();
		}

		public TreeNode() {
		}

		public boolean isAvailableLeaf() {
			return ch != '\0' && left == null && right == null;
		}

		public int getWeigh() {
			return weigh;
		}

	}

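	/** Reads the text file at {@code path}, counts character frequencies, builds the Huffman tree and derives the code tables. */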
	public Huffman(String path) {

		String[] wholeName = path.split("\\.");
		fileName = wholeName[0];
		if (!wholeName[1].equals("txt")) {
			throw new IllegalArgumentException("Invalid file format! Please choose another file or change the file extension.");
		}

		String str = readString(path);
		Map<Character, Integer> map = new HashMap<>();

		for (char ch : str.toCharArray()) {
			map.put(ch, map.containsKey(ch) ? map.get(ch) + 1 : 1);
		}

		Queue<TreeNode> pQueue = new PriorityQueue<>(Comparator.comparingInt(TreeNode::getWeigh));

		for (Map.Entry<Character, Integer> entry : map.entrySet()) {
			pQueue.add(new TreeNode(entry));
		}

		TreeNode root = new TreeNode();

		/* Degenerate case: a text with only one distinct character still needs a one-bit code;
		   without this guard encodeMap would stay empty and encode() would fail. */
		if (pQueue.size() == 1) {
			root = new TreeNode(pQueue.poll(), new TreeNode());
		}

		while (pQueue.size() > 1) {
			root = new TreeNode(pQueue.poll(), pQueue.poll());
			pQueue.offer(root);
		}

		encodeMap = new HashMap<>();
		decodeMap = new HashMap<>();
		res = new ArrayList<>();

		setMaps(root);

	}

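	/** Depth-first traversal of the Huffman tree: {@code res} holds the bit path so far (false = left, true = right); every available leaf records its code in both maps. */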
	private void setMaps(TreeNode root) {

		if (root == null) return;

		if (root.isAvailableLeaf()) {
			List<Boolean> bits = new ArrayList<>(res);
			encodeMap.put(root.ch, bits);
			decodeMap.put(bits, root.ch);
			return;
		}

		res.add(false);
		setMaps(root.left);
		res.remove(res.size() - 1);

		res.add(true);
		setMaps(root.right);
		res.remove(res.size() - 1);

	}

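	/** Encodes the text in {@code name}.txt, packs the bits into bytes (most significant bit first, last byte zero-padded), writes them to {@code name}.hfm and returns the number of significant bits. */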
	public int encode(String name) throws IOException {

		List<Boolean> bits = new ArrayList<>();
		String text = readString(name + ".txt");

		for (char ch : text.toCharArray()) {
			for (boolean b : encodeMap.get(ch)) {
				bits.add(b);
			}
		}

		Iterator<Boolean> itr = bits.listIterator();

		byte[] arr = new byte[(bits.size() >>> 3) + ((bits.size() & 0b111) == 0 ? 0 : 1)];
		int j = 0;

		while (itr.hasNext()) {
			byte b = 0;
			for (int i = 0; i < 8; i++) {
				b <<= 1;
				if (itr.hasNext() && itr.next()) b++;
			}
			arr[j++] = b;
		}

		ByteArrayInputStream bais = new ByteArrayInputStream(arr);

		int len;
		/* 16-byte copy buffer */
		byte[] buffer = new byte[0x10];

		/* try-with-resources closes the output file even if writing fails */
		try (FileOutputStream fos = new FileOutputStream(name + ".hfm")) {
			while ((len = bais.read(buffer)) != -1) {
				fos.write(buffer, 0, len);
			}
			fos.flush();
		}

		return bits.size();

	}

	public String decode(String name, int length) throws IOException {

		List<Boolean> bits = new ArrayList<>();

		int len;
		byte[] buffer = new byte[0x10];

		/* try-with-resources closes the input file even if reading fails */
		try (FileInputStream fis = new FileInputStream(name + ".hfm")) {
			while ((len = fis.read(buffer)) != -1) {
				for (int i = 0; i < len; i++) {
					/* walk a one-bit mask from the most significant bit down, stopping after `length` bits */
					for (byte j = -128; j != 0 && length > 0; j = (byte) ((j > 0 ? j : -j) >> 1), length--) {
						bits.add((buffer[i] & j) != 0);
					}
				}
				if (length <= 0) break;
			}
		}

		StringBuilder sb = new StringBuilder();
		List<Boolean> flush = new ArrayList<>();

		for (boolean b : bits) {
			flush.add(b);
			Character ch = decodeMap.get(flush);
			if (ch != null) {
				sb.append(ch);
				flush.clear();
			}
		}

		writeString(fileName + ".txt", sb.toString());
		return sb.toString();

	}

	public int encode() throws IOException {
		return encode(fileName);
	}

	public String decode(int length) throws IOException {
		return decode(fileName, length);
	}

	private static String readString(String path) {

		StringBuilder sb = new StringBuilder();
		String str;

		try(BufferedReader br =
				new BufferedReader(
						new InputStreamReader(
								new FileInputStream(path)))) {

			boolean flag = false;

			while ((str = br.readLine()) != null) {
				if (flag) {
					sb.append('\n');
				}
				flag = true;
				sb.append(str);
			}

		} catch (IOException e) {
			System.err.println("Failed to read file");
		}

		return sb.toString();

	}

	private static void writeString(String path, String str) {

		try(BufferedWriter bw =
				new BufferedWriter(
						new OutputStreamWriter(
								new FileOutputStream(path)))) {

			bw.write(str);
			bw.flush();

		} catch (IOException e) {
			System.err.println("Failed to write file");
		}

	}

	private static StringBuilder bitsString(List<Boolean> bits) {
		StringBuilder sb = new StringBuilder();
		for (boolean b : bits) {
			sb.append(b ? '1' : '0');
		}
		return sb;
	}

	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append(fileName).append('\n');
		for (Map.Entry<Character, List<Boolean>> entry : encodeMap.entrySet()) {
			sb.append(entry.getKey()).append(" = ").append(bitsString(entry.getValue())).append('\n');
		}
		return sb.toString();
	}

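	/** Shannon entropy of the string in bits per character: H = -sum over c of p(c) * log2 p(c), where p(c) is the relative frequency of character c. */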
	public static double infoEntropy(String str) {

		Map<Character, Integer> map = new HashMap<>();
		double entropy = 0;

		for (char ch : str.toCharArray()) {
			map.put(ch, map.containsKey(ch) ? map.get(ch) + 1 : 1);
		}

		double log2 = Math.log(2);

		for (char ch : map.keySet()) {
			double p = (double) map.get(ch) / (double) str.length();
			entropy -= p * Math.log(p) / log2;
		}

		return entropy;

	}

	public static void main(String[] args) throws IOException {

		Huffman huffman = new Huffman("text.txt");
		int size = huffman.encode();
		String str = huffman.decode(size);
		int originSize = str.getBytes().length * 8;
		double entropy = Huffman.infoEntropy(str);

		System.out.println((str.length() > 0x20 ? "The text" : "The text \"" + str + "\"")
				+ " originally occupied " + originSize + " bits; after Huffman compression it occupies " + size + " bits.");
		System.out.printf("Average bits per character: %.4f\n", (double) size / (double) str.length());
		System.out.printf("Compression ratio: %.3f%%\n", (double) size / (double) originSize * 100);
		System.out.printf("Information entropy: %.4f\n", entropy);
		System.out.printf("The entropy is %.4f%% of the average number of bits per character.\n", entropy / ((double) size / (double) str.length()) * 100);

	}

}
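
For reference, infoEntropy implements the Shannon entropy formula, i.e. the average number of bits of information carried by each character of the text. As a hand-worked check (assuming, since the post does not show the actual program output, that the sample string aaaaaaabbbbbccdddd from "Execution result" is the encoded text), the counts are a: 7, b: 5, c: 2, d: 4 out of 18 characters:

H = -\sum_i p_i \log_2 p_i
  = -\left( \tfrac{7}{18}\log_2\tfrac{7}{18} + \tfrac{5}{18}\log_2\tfrac{5}{18} + \tfrac{2}{18}\log_2\tfrac{2}{18} + \tfrac{4}{18}\log_2\tfrac{4}{18} \right)
  \approx 1.8776 \text{ bits/character}

The Huffman tree built from these weights assigns 1 bit to a, 2 bits to b, and 3 bits each to c and d, i.e. 35/18 ≈ 1.944 bits per character on average, which lies just above the entropy, as the source coding theorem requires.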

Possible optimization

Since Java has no built-in one-to-one (bidirectional) mapping structure, I resorted to two maps to implement the mapping. This is clearly inefficient: lookups are fast, but the duplicated entries add a large memory overhead that should not be necessary.

So, just as this code builds its own binary tree, we should build our own one-to-one mapping data structure; one possible sketch follows below.
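
One concrete way to do this, sketched below: the Huffman tree itself is already a one-to-one structure, so decoding can walk it bit by bit and decodeMap (together with its memory overhead) disappears entirely. This is only a rough sketch and assumes the constructor keeps the tree in a hypothetical field named root, which the original class does not have.

	/* Sketch only: assumes a new field `root` storing the Huffman tree built in the constructor. */
	private String decodeWithTree(List<Boolean> bits) {
		StringBuilder sb = new StringBuilder();
		TreeNode node = root;
		for (boolean bit : bits) {
			node = bit ? node.right : node.left;   // same convention as setMaps: false = left, true = right
			if (node.isAvailableLeaf()) {          // reached a character: emit it and restart from the root
				sb.append(node.ch);
				node = root;
			}
		}
		return sb.toString();
	}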
