赫夫曼编码-数据压缩
1,首先创建一个需要压缩的字符串 String str = "i like like like java do you like a java";
2,赫夫曼编码的思路是:出现次数越多的字符离根节点越近(编码越短),出现次数越少的字符离根节点越远(编码越长),使整棵树的带权路径长度(WPL)最小
3,然后左路径用0、右路径用1表示,从根节点走到叶子节点经过的0/1序列就是该字符自己的编码
4,用这些编码(由0和1组成的字符串)替换原来的字符,并拼接成一个长的01字符串
5,再把这个01字符串每8位分成一组,每组按二进制转成一个byte字节
6,大概就是这种方式去压缩
下面开始撸代码
1–先要创建节点才可以用赫夫曼树
/**
 * A node of a Huffman tree.
 *
 * <p>Nodes must be comparable so that a collection of them can be sorted; the
 * ordering is by {@code weight} only.
 */
class Node implements Comparable<Node> {
    /** The byte this node stands for; {@code null} marks a non-leaf node. */
    Byte data;
    /** How many times the byte occurs; this is the value compared in {@link #compareTo(Node)}. */
    int weight;
    Node left;
    Node right;

    /**
     * Creates a node without children.
     *
     * @param data   the byte represented by the node, or {@code null} for a non-leaf node
     * @param weight the weight used for ordering
     */
    public Node(Byte data, int weight) {
        this.data = data;
        this.weight = weight;
    }

    /**
     * Orders nodes by ascending weight.
     *
     * <p>Uses {@link Integer#compare(int, int)} rather than subtraction, because
     * {@code a - b} overflows (and flips sign) when the operands are far apart.
     */
    @Override
    public int compareTo(Node o) {
        return Integer.compare(this.weight, o.weight);
    }

    @Override
    public String toString() {
        return "Node [data=" + data + ", weight=" + weight + "]";
    }

    /** Prints this node, then its left subtree, then its right subtree (pre-order). */
    public void preOrder() {
        System.out.println(this);
        if (this.left != null) {
            this.left.preOrder();
        }
        if (this.right != null) {
            this.right.preOrder();
        }
    }
}
/**
 * Huffman-coding demo: counts byte frequencies, builds a Huffman tree, derives a
 * bit-string code per byte and packs the concatenated codes into a byte array.
 */
public class HuffmanCode {
    public static void main(String[] args) {
        String str = "i like like like java do you like a java";
        // Pin the charset so the result does not depend on the platform default.
        byte[] bytes = str.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        byte[] huffmanCodesBytes = huffmanZip(bytes);
        System.out.println(Arrays.toString(huffmanCodesBytes));
    }

    /**
     * Compresses {@code bytes} with Huffman coding.
     *
     * @param bytes the raw data; an empty array yields an empty result
     * @return the packed Huffman bit stream, 8 code bits per output byte
     */
    private static byte[] huffmanZip(byte[] bytes) {
        // 1. Count how often each byte occurs and wrap every distinct byte in a leaf node.
        List<Node> list = getNodes(bytes);
        // 2. Merge the leaves into a Huffman tree.
        Node node = createHuffmanTree(list);
        // 3. Walk the tree to obtain the 0/1 code string of every byte.
        Map<Byte, String> huffmanCodes = getCodes(node);
        // 4. Replace every byte by its code and pack the bits.
        return zip(bytes, huffmanCodes);
    }

    /**
     * Replaces each input byte by its code string and packs the resulting bit string
     * into bytes, 8 bits at a time.
     *
     * <p>NOTE: the final group may hold fewer than 8 bits and is parsed as-is, so its
     * leading zeros are not recoverable from the returned array alone.
     *
     * @param bytes        the raw data
     * @param huffmanCodes code string (only '0'/'1' characters) for every byte in {@code bytes}
     * @return the packed bytes
     * @throws IllegalArgumentException if a byte of {@code bytes} has no entry in {@code huffmanCodes}
     */
    private static byte[] zip(byte[] bytes, Map<Byte, String> huffmanCodes) {
        StringBuilder sb = new StringBuilder();
        for (byte b : bytes) {
            String code = huffmanCodes.get(b);
            if (code == null) {
                // Appending null would silently add the text "null" and fail later in parseInt.
                throw new IllegalArgumentException("No Huffman code for byte " + b);
            }
            sb.append(code);
        }

        // One output byte per started group of 8 bits.
        int len = (sb.length() + 7) / 8;
        byte[] huffmanCodeBytes = new byte[len];
        int index = 0;
        for (int i = 0; i < sb.length(); i += 8) {
            // The last group may be shorter than 8 bits.
            int end = Math.min(i + 8, sb.length());
            // Parse as int first: an 8-bit string such as "10101000" exceeds Byte.MAX_VALUE.
            huffmanCodeBytes[index] = (byte) Integer.parseInt(sb.substring(i, end), 2);
            index++;
        }
        return huffmanCodeBytes;
    }

    /** Code table of the most recent {@link #getCodes(Node)} call: byte -> string of 0s and 1s. */
    static Map<Byte, String> huffmanCodes = new HashMap<>();
    /** Empty prefix handed to the recursive walk as its starting path. */
    static StringBuilder stringBuilder = new StringBuilder();

    /**
     * Builds the code table for the tree rooted at {@code root}.
     *
     * <p>The shared table is cleared first so that codes from an earlier call do not
     * leak into this one. A tree consisting of a single leaf gets the code {@code "0"},
     * because the walk below would otherwise record no code for it.
     *
     * @param root root of the Huffman tree, may be {@code null}
     * @return the code table (empty when {@code root} is {@code null})
     */
    private static Map<Byte, String> getCodes(Node root) {
        huffmanCodes.clear();
        if (root == null) {
            return huffmanCodes;
        }
        if (root.left == null && root.right == null) {
            if (root.data != null) {
                huffmanCodes.put(root.data, "0");
            }
            return huffmanCodes;
        }
        // Left branch contributes "0", right branch contributes "1".
        getCodes(root.left, "0", stringBuilder);
        getCodes(root.right, "1", stringBuilder);
        return huffmanCodes;
    }

    /**
     * Recursive walk that records the code of every leaf below {@code node}.
     *
     * @param node          current node; {@code null} ends the recursion
     * @param code          the step taken to reach {@code node}: "0" for left, "1" for right
     * @param stringBuilder the path from the root to the parent of {@code node}; not modified
     */
    private static void getCodes(Node node, String code, StringBuilder stringBuilder) {
        if (node == null) {
            return;
        }
        // Copy the prefix so sibling branches do not see each other's appended steps.
        StringBuilder path = new StringBuilder(stringBuilder);
        path.append(code);
        if (node.data == null) {
            // Non-leaf node: keep descending.
            getCodes(node.left, "0", path);
            getCodes(node.right, "1", path);
        } else {
            // Leaf: the accumulated path is the code of this byte.
            huffmanCodes.put(node.data, path.toString());
        }
    }

    /**
     * Counts the occurrences of every byte and returns one leaf node per distinct byte,
     * weighted by its count.
     *
     * @param bytes the raw data
     * @return the leaf nodes, in the iteration order of the counting map
     */
    private static List<Node> getNodes(byte[] bytes) {
        Map<Byte, Integer> counts = new HashMap<>();
        for (byte b : bytes) {
            counts.merge(b, 1, Integer::sum);
        }
        List<Node> nodes = new ArrayList<>();
        for (Map.Entry<Byte, Integer> entry : counts.entrySet()) {
            nodes.add(new Node(entry.getKey(), entry.getValue()));
        }
        return nodes;
    }

    /**
     * Builds a Huffman tree by repeatedly merging the two lightest nodes.
     *
     * <p>The passed list is consumed: on return it holds only the root (or is still empty).
     *
     * @param nodes the leaf nodes
     * @return the root of the tree, or {@code null} when {@code nodes} is empty
     */
    private static Node createHuffmanTree(List<Node> nodes) {
        if (nodes.isEmpty()) {
            return null;
        }
        // Repeat until only the root is left.
        while (nodes.size() > 1) {
            Collections.sort(nodes);
            // Take the two smallest nodes and hang them under a new non-leaf parent.
            Node left = nodes.remove(0);
            Node right = nodes.remove(0);
            Node parent = new Node(null, left.weight + right.weight);
            parent.left = left;
            parent.right = right;
            nodes.add(parent);
        }
        return nodes.get(0);
    }

    /** Prints the tree in pre-order, or a notice when {@code root} is {@code null}. */
    public static void preOrder(Node root) {
        if (root != null) {
            root.preOrder();
        } else {
            System.out.println("空数组");
        }
    }
}
ps-压缩文件也不过如此
写博客几次为了记住并且写出代码
到头来还是那样只能更明确地读懂这些代码了而已
再接再厉