In 1953, David A. Huffman published his paper "A Method for the Construction of Minimum-Redundancy Codes", and hence printed his name in the history of computer science. As a professor who gives the final exam problem on Huffman codes, I am encountering a big problem: the Huffman codes are NOT unique. For example, given a string "aaaxuaxz", we can observe that the frequencies of the characters 'a', 'x', 'u' and 'z' are 4, 2, 1 and 1, respectively. We may either encode the symbols as {'a'=0, 'x'=10, 'u'=110, 'z'=111}, or in another way as {'a'=1, 'x'=01, 'u'=001, 'z'=000}, both compress the string into 14 bits. Another set of code can be given as {'a'=0, 'x'=11, 'u'=100, 'z'=101}, but {'a'=0, 'x'=01, 'u'=011, 'z'=001} is NOT correct since "aaaxuaxz" and "aazuaxax" can both be decoded from the code 00001011001001. The students are submitting all kinds of codes, and I need a computer program to help me determine which ones are correct and which ones are not.
Input Specification:
Each input file contains one test case. For each case, the first line gives an integer N (2≤N≤63), then followed by a line that contains all the N distinct characters and their frequencies in the following format:
c[1] f[1] c[2] f[2] ... c[N] f[N]
where c[i] is a character chosen from {'0' - '9', 'a' - 'z', 'A' - 'Z', '_'}, and f[i] is the frequency of c[i] and is an integer no more than 1000. The next line gives a positive integer M (≤1000), then followed by M student submissions. Each student submission consists of N lines, each in the format:
c[i] code[i]
where c[i] is the i-th character and code[i] is a non-empty string of no more than 63 '0's and '1's.
Output Specification:
For each test case, print in each line either "Yes" if the student's submission is correct, or "No" if not.
Note: The optimal solution is not necessarily generated by Huffman algorithm. Any prefix code with code length being optimal is considered correct.
Sample Input:
7
A 1 B 1 C 1 D 3 E 3 F 6 G 6
4
A 00000
B 00001
C 0001
D 001
E 01
F 10
G 11
A 01010
B 01011
C 0100
D 011
E 10
F 11
G 00
A 000
B 001
C 010
D 011
E 100
F 101
G 110
A 00000
B 00001
C 0001
D 001
E 00
F 10
G 11
Sample Output:
Yes
Yes
No
No
个人分析:哈夫曼树的应用。帮助题目中的老师判断:1.学生提交是否为哈夫曼树 2.同组编码哈夫曼树可能不同,要求能正确识别。
乍一看:
对于第一个点,可以通过构造一个函数,判断学生输入是否符合前缀码条件来得到实现。
对于第二个点,想到同组编码的不同哈夫曼树,都是可以通过左右儿子(非叶结点)交换得到的,可以根据这个特点进行判断。
然而,问题并没有这么简单:是前缀码/是哈夫曼树,这两者之间并不是充分必要关系。即使是前缀码,也不一定就是哈夫曼树:例如样例中的第三份提交(A~G全部使用3位等长编码)是前缀码,但总编码长度为63,大于最优值53,因此不是正确答案。
其次,第二个点,判断左右儿子交换后是否同构,也是多此一举。。。。。(当时写的时候写了好久T T泪目),只要同时满足最小wpl+前缀码条件,就一定是正确的哈夫曼树了。。。。
而学生的wpl值是相对容易求得的,只需要根据学生输入编码的长度*频次,再加和就可以了。而正确的最小wpl,可以通过自己建立一棵哈夫曼树,再以此计算得到。那么最大的问题就是:如何判断前缀码条件了。
所谓前缀码,也就是在一堆编码串里,任何一个串都不能是另一个串的前缀(注意是前缀,而不是任意位置的子串)。比如a:01,b:011,c:1这样就是不符合条件的:a是b的前缀,011既可以对应b,也可以对应ac。那么通过把所有编码串两两对比,判断是否存在一个串是另一个串的前缀,若没有,就是正确的前缀码,如果有那就不是。
上代码:
#include<stdbool.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
/* Upper bound used for the heap capacity and, plus one for the terminator,
   for the size of each student code buffer in main. */
#define Maxsize 63
/* Running total of the merged-node weights added by Huffman(); main compares
   each student's total encoded length against it. */
int number1=0;
// Huffman tree definition: HuffmanTree is a pointer to a struct TreeNode.
typedef struct TreeNode *HuffmanTree;
struct TreeNode{
int weight; // weight of the node; main stores a symbol frequency here for leaves
HuffmanTree left,right; // child subtrees; main sets both to NULL for leaves
};
// Min-heap definition: MinHeap is a pointer to a struct HNode.
typedef struct HNode *MinHeap;
struct HNode{
    /* Array of nodes stored by value. Index 0 holds a sentinel, real
       elements occupy indices 1..size. Spelled with the `struct` keyword so
       the declaration is valid in C as well as C++. */
    struct TreeNode *Element;
    int size;     /* number of elements currently stored */
    int capacity; /* maximum number of elements (excluding the sentinel) */
};
// Restore min-heap order below index i by sifting that element down.
/*
 * H: heap whose elements live in H->Element[1..H->size].
 * i: index of the element that may be larger than its children.
 *
 * The element is lifted out, smaller children are moved up into the hole,
 * and the element is dropped in where it is no larger than both children.
 */
void percDown(MinHeap H,int i)
{
    struct TreeNode moving = H->Element[i];
    int hole = i;

    while (hole * 2 <= H->size)
    {
        int smaller = hole * 2;

        /* Prefer the right child when it exists and is strictly lighter. */
        if (smaller != H->size &&
            H->Element[smaller + 1].weight < H->Element[smaller].weight)
        {
            smaller++;
        }
        if (H->Element[smaller].weight >= moving.weight)
        {
            break;
        }
        H->Element[hole] = H->Element[smaller];
        hole = smaller;
    }
    H->Element[hole] = moving;
}
/*
 * Turn the unordered elements H->Element[1..H->size] into a min-heap by
 * sifting down every node that has at least one child, from the last such
 * node back to the root.
 */
void BuildMinHeap(MinHeap H)
{
    int node = H->size / 2;

    while (node > 0)
    {
        percDown(H, node);
        node--;
    }
}
//堆的插入,删除,判满,判空,初始化函数
MinHeap CreateHeap()
{
MinHeap H=(MinHeap)malloc(sizeof(struct HNode));
H->Element=(TreeNode*)malloc((Maxsize+1)*sizeof(TreeNode));
H->size=0;
H->capacity=Maxsize;
H->Element[0].weight=0;
return H;
}
/* Report whether the heap already holds `capacity` elements. */
bool IsFull(MinHeap H)
{
    if (H->size == H->capacity)
    {
        return true;
    }
    return false;
}
/*
 * Insert a copy of node *T into the min-heap H.
 *
 * The node is copied by value, so the caller keeps ownership of T itself;
 * the child pointers inside the copy are shared with *T.
 *
 * Returns false (after printing a message) when the heap is full,
 * true otherwise.
 */
bool Insert(MinHeap H,HuffmanTree T)
{
    int slot;

    if (IsFull(H))
    {
        printf("最小堆已满");
        return false;
    }

    H->size += 1;
    slot = H->size;
    /* Shift heavier ancestors down; the sentinel in slot 0 ends the walk. */
    while (H->Element[slot / 2].weight > T->weight)
    {
        H->Element[slot] = H->Element[slot / 2];
        slot /= 2;
    }
    H->Element[slot] = *T;
    return true;
}
/* Report whether the heap currently holds no elements. */
bool IsEmpty(MinHeap H)
{
    if (H->size == 0)
    {
        return true;
    }
    return false;
}
/*
 * Remove the minimum-weight element from the heap and return it.
 *
 * The returned node is a freshly malloc'ed copy of the removed element;
 * the caller owns it and is responsible for freeing it.
 *
 * Returns NULL, leaving the heap untouched, when the heap is empty (a
 * message is printed) or when memory for the copy cannot be allocated.
 */
HuffmanTree DeleteMin(MinHeap H)
{
    int parent, child;
    HuffmanTree MinItem;
    struct TreeNode last; /* last element, re-inserted from the root down */

    if (IsEmpty(H))
    {
        printf("最小堆已为空");
        return NULL; /* nothing to remove; do not read past the sentinel */
    }

    MinItem = (HuffmanTree)malloc(sizeof(struct TreeNode));
    if (MinItem == NULL)
    {
        return NULL;
    }

    *MinItem = H->Element[1];
    last = H->Element[H->size];
    H->size = H->size - 1;

    for (parent = 1; parent * 2 <= H->size; parent = child)
    {
        child = parent * 2;
        /* Pick the lighter of the two children when a right child exists. */
        if ((child != H->size) &&
            (H->Element[child].weight) > (H->Element[child + 1].weight))
        {
            child++;
        }
        if (last.weight <= H->Element[child].weight)
        {
            break;
        }
        H->Element[parent] = H->Element[child];
    }
    H->Element[parent] = last;
    return MinItem;
}
/*
 * Build a Huffman tree from the elements already stored in H.
 *
 * The two lightest nodes are repeatedly removed and merged until a single
 * node remains. The weight of every merged node is added to the global
 * number1; the sum of all merged-node weights equals the weighted path
 * length of the tree. number1 is not reset here, so it accumulates across
 * calls.
 *
 * Returns the root of the tree (owned by the caller), or NULL if a node
 * could not be obtained from the heap.
 */
HuffmanTree Huffman(MinHeap H)
{
    struct TreeNode merged; /* Insert copies by value, so no heap allocation is needed */
    int merges;
    int i;

    BuildMinHeap(H);
    merges = H->size - 1; /* n leaves need n-1 merges */
    for (i = 0; i < merges; i++)
    {
        merged.left = DeleteMin(H);
        merged.right = DeleteMin(H);
        if (merged.left == NULL || merged.right == NULL)
        {
            return NULL;
        }
        merged.weight = merged.left->weight + merged.right->weight;
        number1 += merged.weight; /* accumulate the weighted path length */
        Insert(H, &merged);
    }
    return DeleteMin(H);
}
/*
 * Report whether one code string is a prefix of the other.
 *
 * Both strings are walked together over their common leading part; if
 * either one ends at the first point of difference (or both end together),
 * that string is a prefix of the other and the result is true.
 */
bool Compare(char *c1,char *c2)
{
    int pos = 0;

    for (;;)
    {
        if (c1[pos] == '\0' || c1[pos] != c2[pos])
        {
            break;
        }
        pos++;
    }
    if (c1[pos] == '\0')
    {
        return true;
    }
    return c2[pos] == '\0';
}
/*
 * Return the number of characters in the NUL-terminated code string a.
 * Uses the standard strlen instead of a hand-written counting loop.
 */
int code_length(char *a)
{
    return (int)strlen(a);
}
int main()
{
int N,M;
scanf("%d",&N);
char c[N];int data[N]; //data[]计频次
getchar();
MinHeap H=CreateHeap();//初始化堆,并将哈夫曼树的结点权值放在堆中
for(int i=1;i<=N;i++)
{
if(i<N)
{
scanf("%c %d ",&c[i],&data[i-1]); //由于最小堆的第一个元素是哨兵,循环从i=1开始做
} //同时data[]从0下标开始计
else
scanf("%c %d",&c[i],&data[i-1]); //其实这个c[]并没有什么软用..就是为了读掉输入
H->Element[i].weight=data[i-1];
H->Element[i].left=NULL;
H->Element[i].right=NULL;
H->size++;
}
//先根据字符频次,得出其哈夫曼树,计算得wpl
HuffmanTree T;
T=Huffman(H);
//储存学生输入
getchar();
scanf("%d",&M);
getchar();
char ch[N],code[N][Maxsize+1];
for(int j=0;j<M;j++)
{ int flag=0;
int number2=0;
for(int k=0;k<N;k++)
{
scanf("%c %s",&ch[k],&code[k]);
getchar();
}
//计算学生输入码的wpl
for(int i=0;i<N;i++)
{
number2+=(code_length(code[i]))*data[i];
}
if(number2>number1&&flag==0)
{
printf("No\n");
flag=1;
}
else
{ //判断学生码是否为前缀码
for(int i=1;i<N;i++)
{
for(int j=0;j<i;j++)
{
if(Compare(code[i],code[j])&&flag==0)
{
printf("No\n");
flag=1;
}
}
}
if(flag==0)
printf("Yes\n");
}
}
}
检测结果:
总结:这道题很有意义,菜鸡感觉自己写的东西真的有点用了。至少,能得到最短最节省空间的编码。很开心也很开心~哈夫曼老人家1953年就写了这篇paper,感觉自己仿佛活在梦里= =这都差了60年了,我才学明白。。。anyway,再接再厉!!