题目内容
哈夫曼编码任务描述:
问题描述:利用哈夫曼编码,实现压缩和解压缩。
提高要求:
(1)能够分析文件,统计文件中出现的字符,统计字符出现的概率,再对文件进行编码,实现文件的压缩和解压缩。
(2)能够对于文件的压缩比例进行统计。
题目说明
输入要压缩的文件路径名字(默认当前路径),将输入的文本文件压缩到output.txt中,解压缩内容会输出到original.txt中,并输出压缩比。
存在问题
当前代码可以正确压缩和解压缩纯英文文本文件;对于中文文件仍存在问题:内容较少时可以正常压缩和解压缩,内容较多时,解压缩得到的文件会出现乱码。
输出例子
请输入要压缩的文件名字
input1.txt
input.txt文件打开成功!
字符对应的编码为:
111000
101111
110
! 1110101110
" 010001
' 111010010
( 10110001011110001
) 10110001011110010
* 10110001011110011
, 101001
- 010011001
. 1011100
0 101100010111111011
1 101100010110110
2 101100010110101
3 1011000101111101
4 1011000101101110
5 1011000101111100
6 101100010110100
7 1011000101101111
8 10110001011110000
9 101100010111111001
: 11101011001100
; 101100110
> 101100010111111000
? 010011000
A 10110010
B 1110100111
C 10110001111
D 010011011
E 0100110100
F 11101000011
G 11101011000
H 0100110101
I 10110000
J 1011000101100
K 1011000101110
L 11101011110
M 101100111
N 10110001110
O 11101000010
P 1011000110
Q 1011000101111111
R 10110001010
S 11101011111
T 111010001
U 1110101100111
V 11101011001101
W 1110100000
X 101100010111101
Y 11101011011
Z 1011000101111110101
[ 10110001011111101000
] 10110001011111101001
a 1000
b 1011101
c 101101
d 10101
e 000
f 011111
g 010010
h 0011
i 0010
j 11101011010
k 11101010
l 01110
m 111001
n 0101
o 0110
p 010000
q 1011000100
r 11110
s 11111
t 1001
u 111011
v 0100111
w 101000
x 1110100110
y 011110
z 111010110010
输出哈夫曼树:
下标 权值 父结点 左孩子 右孩子 结点
0 35 92 -1 -1 1
1 209919 155 -1 -1
2 3032 120 -1 -1 T
3 1157 109 -1 -1 H
4 966 109 -1 -1 E
5 630 103 -1 -1 R
6 1353 111 -1 -1 P
7 963 108 -1 -1 S
8 649 104 -1 -1 N
9 762 105 -1 -1 O
10 786 105 -1 -1 F
11 2344 117 -1 -1 D
12 3256 121 -1 -1 '
13 5413 125 -1 -1 A
14 824 106 -1 -1 G
15 948 108 -1 -1 L
16 24540 137 -1 -1
17 24540 138 -1 -1
18 66588 147 -1 -1 n
19 87051 149 -1 -1 t
20 64024 146 -1 -1 h
21 123562 152 -1 -1 e
22 20150 134 -1 -1 f
23 63505 146 -1 -1 i
24 57104 145 -1 -1 r
25 60346 145 -1 -1 s
26 2936 119 -1 -1 M
27 74835 148 -1 -1 o
28 44339 142 -1 -1 d
29 78801 149 -1 -1 a
30 19394 134 -1 -1 y
31 25641 138 -1 -1 m
32 14821 132 -1 -1 p
33 36384 141 -1 -1 l
34 21483 135 -1 -1 ,
35 29 91 -1 -1 6
36 32 91 -1 -1 2
37 25 89 -1 -1 5
38 7094 127 -1 -1 k
39 21355 135 -1 -1 w
40 29843 139 -1 -1 u
41 17994 133 -1 -1 g
42 23018 136 -1 -1 c
43 725 104 -1 -1 C
44 11892 130 -1 -1 b
45 9594 128 -1 -1 v
46 864 107 -1 -1 j
47 11708 130 -1 -1 .
48 373 102 -1 -1 z
49 135 98 -1 -1 J
50 5107 124 -1 -1 I
51 2829 119 -1 -1 ;
52 2038 116 -1 -1 -
53 1428 112 -1 -1 W
54 28 90 -1 -1 Q
55 1611 113 -1 -1 x
56 1659 113 -1 -1 B
57 1252 110 -1 -1 q
58 240 100 -1 -1 U
59 15419 132 -1 -1 "
60 125 97 -1 -1 V
61 947 107 -1 -1 Y
62 108 97 -1 -1 :
63 45 93 -1 -1 X
64 1895 115 -1 -1 !
65 11 84 -1 -1 (
66 11 85 -1 -1 )
67 175 99 -1 -1 K
68 1962 116 -1 -1 ?
69 12 85 -1 -1 *
70 4 81 -1 -1 Z
71 25 89 -1 -1 3
72 20 87 -1 -1 4
73 20 87 -1 -1 7
74 10 84 -1 -1 8
75 6 82 -1 -1 >
76 6 82 -1 -1 9
77 8 83 -1 -1 0
78 1 80 -1 -1 [
79 1 80 -1 -1 ]
80 2 81 78 79
81 6 83 80 70
82 12 86 75 76
83 14 86 81 77
84 21 88 74 65
85 23 88 66 69
86 26 90 82 83
87 40 92 72 73
88 44 93 84 85
89 50 94 37 71
90 54 94 86 54
91 61 95 35 36
92 75 95 0 87
93 89 96 88 63
94 104 96 89 90
95 136 98 91 92
96 193 99 93 94
97 233 100 62 60
98 271 101 49 95
99 368 101 67 96
100 473 102 97 58
101 639 103 98 99
102 846 106 48 100
103 1269 110 5 101
104 1374 111 8 43
105 1548 112 9 10
106 1670 114 14 102
107 1811 114 46 61
108 1911 115 15 7
109 2123 117 4 3
110 2521 118 57 103
111 2727 118 6 104
112 2976 120 53 105
113 3270 121 55 56
114 3481 122 106 107
115 3806 122 64 108
116 4000 123 68 52
117 4467 123 109 11
118 5248 124 110 111
119 5765 125 51 26
120 6008 126 112 2
121 6526 126 12 113
122 7287 127 114 115
123 8467 128 116 117
124 10355 129 50 118
125 11178 129 13 119
126 12534 131 120 121
127 14381 131 38 122
128 18061 133 123 45
129 21533 136 124 125
130 23600 137 47 44
131 26915 139 126 127
132 30240 140 32 59
133 36055 140 41 128
134 39544 141 30 22
135 42838 142 39 34
136 44551 143 129 42
137 48140 143 130 16
138 50181 144 17 31
139 56758 144 131 40
140 66295 147 132 133
141 75928 148 33 134
142 87177 150 135 28
143 92691 150 136 137
144 106939 151 138 139
145 117450 151 24 25
146 127529 152 23 20
147 132883 153 140 18
148 150763 153 27 141
149 165852 154 29 19
150 179868 154 142 143
151 224389 155 144 145
152 251091 156 21 146
153 283646 156 147 148
154 345720 157 149 150
155 434308 157 1 151
156 534737 158 152 153
157 780028 158 154 155
158 1314765 -1 156 157
压缩编码时,output.txt文件打开成功!
译码时,output.txt文件打开成功!
译码时,original.txt文件打开成功!
文件压缩成功,压缩比为:0.575841
代码
#include<iostream>
#include<cstdio>
#include<fstream>
#include<algorithm>
#include<map>
#include<vector>
using namespace std;
// Size constant used for the filename buffer in main() and, as 2*MAX - 1,
// for the number of HTNode slots allocated for the Huffman tree.
const int MAX = 256;
// Shorthand for `long long`; used for file lengths and byte indices.
#define LL long long
// One node of the Huffman tree kept in the global array `arr`.
// Links are array indices; -1 means "no such node".
struct HTNode {
    // Byte value carried by a leaf. Initialised to 0 so that internal nodes,
    // which never receive a value, do not expose an indeterminate byte when
    // the tree is printed.
    unsigned char data = 0;
    // Occurrence count (leaf) or sum of the children's weights (internal node);
    // -1 until a weight is assigned.
    int weight = -1;
    // Index of the parent node, or -1 while the node is still a root.
    int parent = -1;
    // Indices of the left / right child, or -1 for a leaf.
    int lchild = -1;
    int rchild = -1;
};
// Length in bytes of the input file; set in main() from openfile()'s return value.
LL inputlen;
// Number of distinct byte values recorded by Count(); these occupy arr[0..n).
int n = 0;
// Huffman tree storage, allocated in main() with 2*MAX - 1 slots.
// Leaves are arr[0..n), internal nodes arr[n..2n-1), and the root is arr[2n-2].
HTNode *arr;
// Code table filled by HaffmanCode(): byte value -> list of 0/1 bits, root to leaf.
map<char, vector<int>> mp;
// Raw bytes of the input file; allocated and filled by openfile().
unsigned char* char_huffman;
// Reads the named file into char_huffman and returns its length in bytes.
LL openfile(char* name);
// Counts how often each byte value occurs in char_huffman[0..filelen).
void Count(LL filelen);
// Builds the internal nodes of the Huffman tree on top of the n leaves.
void CreateHuffmanTree();
// Derives the bit code of every leaf and stores it in mp.
void HaffmanCode();
// Prints the code table and the tree array to stdout.
void outPutTree();
// Encodes char_huffman with the codes in mp and writes the result to output.txt.
void Binarycode();
// Decodes output.txt into original.txt; returns the length of output.txt.
int Translate();
// Packs eight 0/1 values (a[0] is the most significant bit) into one byte.
unsigned char int_to_char(int *a);
// Unpacks the low 8 bits of x into a[0..7], most significant bit first.
void char_to_int(int x, int* a);
int main() {
cout<<"请输入要压缩的文件名字"<<endl;
char filename[MAX];
cin.getline(filename,MAX);
arr = new HTNode[2*MAX - 1];
inputlen = openfile(filename);
Count(inputlen);
CreateHuffmanTree();
HaffmanCode();
outPutTree();
Binarycode();
int outputlen = Translate();
double rate = (double)outputlen/inputlen;
printf("\n");
printf("文件压缩成功,压缩比为:%lf",rate);
return 0;
}
// Reads the whole file `name` (binary mode) into the global buffer
// char_huffman and returns the number of bytes read.
// Terminates the program with exit(1) if the file cannot be opened, its size
// cannot be determined, or it cannot be read completely.
LL openfile(char* name){
    FILE* file1 = fopen(name,"rb");
    if(file1 == NULL){
        cout<<"打开文件失败!"<<endl;
        exit(1);
    }
    // Report the file that was actually opened rather than a fixed name.
    cout<<name<<"文件打开成功!"<<endl;
    cout<<endl;

    // Determine the size by seeking to the end; both calls can fail.
    LL filelen = -1;
    if(fseek(file1,0,SEEK_END) == 0){
        filelen = ftell(file1);
    }
    if(filelen < 0){
        fclose(file1);
        cout<<"读取文件失败!"<<endl;
        exit(1);
    }
    rewind(file1);

    // Value-initialised, so every byte starts at 0 without a separate fill loop.
    char_huffman = new unsigned char[filelen]();
    const LL got = static_cast<LL>(fread(char_huffman,1,filelen,file1));
    fclose(file1);

    // A short read would silently compress a truncated buffer; fail loudly instead.
    if(got != filelen){
        cout<<"读取文件失败!"<<endl;
        exit(1);
    }
    return filelen;
}
// Tallies the byte values in char_huffman[0..filelen).
// Each distinct value gets one leaf in arr, in order of first appearance;
// n is the number of leaves recorded so far.
void Count(LL filelen){
    for (LL pos = 0; pos < filelen; ++pos) {
        const unsigned char byte = char_huffman[pos];

        // Find the leaf that already holds this byte value, if any.
        int slot = 0;
        while (slot < n && arr[slot].data != byte) {
            ++slot;
        }

        if (slot < n) {
            // Seen before: bump its count.
            arr[slot].weight++;
        }
        else {
            // First occurrence: append a new leaf.
            arr[n].data = byte;
            arr[n].weight = 1;
            n++;
        }
    }
}
void CreateHuffmanTree() {
for (int i = n; i < 2 * n - 1; i++) {
LL min1 = 10000000; LL min2 = min1;
int x1 = -1, x2 = -1;
for (int j = 0; j < i; j++) {
if (arr[j].parent == -1) {
if (arr[j].weight < min1) {
min2 = min1;
x2 = x1;
min1 = arr[j].weight;
x1 = j;
}
else if (arr[j].weight < min2) {
min2 = arr[j].weight;
x2 = j;
}
}
}
arr[x1].parent = arr[x2].parent = i;
arr[i].weight = min1 + min2;
arr[i].lchild = x1;
arr[i].rchild = x2;
}
}
void HaffmanCode()
{
for (int i = 0; i < n; i++) {
int t = i;
vector<int> v;
int parent = arr[i].parent;
for (int j = parent; j != -1; t = j, j = arr[j].parent) {
if (arr[j].lchild == t) {
v.push_back(0);
}
else {
v.push_back(1);
}
}
reverse(v.begin(),v.end());
mp[arr[i].data]= v;
}
}
void outPutTree(){
cout<<"字符对应的编码为:"<<endl;
for(map<char,vector<int>>::iterator it = mp.begin();it != mp.end();it++){
printf("%c ",it->first);
for(int i=0;i<(*it).second.size();i++){
printf("%d",(*it).second[i]);
}
printf("\n");
}
printf("输出哈夫曼树:\n");
printf("下标\t权值\t父结点\t左孩子\t右孩子\t结点\n");
for(int i=0;i<2*n-1;i++){
printf("%d\t%d\t%d\t%d\t%d\t%c\t\n",i,arr[i].weight,arr[i].parent,arr[i].lchild,arr[i].rchild,arr[i].data);
}
printf("\n");
}
void Binarycode(){
ofstream file1("output.txt");
if(!file1.is_open()){
printf("output.txt文件打开失败\n");
exit(1);
}
printf("压缩编码时,output.txt文件打开成功!\n");
printf("\n");
int buffer[8] = {0};
int count = 0;
for(int i=0;i<inputlen;i++){
int x;
x = char_huffman[i];
for(int j=0;j<mp[x].size();j++){
if(count == 8){
file1<<int_to_char(buffer);
count = 0;
}
buffer[count] = mp[x][j];
count++;
}
}
if(count != 0){
file1<<int_to_char(buffer);
}
char lackcount = count;
file1<<lackcount;
file1.close();
}
int Translate(){
FILE* file1 = NULL;
file1 = fopen("output.txt","rb");
if(file1 == NULL){
cout<<"写入文件时,文件output.txt打开失败!"<<endl;
exit(1);
}
cout<<"译码时,output.txt文件打开成功!"<<endl;
cout<<endl;
fseek(file1,0,SEEK_END);
LL filelen = ftell(file1);
rewind(file1);
unsigned char* buf = new unsigned char[filelen];
for(int i=0;i<filelen;i++){
buf[i] = 0;
}
fread(buf,1,filelen,file1);
fclose(file1);
vector<int> source;
int a[8];
for(int i=0;i<8;i++){
a[i] = 0;
}
for(int i=0;i<filelen - 2;i++){
char_to_int(buf[i],a);
for(int j = 0 ;j<8;j++){
source.push_back(a[j]);
}
}
int lackcount = buf[filelen - 1];
char_to_int(buf[filelen - 2],a);
for(int j=0;j<lackcount;j++){
source.push_back(a[j]);
}
ofstream file2("original.txt",ios::binary);
if(!file2.is_open()){
printf("文件original.txt打开失败!");
exit(1);
}
printf("译码时,original.txt文件打开成功!");
printf("\n");
int q = 2 * n - 2;
for (int i = 0;i<source.size();i++){
if (source[i] == 0) {
q = arr[q].lchild;
}
else if (source[i] == 1) {
q = arr[q].rchild;
}
if (arr[q].lchild == -1 && arr[q].rchild == -1) {
if(arr[q].data != 13)
file2<<arr[q].data;
q = 2 * n - 2;
}
}
return filelen;
}
// Packs the eight values a[0..7] into one byte, treating a[0] as the most
// significant bit and a[7] as the least significant bit.
unsigned char int_to_char(int *a){
    int value = 0;
    for(int pos = 0; pos < 8; ++pos){
        // Shift what we have one binary place left, then append the next bit.
        value = value * 2 + a[pos];
    }
    return static_cast<unsigned char>(value);
}
// Writes the binary digits of x into a[0..7]: a[7] receives the least
// significant digit and a[0] the eighth digit from the right.
void char_to_int(int x, int* a){
    int rest = x;
    int pos = 8;
    while(pos-- > 0){
        a[pos] = rest % 2;   // lowest remaining digit
        rest /= 2;           // drop it
    }
}
代码内容参考来源