/**
 * This is an implementation of Huffman coding.
 *
 * The core algorithm is taken from the CLR book (Introduction to Algorithms),
 * Chapter 16.3, and directly used to implement the 'build_tree()' routine.
 *
 * After the tree is built, a code table that maps a character to a binary
 * code is built from the tree, and used for encoding text. Decoding is done
 * by traversing the Huffman tree, as prescribed by the algorithm.
 *
 * Binary codes are represented by std::vector<bool>, which is a specialized
 * vector that optimizes space.
 */

#include <sys/time.h>  // gettimeofday, timeval

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <deque>
#include <fstream>
#include <functional>  // std::ref
#include <future>      // std::future std::promise
#include <iostream>
#include <map>
#include <queue>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <utility>
#include <vector>

using namespace std;

// A Huffman tree node.
//
// A node owns its two children: the destructor deletes both subtrees, so
// deleting the root releases the whole tree. Copying is disabled because a
// copied node would delete the same children a second time.
struct HuffmanTree {
    char c;                     // character in an alphabet
    int cfreq;                  // frequency of c
    struct HuffmanTree *left;   // owned left child, nullptr when absent
    struct HuffmanTree *right;  // owned right child, nullptr when absent

    HuffmanTree(char c, int cfreq, struct HuffmanTree *left = nullptr,
                struct HuffmanTree *right = nullptr)
        : c(c), cfreq(cfreq), left(left), right(right) {}

    ~HuffmanTree() {
        // Deleting a null pointer is a no-op, so leaves need no special case.
        delete left;
        delete right;
    }

    HuffmanTree(const HuffmanTree &) = delete;
    HuffmanTree &operator=(const HuffmanTree &) = delete;

    // Orders tree nodes by descending frequency, so that a
    // std::priority_queue using it yields the lowest frequency first.
    class Compare {
    public:
        bool operator()(const HuffmanTree *a, const HuffmanTree *b) const {
            return a->cfreq > b->cfreq;
        }
    };
};

/**
 * Builds a Huffman tree from an alphabet given as a vector of
 * (character, frequency) pairs.
 *
 * Returns the root of a newly allocated tree that the caller must delete,
 * or nullptr when the alphabet is empty.
 *
 * A one-symbol alphabet gets a zero-weight twin leaf carrying the same
 * character, so the root always has two children and the symbol receives a
 * one-bit code instead of an empty one.
 */
HuffmanTree *build_tree(std::vector< std::pair<char, unsigned> > &alph) {
    // Min-heap on frequency: HuffmanTree::Compare puts the lightest node on top.
    std::priority_queue<HuffmanTree *, std::vector<HuffmanTree *>,
                        HuffmanTree::Compare> alph_heap;

    // Build leaf nodes first.
    for (const auto &entry : alph) {
        alph_heap.push(new HuffmanTree(entry.first, static_cast<int>(entry.second)));
    }

    if (alph_heap.empty()) {
        return nullptr;
    }
    if (alph_heap.size() == 1) {
        // Without a sibling the merge loop below would never run.
        alph_heap.push(new HuffmanTree(alph_heap.top()->c, 0));
    }

    // Huffman algorithm: merge the two lowest-weight nodes until only one
    // node (the root) is left.
    while (alph_heap.size() > 1) {
        HuffmanTree *l = alph_heap.top();
        alph_heap.pop();
        HuffmanTree *r = alph_heap.top();
        alph_heap.pop();
        alph_heap.push(new HuffmanTree(0, l->cfreq + r->cfreq, l, r));
    }

    return alph_heap.top();
}

/**
 * Prints the tree nodes in breadth-first order.
 *
 * Writes a "Level N" header to std::cout whenever a new depth is reached,
 * followed by one "<frequency> <character>" line per node. Prints nothing
 * for a null tree.
 */
void print_tree(HuffmanTree *t) {
    if (!t) {
        return;
    }

    // Each queue entry pairs a node with its depth.
    std::deque< std::pair<HuffmanTree *, int> > q;
    q.push_back(std::make_pair(t, 0));

    int curlevel = -1;
    while (!q.empty()) {
        HuffmanTree *parent = q.front().first;
        const int level = q.front().second;
        q.pop_front();

        if (curlevel != level) {
            curlevel = level;
            std::cout << "Level " << curlevel << std::endl;
        }
        std::cout << parent->cfreq << " " << parent->c << std::endl;

        if (parent->left)
            q.push_back(std::make_pair(parent->left, level + 1));
        if (parent->right)
            q.push_back(std::make_pair(parent->right, level + 1));
    }
}

// A sequence of bits: one Huffman code, or a whole encoded stream.
using code_t = std::vector<bool>;

// Lookup table mapping a character to its Huffman code.
using codetable = std::map<char, code_t>;

/*** Makes a lookup table (std::map) of (c -> code) from a HuffmanTree, where* code is an unsigned long representing the binary code.*/

map build_lookup_table(HuffmanTree *htree) {

codetable lookup; // 二进制表 char -> code deque< pair > q;

q.push_back(make_pair(htree, code_t()));

while (!q.empty()) {

HuffmanTree *node, *lc, *rc;

code_t code;

node = q.front().first;

code = q.front().second;


lc = node->left;

rc = node->right;

if (lc) {

// HuffmanTree is always full (either no children or two children) // Left child is appended a 0 and right child a 1. code_t code_cp(code);

q.push_back(make_pair(lc, (code.push_back(0), code)));

q.push_back(make_pair(rc, (code_cp.push_back(1), code_cp)));

} else {

// Leaf node: contains the character lookup.insert(make_pair(node->c, code));

cout << "(" << node->c << ", ";

for (unsigned i = 0; i < code.size(); i++) {

cout << code[i];


cout << ")" << endl;



return lookup;


/**
 * Encodes an input string by appending the code of every character to
 * @result.
 *
 * @param input   text to encode
 * @param lookup  character -> code table; only read, never modified
 * @param result  bit vector the codes are appended to (existing bits are kept)
 * @return a copy of @result after encoding. The function is declared to
 *         return a bit vector, so it must not fall off its end. Callers
 *         that only need the out-parameter can ignore the copy.
 *
 * A character with no entry in @lookup contributes no bits.
 */
std::vector<bool> encode(std::string input,
                         std::map<char, std::vector<bool> > &lookup,
                         std::vector<bool> &result) {
    for (char ch : input) {
        // find() rather than operator[]: the table may be shared between
        // threads, and operator[] would insert into it on a miss.
        const auto entry = lookup.find(ch);
        if (entry == lookup.end()) {
            continue;
        }
        const std::vector<bool> &bits = entry->second;
        result.insert(result.end(), bits.begin(), bits.end());
    }
    return result;
}

/*** Look up the next valid code in @biter using @htree and returns the* resulting string. Note the iterator @biter is advanced by the actual* length of the next valid code, which varies.*/

char code_lookup(code_t::iterator &biter, const code_t::iterator &biter_end,

const HuffmanTree *htree) {

const HuffmanTree *node = htree;

while (true) {

if (!node->left)

{ // Huffman tree is full: always contains both children or none. break;


if (biter == biter_end) {

throw std::out_of_range("No more bits");


if (*biter) {

node = node->right;

} else {

node =node->left;




return node->c;


/*** Decodes a compressed string represented by a bit vector (vector)* @compressed, using a HuffmanTree @htree.* Returns the original string.*/

string decode(code_t &compressed, const HuffmanTree *htree)


string result;

code_t::iterator biter = compressed.begin();

code_t::iterator biter_end = compressed.end();

while (true) {

try {

result += code_lookup(biter, biter_end, htree);

} catch (const std::out_of_range &oor) {

// Iterator exhausted. break;



return result;


/*** Tests*/

// Makes a frequency table from a string.
//
// Returns one (character, occurrence count) pair per distinct character of
// @inp, ordered by character (the iteration order of std::map). An empty
// input yields an empty table.
std::vector< std::pair<char, unsigned> > make_freq_table(std::string inp) {
    std::map<char, unsigned> cfmap;
    for (char ch : inp) {
        // operator[] value-initializes a missing count to 0, so a single
        // increment counts the first occurrence correctly.
        ++cfmap[ch];
    }
    return std::vector< std::pair<char, unsigned> >(cfmap.begin(), cfmap.end());
}

// Packs a bit vector into a byte string.
//
// Layout: the first byte holds the number of "hanging bits" (bit count
// modulo 8, at most 7). The following bytes hold the bits, eight per byte,
// least significant bit first. A final partial byte is padded with zero bits.
std::string bitvec_to_string(std::vector<bool> &bitvec) {
    const std::size_t total_bits = bitvec.size();
    const std::size_t nbits = total_bits & 7;

    std::string result;
    result.reserve(1 + (total_bits + 7) / 8);

    // Write the number of hanging bits as the first byte.
    result += static_cast<char>(nbits);

    // Accumulate in an unsigned byte so that setting bit 7 is well defined.
    unsigned char byte = 0;
    for (std::size_t i = 0; i < total_bits; ++i) {
        const unsigned boff = static_cast<unsigned>(i & 7);
        if (bitvec[i]) {
            byte = static_cast<unsigned char>(byte | (1u << boff));
        }
        if (boff == 7) {
            // A full byte has been collected: write it out.
            result += static_cast<char>(byte);
            byte = 0;
        }
    }
    if (nbits) {
        // Flush the final, partially filled byte.
        result += static_cast<char>(byte);
    }

    return result;
}

// Unpacks a byte string into a bit vector.
//
// Expected layout: the first byte is the number of hanging bits (0..7), i.e.
// how many bits of the last payload byte are meaningful; 0 means the last
// byte is fully used. Payload bytes hold eight bits each, least significant
// bit first.
//
// Returns an empty vector for an empty string or a string holding only the
// header byte. Throws std::invalid_argument when the header byte is not in
// 0..7, since such input cannot be unpacked consistently.
std::vector<bool> string_to_bitvec(std::string packed) {
    std::vector<bool> result;

    if (packed.empty()) {
        return result;
    }
    if (packed.size() == 1) {
        // Header only: no payload, so no hanging bits may be announced.
        assert(packed[0] == 0);
        return result;
    }

    const unsigned nbits = static_cast<unsigned char>(packed[0]);
    if (nbits > 7) {
        throw std::invalid_argument("string_to_bitvec: bad hanging-bit count");
    }

    result.reserve((packed.size() - 1) * 8);
    for (std::size_t k = 1; k < packed.size(); ++k) {
        const unsigned char byte = static_cast<unsigned char>(packed[k]);
        for (unsigned i = 0; i < 8; ++i) {
            result.push_back(((byte >> i) & 1u) != 0);
        }
    }

    // Fix the last byte: drop the padding bits beyond the hanging bits.
    if (nbits) {
        result.resize(result.size() - (8 - nbits));
    }

    return result;
}

void encoder(string &s, codetable &ctbl, code_t &t)


// Encode // timeval start, end; // float time_use=0; // gettimeofday(&start, NULL); encode(s, ctbl, t);

// gettimeofday(&end, NULL); // time_use=(end.tv_sec-start.tv_sec)*1000000 +(end.tv_usec-start.tv_usec); // printf("Single thread encode cost: %f秒\n",time_use/1000000); // cout << "encoded (compression ratio: " << 1 - ((float)(t.size() / 8) / s.size()) << ")" << endl;}

void decoder(string &ans, string &c, const HuffmanTree *htree)


// Decode // timeval start1, end1; // float time_use1=0; // gettimeofday(&start1, NULL); // string packed = bitvec_to_string(t); // cout << t.size() << " " << packed.size() << endl; code_t t1 = string_to_bitvec(c);

// cout << t.size() << " " << t1.size() << endl; // assert(std::equal(t.begin(), t.end(), t1.begin())); ans = decode(t1, htree); // 这句有点坑 string ans = decode(t1, htree) 报奇怪内存错误... // gettimeofday(&end1, NULL); // time_use1=(end1.tv_sec-start1.tv_sec)*1000000 +(end1.tv_usec-start1.tv_usec); // printf("Single thread, decode cost: %f秒\n",time_use1/1000000); assert(s1 == s); delete htree; // cout << "string size:" << c.size() << "\tdecompressed size: " << ans.size() << endl;}

// Reads the whole of "compressed.txt" into a string and returns it.
// Returns an empty string when the file cannot be opened or is empty.
std::string readFileIntoString() {
    // Binary mode: the file holds packed bits, not text.
    std::ifstream ifile("compressed.txt", std::ios::in | std::ios::binary);
    std::ostringstream buf;
    buf << ifile.rdbuf();
    return buf.str();
}

int main() {

string filename("y.txt");

string text;

fstream in(filename.c_str());

string s;



s += text;


int n_thread = 12;

int sub_s = s.size() / n_thread;

vector sub_strings;

for (unsigned i=0; i

sub_strings.push_back(s.substr(i*sub_s, sub_s));


// code_t t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10; // 存储编码各子线程结果 vector t_s(n_thread);

// 编解码总计时: timeval start, end;

float time_use=0;

gettimeofday(&start, NULL);

// 构建 dict huffman tree vector< pair > cfvec = make_freq_table(s);

HuffmanTree *htree = build_tree(cfvec); //print_tree(htree); codetable ctbl = build_lookup_table(htree);

// encoder 并发 thread threads[n_thread];

for (unsigned i=0; i

threads[i] = thread(encoder, ref(sub_strings[i]), ref(ctbl), ref(t_s[i]));


for (auto &thread : threads)


// save compressed file ofstream file("compressed.txt");

vector ss_start_index; // 从compressed.txt中截取各个子编码string,每个子模块的起点index存起来... vector ss_lens;

string tmp;

int tmp_size;

int start_index = 0;

// for (unsigned i=0; i

// tmp = bitvec_to_string(t_s[i]); tmp = bitvec_to_string(ti);

tmp_size = tmp.size();



start_index += (tmp_size+1); // 加1是因为每个string后面默认的'\0' file << tmp << endl; // 只能依次写入.. 因为不同sub_thread除以8的余数与总code_t除以8的余数,合起来是有差别的... }

/************ Decode ************/

// to get sub compressed size int all = ss_start_index[n_thread-1] + ss_lens[n_thread-1] + 1;

// load compressed txt file string c; // string 后面的那个'\0'很fan ren c=readFileIntoString();

vector c_s;

vector ress(n_thread);

// 解码计时: timeval start1, end1;

float time_use1=0;

gettimeofday(&start1, NULL);

for (unsigned i=0; i

c_s.push_back(c.substr(ss_start_index[i], ss_lens[i]));


// Decode 并发: thread de_threads[n_thread];

for (unsigned i=0; i

de_threads[i] = thread(decoder, ref(ress[i]), ref(c_s[i]), htree);


for (auto &thread : de_threads)


// concate results // string s_res = res1 + res2 + res3 + res4 + res5 + res6 + res7 + res8 + res9 + res10; // 解码结果 string s_res;

for (auto &res_ : ress)

s_res += res_;

// only decompress time gettimeofday(&end1, NULL);

time_use1=(end1.tv_sec-start1.tv_sec)*1000000 +(end1.tv_usec-start1.tv_usec);

printf("解码耗时: %f秒\n",time_use1/1000000);

// en+decode time gettimeofday(&end, NULL);

time_use=(end.tv_sec-start.tv_sec)*1000000 +(end.tv_usec-start.tv_usec);

printf("编解码总耗时: %f秒\n",time_use/1000000);

cout << "compression ratio: " << 1 - ((float) all / s.size()) << endl;

printf("compreassed file %f kb\n",(float) all / 1024);

// 解码恢复数据 check assert(s_res == s);

delete htree;


