2021-04-18

HuffmanCoding

basic idea behind Huffman coding: to use fewer bits for characters that occur more frequently.

The output from Huffman’s algorithm can be viewed as a variable-length code table for encoding a source symbol (such as a character in a file). The algorithm derives this table from the estimated probability or frequency of occurrence (weight) for each possible value of the source symbol.

Terminology

Huffman coding uses a specific method for choosing the representation for each symbol, resulting in a prefix code (sometimes called “prefix-free codes”, that is, the bit string representing some particular symbol is never a prefix of the bit string representing any other symbol)

Problem definition

Given

A set of symbols and their weights (usually proportional to probabilities).

Find

A prefix-free binary code (a set of codewords) with minimum expected codeword length (equivalently, a tree with minimum weighted path length from the root).

Process

Two type of nodes

Leaf nodes

typedef struct LeafNode{
    Type symbol ; // the type of the thing you want to compress
    double weight ; // usually is the frequency of apparence
    struct LeafNode * parent ;
}LeafNode ;

Internal nodes

typedef struct InternalNode{
    LeafNode * Lchild ; 
    LeafNode * Rchild ;
    double weight ; 
    struct InternalNode * parent ; //this is optional
}InternalNode;

The process begins with the leaf nodes containing the probabilities of the symbol they represent.
Then, the process takes the two nodes with smallest probability, and creates a new internal node having these two nodes as children. The weight of the new node is set to the sum of the weight of the children.
We then apply the process again, on the new internal node and on the remaining nodes, we repeat this process until only one node remains, which is the root of the Huffman tree.
we’re building a single tree from a group (or forest) of trees. Initially, all the trees have a single node containing a character and the character’s weight.
A visualization of Huffman coding

Decompression

Generally speaking, the process of decompression is simply a matter of translating the stream of prefix codes to individual byte values, usually by traversing the Huffman tree node by node as each bit is read from the input stream (reaching a leaf node necessarily terminates the search for that particular byte value).

C implementation

#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#define Type unsigned char
#define heaplength 10000
#define char_kind 256
#define N 256
#define left(i) ((i)*2+1)
#define right(i) ((i)*2+2)
#define parent(i) (((i)-1)/2)
#define smaller(a,b) ((h->start[a].w) <= (h->start[b].w) ? (a):(b) )
#define smallest(a,b,c) smaller(smaller(a,b) ,(c))
#define isLeaf(root) ((root)->left == NULL && (root)->right==NULL)
unsigned long long  alphabet [char_kind] ;
char huffs [N][N] ;
void count_frequency(const char * name);
typedef struct hfmTNode{
    Type element ;
    unsigned long long w ;
    struct hfmTNode * left ;
    struct hfmTNode * right ;
}Node ;

void swap(Node * a , Node * b ){
    Node  temp  = *a ;
    *a = *b;
    *b = temp;
}

Node* newnode(Type data , unsigned long long weight ){
    Node* res = (Node*)malloc(sizeof(Node));
    res->element = data;
    res->w = weight;
    res->left = res->right = NULL;
    return res ;
}

typedef struct heap{
    size_t maxlength;
    size_t size;
    Node* start;
}Heap;

void initialize(Heap * h){
    h->maxlength = heaplength ;
    h->start = (Node*)malloc(sizeof(Node) * h->maxlength);
    h->size = 0;
}
void destroy(Heap * h){ free(h->start); }

void percolateup(Heap * h , size_t position){
    if (position >= h->size || position < 0 ) return ;
    while(parent(position)>=0 ){
        if(h->start[parent(position)].w <= h->start[position].w) break;
        swap(&h->start[parent(position)] , &h->start[position]);
        position = parent(position);
    }
}

void percolatedown(Heap * h , size_t position){
    if (position >= h->size || position < 0 ) return ;
    while(left(position) < h->size ){
        size_t min ;
        if (right(position) < h->size){
        min = smallest(position , left(position) , right(position));
        }
        else{ min = smaller(position , left(position));
        } if (min == position) break;
        swap(&h->start[position] , &h->start[min]);
        position = min;
    }
}

void heap_append(Heap* h  , Node * node){
    h->start[h->size++] = *node;
    percolateup(h , h->size - 1 );
}

Node* heap_pop(Heap * h ){
    Node * res = newnode(h->start[0].element , h->start[0].w);
    res->left = h->start[0].left;
    res->right = h->start[0].right;
    swap(&h->start[0] , &h->start[--(h->size)]);
    percolatedown(h , 0);
    return res;
}
void createHuffmanTree(Heap * h){
    while(h->size != 1){
        Node * l = heap_pop(h);
        Node * r = heap_pop(h);
        Node * new = newnode('#' , l->w + r->w );
        new->left = l;
        new->right = r;
        heap_append(h , new);
        free(new);
        //I could have been more elegant here, I should use pointer-to-pointer more instead of arguments copying
    }
}

void write2huff(char * bf , size_t length , unsigned char destination){
    for(int i = 0 ;  i < length ; i++){
        if (bf[i]=='1') strcat(huffs[destination], "1");
        else if (bf[i]=='0') strcat(huffs[destination], "0");
        else printf("[error!]");
    }
}

void encode(Node * root ,  char * buffer , size_t length ){
    if (root == NULL) return;
    if (isLeaf(root)){
        //save huffman code into array
        write2huff(buffer, length, (unsigned char)root->element);
        return;
    }
    buffer[length++] = '0';
    encode(root->left , buffer , length );
    length--; //back one step, in each node, you can only move to one direction per time
    buffer[length++] = '1';
    encode(root->right , buffer , length );
}

Heap *  HuffmanEncode(const char * filename){
    count_frequency(filename);
    //creat Huffman heap
    Heap * min_heap  = (Heap*)malloc(sizeof(Heap));
    initialize(min_heap);
    for (int  i = 0 ; i < char_kind ; i++){
        if (alphabet[i]){
            Node * new = newnode((unsigned char)i , alphabet[i]);
            heap_append(min_heap , new);
            free(new);
        }
    }
    createHuffmanTree(min_heap);

    //Huffman Encode
    char buffer [N];for (int i = 0;i < N ; i++) buffer[i] = 0; //clear buffer
    size_t length = 0;
    encode(&min_heap->start[0] , buffer , length);

    //return Huffman heap
    return min_heap;
}

void decode(Node* root , char * signals , size_t * length ,  FILE * binaryfile){
    if (isLeaf(root)){
        fputc(root->element, binaryfile);
        return ;
        }
    char choice = signals[(*length)++];
    switch(choice){
        case '0' :{
            decode(root->left , signals , length , binaryfile);
            break;
        }
        case '1':{
            decode(root->right , signals , length , binaryfile);
            break;
        }
        default:
            printf("invalid code\n");
    }
}

void count_frequency(const char * name){
    FILE * file = fopen(name , "r");
    if (file == NULL){
        printf("file not found\n");
        exit(-1);
    }
    char c = fgetc(file);
    while(c!=EOF){
        alphabet[(unsigned char)c]++;
        c = fgetc(file);
    }
    fclose(file);
}
//----------------------------my bytes loop queue-----------------------------
typedef struct ByteQ{
    char * head ;
    int maxsize ;
    int size ;
    int front ;
    int rear ;
}ByteQ;

void initial(ByteQ* bq , int msize){
    bq->head = (char*)malloc(msize * sizeof(char));
    bq->maxsize = msize;
    bq->size = 0;
    bq->front = bq->rear = 0;
}

//user should check overflow himself
void equeue(char * bitstring , ByteQ * bq){
    unsigned long space = strlen(bitstring);
    for (int i = 0; i < space ; i++){
        bq->head[((bq->rear)++)%bq->maxsize]=bitstring[i];
        bq->size++;
    }
}

//user should check empty himself
char dqueue(ByteQ * bq){
    char res = 0;
    for (int i = 0 ; i < 8 ; i++){
        char c = bq->head[(bq->front)++ % bq->maxsize];
        bq->size--;
        if (c == '1'){
            res = (res << 1) + 1 ;
        }
        else if (c == '0') {
            res <<= 1;
        }
        else{
            printf("error happens when dqueue\n");
        }
    }
    return  res;
}

void get_string(const ByteQ * bq , int lengthOfstring , char * res ){
    for (int i = 0 ; i < lengthOfstring ; i++){
        res[i] = bq->head[(bq->front + i)%bq->maxsize];
    }
}

void pop_string(ByteQ * bq , size_t lengthOfstring){
    bq->front = (bq->front + lengthOfstring) % bq->maxsize;
    bq->size -= lengthOfstring;
}

//----------------------------my bytes loop queue-----------------------------

void compress2bin(char * file2bcompressed , char * binaryfile){
    FILE * file = fopen(file2bcompressed, "rt");
    FILE * bin = fopen(binaryfile, "wb");
    ByteQ * bq = (ByteQ*)malloc(sizeof(ByteQ));
    initial(bq, N);
    char c = fgetc(file);
    while (c!=EOF) {
        equeue(huffs[(unsigned char)c], bq);
        while(bq->size >= 8){
            char byte = dqueue(bq);
            fputc((unsigned char)byte, bin);
        }
        c = fgetc(file);
    }
    while (bq->size % 8 != 0) {
        equeue("0", bq);
    }
    while (bq->size > 0) {
        fputc((unsigned char)dqueue(bq), bin);
    }
    fclose(file);
    fclose(bin);
}

void compress2txt( char * originfile , char * text_file_name ){
    FILE * tobcompressed = fopen(originfile , "rt");
    FILE * text = fopen(text_file_name , "wt");
    char c = fgetc(tobcompressed);
    while (c != EOF) {
        fprintf(text, "%s" , huffs[(unsigned char)c]);
        c = fgetc(tobcompressed);
    }
    fclose(tobcompressed);
    fclose(text);
}

void char2str( unsigned char number  , char * bitstring){
    for (int i = 7 ; i >= 0 ; i--){
        bitstring[i] = (char)( (number &  0b00000001) + 48);
        number >>= 1 ;
    }
}

void decompress(char * file2bdecomp , char * output , Heap * huffheap){
    FILE * file = fopen(file2bdecomp, "rb");
    FILE * out = fopen(output, "wt");
    size_t  length = 0;
    ByteQ * bq = (ByteQ*)malloc(sizeof(ByteQ));
    initial(bq, N);
    char eight [9];eight[8] = '\0';
    char c = fgetc(file);
    char signals [24] ;
    while (!feof(file)) {
        for (int i = 0 ; i < 3 ; i++){
            char2str((unsigned char)(c) , eight);
            if (bq->size <= bq->maxsize - 8){
                equeue(eight, bq);
            }
            else continue;
            c = fgetc(file);
            if (feof(file)) break;
        }
        get_string(bq , 24, signals);
        decode(&huffheap->start[0], signals, &length , out);
        pop_string(bq, length);
            length=0;
    }
    length = 0;
    // because the shortest huffman code has lenght of 3
    while (bq->size > 3) {
        get_string(bq , 24, signals);
        decode(&huffheap->start[0], signals, &length , out);
        pop_string(bq, length);
            length=0;
    }
    fclose(file);
    fclose(out);
}

void writeSymbol(char * symbolfilename){
    FILE * symbol = fopen(symbolfilename , "wt");
    for (int i = 0 ; i < N ; i++){
        if (huffs[i] == NULL) continue;
        if ((unsigned char )i == 0xa) fprintf(symbol ,  "%u:\\n:%s\n" , (unsigned char )i , huffs[i] );
        else fprintf( symbol , "%u:%c:%s\n" , (unsigned char )i, (unsigned char )i , huffs[i] );
    }
    fclose(symbol);
}


int main(int argc , char * argv []){
    char * f = "pride.txt";             //file to be compressed
    char * textname="ttttext.txt";      //bitstring file
    char * symbolname = "ssssymbols.txt";//huffman code for all symbols
    char * binname = "bbbbinary.dat";   //compressed binary file
    char * output = "ooooutput.txt";    //decompressed text file

    Heap * huffheap = HuffmanEncode(f);
    writeSymbol(symbolname);
     printf("symbols created.\n");
    compress2bin(f, binname);
    printf("successfully compress to binary !!!\n");
    compress2txt(f, textname);
    printf("successfully compress to bitstring !!!\n");
    decompress(binname, output, huffheap);
    printf("successfully decompress binary file !!!\n");

    destroy(huffheap);
    return 0;
}

In case of there are some non-ascii characters in the file to be compressed, In order to do a lossless decompression, I arbitrarily read the file byte by byte, though this may spoil the frequency of appearance.

$ cat test.txt
a
Í
$ hexdump -C < test.txt > testdump  && cat testdump
00000000  61 0a c3 8d 0a                              |a....|

1 2	>>> 'Í'.encode('utf8') b'\xc3\x8d'

#include <stdio.h>
int main(){
	FILE * file = fopen("test.txt" , "rt");
	FILE * result = fopen("result.txt" , "wt");
	char c = fgetc(file);
	while(c != EOF){
		fputc(c , result);
		c = fgetc(file);
	}
	return 0;
}

$ cat result.txt
a
Í
$ diff test.txt result.txt
$

char is a signed byte, this may lead to some troubles, so I convert it to unsigned char to represent 0-255

#include <stdio.h>
int main(){
	FILE * file = fopen("test.txt" , "rt");
	FILE * result = fopen("result.txt" , "wt");
	char c = fgetc(file);
	while(c != EOF){
		fputc(c , result);
		printf("signed : %d\tunsigned :%u\n" , c ,(unsigned char)c);
		c = fgetc(file);
	}
	return 0;
}

$ gcc frw.c -o frw && ./frw
signed : 97	unsigned :97
signed : 10	unsigned :10
signed : -61	unsigned :195
signed : -115	unsigned :141
signed : 10	unsigned :10

use bitwise operation to convert between byte and bitstring

void char2str( unsigned char number  , char * bitstring){
    for (int i = 7 ; i >= 0 ; i--){
        bitstring[i] = (char)( (number &  0b00000001) + 48);
        number >>= 1 ;
    }
}
char dqueue(ByteQ * bq){
    char res = 0;
    for (int i = 0 ; i < 8 ; i++){
        char c = bq->head[(bq->front)++ % bq->maxsize];
        bq->size--;
        if (c == '1'){
            res = (res << 1) + 1 ;
        }
        else if (c == '0') {
            res <<= 1;
        }
        else{
            printf("error happens when dqueue\n");
        }
    }
    return  res;
}

In case of I need to decompress a very large file, I implement a customized queue to mimic a stream parser
The program seems to work nicely on my MacOS, but on Windows it will encounter a wired memory access error, during mallocing memory, a pointer will get a very small address, like 0x5d, which will lead to a permission denied later, I don’t know why.