A New Approach for Efficient Sequential Decoding of Static Huffman Codes

Created by Ramses Alexander Coraspe Valdez

Created on May 20, 2020

Dependencies

In [1]:
from collections import Counter
import pandas as pd
import numpy as np
import timeit
import statistics as stats
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import gridspec
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import random
import argparse
from operator import itemgetter
from itertools import combinations, product
from itertools import tee
from itertools import islice
from scipy.stats import entropy

BWT, MTF and Entropy

In [2]:
def entropy_shannon(s, base=None):
    # Shannon entropy of the empirical symbol distribution of s
    value, counts = np.unique(s, return_counts=True)
    return entropy(counts, base = base)

def bw_transform(s):
    # Burrows-Wheeler transform: sort all rotations of s, return the index I
    # of the original string and the last column L of the sorted rotations
    n = len(s)
    m = sorted([s[i:n]+s[0:i] for i in range(n)])
    I = m.index(s)
    L = [q[-1] for q in m]
    return (I, L)


def bw_restore(I, L):
    # Inverse BWT: rebuild the original string from (I, L)
    n = len(L)
    X = sorted([(i, x) for i, x in enumerate(L)], key=itemgetter(1))

    T = [None for i in range(n)]
    for i, y in enumerate(X):
        j, _ = y
        T[j] = i

    Tx = [I]
    for i in range(1, n):
        Tx.append(T[Tx[i-1]])

    S = [L[i] for i in Tx]
    S.reverse()
    return ''.join(S)

common_dictionary = list(range(256))

def encodeMTF(plain_text):
    # Move-to-front: emit the current rank of each byte, then move that
    # byte to the front of the dictionary
    dictionary = common_dictionary.copy()

    compressed_text = list()

    for c in plain_text:
        rank = dictionary.index(int(c))
        compressed_text.append(rank)

        dictionary.pop(rank)
        dictionary.insert(0, int(c))

    return compressed_text

def decodeMTF(compressed_data):
    # Inverse move-to-front: map ranks back to byte values
    dictionary = common_dictionary.copy()
    plain_data = []

    for rank in compressed_data:

        plain_data.append(dictionary[rank])

        e = dictionary.pop(rank)
        dictionary.insert(0, e)

    return plain_data
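
# Quick sanity check (added illustration, not part of the original run):
# BWT -> MTF -> inverse MTF -> inverse BWT reproduces the input.
_text = "banana"
_I, _L = bw_transform(_text)                 # _L: last column, _I: index of the original rotation
_ranks = encodeMTF([ord(c) for c in _L])     # encodeMTF expects integer byte values
_back = [chr(b) for b in decodeMTF(_ranks)]
assert bw_restore(_I, _back) == _text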

HUFFMAN CODING CLASS

In [3]:

class NodeT:

    def __init__(self, frequency, left, right, isLeaf, valueLeaf):
        self.frequency = frequency
        self.left = left
        self.right = right
        self.isLeaf = isLeaf
        self.valueLeaf = valueLeaf

    def getLeft(self):
        return self.left

    def getRight(self):
        return self.right

    def F(self):
        return self.frequency

    def V(self):
        return self.valueLeaf

class Huffman:
  def __init__(self, fileBytes):
    self.fileBytes = fileBytes
    self.huffmancodes = {}
    self.codes = []
    self.compressedFile = None
    self.tablefrequency = {}
    self.realLength = 0
    self.remained=0
    self.base_table = None

  def compress(self):

      self.realLength = len(self.fileBytes)

      # Frequency table of the input bytes
      self.tablefrequency = dict(Counter(self.fileBytes))

      # One leaf node per distinct byte, kept sorted by frequency
      tl = []
      for k in self.tablefrequency.keys():
          n = NodeT(self.tablefrequency.get(k), None, None, True, k)
          tl.append(n)

      tl.sort(key=lambda x: x.frequency)

      # Repeatedly merge the two least frequent nodes into an internal node
      while len(tl) > 1:
          l = tl.pop(0)
          r = tl.pop(0)
          tn = NodeT(l.F() + r.F(), l, r, False, None)
          tl.append(tn)
          tl.sort(key=lambda x: x.frequency)

      # Walk the finished tree to assign a bit string to every leaf
      self._huffmanCodes(tl.pop(0))

      # Encode the input as one concatenated bit string
      cf = []
      for b in self.fileBytes:
          cf.append(self.huffmancodes.get(b))

      self.compressedFile = ''.join(cf)
    
  def _huffmanCodes(self, tl):
      # Depth-first traversal: '0' for left, '1' for right; record the
      # accumulated path when a leaf is reached
      if tl.isLeaf == False:
         l = tl.getLeft()
         r = tl.getRight()
         self.codes.append("0")
         self._huffmanCodes(l)
         self.codes.pop()
         self.codes.append("1")
         self._huffmanCodes(r)
         self.codes.pop()
      else:
         self.huffmancodes[tl.V()] = ''.join(self.codes)
      
    
  def details_compression(self):
      # Percentage saved relative to the raw 8-bits-per-byte representation
      raw = self.realLength * 8
      compressed = len(self.compressedFile)
      percent = (compressed * 100) / raw
      return float(100 - percent)

  def _table(self):
     _byte =  list(self.huffmancodes.keys())
     _symbol = [chr(b) for b in _byte]
     _codigo =  list(self.huffmancodes.values())
     _frequency = [self.tablefrequency[b] for b in _byte]
     _length_code = [len(c) for c in _codigo]
     _length_byte = [len(str(c)) for c in _byte]
     _start_byte = [int(str(c)[:1]) for c in _byte]

     self.base_table = pd.DataFrame(list(zip(_byte, _symbol, _codigo, _length_code, _length_byte, _start_byte, _frequency )), 
               columns =['Byte', 'Symbol', 'Code', 'Length_code', 'Length_Byte', 'Start_Byte', 'Frequency'])     
               
  def info_table(self):
      # Per-code-length summary: how many codes have that length and how
      # many input bytes they cover
      self._table()
      lc_count = dict(Counter(list(self.base_table['Length_code'])))
      result = self.base_table.groupby('Length_code', as_index=False).agg({"Frequency": "sum"})
      _count = [ lc_count[l] for l in result['Length_code']]

      length_code = list(result['Length_code'])
      return pd.DataFrame(list(zip(list(result['Length_code']), _count, list(result['Frequency']) )), index=length_code,
                columns =['Code Length', 'Number of Codes', 'Total Bytes'])
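
# Minimal usage sketch (toy byte list, illustration only): skewed
# frequencies give the most common byte the shortest code.
_toy = [0]*8 + [1]*4 + [2]*2 + [3]
_hf = Huffman(_toy)
_hf.compress()
print(_hf.huffmancodes)                  # the most common byte gets the shortest code
print(_hf.details_compression(), '% saved vs. 8 bits per byte')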

PLOTTING CODE

In [4]:
def plot_lens_by_codes_families(term_freq, hf_code):

    fig, ax = plt.subplots(figsize = (28, 6))

    bars = []
    values= []
    for ix in range(256):
        if ix in term_freq.keys():
            bars.append(ix)
            values.append(term_freq[ix])
    y_pos = np.arange(len(bars))


    barlist = plt.bar(y_pos, values, alpha = 0.7)
    plt.xticks(y_pos, bars, rotation=45)

    sum_values = sum(values)

    lens = []
    for i, v in enumerate(values):
        key = bars[i]
        length = len(hf_code[key])
        l = str(round((v / sum_values) * 100, 1)) + '%'
        if v > 100:
            ax.text((i - 0.6), (v * 1.0125), str(l), color = "black", fontweight = "normal", fontsize = 7)
        barlist[i].set_color(palette[str(length)])


    legend_list = []
    for k, v in palette.items():
        legend_list.append(mpatches.Patch(color = v, label = 'Code length ' + k))

    plt.legend(handles = legend_list)
    plt.ylabel('Frequency')
    plt.xlabel('# Bytes')
    plt.title('Lengths by code family')
    plt.grid(False)
    plt.show()

def plot_fails_trend(_dataf, _datas, title):    
    f_mean = stats.mean(_dataf)
    f_stdev = stats.pstdev(_dataf)
    print(round(f_mean, 4), 'fails/byte with standard deviation:', round(f_stdev, 4))

    max_value = max(_dataf)
    min_value = 0
    upper_lim = min(max_value, (f_mean + f_stdev))
    lower_lim = max(min_value, (f_mean - f_stdev))
    
    fig = plt.figure(figsize = (24, 5))
    plt.plot(_dataf, linewidth=1, marker='.', markersize=5, color= '#f38d8b', label= "fails")    
    plt.plot(_datas,  linewidth=1, marker='.', markersize=5, color= '#20639B', label= "code length")
    plt.grid(alpha=.4)
    plt.legend()
    plt.axhline(y = upper_lim, color = "#000000", linestyle = "--", linewidth=1)
    plt.axhline(y = f_mean, color = "#000000", linestyle = "-", linewidth=1)
    plt.axhline(y = lower_lim, color = "#000000", linestyle = "--", linewidth=1)
    plt.title('Failure behavior during decoding (' + title + ')')
    plt.xlabel('Time', fontsize = 10)
    plt.ylabel('Number of fails / code length found', fontsize = 10)  
    plt.tick_params(axis="x", labelsize=10)
    
    
    plt.show()

def plot_fails_violin(violindf, xname, yname):

    fig, ax = plt.subplots(1, 1, figsize = (15, 8))

    sns.set(style="whitegrid")
    # Keyword arguments avoid the seaborn >= 0.12 positional-argument deprecation
    sns.violinplot(x=xname, y=yname, data=violindf, color='#f8bab9', linewidth=2)
    sns.stripplot(x=xname, y=yname, data=violindf, jitter=True, color='#f38d8b', size=6, marker="D", edgecolor="black", alpha=.25, zorder=1)
    ax.set_xlabel('')
    ax.set_ylabel('# fails', fontsize = 10)
    plt.title('Distribution of the number of fails')
    plt.show()   



def plot_fails_dist(fd1, fd2, fd3, fd4):

    fig = plt.figure(figsize = (24, 12))
    fig.subplots_adjust(hspace = 0.25, wspace = 0.25)
    gs = gridspec.GridSpec(2, 2, width_ratios=[4, 4])

    # Same annotated bar plot for each of the four fail distributions
    for i, fd in enumerate((fd1, fd2, fd3, fd4)):
        x = min(fd['n_fails'])
        ax = plt.subplot(gs[i])
        g = sns.barplot(x = 'n_fails', y = 'quantity', data = fd, color='#6699cc')

        for index, row in fd.iterrows():
            lbl_value = str(round(row.fails_perc, 4)) + ' %'
            g.text(row.n_fails - x, row.quantity * 1.02, lbl_value, color='black', ha="center", fontsize = 8)

        ax.set_xlabel('# Fails', fontsize = 9)
        ax.set_ylabel('Count', fontsize = 9)

    plt.show()      

def table_fails_dist(fails_data):
    # Distribution of per-symbol fail counts as a tidy DataFrame
    fails_dist = Counter(fails_data)
    df_fails_dist = pd.DataFrame.from_records(fails_dist.most_common(), columns = ['n_fails', 'quantity'])
    df_fails_dist["fails_perc"] = 100 * df_fails_dist.quantity / len(fails_data)
    df_fails_dist = df_fails_dist.sort_values(by=['n_fails'])
    return df_fails_dist

def get_accuracy(fails_data, len_des):
    # Accuracy = decoded symbols over total probes, where the denominator is
    # the sum of failed probes plus one probe for each symbol decoded
    # without failing
    accuracy = 0
    total_attempts = sum(fails_data)
    attempts_dict = dict(Counter(fails_data))

    if 0 in attempts_dict.keys():        
        accuracy = round(100 * len_des / (total_attempts + attempts_dict[0]), 4)
    else:
        accuracy = round(100 * len_des / total_attempts, 4)

    return accuracy, total_attempts
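
# Intuition for the metric (toy numbers, not from any run below): three
# symbols decoded with fail counts [0, 2, 1] give 3 failed probes plus
# one zero-fail decode in the denominator -> 100 * 3 / 4 = 75 %.
print(get_accuracy([0, 2, 1], 3))        # -> (75.0, 3)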

PREPROCESSING AND ENTROPY GAIN

In [5]:
files = ['Información en el universo holográfico.txt']



fl = []
longs = []
ent = []
minn = []
gana = []
chunk= []


for f in files:
    print(f)
    with open(f, 'rb') as r:                          
            original= bytearray(r.read())
                                        
    originalVector = [x for x in original]
    print("Entropy before:" , entropy_shannon(originalVector, 2))

    # Chunk size for the blockwise BWT, chosen by input size
    if len(originalVector) >= 1000000:
        chunk_size = 4000
    elif 100000 <= len(originalVector) < 1000000:
        chunk_size = 5000
    elif 10000 <= len(originalVector) < 100000:
        chunk_size = 3000
    else:
        chunk_size = len(originalVector)

    slices_originalVector = [originalVector[i:i + chunk_size] for i in range(0, len(originalVector), chunk_size)]

    bw_ids = []
    newvector = []

    # BWT each chunk, then MTF-encode the concatenated output
    for sc in slices_originalVector:
        i, bw_slice = bw_transform(sc)
        # bw_ids.append(i)
        newvector.extend(bw_slice)

    originalVector = encodeMTF(newvector)

    l = len(originalVector)
    h = entropy_shannon(originalVector, 2)
    minb = (l*h)/8        # theoretical minimum size in bytes (entropy bound)
    gan = (l-minb)/l      # potential gain relative to the original length
    fl.append(f)
    longs.append(l)
    ent.append(h)
    minn.append(minb)
    gana.append(gan)
    chunk.append(chunk_size)


results = pd.DataFrame(list(zip(fl, longs, ent, minn, gana, chunk)),
                        columns=['File',
                                'length',
                                'entropy',
                                'minimum',
                                'gain',
                                 'chunk'])

results
Información en el universo holográfico.txt
Entropy before: 4.475698083847766
Out[5]:
File length entropy minimum gain chunk
0 Información en el universo holográfico.txt 28087 3.493894 12266.623456 0.563263 3000

FILE

In [6]:
file_name = 'Información en el universo holográfico.txt'
with open(file_name, 'rb') as r:                                                           
          original= bytearray(r.read())
                    
originalVector = [x for x in original]

slices_originalVector = [originalVector[i:i + chunk_size] for i in range(0, len(originalVector), chunk_size)]  # chunk_size carried over from the previous cell

bw_ids = []
newvector = []

for  sc in slices_originalVector:
    i, bw_slice = bw_transform(sc)
    #bw_ids.append(i)
    newvector.extend(bw_slice)


originalVector  = encodeMTF(newvector)
In [7]:
term_freq = Counter(originalVector)
n = len(term_freq)

N = sum(term_freq.values())
for term in term_freq:
    term_freq[term] = term_freq[term] / N

df = pd.DataFrame.from_records(term_freq.most_common(n), columns = ['Byte', 'Frequency'])

def get_x_labels(s_v):
    x_labels = []
    for ix in range(len(s_v)):
        if ix % 5 == 0:
            x_labels.append(str(ix))
        else:
            x_labels.append('')
    return x_labels


fig = plt.figure(figsize = (18, 6))
ax = sns.barplot(x = 'Byte', y = 'Frequency', data = df.sort_values(by=['Byte']), palette=("Blues_r"))
plt.xticks(fontsize = 10, rotation = 60)
plt.title('Byte frequencies in ' + file_name )
plt.show()

Algorithms

In [8]:
class MarkovChain(object):
    def __init__(self, transition_matrix, states):
        # states: the possible code lengths; transition_matrix[i][j] is the
        # probability of moving from states[i] to states[j]
        self.transition_matrix = np.atleast_2d(transition_matrix)
        self.states = states
        self.index_dict = {self.states[index]: index for index in 
                           range(len(self.states))}

        self.state_dict = {index: self.states[index] for index in
                           range(len(self.states))}

    def fix_p(self, p):
        # Renormalize a probability row so it sums to 1
        if p.sum() != 1.0:
            p = p*(1./p.sum())
        return p                           
 
    def next_state(self, current_state):
        # Sample the next state from the row of the current state
        return np.random.choice(
         self.states, 
         p=self.fix_p(self.transition_matrix[self.index_dict[current_state], :])
        )

    def generate_states(self, current_state, no=10):
        # Sample a trajectory of `no` future states
        future_states = []
        for i in range(no):
            next_state = self.next_state(current_state)
            future_states.append(next_state)
            current_state = next_state
        return future_states

class lookup_decoding:
    # Baseline decoder: grow a buffer bit by bit and look it up in the
    # inverted Huffman table until a valid code is matched

    def __init__(self, cf, ht):
        self.cf = cf
        self.ht = ht

    def decode(self):
        o_file = []    # decoded symbols
        f_list = []    # failed lookups per decoded symbol
        s_list = []    # matched code length per decoded symbol
        c_size = len(self.cf)        
        buffer = []
        attempts = 0

        for i in range(0, c_size):            
            buffer.append(self.cf[i])
            possible_code = ''.join(buffer)
            if possible_code in self.ht.keys():
                o_file.append(self.ht[possible_code])
                s_list.append(len(possible_code))
                f_list.append(attempts)
                buffer.clear()
                attempts = 0
                continue
            attempts += 1
                
        return o_file, f_list, s_list
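
# Usage sketch (hypothetical two-code table, illustration only):
#   lookup_decoding('0100', {'0': 65, '100': 66}).decode()
#   -> ([65, 66], [0, 2], [1, 3])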

class length_code_decoding:
    # Probe candidate code lengths from lc, slicing the bit string at each
    # length until the slice matches a code in the inverted table

    def __init__(self, cf, ht, lc):
        self.cf = cf
        self.ht = ht
        self.lc = lc

    def decode(self):
        o_file = []
        f_list = []
        s_list = []
        c_size = len(self.cf)     

        index = 0
        while index < c_size:
            attempts = 0
            for sz in self.lc:
                possible_code = self.cf[index: index + sz]
                if possible_code in self.ht.keys():
                    o_file.append(int(self.ht[possible_code])) 
                    index = index + sz
                    f_list.append(attempts)
                    s_list.append(sz)
                    break
                attempts += 1
                
        return o_file, f_list, s_list
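
# Usage sketch (hypothetical, probing lengths in the order [1, 3]):
#   length_code_decoding('0100', {'0': 65, '100': 66}, [1, 3]).decode()
#   -> ([65, 66], [0, 1], [1, 3])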

class information_decoding:
    # Same probing as length_code_decoding, but lc is ordered by how many
    # compressed bytes each code length covers (most first); the commented
    # lines sketch dropping a length once all of its codes are consumed

    def __init__(self, cf, ht, lc):
        self.cf = cf
        self.ht = ht
        self.lc = lc
        # self.count_whole = {}       
        # self.count_whole.update( [(l, t['total_bytes'][l]) for l in t['length_code']])

    def decode(self):
        o_file = []
        f_list = []
        s_list = []
        c_size = len(self.cf)

        index = 0
        while index < c_size:
            attempts = 0
            for sz in self.lc:
                possible_code = self.cf[index: index + sz]
                if possible_code in self.ht.keys():
                    o_file.append(self.ht[possible_code])
                    f_list.append(attempts)
                    s_list.append(sz)
                    index += sz
                    # self.count_whole[sz] -= 1                         
                    # if self.count_whole[sz] == 0:
                    #     self.lc.remove(sz)             
                    break
                attempts += 1
                
        return o_file, f_list, s_list

class MKC_decoding:
    # Predict the next code length with a Markov chain fitted on the
    # sequence of lengths observed during a previous decode

    def __init__(self, cf, ht, lc, mtx):
        self.cf = cf
        self.ht = ht
        self.lc = lc
        self.mtx = mtx

    def decode(self):
        o_file = []
        f_list = []
        s_list = []
        c_size = len(self.cf)

        predict_length = MarkovChain(transition_matrix=self.mtx, states=self.lc)

        # Decode the first symbol by plain length probing to seed the chain
        next_state_length = 0
        attempts = 0
        for sz in self.lc:
            possible_code = self.cf[0: sz]
            if possible_code in self.ht.keys():
                o_file.append(self.ht[possible_code])
                f_list.append(attempts)
                s_list.append(sz)
                next_state_length = sz
                break
            attempts += 1

        # From then on, let the chain propose each next code length
        index = next_state_length
        while index < c_size:
            next_state_length = predict_length.next_state(current_state=next_state_length)
            possible_code = self.cf[index: index + next_state_length]
            if possible_code in self.ht.keys():
                o_file.append(self.ht[possible_code])
                f_list.append(attempts)
                s_list.append(next_state_length)
                index += next_state_length
                attempts = 0
            else:
                attempts += 1

        return o_file, f_list, s_list

# Unfinished sketch of a bidirectional (two-process) decoder, kept for reference:
# class bidireccional_decoding:

#     def __init__(self, cf, ht, lc):
#         self.cf = cf
#         self.ht = ht
#         self.lc = lc
#         o_file = []
#         f_list = []
#         c_size = len(self.cf)

#     def __reverse_slicing(self, s):
#         return s[::-1]

#     def __decoding(self, id_t, c, is_inv):
#         o_file = []
#         f_list = []
#         s_list = []        
#         index = 0

#         if is_inv:

#         else:

#             while index < len(c):
#                 attempts = 0
#                 for sz in self.lc:
#                     possible_code = c[index: index + sz]
#                     if possible_code in self.ht.keys():
#                         o_file.append(self.ht[possible_code])
#                         f_list.append(attempts)
#                         s_list.append(sz)
#                         index += sz         
#                         break
#                     attempts += 1

#         return id_t, dc, remained


#     def decode(self):

#         pool = multiprocessing.Pool( args.numProcessors )

#         tasks = []
#         cs1, cs2 = self.cf[:int(len(self.cf)/2)], self.__reverse_slicing(self.cf[int(len(self.cf)/2):])
#         tasks.append((1, cs1, False))
#         tasks.append((2, cs2, True))
        
#         results = [pool.apply_async(self.__decoding, t) for t in tasks]

#         dict_dc =  {1:[], 2:[]}
#         remained = {1:'', 2:''}
        
#         for r in results:
#             (id, dc, remained) = result.get()
#             dict_dc[id].append(dc)
#             remained[id] = remained

#         o_file=  dict_dc[1] + self.ht[remained[1] + self.__reverse_slicing(remained[2])] + dict_dc[2]


#         pool.close()
#         pool.join()

#         return o_file, f_list, s_list
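
# Quick look at the MarkovChain predictor in isolation (made-up two-state
# transition matrix over code lengths 1 and 3, illustration only):
#   mc = MarkovChain(transition_matrix=[[0.7, 0.3], [0.4, 0.6]], states=[1, 3])
#   mc.next_state(1)           # 1 with probability 0.7, 3 with probability 0.3
#   mc.generate_states(3, 5)   # e.g. [3, 1, 1, 3, 1] (random)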

COMPRESSION

In [9]:
h = Huffman(originalVector)
h.compress()
print("Compression of the new string:", len(h.compressedFile)/8, " of: ", len(originalVector))
print('compressed by {}%'.format(h.details_compression()))
print("Compression ratio: ", len(originalVector) / (len(h.compressedFile)/8))
ht_inv = { v: k for k, v in h.huffmancodes.items()}   # invert the table: code string -> byte

nt = h.info_table()
lengths = list(nt["Code Length"])
nt
Compression of the new string: 12416.875  of:  28087
compressed by 55.79138035390038%
Compression ratio:  2.2620023153973925
Out[9]:
Code Length Number of Codes Total Bytes
1 1 1 11727
3 3 1 3099
4 4 2 3779
5 5 3 3426
6 6 4 2470
7 7 4 1173
8 8 6 906
9 9 5 421
10 10 10 383
11 11 28 484
12 12 10 94
13 13 14 63
14 14 14 34
15 15 28 28
In [10]:
pd.set_option('display.max_rows', None)
df = h.base_table
df = df.sort_values(["Length_code", "Length_Byte", "Start_Byte", "Frequency"], ascending = (True, False, True,False ))
# df.to_csv(r'data/table.csv')
df
Out[10]:
Byte Symbol Code Length_code Length_Byte Start_Byte Frequency
0 0 0 1 1 0 11727
1 1  100 3 1 1 3099
83 2  1101 4 1 2 2164
2 3  1010 4 1 3 1615
102 4  11101 5 1 4 1333
82 5  11001 5 1 5 1141
21 6  10111 5 1 6 952
20 10 \n 101101 6 2 1 466
129 7  111111 6 1 7 789
103 8  111100 6 1 8 668
84 9 \t 111000 6 1 9 547
105 11 1111011 7 2 1 356
104 12 1111010 7 2 1 345
22 13 \r 1100000 7 2 1 236
23 14  1100001 7 2 1 236
128 15  11111011 8 2 1 214
101 16  11100111 8 2 1 170
100 17  11100110 8 2 1 167
49 18  11000110 8 2 1 136
4 21  10110001 8 2 2 110
3 20  10110000 8 2 2 109
119 19  111110011 9 2 1 97
120 22  111110100 9 2 2 98
106 23  111110000 9 2 2 87
88 24  111001001 9 2 2 77
24 25  110001000 9 2 2 62
127 27  1111101011 10 2 2 54
109 26  1111100011 10 2 2 46
99 29 1110010111 10 2 2 44
91 28 1110010101 10 2 2 41
85 31 1110010000 10 2 3 37
79 30 1100011110 10 2 3 36
78 32 1100011101 10 2 3 34
27 33 ! 1100010011 10 2 3 32
19 36 $ 1011001111 10 2 3 30
15 63 ? 1011001101 10 2 6 29
126 34 " 11111010101 11 2 3 26
107 37 % 11111000100 11 2 3 22
108 35 # 11111000101 11 2 3 22
98 39 ' 11100101101 11 2 3 21
26 38 & 11000100101 11 2 3 15
86 42 * 11100100010 11 2 4 19
87 40 ( 11100100011 11 2 4 19
80 46 . 11000111110 11 2 4 18
29 45 - 11000101001 11 2 4 16
25 44 , 11000100100 11 2 4 15
8 41 ) 10110010011 11 2 4 14
6 43 + 10110010001 11 2 4 13
118 52 4 11111001011 11 2 5 25
28 57 9 11000101000 11 2 5 16
30 54 6 11000101010 11 2 5 16
7 56 8 10110010010 11 2 5 14
9 50 2 10110010100 11 2 5 14
10 58 : 10110010101 11 2 5 14
11 55 7 10110010110 11 2 5 14
110 64 @ 11111001000 11 2 6 23
89 66 B 11100101000 11 2 6 20
90 62 > 11100101001 11 2 6 20
81 67 C 11000111111 11 2 6 18
12 65 A 10110010111 11 2 6 14
13 60 < 10110011000 11 2 6 14
14 61 = 10110011001 11 2 6 14
5 68 D 10110010000 11 2 6 13
18 71 G 10110011101 11 2 7 15
111 48 0 111110010010 12 2 4 12
112 47 / 111110010011 12 2 4 12
32 49 1 110001010111 12 2 4 8
125 53 5 111110101001 12 2 5 13
97 51 3 111001011001 12 2 5 11
33 59 ; 110001011000 12 2 5 8
31 70 F 110001010110 12 2 7 8
34 74 J 110001011001 12 2 7 8
16 75 K 101100111000 12 2 7 7
17 76 L 101100111001 12 2 7 7
35 112 p 1100010110100 13 3 1 4
36 118 v 1100010110101 13 3 1 4
37 116 t 1100010110110 13 3 1 4
38 125 } 1100010110111 13 3 1 4
39 127  1100010111000 13 3 1 4
40 114 r 1100010111001 13 3 1 4
41 69 E 1100010111010 13 2 6 4
113 78 N 1111100101000 13 2 7 6
115 72 H 1111100101010 13 2 7 6
96 73 I 1110010110001 13 2 7 5
44 77 M 1100010111101 13 2 7 4
114 80 P 1111100101001 13 2 8 6
42 81 Q 1100010111011 13 2 8 4
43 82 R 1100010111100 13 2 8 4
116 103 g 11111001010110 14 3 1 3
121 121 y 11111010100000 14 3 1 3
122 117 u 11111010100001 14 3 1 3
123 129  11111010100010 14 3 1 3
45 115 s 11000101111100 14 3 1 2
46 179 ³ 11000101111101 14 3 1 2
48 191 ¿ 11000101111111 14 3 1 2
50 120 x 11000111000000 14 3 1 2
51 130 ‚ 11000111000001 14 3 1 2
52 195 Ã 11000111000010 14 3 1 2
53 122 z 11000111000011 14 3 1 2
124 79 O 11111010100011 14 2 7 3
117 84 T 11111001010111 14 2 8 3
47 95 _ 11000101111110 14 2 9 2
54 111 o 110001110001000 15 3 1 1
55 161 ¡ 110001110001001 15 3 1 1
56 109 m 110001110001010 15 3 1 1
58 123 { 110001110001100 15 3 1 1
59 170 ª 110001110001101 15 3 1 1
60 174 ® 110001110001110 15 3 1 1
66 102 f 110001110010100 15 3 1 1
67 152 ˜ 110001110010101 15 3 1 1
70 105 i 110001110011000 15 3 1 1
71 128 € 110001110011001 15 3 1 1
72 124 | 110001110011010 15 3 1 1
74 187 » 110001110011100 15 3 1 1
75 113 q 110001110011101 15 3 1 1
76 108 l 110001110011110 15 3 1 1
77 110 n 110001110011111 15 3 1 1
92 132 „ 111001011000000 15 3 1 1
94 148 ” 111001011000010 15 3 1 1
62 88 X 110001110010000 15 2 8 1
63 83 S 110001110010001 15 2 8 1
73 89 Y 110001110011011 15 2 8 1
93 85 U 111001011000001 15 2 8 1
95 86 V 111001011000011 15 2 8 1
57 93 ] 110001110001011 15 2 9 1
61 90 Z 110001110001111 15 2 9 1
64 98 b 110001110010010 15 2 9 1
65 92 \ 110001110010011 15 2 9 1
68 99 c 110001110010110 15 2 9 1
69 97 a 110001110010111 15 2 9 1
In [11]:
palette = {"1": "#1F77B4", "3": "#ff7f0e", "4": "#2ca02c", "5": "#d62728", "6": "#C65293",
            "7": "#A0ACBD", "8": "#6C7888", "9": "#3C4856", "10": "#3C4856", "11": "#3C4856", "12": "#3C4856", "13": "#3C4856", "14": "#3C4856",
           "15": "#3C4856"}

plot_lens_by_codes_families(h.tablefrequency, h.huffmancodes)           

LOOKUP TABLE

In [12]:
start_time = timeit.default_timer()
ld = lookup_decoding(h.compressedFile, ht_inv)
decompressed, failslookup, successlookup =  ld.decode()

elapsed = timeit.default_timer() - start_time
print("elapsed time: ", elapsed, 's')

accuracy, total_attempts =  get_accuracy(failslookup, len(decompressed))

print("total_attempts: ", total_attempts, "accuracy:", accuracy, '%' )
elapsed time:  0.11994150999998965 s
total_attempts:  71248 accuracy: 33.85 %
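
Since decoding is exact, a round-trip check (not part of the original run) confirms losslessness against the MTF stream:

assert decompressed == originalVector   # the lookup decode reproduces the MTF-encoded vector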

MARKOV CHAIN MODEL

In [13]:
def window(seq, n=2):
    # Sliding window over seq: yields consecutive n-tuples, e.g.
    # list(window([1, 2, 3, 4])) -> [(1, 2), (2, 3), (3, 4)]
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

# Empirical transition matrix over consecutive code lengths observed in the
# lookup-table decode
pairs = pd.DataFrame(window(successlookup), columns=['state1', 'state2'])
counts = pairs.groupby('state1')['state2'].value_counts()
probs = (counts / counts.sum()).unstack()   # globally normalized; each row is rescaled by fix_p at sampling time
probs = probs.fillna(0)
tmx = probs.to_numpy()

p = np.array(tmx) 

lengths.sort(reverse = False)

start_time = timeit.default_timer()
ld = MKC_decoding(h.compressedFile, ht_inv, lengths, p)
decompressed, failsmarkov, successmarkov =  ld.decode()

elapsed = timeit.default_timer() - start_time
print("elapsed time: ", elapsed, 's')

accuracy, total_attempts =  get_accuracy(failsmarkov, len(decompressed))

print("total_attempts: ", total_attempts, "accuracy:", accuracy, '%' )
elapsed time:  17.37315056400007 s
total_attempts:  378541 accuracy: 7.277 %

CODE LENGTH without order

In [14]:
random.shuffle(lengths)

start_time = timeit.default_timer()
ld = length_code_decoding(h.compressedFile, ht_inv, lengths)
decompressed, failsnoorder, successnoorder =  ld.decode()

elapsed = timeit.default_timer() - start_time
print("elapsed time: ", elapsed, 's')

accuracy, total_attempts =  get_accuracy(failsnoorder, len(decompressed))

print("total_attempts: ", total_attempts, "accuracy:", accuracy, '%' )
elapsed time:  0.11112102099991716 s
total_attempts:  231774 accuracy: 12.1133 %

CODE LENGTH with order

In [15]:
lengths.sort(reverse = False)        

start_time = timeit.default_timer()
ld = length_code_decoding(h.compressedFile, ht_inv, lengths)
decompressed, failsorder, successorder =  ld.decode()

elapsed = timeit.default_timer() - start_time
print("elapsed time: ", elapsed, 's')

accuracy, total_attempts =  get_accuracy(failsorder, len(decompressed))

print("total_attempts: ", total_attempts, "accuracy:", accuracy, '%' )
elapsed time:  0.04226169599996865 s
total_attempts:  54888 accuracy: 42.1632 %

Order based on COMPRESSED BYTES

In [16]:
nt = nt.sort_values(by=['Total Bytes'], ascending=False)

start_time = timeit.default_timer()
ld = information_decoding(h.compressedFile, ht_inv, list(nt["Code Length"]))
decompressed, failscompressed, successcompressed =  ld.decode()

elapsed = timeit.default_timer() - start_time
print("elapsed time: ", elapsed, 's')

accuracy, total_attempts =  get_accuracy(failscompressed, len(decompressed))

print("total_attempts: ", total_attempts, "accuracy:", accuracy, '%' )
nt
elapsed time:  0.05727922899995974 s
total_attempts:  53717 accuracy: 42.9176 %
Out[16]:
Code Length Number of Codes Total Bytes
1 1 1 11727
4 4 2 3779
5 5 3 3426
3 3 1 3099
6 6 4 2470
7 7 4 1173
8 8 6 906
11 11 28 484
9 9 5 421
10 10 10 383
12 12 10 94
13 13 14 63
14 14 14 34
15 15 28 28

FAILURE BEHAVIOR

In [17]:
plot_fails_trend(failslookup[0:2000], successlookup[0:2000], 'Lookup table')
plot_fails_trend(failsnoorder[0:2000], successnoorder[0:2000], 'Code length without order')
plot_fails_trend(failsorder[0:2000], successorder[0:2000], 'Code length with order')
plot_fails_trend(failscompressed[0:2000], successcompressed[0:2000], 'Compressed bytes')
2.9375 fails/byte with standard deviation: 2.8916
8.56 fails/byte with standard deviation: 2.756
2.2675 fails/byte with standard deviation: 2.5776
2.1905 fails/byte with standard deviation: 2.5798
In [18]:
violindf = pd.DataFrame(list(zip(failslookup, failsnoorder, failsorder, failscompressed)),
                            columns=['Lookup table', 'Code length without order',
                                    'Code length with order', 'Compressed bytes'])

violindf = violindf.melt(var_name='algorithm', value_name='fails')

plot_fails_violin(violindf, 'algorithm', 'fails')
In [19]:
table_distlookup = table_fails_dist(failslookup)
table_distnoorder = table_fails_dist(failsnoorder)
table_distorder = table_fails_dist(failsorder)
table_distcompressed = table_fails_dist(failscompressed)
In [20]:
plot_fails_dist(table_distlookup, table_distnoorder, table_distorder,  table_distcompressed)