from collections import Counter
import pandas as pd
import numpy as np
import timeit
import statistics as stats
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import gridspec
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import random
import argparse
from operator import itemgetter
from itertools import combinations, product
from itertools import tee
from itertools import islice
from scipy.stats import entropy
def entropy_shannon(s, base=None):
    _, counts = np.unique(s, return_counts=True)
    return entropy(counts, base=base)
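# Illustrative check (not part of the original pipeline): a uniform source over
# four distinct symbols should measure exactly 2 bits/symbol.
assert abs(entropy_shannon([0, 1, 2, 3], base=2) - 2.0) < 1e-9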
def bw_transform(s):
n = len(s)
m = sorted([s[i:n]+s[0:i] for i in range(n)])
I = m.index(s)
L = [q[-1] for q in m]
return (I, L)
def bw_restore(I, L):
n = len(L)
X = sorted([(i, x) for i, x in enumerate(L)], key=itemgetter(1))
T = [None for i in range(n)]
for i, y in enumerate(X):
j, _ = y
T[j] = i
Tx = [I]
for i in range(1, n):
Tx.append(T[Tx[i-1]])
S = [L[i] for i in Tx]
S.reverse()
return ''.join(S)
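# Illustrative round-trip check (not part of the original pipeline): bw_restore
# should invert bw_transform for any sequence; a short string is used here.
_bwt_demo = 'banana'
_i_demo, _L_demo = bw_transform(_bwt_demo)
assert bw_restore(_i_demo, _L_demo) == _bwt_demo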
common_dictionary = list(range(256))
def encodeMTF(plain_text):
    dictionary = common_dictionary.copy()
    compressed_text = []
    for c in plain_text:
        rank = dictionary.index(int(c))
        compressed_text.append(rank)
        dictionary.pop(rank)
        dictionary.insert(0, int(c))
    return compressed_text
def decodeMTF(compressed_data):
    dictionary = common_dictionary.copy()
    plain_data = []
    for rank in compressed_data:
        plain_data.append(dictionary[rank])
        e = dictionary.pop(rank)
        dictionary.insert(0, e)
    return plain_data
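# Illustrative round-trip check (not part of the original pipeline): MTF decoding
# should recover the original byte values for any input in the range 0-255.
_mtf_demo = [10, 10, 10, 42, 42, 10]
assert decodeMTF(encodeMTF(_mtf_demo)) == _mtf_demo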
class NodeT:
def __init__(self, frequency, left, right, isLeaf, valueLeaf):
self.frequency = frequency
self.left = left
self.right = right
self.isLeaf = isLeaf
self.valueLeaf = valueLeaf
def getLeft(self):
return self.left
def getRight(self):
return self.right
def F(self):
return self.frequency
def V(self):
return self.valueLeaf
class Huffman:
def __init__(self, fileBytes):
self.fileBytes = fileBytes
self.huffmancodes = {}
self.codes = []
self.compressedFile = None
self.tablefrequency = {}
self.realLength = 0
self.remained=0
self.base_table = None
def compress(self):
self.realLength = len(self.fileBytes)
c = dict(Counter(self.fileBytes))
for key, value in c.items():
self.tablefrequency[key] = value
tl = []
lk = self.tablefrequency.keys()
for k in lk:
n = NodeT(self.tablefrequency.get(k),None, None, True, k)
tl.append(n)
tl.sort(key=lambda x: x.frequency)
while len(tl)> 1:
l = tl.pop(0)
r = tl.pop(0)
tn = NodeT(l.F() + r.F() ,l, r, False, None)
tl.append(tn)
tl.sort(key=lambda x: x.frequency)
self._huffmanCodes(tl.pop(0))
cf = []
for b in self.fileBytes:
cf.append(self.huffmancodes.get(b))
self.compressedFile = ''.join(cf)
#Returning the huffman table, new bytes compressed, remaining in bits
#return self.huffmancodes, ''.join(cf), remained
    def _huffmanCodes(self, tl):
        if not tl.isLeaf:
            l = tl.getLeft()
            r = tl.getRight()
            self.codes.append("0")
            self._huffmanCodes(l)
self.codes.pop()
self.codes.append("1")
self._huffmanCodes(r)
self.codes.pop()
else:
self.huffmancodes[tl.V()] = ''.join(self.codes)
    def details_compression(self):
        raw = self.realLength * 8
        compressed = len(self.compressedFile)
        percent = (compressed * 100) / raw
        return float(100 - percent)
def _table(self):
_byte = list(self.huffmancodes.keys())
_symbol = [chr(b) for b in _byte]
_codigo = list(self.huffmancodes.values())
_frequency = [self.tablefrequency[b] for b in _byte]
_length_code = [len(c) for c in _codigo]
_length_byte = [len(str(c)) for c in _byte]
_start_byte = [int(str(c)[:1]) for c in _byte]
self.base_table = pd.DataFrame(list(zip(_byte, _symbol, _codigo, _length_code, _length_byte, _start_byte, _frequency )),
columns =['Byte', 'Symbol', 'Code', 'Length_code', 'Length_Byte', 'Start_Byte', 'Frequency'])
def info_table(self):
self._table()
lc_count = dict(Counter(list(self.base_table['Length_code'])))
result = self.base_table.groupby('Length_code', as_index=False).agg({"Frequency": "sum"})
_count = [ lc_count[l] for l in result['Length_code']]
length_code = list(result['Length_code'])
return pd.DataFrame(list(zip(list(result['Length_code']), _count, list(result['Frequency']) )), index=length_code,
columns =['Longitud Código', 'Numero de Códigos', 'Total Bytes'])
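# Illustrative end-to-end use of the Huffman class on a toy byte sequence
# (hypothetical data, independent of the file processed below):
_toy_huff = Huffman([1, 1, 1, 1, 2, 2, 3])
_toy_huff.compress()
# _toy_huff.huffmancodes now maps each byte value to its prefix code, and
# _toy_huff.details_compression() gives the percentage saved versus 8 bits/byte.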
# NOTE: this helper assumes a module-level `palette` dict (defined elsewhere in
# the notebook) that maps each code length, as a string, to a colour.
def plot_lens_by_codes_families(term_freq, hf_code):
fig, ax = plt.subplots(figsize = (28, 6))
bars = []
values= []
for ix in range(256):
if ix in term_freq.keys():
bars.append(ix)
values.append(term_freq[ix])
y_pos = np.arange(len(bars))
barlist = plt.bar(y_pos, values, alpha = 0.7)
plt.xticks(y_pos, bars, rotation=45)
sum_values = sum(values)
lens = []
for i, v in enumerate(values):
key = bars[i]
length = len(hf_code[key])
l = str(round((v / sum_values) * 100, 1)) + '%'
if v > 100:
ax.text((i - 0.6), (v * 1.0125), str(l), color = "black", fontweight = "normal", fontsize = 7)
barlist[i].set_color(palette[str(length)])
legend_list = []
    for k, v in palette.items():
        legend_list.append(mpatches.Patch(color=v, label='Code length ' + k))
    plt.legend(handles=legend_list)
    plt.ylabel('Frequency')
    plt.xlabel('# Bytes')
    plt.title('Lengths by code family')
plt.grid(False)
plt.show()
def plot_fails_trend(_dataf, _datas, title):
f_mean = stats.mean(_dataf)
f_stdev = stats.pstdev(_dataf)
    print(round(f_mean, 4), 'fails/byte with standard deviation:', round(f_stdev, 4))
max_value = max(_dataf)
min_value = 0
upper_lim = min(max_value, (f_mean + f_stdev))
lower_lim = max(min_value, (f_mean - f_stdev))
fig = plt.figure(figsize = (24, 5))
    plt.plot(_dataf, linewidth=1, marker='.', markersize=5, color='#f38d8b', label="fails")
    plt.plot(_datas, linewidth=1, marker='.', markersize=5, color='#20639B', label="code length")
plt.grid(alpha=.4)
plt.legend()
plt.axhline(y = upper_lim, color = "#000000", linestyle = "--", linewidth=1)
plt.axhline(y = f_mean, color = "#000000", linestyle = "-", linewidth=1)
plt.axhline(y = lower_lim, color = "#000000", linestyle = "--", linewidth=1)
    plt.title('Behaviour of the fails during decoding (' + title + ')')
    plt.xlabel('# Time', fontsize=10)
    plt.ylabel('Number of fails / code length found', fontsize=10)
plt.tick_params(axis="x", labelsize=10)
plt.show()
def plot_fails_violin(violindf, xname, yname):
fig, ax = plt.subplots(1, 1, figsize = (15, 8))
sns.set(style="whitegrid")
    sns.violinplot(x=xname, y=yname, data=violindf, color='#f8bab9', linewidth=2)
    sns.stripplot(x=xname, y=yname, data=violindf, jitter=True, color='#f38d8b', size=6, marker="D", edgecolor="black", alpha=.25, zorder=1)
    ax.set_xlabel('')
    ax.set_ylabel('# fails', fontsize=10)
    plt.title('Distribution of the number of fails')
    plt.show()
def plot_fails_dist(fd1, fd2, fd3, fd4):
    fig = plt.figure(figsize=(24, 12))
    fig.subplots_adjust(hspace=0.25, wspace=0.25)
    gs = gridspec.GridSpec(2, 2, width_ratios=[4, 4])
    # The four distributions share the same layout, so draw them in a loop.
    for pos, fd in enumerate([fd1, fd2, fd3, fd4]):
        x = min(fd['n_fails'])
        ax = plt.subplot(gs[pos])
        g = sns.barplot(x='n_fails', y='quantity', data=fd, color='#6699cc')
        for index, row in fd.iterrows():
            lbl_value = str(round(row.fails_perc, 4)) + ' %'
            g.text(row.n_fails - x, row.quantity * 1.02, lbl_value, color='black', ha="center", fontsize=8)
        ax.set_xlabel('# Fails', fontsize=9)
        ax.set_ylabel('Count', fontsize=9)
    plt.show()
def table_fails_dist(fails_data):
fails_dist = Counter(fails_data)
df_fails_dist = pd.DataFrame.from_records(fails_dist.most_common(), columns = ['n_fails', 'quantity'])
df_fails_dist["fails_perc"] = 100 * df_fails_dist.quantity / len(fails_data)
df_fails_dist = df_fails_dist.sort_values(by=['n_fails'])
return df_fails_dist
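# Illustrative example (not part of the original pipeline): four decoded symbols
# with 0, 0, 1 and 3 fails give one row per distinct fail count with its share.
_demo_fails = [0, 0, 1, 3]
_demo_fails_dist = table_fails_dist(_demo_fails)  # rows: (0, 2, 50%), (1, 1, 25%), (3, 1, 25%)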
def get_accuracy(fails_data, len_des):
accuracy = 0
total_attempts = sum(fails_data)
attempts_dict = dict(Counter(fails_data))
if 0 in attempts_dict.keys():
accuracy = round(100 * len_des / (total_attempts + attempts_dict[0]), 4)
else:
accuracy = round(100*len_des / total_attempts,4)
return accuracy, total_attempts
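# Illustrative call on the same toy data (hypothetical numbers): with 4 decoded
# symbols, get_accuracy returns its accuracy figure and the total failed attempts.
_demo_acc, _demo_attempts = get_accuracy(_demo_fails, 4)  # (66.6667, 4)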
files = ['Información en el universo holográfico.txt']
fl = []
longs = []
ent = []
minn = []
gana = []
chunk= []
for f in files:
print(f)
with open(f, 'rb') as r:
original= bytearray(r.read())
originalVector = [x for x in original]
print("Entropy before:" , entropy_shannon(originalVector, 2))
if (len(originalVector) >= 1000000):
chunk_size = 4000
elif (len(originalVector) < 1000000 and len(originalVector) >= 100000):
chunk_size = 5000
elif (len(originalVector) < 100000 and len(originalVector) >= 10000):
chunk_size = 3000
else:
chunk_size = len(originalVector)
slices_originalVector = [originalVector[i:i + chunk_size] for i in range(0, len(originalVector), chunk_size)]
bw_ids = []
newvector = []
for sc in slices_originalVector:
i, bw_slice = bw_transform(sc)
# bw_ids.append(i)
newvector.extend(bw_slice)
originalVector = encodeMTF(newvector)
l = len(originalVector)
h = entropy_shannon(originalVector, 2)
minb = (l*h)/8
gan = (l-minb)/l
fl.append(f)
longs.append(l)
ent.append(h)
minn.append(minb)
gana.append(gan)
chunk.append(chunk_size)
results = pd.DataFrame(list(zip(fl, longs, ent, minn,gana, chunk )),
columns=['File',
'longitud',
'entropia',
'minimo',
'ganancia',
'chunk'])
results
file_name = 'Información en el universo holográfico.txt'
with open(file_name, 'rb') as r:
original= bytearray(r.read())
originalVector = [x for x in original]
slices_originalVector = [originalVector[i:i + chunk_size] for i in range(0, len(originalVector), chunk_size)]
bw_ids = []
newvector = []
for sc in slices_originalVector:
i, bw_slice = bw_transform(sc)
#bw_ids.append(i)
newvector.extend(bw_slice)
originalVector = encodeMTF(newvector)
term_freq = Counter(originalVector)
n = len(term_freq)
N = sum(term_freq.values())
for term in term_freq:
term_freq[term] = term_freq[term] / N
df = pd.DataFrame.from_records(term_freq.most_common(n), columns = ['Byte', 'Frecuencia'])
def get_x_labels(s_v):
x_labels = []
for ix in range(len(s_v)):
if ix % 5 == 0:
x_labels.append(str(ix))
else:
x_labels.append('')
return x_labels
fig = plt.figure(figsize = (18, 6))
ax = sns.barplot(x = 'Byte', y = 'Frecuencia', data = df.sort_values(by=['Byte']), palette=("Blues_r"))
plt.xticks(fontsize = 10, rotation = 60)
plt.title('Byte frequency in ' + file_name)
plt.show()
class MarkovChain(object):
def __init__(self, transition_matrix, states):
self.transition_matrix = np.atleast_2d(transition_matrix)
self.states = states
self.index_dict = {self.states[index]: index for index in
range(len(self.states))}
self.state_dict = {index: self.states[index] for index in
range(len(self.states))}
def fix_p(self, p):
if p.sum() != 1.0:
p = p*(1./p.sum())
return p
def next_state(self, current_state):
return np.random.choice(
self.states,
p=self.fix_p(self.transition_matrix[self.index_dict[current_state], :])
)
def generate_states(self, current_state, no=10):
future_states = []
for i in range(no):
next_state = self.next_state(current_state)
future_states.append(next_state)
current_state = next_state
return future_states
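# Minimal illustrative use of MarkovChain (hypothetical numbers): two code
# lengths, 4 and 5 bits, where each length tends to be followed by itself.
_demo_chain = MarkovChain(transition_matrix=[[0.7, 0.3], [0.4, 0.6]], states=[4, 5])
_demo_chain.generate_states(current_state=4, no=5)  # e.g. [4, 4, 5, 5, 5]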
class lookup_decoding:
def __init__(self, cf, ht):
self.cf = cf
self.ht = ht
def decode(self):
o_file = []
f_list = []
s_list = []
c_size = len(self.cf)
buffer = []
attempts = 0
for i in range(0, c_size):
buffer.append(self.cf[i])
possible_code = ''.join(buffer)
if possible_code in self.ht.keys():
o_file.append(self.ht[possible_code])
s_list.append(len(possible_code))
f_list.append(attempts)
buffer.clear()
attempts = 0
continue
attempts += 1
return o_file, f_list, s_list
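# Illustrative use of lookup_decoding with a tiny hand-built prefix code
# (hypothetical table, unrelated to the Huffman table computed below):
_toy_ht = {'0': 65, '10': 66, '11': 67}
_toy_out, _toy_fails, _toy_sizes = lookup_decoding('010110', _toy_ht).decode()
# _toy_out == [65, 66, 67, 65]; each entry of _toy_fails counts how many bits
# were buffered without a match before that symbol was recognised.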
class length_code_decoding:
def __init__(self, cf, ht, lc):
self.cf = cf
self.ht = ht
self.lc = lc
def decode(self):
o_file = []
f_list = []
s_list = []
c_size = len(self.cf)
index = 0
while index < c_size:
attempts = 0
for sz in self.lc:
possible_code = self.cf[index: index + sz]
if possible_code in self.ht.keys():
o_file.append(int(self.ht[possible_code]))
index = index + sz
f_list.append(attempts)
s_list.append(sz)
break
attempts += 1
return o_file, f_list, s_list
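# Illustrative use of length_code_decoding with the same toy table; the list of
# candidate code lengths (here: shortest first) drives the slicing attempts.
_lc_out, _lc_fails, _lc_sizes = length_code_decoding('010110', _toy_ht, [1, 2]).decode()
# _lc_out == [65, 66, 67, 65]; _lc_fails counts the lengths tried before each hit.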
class information_decoding:
def __init__(self, cf, ht, lc):
self.cf = cf
self.ht = ht
self.lc = lc
# self.count_whole = {}
# self.count_whole.update( [(l, t['total_bytes'][l]) for l in t['length_code']])
def decode(self):
o_file = []
f_list = []
s_list = []
c_size = len(self.cf)
index = 0
while index < c_size:
attempts = 0
for sz in self.lc:
possible_code = self.cf[index: index + sz]
if possible_code in self.ht.keys():
o_file.append(self.ht[possible_code])
f_list.append(attempts)
s_list.append(sz)
index += sz
# self.count_whole[sz] -= 1
# if self.count_whole[sz] == 0:
# self.lc.remove(sz)
break
attempts += 1
return o_file, f_list, s_list
class MKC_decoding:
def __init__(self, cf, ht, lc, mtx):
self.cf = cf
self.ht = ht
self.lc = lc
self.mtx = mtx
def decode(self):
o_file = []
f_list = []
s_list = []
c_size = len(self.cf)
predict_length = MarkovChain(transition_matrix=self.mtx , states=self.lc)
#find the first stage (code length)
next_state_length = 0
attempts = 0
for sz in self.lc:
possible_code = self.cf[0: sz]
if possible_code in self.ht.keys():
o_file.append(self.ht[possible_code])
f_list.append(attempts)
s_list.append(sz)
next_state_length= sz
break
attempts += 1
index = next_state_length
while index < c_size:
next_state_length = predict_length.next_state(current_state=next_state_length)
possible_code = self.cf[index: index + next_state_length]
if possible_code in self.ht.keys():
o_file.append(self.ht[possible_code])
f_list.append(attempts)
s_list.append(next_state_length)
index += next_state_length
# next_state_length= next_states_length
attempts = 0
else:
attempts +=1
return o_file, f_list, s_list
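# Illustrative use of MKC_decoding with the toy table and a hypothetical 2x2
# transition matrix over the lengths [1, 2]; in the experiments the matrix would
# instead be estimated from the observed sequence of code lengths.
_mkc_out, _mkc_fails, _mkc_sizes = MKC_decoding('010110', _toy_ht, [1, 2], [[0.5, 0.5], [0.5, 0.5]]).decode()
# _mkc_out == [65, 66, 67, 65]; the fail counts vary between runs because the
# next code length is sampled from the chain.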
# class bidireccional_decoding:
# def __init__(self, cf, ht, lc):
# self.cf = cf
# self.ht = ht
# self.lc = lc
# o_file = []
# f_list = []
# c_size = len(self.cf)
# def __reverse_slicing(self, s):
# return s[::-1]
# def __decoding(self, id_t, c, is_inv):
# o_file = []
# f_list = []
# s_list = []
# index = 0
# if is_inv:
# else:
# while index < len(c):
# attempts = 0
# for sz in self.lc:
# possible_code = c[index: index + sz]
# if possible_code in self.ht.keys():
# o_file.append(self.ht[possible_code])
# f_list.append(attempts)
# s_list.append(sz)
# index += sz
# break
# attempts += 1
# return id_t, dc, remained
# def decode(self):
# pool = multiprocessing.Pool( args.numProcessors )
# tasks = []
# cs1, cs2 = self.cf[:int(len(self.cf)/2)], self.__reverse_slicing(self.cf[int(len(self.cf)/2):])
# tasks.append((1, cs1, False))
# tasks.append((2, cs2, True))
# results = [pool.apply_async(self.__decoding, t) for t in tasks]
# dict_dc = {1:[], 2:[]}
# remained = {1:'', 2:''}
# for r in results:
# (id, dc, remained) = result.get()
# dict_dc[id].append(dc)
# remained[id] = remained
# o_file= dict_dc[1] + self.ht[remained[1] + self.__reverse_slicing(remained[2])] + dict_dc[2]
# pool.close()
# pool.join()
# return o_file, f_list, s_list
h = Huffman(originalVector)
h.compress()
print("Compresion del nuevo string:", len(h.compressedFile)/8, " de: ",len(originalVector))
print('comprimido en un {}'.format(h.details_compression()))
print("Tasa de compresion: ", len(originalVector) / (len(h.compressedFile)/8))
ht_inv = { v: k for k, v in h.huffmancodes.items()}
nt = h.info_table()
lengths = list(nt["Longitud Código"])
nt
pd.set_option('display.max_rows', None)
df = h.base_table
df = df.sort_values(["Length_code", "Length_Byte", "Start_Byte", "Frequency"], ascending=(True, False, True, False))
# df.to_csv(r'data/table.csv')
df