Burrows Wheeler transform and Move To Front (BWT + MTF)

Created by Ramses Alexander Coraspe Valdez

Created on May 20, 2020

In [32]:
from scipy.stats import entropy
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import numpy as np

Entropy

In [33]:
def entropy_shannon(s, base=None):
    value, counts = np.unique(s, return_counts=True)
    return entropy(counts, base = base)

Burrows Wheeler transform (BWT)

In [34]:
def bw_transform(s):
    n = len(s)
    m = sorted([s[i:n]+s[0:i] for i in range(n)])
    I = m.index(s)
    L = [q[-1] for q in m]
    return (I, L)


def bw_restore(I, L):
    n = len(L)
    X = sorted([(i, x) for i, x in enumerate(L)], key=itemgetter(1))

    T = [None for i in range(n)]
    for i, y in enumerate(X):
        j, _ = y
        T[j] = i

    Tx = [I]
    for i in range(1, n):
        Tx.append(T[Tx[i-1]])

    S = [L[i] for i in Tx]
    S.reverse()
    return ''.join(S)

Move to Front (MTF)

In [35]:
common_dictionary = list(range(256))
def encodeMTF(plain_text):      
    dictionary = common_dictionary.copy()
    
    compressed_text = list()    
    rank = 0

    for c in plain_text:
        rank = dictionary.index(int(c))   
        compressed_text.append(rank) 

        dictionary.pop(rank)
        dictionary.insert(0, int(c))

    return compressed_text

def decodeMTF(compressed_data):
    compressed_text = compressed_data
    dictionary = common_dictionary.copy()
    plain_data = []

    for rank in compressed_text:

        plain_data.append(dictionary[rank])

        e = dictionary.pop(rank)
        dictionary.insert(0, e)

    return plain_data 

Plotting code

In [36]:
def get_x_labels(s_v):
    x_labels = []
    for ix in range(len(s_v)):
        if ix % 5 == 0:
            x_labels.append(str(ix))
        else:
            x_labels.append('')
    return x_labels

def plot(bts):

    term_freq = Counter(bts)
    n = len(term_freq)
    N = sum(term_freq.values())
    for term in term_freq:
        term_freq[term] = term_freq[term] / N
    
    df = pd.DataFrame.from_records(term_freq.most_common(n), columns = ['Byte', 'Frecuencia'])
    fig = plt.figure(figsize = (18, 6))
    ax = sns.barplot(x = 'Byte', y = 'Frecuencia', data = df.sort_values(by=['Byte']), color='#3498db')
    plt.xticks(fontsize = 10, rotation = 60)
    plt.title('Frecuencia de los bytes')
    return plt

File Reading

In [37]:
def read_file(file_name):
    with open(file_name, 'rb') as r:                                                           
          original= bytearray(r.read())
    return [x for x in original]

Before BWT and MTF

In [38]:
originalbytes = read_file('book1-en.txt')
plt = plot(originalbytes)
plt.show()
print("Entropy:", entropy_shannon(originalbytes))
Entropy: 3.1504021522329757

After BWT and MTF

In [39]:
originalbytes = read_file('book1-en.txt')

slices_originalbytes = [originalbytes[i:i + 10000] for i in range(0, len(originalbytes), 10000)]  

bw_ids = []
bytesbwt = []

for sc in slices_originalbytes:
    i, bw_slice = bw_transform(sc)
    bw_ids.append(i)
    bytesbwt.extend(bw_slice)

bytesMTF  = encodeMTF(bytesbwt)

plt = plot(bytesMTF)
plt.show()
print("Entropy:", entropy_shannon(bytesMTF))
Entropy: 2.449382298285078