from scipy.stats import entropy
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import numpy as np
def entropy_shannon(s, base=None):
    """Return the Shannon entropy of the values in *s*.

    Args:
        s: Array-like of symbols; the occurrences of each distinct value
            are counted with ``np.unique``.
        base: Forwarded unchanged to ``scipy.stats.entropy`` as the
            logarithm base (``None`` leaves SciPy's default in effect).

    Returns:
        The entropy computed by ``scipy.stats.entropy`` from the counts.
    """
    # Only the counts matter; the distinct values themselves are discarded.
    occurrences = np.unique(s, return_counts=True)[1]
    return entropy(occurrences, base=base)
def bw_transform(s):
    """Compute the Burrows-Wheeler transform of the sequence *s*.

    Args:
        s: A sliceable, concatenable sequence (str, bytes, list, ...)
            whose rotations can be ordered with ``<``.

    Returns:
        A tuple ``(I, L)`` where ``I`` is the row of *s* itself in the
        sorted table of all rotations and ``L`` is a list holding the last
        element of every sorted rotation.

    Raises:
        ValueError: If *s* is empty (there is no row to locate).
    """
    size = len(s)
    # Sort the rotation start offsets by the rotation each one produces.
    # sorted() is stable, so equal rotations keep ascending offset order and
    # offset 0 lands on the first row that equals the original sequence.
    starts = sorted(range(size), key=lambda k: s[k:] + s[:k])
    original_row = starts.index(0)
    # The rotation starting at offset k ends with s[k - 1] (s[-1] for k == 0).
    last_column = [s[k - 1] for k in starts]
    return (original_row, last_column)
def bw_restore(I, L):
    """Invert the Burrows-Wheeler transform produced by ``bw_transform``.

    Args:
        I: Row index of the original sequence in the sorted rotation table.
        L: Last column of that table. Its elements must be strings because
            the result is assembled with ``''.join``.

    Returns:
        The reconstructed original string, or ``''`` when *L* is empty.

    Raises:
        IndexError: If *I* is not a valid index into a non-empty *L*.
    """
    n = len(L)
    if n == 0:
        # Nothing to walk; avoids indexing L[I] on an empty column.
        return ''

    # Stable sort of the positions of L by symbol: order[i] is the row of L
    # whose symbol ends up in row i of the first (sorted) column.
    order = sorted(range(n), key=lambda pos: L[pos])

    # T maps a row of the last column to the row of the first column that
    # holds the same symbol occurrence.
    T = [None] * n
    for first_row, last_row in enumerate(order):
        T[last_row] = first_row

    # Follow the mapping from row I; this visits the symbols of the original
    # sequence from its last one back to its first one.
    Tx = [I]
    for _ in range(1, n):
        Tx.append(T[Tx[-1]])

    S = [L[i] for i in Tx]
    S.reverse()
    return ''.join(S)
# Initial move-to-front alphabet: every byte value in ascending order.
common_dictionary = list(range(256))


def encodeMTF(plain_text):
    """Move-to-front encode an iterable of byte values.

    Args:
        plain_text: Iterable whose items are converted with ``int()`` and
            must be present in ``common_dictionary`` (0-255).

    Returns:
        A list with, for each input item, its position in the working
        alphabet at the moment it was encoded.

    Raises:
        ValueError: If an item is not in the alphabet.
    """
    alphabet = list(common_dictionary)  # private copy; the global stays intact
    ranks = []
    for item in plain_text:
        symbol = int(item)
        position = alphabet.index(symbol)
        ranks.append(position)
        # Move the symbol just seen to the front of the alphabet.
        del alphabet[position]
        alphabet.insert(0, symbol)
    return ranks
def decodeMTF(compressed_data):
    """Decode a sequence of move-to-front ranks back into symbol values.

    Args:
        compressed_data: Iterable of integer positions into the working
            alphabet, which starts as a copy of ``common_dictionary``.

    Returns:
        A list with the symbol found at each position, in input order.

    Raises:
        IndexError: If a position lies outside the alphabet.
    """
    alphabet = common_dictionary.copy()
    decoded = []
    for position in compressed_data:
        # Take the symbol out of its current slot and move it to the front.
        symbol = alphabet.pop(position)
        decoded.append(symbol)
        alphabet.insert(0, symbol)
    return decoded
def get_x_labels(s_v):
    """Build one axis label per element of *s_v*, showing every fifth index.

    Args:
        s_v: Any sized collection; only its length is used.

    Returns:
        A list of ``len(s_v)`` strings: ``str(index)`` at indices divisible
        by 5 and the empty string everywhere else.
    """
    return [str(pos) if pos % 5 == 0 else '' for pos in range(len(s_v))]
def plot(bts):
    """Draw a bar chart with the relative frequency of each value in *bts*.

    Args:
        bts: Iterable of hashable values (the script passes lists of byte
            values).

    Returns:
        The ``plt`` object the chart was drawn with, so the caller can
        invoke ``.show()`` on it.
    """
    counts = Counter(bts)
    total = sum(counts.values())
    # Convert absolute counts into fractions of the total number of items.
    relative = Counter({term: count / total for term, count in counts.items()})
    records = relative.most_common(len(relative))
    df = pd.DataFrame.from_records(records, columns=['Byte', 'Frecuencia'])

    plt.figure(figsize=(18, 6))
    # Bars are laid out by byte value rather than by frequency.
    sns.barplot(
        x='Byte',
        y='Frecuencia',
        data=df.sort_values(by=['Byte']),
        color='#3498db',
    )
    plt.xticks(fontsize=10, rotation=60)
    plt.title('Frecuencia de los bytes')
    return plt
def read_file(file_name):
    """Read *file_name* in binary mode and return its content as integers.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one int (0-255) per byte of the file, in file order.
    """
    with open(file_name, 'rb') as handle:
        # Iterating over bytes yields ints, so list() gives the byte values.
        return list(handle.read())
# --- Baseline: byte distribution and entropy of the untouched file ---------
originalbytes = read_file('book1-en.txt')
plt = plot(originalbytes)  # plot() returns the object it drew with
plt.show()
print("Entropy:", entropy_shannon(originalbytes))

# --- Same measurements after block-wise BWT followed by move-to-front ------
originalbytes = read_file('book1-en.txt')

# The transform is applied to consecutive chunks of 10000 byte values.
slices_originalbytes = [
    originalbytes[i:i + 10000]
    for i in range(0, len(originalbytes), 10000)
]

bw_ids, bytesbwt = [], []
for sc in slices_originalbytes:
    i, bw_slice = bw_transform(sc)
    bw_ids.append(i)       # row index returned for this chunk
    bytesbwt += bw_slice   # in-place extension with the chunk's last column

bytesMTF = encodeMTF(bytesbwt)
plt = plot(bytesMTF)
plt.show()
print("Entropy:", entropy_shannon(bytesMTF))