Skip to content

Commit

Permalink
Proper normalization
Browse files Browse the repository at this point in the history
Except for some hardcoded values
  • Loading branch information
baAlex committed May 1, 2022
1 parent c95ab05 commit 41a44ec
Show file tree
Hide file tree
Showing 4 changed files with 287 additions and 132 deletions.
97 changes: 76 additions & 21 deletions resources/ans-cdf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ template <typename T> class Cdf
}

public:
Cdf(const T* message, size_t len, uint32_t normalize_to = 0)
Cdf(const T* message, size_t len, uint32_t scale_to = 0)
{
// Cumulative distribution function (CDF) following Pasco (1976, p.10)
// (on my own nomenclature):
Expand Down Expand Up @@ -103,31 +103,87 @@ template <typename T> class Cdf
cumulative += i.frequency;
}

// Normalize (TODO, TODO, TODO)
bool normalized = false;
if (normalize_to != 0 && max_cumulative_ > normalize_to)
// Scale (TODO, not good nor that bad... I need to
// study real-world projects, hopefully find a paper)
if (scale_to != 0)
{
const auto div = std::ceil(static_cast<double>(max_cumulative_ - 1) / static_cast<double>(normalize_to));
normalized = true;
const auto mul = static_cast<double>(scale_to) / static_cast<double>(max_cumulative_);

std::cout << "### Original cumulative: " << max_cumulative_ << "\n";
std::cout << "### Scale: " << mul << "x\n";

uint32_t cumulative = 0;
for (auto& i : table)
size_t zeros_from = table.size();
size_t twos_from = table.size();

// Accumulate, again, this time scaling frequencies
for (size_t i = 0; i < table.size(); i++)
{
i.frequency = static_cast<uint32_t>(std::floor(static_cast<double>(i.frequency) / div));
if (i.frequency == 0)
i.frequency = 1;
struct CdfEntry<T>& e = table[i];

// Accumulate those that do not require current frequency
max_cumulative_ = cumulative + 1;
i.cumulative = cumulative;
cumulative += i.frequency;
e.cumulative = cumulative;

// Scale the frequency; if it ends up being zero, as long as
// 'max_cumulative' allows us, convert it back to one
e.frequency = static_cast<uint32_t>(std::floor(static_cast<double>(e.frequency) * mul));

if (e.frequency == 0 && max_cumulative_ < scale_to)
e.frequency = 1;

// Accumulate frequency
cumulative += e.frequency;

// We need to keep track of some numbers
if (e.frequency == 0)
{
zeros_from = i;
break; // From here all values will be zero
}
else if (e.frequency > 1)
twos_from = i;
}

if (max_cumulative_ > normalize_to)
// Bad news, we need to fix zero values. We are going to convert
// them to one while subtracting one to those greater than... one
std::cout << "### Zeros: " << table.size() - zeros_from << "\n";

if (zeros_from != table.size())
{
std::cout << max_cumulative_ << " > " << normalize_to << "\n";
std::cout << "Entropy: " << h << " bits per symbol\n";
throw std::runtime_error("No enough precision (and bad maths).");
if (twos_from == table.size())
throw std::runtime_error("No enough entropy."); // White noise isn't funny

for (size_t i = 0; i < (table.size() - zeros_from); i++)
{
struct CdfEntry<T>& ez = table[table.size() - i - 1];
ez.frequency = 1;

struct CdfEntry<T>& et = table[twos_from];
et.frequency--;

if (et.frequency == 1)
twos_from--;

if (twos_from > table.size()) // Underflows
throw std::runtime_error("No enough precision."); // Too much data
}

// Accumulate frequencies, for a third time
cumulative = 0;
for (auto& i : table)
{
max_cumulative_ = cumulative + 1;
i.cumulative = cumulative;
cumulative += i.frequency;
}
}

// Done
if (max_cumulative_ > scale_to) // This shouldn't happen
throw std::runtime_error("Bad math involving std::floor()!.");

max_cumulative_ = scale_to; // Ok, it can be less, in such case we lie
}

// Developers, developers, developers
Expand All @@ -147,12 +203,11 @@ template <typename T> class Cdf

std::cout << "\"\n";
std::cout << "Length: " << len << " symbols\n";
std::cout << "Maximum cumulative: " << max_cumulative_ << "\n";
std::cout << "Unique symbols: " << table.size() << "\n";
std::cout << "Entropy: " << h << " bits per symbol"
<< ((normalized == true) ? " (before normalization)\n" : "\n");
std::cout << "Maximum cumulative: " << max_cumulative_ << ((scale_to != 0) ? " (scaled)\n" : "\n");
std::cout << "Entropy: " << h << " bits per symbol" << ((scale_to != 0) ? " (before scaling)\n" : "\n");
std::cout << "Shannon target: " << (h * static_cast<double>(len)) / 8.0 << " bytes"
<< ((normalized == true) ? " (before normalization)\n" : "\n");
<< ((scale_to != 0) ? " (before scaling)\n" : "\n");

for (size_t i = 0; i < std::min(table.size(), SYMBOLS_MAX_PRINT); i++)
std::cout << " - '" << table[i].symbol << "' (f: " << table[i].frequency
Expand Down Expand Up @@ -190,7 +245,7 @@ template <typename T> class Cdf
return table[table.size() - 1];
}

uint32_t max_cumulative() const
uint32_t m() const
{
return max_cumulative_;
}
Expand Down
79 changes: 79 additions & 0 deletions resources/ans-d3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
I am he as you are he as you are me
And we are all together
See how they run like pigs from a gun
See how they fly
I'm crying
Sitting on a corn flake
Waiting for the van to come
Corporation T-shirt, stupid bloody Tuesday
Man you've been a naughty boy
You let your face grow long
I am the egg man
They are the egg men
I am the walrus
Goo goo g'joob
Mister City policeman sitting
Pretty little policemen in a row
See how they fly like Lucy in the sky, see how they run
I'm crying, I'm crying
I'm crying, I'm crying
Yellow matter custard
Dripping from a dead dog's eye
Crabalocker fishwife, pornographic priestess
Boy, you've been a naughty girl, you let your knickers down
I am the egg man
They are the egg men
I am the walrus
Goo goo g'joob
Sitting in an English garden
Waiting for the sun
If the sun don't come you get a tan
From standing in the English rain
I am the egg man (now good sir)
They are the egg men (a poor man, made tame to fortune's blows)
I am the walrus
Goo goo g'joob, goo goo goo g'joob (good pity)
Expert, texpert choking smokers
Don't you think the joker laughs at you (ho ho ho, hee hee hee, hah hah hah)
See how they smile like pigs in a sty, see how they snide
I'm crying
Semolina Pilchard
Climbing up the Eiffel tower
Elementary penguin singing Hare Krishna
Man, you should have seen them kicking Edgar Allen Poe
I am the egg man
They are the egg men
I am the walrus
Goo goo g'joob, goo goo goo g'joob
Goo goo g'joob, goo goo goo g'joob, goo
Joob, joob, jooba
Jooba, jooba, jooba
Joob, jooba
Joob, jooba
Umpa, umpa, stick it up your jumper (jooba, jooba)
Umpa, umpa, stick it up your jumper
Everybody's got one (umpa, umpa)
Everybody's got one (stick it up your jumper)
Everybody's got one (umpa, umpa)
Everybody's got one (stick it up your jumper)
Everybody's got one (umpa, umpa)
Everybody's got one (stick it up your jumper)
Everybody's got one (umpa, umpa)
Everybody's got one (stick it up your jumper)
Everybody's got one (umpa, umpa)
Everybody's got one (stick it up your jumper)
Everybody's got one (umpa, umpa)
Slave
Thou hast slain me
Villain, take my purse
If I ever
Bury my body
The letters which though find'st about me
To Edmund Earl of Gloucester
Seek him out upon the British Party
O untimely death
I know thee well
A serviceable villain, as duteous to the vices of thy mistress
As badness would desire
What, is is he dead?
Sit you down, Father, rest you
99 changes: 49 additions & 50 deletions resources/ans1-core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ USE OR PERFORMANCE OF THIS SOFTWARE.
// ans1-core.cpp


#include <cstring>
#include <iostream>
#include <vector>

#include "ans-cdf.hpp"

Expand All @@ -43,44 +43,43 @@ typedef uint64_t state_t;
const state_t INITIAL_STATE = 123;


template <typename T> state_t Encode(const Cdf<T>& cdf, const T* message, size_t len)
{
// (En)coding:
// « C(s, x) = m * floor(x / l[s]) + b[s] + mod(x, l[s]) » (Duda 2014, p.8)
// (En)coding:
// « C(s, x) = m * floor(x / l[s]) + b[s] + mod(x, l[s]) » (Duda 2014, p.8)

// Decoding:
// D(x) = (s <- s[mod(x, m)])
// D(x, s) = (x <- l[s] * floor(x / m) + mod(x, m) - b[s])
// Decoding:
// D(x) = (s <- s[mod(x, m)])
// D(x, s) = (x <- l[s] * floor(x / m) + mod(x, m) - b[s])

// Own nomenclature on Duda (2014, ibid.)
// Own nomenclature on Duda (2014, ibid.)

// Where 'l[s]' and 'b[s]' are, respectively, the frequency (probability) of symbol 's'
// and its cumulative frequency. 'x' is the state/accumulator and 'm' is a number greater
// than the cumulative frequency of the last symbol in our lexicographically ordered
// cumulative table. 'm' can also be a power of two (which helps to replace
// multiplications and divisions with shifts).
// Where 'l[s]' and 'b[s]' are, respectively, the frequency (probability) of symbol 's'
// and its cumulative frequency. 'x' is the state/accumulator and 'm' is a number greater
// than the cumulative frequency of the last symbol in our lexicographically ordered
// cumulative table. 'm' can also be a power of two (which helps to replace
// multiplications and divisions with shifts).

std::cout << "\nEncode:\n";
state_t state = INITIAL_STATE;

for (const T* i = message; i != message + len; i++)
{
// Encode
const CdfEntry<T>& e = cdf.of_symbol(*i);
// Coding step, « C(s, x) = m * floor(x / l[s]) + b[s] + mod(x, l[s]) »:
// returns the state that results from pushing one symbol into 'state'.
//  - state:      current state/accumulator ('x')
//  - frequency:  'l[s]' of the symbol being coded, used as divisor
//  - cumulative: 'b[s]' of the symbol being coded
//  - m:          multiplier applied to the quotient
state_t C(state_t state, uint32_t frequency, uint32_t cumulative, uint32_t m)
{
	// Split the state by the symbol frequency...
	const auto quotient = state / frequency;
	const auto remainder = state % frequency;

	// ...and rebuild it on base 'm', offset into the symbol's range
	return m * quotient + remainder + cumulative;
}

const auto prev_state = state;
state = cdf.max_cumulative() * (state / e.frequency) + (state % e.frequency) + e.cumulative;
// Decoding step, « D(x, s) = l[s] * floor(x / m) + mod(x, m) - b[s] »:
// returns the state as it was before the symbol was coded.
//  - state:        current state/accumulator ('x')
//  - frequency:    'l[s]' of the decoded symbol
//  - cumulative:   'b[s]' of the decoded symbol
//  - m:            divisor applied to the state
//  - modulo_point: taken as given and not recomputed here, stands for
//                  'mod(x, m)' in the formula above
state_t D(state_t state, uint32_t frequency, uint32_t cumulative, uint32_t m, uint32_t modulo_point)
{
	// How many whole 'm' fit in the state
	const auto cycles = state / m;

	// Undo the coding step
	return frequency * cycles + modulo_point - cumulative;
}

// Try to decode, a failure means that the state overflow-ed (well... wrap-around)
{
const uint32_t point = (state % cdf.max_cumulative()); // "Point" as is inside a range
const CdfEntry<T>& e = cdf.of_point(point);

const state_t decode_state = e.frequency * (state / cdf.max_cumulative()) + point - e.cumulative;
template <typename T> state_t AnsEncode(const Cdf<T>& cdf, const std::vector<T>& message)
{
std::cout << "\nEncode:\n";
state_t state = INITIAL_STATE;

if (decode_state != prev_state)
throw std::runtime_error("ANS overflow, we don't have a reversible state.");
}
for (const auto& i : message)
{
// Encode
const CdfEntry<T>& e = cdf.of_symbol(i);
state = C(state, e.frequency, e.cumulative, cdf.m());

// Developers, developers, developers
std::cout << " - '" << e.symbol << "' (f: " << e.frequency << ", c: " << e.cumulative << ")\t->\t" << state
Expand All @@ -91,20 +90,20 @@ template <typename T> state_t Encode(const Cdf<T>& cdf, const T* message, size_t
}


template <typename T> void Decode(const Cdf<T>& cdf, state_t state)
template <typename T> void AnsDecode(const Cdf<T>& cdf, state_t state)
{
std::cout << "\nDecode:\n";

while (state > INITIAL_STATE)
{
// Decode
const uint32_t point = (state % cdf.max_cumulative());
const CdfEntry<T>& e = cdf.of_point(point);
const uint32_t modulo_point = (state % cdf.m());

state = e.frequency * (state / cdf.max_cumulative()) + point - e.cumulative;
const CdfEntry<T>& e = cdf.of_point(modulo_point);
state = D(state, e.frequency, e.cumulative, cdf.m(), modulo_point);

// Developers, developers, developers
std::cout << " - '" << e.symbol << "' (p: " << point << ", f: " << e.frequency << ", c: " << e.cumulative
std::cout << " - '" << e.symbol << "' (p: " << modulo_point << ", f: " << e.frequency << ", c: " << e.cumulative
<< ")\t->\t" << state << "\n";
}
}
Expand All @@ -116,20 +115,21 @@ int main()

// Using Uint8 (char)
{
// const auto message = "hello";
const auto message = "hello there";
// const auto message = "abracadabra";
// const auto message = "111111111112";
// const auto message = "211111111111";
// const std::string s = "hello";
const std::string s = "hello there";
// const std::string s = "abracadabra";
// const std::string s = "111111111112";
// const std::string s = "211111111111";

// const auto message = "hello there, come here my little friend"; // Should fail
// const auto message = "1111111"; // Should fail
// const std::string s = "hello there, come here my little friend"; // Should fail
// const std::string s = "1111111"; // Should fail

try
{
const auto cdf = Cdf<char>(message, strlen(message));
const auto state = Encode<char>(cdf, message, strlen(message));
Decode<char>(cdf, state);
const auto message = std::vector<char>(s.begin(), s.end());
const auto cdf = Cdf<char>(message.data(), message.size());
const auto state = AnsEncode(cdf, message);
AnsDecode(cdf, state);
}
catch (const std::exception& e)
{
Expand All @@ -141,14 +141,13 @@ int main()
// Using Int16
{
std::cout << "\n";
const size_t length = 8;
const int16_t message[length] = {1, 9, 2, 1, 6, 8, 0, 1};
const std::vector<uint16_t> message = {1, 9, 2, 1, 6, 8, 0, 1};

try
{
const auto cdf = Cdf<int16_t>(message, length);
const auto state = Encode<int16_t>(cdf, message, length);
Decode<int16_t>(cdf, state);
const auto cdf = Cdf<uint16_t>(message.data(), message.size());
const auto state = AnsEncode(cdf, message);
AnsDecode(cdf, state);
}
catch (const std::exception& e)
{
Expand Down
Loading

0 comments on commit 41a44ec

Please sign in to comment.