Skip to content

Commit

Permalink
feat: add an arithmetic encoder to transform strings into f64 (#134)
Browse files Browse the repository at this point in the history
This PR implements an arithmetic encoder for alphanumeric strings,
enabling the strings to utilize t-digests (see
[Wikipedia](https://en.wikipedia.org/wiki/Arithmetic_coding)).

Characters outside the specified alpha-numerical range are treated as
identical and considered 'heaviest'.
  • Loading branch information
AlSchlo authored Mar 25, 2024
1 parent 204758e commit 3ebe94e
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 1 deletion.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion optd-gungnir/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ edition = "2021"
itertools = "0.11"
rand = "0.8"
crossbeam = "0.8"
serde = {version = "1.0", features = ["derive"]}
lazy_static = "1.4"
serde = {version = "1.0", features = ["derive"]}
1 change: 1 addition & 0 deletions optd-gungnir/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#![allow(clippy::new_without_default)]

pub mod stats;
pub mod utils;
2 changes: 2 additions & 0 deletions optd-gungnir/src/stats/misragries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ use std::{cmp::min, collections::HashMap, hash::Hash};

use itertools::Itertools;

/// Default value for `k`, the number of elements to track.
/// NOTE(review): presumably the `k` of the Misra-Gries structure documented
/// below; where it is consumed is not visible in this excerpt -- confirm.
pub const DEFAULT_K_TO_TRACK: u16 = 100;

/// The Misra-Gries structure to approximate the k most frequent elements in
/// a stream of N elements. It will always identify elements with frequency
/// f >= (n/k), and include additional leftovers.
Expand Down
1 change: 1 addition & 0 deletions optd-gungnir/src/utils.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod arith_encoder;
71 changes: 71 additions & 0 deletions optd-gungnir/src/utils/arith_encoder.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
//! This module provides an encoder that converts strings of printable ASCII
//! characters into f64 values, designed to preserve the ordering of the
//! strings (as defined by the character table below).
//!
//! While the encoding is theoretically lossless, in practice it suffers from
//! precision loss due to the limited resolution of floating-point numbers:
//! strings that share a long common prefix may be encoded to the same value.
//!
//! Characters outside the supported set are all treated as one single
//! 'heaviest' character, rendering them indistinguishable from one another
//! in this context.
use std::collections::HashMap;

use lazy_static::lazy_static;

/// The characters the encoder can tell apart, listed from lightest to
/// heaviest: the 95 printable ASCII characters, grouped as punctuation
/// (space first), then digits, then upper-case letters, then lower-case
/// letters.
///
/// A character's index in this table is what `CDF` uses to position it, so
/// the table order defines the ordering of the encoded values. Note that this
/// is not plain ASCII order (e.g. ':' is placed before '0').
const ALPHANUMERIC_ORDER: [char; 95] = [
' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<',
'=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '0', '1', '2', '3', '4',
'5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
];

/// Number of buckets every interval is split into: one per character of
/// `ALPHANUMERIC_ORDER`, plus a last one shared by all other characters.
const BUCKET_COUNT: usize = ALPHANUMERIC_ORDER.len() + 1;

/// Relative width of a single bucket. It must be derived from the same bucket
/// count as the lower bounds in `CDF`, otherwise neighbouring buckets overlap.
const PMF: f64 = 1.0 / (BUCKET_COUNT as f64);

/// Lower bound of the last bucket, i.e. the one shared by every character
/// that is not listed in `ALPHANUMERIC_ORDER`.
const FALLBACK_CDF: f64 = (ALPHANUMERIC_ORDER.len() as f64) / (BUCKET_COUNT as f64);

lazy_static! {
    /// Lower bound of each supported character's bucket, as a fraction of the
    /// current interval in `[0, 1)`.
    static ref CDF: HashMap<char, f64> = ALPHANUMERIC_ORDER
        .iter()
        .enumerate()
        .map(|(index, &character)| (character, (index as f64) / (BUCKET_COUNT as f64)))
        .collect();
}

/// Encodes `string` as an `f64` in `[0, f64::MAX]` using arithmetic coding
/// with a uniform distribution over the supported characters.
///
/// Starting from the interval `[0, f64::MAX]`, every character narrows the
/// current interval down to its own bucket; the lower bound of the final
/// interval is the encoded value. Characters that are not part of
/// `ALPHANUMERIC_ORDER` all share the last (heaviest) bucket.
///
/// The encoding is monotone with respect to the order of
/// `ALPHANUMERIC_ORDER`: if `a` sorts before `b`, then
/// `encode(a) <= encode(b)`. It is not strictly monotone:
/// * the interval shrinks by a factor of `BUCKET_COUNT` per character, so
///   strings sharing a long common prefix can end up with the same value once
///   the precision of `f64` is exhausted;
/// * trailing characters of the first bucket (`' '`) do not move the lower
///   bound, so e.g. `"a"` and `"a "` are encoded identically.
///
/// The result is always finite and never NaN; the empty string encodes to `0.0`.
pub fn encode(string: &str) -> f64 {
    let mut left = 0.0;
    let mut right = f64::MAX;

    for character in string.chars() {
        let distance = right - left;
        match CDF.get(&character) {
            Some(&cdf) => {
                // `right` must be derived from the old `left`, hence the order.
                right = left + distance * (cdf + PMF);
                left += distance * cdf;
            }
            // The fallback bucket is the topmost slice of the interval, so its
            // upper bound is the current `right`. Leaving `right` untouched
            // (instead of recomputing it) guarantees it never grows past
            // `f64::MAX`, which would poison later steps with inf / NaN.
            None => left += distance * FALLBACK_CDF,
        }
    }

    left
}

// Start of unit testing section.
#[cfg(test)]
mod tests {
    use super::encode;

    /// The empty string is mapped to the very start of the range.
    #[test]
    fn empty_string_is_smallest() {
        assert_eq!(encode(""), 0.0);
        assert!(encode("") < encode("abc"));
    }

    /// The encoded values follow the ordering of the strings.
    #[test]
    fn preserves_ordering() {
        assert!(encode("abc") < encode("bcd"));

        assert!(encode("a") < encode("aaa"));
        assert!(encode("Gun") < encode("Gung"));
        assert!(encode("!a") < encode("a!"));
        assert!(encode("Alexis") < encode("Schlomer"));

        assert!(encode("Gungnir Rules!") < encode("Schlomer"));
    }

    /// Encoding is a pure function of its input.
    #[test]
    fn is_deterministic() {
        assert_eq!(encode(" "), encode(" "));
        assert_eq!(encode("Same"), encode("Same"));
    }

    /// Known limitation: the lightest character does not move the lower bound
    /// of the interval, so it cannot be told apart from "no character".
    #[test]
    fn lightest_character_does_not_change_the_value() {
        assert_eq!(encode(" "), encode(""));
        assert_eq!(encode(" "), encode("  "));
    }

    /// Characters outside the supported set are heavier than any supported
    /// character and indistinguishable from one another.
    #[test]
    fn out_of_alphabet_characters_are_heaviest() {
        assert!(encode("Nicolas ") < encode("Nicolas💰💼"));
        assert!(encode("a") < encode("a💰"));
        assert!(encode("az") < encode("a💰"));
        assert_eq!(encode("a💰"), encode("a💼"));
    }
}

0 comments on commit 3ebe94e

Please sign in to comment.