From b55f745c9e4bd86e3dd7fd68220e39a2aa1e81e0 Mon Sep 17 00:00:00 2001 From: David Teller Date: Fri, 13 Sep 2019 14:34:17 +0200 Subject: [PATCH 1/7] Huffman: Key invariant: highest weight bits are 0 --- crates/binjs_io/src/context/huffman.rs | 61 +++++++++++++++++++++----- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/crates/binjs_io/src/context/huffman.rs b/crates/binjs_io/src/context/huffman.rs index 62cfaf4f6..a54dbca41 100644 --- a/crates/binjs_io/src/context/huffman.rs +++ b/crates/binjs_io/src/context/huffman.rs @@ -35,23 +35,64 @@ impl std::ops::Shl for u32 { } } +/// Convenience implementation of operator `>>` in +/// `bits >> bit_len` +impl std::ops::Shr for u32 { + type Output = u32; + fn shr(self, rhs: BitLen) -> u32 { + self >> Into::::into(rhs) + } +} + /// The largerst acceptable length for a key. /// /// Hardcoded in the format. const MAX_CODE_BIT_LENGTH: u8 = 20; +// privacy barrier +mod key { + use context::huffman::BitLen; + /// A Huffman key #[derive(Debug)] -struct Key { +pub struct Key { /// The bits in the key. /// /// Note that we only use the `bit_len` lowest-weight bits. - /// Any other bit is ignored. + /// Any other bit MUST BE 0. bits: u32, /// The number of bits of `bits` to use. bit_len: BitLen, } +impl Key { + /// Create a new Key. + pub fn new(bits: u32, bit_len: BitLen) -> Self { + debug_assert!({let bit_len : u8 = bit_len.into(); bit_len <= 32}); + debug_assert!({let bit_len : u8 = bit_len.into(); if bit_len < 32 { bits >> bit_len == 0 } else { true }}); + Key { + bits, + bit_len, + } + } + + /// The bits in the key. + /// + /// Note that we only use the `bit_len` lowest-weight bits. + /// Any other bit is guaranteed to be 0. + pub fn bits(&self) -> u32 { + self.bits + } + + /// The number of bits of `bits` to use. + pub fn bit_len(&self) -> BitLen { + self.bit_len + } +} + +} // mod key + +use self::key::Key; /// A node in the Huffman tree. 
struct Node { @@ -171,12 +212,12 @@ where bit_lengths[i].0.clone(), bit_lengths[i + 1].1, ); - keys.push((symbol.clone(), Key { bits, bit_len })); + keys.push((symbol.clone(), Key::new(bits, bit_len))); bits = (bits + 1) << (next_bit_len - bit_len); } // Handle the last element. let (ref symbol, bit_len) = bit_lengths[bit_lengths.len() - 1]; - keys.push((symbol.clone(), Key { bits, bit_len })); + keys.push((symbol.clone(), Key::new(bits, bit_len))); return Ok(Self { keys }); } @@ -275,14 +316,14 @@ fn test_coded_from_sequence() { assert_eq!(coded.keys[2].0, 'l'); // Check bit length of symbols. - assert_eq!(coded.keys[0].1.bit_len, 1.into()); - assert_eq!(coded.keys[1].1.bit_len, 2.into()); - assert_eq!(coded.keys[2].1.bit_len, 2.into()); + assert_eq!(coded.keys[0].1.bit_len(), 1.into()); + assert_eq!(coded.keys[1].1.bit_len(), 2.into()); + assert_eq!(coded.keys[2].1.bit_len(), 2.into()); // Check code of symbols. - assert_eq!(coded.keys[0].1.bits, 0b00); - assert_eq!(coded.keys[1].1.bits, 0b10); - assert_eq!(coded.keys[2].1.bits, 0b11); + assert_eq!(coded.keys[0].1.bits(), 0b00); + assert_eq!(coded.keys[1].1.bits(), 0b10); + assert_eq!(coded.keys[2].1.bits(), 0b11); // Let's try again with a limit to 1 bit paths. assert_eq!(Keys::from_sequence(sample.chars(), 1).unwrap_err(), 2); From 7e8d52f0715bffe826e3855a3541b8ac892652fb Mon Sep 17 00:00:00 2001 From: David Teller Date: Mon, 16 Sep 2019 17:08:56 +0200 Subject: [PATCH 2/7] Context 0.1: Introducing SingleLookupHuffmanTable This table provides best performance but may only be used reasonably for small bit lengths due to its memory cost. We'll implement a more space-efficient (but not as fast) MultiLookupHuffmanTable in a followup patch. 
--- .../context/{huffman.rs => huffman/mod.rs} | 187 ++++++++--- crates/binjs_io/src/context/huffman/read.rs | 308 ++++++++++++++++++ 2 files changed, 455 insertions(+), 40 deletions(-) rename crates/binjs_io/src/context/{huffman.rs => huffman/mod.rs} (67%) create mode 100644 crates/binjs_io/src/context/huffman/read.rs diff --git a/crates/binjs_io/src/context/huffman.rs b/crates/binjs_io/src/context/huffman/mod.rs similarity index 67% rename from crates/binjs_io/src/context/huffman.rs rename to crates/binjs_io/src/context/huffman/mod.rs index a54dbca41..cec407397 100644 --- a/crates/binjs_io/src/context/huffman.rs +++ b/crates/binjs_io/src/context/huffman/mod.rs @@ -1,9 +1,13 @@ use io::statistics::Instances; +use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BinaryHeap, HashMap}; use std::hash::Hash; +/// Reading from bitstreams and decoding their contents using Huffman tables. +pub mod read; + /// A newtype for `u8` used to count the length of a key in bits. #[derive( Debug, @@ -25,6 +29,11 @@ use std::hash::Hash; Eq, )] pub struct BitLen(u8); +impl BitLen { + pub fn as_u8(&self) -> u8 { + self.0 + } +} /// Convenience implementation of operator `<<` in /// `bits << bit_len` @@ -34,6 +43,12 @@ impl std::ops::Shl for u32 { self << Into::::into(rhs) } } +impl std::ops::Shl for usize { + type Output = usize; + fn shl(self, rhs: BitLen) -> usize { + self << Into::::into(rhs) + } +} /// Convenience implementation of operator `>>` in /// `bits >> bit_len` @@ -43,56 +58,125 @@ impl std::ops::Shr for u32 { self >> Into::::into(rhs) } } +impl std::ops::Shr for usize { + type Output = usize; + fn shr(self, rhs: BitLen) -> usize { + self >> Into::::into(rhs) + } +} /// The largerst acceptable length for a key. /// /// Hardcoded in the format. const MAX_CODE_BIT_LENGTH: u8 = 20; -// privacy barrier -mod key { - use context::huffman::BitLen; - -/// A Huffman key -#[derive(Debug)] -pub struct Key { - /// The bits in the key. 
- /// - /// Note that we only use the `bit_len` lowest-weight bits. - /// Any other bit MUST BE 0. +/// A sequence of bits, read from a bit stream. +/// +/// Typically used for lookup of entries in Huffman tables. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct BitSequence { bits: u32, - - /// The number of bits of `bits` to use. bit_len: BitLen, } -impl Key { - /// Create a new Key. +impl BitSequence { pub fn new(bits: u32, bit_len: BitLen) -> Self { - debug_assert!({let bit_len : u8 = bit_len.into(); bit_len <= 32}); - debug_assert!({let bit_len : u8 = bit_len.into(); if bit_len < 32 { bits >> bit_len == 0 } else { true }}); - Key { - bits, - bit_len, + Self { bits, bit_len } + } + pub fn bits(&self) -> u32 { + self.bits + } + /// The number of bits of `bits` to use. + pub fn bit_len(&self) -> BitLen { + self.bit_len + } + /// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len` + /// bits. + /// + /// # Failure + /// + /// This function panics if `bit_len > self.bit_len`. + pub fn split(&self, bit_len: BitLen) -> (u32, u32) { + let shift = self.bit_len - bit_len; + match shift.into() { + 0u8 => (self.bits, 0), // Special case: cannot >> 32 + 32u8 => (0, self.bits), // Special case: cannot >> 32 + shift => ( + self.bits >> shift, + self.bits & (std::u32::MAX >> 32 - shift), + ), } } + pub fn pad_lowest_to(&self, total_bit_len: BitLen) -> Cow { + assert!(total_bit_len.0 <= 32u8); + if total_bit_len <= self.bit_len { + return Cow::Borrowed(self); + } + let shift = total_bit_len - self.bit_len; + if shift.0 == 32u8 { + return Cow::Owned(BitSequence::new(0, BitLen(32))); + } + Cow::Owned(BitSequence::new(self.bits << shift, total_bit_len)) + } +} - /// The bits in the key. 
+#[test] +fn test_bit_sequence_split() { + let bits = 0b11111111_11111111_00000000_00000000; + let key = BitSequence::new(bits, BitLen(32)); + assert_eq!(key.split(BitLen(0)), (0, bits)); + assert_eq!(key.split(BitLen(32)), (bits, 0)); + assert_eq!(key.split(BitLen(16)), (0b11111111_11111111, 0)); + + let bits = 0b00000000_00000000_00000000_11111111; + let key = BitSequence::new(bits, BitLen(16)); + assert_eq!(key.split(BitLen(0)), (0, bits)); + assert_eq!(key.split(BitLen(16)), (bits, 0)); + assert_eq!(key.split(BitLen(8)), (0, 0b11111111)); +} + +/// A Huffman key +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Key(BitSequence); + +impl Key { + /// Create a new Key. /// /// Note that we only use the `bit_len` lowest-weight bits. - /// Any other bit is guaranteed to be 0. + /// Any other bit MUST BE 0. + pub fn new(bits: u32, bit_len: BitLen) -> Self { + debug_assert!({ + let bit_len: u8 = bit_len.into(); + bit_len <= 32 + }); + debug_assert!({ + let bit_len: u8 = bit_len.into(); + if bit_len < 32 { + bits >> bit_len == 0 + } else { + true + } + }); + Key(BitSequence { bits, bit_len }) + } + + /// The bits in this Key. + /// + /// # Invariant + /// + /// Only the `self.bit_len()` lowest-weight bits may be non-0. pub fn bits(&self) -> u32 { - self.bits + self.0.bits } /// The number of bits of `bits` to use. pub fn bit_len(&self) -> BitLen { - self.bit_len + self.0.bit_len } -} - -} // mod key -use self::key::Key; + pub fn as_bit_sequence(&self) -> &BitSequence { + &self.0 + } +} /// A node in the Huffman tree. struct Node { @@ -136,17 +220,34 @@ impl PartialEq for Node { impl Eq for Node {} /// Keys associated to a sequence of values. -#[derive(Debug)] -pub struct Keys -where - T: Ord + Clone, -{ +#[derive(Clone, Debug)] +pub struct Keys { + /// The longest bit length that actually appears in `keys`. + highest_bit_len: BitLen, + /// The sequence of keys. /// /// Order is meaningful. 
keys: Vec<(T, Key)>, } +impl Keys { + pub fn len(&self) -> usize { + self.keys.len() + } + pub fn highest_bit_len(&self) -> BitLen { + self.highest_bit_len + } +} + +impl IntoIterator for Keys { + type Item = (T, Key); + type IntoIter = std::vec::IntoIter<(T, Key)>; + fn into_iter(self) -> Self::IntoIter { + self.keys.into_iter() + } +} + impl Keys where T: Ord + Clone, @@ -155,12 +256,12 @@ where /// /// Optionally, `max_bit_len` may specify a largest acceptable bit length. /// If `Keys` may not be computed without exceeding this bit length, - /// fail with `Err(problemantic_bit_length)`. + /// fail with `Err(problemantic_bit_len)`. /// /// The current implementation only attempts to produce the best compression - /// level. This may cause us to exceed `max_bit_length` even though an + /// level. This may cause us to exceed `max_bit_len` even though an /// alternative table, with a lower compression level, would let us - /// proceed without exceeding `max_bit_length`. + /// proceed without exceeding `max_bit_len`. /// /// # Performance /// @@ -185,9 +286,9 @@ where /// with a number of instances already attached. /// /// The current implementation only attempts to produce the best compression - /// level. This may cause us to exceed `max_bit_length` even though an + /// level. This may cause us to exceed `max_bit_len` even though an /// alternative table, with a lower compression level, would let us - /// proceed without exceeding `max_bit_length`. + /// proceed without exceeding `max_bit_len`. /// /// # Requirement /// @@ -197,9 +298,9 @@ where S: IntoIterator, { let mut bit_lengths = Self::compute_bit_lengths(source, max_bit_len)?; + let mut highest_bit_len = BitLen(0); // Canonicalize order: (BitLen, T) - // As values of `T` are bit_lengths.sort_unstable_by_key(|&(ref value, ref bit_len)| (*bit_len, value.clone())); // The bits associated to the next value. 
@@ -214,12 +315,18 @@ where
             );
             keys.push((symbol.clone(), Key::new(bits, bit_len)));
             bits = (bits + 1) << (next_bit_len - bit_len);
+            if bit_len > highest_bit_len {
+                highest_bit_len = bit_len;
+            }
         }
         // Handle the last element.
         let (ref symbol, bit_len) = bit_lengths[bit_lengths.len() - 1];
         keys.push((symbol.clone(), Key::new(bits, bit_len)));
-        return Ok(Self { keys });
+        return Ok(Self {
+            highest_bit_len,
+            keys,
+        });
     }
 
     /// Convert a sequence of values labelled by their number of instances
diff --git a/crates/binjs_io/src/context/huffman/read.rs b/crates/binjs_io/src/context/huffman/read.rs
new file mode 100644
index 000000000..51faac772
--- /dev/null
+++ b/crates/binjs_io/src/context/huffman/read.rs
@@ -0,0 +1,308 @@
+//! Huffman tables for reading.
+//!
+//! These tables are designed to aid decoding from sequences of bits
+//! into values.
+
+use context::huffman::*;
+
+use std::convert::{TryFrom, TryInto};
+
+/// A Huffman table.
+///
+/// We have several implementations of HuffmanTable designed for
+/// distinct space/speed tradeoffs.
+pub trait HuffmanTable<T> {
+    /// Return the number of elements in the table.
+    fn len(&self) -> usize;
+
+    /// Return the bit length of the longest key in the table.
+    fn highest_bit_len(&self) -> BitLen;
+
+    /// Lookup a value from a sequence of bits.
+    ///
+    /// The sequence of bits MUST be at least as long as `highest_bit_len`.
+    /// Use the `Key` result to determine how many bits need to actually be
+    /// consumed from the bit stream.
+    fn lookup(&self, key: &BitSequence) -> Option<&(T, Key)>;
+}
+
+/// A type that has a maximal value.
+pub trait ValueIndex: TryFrom + TryInto + Clone { + fn max_value() -> Self; +} +impl ValueIndex for u8 { + fn max_value() -> u8 { + std::u8::MAX + } +} +impl ValueIndex for u32 { + fn max_value() -> u32 { + std::u32::MAX + } +} +impl ValueIndex for usize { + fn max_value() -> usize { + std::usize::MAX + } +} + +/// An implementation of Huffman Tables as a vector designed to allow +/// constant-time lookups at the expense of high space complexity. +/// +/// Type parameter `V` is the internal type of indices. Instantiating +/// with `V = u8` will provide the maximal speed and space-efficiency +/// but will only work if the table contains at most 2^8 values. +/// Alternatively, you may instantiate with `u32` or `usize` for +/// larger tables. +/// +/// # Time complexity +/// +/// Lookups take constant time, which essentially consists in two +/// simple vector lookups. +/// +/// # Space complexity +/// +/// After initialization, a `SingleLookupHuffmanTable` +/// requires O(2 ^ max bit length in the table) space: +/// +/// - A vector `values` containing one entry per symbol. +/// - A vector `saturated` containing exactly 2 ^ (max bit length in the +/// table) entries, which we use to map any combination of `maxBitLength` +/// bits onto the only `HuffmanEntry` that may be reached by a prefix +/// of these `maxBitLength` bits. See below for more details. +/// +/// # Algorithm +/// +/// Consider the following Huffman table +/// +/// Symbol | Binary Code | Int value of Code | Bit Length +/// ------ | ------------ | ----------------- | ---------- +/// A | 11000 | 24 | 5 +/// B | 11001 | 25 | 5 +/// C | 1101 | 13 | 4 +/// D | 100 | 4 | 3 +/// E | 101 | 5 | 3 +/// F | 111 | 7 | 3 +/// G | 00 | 0 | 2 +/// H | 01 | 1 | 2 +/// +/// By definition of a Huffman Table, the Binary Codes represent +/// paths in a Huffman Tree. Consequently, padding these codes +/// to the end would not change the result. 
+/// +/// Symbol | Binary Code | Int value of Code | Bit Length +/// ------ | ------------ | ----------------- | ---------- +/// A | 11000 | 24 | 5 +/// B | 11001 | 25 | 5 +/// C | 1101? | [26...27] | 4 +/// D | 100?? | [16...19] | 3 +/// E | 101?? | [20..23] | 3 +/// F | 111?? | [28..31] | 3 +/// G | 00??? | [0...7] | 2 +/// H | 01??? | [8...15] | 2 +/// +/// Row "Int value of Code" now contains all possible values +/// that may be expressed in 5 bits. By using these values +/// as array indices, we may therefore represent the +/// Huffman table as an array: +/// +/// Index | Symbol | Bit Length +/// --------- | ---------- | ------------- +/// [0...7] | G | 2 +/// [8...15] | H | 2 +/// [16...19] | D | 3 +/// [20...23] | E | 3 +/// 24 | A | 5 +/// 25 | B | 5 +/// [26...27] | C | 4 +/// [28...31] | F | 3 +/// +/// By using the next 5 bits in the bit buffer, we may, in +/// a single lookup, determine the symbol and the bit length. +/// +/// In the current implementation, to save some space, we have +/// two distinct arrays, one (`values`) with a single instance of each +/// symbols bit length, and one (`saturated`) with indices into that +/// array. +#[derive(Debug)] +pub struct SingleLookupHuffmanTable { + highest_bit_len: BitLen, + saturated: Vec, + values: Vec<(T, Key)>, +} +impl SingleLookupHuffmanTable +where + V: ValueIndex, +{ + pub fn from_keys(keys: Keys) -> Self { + assert!( + keys.len() + <= V::max_value() + .try_into() + .unwrap_or_else(|_| panic!("Too many keys for ValueIndex")) + ); + let highest_bit_len = keys.highest_bit_len(); + + let mut values = Vec::with_capacity(keys.len()); + + // Fill `saturated` with a default value of `V::max_value()`. + // This is the value most likely to trigger errors in case + // we have a bug in the implementation of `SingleLookupHuffmanTable` + // or if the data provided is inconsistent. 
+ let mut saturated = Vec::with_capacity(1usize << highest_bit_len); + saturated.resize(1usize << highest_bit_len, V::max_value()); + + for (value_index, (value, key)) in keys.into_iter().enumerate() { + let value_index: V = value_index + .try_into() + .unwrap_or_else(|_| panic!("Too many keys for ValueIndex")); + + // When we perform lookup, we will extract `highest_bit_len` bits from the key + // into a value `0bB...B`. We have a match for `value` if and only if + // `0bB...B` may be decomposed into `0bC...CX...X` such that + // - `0bC...C` is `bit_len` bits long; + // - `0bC...C == bits`. + // + // To perform a fast lookup, we precompute all possible values of `0bB...B` + // for which this condition is true. That's all the values of segment + // `[0bC...C0...0, 0bC...C1...1]`. + let padding = highest_bit_len - key.bit_len(); + assert!(padding.as_u8() < 32); + + // `seg_begin` holds `0bC...C0...0` above + let seg_begin = (key.bits() << padding) as usize; + + // `seg_len` holds `0bC...C1...1` - `0bC...C0...0` + let seg_len: usize = if padding.as_u8() == 0 { + 0 + } else { + let shift: u8 = + u8::checked_sub(8 * std::mem::size_of::() as u8, padding.into()) + .unwrap(); + std::usize::MAX >> shift + } + 1; + for entry in &mut saturated[seg_begin..seg_begin + seg_len] { + *entry = value_index.clone(); + } + + values.push((value, key)); + } + + Self { + highest_bit_len, + saturated, + values, + } + } +} + +impl HuffmanTable for SingleLookupHuffmanTable +where + V: ValueIndex, +{ + fn len(&self) -> usize { + self.values.len() + } + + fn highest_bit_len(&self) -> BitLen { + self.highest_bit_len + } + + fn lookup(&self, key: &BitSequence) -> Option<&(T, Key)> { + assert!(key.bit_len() >= self.highest_bit_len()); + let (prefix, _) = key.split(self.highest_bit_len()); + let value_index = self.saturated[prefix as usize].clone(); + let value_index: usize = value_index + .try_into() + .unwrap_or_else(|_| panic!("Value index does not fit into a usize")); + 
self.values.get(value_index) + } +} + +#[test] +fn test_single_lookup_huffman_table() { + // Check against a hardcoded constant, to ensure consistency + // with fbssdc implementation. + + fn run_test() + where + V: ValueIndex, + { + let sample = "appl"; + let coded = Keys::from_sequence(sample.chars(), std::u8::MAX).unwrap(); + let table: SingleLookupHuffmanTable = SingleLookupHuffmanTable::from_keys(coded); + + assert_eq!(table.len(), 3); + + // Test with all possible 2 bit sequences. + let candidate = BitSequence::new(0b10, BitLen(2)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result, &('a', Key::new(0b10, BitLen(2)))); + + let candidate = BitSequence::new(0b11, BitLen(2)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result, &('l', Key::new(0b11, BitLen(2)))); + + // With a bit length of 2, there are two keys that + // should return 'p' + for prefix in &[0b00, 0b01] { + let candidate = BitSequence::new(*prefix, BitLen(2)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result, &('p', Key::new(0, BitLen(1)))); + } + + // Test values with all possible 3 bit sequences. + for prefix in &[0b100, 0b101] { + let candidate = BitSequence::new(*prefix, BitLen(3)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result, &('a', Key::new(0b10, BitLen(2)))); + } + + for prefix in &[0b110, 0b111] { + let candidate = BitSequence::new(*prefix, BitLen(3)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result, &('l', Key::new(0b11, BitLen(2)))); + } + + for prefix in &[0b000, 0b001, 0b010, 0b011] { + let candidate = BitSequence::new(*prefix, BitLen(3)); + let result = table.lookup(&candidate).unwrap(); + assert_eq!(result, &('p', Key::new(0, BitLen(1)))); + } + } + + run_test::(); + run_test::(); +} + +#[test] +fn test_single_lookup_huffman_table_2() { + // Check internal consistency. 
+ + fn run_test() + where + V: ValueIndex, + { + let sample = "Lorem ipsum dolor sit amet consectetur adipiscing elit convallis nostra, integer diam odio mus eros ut sodales sociis cursus, montes imperdiet morbi rhoncus felis venenatis curabitur magna. Volutpat tincidunt sociosqu pharetra id feugiat enim eget, integer quisque magna in senectus mollis, himenaeos malesuada convallis faucibus ornare egestas. Netus platea himenaeos suscipit nostra montes mattis, lobortis ut arcu facilisi hac ornare, integer ante sociosqu placerat morbi. + +Viverra arcu dapibus nam magna a imperdiet inceptos cubilia libero lobortis praesent habitasse, tortor id leo consequat sollicitudin elementum fames fringilla himenaeos donec. Phasellus posuere congue ultricies scelerisque senectus vivamus facilisi, vestibulum consequat aptent lectus ad sociis porta, purus libero eros leo at nec. Netus viverra urna nisl sapien conubia porta sed luctus penatibus cras, pulvinar iaculis sagittis fusce fringilla et rutrum sollicitudin ligula, dui vestibulum interdum pretium montes diam nibh inceptos ante. +"; + let coded = Keys::from_sequence(sample.chars(), std::u8::MAX).unwrap(); + let table: SingleLookupHuffmanTable = + SingleLookupHuffmanTable::from_keys(coded.clone()); + for (value, key) in coded { + // Test that candidate keys obtained by extending `key` with additional bits + // return the expected `(value, key)`. 
+ for bit_len in table.highest_bit_len().as_u8() + ..=std::cmp::min(table.highest_bit_len().as_u8() + 5, 32) + { + let candidate = key.as_bit_sequence().pad_lowest_to(BitLen(bit_len)); + let lookup = table.lookup(&candidate).expect("Lookup value not found"); + assert_eq!(lookup.0, value); + assert_eq!(lookup.1, key); + } + } + } + run_test::(); + run_test::(); +} From 1cf92cfab80e8fced330d2356a95ed6fc8ec30ae Mon Sep 17 00:00:00 2001 From: David Teller Date: Tue, 17 Sep 2019 15:54:57 +0200 Subject: [PATCH 3/7] Context 0.1: MultiLookupHuffmanTable --- crates/binjs_io/src/context/huffman/mod.rs | 253 +++++++++++--- crates/binjs_io/src/context/huffman/read.rs | 355 ++++++++++++++++++-- crates/binjs_io/src/context/mod.rs | 2 +- 3 files changed, 533 insertions(+), 77 deletions(-) diff --git a/crates/binjs_io/src/context/huffman/mod.rs b/crates/binjs_io/src/context/huffman/mod.rs index cec407397..d255b1efd 100644 --- a/crates/binjs_io/src/context/huffman/mod.rs +++ b/crates/binjs_io/src/context/huffman/mod.rs @@ -10,6 +10,7 @@ pub mod read; /// A newtype for `u8` used to count the length of a key in bits. #[derive( + Constructor, Debug, Default, Display, @@ -82,30 +83,63 @@ impl BitSequence { pub fn new(bits: u32, bit_len: BitLen) -> Self { Self { bits, bit_len } } + pub fn bits(&self) -> u32 { self.bits } + /// The number of bits of `bits` to use. pub fn bit_len(&self) -> BitLen { self.bit_len } + + /// Split the bits into a prefix of `bit_len` bits and a suffix containing the + /// remaining bits. + /// + /// If `bit_len` is larger than the number of bits, the prefix is padded with + /// lower-weight bits into `bit_len` bits. 
+    pub fn split_bits(&self, bit_len: BitLen) -> (u32, u32) {
+        debug_assert!(bit_len.as_u8() <= 32);
+        if self.bit_len <= bit_len {
+            let padding = bit_len - self.bit_len;
+            (self.bits << padding, 0)
+        } else {
+            let shift = self.bit_len - bit_len;
+            match shift.into() {
+                32u8 => (0, self.bits), // Special case: cannot >> 32
+                shift => (
+                    self.bits >> shift,
+                    self.bits & (std::u32::MAX >> 32 - shift),
+                ),
+            }
+        }
+    }
+
     /// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
     /// bits.
     ///
     /// # Failure
     ///
     /// This function panics if `bit_len > self.bit_len`.
-    pub fn split(&self, bit_len: BitLen) -> (u32, u32) {
-        let shift = self.bit_len - bit_len;
-        match shift.into() {
-            0u8 => (self.bits, 0), // Special case: cannot >> 32
-            32u8 => (0, self.bits), // Special case: cannot >> 32
-            shift => (
-                self.bits >> shift,
-                self.bits & (std::u32::MAX >> 32 - shift),
+    pub fn split(&self, bit_len: BitLen) -> (BitSequence, BitSequence) {
+        let (prefix, suffix) = self.split_bits(bit_len);
+        (
+            BitSequence::new(prefix, bit_len),
+            BitSequence::new(
+                suffix,
+                if self.bit_len >= bit_len {
+                    self.bit_len - bit_len
+                } else {
+                    BitLen::new(0)
+                },
             ),
-        }
+        )
     }
+
+    /// Add lowest-weight bits to this bit sequence until it reaches
+    /// a sufficient bit length.
+    ///
+    /// Does nothing if the bit sequence already has a sufficient bit length.
     pub fn pad_lowest_to(&self, total_bit_len: BitLen) -> Cow<BitSequence> {
         assert!(total_bit_len.0 <= 32u8);
         if total_bit_len <= self.bit_len {
@@ -117,21 +151,93 @@ impl BitSequence {
             return Cow::Borrowed(self);
         }
         let shift = total_bit_len - self.bit_len;
         if shift.0 == 32u8 {
             return Cow::Owned(BitSequence::new(0, BitLen(32)));
         }
         Cow::Owned(BitSequence::new(self.bits << shift, total_bit_len))
     }
+
+    /// Prepend a sequence of bits to a sequence.
+    pub fn with_prefix(&self, prefix: &BitSequence) -> Self {
+        assert!((prefix.bit_len() + self.bit_len()).as_u8() <= 32);
+        let bits = self.bits | (prefix.bits() << self.bit_len);
+        let bit_len = self.bit_len + prefix.bit_len;
+        BitSequence::new(bits, bit_len)
+    }
+
+    /// Return a range representing all possible suffixes of this `BitSequence`
+    /// containing exactly `bit_len` bits.
+    ///
+    /// If this `BitSequence` is already at least `bit_len` bits long, we
+    /// truncate the `BitSequence` to `bit_len` bits by removing the
+    /// lower-weight bits and there is only one such suffix.
+    ///
+    /// ```
+    /// use binjs_io::context::huffman::{ BitLen, BitSequence };
+    ///
+    /// let zero = BitSequence::new(0, BitLen::new(0));
+    ///
+    /// let range = zero.suffixes(BitLen::new(0));
+    /// assert_eq!(range, 0..1);
+    ///
+    /// let range = zero.suffixes(BitLen::new(2));
+    /// assert_eq!(range, 0..4);
+    ///
+    /// let range = zero.suffixes(BitLen::new(3));
+    /// assert_eq!(range, 0..8);
+    ///
+    /// let range = zero.suffixes(BitLen::new(4));
+    /// assert_eq!(range, 0..16);
+    ///
+    /// let sequence = BitSequence::new(0b00000100, BitLen::new(3));
+    ///
+    /// let range = sequence.suffixes(BitLen::new(0));
+    /// assert_eq!(range, 0..1);
+    ///
+    /// let range = sequence.suffixes(BitLen::new(2));
+    /// assert_eq!(range, 2..3);
+    ///
+    /// let range = sequence.suffixes(BitLen::new(3));
+    /// assert_eq!(range, 4..5);
+    ///
+    /// let range = sequence.suffixes(BitLen::new(4));
+    /// assert_eq!(range, 8..10); // 0b00001000 to 0b00001001 included
+    /// ```
+    pub fn suffixes(&self, bit_len: BitLen) -> std::ops::Range<u32> {
+        debug_assert!(bit_len.as_u8() as usize <= 8 * std::mem::size_of_val(&self.bits()));
+
debug_assert!( + std::mem::size_of_val(&self.bits()) == std::mem::size_of::(), + "The arithmetics relies upon the fact that we're only using `u32` for Huffman keys" + ); + let (first, last) = if bit_len <= self.bit_len() { + // We have too many bits, we need to truncate the bits, + // then return a single element. + let shearing: u8 = (self.bit_len() - bit_len).as_u8(); + let first = if shearing == 32 { + 0 + } else { + self.bits() >> shearing + }; + (first, first) + } else { + // We need to pad with lower-weight 0s. + let padding: u8 = (bit_len - self.bit_len()).as_u8(); + let first = self.bits() << padding; + let len = std::u32::MAX >> (8 * std::mem::size_of::() as u8 - padding); + (first, first + len) + }; + first..(last + 1) + } } #[test] fn test_bit_sequence_split() { let bits = 0b11111111_11111111_00000000_00000000; let key = BitSequence::new(bits, BitLen(32)); - assert_eq!(key.split(BitLen(0)), (0, bits)); - assert_eq!(key.split(BitLen(32)), (bits, 0)); - assert_eq!(key.split(BitLen(16)), (0b11111111_11111111, 0)); + assert_eq!(key.split_bits(BitLen(0)), (0, bits)); + assert_eq!(key.split_bits(BitLen(32)), (bits, 0)); + assert_eq!(key.split_bits(BitLen(16)), (0b11111111_11111111, 0)); let bits = 0b00000000_00000000_00000000_11111111; let key = BitSequence::new(bits, BitLen(16)); - assert_eq!(key.split(BitLen(0)), (0, bits)); - assert_eq!(key.split(BitLen(16)), (bits, 0)); - assert_eq!(key.split(BitLen(8)), (0, 0b11111111)); + assert_eq!(key.split_bits(BitLen(0)), (0, bits)); + assert_eq!(key.split_bits(BitLen(16)), (bits, 0)); + assert_eq!(key.split_bits(BitLen(8)), (0, 0b11111111)); } /// A Huffman key @@ -159,6 +265,10 @@ impl Key { Key(BitSequence { bits, bit_len }) } + pub fn from_bit_sequence(sequence: BitSequence) -> Self { + Self::new(sequence.bits, sequence.bit_len) + } + /// The bits in this Key. 
/// /// # Invariant @@ -176,6 +286,11 @@ impl Key { pub fn as_bit_sequence(&self) -> &BitSequence { &self.0 } + + pub fn with_prefix(&self, prefix: &BitSequence) -> Self { + let sequence = self.0.with_prefix(prefix); + Key::from_bit_sequence(sequence) + } } /// A node in the Huffman tree. @@ -219,43 +334,46 @@ impl PartialEq for Node { } impl Eq for Node {} -/// Keys associated to a sequence of values. +/// Codebook associated to a sequence of values. #[derive(Clone, Debug)] -pub struct Keys { - /// The longest bit length that actually appears in `keys`. +pub struct Codebook { + /// The longest bit length that actually appears in `mappings`. highest_bit_len: BitLen, /// The sequence of keys. /// /// Order is meaningful. - keys: Vec<(T, Key)>, + mappings: Vec<(T, Key)>, } -impl Keys { +impl Codebook { + /// The number of elements in this Codebook. pub fn len(&self) -> usize { - self.keys.len() + self.mappings.len() } + + /// The longest bit length that acctually appears in this Codebook. pub fn highest_bit_len(&self) -> BitLen { self.highest_bit_len } } -impl IntoIterator for Keys { +impl IntoIterator for Codebook { type Item = (T, Key); type IntoIter = std::vec::IntoIter<(T, Key)>; fn into_iter(self) -> Self::IntoIter { - self.keys.into_iter() + self.mappings.into_iter() } } -impl Keys +impl Codebook where T: Ord + Clone, { - /// Compute a `Keys` from a sequence of values. + /// Compute a `Codebook` from a sequence of values. /// /// Optionally, `max_bit_len` may specify a largest acceptable bit length. - /// If `Keys` may not be computed without exceeding this bit length, + /// If the `Codebook` may not be computed without exceeding this bit length, /// fail with `Err(problemantic_bit_len)`. /// /// The current implementation only attempts to produce the best compression @@ -278,11 +396,11 @@ where let counter = map.entry(item).or_insert(0.into()); *counter += 1.into(); } - // Then compute the `Keys`. + // Then compute the `Codebook`. 
Self::from_instances(map, max_bit_len) } - /// Compute a `Keys` from a sequence of values + /// Compute a `Codebook` from a sequence of values /// with a number of instances already attached. /// /// The current implementation only attempts to produce the best compression @@ -305,7 +423,7 @@ where // The bits associated to the next value. let mut bits = 0; - let mut keys = Vec::with_capacity(bit_lengths.len()); + let mut mappings = Vec::with_capacity(bit_lengths.len()); for i in 0..bit_lengths.len() - 1 { let (bit_len, symbol, next_bit_len) = ( @@ -313,7 +431,7 @@ where bit_lengths[i].0.clone(), bit_lengths[i + 1].1, ); - keys.push((symbol.clone(), Key::new(bits, bit_len))); + mappings.push((symbol.clone(), Key::new(bits, bit_len))); bits = (bits + 1) << (next_bit_len - bit_len); if bit_len > highest_bit_len { highest_bit_len = bit_len; @@ -321,11 +439,11 @@ where } // Handle the last element. let (ref symbol, bit_len) = bit_lengths[bit_lengths.len() - 1]; - keys.push((symbol.clone(), Key::new(bits, bit_len))); + mappings.push((symbol.clone(), Key::new(bits, bit_len))); return Ok(Self { highest_bit_len, - keys, + mappings, }); } @@ -412,26 +530,73 @@ where #[test] fn test_coded_from_sequence() { let sample = "appl"; - let coded = Keys::from_sequence(sample.chars(), std::u8::MAX).unwrap(); + let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap(); // Symbol 'p' appears twice, we should see 3 codes. - assert_eq!(coded.keys.len(), 3); + assert_eq!(coded.mappings.len(), 3); // Check order of symbols. - assert_eq!(coded.keys[0].0, 'p'); - assert_eq!(coded.keys[1].0, 'a'); - assert_eq!(coded.keys[2].0, 'l'); + assert_eq!(coded.mappings[0].0, 'p'); + assert_eq!(coded.mappings[1].0, 'a'); + assert_eq!(coded.mappings[2].0, 'l'); // Check bit length of symbols. 
- assert_eq!(coded.keys[0].1.bit_len(), 1.into()); - assert_eq!(coded.keys[1].1.bit_len(), 2.into()); - assert_eq!(coded.keys[2].1.bit_len(), 2.into()); + assert_eq!(coded.mappings[0].1.bit_len(), 1.into()); + assert_eq!(coded.mappings[1].1.bit_len(), 2.into()); + assert_eq!(coded.mappings[2].1.bit_len(), 2.into()); // Check code of symbols. - assert_eq!(coded.keys[0].1.bits(), 0b00); - assert_eq!(coded.keys[1].1.bits(), 0b10); - assert_eq!(coded.keys[2].1.bits(), 0b11); + assert_eq!(coded.mappings[0].1.bits(), 0b00); + assert_eq!(coded.mappings[1].1.bits(), 0b10); + assert_eq!(coded.mappings[2].1.bits(), 0b11); // Let's try again with a limit to 1 bit paths. - assert_eq!(Keys::from_sequence(sample.chars(), 1).unwrap_err(), 2); + assert_eq!(Codebook::from_sequence(sample.chars(), 1).unwrap_err(), 2); +} + +impl Codebook { + /// Create an empty Codebook + pub fn new() -> Self { + Self { + highest_bit_len: BitLen::new(0), + mappings: vec![], + } + } + + /// Create an empty Codebook + pub fn with_capacity(len: usize) -> Self { + Self { + highest_bit_len: BitLen::new(0), + mappings: Vec::with_capacity(len), + } + } + + /// Add a mapping to a Codebook. + /// + /// This method does **not** check that the resulting Codebook is correct. + pub unsafe fn add_mapping(&mut self, value: T, key: Key) { + if key.bit_len() > self.highest_bit_len { + self.highest_bit_len = key.bit_len(); + } + self.mappings.push((value, key)); + } + + /// Return the mappings of a Codebook. 
+ pub fn mappings(self) -> Vec<(T, Key)> { + self.mappings + } + + pub fn map(self, mut f: F) -> Codebook + where + F: FnMut(T) -> U, + { + Codebook { + highest_bit_len: self.highest_bit_len, + mappings: self + .mappings + .into_iter() + .map(|(value, key)| (f(value), key)) + .collect(), + } + } } diff --git a/crates/binjs_io/src/context/huffman/read.rs b/crates/binjs_io/src/context/huffman/read.rs index 51faac772..e242be097 100644 --- a/crates/binjs_io/src/context/huffman/read.rs +++ b/crates/binjs_io/src/context/huffman/read.rs @@ -11,7 +11,10 @@ use std::convert::{TryFrom, TryInto}; /// /// We have several implementations of HuffmanTable designed for /// distinct space/speed tradeoffs. -pub trait HuffmanTable { +pub trait HuffmanTable +where + T: Clone, +{ /// Return the number of elements in the table. fn len(&self) -> usize; @@ -23,7 +26,7 @@ pub trait HuffmanTable { /// The sequence of bits MUST be at least as long as `highest_bit_len`. /// Use the `Key` result to determine how many bits need to actually be /// consumed from the bit stream. - fn lookup(&self, key: &BitSequence) -> Option<&(T, Key)>; + fn lookup(&self, key: &BitSequence) -> Option>; } /// A type that has a maximal value. @@ -134,16 +137,19 @@ impl SingleLookupHuffmanTable where V: ValueIndex, { - pub fn from_keys(keys: Keys) -> Self { + /// Construct a Huffman table from a Codebook. + /// + /// Time complexity: `O(2^codebook.max_bit_len())`. + pub fn from_codebook(codebook: Codebook) -> Self { assert!( - keys.len() + codebook.len() <= V::max_value() .try_into() .unwrap_or_else(|_| panic!("Too many keys for ValueIndex")) ); - let highest_bit_len = keys.highest_bit_len(); + let highest_bit_len = codebook.highest_bit_len(); - let mut values = Vec::with_capacity(keys.len()); + let mut values = Vec::with_capacity(codebook.len()); // Fill `saturated` with a default value of `V::max_value()`. 
// This is the value most likely to trigger errors in case @@ -152,7 +158,7 @@ where let mut saturated = Vec::with_capacity(1usize << highest_bit_len); saturated.resize(1usize << highest_bit_len, V::max_value()); - for (value_index, (value, key)) in keys.into_iter().enumerate() { + for (value_index, (value, key)) in codebook.into_iter().enumerate() { let value_index: V = value_index .try_into() .unwrap_or_else(|_| panic!("Too many keys for ValueIndex")); @@ -199,98 +205,348 @@ where impl HuffmanTable for SingleLookupHuffmanTable where V: ValueIndex, + T: Clone, { + /// Constant time length access. fn len(&self) -> usize { self.values.len() } + /// Constant time highest bit access. fn highest_bit_len(&self) -> BitLen { self.highest_bit_len } - fn lookup(&self, key: &BitSequence) -> Option<&(T, Key)> { + /// Constant-time lookup. + fn lookup(&self, key: &BitSequence) -> Option> { assert!(key.bit_len() >= self.highest_bit_len()); - let (prefix, _) = key.split(self.highest_bit_len()); + let (prefix, _) = key.split_bits(self.highest_bit_len()); let value_index = self.saturated[prefix as usize].clone(); let value_index: usize = value_index .try_into() .unwrap_or_else(|_| panic!("Value index does not fit into a usize")); - self.values.get(value_index) + let entry = self.values.get(value_index)?; + Some(Cow::Borrowed(entry)) + } +} + +/// An alias for `SingleLookupHuffmanTable::from_codebook`, meant mainly to be used in +/// `MultiLookupHuffmanTable::from_codebook`. +impl From> for SingleLookupHuffmanTable +where + V: ValueIndex, +{ + fn from(codebook: Codebook) -> Self { + Self::from_codebook(codebook) + } +} + +/// A table designed to support fast lookup in large sets of data. +/// In most cases, lookup will be slower than a `SingleLookupHuffmanTable` +/// but, particularly in heavily unbalanced trees, the table will +/// take ~2^prefix_len fewer internal entries than a `SingleLookupHuffmanTable`. 
+/// +/// Typically, use this table whenever codes range between 10 and 20 bits. +/// +/// # Time complexity +/// +/// Assuming that lookups in `Subtable` take constant time, a lookup in `MultiLookupHuffmanTable` +/// will also take constant time: +/// +/// - a constant-time lookup to determine into which Subtable to perform the lookup; +/// - a constant-time lookup into Subtable; +/// - a final constant-time lookup to extract the result. // FIXME: We could get rid of this final lookup. +/// +/// +/// # Space complexity +/// +/// TBD. Highly dependent on the shape of the Huffman Tree. +/// +/// +/// # Algorithm +/// +/// Consider the following Huffman table +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ------------ | ---------- +/// A | 11000 | 5 +/// B | 11001 | 5 +/// C | 1101 | 4 +/// D | 100 | 3 +/// E | 101 | 3 +/// F | 111 | 3 +/// G | 00 | 2 +/// H | 01 | 2 +/// +/// With a prefix length of 3, we will precompute all possible 3-bit prefixes +/// and split the table across such prefixes. +/// +/// Prefix | Int Value of Prefix | Symbols | Max bit length +/// ------ | ------------------- | --------- | -------------- +/// 000 | 0 | G | 0 +/// 001 | 1 | G | 0 +/// 010 | 2 | H | 0 +/// 011 | 3 | H | 0 +/// 100 | 4 | D | 0 +/// 101 | 5 | E | 0 +/// 110 | 6 | A, B, C | 2 +/// 111 | 7 | F | 0 +/// +/// For each prefix, we build the table containing the Symbols, +/// stripping prefix from the Binary Code. 
+/// +/// - Prefix 000 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// G | (none) | 0 +/// +/// - Prefix 001 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// G | (none) | 0 +/// +/// - Prefix 010 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// H | (none) | 0 +/// +/// - Prefix 011 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// H | (none) | 0 +/// +/// - Prefix 100 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// D | (none) | 0 +/// +/// - Prefix 101 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// E | (none) | 0 +/// +/// - Prefix 110 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// A | 00 | 2 +/// B | 01 | 2 +/// C | 1 | 1 +/// +/// - Prefix 111 +/// +/// Symbol | Binary Code | Bit Length +/// ------ | ----------- | ---------- +/// F | (none) | 0 +/// +/// With this transformation, we have represented one table +/// with an initial max bit length of 5 as: +/// +/// - 1 table with a max bit length of 2; +/// - 7 tables with a max bit length of 0. +/// +/// Consequently, instead of storing 2^5 = 32 internal references, +/// as we would have done with a SingleLookupHuffmanTable, we only +/// need to store (assuming that `SubTable` is a `SingleLookupHuffmanTable`): +/// +/// - 7 subtables with 1 reference each; +/// - 1 subtable with 2^2 = 4 references. +pub struct MultiLookupHuffmanTable { + /// The highest bit length. + highest_bit_len: BitLen, + + /// Invariant: `prefix_len < highest_bit_len`. + prefix_len: BitLen, + + /// A mapping from 0..2^prefix_len such that index `i` + /// maps to a subtable that holds all values associated + /// with a key that starts with `Key::new(i, prefix_len)`. 
+ /// + /// Note that, to allow the use of smaller tables, keys + /// inside the subtables have been stripped + /// from the prefix `Key::new(i, prefix_len)`. + by_prefix: Vec, + + /// The number of entries in this table. + len: usize, + + values: Vec<(T, Key)>, +} + +impl MultiLookupHuffmanTable +where + SubTable: HuffmanTable + From>, + T: Clone, +{ + pub fn from_codebook(prefix_len: BitLen, codebook: Codebook) -> Self { + let len = codebook.len(); + let mut values = Vec::with_capacity(codebook.len()); + let highest_bit_len = codebook.highest_bit_len(); + + // At this stage, we cannot immediately create subtables, as + // we first need to determine the `highest_bit_len`. So we + // first need to split our Codebook into a forest of Codebooks + // sharing the same prefix. + let mut buckets = Vec::with_capacity(1usize << prefix_len); + buckets.resize_with(1usize << prefix_len, || Codebook::new()); + + // Dispatch each (value, key) to its buckets. + for (value, key) in codebook.into_iter() { + let (prefix, suffix) = key.as_bit_sequence().split(prefix_len); + for index in prefix.suffixes(prefix_len) { + let ref mut bucket = buckets[index as usize]; + // Store the new mapping: + // - in the smaller Codebook, we only need the remaining bits (`suffix`); + // - in the smaller Codebook, we don't use the `value` itself but rather + // a reference to value stored in `values`. + unsafe { + bucket.add_mapping(values.len(), Key::from_bit_sequence(suffix.clone())); + } + } + values.push((value, key)); + } + + // Now convert buckets into Huffman tables + let mut by_prefix = Vec::with_capacity(1usize << prefix_len); + for bucket in buckets { + by_prefix.push(SubTable::from(bucket)); + } + + Self { + highest_bit_len, + prefix_len, + by_prefix, + len, + values, + } + } +} + +impl HuffmanTable for MultiLookupHuffmanTable +where + SubTable: HuffmanTable, + T: Clone, +{ + /// Constant-time length. + fn len(&self) -> usize { + self.len + } + + /// Constant time highest bit length. 
+ fn highest_bit_len(&self) -> BitLen { + self.highest_bit_len + } + + /// Constant-time lookup. + fn lookup(&self, key: &BitSequence) -> Option> + where + T: Clone, + { + assert!(key.bit_len() >= self.highest_bit_len()); + + // Find in which `SingleLookupHuffmanTable` to look for the entry. + let (prefix, suffix) = key.split_bits(self.prefix_len); + let ref table = self.by_prefix.get(prefix as usize)?; + + // Now lookup in second table. + let suffix = BitSequence::new(suffix, key.bit_len() - self.prefix_len); + let suffix = suffix.pad_lowest_to(table.highest_bit_len()); + let lookup = table.lookup(&suffix)?; + + // Finally, build the result. + Some(Cow::Borrowed(&self.values[lookup.0])) } } #[test] -fn test_single_lookup_huffman_table() { +fn test_huffman_lookup() { // Check against a hardcoded constant, to ensure consistency // with fbssdc implementation. - fn run_test() + fn run_test(from_codebook: F) where - V: ValueIndex, + F: Fn(Codebook) -> H, + H: HuffmanTable, { let sample = "appl"; - let coded = Keys::from_sequence(sample.chars(), std::u8::MAX).unwrap(); - let table: SingleLookupHuffmanTable = SingleLookupHuffmanTable::from_keys(coded); + let codebook = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap(); + let table = from_codebook(codebook); assert_eq!(table.len(), 3); // Test with all possible 2 bit sequences. 
let candidate = BitSequence::new(0b10, BitLen(2)); let result = table.lookup(&candidate).unwrap(); - assert_eq!(result, &('a', Key::new(0b10, BitLen(2)))); + assert_eq!(result.as_ref(), &('a', Key::new(0b10, BitLen(2)))); let candidate = BitSequence::new(0b11, BitLen(2)); let result = table.lookup(&candidate).unwrap(); - assert_eq!(result, &('l', Key::new(0b11, BitLen(2)))); + assert_eq!(result.as_ref(), &('l', Key::new(0b11, BitLen(2)))); // With a bit length of 2, there are two keys that // should return 'p' for prefix in &[0b00, 0b01] { let candidate = BitSequence::new(*prefix, BitLen(2)); let result = table.lookup(&candidate).unwrap(); - assert_eq!(result, &('p', Key::new(0, BitLen(1)))); + assert_eq!(result.as_ref(), &('p', Key::new(0, BitLen(1)))); } // Test values with all possible 3 bit sequences. for prefix in &[0b100, 0b101] { let candidate = BitSequence::new(*prefix, BitLen(3)); let result = table.lookup(&candidate).unwrap(); - assert_eq!(result, &('a', Key::new(0b10, BitLen(2)))); + assert_eq!(result.as_ref(), &('a', Key::new(0b10, BitLen(2)))); } for prefix in &[0b110, 0b111] { let candidate = BitSequence::new(*prefix, BitLen(3)); let result = table.lookup(&candidate).unwrap(); - assert_eq!(result, &('l', Key::new(0b11, BitLen(2)))); + assert_eq!(result.as_ref(), &('l', Key::new(0b11, BitLen(2)))); } for prefix in &[0b000, 0b001, 0b010, 0b011] { let candidate = BitSequence::new(*prefix, BitLen(3)); let result = table.lookup(&candidate).unwrap(); - assert_eq!(result, &('p', Key::new(0, BitLen(1)))); + assert_eq!(result.as_ref(), &('p', Key::new(0, BitLen(1)))); } } - run_test::(); - run_test::(); + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + run_test::>, _>(|codebook| { + MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook) + }); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(1), 
codebook), + ); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook), + ); } #[test] -fn test_single_lookup_huffman_table_2() { +fn test_huffman_lookup_2() { // Check internal consistency. - fn run_test() + fn run_test(from_codebook: F) where - V: ValueIndex, + F: Fn(Codebook) -> H, + H: HuffmanTable, { let sample = "Lorem ipsum dolor sit amet consectetur adipiscing elit convallis nostra, integer diam odio mus eros ut sodales sociis cursus, montes imperdiet morbi rhoncus felis venenatis curabitur magna. Volutpat tincidunt sociosqu pharetra id feugiat enim eget, integer quisque magna in senectus mollis, himenaeos malesuada convallis faucibus ornare egestas. Netus platea himenaeos suscipit nostra montes mattis, lobortis ut arcu facilisi hac ornare, integer ante sociosqu placerat morbi. Viverra arcu dapibus nam magna a imperdiet inceptos cubilia libero lobortis praesent habitasse, tortor id leo consequat sollicitudin elementum fames fringilla himenaeos donec. Phasellus posuere congue ultricies scelerisque senectus vivamus facilisi, vestibulum consequat aptent lectus ad sociis porta, purus libero eros leo at nec. Netus viverra urna nisl sapien conubia porta sed luctus penatibus cras, pulvinar iaculis sagittis fusce fringilla et rutrum sollicitudin ligula, dui vestibulum interdum pretium montes diam nibh inceptos ante. "; - let coded = Keys::from_sequence(sample.chars(), std::u8::MAX).unwrap(); - let table: SingleLookupHuffmanTable = - SingleLookupHuffmanTable::from_keys(coded.clone()); - for (value, key) in coded { + let codebook = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap(); + let table = from_codebook(codebook.clone()); + for (value, key) in codebook { // Test that candidate keys obtained by extending `key` with additional bits // return the expected `(value, key)`. 
for bit_len in table.highest_bit_len().as_u8() @@ -303,6 +559,41 @@ Viverra arcu dapibus nam magna a imperdiet inceptos cubilia libero lobortis prae } } } - run_test::(); - run_test::(); + // Test with a single lookup. + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + run_test::, _>(SingleLookupHuffmanTable::from_codebook); + + // Test with two lookups, with a very short prefix length. + run_test::>, _>(|codebook| { + MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook) + }); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook), + ); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(1), codebook), + ); + + // Test with two lookups, still with a very short prefix length. + run_test::>, _>(|codebook| { + MultiLookupHuffmanTable::from_codebook(BitLen(2), codebook) + }); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(2), codebook), + ); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(2), codebook), + ); + + // Test with two lookups, with an unreasonably large prefix length. + run_test::>, _>(|codebook| { + MultiLookupHuffmanTable::from_codebook(BitLen(10), codebook) + }); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(10), codebook), + ); + run_test::>, _>( + |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(10), codebook), + ); } diff --git a/crates/binjs_io/src/context/mod.rs b/crates/binjs_io/src/context/mod.rs index f51f2073e..ced4f3336 100644 --- a/crates/binjs_io/src/context/mod.rs +++ b/crates/binjs_io/src/context/mod.rs @@ -3,7 +3,7 @@ /// Format documentation. mod format; -mod huffman; +pub mod huffman; mod varnum; /// A four-char name embedded in the binary. 
From 6ddd9cf769d41d4a863222fd0148a247ee4902f7 Mon Sep 17 00:00:00 2001 From: David Teller Date: Thu, 26 Sep 2019 11:52:40 +0200 Subject: [PATCH 4/7] Implementing Codebook::parse* --- crates/binjs_io/src/context/huffman/mod.rs | 102 +++++++++++++++----- crates/binjs_io/src/context/huffman/read.rs | 82 +++++++++++++++- crates/binjs_io/src/context/varnum.rs | 11 +++ crates/binjs_io/src/io/statistics.rs | 1 + spec/context.md | 3 +- 5 files changed, 172 insertions(+), 27 deletions(-) diff --git a/crates/binjs_io/src/context/huffman/mod.rs b/crates/binjs_io/src/context/huffman/mod.rs index d255b1efd..daa7e74f3 100644 --- a/crates/binjs_io/src/context/huffman/mod.rs +++ b/crates/binjs_io/src/context/huffman/mod.rs @@ -4,6 +4,7 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BinaryHeap, HashMap}; use std::hash::Hash; +use std::io; /// Reading from bitstreams and decoding their contents using Huffman tables. pub mod read; @@ -36,6 +37,10 @@ impl BitLen { } } +/// The maximal number of bits permitted in a Huffman key +/// in this format. +pub const MAX_CODE_BIT_LEN: BitLen = BitLen(20); + /// Convenience implementation of operator `<<` in /// `bits << bit_len` impl std::ops::Shl for u32 { @@ -385,7 +390,7 @@ where /// /// Values (type `T`) will be cloned regularly, so you should make /// sure that their cloning is reasonably cheap. - pub fn from_sequence(source: S, max_bit_len: u8) -> Result + pub fn from_sequence(source: S, max_bit_len: BitLen) -> Result where S: IntoIterator, T: PartialEq + Hash, @@ -411,25 +416,41 @@ where /// # Requirement /// /// Values of `T` in the source MUST be distinct. 
- pub fn from_instances(source: S, max_bit_len: u8) -> Result + pub fn from_instances(source: S, max_bit_len: BitLen) -> Result where S: IntoIterator, { - let mut bit_lengths = Self::compute_bit_lengths(source, max_bit_len)?; + let bit_lengths = Self::compute_bit_lengths(source, max_bit_len)?; + Self::from_bit_lens(bit_lengths, max_bit_len) + } + + /// Compute a `Codebook` from a sequence of values + /// with a bit length already attached. + /// + /// The current implementation only attempts to produce the best compression + /// level. This may cause us to exceed `max_bit_len` even though an + /// alternative table, with a lower compression level, would let us + /// proceed without exceeding `max_bit_len`. + /// + /// # Requirement + /// + /// Values of `T` in the source MUST be distinct. + pub fn from_bit_lens(mut bit_lens: Vec<(T, BitLen)>, max_bit_len: BitLen) -> Result + { let mut highest_bit_len = BitLen(0); // Canonicalize order: (BitLen, T) - bit_lengths.sort_unstable_by_key(|&(ref value, ref bit_len)| (*bit_len, value.clone())); + bit_lens.sort_unstable_by_key(|&(ref value, ref bit_len)| (*bit_len, value.clone())); // The bits associated to the next value. let mut bits = 0; - let mut mappings = Vec::with_capacity(bit_lengths.len()); + let mut mappings = Vec::with_capacity(bit_lens.len()); - for i in 0..bit_lengths.len() - 1 { + for i in 0..bit_lens.len() - 1 { let (bit_len, symbol, next_bit_len) = ( - bit_lengths[i].1, - bit_lengths[i].0.clone(), - bit_lengths[i + 1].1, + bit_lens[i].1, + bit_lens[i].0.clone(), + bit_lens[i + 1].1, ); mappings.push((symbol.clone(), Key::new(bits, bit_len))); bits = (bits + 1) << (next_bit_len - bit_len); @@ -438,9 +459,16 @@ where } } // Handle the last element. 
- let (ref symbol, bit_len) = bit_lengths[bit_lengths.len() - 1]; + let (ref symbol, bit_len) = bit_lens[bit_lens.len() - 1]; + if bit_len > highest_bit_len { + highest_bit_len = bit_len; + } mappings.push((symbol.clone(), Key::new(bits, bit_len))); + if highest_bit_len > max_bit_len { + return Err(highest_bit_len) + } + return Ok(Self { highest_bit_len, mappings, @@ -452,7 +480,7 @@ where /// in the Huffman tree, aka the bitlength of their Huffman key. /// /// Values that have 0 instances are skipped. - pub fn compute_bit_lengths(source: S, max_bit_len: u8) -> Result, u8> + pub fn compute_bit_lengths(source: S, max_bit_len: BitLen) -> Result, u8> where S: IntoIterator, { @@ -496,7 +524,7 @@ where let mut bit_lengths = Vec::with_capacity(len); fn aux( bit_lengths: &mut Vec<(T, BitLen)>, - max_bit_len: u8, + max_bit_len: BitLen, depth: u8, node: &NodeContent, ) -> Result<(), u8> @@ -505,7 +533,7 @@ where { match *node { NodeContent::Leaf(ref value) => { - if depth > max_bit_len { + if depth > max_bit_len.as_u8() { return Err(depth); } bit_lengths.push((value.clone(), BitLen(depth))); @@ -586,17 +614,41 @@ impl Codebook { self.mappings } - pub fn map(self, mut f: F) -> Codebook - where - F: FnMut(T) -> U, - { - Codebook { - highest_bit_len: self.highest_bit_len, - mappings: self - .mappings - .into_iter() - .map(|(value, key)| (f(value), key)) - .collect(), - } + + /// Iterate through this Codebook. + pub fn iter(&self) -> impl Iterator { + self.mappings.iter() } } + +/// An alphabet of symbols. +pub trait Alphabet { + type Symbol: Ord + Clone; + + /// Read a symbol from an input stream. + fn read_literal(input: R) -> Result + where R: io::Read; +} + +/// An alphabet of symbols known statically from the grammar. +/// Also known as `Implicit Symbols` in the grammar. +/// +/// For instance, in most languages, there is a finite set of +/// arithmetic operators specified by the grammar. 
+pub trait StaticAlphabet: Alphabet { + /// The number of symbols in this static alphabet. + fn len() -> u32; + + /// Return the nth value of the alphabet or `None` if there is no such value. + fn index(u32) -> Option; +} + +/// An alphabet of symbols known dynamically from the file. +/// Also known as `Explicit Symbols` in the grammar. +/// +/// For instance, in most languages, the set of literal strings +/// actually used in a file is determined by the user, not by +/// the grammar. +pub trait DynamicAlphabet: Alphabet { +} + diff --git a/crates/binjs_io/src/context/huffman/read.rs b/crates/binjs_io/src/context/huffman/read.rs index e242be097..3a9449f78 100644 --- a/crates/binjs_io/src/context/huffman/read.rs +++ b/crates/binjs_io/src/context/huffman/read.rs @@ -4,8 +4,10 @@ //! into values. use context::huffman::*; +use context::varnum::ReadVaru32; -use std::convert::{TryFrom, TryInto}; +use std::convert::{ TryFrom, TryInto }; +use std::io::{ self, Read }; /// A Huffman table. /// @@ -597,3 +599,81 @@ Viverra arcu dapibus nam magna a imperdiet inceptos cubilia libero lobortis prae |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(10), codebook), ); } + + +impl Codebook where T: Ord + Clone { + /// Parse a Codebook containing a single symbol. + fn parse_single_symbol(mut inp: R) -> Result where A: Alphabet, R: Read { + let symbol = A::read_literal(&mut inp)?; + Codebook::from_bit_lens(vec![(symbol, BitLen::new(0))], MAX_CODE_BIT_LEN) + .map_err(|_| + io::Error::new(io::ErrorKind::InvalidData, "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN") + ) + } + + /// Parse a Codebook for `StaticAlphabet`. 
+ fn parse_static(mut inp: R) -> Result where A: StaticAlphabet, R: Read { + let mut byte = [0]; + inp.read_exact(&mut byte)?; + match byte[0] { + 0 => /* spec: UnitCodeTable */ Self::parse_single_symbol::(inp), + 1 => /* spec: MultiCodeTableImplicit */ { + let number_of_symbols = A::len(); + let mut bit_lens = Vec::with_capacity(number_of_symbols as usize); + for i in 0..number_of_symbols { + // Read the bit length. + let mut byte = [0]; + inp.read_exact(&mut byte)?; + let bit_len = BitLen::new(byte[0]); + + // Extract the symbol from the grammar. + let symbol = A::index(i).unwrap(); // We're within 0..A::len() + + bit_lens.push((symbol, bit_len)); + } + // Finally, build a codebook. + Codebook::from_bit_lens(bit_lens, MAX_CODE_BIT_LEN) + .map_err(|_| + io::Error::new(io::ErrorKind::InvalidData, "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN") + ) + } + 2 => /* spec: EmptyCodeTable */ Ok(Codebook::new()), + _ => Err(io::Error::new(io::ErrorKind::InvalidData, "Incorrect CodeTable kind")) + } + } + + /// Parse a Codebook for `DynamicAlphabet`. + fn parse_dynamic(mut inp: R) -> Result where A: DynamicAlphabet, R: Read, T: Default { + let mut byte = [0]; + inp.read_exact(&mut byte)?; + match byte[0] { + 0 => /* spec: UnitCodeTable */ Self::parse_single_symbol::(inp), + 1 => /* spec: MultiCodeTableExplicit */ { + let number_of_symbols = *inp.read_varu32_no_normalization()?.value(); + // FIXME: We may need to guard against DoS by high `number_of_symbols`. + let mut bit_lens = Vec::with_capacity(number_of_symbols as usize); + + // Read bit lengths. + for _ in 0..number_of_symbols { + let mut byte = [0]; + inp.read_exact(&mut byte)?; + bit_lens.push((T::default(), BitLen::new(byte[0]))); + } + + // Amend with symbols + for i in 0..number_of_symbols { + let symbol = A::read_literal(&mut inp)?; + bit_lens[i as usize].0 = symbol; + } + + // Finally, build a codebook. 
+ Codebook::from_bit_lens(bit_lens, MAX_CODE_BIT_LEN) + .map_err(|_| + io::Error::new(io::ErrorKind::InvalidData, "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN") + ) + } + 2 => /* spec: EmptyCodeTable */ Ok(Codebook::new()), + _ => Err(io::Error::new(io::ErrorKind::InvalidData, "Incorrect CodeTable kind")) + } + } +} \ No newline at end of file diff --git a/crates/binjs_io/src/context/varnum.rs b/crates/binjs_io/src/context/varnum.rs index fa4738066..56b70e825 100644 --- a/crates/binjs_io/src/context/varnum.rs +++ b/crates/binjs_io/src/context/varnum.rs @@ -24,6 +24,17 @@ pub struct ByteValue { /// The number of bytes consumed. byte_len: usize, } +impl ByteValue { + /// The value read. + pub fn value(&self) -> &T { + &self.value + } + + /// The number of bytes consumed. + pub fn byte_len(&self) -> usize { + self.byte_len + } +} /// A reader that may read varu32-encoded u32 values from a stream. pub trait ReadVaru32 { diff --git a/crates/binjs_io/src/io/statistics.rs b/crates/binjs_io/src/io/statistics.rs index cc39ab5a8..e12205365 100644 --- a/crates/binjs_io/src/io/statistics.rs +++ b/crates/binjs_io/src/io/statistics.rs @@ -12,6 +12,7 @@ impl std::iter::Sum for Bytes { /// A newtype for `usize` used to count the number of instances of some item. #[derive( + Constructor, Default, Display, Serialize, diff --git a/spec/context.md b/spec/context.md index fe8311f04..28f72f573 100644 --- a/spec/context.md +++ b/spec/context.md @@ -137,7 +137,8 @@ CodeTable ::= UnitCodeTable UnitCodeTable ::= 00h LiteralSymbol MultiCodeTableExplicit ::= 01h CodeCount [CodeLength]{CodeCount} [LiteralSymbol]{CodeCount} CodeLength ::= 00h .. 
14h -MultiCodeTableExplicit ::= 01h [CodeLength]{SymbolCount} +CodeCount ::= Varuint +MultiCodeTableImplicit ::= 01h [CodeLength]{SymbolCount} EmptyCodeTable ::= 02h ``` From 4727bfde405af41bb10fdbaf8e560d63dad9954c Mon Sep 17 00:00:00 2001 From: David Teller Date: Thu, 26 Sep 2019 15:41:15 +0200 Subject: [PATCH 5/7] Implementing Codebook::write* --- .../binjs_io/src/context/huffman/codebook.rs | 616 ++++++++++++++++++ crates/binjs_io/src/context/huffman/mod.rs | 298 +-------- crates/binjs_io/src/context/huffman/read.rs | 87 +-- 3 files changed, 631 insertions(+), 370 deletions(-) create mode 100644 crates/binjs_io/src/context/huffman/codebook.rs diff --git a/crates/binjs_io/src/context/huffman/codebook.rs b/crates/binjs_io/src/context/huffman/codebook.rs new file mode 100644 index 000000000..dbe8451c8 --- /dev/null +++ b/crates/binjs_io/src/context/huffman/codebook.rs @@ -0,0 +1,616 @@ +use context::huffman::*; +use context::varnum::{ReadVaru32, WriteVaru32}; + +use std::io::{self, Read, Write}; + +const TABLE_HEADER_UNIT: u8 = 0; +const TABLE_HEADER_MULTI: u8 = 1; +const TABLE_HEADER_EMPTY: u8 = 2; + +const VEC_MAX_PRE_ALLOC: usize = 1024; + +/// Codebook associated to a sequence of values. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Codebook { + /// The longest bit length that actually appears in `mappings`. + highest_bit_len: BitLen, + + /// The sequence of keys. + /// + /// Order is meaningful. + mappings: Vec<(T, Key)>, +} + +impl Codebook { + /// The number of elements in this Codebook. + pub fn len(&self) -> usize { + self.mappings.len() + } + + /// The longest bit length that acctually appears in this Codebook. 
+ pub fn highest_bit_len(&self) -> BitLen { + self.highest_bit_len + } +} + +impl IntoIterator for Codebook { + type Item = (T, Key); + type IntoIter = std::vec::IntoIter<(T, Key)>; + fn into_iter(self) -> Self::IntoIter { + self.mappings.into_iter() + } +} + +impl Codebook +where + T: Ord + Clone, +{ + /// Compute a `Codebook` from a sequence of values. + /// + /// Optionally, `max_bit_len` may specify a largest acceptable bit length. + /// If the `Codebook` may not be computed without exceeding this bit length, + /// fail with `Err(problemantic_bit_len)`. + /// + /// The current implementation only attempts to produce the best compression + /// level. This may cause us to exceed `max_bit_len` even though an + /// alternative table, with a lower compression level, would let us + /// proceed without exceeding `max_bit_len`. + /// + /// # Performance + /// + /// Values (type `T`) will be cloned regularly, so you should make + /// sure that their cloning is reasonably cheap. + pub fn from_sequence(source: S, max_bit_len: BitLen) -> Result + where + S: IntoIterator, + T: PartialEq + Hash, + { + // Count the values. + let mut map = HashMap::new(); + for item in source { + let counter = map.entry(item).or_insert(0.into()); + *counter += 1.into(); + } + // Then compute the `Codebook`. + Self::from_instances(map, max_bit_len) + } + + /// Compute a `Codebook` from a sequence of values + /// with a number of instances already attached. + /// + /// The current implementation only attempts to produce the best compression + /// level. This may cause us to exceed `max_bit_len` even though an + /// alternative table, with a lower compression level, would let us + /// proceed without exceeding `max_bit_len`. + /// + /// # Requirement + /// + /// Values of `T` in the source MUST be distinct. 
+ pub fn from_instances(source: S, max_bit_len: BitLen) -> Result + where + S: IntoIterator, + { + let bit_lengths = Self::compute_bit_lengths(source, max_bit_len)?; + Self::from_bit_lens(bit_lengths, max_bit_len) + } + + /// Compute a `Codebook` from a sequence of values + /// with a bit length already attached. + /// + /// The current implementation only attempts to produce the best compression + /// level. This may cause us to exceed `max_bit_len` even though an + /// alternative table, with a lower compression level, would let us + /// proceed without exceeding `max_bit_len`. + /// + /// # Requirement + /// + /// Values of `T` in the source MUST be distinct. + pub fn from_bit_lens( + mut bit_lens: Vec<(T, BitLen)>, + max_bit_len: BitLen, + ) -> Result { + let mut highest_bit_len = BitLen(0); + + // Canonicalize order: (BitLen, T) + bit_lens.sort_unstable_by_key(|&(ref value, ref bit_len)| (*bit_len, value.clone())); + + // The bits associated to the next value. + let mut bits = 0; + let mut mappings = Vec::with_capacity(bit_lens.len()); + + for i in 0..bit_lens.len() - 1 { + let (bit_len, symbol, next_bit_len) = + (bit_lens[i].1, bit_lens[i].0.clone(), bit_lens[i + 1].1); + // FIXME: Instead of asserting, this should fail gracefully. + mappings.push((symbol.clone(), Key::new(bits, bit_len))); + bits = (bits + 1) << (next_bit_len - bit_len); + if bit_len > highest_bit_len { + highest_bit_len = bit_len; + } + } + // Handle the last element. 
+ let (ref symbol, bit_len) = bit_lens[bit_lens.len() - 1]; + if bit_len > highest_bit_len { + highest_bit_len = bit_len; + } + mappings.push((symbol.clone(), Key::new(bits, bit_len))); + + if highest_bit_len > max_bit_len { + return Err(highest_bit_len); + } + + return Ok(Self { + highest_bit_len, + mappings, + }); + } + + /// Convert a sequence of values labelled by their number of instances + /// into a sequence of values labelled by the length for their path + /// in the Huffman tree, aka the bitlength of their Huffman key. + /// + /// Values that have 0 instances are skipped. + pub fn compute_bit_lengths(source: S, max_bit_len: BitLen) -> Result, u8> + where + S: IntoIterator, + { + // Build a min-heap sorted by number of instances. + use std::cmp::Reverse; + let mut heap = BinaryHeap::new(); + + // Skip values that have 0 instances. + for (value, instances) in source { + if !instances.is_zero() { + heap.push(Reverse(Node { + instances, + content: NodeContent::Leaf(value), + })); + } + } + + let len = heap.len(); + if len == 0 { + // Special case: no tree to build. + return Ok(vec![]); + } + + // Take the two rarest nodes, merge them behind a prefix, + // turn them into a single node with combined number of + // instances. Repeat. + while heap.len() > 1 { + let left = heap.pop().unwrap(); + let right = heap.pop().unwrap(); + heap.push(Reverse(Node { + instances: left.0.instances + right.0.instances, + content: NodeContent::Internal { + left: Box::new(left.0.content), + right: Box::new(right.0.content), + }, + })); + } + + // Convert tree into bit lengths + let root = heap.pop().unwrap(); // We have checked above that there is at least one value. 
+ let mut bit_lengths = Vec::with_capacity(len); + fn aux( + bit_lengths: &mut Vec<(T, BitLen)>, + max_bit_len: BitLen, + depth: u8, + node: &NodeContent, + ) -> Result<(), u8> + where + T: Clone, + { + match *node { + NodeContent::Leaf(ref value) => { + if depth > max_bit_len.as_u8() { + return Err(depth); + } + bit_lengths.push((value.clone(), BitLen(depth))); + Ok(()) + } + NodeContent::Internal { + ref left, + ref right, + } => { + aux(bit_lengths, max_bit_len, depth + 1, left)?; + aux(bit_lengths, max_bit_len, depth + 1, right)?; + Ok(()) + } + } + } + aux(&mut bit_lengths, max_bit_len, 0, &root.0.content)?; + + Ok(bit_lengths) + } +} + +#[test] +fn test_coded_from_sequence() { + let sample = "appl"; + let coded = Codebook::from_sequence(sample.chars(), BitLen::new(std::u8::MAX)).unwrap(); + + // Symbol 'p' appears twice, we should see 3 codes. + assert_eq!(coded.mappings.len(), 3); + + // Check order of symbols. + assert_eq!(coded.mappings[0].0, 'p'); + assert_eq!(coded.mappings[1].0, 'a'); + assert_eq!(coded.mappings[2].0, 'l'); + + // Check bit length of symbols. + assert_eq!(coded.mappings[0].1.bit_len(), 1.into()); + assert_eq!(coded.mappings[1].1.bit_len(), 2.into()); + assert_eq!(coded.mappings[2].1.bit_len(), 2.into()); + + // Check code of symbols. + assert_eq!(coded.mappings[0].1.bits(), 0b00); + assert_eq!(coded.mappings[1].1.bits(), 0b10); + assert_eq!(coded.mappings[2].1.bits(), 0b11); + + // Let's try again with a limit to 1 bit paths. + assert_eq!( + Codebook::from_sequence(sample.chars(), BitLen::new(1)).unwrap_err(), + BitLen::new(2) + ); +} + +impl Codebook { + /// Create an empty Codebook + pub fn new() -> Self { + Self { + highest_bit_len: BitLen::new(0), + mappings: vec![], + } + } + + /// Create an empty Codebook + pub fn with_capacity(len: usize) -> Self { + Self { + highest_bit_len: BitLen::new(0), + mappings: Vec::with_capacity(len), + } + } + + /// Add a mapping to a Codebook. 
+ /// + /// This method does **not** check that the resulting Codebook is correct. + pub unsafe fn add_mapping(&mut self, value: T, key: Key) { + if key.bit_len() > self.highest_bit_len { + self.highest_bit_len = key.bit_len(); + } + self.mappings.push((value, key)); + } + + /// Return the mappings of a Codebook. + pub fn mappings(self) -> Vec<(T, Key)> { + self.mappings + } + + /// Iterate through this Codebook. + pub fn iter(&self) -> impl Iterator { + self.mappings.iter() + } +} + +/// Writing +impl Codebook +where + T: Ord + Clone + Hash, +{ + /// Write a Codebook for `StaticAlphabet`. + fn write_static(&self, mut out: W) -> Result<(), io::Error> + where + A: StaticAlphabet, + W: Write, + { + match self.len() { + 0 => { + /* spec: EmptyCodeTable */ + out.write_all(&[TABLE_HEADER_EMPTY])?; + Ok(()) + } + 1 => { + /* spec: UnitCodeTable */ + out.write_all(&[TABLE_HEADER_UNIT])?; + A::write_literal(&self.mappings[0].0, out)?; + Ok(()) + } + _ => { + /* spec: MultiCodeTableImplicit */ + out.write_all(&[TABLE_HEADER_MULTI])?; + let map: HashMap<_, _> = self.mappings.iter().cloned().collect(); + for i in 0..A::len() { + let symbol = A::symbol(i).unwrap(); // We're in 0..A::len() + let bit_len = map + .get(&symbol) + .map(|key| key.bit_len().clone()) + .unwrap_or(BitLen::new(0)); + out.write_all(&[bit_len.as_u8()])?; + } + Ok(()) + } + } + } + + /// Write a Codebook for `DynamicAlphabet`. + fn write_dynamic(&self, mut out: W) -> Result<(), io::Error> + where + A: DynamicAlphabet, + W: Write, + { + match self.len() { + 0 => { + /* spec: EmptyCodeTable */ + out.write_all(&[TABLE_HEADER_EMPTY])?; + Ok(()) + } + 1 => { + /* spec: UnitCodeTable */ + out.write_all(&[TABLE_HEADER_UNIT])?; + A::write_literal(&self.mappings[0].0, out)?; + Ok(()) + } + _ => { + /* spec: MultiCodeTableExplicit */ + + // First the header. + out.write_all(&[TABLE_HEADER_MULTI])?; + + // Now, the length. + out.write_varu32(self.len() as u32)?; + + // Then bit lengths. 
+ for &(_, ref key) in &self.mappings { + out.write_all(&[key.bit_len().as_u8()])?; + } + self.mappings.len(); + + // Then symbols. + for &(ref symbol, _) in &self.mappings { + A::write_literal(symbol, &mut out)?; + } + + Ok(()) + } + } + } +} + +/// Reading +impl Codebook +where + T: Ord + Clone, +{ + /// Parse a Codebook containing a single symbol. + fn read_single_symbol(mut inp: R) -> Result + where + A: Alphabet, + R: Read, + { + let symbol = A::read_literal(&mut inp)?; + Codebook::from_bit_lens(vec![(symbol, BitLen::new(0))], MAX_CODE_BIT_LEN).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN", + ) + }) + } + + /// Parse a Codebook for `StaticAlphabet`. + pub fn read_static(mut inp: R) -> Result + where + A: StaticAlphabet, + R: Read, + { + let mut byte = [0]; + inp.read_exact(&mut byte)?; + match byte[0] { + 0 => + /* spec: UnitCodeTable */ + { + Self::read_single_symbol::(inp) + } + 1 => + /* spec: MultiCodeTableImplicit */ + { + let number_of_symbols = A::len(); + let mut bit_lens = + Vec::with_capacity(usize::min(number_of_symbols as usize, VEC_MAX_PRE_ALLOC)); + for i in 0..number_of_symbols { + // Read the bit length. + let mut byte = [0]; + inp.read_exact(&mut byte)?; + let bit_len = BitLen::new(byte[0]); + + if bit_len > BitLen::new(0) { + // Extract the symbol from the grammar. + let symbol = A::symbol(i).unwrap(); // We're within 0..A::len() + + bit_lens.push((symbol, bit_len)); + } + } + // Finally, build a codebook. + Codebook::from_bit_lens(bit_lens, MAX_CODE_BIT_LEN).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN", + ) + }) + } + 2 => + /* spec: EmptyCodeTable */ + { + Ok(Codebook::new()) + } + _ => Err(io::Error::new( + io::ErrorKind::InvalidData, + "Incorrect CodeTable kind", + )), + } + } + + /// Parse a Codebook for `DynamicAlphabet`. 
+ pub fn read_dynamic(mut inp: R) -> Result + where + A: DynamicAlphabet, + R: Read, + T: Default, + { + let mut byte = [0]; + inp.read_exact(&mut byte)?; + match byte[0] { + 0 => + /* spec: UnitCodeTable */ + { + Self::read_single_symbol::(inp) + } + 1 => + /* spec: MultiCodeTableExplicit */ + { + let number_of_symbols = *inp.read_varu32_no_normalization()?.value(); + let mut bit_lens = + Vec::with_capacity(usize::min(number_of_symbols as usize, VEC_MAX_PRE_ALLOC)); + + // Read bit lengths. + for _ in 0..number_of_symbols { + let mut byte = [0]; + inp.read_exact(&mut byte)?; + bit_lens.push((T::default(), BitLen::new(byte[0]))); + } + + // Amend with symbols + for i in 0..number_of_symbols { + let symbol = A::read_literal(&mut inp)?; + bit_lens[i as usize].0 = symbol; + } + + // Finally, build a codebook. + Codebook::from_bit_lens(bit_lens, MAX_CODE_BIT_LEN).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidData, + "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN", + ) + }) + } + 2 => + /* spec: EmptyCodeTable */ + { + Ok(Codebook::new()) + } + _ => Err(io::Error::new( + io::ErrorKind::InvalidData, + "Incorrect CodeTable kind", + )), + } + } +} + +#[test] +fn read_and_write_codebook() { + use std::convert::TryInto; + use std::io; + + for sample in &[ + "appl", + "Lorem ipsum dolor sit amet consectetur adipiscing elit nunc, ridiculus hac natoque ante quisque imperdiet primis et euismod, pellentesque per turpis purus vestibulum quam dui. Himenaeos inceptos hac laoreet purus eros donec imperdiet, aliquam habitant felis class fusce etiam nulla facilisi, pretium eu nisl ultrices augue dictum. Venenatis mauris semper ultricies platea interdum sapien iaculis, habitasse eget habitant nec nam tincidunt, nulla aptent arcu duis laoreet volutpat. + +Torquent facilisi vestibulum erat eleifend diam convallis ac at, feugiat nullam vulputate euismod lacinia mollis quis venenatis, gravida porttitor cursus nascetur lacus per nostra. 
Platea ante curae netus torquent diam ultrices massa orci, vulputate sociis curabitur himenaeos litora sed aliquam nisi rutrum, cras porttitor per etiam iaculis eget arcu. Varius turpis libero metus luctus senectus condimentum cum mattis arcu, faucibus volutpat dapibus torquent ultrices fusce primis morbi, sed augue ridiculus magnis vitae placerat tempus curabitur. + +Aliquam habitant eu curae est eget orci auctor, non vehicula augue montes litora ac, class quis cum volutpat condimentum ullamcorper. Quisque consequat est vehicula volutpat at proin gravida sociosqu, nec dis ac ultricies phasellus viverra donec nullam, eros potenti facilisis mauris ad curabitur quis. Magna nisl ligula tellus conubia accumsan fringilla iaculis inceptos leo litora, eget integer malesuada rhoncus varius a tempor augue. Posuere nullam parturient eleifend quisque ornare vulputate curae ultricies iaculis est, odio scelerisque hendrerit non primis ut leo ante libero, nisi eu quam euismod habitant velit per lectus cubilia. + +Blandit quisque urna proin nostra praesent dui, magnis sollicitudin auctor ultrices platea sociis habitant, ut faucibus habitasse luctus elementum. Hendrerit elementum rutrum in erat nulla facilisi mauris torquent mus, diam consequat pulvinar tempor sociosqu conubia ornare ante, vehicula litora scelerisque magna placerat eleifend sapien risus. Pellentesque curabitur parturient per facilisi rhoncus porta posuere enim hendrerit, lacus litora aptent etiam vel id ante rutrum donec, platea gravida integer urna tristique est potenti class. Mus ante ut cursus in lacinia, sollicitudin posuere inceptos ullamcorper a, nam cras mi venenatis. + +Arcu magna lacus habitant eleifend cursus vitae, fermentum diam scelerisque nisi habitasse, conubia felis quis suscipit facilisis. Sociosqu erat lectus etiam aliquam quis vulputate praesent pharetra cras nam fermentum ultricies, nunc parturient fames imperdiet sem posuere molestie mi felis suscipit. 
Tortor etiam ligula leo nunc senectus sem pharetra, viverra suscipit egestas cum eu ullamcorper netus accumsan, eleifend porttitor sed lectus varius integer. Sem nascetur ligula ultrices risus eros nisl quisque, pulvinar lacinia sagittis magna primis odio dictum, metus a curabitur ante taciti inceptos. + +Mollis laoreet sollicitudin augue tortor facilisis cubilia molestie auctor erat sociis, condimentum parturient vestibulum lacinia urna potenti nascetur vehicula varius tempor mattis, velit maecenas tristique a habitant et porttitor tempus netus. Habitant interdum penatibus litora himenaeos dignissim torquent quam nulla, praesent elementum ad potenti accumsan class urna malesuada ut, aliquet aliquam egestas venenatis leo eu rhoncus pellentesque, augue ultricies posuere fames nullam aenean pretium. Tristique penatibus neque leo dignissim vulputate bibendum rhoncus pharetra, sem rutrum vehicula mauris lobortis proin platea, viverra metus natoque accumsan hendrerit posuere nunc. Aliquam nam porttitor leo tortor vel tempor nulla non, sollicitudin habitasse ornare magnis feugiat metus viverra quisque libero, risus eget enim orci torquent aptent molestie. + +Dignissim laoreet quis ligula non auctor id pellentesque justo, varius platea eget convallis dictum dui faucibus nec porttitor, porta praesent eu ante in rhoncus congue. Massa etiam eget vel torquent dis potenti accumsan ultrices, pulvinar et cursus cubilia maecenas diam himenaeos nunc blandit, semper vulputate turpis at scelerisque porttitor primis. Nam odio venenatis maecenas at tortor viverra metus, turpis suscipit ad facilisis elementum primis felis luctus, tempor curabitur suspendisse lobortis nunc ligula. + +Ante aliquet ultricies est lobortis a sollicitudin urna parturient eu, nec massa cursus mollis sagittis id risus accumsan condimentum, nisl platea habitant aenean eros leo fringilla blandit. 
Mi semper convallis posuere dictum integer torquent suspendisse, in rhoncus nulla himenaeos sociosqu cras praesent quam, nostra turpis scelerisque tempor facilisi velit. Mauris nec ut risus imperdiet varius venenatis quam ligula, luctus cursus velit scelerisque ullamcorper ultrices sociis viverra, vulputate lectus volutpat sodales nostra tincidunt suspendisse. Senectus fermentum bibendum a tristique sed sociosqu potenti, lectus ante egestas ac consequat donec eros, penatibus enim ridiculus luctus cursus malesuada. + +Dui dictum dignissim dis ultricies justo donec nisi, cum quisque rhoncus aliquam interdum iaculis dapibus, fringilla phasellus accumsan eget odio inceptos. Placerat laoreet iaculis nullam enim praesent diam semper porta montes, nisi commodo tempus rutrum nostra in himenaeos cum primis mollis, auctor congue venenatis a sed sollicitudin pulvinar ad. Duis faucibus penatibus mauris turpis tempus suscipit, litora habitasse ultricies potenti auctor, semper in ac placerat sollicitudin. + +Turpis at taciti lacus aenean cum, donec facilisi diam neque, pellentesque mattis sem auctor. Duis donec maecenas consequat nullam a fusce cubilia malesuada, hendrerit ad porttitor ac neque netus dictum, felis suscipit est nisl parturient porta elementum. Ullamcorper tempor porttitor quis integer nullam proin taciti facilisis eget dui habitasse, nisl ad erat placerat curae dictum litora lectus urna facilisi, varius tincidunt nam enim lacus tellus est suspendisse porta cum. + +Conubia a rhoncus metus felis nullam dictumst tempus dignissim, egestas neque pulvinar tincidunt feugiat congue suscipit elementum, hac sociosqu fringilla nunc bibendum magna curae. Netus massa suspendisse tellus sapien a montes, metus varius aenean mauris tempus dis fames, tincidunt eu vulputate quis pulvinar. Lobortis curabitur molestie tortor aliquam posuere magnis consequat, tellus suspendisse purus pretium ultricies nibh fermentum, potenti odio egestas tempus varius id. 
+ +Placerat fames proin suspendisse porta posuere quam orci senectus integer sed, nostra diam elementum phasellus vulputate dictum litora accumsan platea, sociosqu morbi dictumst nascetur parturient lacinia cubilia blandit pretium. Felis nostra natoque facilisis taciti diam nam netus est malesuada, tellus accumsan montes arcu lacinia et dictum rhoncus commodo, cum purus dui maecenas egestas sollicitudin eu risus. Augue ullamcorper penatibus at curae urna hac habitant suspendisse fringilla platea, fames sed fermentum sociis etiam sapien ac dictum maecenas cras, volutpat nullam tempus ornare leo ultricies lobortis mus arcu. Velit consequat fermentum facilisis eleifend vestibulum ullamcorper platea mi faucibus potenti sagittis nisl, himenaeos volutpat pellentesque nascetur gravida tempus interdum enim tristique sed curabitur mollis, commodo magnis facilisi tempor ultrices vehicula vel nisi metus iaculis varius. + +Rhoncus aliquet fermentum imperdiet senectus porttitor vulputate pharetra tortor, feugiat suscipit proin magnis cubilia primis magna urna, blandit facilisis cum aenean purus curabitur platea. Pharetra dis vivamus cursus proin hendrerit faucibus himenaeos praesent, mus facilisi sodales sed curabitur scelerisque aliquam, aenean velit platea mollis ultrices integer tincidunt. Montes vivamus phasellus tempus tellus a fermentum habitant hendrerit parturient ligula mollis et, varius dapibus sed cras nam libero blandit eu vestibulum laoreet nunc, porttitor ut pretium curae dictum id justo erat nisl nisi integer. + +Ultrices iaculis per netus odio condimentum molestie penatibus nibh, ultricies faucibus cras sagittis neque ante pulvinar, justo ad ullamcorper at malesuada tellus nisl. Porttitor lacinia vestibulum ut condimentum donec, blandit ullamcorper euismod fringilla pharetra id, natoque lectus pretium vel. 
Sodales elementum sed est himenaeos ligula luctus porta montes cum, integer eu vivamus volutpat viverra pulvinar orci faucibus nostra, maecenas neque magnis dis nulla habitant metus velit. Urna quam a enim scelerisque pretium taciti vestibulum quisque dignissim, suspendisse nisl habitasse turpis accumsan nec pellentesque inceptos, tempor aptent ad sollicitudin velit praesent porttitor facilisis. + +Lacus dui velit mus ut cursus ridiculus montes, id vehicula vivamus taciti egestas urna vulputate, rutrum dapibus aptent non ullamcorper aliquet. Eget erat dictum montes facilisis sodales nascetur ante quisque, mattis venenatis penatibus senectus ultricies praesent himenaeos, aliquam porttitor accumsan diam quis platea et. Habitasse donec parturient lectus vehicula non magnis quis et ante netus, natoque proin lacus posuere commodo nisl eget placerat sed aenean, imperdiet lobortis volutpat massa cubilia curae metus nisi blandit. Viverra tortor suspendisse aenean nisl pretium augue, parturient vestibulum dignissim tristique quis, neque ultricies ad quisque lacinia. + +Condimentum nisl mus pulvinar semper metus placerat habitasse commodo aptent, fermentum eros mollis inceptos venenatis ut natoque id hac magna, per ornare penatibus conubia tellus sed erat mi. Etiam felis enim inceptos libero facilisis dis litora imperdiet cursus netus, sapien accumsan in turpis facilisi fermentum mus dictumst fames, bibendum aptent metus habitasse tempus condimentum ante augue volutpat. Pulvinar inceptos sociis elementum blandit facilisi natoque eu, mollis neque lacus aliquet tristique massa habitasse, mus praesent vestibulum augue porta nisl. Quisque porta vestibulum sociis ad vulputate felis conubia lacus enim, sociosqu libero luctus condimentum nibh parturient et lobortis, egestas mauris proin tempus montes pulvinar senectus dictum. 
+ +Pharetra habitasse praesent tristique taciti dignissim nullam faucibus mus at, curabitur inceptos libero accumsan facilisis tempus duis mi ut, massa magnis vitae metus est magna placerat nam, convallis aliquet sed auctor ullamcorper gravida rhoncus aptent. Platea aenean sagittis per fringilla mollis auctor rhoncus, blandit magna aptent egestas himenaeos tincidunt malesuada eget, luctus ad massa vulputate sapien pulvinar. Senectus scelerisque gravida viverra morbi metus augue suspendisse, pulvinar maecenas urna dictum nascetur cursus, sem ultricies curae enim parturient accumsan. + +Nunc cubilia fusce ullamcorper senectus vulputate pellentesque natoque ac, taciti tortor nisl torquent quis posuere mus. Vel dignissim nulla imperdiet accumsan aliquet faucibus hendrerit ultricies neque vivamus, tempus feugiat praesent sodales rhoncus taciti congue ad dis velit, orci himenaeos quis hac suscipit litora ornare senectus dui. Inceptos nec condimentum viverra et augue lectus nunc diam, eros dis purus magna nullam ligula ultrices tortor, velit aenean tellus id porttitor faucibus volutpat. Quisque blandit gravida integer sociosqu est accumsan pulvinar, nullam condimentum conubia vulputate cursus netus iaculis, urna a habitant scelerisque aptent torquent. Vulputate himenaeos class malesuada tortor interdum velit potenti quisque risus pharetra, primis cum lectus mi ullamcorper sociosqu consequat posuere nisi, varius eleifend arcu id eget vel nullam etiam blandit. + +Quisque vestibulum proin torquent vel dictum convallis ligula placerat suspendisse enim, tristique lobortis sem feugiat libero lacus parturient tempus volutpat, habitasse imperdiet sociosqu mi dapibus scelerisque sollicitudin ullamcorper et. Euismod scelerisque mauris augue lacus porttitor cras ornare penatibus, nascetur egestas placerat platea cubilia varius volutpat duis malesuada, quisque mus ridiculus habitant senectus suscipit morbi. 
Ultrices leo cras morbi magna curabitur potenti vel mi, non hac varius imperdiet id metus ornare, nullam in quis dapibus torquent eros rhoncus. Nullam dapibus quisque luctus sollicitudin lacus euismod porta pulvinar sapien rutrum est, feugiat mollis nec ridiculus aenean sem tristique massa suspendisse faucibus. + +Bibendum tempor congue sed curabitur non quam velit porta, mauris montes mattis mollis sodales vivamus sociosqu tempus, himenaeos penatibus taciti commodo in id maecenas. Vulputate pretium mauris at viverra mus massa vehicula parturient, conubia velit tempus eleifend libero bibendum curabitur in ultricies, hendrerit tincidunt consequat porttitor justo commodo id. Neque congue sociosqu morbi massa libero aliquet purus nibh conubia, venenatis diam mauris justo mollis felis fusce tempus quis, suspendisse gravida blandit viverra bibendum euismod porttitor placerat." + ] { + let reference = Codebook::from_sequence(sample.bytes(), BitLen::new(std::u8::MAX)).unwrap(); + + struct ByteAlphabet; + impl Alphabet for ByteAlphabet { + type Symbol = u8; + fn read_literal(mut input: R) -> Result + where R: io::Read { + let mut buf = [0]; + input.read_exact(&mut buf)?; + Ok(buf[0]) + } + fn write_literal(symbol: &Self::Symbol, mut output: W) -> Result<(), io::Error> + where + W: io::Write + { + output.write_all(&[*symbol])?; + Ok(()) + } + } + + { + // Test as a static alphabet. + impl StaticAlphabet for ByteAlphabet { + fn len() -> u32 { + std::u8::MAX as u32 + } + fn symbol(index: u32) -> Option { + index.try_into().ok() + } + } + + // ...write + let mut buf = vec![]; + reference.write_static::(&mut buf) + .unwrap(); + + // ...read + let result = Codebook::read_static::(io::Cursor::new(&buf)) + .unwrap(); + + assert_eq!(result, reference); + } + + { + // Test as a dynamic alphabet. 
+ impl DynamicAlphabet for ByteAlphabet { } + + // ...write + let mut buf = vec![]; + reference.write_dynamic::(&mut buf) + .unwrap(); + + + // ...read + let result = Codebook::read_dynamic::(io::Cursor::new(&buf)) + .unwrap(); + + assert_eq!(result, reference); + } + } +} diff --git a/crates/binjs_io/src/context/huffman/mod.rs b/crates/binjs_io/src/context/huffman/mod.rs index daa7e74f3..a1889ff97 100644 --- a/crates/binjs_io/src/context/huffman/mod.rs +++ b/crates/binjs_io/src/context/huffman/mod.rs @@ -6,6 +6,9 @@ use std::collections::{BinaryHeap, HashMap}; use std::hash::Hash; use std::io; +/// Huffman trees. +mod codebook; + /// Reading from bitstreams and decoding their contents using Huffman tables. pub mod read; @@ -339,295 +342,18 @@ impl PartialEq for Node { } impl Eq for Node {} -/// Codebook associated to a sequence of values. -#[derive(Clone, Debug)] -pub struct Codebook { - /// The longest bit length that actually appears in `mappings`. - highest_bit_len: BitLen, - - /// The sequence of keys. - /// - /// Order is meaningful. - mappings: Vec<(T, Key)>, -} - -impl Codebook { - /// The number of elements in this Codebook. - pub fn len(&self) -> usize { - self.mappings.len() - } - - /// The longest bit length that acctually appears in this Codebook. - pub fn highest_bit_len(&self) -> BitLen { - self.highest_bit_len - } -} - -impl IntoIterator for Codebook { - type Item = (T, Key); - type IntoIter = std::vec::IntoIter<(T, Key)>; - fn into_iter(self) -> Self::IntoIter { - self.mappings.into_iter() - } -} - -impl Codebook -where - T: Ord + Clone, -{ - /// Compute a `Codebook` from a sequence of values. - /// - /// Optionally, `max_bit_len` may specify a largest acceptable bit length. - /// If the `Codebook` may not be computed without exceeding this bit length, - /// fail with `Err(problemantic_bit_len)`. - /// - /// The current implementation only attempts to produce the best compression - /// level. 
This may cause us to exceed `max_bit_len` even though an - /// alternative table, with a lower compression level, would let us - /// proceed without exceeding `max_bit_len`. - /// - /// # Performance - /// - /// Values (type `T`) will be cloned regularly, so you should make - /// sure that their cloning is reasonably cheap. - pub fn from_sequence(source: S, max_bit_len: BitLen) -> Result - where - S: IntoIterator, - T: PartialEq + Hash, - { - // Count the values. - let mut map = HashMap::new(); - for item in source { - let counter = map.entry(item).or_insert(0.into()); - *counter += 1.into(); - } - // Then compute the `Codebook`. - Self::from_instances(map, max_bit_len) - } - - /// Compute a `Codebook` from a sequence of values - /// with a number of instances already attached. - /// - /// The current implementation only attempts to produce the best compression - /// level. This may cause us to exceed `max_bit_len` even though an - /// alternative table, with a lower compression level, would let us - /// proceed without exceeding `max_bit_len`. - /// - /// # Requirement - /// - /// Values of `T` in the source MUST be distinct. - pub fn from_instances(source: S, max_bit_len: BitLen) -> Result - where - S: IntoIterator, - { - let bit_lengths = Self::compute_bit_lengths(source, max_bit_len)?; - Self::from_bit_lens(bit_lengths, max_bit_len) - } - - /// Compute a `Codebook` from a sequence of values - /// with a bit length already attached. - /// - /// The current implementation only attempts to produce the best compression - /// level. This may cause us to exceed `max_bit_len` even though an - /// alternative table, with a lower compression level, would let us - /// proceed without exceeding `max_bit_len`. - /// - /// # Requirement - /// - /// Values of `T` in the source MUST be distinct. 
- pub fn from_bit_lens(mut bit_lens: Vec<(T, BitLen)>, max_bit_len: BitLen) -> Result - { - let mut highest_bit_len = BitLen(0); - - // Canonicalize order: (BitLen, T) - bit_lens.sort_unstable_by_key(|&(ref value, ref bit_len)| (*bit_len, value.clone())); - - // The bits associated to the next value. - let mut bits = 0; - let mut mappings = Vec::with_capacity(bit_lens.len()); - - for i in 0..bit_lens.len() - 1 { - let (bit_len, symbol, next_bit_len) = ( - bit_lens[i].1, - bit_lens[i].0.clone(), - bit_lens[i + 1].1, - ); - mappings.push((symbol.clone(), Key::new(bits, bit_len))); - bits = (bits + 1) << (next_bit_len - bit_len); - if bit_len > highest_bit_len { - highest_bit_len = bit_len; - } - } - // Handle the last element. - let (ref symbol, bit_len) = bit_lens[bit_lens.len() - 1]; - if bit_len > highest_bit_len { - highest_bit_len = bit_len; - } - mappings.push((symbol.clone(), Key::new(bits, bit_len))); - - if highest_bit_len > max_bit_len { - return Err(highest_bit_len) - } - - return Ok(Self { - highest_bit_len, - mappings, - }); - } - - /// Convert a sequence of values labelled by their number of instances - /// into a sequence of values labelled by the length for their path - /// in the Huffman tree, aka the bitlength of their Huffman key. - /// - /// Values that have 0 instances are skipped. - pub fn compute_bit_lengths(source: S, max_bit_len: BitLen) -> Result, u8> - where - S: IntoIterator, - { - // Build a min-heap sorted by number of instances. - use std::cmp::Reverse; - let mut heap = BinaryHeap::new(); - - // Skip values that have 0 instances. - for (value, instances) in source { - if !instances.is_zero() { - heap.push(Reverse(Node { - instances, - content: NodeContent::Leaf(value), - })); - } - } - - let len = heap.len(); - if len == 0 { - // Special case: no tree to build. - return Ok(vec![]); - } - - // Take the two rarest nodes, merge them behind a prefix, - // turn them into a single node with combined number of - // instances. Repeat. 
- while heap.len() > 1 { - let left = heap.pop().unwrap(); - let right = heap.pop().unwrap(); - heap.push(Reverse(Node { - instances: left.0.instances + right.0.instances, - content: NodeContent::Internal { - left: Box::new(left.0.content), - right: Box::new(right.0.content), - }, - })); - } - - // Convert tree into bit lengths - let root = heap.pop().unwrap(); // We have checked above that there is at least one value. - let mut bit_lengths = Vec::with_capacity(len); - fn aux( - bit_lengths: &mut Vec<(T, BitLen)>, - max_bit_len: BitLen, - depth: u8, - node: &NodeContent, - ) -> Result<(), u8> - where - T: Clone, - { - match *node { - NodeContent::Leaf(ref value) => { - if depth > max_bit_len.as_u8() { - return Err(depth); - } - bit_lengths.push((value.clone(), BitLen(depth))); - Ok(()) - } - NodeContent::Internal { - ref left, - ref right, - } => { - aux(bit_lengths, max_bit_len, depth + 1, left)?; - aux(bit_lengths, max_bit_len, depth + 1, right)?; - Ok(()) - } - } - } - aux(&mut bit_lengths, max_bit_len, 0, &root.0.content)?; - - Ok(bit_lengths) - } -} - -#[test] -fn test_coded_from_sequence() { - let sample = "appl"; - let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap(); - - // Symbol 'p' appears twice, we should see 3 codes. - assert_eq!(coded.mappings.len(), 3); - - // Check order of symbols. - assert_eq!(coded.mappings[0].0, 'p'); - assert_eq!(coded.mappings[1].0, 'a'); - assert_eq!(coded.mappings[2].0, 'l'); - - // Check bit length of symbols. - assert_eq!(coded.mappings[0].1.bit_len(), 1.into()); - assert_eq!(coded.mappings[1].1.bit_len(), 2.into()); - assert_eq!(coded.mappings[2].1.bit_len(), 2.into()); - - // Check code of symbols. - assert_eq!(coded.mappings[0].1.bits(), 0b00); - assert_eq!(coded.mappings[1].1.bits(), 0b10); - assert_eq!(coded.mappings[2].1.bits(), 0b11); - - // Let's try again with a limit to 1 bit paths. 
- assert_eq!(Codebook::from_sequence(sample.chars(), 1).unwrap_err(), 2); -} - -impl Codebook { - /// Create an empty Codebook - pub fn new() -> Self { - Self { - highest_bit_len: BitLen::new(0), - mappings: vec![], - } - } - - /// Create an empty Codebook - pub fn with_capacity(len: usize) -> Self { - Self { - highest_bit_len: BitLen::new(0), - mappings: Vec::with_capacity(len), - } - } - - /// Add a mapping to a Codebook. - /// - /// This method does **not** check that the resulting Codebook is correct. - pub unsafe fn add_mapping(&mut self, value: T, key: Key) { - if key.bit_len() > self.highest_bit_len { - self.highest_bit_len = key.bit_len(); - } - self.mappings.push((value, key)); - } - - /// Return the mappings of a Codebook. - pub fn mappings(self) -> Vec<(T, Key)> { - self.mappings - } - - - /// Iterate through this Codebook. - pub fn iter(&self) -> impl Iterator { - self.mappings.iter() - } -} - /// An alphabet of symbols. pub trait Alphabet { type Symbol: Ord + Clone; /// Read a symbol from an input stream. fn read_literal(input: R) -> Result - where R: io::Read; + where + R: io::Read; + + fn write_literal(symbol: &Self::Symbol, output: W) -> Result<(), io::Error> + where + W: io::Write; } /// An alphabet of symbols known statically from the grammar. @@ -640,7 +366,7 @@ pub trait StaticAlphabet: Alphabet { fn len() -> u32; /// Return the nth value of the alphabet of `None` if there is no such value. - fn index(u32) -> Option; + fn symbol(u32) -> Option; } /// An alphabet of symbols known dynamically from the file. @@ -649,6 +375,4 @@ pub trait StaticAlphabet: Alphabet { /// For instance, in most languages, the set of literal strings /// actually used in a file is determined by the user, not by /// the grammar. 
-pub trait DynamicAlphabet: Alphabet { -} - +pub trait DynamicAlphabet: Alphabet {} diff --git a/crates/binjs_io/src/context/huffman/read.rs b/crates/binjs_io/src/context/huffman/read.rs index 3a9449f78..0fafd3920 100644 --- a/crates/binjs_io/src/context/huffman/read.rs +++ b/crates/binjs_io/src/context/huffman/read.rs @@ -3,11 +3,10 @@ //! These tables are designed to aid decoding from sequences of bits //! into values. +use context::huffman::codebook::*; use context::huffman::*; -use context::varnum::ReadVaru32; -use std::convert::{ TryFrom, TryInto }; -use std::io::{ self, Read }; +use std::convert::{TryFrom, TryInto}; /// A Huffman table. /// @@ -477,7 +476,7 @@ fn test_huffman_lookup() { H: HuffmanTable, { let sample = "appl"; - let codebook = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap(); + let codebook = Codebook::from_sequence(sample.chars(), BitLen::new(std::u8::MAX)).unwrap(); let table = from_codebook(codebook); assert_eq!(table.len(), 3); @@ -546,7 +545,7 @@ fn test_huffman_lookup_2() { Viverra arcu dapibus nam magna a imperdiet inceptos cubilia libero lobortis praesent habitasse, tortor id leo consequat sollicitudin elementum fames fringilla himenaeos donec. Phasellus posuere congue ultricies scelerisque senectus vivamus facilisi, vestibulum consequat aptent lectus ad sociis porta, purus libero eros leo at nec. Netus viverra urna nisl sapien conubia porta sed luctus penatibus cras, pulvinar iaculis sagittis fusce fringilla et rutrum sollicitudin ligula, dui vestibulum interdum pretium montes diam nibh inceptos ante. 
"; - let codebook = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap(); + let codebook = Codebook::from_sequence(sample.chars(), BitLen::new(std::u8::MAX)).unwrap(); let table = from_codebook(codebook.clone()); for (value, key) in codebook { // Test that candidate keys obtained by extending `key` with additional bits @@ -599,81 +598,3 @@ Viverra arcu dapibus nam magna a imperdiet inceptos cubilia libero lobortis prae |codebook| MultiLookupHuffmanTable::from_codebook(BitLen(10), codebook), ); } - - -impl Codebook where T: Ord + Clone { - /// Parse a Codebook containing a single symbol. - fn parse_single_symbol(mut inp: R) -> Result where A: Alphabet, R: Read { - let symbol = A::read_literal(&mut inp)?; - Codebook::from_bit_lens(vec![(symbol, BitLen::new(0))], MAX_CODE_BIT_LEN) - .map_err(|_| - io::Error::new(io::ErrorKind::InvalidData, "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN") - ) - } - - /// Parse a Codebook for `StaticAlphabet`. - fn parse_static(mut inp: R) -> Result where A: StaticAlphabet, R: Read { - let mut byte = [0]; - inp.read_exact(&mut byte)?; - match byte[0] { - 0 => /* spec: UnitCodeTable */ Self::parse_single_symbol::(inp), - 1 => /* spec: MultiCodeTableImplicit */ { - let number_of_symbols = A::len(); - let mut bit_lens = Vec::with_capacity(number_of_symbols as usize); - for i in 0..number_of_symbols { - // Read the bit length. - let mut byte = [0]; - inp.read_exact(&mut byte)?; - let bit_len = BitLen::new(byte[0]); - - // Extract the symbol from the grammar. - let symbol = A::index(i).unwrap(); // We're within 0..A::len() - - bit_lens.push((symbol, bit_len)); - } - // Finally, build a codebook. 
- Codebook::from_bit_lens(bit_lens, MAX_CODE_BIT_LEN) - .map_err(|_| - io::Error::new(io::ErrorKind::InvalidData, "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN") - ) - } - 2 => /* spec: EmptyCodeTable */ Ok(Codebook::new()), - _ => Err(io::Error::new(io::ErrorKind::InvalidData, "Incorrect CodeTable kind")) - } - } - - /// Parse a Codebook for `DynamicAlphabet`. - fn parse_dynamic(mut inp: R) -> Result where A: DynamicAlphabet, R: Read, T: Default { - let mut byte = [0]; - inp.read_exact(&mut byte)?; - match byte[0] { - 0 => /* spec: UnitCodeTable */ Self::parse_single_symbol::(inp), - 1 => /* spec: MultiCodeTableExplicit */ { - let number_of_symbols = *inp.read_varu32_no_normalization()?.value(); - // FIXME: We may need to guard against DoS by high `number_of_symbols`. - let mut bit_lens = Vec::with_capacity(number_of_symbols as usize); - - // Read bit lengths. - for _ in 0..number_of_symbols { - let mut byte = [0]; - inp.read_exact(&mut byte)?; - bit_lens.push((T::default(), BitLen::new(byte[0]))); - } - - // Amend with symbols - for i in 0..number_of_symbols { - let symbol = A::read_literal(&mut inp)?; - bit_lens[i as usize].0 = symbol; - } - - // Finally, build a codebook. 
- Codebook::from_bit_lens(bit_lens, MAX_CODE_BIT_LEN) - .map_err(|_| - io::Error::new(io::ErrorKind::InvalidData, "Could not derive a Codebook that does not exceed MAX_CODE_BIT_LEN") - ) - } - 2 => /* spec: EmptyCodeTable */ Ok(Codebook::new()), - _ => Err(io::Error::new(io::ErrorKind::InvalidData, "Incorrect CodeTable kind")) - } - } -} \ No newline at end of file From c2f02b07fd174a983c0813e3d7015e6ba11c7287 Mon Sep 17 00:00:00 2001 From: David Teller Date: Fri, 27 Sep 2019 15:41:22 +0200 Subject: [PATCH 6/7] Key::try_new --- .../binjs_io/src/context/huffman/codebook.rs | 34 ++++++++------- crates/binjs_io/src/context/huffman/mod.rs | 42 +++++++++++++------ 2 files changed, 49 insertions(+), 27 deletions(-) diff --git a/crates/binjs_io/src/context/huffman/codebook.rs b/crates/binjs_io/src/context/huffman/codebook.rs index dbe8451c8..5b588d30c 100644 --- a/crates/binjs_io/src/context/huffman/codebook.rs +++ b/crates/binjs_io/src/context/huffman/codebook.rs @@ -60,7 +60,7 @@ where /// /// Values (type `T`) will be cloned regularly, so you should make /// sure that their cloning is reasonably cheap. - pub fn from_sequence(source: S, max_bit_len: BitLen) -> Result + pub fn from_sequence(source: S, max_bit_len: BitLen) -> Result where S: IntoIterator, T: PartialEq + Hash, @@ -86,7 +86,7 @@ where /// # Requirement /// /// Values of `T` in the source MUST be distinct. - pub fn from_instances(source: S, max_bit_len: BitLen) -> Result + pub fn from_instances(source: S, max_bit_len: BitLen) -> Result where S: IntoIterator, { @@ -108,7 +108,7 @@ where pub fn from_bit_lens( mut bit_lens: Vec<(T, BitLen)>, max_bit_len: BitLen, - ) -> Result { + ) -> Result { let mut highest_bit_len = BitLen(0); // Canonicalize order: (BitLen, T) @@ -121,8 +121,7 @@ where for i in 0..bit_lens.len() - 1 { let (bit_len, symbol, next_bit_len) = (bit_lens[i].1, bit_lens[i].0.clone(), bit_lens[i + 1].1); - // FIXME: Instead of asserting, this should fail gracefully. 
- mappings.push((symbol.clone(), Key::new(bits, bit_len))); + mappings.push((symbol.clone(), Key::try_new(bits, bit_len)?)); bits = (bits + 1) << (next_bit_len - bit_len); if bit_len > highest_bit_len { highest_bit_len = bit_len; @@ -136,7 +135,10 @@ where mappings.push((symbol.clone(), Key::new(bits, bit_len))); if highest_bit_len > max_bit_len { - return Err(highest_bit_len); + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Could not create a codebook that fits into this bit length", + )); } return Ok(Self { @@ -150,7 +152,10 @@ where /// in the Huffman tree, aka the bitlength of their Huffman key. /// /// Values that have 0 instances are skipped. - pub fn compute_bit_lengths(source: S, max_bit_len: BitLen) -> Result, u8> + pub fn compute_bit_lengths( + source: S, + max_bit_len: BitLen, + ) -> Result, std::io::Error> where S: IntoIterator, { @@ -197,14 +202,17 @@ where max_bit_len: BitLen, depth: u8, node: &NodeContent, - ) -> Result<(), u8> + ) -> Result<(), std::io::Error> where T: Clone, { match *node { NodeContent::Leaf(ref value) => { if depth > max_bit_len.as_u8() { - return Err(depth); + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Could not create a codebook that fits into this bit length", + )); } bit_lengths.push((value.clone(), BitLen(depth))); Ok(()) @@ -228,7 +236,8 @@ where #[test] fn test_coded_from_sequence() { let sample = "appl"; - let coded = Codebook::from_sequence(sample.chars(), BitLen::new(std::u8::MAX)).unwrap(); + let try_make_codebook = |bit_len| Codebook::from_sequence(sample.chars(), bit_len); + let coded = try_make_codebook(BitLen::new(std::u8::MAX)).unwrap(); // Symbol 'p' appears twice, we should see 3 codes. assert_eq!(coded.mappings.len(), 3); @@ -249,10 +258,7 @@ fn test_coded_from_sequence() { assert_eq!(coded.mappings[2].1.bits(), 0b11); // Let's try again with a limit to 1 bit paths. 
- assert_eq!( - Codebook::from_sequence(sample.chars(), BitLen::new(1)).unwrap_err(), - BitLen::new(2) - ); + assert!(try_make_codebook(BitLen::new(1)).is_err()); } impl Codebook { diff --git a/crates/binjs_io/src/context/huffman/mod.rs b/crates/binjs_io/src/context/huffman/mod.rs index a1889ff97..a0462427b 100644 --- a/crates/binjs_io/src/context/huffman/mod.rs +++ b/crates/binjs_io/src/context/huffman/mod.rs @@ -256,21 +256,37 @@ impl Key { /// Create a new Key. /// /// Note that we only use the `bit_len` lowest-weight bits. - /// Any other bit MUST BE 0. + /// + /// # Failure + /// + /// - Panic if any bit other than the `bit_len` lowest-weight bits is 0. + /// - Panic if the bit length is greater than 32. pub fn new(bits: u32, bit_len: BitLen) -> Self { - debug_assert!({ - let bit_len: u8 = bit_len.into(); - bit_len <= 32 - }); - debug_assert!({ - let bit_len: u8 = bit_len.into(); - if bit_len < 32 { - bits >> bit_len == 0 - } else { - true + Self::try_new(bits, bit_len).expect("Invalid Key") + } + + /// Create a new Key. + /// + /// Note that we only use the `bit_len` lowest-weight bits. + /// Any other bit MUST BE 0. + pub fn try_new(bits: u32, bit_len: BitLen) -> Result { + // May the value fit in a `Key`? + if bit_len.as_u8() > 32 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "bitlength exceeds Key capacity", + )); + } + // Are the heavy-weight bits 0s, as expected? 
+ if bit_len.as_u8() < 32 { + if bits >> bit_len != 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid Key content", + )); } - }); - Key(BitSequence { bits, bit_len }) + } + Ok(Key(BitSequence { bits, bit_len })) } pub fn from_bit_sequence(sequence: BitSequence) -> Self { From b74b3fe5470e1c8a58c7087e4d5d2e491ed9679b Mon Sep 17 00:00:00 2001 From: David Teller Date: Fri, 4 Oct 2019 16:14:55 +0200 Subject: [PATCH 7/7] Cleaning up around BitSequence, BitLen >> --- crates/binjs_io/src/context/huffman/mod.rs | 75 +++++++++------------ crates/binjs_io/src/context/huffman/read.rs | 4 +- 2 files changed, 32 insertions(+), 47 deletions(-) diff --git a/crates/binjs_io/src/context/huffman/mod.rs b/crates/binjs_io/src/context/huffman/mod.rs index a0462427b..282148b51 100644 --- a/crates/binjs_io/src/context/huffman/mod.rs +++ b/crates/binjs_io/src/context/huffman/mod.rs @@ -64,13 +64,19 @@ impl std::ops::Shl for usize { impl std::ops::Shr for u32 { type Output = u32; fn shr(self, rhs: BitLen) -> u32 { - self >> Into::::into(rhs) + if rhs.as_u8() == 32 { + return 0; + } + self >> rhs.as_u8() } } impl std::ops::Shr for usize { type Output = usize; fn shr(self, rhs: BitLen) -> usize { - self >> Into::::into(rhs) + if rhs.as_u8() == 32 { + return 0; + } + self >> rhs.as_u8() } } @@ -89,6 +95,8 @@ pub struct BitSequence { } impl BitSequence { pub fn new(bits: u32, bit_len: BitLen) -> Self { + assert!(bit_len.as_u8() <= 32); + assert_eq!(bits >> bit_len, 0); Self { bits, bit_len } } @@ -106,20 +114,15 @@ impl BitSequence { /// /// If `bit_len` is larger than the number of bits, the prefix is padded with /// lower-weight bits into `bit_len` bits. 
- pub fn split_bits(&self, bit_len: BitLen) -> (u32, u32) { + pub fn split_raw_bits(&self, bit_len: BitLen) -> (u32, u32) { debug_assert!(bit_len.as_u8() <= 32); if self.bit_len <= bit_len { let padding = bit_len - self.bit_len; (self.bits << padding, 0) } else { - let shift = self.bit_len - bit_len; - match shift.into() { - 32u8 => (0, self.bits), // Special case: cannot >> 32 - shift => ( - self.bits >> shift, - self.bits & (std::u32::MAX >> 32 - shift), - ), - } + let shift: BitLen = self.bit_len - bit_len; + let co_shift: BitLen = BitLen::new(32) - shift; + (self.bits >> shift, self.bits & (std::u32::MAX >> co_shift)) } } @@ -130,7 +133,7 @@ impl BitSequence { /// /// This function panics if `bit_len > self.bit_len`. pub fn split(&self, bit_len: BitLen) -> (BitSequence, BitSequence) { - let (prefix, suffix) = self.split_bits(bit_len); + let (prefix, suffix) = self.split_raw_bits(bit_len); ( BitSequence::new(prefix, bit_len), BitSequence::new( @@ -149,25 +152,14 @@ impl BitSequence { /// /// Does nothing if the bit sequence already has a sufficient bitlength. pub fn pad_lowest_to(&self, total_bit_len: BitLen) -> Cow { - assert!(total_bit_len.0 <= 32u8); + assert!(total_bit_len.as_u8() <= 32); if total_bit_len <= self.bit_len { return Cow::Borrowed(self); } - let shift = total_bit_len - self.bit_len; - if shift.0 == 32u8 { - return Cow::Owned(BitSequence::new(0, BitLen(32))); - } + let shift: BitLen = total_bit_len - self.bit_len; Cow::Owned(BitSequence::new(self.bits << shift, total_bit_len)) } - /// Prepend a sequence of bits to a sequencce.s - pub fn with_prefix(&self, prefix: &BitSequence) -> Self { - assert!((prefix.bit_len() + self.bit_len()).as_u8() <= 32); - let bits = self.bits | (prefix.bits() << self.bit_len); - let bit_len = self.bit_len + prefix.bit_len; - BitSequence::new(bits, bit_len) - } - /// Return a range representing all possible suffixes of this `BitSequence` /// containing exactly `bit_len` bits. 
/// @@ -215,18 +207,15 @@ impl BitSequence { let (first, last) = if bit_len <= self.bit_len() { // We have too many bits, we need to truncate the bits, // then return a single element. - let shearing: u8 = (self.bit_len() - bit_len).as_u8(); - let first = if shearing == 32 { - 0 - } else { - self.bits() >> shearing - }; + let shearing: BitLen = self.bit_len() - bit_len; + let first = self.bits() >> shearing; (first, first) } else { // We need to pad with lower-weight 0s. - let padding: u8 = (bit_len - self.bit_len()).as_u8(); + let padding: BitLen = bit_len - self.bit_len(); + let co_padding = BitLen::new(32) - padding; let first = self.bits() << padding; - let len = std::u32::MAX >> (8 * std::mem::size_of::() as u8 - padding); + let len = std::u32::MAX >> co_padding; (first, first + len) }; first..(last + 1) @@ -237,15 +226,15 @@ impl BitSequence { fn test_bit_sequence_split() { let bits = 0b11111111_11111111_00000000_00000000; let key = BitSequence::new(bits, BitLen(32)); - assert_eq!(key.split_bits(BitLen(0)), (0, bits)); - assert_eq!(key.split_bits(BitLen(32)), (bits, 0)); - assert_eq!(key.split_bits(BitLen(16)), (0b11111111_11111111, 0)); + assert_eq!(key.split_raw_bits(BitLen(0)), (0, bits)); + assert_eq!(key.split_raw_bits(BitLen(32)), (bits, 0)); + assert_eq!(key.split_raw_bits(BitLen(16)), (0b11111111_11111111, 0)); let bits = 0b00000000_00000000_00000000_11111111; let key = BitSequence::new(bits, BitLen(16)); - assert_eq!(key.split_bits(BitLen(0)), (0, bits)); - assert_eq!(key.split_bits(BitLen(16)), (bits, 0)); - assert_eq!(key.split_bits(BitLen(8)), (0, 0b11111111)); + assert_eq!(key.split_raw_bits(BitLen(0)), (0, bits)); + assert_eq!(key.split_raw_bits(BitLen(16)), (bits, 0)); + assert_eq!(key.split_raw_bits(BitLen(8)), (0, 0b11111111)); } /// A Huffman key @@ -260,8 +249,9 @@ impl Key { /// # Failure /// /// - Panic if any bit other than the `bit_len` lowest-weight bits is 0. - /// - Panic if the bit length is greater than 32. 
+ /// - Panic if the bit length is greater than 20. pub fn new(bits: u32, bit_len: BitLen) -> Self { + assert!(bit_len <= BitLen::new(20)); Self::try_new(bits, bit_len).expect("Invalid Key") } @@ -310,11 +300,6 @@ impl Key { pub fn as_bit_sequence(&self) -> &BitSequence { &self.0 } - - pub fn with_prefix(&self, prefix: &BitSequence) -> Self { - let sequence = self.0.with_prefix(prefix); - Key::from_bit_sequence(sequence) - } } /// A node in the Huffman tree. diff --git a/crates/binjs_io/src/context/huffman/read.rs b/crates/binjs_io/src/context/huffman/read.rs index 0fafd3920..2320a10fe 100644 --- a/crates/binjs_io/src/context/huffman/read.rs +++ b/crates/binjs_io/src/context/huffman/read.rs @@ -221,7 +221,7 @@ where /// Constant-time lookup. fn lookup(&self, key: &BitSequence) -> Option> { assert!(key.bit_len() >= self.highest_bit_len()); - let (prefix, _) = key.split_bits(self.highest_bit_len()); + let (prefix, _) = key.split_raw_bits(self.highest_bit_len()); let value_index = self.saturated[prefix as usize].clone(); let value_index: usize = value_index .try_into() @@ -452,7 +452,7 @@ where assert!(key.bit_len() >= self.highest_bit_len()); // Find in which `SingleLookupHuffmanTable` to look for the entry. - let (prefix, suffix) = key.split_bits(self.prefix_len); + let (prefix, suffix) = key.split_raw_bits(self.prefix_len); let ref table = self.by_prefix.get(prefix as usize)?; // Now lookup in second table.