@@ -10,6 +10,7 @@ pub mod read;
1010
1111/// A newtype for `u8` used to count the length of a key in bits.
1212#[ derive(
13+ Constructor ,
1314 Debug ,
1415 Default ,
1516 Display ,
@@ -82,30 +83,63 @@ impl BitSequence {
8283 pub fn new ( bits : u32 , bit_len : BitLen ) -> Self {
8384 Self { bits, bit_len }
8485 }
86+
8587 pub fn bits ( & self ) -> u32 {
8688 self . bits
8789 }
90+
8891 /// The number of bits of `bits` to use.
8992 pub fn bit_len ( & self ) -> BitLen {
9093 self . bit_len
9194 }
95+
96+ /// Split the bits into a prefix of `bit_len` bits and a suffix containing the
97+ /// remaining bits.
98+ ///
99+ /// If `bit_len` is larger than the number of bits, the prefix is padded with
100+ /// lower-weight bits into `bit_len` bits.
101+ pub fn split_bits ( & self , bit_len : BitLen ) -> ( u32 , u32 ) {
102+ debug_assert ! ( bit_len. as_u8( ) <= 32 ) ;
103+ if self . bit_len <= bit_len {
104+ let padding = bit_len - self . bit_len ;
105+ ( self . bits << padding, 0 )
106+ } else {
107+ let shift = self . bit_len - bit_len;
108+ match shift. into ( ) {
109+ 32u8 => ( 0 , self . bits ) , // Special case: cannot >> 32
110+ shift => (
111+ self . bits >> shift,
112+ self . bits & ( std:: u32:: MAX >> 32 - shift) ,
113+ ) ,
114+ }
115+ }
116+ }
117+
92118 /// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
93119 /// bits.
94120 ///
95121 /// # Failure
96122 ///
97123 /// This function panics if `bit_len > self.bit_len`.
98- pub fn split ( & self , bit_len : BitLen ) -> ( u32 , u32 ) {
99- let shift = self . bit_len - bit_len;
100- match shift. into ( ) {
101- 0u8 => ( self . bits , 0 ) , // Special case: cannot >> 32
102- 32u8 => ( 0 , self . bits ) , // Special case: cannot >> 32
103- shift => (
104- self . bits >> shift,
105- self . bits & ( std:: u32:: MAX >> 32 - shift) ,
124+ pub fn split ( & self , bit_len : BitLen ) -> ( BitSequence , BitSequence ) {
125+ let ( prefix, suffix) = self . split_bits ( bit_len) ;
126+ (
127+ BitSequence :: new ( prefix, bit_len) ,
128+ BitSequence :: new (
129+ suffix,
130+ if self . bit_len >= bit_len {
131+ self . bit_len - bit_len
132+ } else {
133+ BitLen :: new ( 0 )
134+ } ,
106135 ) ,
107- }
136+ )
108137 }
138+
139+ /// Add lowest-weight bits to this bit sequence until it reaches
140+ /// a sufficient bit length.
141+ ///
142+ /// Does nothing if the bit sequence already has a sufficient bitlength.
109143 pub fn pad_lowest_to ( & self , total_bit_len : BitLen ) -> Cow < BitSequence > {
110144 assert ! ( total_bit_len. 0 <= 32u8 ) ;
111145 if total_bit_len <= self . bit_len {
@@ -117,21 +151,93 @@ impl BitSequence {
117151 }
118152 Cow :: Owned ( BitSequence :: new ( self . bits << shift, total_bit_len) )
119153 }
154+
155+ /// Prepend a sequence of bits to a sequencce.s
156+ pub fn with_prefix ( & self , prefix : & BitSequence ) -> Self {
157+ assert ! ( ( prefix. bit_len( ) + self . bit_len( ) ) . as_u8( ) <= 32 ) ;
158+ let bits = self . bits | ( prefix. bits ( ) << self . bit_len ) ;
159+ let bit_len = self . bit_len + prefix. bit_len ;
160+ BitSequence :: new ( bits, bit_len)
161+ }
162+
163+ /// Return a range representing all possible suffixes of this `BitSequence`
164+ /// containing exactly `bit_len` bits.
165+ ///
166+ /// If this `BitSequence` is already at least `bit_len` bits long, we
167+ /// truncate the `BitSequence` to `bit_len` bits by removing the
168+ /// lower-weight bits and there is only one such suffix.
169+ ///
170+ /// ```
171+ /// use binjs_io::context::huffman::{ BitLen, BitSequence };
172+ ///
173+ /// let zero = BitSequence::new(0, BitLen::new(0));
174+ ///
175+ /// let range = zero.suffixes(BitLen::new(0));
176+ /// assert_eq!(range, 0..1);
177+ ///
178+ /// let range = zero.suffixes(BitLen::new(2));
179+ /// assert_eq!(range, 0..4);
180+ ///
181+ /// let range = zero.suffixes(BitLen::new(3));
182+ /// assert_eq!(range, 0..8);
183+ ///
184+ /// let range = zero.suffixes(BitLen::new(4));
185+ /// assert_eq!(range, 0..16);
186+ ///
187+ /// let sequence = BitSequence::new(0b00000100, BitLen::new(3));
188+ ///
189+ /// let range = sequence.suffixes(BitLen::new(0));
190+ /// assert_eq!(range, 0..1);
191+ ///
192+ /// let range = sequence.suffixes(BitLen::new(2));
193+ /// assert_eq!(range, 2..3);
194+ ///
195+ /// let range = sequence.suffixes(BitLen::new(3));
196+ /// assert_eq!(range, 4..5);
197+ ///
198+ /// let range = sequence.suffixes(BitLen::new(4));
199+ /// assert_eq!(range, 8..10); // 0b000001000 to 0b00001001 included
200+ /// ```
201+ pub fn suffixes ( & self , bit_len : BitLen ) -> std:: ops:: Range < u32 > {
202+ debug_assert ! ( bit_len. as_u8( ) as usize <= 8 * std:: mem:: size_of_val( & self . bits( ) ) ) ;
203+ debug_assert ! (
204+ std:: mem:: size_of_val( & self . bits( ) ) == std:: mem:: size_of:: <u32 >( ) ,
205+ "The arithmetics relies upon the fact that we're only using `u32` for Huffman keys"
206+ ) ;
207+ let ( first, last) = if bit_len <= self . bit_len ( ) {
208+ // We have too many bits, we need to truncate the bits,
209+ // then return a single element.
210+ let shearing: u8 = ( self . bit_len ( ) - bit_len) . as_u8 ( ) ;
211+ let first = if shearing == 32 {
212+ 0
213+ } else {
214+ self . bits ( ) >> shearing
215+ } ;
216+ ( first, first)
217+ } else {
218+ // We need to pad with lower-weight 0s.
219+ let padding: u8 = ( bit_len - self . bit_len ( ) ) . as_u8 ( ) ;
220+ let first = self . bits ( ) << padding;
221+ let len = std:: u32:: MAX >> ( 8 * std:: mem:: size_of :: < u32 > ( ) as u8 - padding) ;
222+ ( first, first + len)
223+ } ;
224+ first..( last + 1 )
225+ }
120226}
121227
#[test]
fn test_bit_sequence_split() {
    // A full 32-bit sequence whose 16 highest-weight bits are set.
    let high_half = 0b11111111_11111111_00000000_00000000;
    let wide = BitSequence::new(high_half, BitLen(32));
    for (prefix_len, expected) in vec![
        (0, (0, high_half)),
        (32, (high_half, 0)),
        (16, (0b11111111_11111111, 0)),
    ] {
        assert_eq!(wide.split_bits(BitLen(prefix_len)), expected);
    }

    // A 16-bit sequence whose 8 lowest-weight bits are set.
    let low_byte = 0b00000000_00000000_00000000_11111111;
    let narrow = BitSequence::new(low_byte, BitLen(16));
    for (prefix_len, expected) in vec![
        (0, (0, low_byte)),
        (16, (low_byte, 0)),
        (8, (0, 0b11111111)),
    ] {
        assert_eq!(narrow.split_bits(BitLen(prefix_len)), expected);
    }
}
136242
137243/// A Huffman key
@@ -159,6 +265,10 @@ impl Key {
159265 Key ( BitSequence { bits, bit_len } )
160266 }
161267
268+ pub fn from_bit_sequence ( sequence : BitSequence ) -> Self {
269+ Self :: new ( sequence. bits , sequence. bit_len )
270+ }
271+
162272 /// The bits in this Key.
163273 ///
164274 /// # Invariant
@@ -176,6 +286,11 @@ impl Key {
176286 pub fn as_bit_sequence ( & self ) -> & BitSequence {
177287 & self . 0
178288 }
289+
290+ pub fn with_prefix ( & self , prefix : & BitSequence ) -> Self {
291+ let sequence = self . 0 . with_prefix ( prefix) ;
292+ Key :: from_bit_sequence ( sequence)
293+ }
179294}
180295
181296/// A node in the Huffman tree.
@@ -219,43 +334,46 @@ impl<T> PartialEq for Node<T> {
219334}
// `Eq` declares no methods: this empty impl only marks the `PartialEq` impl
// for `Node<T>` as a full equivalence relation.
220335impl < T > Eq for Node < T > { }
221336
222- /// Keys associated to a sequence of values.
337+ /// Codebook associating a `Key` to each value of a sequence of values.
223338#[ derive( Clone , Debug ) ]
224- pub struct Keys < T > {
225- /// The longest bit length that actually appears in `keys `.
339+ pub struct Codebook < T > {
340+ /// The longest bit length that actually appears in `mappings`.
226341 highest_bit_len : BitLen ,
227342
228343 /// The sequence of `(value, key)` mappings.
229344 ///
230345 /// Order is meaningful.
231- keys : Vec < ( T , Key ) > ,
346+ mappings : Vec < ( T , Key ) > ,
232347}
233348
234- impl < T > Keys < T > {
349+ impl < T > Codebook < T > {
350+ /// The number of elements in this Codebook.
235351 pub fn len ( & self ) -> usize {
236- self . keys . len ( )
352+ self . mappings . len ( )
237353 }
354+
355+ /// The longest bit length that acctually appears in this Codebook.
238356 pub fn highest_bit_len ( & self ) -> BitLen {
239357 self . highest_bit_len
240358 }
241359}
242360
243- impl < T > IntoIterator for Keys < T > {
361+ impl < T > IntoIterator for Codebook < T > {
244362 type Item = ( T , Key ) ;
245363 type IntoIter = std:: vec:: IntoIter < ( T , Key ) > ;
246364 fn into_iter ( self ) -> Self :: IntoIter {
247- self . keys . into_iter ( )
365+ self . mappings . into_iter ( )
248366 }
249367}
250368
251- impl < T > Keys < T >
369+ impl < T > Codebook < T >
252370where
253371 T : Ord + Clone ,
254372{
255- /// Compute a `Keys ` from a sequence of values.
373+ /// Compute a `Codebook ` from a sequence of values.
256374 ///
257375 /// Optionally, `max_bit_len` may specify a largest acceptable bit length.
258- /// If `Keys ` may not be computed without exceeding this bit length,
376+ /// If the `Codebook ` may not be computed without exceeding this bit length,
260378 /// fail with `Err(problematic_bit_len)`.
260378 ///
261379 /// The current implementation only attempts to produce the best compression
@@ -278,11 +396,11 @@ where
278396 let counter = map. entry ( item) . or_insert ( 0 . into ( ) ) ;
279397 * counter += 1 . into ( ) ;
280398 }
281- // Then compute the `Keys `.
399+ // Then compute the `Codebook `.
282400 Self :: from_instances ( map, max_bit_len)
283401 }
284402
285- /// Compute a `Keys ` from a sequence of values
403+ /// Compute a `Codebook ` from a sequence of values
286404 /// with a number of instances already attached.
287405 ///
288406 /// The current implementation only attempts to produce the best compression
@@ -305,27 +423,27 @@ where
305423
306424 // The bits associated to the next value.
307425 let mut bits = 0 ;
308- let mut keys = Vec :: with_capacity ( bit_lengths. len ( ) ) ;
426+ let mut mappings = Vec :: with_capacity ( bit_lengths. len ( ) ) ;
309427
310428 for i in 0 ..bit_lengths. len ( ) - 1 {
311429 let ( bit_len, symbol, next_bit_len) = (
312430 bit_lengths[ i] . 1 ,
313431 bit_lengths[ i] . 0 . clone ( ) ,
314432 bit_lengths[ i + 1 ] . 1 ,
315433 ) ;
316- keys . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
434+ mappings . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
317435 bits = ( bits + 1 ) << ( next_bit_len - bit_len) ;
318436 if bit_len > highest_bit_len {
319437 highest_bit_len = bit_len;
320438 }
321439 }
322440 // Handle the last element.
323441 let ( ref symbol, bit_len) = bit_lengths[ bit_lengths. len ( ) - 1 ] ;
324- keys . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
442+ mappings . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
325443
326444 return Ok ( Self {
327445 highest_bit_len,
328- keys ,
446+ mappings ,
329447 } ) ;
330448 }
331449
@@ -412,26 +530,73 @@ where
#[test]
fn test_coded_from_sequence() {
    let sample = "appl";
    let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();

    // Symbol 'p' appears twice, we should see 3 codes.
    assert_eq!(coded.mappings.len(), 3);

    // Check order of symbols.
    let symbols: Vec<char> = coded.mappings.iter().map(|mapping| mapping.0).collect();
    assert_eq!(symbols, vec!['p', 'a', 'l']);

    // Check bit length of symbols.
    let expected_bit_lens: Vec<BitLen> = vec![1.into(), 2.into(), 2.into()];
    let bit_lens: Vec<BitLen> = coded
        .mappings
        .iter()
        .map(|mapping| mapping.1.bit_len())
        .collect();
    assert_eq!(bit_lens, expected_bit_lens);

    // Check code of symbols.
    let codes: Vec<_> = coded
        .mappings
        .iter()
        .map(|mapping| mapping.1.bits())
        .collect();
    assert_eq!(codes, vec![0b00, 0b10, 0b11]);

    // Let's try again with a limit to 1 bit paths.
    assert_eq!(Codebook::from_sequence(sample.chars(), 1).unwrap_err(), 2);
}
556+
557+ impl < T > Codebook < T > {
558+ /// Create an empty Codebook
559+ pub fn new ( ) -> Self {
560+ Self {
561+ highest_bit_len : BitLen :: new ( 0 ) ,
562+ mappings : vec ! [ ] ,
563+ }
564+ }
565+
566+ /// Create an empty Codebook
567+ pub fn with_capacity ( len : usize ) -> Self {
568+ Self {
569+ highest_bit_len : BitLen :: new ( 0 ) ,
570+ mappings : Vec :: with_capacity ( len) ,
571+ }
572+ }
573+
574+ /// Add a mapping to a Codebook.
575+ ///
576+ /// This method does **not** check that the resulting Codebook is correct.
577+ pub unsafe fn add_mapping ( & mut self , value : T , key : Key ) {
578+ if key. bit_len ( ) > self . highest_bit_len {
579+ self . highest_bit_len = key. bit_len ( ) ;
580+ }
581+ self . mappings . push ( ( value, key) ) ;
582+ }
583+
584+ /// Return the mappings of a Codebook.
585+ pub fn mappings ( self ) -> Vec < ( T , Key ) > {
586+ self . mappings
587+ }
588+
589+ pub fn map < F , U > ( self , mut f : F ) -> Codebook < U >
590+ where
591+ F : FnMut ( T ) -> U ,
592+ {
593+ Codebook {
594+ highest_bit_len : self . highest_bit_len ,
595+ mappings : self
596+ . mappings
597+ . into_iter ( )
598+ . map ( |( value, key) | ( f ( value) , key) )
599+ . collect ( ) ,
600+ }
601+ }
437602}
0 commit comments