ruzstd/huff0/huff0_decoder.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589
//! Utilities for decoding Huff0 encoded huffman data.
use crate::decoding::bit_reader_reverse::{BitReaderReversed, GetBitsError};
use crate::fse::{FSEDecoder, FSEDecoderError, FSETable, FSETableError};
use alloc::vec::Vec;
#[cfg(feature = "std")]
use std::error::Error as StdError;
/// Decoding table for Huff0-compressed data.
///
/// The table is populated by `build_decoder`, which reads a list of symbol
/// weights from the input and expands them into the `decode` lookup table
/// that a `HuffmanDecoder` indexes with its state.
pub struct HuffmanTable {
/// Lookup table with `1 << max_num_bits` entries. Each entry holds the
/// decoded symbol and the length of its prefix code.
decode: Vec<Entry>,
/// The weight of each symbol, as read from the table description.
/// A weight of 0 means the symbol gets no code; a larger weight results
/// in a shorter prefix code when the table is built.
weights: Vec<u8>,
/// The maximum size in bits a prefix code in the encoded data can be.
/// This value is used so that the decoder knows how many bits
/// to read from the bitstream before checking the table. This
/// value must be 11 or lower.
pub max_num_bits: u8,
/// Code length in bits for each symbol. Holds one more entry than
/// `weights`, because the weight of the last symbol is not stored but
/// derived from the sum of the others.
bits: Vec<u8>,
/// Number of symbols per code length, indexed by the code length
/// (index 0 counts the symbols that have no code).
bit_ranks: Vec<u32>,
/// For each code length, the next index in `decode` that a code of that
/// length will occupy. Only meaningful while the table is being built.
rank_indexes: Vec<usize>,
/// In some cases, the list of weights is compressed using FSE compression.
/// This is the FSE table used to decompress it.
fse_table: FSETable,
}
/// Errors that can occur while reading a Huffman table description and
/// building the decoding table from it.
#[derive(Debug)]
#[non_exhaustive]
pub enum HuffmanTableError {
/// An error reported by the bit reader.
GetBitsError(GetBitsError),
/// An error reported by the FSE decoder used for FSE-compressed weights.
FSEDecoderError(FSEDecoderError),
/// An error reported while building the FSE table for the weights.
FSETableError(FSETableError),
/// The source slice was empty, so not even the header byte could be read.
SourceIsEmpty,
/// The header announced more bytes of FSE-compressed weights than the
/// source contains after the header byte.
NotEnoughBytesForWeights {
got_bytes: usize,
expected_bytes: u8,
},
/// No `1` marker bit was found within the first 8 bits read from the
/// end of the weights bitstream.
ExtraPadding {
skipped_bits: i32,
},
/// More than 255 weights were decoded from the FSE-compressed stream.
TooManyWeights {
got: usize,
},
/// All weights were zero, so no table can be built.
MissingWeights,
/// The gap between the sum of the weight values and the next power of
/// two is not itself a power of two, so the last weight cannot be derived.
LeftoverIsNotAPowerOf2 {
got: u32,
},
/// Fewer bytes remain after the FSE table description than the header
/// says the compressed weights occupy.
NotEnoughBytesToDecompressWeights {
have: usize,
need: usize,
},
/// The FSE table description alone consumed more bytes than the header
/// allotted for the whole compressed weight stream.
FSETableUsedTooManyBytes {
used: usize,
available_bytes: u8,
},
/// The source is too short to hold the directly encoded (4-bit) weights
/// announced by the header.
NotEnoughBytesInSource {
got: usize,
need: usize,
},
/// A weight was larger than the maximum allowed code length.
WeightBiggerThanMaxNumBits {
got: u8,
},
/// The maximum code length derived from the weights exceeds the limit.
MaxBitsTooHigh {
got: u8,
},
}
#[cfg(feature = "std")]
impl StdError for HuffmanTableError {
    /// Expose the wrapped lower-level error, if this variant carries one.
    fn source(&self) -> Option<&(dyn StdError + 'static)> {
        let inner: &(dyn StdError + 'static) = match self {
            Self::GetBitsError(cause) => cause,
            Self::FSEDecoderError(cause) => cause,
            Self::FSETableError(cause) => cause,
            // Every other variant originates in this module and has no cause.
            _ => return None,
        };
        Some(inner)
    }
}
impl core::fmt::Display for HuffmanTableError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> ::core::fmt::Result {
match self {
HuffmanTableError::GetBitsError(e) => write!(f, "{:?}", e),
HuffmanTableError::FSEDecoderError(e) => write!(f, "{:?}", e),
HuffmanTableError::FSETableError(e) => write!(f, "{:?}", e),
HuffmanTableError::SourceIsEmpty => write!(f, "Source needs to have at least one byte"),
HuffmanTableError::NotEnoughBytesForWeights {
got_bytes,
expected_bytes,
} => {
write!(f, "Header says there should be {} bytes for the weights but there are only {} bytes in the stream",
expected_bytes,
got_bytes)
}
HuffmanTableError::ExtraPadding { skipped_bits } => {
write!(f,
"Padding at the end of the sequence_section was more than a byte long: {} bits. Probably caused by data corruption",
skipped_bits,
)
}
HuffmanTableError::TooManyWeights { got } => {
write!(
f,
"More than 255 weights decoded (got {} weights). Stream is probably corrupted",
got,
)
}
HuffmanTableError::MissingWeights => {
write!(f, "Can\'t build huffman table without any weights")
}
HuffmanTableError::LeftoverIsNotAPowerOf2 { got } => {
write!(f, "Leftover must be power of two but is: {}", got)
}
HuffmanTableError::NotEnoughBytesToDecompressWeights { have, need } => {
write!(
f,
"Not enough bytes in stream to decompress weights. Is: {}, Should be: {}",
have, need,
)
}
HuffmanTableError::FSETableUsedTooManyBytes {
used,
available_bytes,
} => {
write!(f,
"FSE table used more bytes: {} than were meant to be used for the whole stream of huffman weights ({})",
used,
available_bytes,
)
}
HuffmanTableError::NotEnoughBytesInSource { got, need } => {
write!(
f,
"Source needs to have at least {} bytes, got: {}",
need, got,
)
}
HuffmanTableError::WeightBiggerThanMaxNumBits { got } => {
write!(
f,
"Cant have weight: {} bigger than max_num_bits: {}",
got, MAX_MAX_NUM_BITS,
)
}
HuffmanTableError::MaxBitsTooHigh { got } => {
write!(
f,
"max_bits derived from weights is: {} should be lower than: {}",
got, MAX_MAX_NUM_BITS,
)
}
}
}
}
impl From<GetBitsError> for HuffmanTableError {
fn from(val: GetBitsError) -> Self {
Self::GetBitsError(val)
}
}
impl From<FSEDecoderError> for HuffmanTableError {
fn from(val: FSEDecoderError) -> Self {
Self::FSEDecoderError(val)
}
}
impl From<FSETableError> for HuffmanTableError {
fn from(val: FSETableError) -> Self {
Self::FSETableError(val)
}
}
/// An interface around a huffman table used to decode data.
///
/// Typical use: call `init_state` once, then alternate between
/// `decode_symbol` (read the current symbol) and `next_state` (advance).
pub struct HuffmanDecoder<'table> {
/// The table symbols are looked up in.
table: &'table HuffmanTable,
/// State is used to index into the table. It holds the most recently read
/// bits of the stream; `next_state` masks it to the size of the table.
pub state: u64,
}
/// Errors that can occur while decoding with a `HuffmanDecoder`.
#[derive(Debug)]
#[non_exhaustive]
pub enum HuffmanDecoderError {
/// An error reported by the bit reader.
GetBitsError(GetBitsError),
}
impl core::fmt::Display for HuffmanDecoderError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
HuffmanDecoderError::GetBitsError(e) => write!(f, "{:?}", e),
}
}
}
#[cfg(feature = "std")]
impl StdError for HuffmanDecoderError {
    /// Expose the wrapped bit-reader error as the cause.
    fn source(&self) -> Option<&(dyn StdError + 'static)> {
        // The enum has a single variant, so the pattern is irrefutable here.
        let Self::GetBitsError(cause) = self;
        Some(cause)
    }
}
impl From<GetBitsError> for HuffmanDecoderError {
fn from(val: GetBitsError) -> Self {
Self::GetBitsError(val)
}
}
/// A single entry in the table contains the decoded symbol/literal and the
/// size of the prefix code.
///
/// A code shorter than the table's maximum code length occupies several
/// consecutive entries, all holding the same symbol and length.
#[derive(Copy, Clone)]
pub struct Entry {
/// The byte that the prefix code replaces during encoding.
symbol: u8,
/// The number of bits the prefix code occupies. The decoder consumes this
/// many bits from the stream when it advances past the symbol.
num_bits: u8,
}
/// The Zstandard specification limits the maximum length of a code to 11 bits.
///
/// Table construction rejects any single weight, and any maximum code length
/// derived from the weights, that is greater than this value.
const MAX_MAX_NUM_BITS: u8 = 11;
/// Return the 1-based position of the most significant set bit of `x`,
/// i.e. `32 - x.leading_zeros()`.
///
/// # Panics
/// Panics if `x` is zero, since no bit is set in that case.
fn highest_bit_set(x: u32) -> u32 {
    assert!(x > 0);
    let unused_high_bits = x.leading_zeros();
    u32::BITS - unused_high_bits
}
impl<'t> HuffmanDecoder<'t> {
/// Create a new decoder with the provided table
pub fn new(table: &'t HuffmanTable) -> HuffmanDecoder<'t> {
HuffmanDecoder { table, state: 0 }
}
/// Re-initialize the decoder, using the new table if one is provided.
/// This might used for treeless blocks, because they re-use the table from old
/// data.
pub fn reset(mut self, new_table: Option<&'t HuffmanTable>) {
self.state = 0;
if let Some(next_table) = new_table {
self.table = next_table;
}
}
/// Decode the symbol the internal state (cursor) is pointed at and return the
/// decoded literal.
pub fn decode_symbol(&mut self) -> u8 {
self.table.decode[self.state as usize].symbol
}
/// Initialize internal state and prepare to decode data. Then, `decode_symbol` can be called
/// to read the byte the internal cursor is pointing at, and `next_state` can be called to advance
/// the cursor until the max number of bits has been read.
pub fn init_state(&mut self, br: &mut BitReaderReversed<'_>) -> u8 {
let num_bits = self.table.max_num_bits;
let new_bits = br.get_bits(num_bits);
self.state = new_bits;
num_bits
}
/// Advance the internal cursor to the next symbol. After this, you can call `decode_symbol`
/// to read from the new position.
pub fn next_state(&mut self, br: &mut BitReaderReversed<'_>) -> u8 {
// self.state stores a small section, or a window of the bit stream. The table can be indexed via this state,
// telling you how many bits identify the current symbol.
let num_bits = self.table.decode[self.state as usize].num_bits;
// New bits are read from the stream
let new_bits = br.get_bits(num_bits);
// Shift and mask out the bits that identify the current symbol
self.state <<= num_bits;
self.state &= self.table.decode.len() as u64 - 1;
// The new bits are appended at the end of the current state.
self.state |= new_bits;
num_bits
}
}
impl Default for HuffmanTable {
fn default() -> Self {
Self::new()
}
}
impl HuffmanTable {
/// Create a new, empty table.
pub fn new() -> HuffmanTable {
HuffmanTable {
decode: Vec::new(),
weights: Vec::with_capacity(256),
max_num_bits: 0,
bits: Vec::with_capacity(256),
bit_ranks: Vec::with_capacity(11),
rank_indexes: Vec::with_capacity(11),
fse_table: FSETable::new(100),
}
}
/// Completely empty the table then repopulate as a replica
/// of `other`.
///
/// Existing allocations are reused where possible: every vector is cleared
/// and then refilled from the corresponding vector of `other`.
pub fn reinit_from(&mut self, other: &Self) {
    self.reset();
    self.decode.extend_from_slice(&other.decode);
    self.weights.extend_from_slice(&other.weights);
    self.max_num_bits = other.max_num_bits;
    self.bits.extend_from_slice(&other.bits);
    // `bit_ranks` is copied as well so that the result is a full replica
    // rather than a table with one field left empty.
    self.bit_ranks.extend_from_slice(&other.bit_ranks);
    self.rank_indexes.extend_from_slice(&other.rank_indexes);
    self.fse_table.reinit_from(&other.fse_table);
}
/// Discard everything stored in the table.
///
/// All vectors are cleared (their allocations are kept), the FSE table is
/// reset and `max_num_bits` goes back to 0.
pub fn reset(&mut self) {
    self.max_num_bits = 0;
    self.fse_table.reset();
    self.weights.clear();
    self.bits.clear();
    self.bit_ranks.clear();
    self.rank_indexes.clear();
    self.decode.clear();
}
/// Parse a huffman table description from the start of `source` and build
/// the decoding table from it.
///
/// Returns the number of bytes of `source` that the description occupied.
///
/// # Errors
/// Returns a [`HuffmanTableError`] if the weights cannot be read or do not
/// describe a valid table.
pub fn build_decoder(&mut self, source: &[u8]) -> Result<u32, HuffmanTableError> {
    self.decode.clear();
    let consumed = self.read_weights(source)?;
    self.build_table_from_weights().map(|()| consumed)
}
/// Read weights from the provided source.
///
/// The huffman table is represented in the encoded data as a list of weights
/// at the most basic level. After the header, weights are read, then the table
/// can be built using that list of weights.
///
/// The first byte of `source` is a header that selects the encoding:
/// - `0..=127`: the weights are FSE-compressed and the header is the number
///   of bytes (FSE table description plus bitstream) that follow it.
/// - `128..=255`: `header - 127` weights follow directly as 4-bit values,
///   two per byte, high nibble first.
///
/// Returns the number of bytes read, including the header byte.
///
/// # Errors
/// Returns a [`HuffmanTableError`] if `source` is empty or too short, if the
/// FSE table or bitstream is malformed, or if more than 255 weights are
/// decoded.
fn read_weights(&mut self, source: &[u8]) -> Result<u32, HuffmanTableError> {
    use HuffmanTableError as err;

    if source.is_empty() {
        return Err(err::SourceIsEmpty);
    }
    let header = source[0];
    let mut bits_read = 8;

    match header {
        // If the header byte is less than 128, the series of weights
        // is compressed using two interleaved FSE streams that share
        // a distribution table.
        0..=127 => {
            let fse_stream = &source[1..];
            if header as usize > fse_stream.len() {
                return Err(err::NotEnoughBytesForWeights {
                    got_bytes: fse_stream.len(),
                    expected_bytes: header,
                });
            }
            // FSE-decompress the weights.
            let bytes_used_by_fse_header = self
                .fse_table
                .build_decoder(fse_stream, /*TODO find actual max*/ 100)?;

            if bytes_used_by_fse_header > header as usize {
                return Err(err::FSETableUsedTooManyBytes {
                    used: bytes_used_by_fse_header,
                    available_bytes: header,
                });
            }

            vprintln!(
                "Building fse table for huffman weights used: {}",
                bytes_used_by_fse_header
            );
            // Huffman headers are compressed using two interleaved
            // FSE bitstreams, where the first state (decoder) handles
            // even symbols, and the second handles odd symbols.
            let mut dec1 = FSEDecoder::new(&self.fse_table);
            let mut dec2 = FSEDecoder::new(&self.fse_table);

            let compressed_start = bytes_used_by_fse_header;
            let compressed_length = header as usize - bytes_used_by_fse_header;

            let compressed_weights = &fse_stream[compressed_start..];
            if compressed_weights.len() < compressed_length {
                return Err(err::NotEnoughBytesToDecompressWeights {
                    have: compressed_weights.len(),
                    need: compressed_length,
                });
            }
            let compressed_weights = &compressed_weights[..compressed_length];
            let mut br = BitReaderReversed::new(compressed_weights);

            bits_read += (bytes_used_by_fse_header + compressed_length) * 8;

            // Skip the 0 padding at the end of the last byte of the bit
            // stream and throw away the first 1 found.
            let mut skipped_bits = 0;
            loop {
                let val = br.get_bits(1);
                skipped_bits += 1;
                if val == 1 || skipped_bits > 8 {
                    break;
                }
            }
            if skipped_bits > 8 {
                // If more than 7 bits are 0, this is not the correct end of
                // the bitstream. Either a bug or corrupted data.
                return Err(err::ExtraPadding { skipped_bits });
            }

            dec1.init_state(&mut br)?;
            dec2.init_state(&mut br)?;

            self.weights.clear();

            // The two decoders take turns decoding a single symbol and updating their state.
            loop {
                let w = dec1.decode_symbol();
                self.weights.push(w);
                dec1.update_state(&mut br);

                if br.bits_remaining() <= -1 {
                    // Collect the final state of the other decoder.
                    self.weights.push(dec2.decode_symbol());
                    break;
                }

                let w = dec2.decode_symbol();
                self.weights.push(w);
                dec2.update_state(&mut br);

                if br.bits_remaining() <= -1 {
                    // Collect the final state of the other decoder.
                    self.weights.push(dec1.decode_symbol());
                    break;
                }
                // Maximum number of weights is 255 because we use u8 symbols
                // and the last weight is inferred from the sum of all others.
                // Checking here bounds the growth of `weights` on corrupted
                // streams.
                if self.weights.len() > 255 {
                    return Err(err::TooManyWeights {
                        got: self.weights.len(),
                    });
                }
            }

            // Both `break`s above leave the loop before the in-loop limit
            // check runs, so the final pushes could take the count to 256.
            // Enforce the limit on that path too: with the inferred last
            // weight, 256 stored weights would describe 257 symbols, which
            // cannot be represented as `u8` symbols.
            if self.weights.len() > 255 {
                return Err(err::TooManyWeights {
                    got: self.weights.len(),
                });
            }
        }
        // If the header byte is greater than or equal to 128,
        // weights are directly represented, where each weight is
        // encoded directly as a 4 bit field. The weights will
        // always be encoded with full bytes, meaning if there's
        // an odd number of weights, the last weight will still
        // occupy a full byte.
        _ => {
            let weights_raw = &source[1..];
            // header is in 128..=255 here, so this is in 1..=128.
            let num_weights = usize::from(header - 127);
            // Two weights per byte, rounded up.
            let bytes_needed = (num_weights + 1) / 2;

            // Validate the input before touching `self.weights`, so a
            // failed read does not leave partially resized state behind.
            if weights_raw.len() < bytes_needed {
                return Err(err::NotEnoughBytesInSource {
                    got: weights_raw.len(),
                    need: bytes_needed,
                });
            }

            self.weights.clear();
            self.weights.extend(
                weights_raw[..bytes_needed]
                    .iter()
                    // High nibble first, then low nibble.
                    .flat_map(|&byte| [byte >> 4, byte & 0xF])
                    // Drop the padding nibble when the count is odd.
                    .take(num_weights),
            );
            bits_read += 4 * num_weights;
        }
    }

    // Round up to whole bytes: an odd number of direct weights still
    // occupies a full final byte.
    let bytes_read = if bits_read % 8 == 0 {
        bits_read / 8
    } else {
        (bits_read / 8) + 1
    };
    Ok(bytes_read as u32)
}
/// Once the weights have been read from the data, you can decode the weights
/// into a table, and use that table to decode the actual compressed data.
///
/// This function populates the rest of the table from the series of weights:
/// it derives the code length of every symbol (including the last symbol,
/// whose weight is not stored), and fills `decode` so that every possible
/// `max_num_bits`-wide bit pattern maps to its symbol and code length.
///
/// # Errors
/// Returns a [`HuffmanTableError`] if a weight exceeds the maximum code
/// length, if all weights are zero, if the last weight cannot be derived, or
/// if the resulting maximum code length is too large. `max_num_bits` is only
/// updated once the new value has been validated.
///
/// # Panics
/// Panics if the computed code ranges do not exactly cover the table, which
/// would indicate a bug in the construction.
fn build_table_from_weights(&mut self) -> Result<(), HuffmanTableError> {
    use HuffmanTableError as err;

    // One slot per stored weight plus one for the inferred last symbol.
    self.bits.clear();
    self.bits.resize(self.weights.len() + 1, 0);

    // A weight `w > 0` contributes `2^(w - 1)`; weight 0 contributes nothing.
    let mut weight_sum: u32 = 0;
    for &weight in &self.weights {
        if weight > MAX_MAX_NUM_BITS {
            return Err(err::WeightBiggerThanMaxNumBits { got: weight });
        }
        if weight > 0 {
            weight_sum += 1_u32 << (weight - 1);
        }
    }
    if weight_sum == 0 {
        return Err(err::MissingWeights);
    }

    // The complete sum (including the last symbol) must be the next power
    // of two above `weight_sum`; the gap is the last symbol's contribution.
    let max_bits = highest_bit_set(weight_sum) as u8;
    let left_over = (1 << max_bits) - weight_sum;
    if !left_over.is_power_of_two() {
        return Err(err::LeftoverIsNotAPowerOf2 { got: left_over });
    }
    let last_weight = highest_bit_set(left_over) as u8;

    // Convert weights to code lengths; `bits` has one extra trailing slot,
    // which the zip leaves for the inferred symbol.
    for (code_len, &weight) in self.bits.iter_mut().zip(&self.weights) {
        *code_len = if weight > 0 { max_bits + 1 - weight } else { 0 };
    }
    self.bits[self.weights.len()] = max_bits + 1 - last_weight;

    // Validate before publishing, so a failed build never leaves an
    // out-of-range `max_num_bits` in the table.
    if max_bits > MAX_MAX_NUM_BITS {
        return Err(err::MaxBitsTooHigh { got: max_bits });
    }
    self.max_num_bits = max_bits;

    // Count how many symbols use each code length.
    self.bit_ranks.clear();
    self.bit_ranks.resize((max_bits + 1) as usize, 0);
    for &code_len in &self.bits {
        self.bit_ranks[code_len as usize] += 1;
    }

    // Fill with dummy symbols; every entry is overwritten below.
    self.decode.resize(
        1 << max_bits,
        Entry {
            symbol: 0,
            num_bits: 0,
        },
    );

    // Starting index for each code length: the longest codes start at 0
    // and each shorter length starts where the next longer one ends.
    self.rank_indexes.clear();
    self.rank_indexes.resize((max_bits + 1) as usize, 0);
    let widest = usize::from(max_bits);
    for code_len in (1..=widest).rev() {
        // A code of this length covers this many table entries.
        let span = 1_usize << (widest - code_len);
        self.rank_indexes[code_len - 1] =
            self.rank_indexes[code_len] + self.bit_ranks[code_len] as usize * span;
    }

    assert!(
        self.rank_indexes[0] == self.decode.len(),
        "rank_idx[0]: {} should be: {}",
        self.rank_indexes[0],
        self.decode.len()
    );

    for (symbol, &code_len) in self.bits.iter().enumerate() {
        if code_len == 0 {
            // Symbol does not occur and gets no code.
            continue;
        }
        // Allocate the code for the symbol: a code ignores the trailing
        // `max_bits - code_len` bits, so it gets a range that spans all
        // of those in the decoding table.
        let base_idx = self.rank_indexes[code_len as usize];
        let len = 1_usize << (max_bits - code_len);
        self.rank_indexes[code_len as usize] += len;
        // NOTE(review): `symbol as u8` assumes at most 256 symbols, i.e. at
        // most 255 stored weights; verify the weight reader guarantees this
        // on every exit path.
        let entry = Entry {
            symbol: symbol as u8,
            num_bits: code_len,
        };
        self.decode[base_idx..base_idx + len].fill(entry);
    }
    Ok(())
}
}