ruzstd/blocks/sequence_section.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
//! Utilities and representations for the second half of a block, the sequence section.
//! This section copies literals from the literals section into the decompressed output.
pub(crate) const MAX_LITERAL_LENGTH_CODE: u8 = 35;
pub(crate) const MAX_MATCH_LENGTH_CODE: u8 = 52;
pub(crate) const MAX_OFFSET_CODE: u8 = 31;
pub struct SequencesHeader {
pub num_sequences: u32,
pub modes: Option<CompressionModes>,
}
/// A sequence represents potentially redundant data, and it can be broken up into 2 steps:
/// - A copy step, where data is copied from the literals section to the decompressed output
/// - A *match* copy step that copies data from within the previously decompressed output.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequence-execution>
#[derive(Clone, Copy)]
pub struct Sequence {
/// Literal length, or the number of bytes to be copied from the literals section
/// in the copy step.
pub ll: u32,
/// The length of the match to make during the match copy step.
pub ml: u32,
/// How far back to go in the decompressed data to read from the match copy step.
/// If this value is greater than 3, then the offset is `of -3`. If `of` is from 1-3,
/// then it has special handling:
///
/// The first 3 values define 3 different repeated offsets, with 1 referring to the most
/// recent, 2 the second recent, and so on. When the current sequence has a literal length of 0,
/// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3,
/// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data
/// is considered corrupted.
pub of: u32,
}
impl core::fmt::Display for Sequence {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
write!(f, "LL: {}, ML: {}, OF: {}", self.ll, self.ml, self.of)
}
}
/// This byte defines the compression mode of each symbol type
#[derive(Copy, Clone)]
pub struct CompressionModes(u8);
/// The compression mode used for symbol compression
pub enum ModeType {
/// A predefined FSE distribution table is used, and no distribution table
/// will be present.
Predefined,
/// The table consists of a single byte, which contains the symbol's value.
RLE,
/// Standard FSE compression, a distribution table will be present. This
/// mode should not be used when only one symbol is present.
FSECompressed,
/// The table used in the previous compressed block with at least one sequence
/// will be used again. If this is the first block, the table in the dictionary will
/// be used.
Repeat,
}
impl CompressionModes {
/// Deserialize a two bit mode value into a [ModeType]
pub fn decode_mode(m: u8) -> ModeType {
match m {
0 => ModeType::Predefined,
1 => ModeType::RLE,
2 => ModeType::FSECompressed,
3 => ModeType::Repeat,
_ => panic!("This can never happen"),
}
}
/// Read the compression mode of the literal lengths field.
pub fn ll_mode(self) -> ModeType {
Self::decode_mode(self.0 >> 6)
}
/// Read the compression mode of the offset value field.
pub fn of_mode(self) -> ModeType {
Self::decode_mode((self.0 >> 4) & 0x3)
}
/// Read the compression mode of the match lengths field.
pub fn ml_mode(self) -> ModeType {
Self::decode_mode((self.0 >> 2) & 0x3)
}
}
impl Default for SequencesHeader {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug)]
#[non_exhaustive]
pub enum SequencesHeaderParseError {
NotEnoughBytes { need_at_least: u8, got: usize },
}
#[cfg(feature = "std")]
impl std::error::Error for SequencesHeaderParseError {}
impl core::fmt::Display for SequencesHeaderParseError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
SequencesHeaderParseError::NotEnoughBytes { need_at_least, got } => {
write!(
f,
"source must have at least {} bytes to parse header; got {} bytes",
need_at_least, got,
)
}
}
}
}
impl SequencesHeader {
/// Create a new [SequencesHeader].
pub fn new() -> SequencesHeader {
SequencesHeader {
num_sequences: 0,
modes: None,
}
}
/// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read.
pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> {
let mut bytes_read = 0;
if source.is_empty() {
return Err(SequencesHeaderParseError::NotEnoughBytes {
need_at_least: 1,
got: 0,
});
}
let source = match source[0] {
0 => {
self.num_sequences = 0;
return Ok(1);
}
1..=127 => {
if source.len() < 2 {
return Err(SequencesHeaderParseError::NotEnoughBytes {
need_at_least: 2,
got: source.len(),
});
}
self.num_sequences = u32::from(source[0]);
bytes_read += 1;
&source[1..]
}
128..=254 => {
if source.len() < 3 {
return Err(SequencesHeaderParseError::NotEnoughBytes {
need_at_least: 3,
got: source.len(),
});
}
self.num_sequences = ((u32::from(source[0]) - 128) << 8) + u32::from(source[1]);
bytes_read += 2;
&source[2..]
}
255 => {
if source.len() < 4 {
return Err(SequencesHeaderParseError::NotEnoughBytes {
need_at_least: 4,
got: source.len(),
});
}
self.num_sequences = u32::from(source[1]) + (u32::from(source[2]) << 8) + 0x7F00;
bytes_read += 3;
&source[3..]
}
};
self.modes = Some(CompressionModes(source[0]));
bytes_read += 1;
Ok(bytes_read)
}
}