// ruzstd/blocks/literals_section.rs
//! Utilities and representations for the first half of a block, the literals section.
//! It contains data that is then copied from by the sequences section.
use super::super::decoding::bit_reader::{BitReader, GetBitsError};

/// A compressed block consists of two sections, a literals section, and a sequences section.
///
/// This is the first of those two sections. A literal is just arbitrary data, which is later
/// copied by the sequences section.
///
/// The fields mirror what [LiteralsSection::parse_from_header] extracts from the section header.
pub struct LiteralsSection {
    /// - If this block is of type [LiteralsSectionType::Raw], then the data is `regenerated_size`
    ///     bytes long, and it contains the raw literals data to be used during the second section,
    ///     the sequences section.
    /// - If this block is of type [LiteralsSectionType::RLE],
    ///     then the literal consists of a single byte repeated `regenerated_size` times.
    /// - For types [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless],
    ///     this is the size of the decompressed data.
    pub regenerated_size: u32,
    /// - For types [LiteralsSectionType::Raw] and [LiteralsSectionType::RLE], this value is not present.
    /// - For types [LiteralsSectionType::Compressed] and [LiteralsSectionType::Treeless], this value will
    ///     be set to the size of the compressed data.
    pub compressed_size: Option<u32>,
    /// The number of streams, either 1 or 4, if the literal is of type
    /// [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless]. It
    /// is not used for RLE or uncompressed (raw) literals.
    pub num_streams: Option<u8>,
    /// The type of the literal section.
    pub ls_type: LiteralsSectionType,
}

/// The way in which a literals section is encoded.
///
/// The type is stored in the two lowest bits of the first header byte:
/// `0` is [Raw](LiteralsSectionType::Raw), `1` is [RLE](LiteralsSectionType::RLE),
/// `2` is [Compressed](LiteralsSectionType::Compressed) and
/// `3` is [Treeless](LiteralsSectionType::Treeless).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LiteralsSectionType {
    /// Literals are stored uncompressed.
    Raw,
    /// Literals consist of a single byte value repeated [LiteralsSection::regenerated_size] times.
    RLE,
    /// This is a standard Huffman-compressed block, starting with a Huffman tree description.
    /// In this mode, there are at least *2* different literals represented in the Huffman tree
    /// description.
    Compressed,
    /// This is a Huffman-compressed block,
    /// using the Huffman tree from the previous [LiteralsSectionType::Compressed] block
    /// in the sequence. If this mode is triggered without any previous Huffman-tables in the
    /// frame (or dictionary), it should be treated as data corruption.
    Treeless,
}

/// Errors that can be returned while parsing the header of a literals section.
#[derive(Debug)]
#[non_exhaustive]
pub enum LiteralsSectionParseError {
    /// The literals section type was not one of the legal values 0, 1, 2 or 3.
    /// `got` is the offending value.
    IllegalLiteralSectionType { got: u8 },
    /// Reading bits from the header through the bit reader failed.
    GetBitsError(GetBitsError),
    /// The provided slice holds fewer bytes (`have`) than the header requires (`need`).
    NotEnoughBytes { have: usize, need: u8 },
}

#[cfg(feature = "std")]
impl std::error::Error for LiteralsSectionParseError {
    /// Exposes the wrapped bit reader error as the underlying cause.
    /// Every other variant has no underlying cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        if let Self::GetBitsError(inner) = self {
            Some(inner)
        } else {
            None
        }
    }
}
impl core::fmt::Display for LiteralsSectionParseError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            LiteralsSectionParseError::IllegalLiteralSectionType { got } => {
                write!(
                    f,
                    "Illegal literalssectiontype. Is: {}, must be in: 0, 1, 2, 3",
                    got
                )
            }
            LiteralsSectionParseError::GetBitsError(e) => write!(f, "{:?}", e),
            LiteralsSectionParseError::NotEnoughBytes { have, need } => {
                write!(
                    f,
                    "Not enough byte to parse the literals section header. Have: {}, Need: {}",
                    have, need,
                )
            }
        }
    }
}

impl From<GetBitsError> for LiteralsSectionParseError {
    fn from(val: GetBitsError) -> Self {
        Self::GetBitsError(val)
    }
}

impl core::fmt::Display for LiteralsSectionType {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
        match self {
            LiteralsSectionType::Compressed => write!(f, "Compressed"),
            LiteralsSectionType::Raw => write!(f, "Raw"),
            LiteralsSectionType::RLE => write!(f, "RLE"),
            LiteralsSectionType::Treeless => write!(f, "Treeless"),
        }
    }
}

impl Default for LiteralsSection {
    fn default() -> Self {
        Self::new()
    }
}

impl LiteralsSection {
    /// Create a new [LiteralsSection].
    ///
    /// The section starts out as an empty [LiteralsSectionType::Raw] section: a regenerated size
    /// of zero, no compressed size and no stream count.
    pub fn new() -> LiteralsSection {
        LiteralsSection {
            regenerated_size: 0,
            compressed_size: None,
            num_streams: None,
            ls_type: LiteralsSectionType::Raw,
        }
    }

    /// Given the first byte of a header, determine the size of the whole header, from 1 to 5 bytes.
    ///
    /// The two lowest bits of `first_byte` select the section type, the next two bits are the
    /// size format. Together they fix how many bits the size fields occupy.
    ///
    /// # Errors
    ///
    /// Propagates any error reported while decoding the section type.
    pub fn header_bytes_needed(&self, first_byte: u8) -> Result<u8, LiteralsSectionParseError> {
        let ls_type: LiteralsSectionType = Self::section_type(first_byte)?;
        let size_format = (first_byte >> 2) & 0x3;
        let needed = match ls_type {
            LiteralsSectionType::RLE | LiteralsSectionType::Raw => match size_format {
                // size_format actually only uses one bit, regenerated_size uses 5 bits
                0 | 2 => 1,
                // size_format uses 2 bits, regenerated_size uses 12 bits
                1 => 2,
                // size_format uses 2 bits, regenerated_size uses 20 bits
                3 => 3,
                _ => unreachable!("size_format is masked to two bits and must be within 0..=3"),
            },
            LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => match size_format {
                // Only differ in num_streams, both regenerated and compressed sizes use 10 bits
                0 | 1 => 3,
                // both regenerated and compressed sizes use 14 bits
                2 => 4,
                // both regenerated and compressed sizes use 18 bits
                3 => 5,
                _ => unreachable!("size_format is masked to two bits and must be within 0..=3"),
            },
        };
        Ok(needed)
    }

    /// Parse the header into `self`, and returns the number of bytes read.
    ///
    /// All four fields are overwritten on success. For [LiteralsSectionType::Raw] and
    /// [LiteralsSectionType::RLE] headers both `compressed_size` and `num_streams` are reset to
    /// `None`, so a section that is reused for several headers never reports values left over
    /// from an earlier parse. If an error is returned, `self` is left untouched.
    ///
    /// # Errors
    ///
    /// - [LiteralsSectionParseError::GetBitsError] if the bit reader cannot supply the type and
    ///   size format bits (e.g. `raw` is empty).
    /// - [LiteralsSectionParseError::NotEnoughBytes] if `raw` is shorter than the header size
    ///   announced by its first byte.
    pub fn parse_from_header(&mut self, raw: &[u8]) -> Result<u8, LiteralsSectionParseError> {
        let mut br: BitReader<'_> = BitReader::new(raw);
        let block_type = br.get_bits(2)? as u8;
        let ls_type = Self::section_type(block_type)?;
        // Masking keeps the `unreachable!` arms below provable from this function alone.
        let size_format = (br.get_bits(2)? as u8) & 0x3;

        // The bit reader already rejected empty input above; this avoids a panicking index anyway.
        let first_byte = match raw.first() {
            Some(&value) => value,
            None => return Err(LiteralsSectionParseError::NotEnoughBytes { have: 0, need: 1 }),
        };
        let byte_needed = self.header_bytes_needed(first_byte)?;
        if raw.len() < usize::from(byte_needed) {
            return Err(LiteralsSectionParseError::NotEnoughBytes {
                have: raw.len(),
                need: byte_needed,
            });
        }

        // Every index used below is smaller than `byte_needed`, which was just checked.
        let at = |idx: usize| u32::from(raw[idx]);

        let (regenerated_size, compressed_size, num_streams) = match ls_type {
            LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
                let regenerated = match size_format {
                    // size_format actually only uses one bit, regenerated_size uses 5 bits
                    0 | 2 => at(0) >> 3,
                    // size_format uses 2 bits, regenerated_size uses 12 bits
                    1 => (at(0) >> 4) + (at(1) << 4),
                    // size_format uses 2 bits, regenerated_size uses 20 bits
                    3 => (at(0) >> 4) + (at(1) << 4) + (at(2) << 12),
                    _ => unreachable!("size_format is masked to two bits and must be within 0..=3"),
                };
                // Neither a compressed size nor a stream count exists for these types.
                (regenerated, None, None)
            }
            LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
                // Only size_format 0 uses a single stream, all others use four streams.
                let streams: u8 = if size_format == 0 { 1 } else { 4 };

                let (regenerated, compressed) = match size_format {
                    // both regenerated and compressed sizes use 10 bits
                    0 | 1 => (
                        // 4 from the first, six from the second byte
                        (at(0) >> 4) + ((at(1) & 0x3f) << 4),
                        // 2 from the second, full last byte
                        (at(1) >> 6) + (at(2) << 2),
                    ),
                    // both regenerated and compressed sizes use 14 bits
                    2 => (
                        // 4 from first, full second, 2 from the third byte
                        (at(0) >> 4) + (at(1) << 4) + ((at(2) & 0x3) << 12),
                        // 6 from the third, full last byte
                        (at(2) >> 2) + (at(3) << 6),
                    ),
                    // both regenerated and compressed sizes use 18 bits
                    3 => (
                        // 4 from first, full second, six from third byte
                        (at(0) >> 4) + (at(1) << 4) + ((at(2) & 0x3F) << 12),
                        // 2 from third, full fourth, full fifth byte
                        (at(2) >> 6) + (at(3) << 2) + (at(4) << 10),
                    ),
                    _ => unreachable!("size_format is masked to two bits and must be within 0..=3"),
                };
                (regenerated, Some(compressed), Some(streams))
            }
        };

        // Commit only after everything was decoded successfully.
        self.ls_type = ls_type;
        self.regenerated_size = regenerated_size;
        self.compressed_size = compressed_size;
        self.num_streams = num_streams;
        Ok(byte_needed)
    }

    /// Given the first two bits of a header, determine the type of a header.
    ///
    /// Only the two lowest bits of `raw` are inspected, so the first header byte may be passed
    /// in unmodified.
    fn section_type(raw: u8) -> Result<LiteralsSectionType, LiteralsSectionParseError> {
        match raw & 0x3 {
            0 => Ok(LiteralsSectionType::Raw),
            1 => Ok(LiteralsSectionType::RLE),
            2 => Ok(LiteralsSectionType::Compressed),
            3 => Ok(LiteralsSectionType::Treeless),
            // Cannot be hit after masking, but kept as an error rather than a panic.
            other => Err(LiteralsSectionParseError::IllegalLiteralSectionType { got: other }),
        }
    }
}