• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 9357137046

03 Jun 2024 08:51PM UTC coverage: 75.121% (-1.1%) from 76.254%
9357137046

push

github

web-flow
Switch locid Value to use Subtag (#4941)

This is part of #1833 switching Value API to use Subtag.

61 of 71 new or added lines in 11 files covered. (85.92%)

3224 existing lines in 178 files now uncovered.

52958 of 70497 relevant lines covered (75.12%)

572757.08 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/utils/resb/src/binary.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
//! The `binary` module provides a `serde` deserializer for the binary resource
6
//! bundle format as well as a means of writing resource bundles to the binary
7
//! file format.
8

9
mod deserializer;
10
mod header;
11
pub use self::deserializer::from_bytes;
12

13
#[cfg(feature = "serialize")]
14
mod serializer;
15
#[cfg(feature = "serialize")]
16
pub use self::serializer::Serializer;
17

18
use core::{fmt, slice::SliceIndex};
19

20
use self::header::BinHeader;
21

22
/// Gets the endianness of a binary resource bundle's data.
23
pub fn determine_endianness(resb: &[u8]) -> Result<Endianness, BinaryDeserializerError> {
×
24
    let header = BinHeader::try_from(resb)?;
×
25

26
    Ok(header.repr_info.endianness)
×
27
}
×
28

29
/// Details of a written bundle, as recorded in the bundle's index.
///
/// Bundles carry an index from [`FormatVersion::V1_1`] on.
#[allow(dead_code)]
struct BinIndex {
    /// How many 32-bit fields were written in the index. This field itself is
    /// included in the count.
    field_count: u32,

    /// Where the key block ends, as a count of 32-bit values from the
    /// beginning of the body.
    keys_end: u32,

    /// Where the resources block ends, as a count of 32-bit values from the
    /// beginning of the body.
    resources_end: u32,

    /// Where the bundle ends, as a count of 32-bit values from the beginning
    /// of the body.
    ///
    /// This is always equal to `resources_end` in every version up to and
    /// including [`FormatVersion::V3_0`].
    bundle_end: u32,

    /// How many entries the largest table in the bundle has.
    largest_table_entry_count: u32,

    /// Attributes describing resolution of external resources.
    ///
    /// Only present from [`FormatVersion::V1_2`] on.
    bundle_attributes: Option<u32>,

    /// Where the 16-bit data block ends, as a count of 32-bit values from the
    /// beginning of the body.
    ///
    /// Only present from [`FormatVersion::V2_0`] on.
    data_16_bit_end: Option<u32>,

    /// Checksum of the resource pool bundle.
    ///
    /// Only present from [`FormatVersion::V2_0`] on, and only when the bundle
    /// is itself a pool bundle or shares resources through one.
    pool_checksum: Option<u32>,
}
73

74
/// Declares a fieldless `enum` represented as a primitive integer type and
/// generates conversions between the `enum` and that primitive type.
///
/// The generated `enum` is `#[repr($repr)]`. It converts into the primitive
/// type infallibly via `From`, and is recovered from a primitive value via
/// `TryFrom`, which fails with an invalid-data `BinaryDeserializerError` when
/// the value does not correspond to any declared variant.
macro_rules! primitive_enum {
    (
        $repr:ty,
        $(#[$enum_attr:meta])*
        $vis:vis enum $name:ident {
            $(
                $(#[$variant_attr:meta])*
                $variant:ident = $discriminant:expr,
            )*
        }
    ) => {
        // Forwarding `$enum_attr` is, at a minimum, what allows doc comments
        // on enums created via this macro.
        $(#[$enum_attr])*
        #[repr($repr)]
        $vis enum $name {
            $(
                // Likewise, `$variant_attr` is needed to allow doc comments on
                // variants created via this macro.
                $(#[$variant_attr])*
                $variant = $discriminant,
            )*
        }

        impl From<$name> for $repr {
            fn from(variant: $name) -> Self {
                variant as $repr
            }
        }

        impl TryFrom<$repr> for $name {
            type Error = BinaryDeserializerError;

            fn try_from(raw: $repr) -> Result<Self, Self::Error> {
                // A discriminant cast is not a valid pattern, so each variant
                // is tested with an equality check in declaration order.
                $(
                    if raw == $name::$variant as $repr {
                        return Ok($name::$variant);
                    }
                )*

                Err(BinaryDeserializerError::invalid_data(concat!(
                    "unrecognized value for ",
                    stringify!($name)
                )))
            }
        }
    };
}
110

111
primitive_enum!(
112
    u8,
113
    /// The endianness used to write a resource bundle.
114
    #[derive(Clone, Copy, Debug, PartialEq)]
×
115
    // The resource bundle format doesn't support any sort of mixed-endian or
116
    // middle-endian encodings.
117
    #[allow(clippy::exhaustive_enums)]
118
    #[allow(missing_docs)]
119
    pub enum Endianness {
120
        Little = 0,
121
        Big = 1,
122
    }
123
);
124

125
primitive_enum!(
126
    u8,
127
    /// A family of character sets used to represent the characters of key strings.
128
    #[derive(Clone, Copy, Debug, Eq, PartialEq)]
×
129
    enum CharsetFamily {
130
        /// The ASCII family of character sets, such as ASCII, latin1, and
131
        /// UTF-8.
132
        Ascii = 0,
133

134
        /// The EBCDIC family of character sets, such as EBCDIC and UTF-EBCDIC.
135
        ///
136
        /// The EBCDIC family is currently unsupported by this crate both for
137
        /// serialization and deserialization of binary bundles.
138
        Ebcdic = 1,
139
    }
140
);
141

142
primitive_enum!(
143
    u16,
144
    #[derive(Clone, Copy, Debug, Eq, PartialEq)]
×
145
    /// The type of a resource representation within a binary resource bundle.
146
    ///
147
    /// The representation is distinct from the resource type as presented to
148
    /// consumers. Some resource types may have multiple possible
149
    /// representations, depending on the [`FormatVersion`] and—in the case of
150
    /// collections—number and type of their constituent resources.
151
    enum ResourceReprType {
152
        /// A string resource. Not yet supported.
153
        _String = 0,
154

155
        /// A raw binary resource.
156
        ///
157
        /// Consists of a 32-bit length value `n` followed by `n` arbitrary
158
        /// bytes.
159
        Binary = 1,
160

161
        /// A table resource for bundles with fewer than `0x1_0000` keys.
162
        ///
163
        /// Consists of a 16-bit length value `n` followed by `n` 16-bit key
164
        /// offsets from the beginning of the key block, 0 or 16 bits of padding
165
        /// (in order to align the table representation so far to a 32-bit
166
        /// boundary), and `n` 32-bit resource descriptors. For details on the
167
        /// representation of resource descriptors, see [`ResDescriptor`].
168
        ///
169
        /// The `i`th entry in the resulting table is a a pair of the `i`th key
170
        /// and the `i`th resource.
171
        Table = 2,
172

173
        /// An alias resource. Not yet supported.
174
        _Alias = 3,
175

176
        /// A table resource for bundles with `0x1_0000` or more keys. Not yet
177
        /// supported.
178
        _Table32 = 4,
179

180
        /// A 16-bit table resource. Not yet supported.
181
        Table16 = 5,
182

183
        /// A 16-bit string resource for [`FormatVersion::V2_0`] and later.
184
        ///
185
        /// Consists of a UTF-16 string with length marked in one of the
186
        /// following ways:
187
        ///
188
        /// - For strings of length `[1..40]` characters, there is no length
189
        ///   marker and the string must be null-terminated (i.e., by two `0`
190
        ///   bytes in a row).
191
        ///
192
        /// - For strings of length `n` in the range `(40..0x3ef)`, the string
193
        ///   is preceded by a single UTF-16 low surrogate composed as
194
        ///   `0xdc00 & n`.
195
        ///
196
        /// - For strings of length `n` in the range `[0x3ef..0x10_0000)`, the
197
        ///   string is preceded by a length marker consisting of a UTF-16 low
198
        ///   surrogate followed by a 16-bit value, composed as
199
        ///   `[0xdfef + (n >> 0x10), n & 0xffff]`.
200
        ///
201
        /// - For strings of length `n` in the range `[0x10_0000,
202
        ///   0x1_0000_0000)`, the string is preceded by a length marker
203
        ///   consisting of a UTF-16 low surrogate followed by two 16-bit length
204
        ///   values, composed as `[0xdfff, n >> 0x10, n & 0xffff]`.
205
        ///
206
        /// Strings of greater length than those described above may not be
207
        /// stored in binary bundles.
208
        ///
209
        /// These length markers can be reliably detected, as UTF-16 low
210
        /// surrogates may not legally appear without a preceding high surrogate
211
        /// in a UTF-16 string.
212
        StringV2 = 6,
213

214
        /// A 28-bit integer resource.
215
        ///
216
        /// Consists solely of the resource descriptor with the 28 bits of the
217
        /// integer in place of an offset.
218
        ///
219
        /// The signedness of 28-bit integers is not indicated in the resource
220
        /// bundle itself. Consumers are expected to explicitly request a signed
221
        /// or unsigned integer. In order to
222
        Int = 7,
223

224
        /// A general array resource.
225
        ///
226
        /// Consists of a 32-bit length value `n` followed by `n` 32-bit
227
        /// resource descriptors. For more details on the representation of
228
        /// resource descriptors, see [`ResDescriptor`].
229
        Array = 8,
230

231
        /// A 16-bit array resource.
232
        ///
233
        /// Consists of a 16-bit length value `n` followed by `n` 16-bit offsets
234
        /// from the beginning of the 16-bit data block.
235
        ///
236
        /// As of [`FormatVersion::V3_0`], only `StringV2` representations can
237
        /// be fully stored in the 16-bit data block. As such, only `StringV2`
238
        /// resources can appear in an `Array16`.
239
        Array16 = 9,
240

241
        /// An integer array resource.
242
        ///
243
        /// Consists of a 32-bit length value `n` followed by `n` 32-bit integer
244
        /// values.
245
        ///
246
        /// Note that these are not integer _resources_, but rather full 32-bit
247
        /// integers.
248
        IntVector = 14,
249
    }
250
);
251

252
/// A `FormatVersion` represents a specific binary file format used for
/// representing resource bundles.
///
/// Versions are totally ordered by declaration order, so a version check such
/// as "present from `V1_2` on" can be written as a plain comparison.
///
/// A partial [specification] of each format version is present in the ICU4C
/// source code.
///
/// [specification]: https://github.com/unicode-org/icu/blob/main/icu4c/source/common/uresdata.h
// `Eq` and `Ord` are derived alongside the partial traits: the enum is
// fieldless, so equality and ordering are total.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
// We use `u32` representation to enforce correct sizing for structs containing
// a `FormatVersion`.
//
// Note that order of declaration is important for the derived ordering:
// earlier variants compare as less than later ones.
#[repr(u32)]
enum FormatVersion {
    V1_0,
    V1_1,
    V1_2,
    V1_3,
    V2_0,
    V3_0,
}
273

274
/// The `ResDescriptor` struct represents a typed pointer to a resource body
275
/// within a binary resource bundle.
276
///
277
/// It is represented within the binary bundle as a 4-bit resource type in the
278
/// most significant nibble of a 32-bit integer with a 28-bit unsigned offset
279
/// in the remaining bits. The offset is interpreted as a count of 32-bit
280
/// values from the start of the body.
281
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
×
282
struct ResDescriptor {
283
    resource_type: ResourceReprType,
×
284
    value: u32,
×
285
}
286

287
impl ResDescriptor {
288
    /// Makes a new resource descriptor with the given type and 28-bit value.
289
    pub const fn new(resource_type: ResourceReprType, value: u32) -> Self {
×
290
        Self {
×
291
            resource_type,
292
            value,
293
        }
294
    }
×
295

296
    /// Makes a new resource descriptor with the given type and no body.
297
    pub const fn _new_empty(resource_type: ResourceReprType) -> Self {
×
298
        Self::new(resource_type, 0)
×
299
    }
×
300

301
    /// Returns `true` if the described resource is empty.
302
    pub fn is_empty(&self) -> bool {
×
303
        self.value == 0
×
304
    }
×
305

306
    /// Gets the offset to the described 16-bit resource in bytes.
307
    ///
308
    /// The type of the resource representation is not verified. Consumers are
309
    /// expected to call the function appropriate to the resource type they are
310
    /// querying.
311
    fn value_as_16_bit_offset(&self) -> usize {
×
312
        // When the value of a resource descriptor is an offset, it is counted
313
        // in units dependent on the resource type (16-bit values for 16-bit
314
        // resources, 32-bit values for 32-bit resources). Translate that into
315
        // bytes for consumers.
316
        (self.value as usize) * core::mem::size_of::<u16>()
×
317
    }
×
318

319
    /// Gets the offset to the described 32-bit resource in bytes.
320
    ///
321
    /// The type of the resource representation is not verified. Consumers are
322
    /// expected to call the function appropriate to the resource type they are
323
    /// querying.
324
    fn value_as_32_bit_offset(&self) -> usize {
×
325
        // When the value of a resource descriptor is an offset, it is counted
326
        // in units dependent on the resource type (16-bit values for 16-bit
327
        // resources, 32-bit values for 32-bit resources). Translate that into
328
        // bytes for consumers.
329
        (self.value as usize) * core::mem::size_of::<u32>()
×
330
    }
×
331

332
    /// Gets the value of the resource descriptor as a signed integer.
333
    ///
334
    /// The type of the resource representation is not verified. Consumers are
335
    /// expected to call the function appropriate to the resource type they are
336
    /// querying.
337
    fn value_as_signed_int(&self) -> i32 {
×
338
        ((self.value as i32) << 4) >> 4
×
339
    }
×
340

341
    /// Gets the value of the resource descriptor as an unsigned integer.
342
    ///
343
    /// The type of the resource representation is not verified. Consumers are
344
    /// expected to call the function appropriate to the resource type they are
345
    /// querying.
346
    fn value_as_unsigned_int(&self) -> u32 {
×
347
        self.value
×
348
    }
×
349

350
    /// Gets the resource type of the described resource.
351
    pub fn resource_type(&self) -> ResourceReprType {
×
352
        self.resource_type
×
353
    }
×
354
}
355

356
/// The error type reported when deserialization of a binary resource bundle
/// fails.
///
/// Each error pairs a broad category with a static message giving further
/// detail. Its `Display` output is the category description followed by the
/// message.
#[derive(Clone, Copy, Debug)]
pub struct BinaryDeserializerError {
    kind: ErrorKind,
    message: &'static str,
}

impl BinaryDeserializerError {
    /// Pairs an error category with its detail message.
    fn new(kind: ErrorKind, message: &'static str) -> Self {
        Self { kind, message }
    }

    /// Creates an error of the invalid-data category.
    fn invalid_data(message: &'static str) -> Self {
        Self::new(ErrorKind::InvalidData, message)
    }

    /// Creates an error of the resource-type-mismatch category.
    fn resource_type_mismatch(message: &'static str) -> Self {
        Self::new(ErrorKind::ResourceTypeMismatch, message)
    }

    /// Creates an error of the unsupported-format category.
    fn unsupported_format(message: &'static str) -> Self {
        Self::new(ErrorKind::UnsupportedFormat, message)
    }

    /// Creates an error of the unknown category.
    fn unknown(message: &'static str) -> Self {
        Self::new(ErrorKind::Unknown, message)
    }
}

impl fmt::Display for BinaryDeserializerError {
    /// Writes the category description, a colon and a space, then the detail
    /// message.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { kind, message } = self;

        let description = match kind {
            ErrorKind::Unknown => "Unknown error",
            ErrorKind::UnsupportedFormat => "Unsupported resource bundle format",
            ErrorKind::ResourceTypeMismatch => "Resource did not match expected data type",
            ErrorKind::InvalidData => "Invalid resource bundle data",
        };

        write!(f, "{description}: {message}")
    }
}

/// The broad category of a [`BinaryDeserializerError`].
#[derive(Clone, Copy, Debug)]
#[non_exhaustive]
enum ErrorKind {
    /// The resource did not match the expected data type.
    ResourceTypeMismatch,
    /// The resource bundle data was invalid.
    InvalidData,
    /// The resource bundle format is unsupported.
    UnsupportedFormat,

    /// Any error not covered by the other categories.
    Unknown,
}
416

417
/// Gets a subset of the given `u8` slice based on the specified index with
418
/// bounds checking.
419
///
420
/// Returns the subslice. Returns an error if the index is not valid for the
421
/// input.
422
fn get_subslice<I>(input: &[u8], index: I) -> Result<&[u8], BinaryDeserializerError>
×
423
where
424
    I: SliceIndex<[u8], Output = [u8]>,
425
{
426
    input
×
427
        .get(index)
428
        .ok_or(BinaryDeserializerError::invalid_data(
×
429
            "unexpected end of input",
430
        ))
431
}
×
432

433
/// Reads the first two bytes of the input and interprets them as a `u16` with
434
/// native endianness.
435
///
436
/// Returns the `u16` and a slice containing all input after the interpreted
437
/// bytes. Returns an error if the input is of insufficient length.
438
fn read_u16(input: &[u8]) -> Result<(u16, &[u8]), BinaryDeserializerError> {
×
439
    // Safe to unwrap at the end of this because `try_into()` for arrays will
440
    // only fail if the slice is the wrong size.
441
    #[allow(clippy::unwrap_used)]
442
    let bytes = get_subslice(input, ..core::mem::size_of::<u16>())?
×
443
        .try_into()
444
        .unwrap();
445
    let value = u16::from_le_bytes(bytes);
×
446

447
    let rest = get_subslice(input, core::mem::size_of::<u16>()..)?;
×
448
    Ok((value, rest))
×
449
}
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc