• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 6815798908

09 Nov 2023 05:17PM CUT coverage: 72.607% (-2.4%) from 75.01%
6815798908

push

github

web-flow
Implement `Any/BufferProvider` for some smart pointers (#4255)

Allows storing them as a `Box<dyn Any/BufferProvider>` without using a
wrapper type that implements the trait.

44281 of 60987 relevant lines covered (72.61%)

201375.86 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/components/casemap/src/provider/exceptions_builder.rs
1
// This file is part of ICU4X. For terms of use, please see the file
×
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::provider::exception_helpers::{
6
    ExceptionBits, ExceptionBitsULE, ExceptionSlot, SlotPresence,
7
};
8
use crate::provider::exceptions::{CaseMapExceptions, DecodedException};
9
use alloc::borrow::Cow;
10
use alloc::collections::BTreeMap;
11
use alloc::string::String;
12
use alloc::vec::Vec;
13
use icu_provider::DataError;
14
use zerovec::ule::{AsULE, ULE};
15

16
/// The header for exception types as found in ICU4C data. See [`ExceptionHeaderULE`]
17
/// for the wire format
18
#[derive(Copy, Clone, PartialEq, Eq)]
×
19
pub struct ExceptionHeader {
20
    /// The various slots that are present, masked by ExceptionSlot
21
    ///
22
    /// We still store this as a bitmask since it's more convenient to access as one
23
    pub slot_presence: SlotPresence,
×
24
    pub bits: ExceptionBits,
×
25
}
26

27
impl ExceptionHeader {
28
    /// Construct from an ICU4C-format u16.
29
    pub(crate) fn from_integer(int: u16) -> Self {
×
30
        let slot_presence =
31
            SlotPresence(u8::try_from(int & ExceptionHeaderULE::SLOTS_MASK).unwrap_or(0));
×
32
        let bits = ExceptionBits::from_integer(
×
33
            u8::try_from(int >> ExceptionHeaderULE::BITS_SHIFT).unwrap_or(0),
×
34
        );
35
        Self {
×
36
            slot_presence,
×
37
            bits,
×
38
        }
39
    }
×
40

41
    // Returns true if the given slot exists for this exception
42
    pub(crate) fn has_slot(&self, slot: ExceptionSlot) -> bool {
×
43
        self.slot_presence.has_slot(slot)
×
44
    }
×
45
}
46

47
/// Packed exception header (format from icu4c, documented in casepropsbuilder.cpp)
48
///
49
/// ```text
50
///       Bits:
51
///         0..7  Flag bits indicating which optional slots are present (if any):
52
///               0: Lowercase mapping (code point)
53
///               1: Case folding (code point)
54
///               2: Uppercase mapping (code point)
55
///               3: Titlecase mapping (code point)
56
///               4: Delta to simple case mapping (code point) (sign stored separately)
57
///               5: RESERVED
58
///               6: Closure mappings (string; see below)
59
///               7: Full mappings (strings; see below)
60
///            8  Double-width slots. If set, then each optional slot is stored as two
61
///               elements of the array (high and low halves of 32-bit values) instead of
62
///               a single element.
63
///            9  Has no simple case folding, even if there is a simple lowercase mapping
64
///           10  The value in the delta slot is negative
65
///           11  Is case-sensitive (not exposed)
66
///       12..13  Dot type
67
///           14  Has conditional special casing
68
///           15  Has conditional case folding
69
/// ```
70
///
71
/// In this struct the RESERVED bit is still allowed to be set, and it will produce a different
72
/// exception header, but it will not have any other effects.
73
#[derive(Copy, Clone, PartialEq, Eq, ULE)]
×
74
#[repr(packed)]
75
pub struct ExceptionHeaderULE {
76
    slot_presence: SlotPresence,
×
77
    bits: ExceptionBitsULE,
×
78
}
79

80
impl ExceptionHeaderULE {
81
    const SLOTS_MASK: u16 = 0xff;
82
    const BITS_SHIFT: u16 = 8;
83
}
84

85
impl AsULE for ExceptionHeader {
86
    type ULE = ExceptionHeaderULE;
87
    fn from_unaligned(u: ExceptionHeaderULE) -> Self {
×
88
        Self {
×
89
            slot_presence: u.slot_presence,
×
90
            bits: ExceptionBits::from_integer(u.bits.0),
×
91
        }
92
    }
×
93

94
    fn to_unaligned(self) -> ExceptionHeaderULE {
×
95
        ExceptionHeaderULE {
×
96
            slot_presence: self.slot_presence,
×
97
            bits: ExceptionBitsULE(self.bits.to_integer()),
×
98
        }
99
    }
×
100
}
101
/// Converts the exceptions data emitted by ICU4C's casepropsbuilder.cpp into a
/// `CaseMapExceptions` instance.
///
/// The main difference between the two representations is where strings live:
/// ICU4C stores full-mapping and closure strings inline in the data, whereas
/// `CaseMapExceptions` keeps them in a side table. Because of that, the start
/// index of each exception may change during conversion, so the builder also
/// produces a map from old indices to new indices. That map is used to update
/// the data stored in the code point trie.
pub struct CaseMapExceptionsBuilder<'a> {
    /// The ICU4C exception data being consumed.
    raw_data: &'a [u16],
    /// Position in `raw_data` of the next element to read.
    raw_data_idx: usize,
    /// Whether slots of the exception currently being read span two elements.
    double_slots: bool,
}
113

114
impl<'a> CaseMapExceptionsBuilder<'a> {
115
    const MAPPINGS_ALL_LENGTHS_MASK: u32 = 0xffff;
116
    const FULL_MAPPINGS_LENGTH_MASK: u32 = 0xf;
117
    const FULL_MAPPINGS_LENGTH_SHIFT: u32 = 4;
118

119
    const CLOSURE_MAX_LENGTH: u32 = 0xf;
120

121
    pub fn new(raw_data: &'a [u16]) -> Self {
×
122
        Self {
×
123
            raw_data,
124
            raw_data_idx: 0,
125
            double_slots: false,
126
        }
127
    }
×
128

129
    fn done(&self) -> bool {
×
130
        self.raw_data_idx >= self.raw_data.len()
×
131
    }
×
132
    fn read_raw(&mut self) -> Result<u16, DataError> {
×
133
        let result = self
×
134
            .raw_data
135
            .get(self.raw_data_idx)
×
136
            .ok_or(DataError::custom("Incomplete exception data"))?;
×
137
        self.raw_data_idx += 1;
×
138
        Ok(*result)
×
139
    }
×
140

141
    fn read_slot(&mut self) -> Result<u32, DataError> {
×
142
        if self.double_slots {
×
143
            let hi = self.read_raw()? as u32;
×
144
            let lo = self.read_raw()? as u32;
×
145
            Ok(hi << 16 | lo)
×
146
        } else {
147
            Ok(self.read_raw()? as u32)
×
148
        }
149
    }
×
150

151
    // After reading a string out of the raw data, advance raw_data_idx.
152
    fn skip_string(&mut self, s: &str) {
×
153
        for c in s.chars() {
×
154
            self.raw_data_idx += c.len_utf16();
×
155
        }
156
    }
×
157

158
    pub(crate) fn build(
×
159
        mut self,
160
    ) -> Result<(CaseMapExceptions<'static>, BTreeMap<u16, u16>), DataError> {
161
        let mut exceptions = Vec::new();
×
162
        let mut idx_map = BTreeMap::new();
×
163
        // The format of the raw data from ICU4C is the same as the format described in
164
        // exceptions.rs, with the exception of full mapping and closure strings. The
165
        // header and non-string slots can be copied over without modification. For string
166
        // slots, we read the length information from the ICU4C slot (described below),
167
        // read the strings, add the strings to the CaseMapExceptions string table,
168
        // and write an updated slot value containing the index of the string in the
169
        // table. In the case of full mappings, we store the index of the lowercase
170
        // mapping; the remaining mappings are stored at sequential indices.
171
        //
172
        // Full mappings: If there is at least one full (string) case mapping, then the
173
        // lengths of the mappings are encoded as nibbles in the full mappings slot:
174
        //     Bits:
175
        //        0..4   Length of lowercase string
176
        //        5..7   Length of case folding string
177
        //        8..11  Length of uppercase string
178
        //        12..15 Length of titlecase string
179
        // Mappings that do not exist have length 0. The strings themselves are stored in
180
        // the above order immediately following the last optional slot, encoded as UTF16.
181
        //
182
        // Case closure: If the case closure for a code point includes code points that
183
        // are not included in the simple or full mappings, then bits 0..3 of the closure
184
        // mappings slot will contain the number of codepoints in the closure string.
185
        // (Other bits are reserved.) The closure string itself is encoded as UTF16 and
186
        // stored following the full mappings data (if it exists) or the final optional
187
        // slot.
188
        while !self.done() {
×
189
            let old_idx = self.raw_data_idx as u16;
×
190

191
            let mut exception = DecodedException::default();
×
192

193
            // Copy header.
194
            let header = ExceptionHeader::from_integer(self.read_raw()?);
×
195
            self.double_slots = header.bits.double_width_slots;
×
196

197
            // Copy unmodified slots.
198
            for (slot, output) in [
×
199
                (ExceptionSlot::Lower, &mut exception.lowercase),
×
200
                (ExceptionSlot::Fold, &mut exception.casefold),
×
201
                (ExceptionSlot::Upper, &mut exception.uppercase),
×
202
                (ExceptionSlot::Title, &mut exception.titlecase),
×
203
            ] {
204
                if header.has_slot(slot) {
×
205
                    let value = self.read_slot()?;
×
206
                    if let Ok(ch) = char::try_from(value) {
×
207
                        *output = Some(ch)
×
208
                    } else {
209
                        return Err(DataError::custom(
×
210
                            "Found non-char value in casemapping exceptions data",
211
                        ));
212
                    }
213
                }
214
            }
215
            if header.has_slot(ExceptionSlot::Delta) {
×
216
                let delta = self.read_slot()?;
×
217

218
                exception.simple_case_delta = Some(delta)
×
219
            }
220

221
            // Read the closure and full mappings slots, if they exist.
222
            let closure_length = if header.has_slot(ExceptionSlot::Closure) {
×
223
                Some((self.read_slot()? & Self::CLOSURE_MAX_LENGTH) as usize)
×
224
            } else {
225
                None
×
226
            };
227
            let mappings_lengths = if header.has_slot(ExceptionSlot::FullMappings) {
×
228
                Some(self.read_slot()? & Self::MAPPINGS_ALL_LENGTHS_MASK)
×
229
            } else {
230
                None
×
231
            };
232

233
            // Copy the full mappings strings into the strings table, if they exist.
234
            if let Some(mut lengths) = mappings_lengths {
×
235
                let mut arr: [Cow<_>; 4] = Default::default();
×
236
                for mapping in &mut arr {
×
237
                    let len = lengths & Self::FULL_MAPPINGS_LENGTH_MASK;
×
238
                    lengths >>= Self::FULL_MAPPINGS_LENGTH_SHIFT;
×
239

240
                    let start = self.raw_data_idx;
×
241
                    let end = start + len as usize;
×
242
                    let slice = &self
×
243
                        .raw_data
244
                        .get(start..end)
×
245
                        .ok_or(DataError::custom("Incomplete string data"))?;
×
246
                    let string = char::decode_utf16(slice.iter().copied())
×
247
                        .collect::<Result<String, _>>()
248
                        .map_err(|_| DataError::custom("Found non-utf16 exceptions data"))?;
×
249
                    self.skip_string(&string);
×
250
                    *mapping = string.into()
×
251
                }
×
252
                exception.full = Some(arr)
×
253
            }
×
254

255
            // Copy the closure string into the strings table, if it exists.
256
            if let Some(len) = closure_length {
×
257
                let start = self.raw_data_idx;
×
258
                let slice = &self
×
259
                    .raw_data
260
                    .get(start..)
×
261
                    .ok_or(DataError::custom("Incomplete string data"))?;
×
262
                let string = char::decode_utf16(slice.iter().copied())
×
263
                    .take(len)
264
                    .collect::<Result<String, _>>()
265
                    .map_err(|_| DataError::custom("Found non-utf16 exceptions data"))?;
×
266
                self.skip_string(&string);
×
267
                exception.closure = Some(string.into())
×
268
            }
×
269

270
            exception.bits = header.bits;
×
271
            // unused bits in ICU4X
272
            exception.bits.double_width_slots = false;
×
273

274
            let new_exception_index = if let Ok(idx) = u16::try_from(exceptions.len()) {
×
275
                idx
×
276
            } else {
277
                return Err(DataError::custom("More than u16 exceptions"));
×
278
            };
279
            idx_map.insert(old_idx, new_exception_index);
×
280
            exceptions.push(exception.encode());
×
281
        }
×
282

283
        Ok((
×
284
            CaseMapExceptions {
×
285
                exceptions: (&exceptions).into(),
×
286
            },
287
            idx_map,
×
288
        ))
289
    }
×
290
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc