• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 9457158389

10 Jun 2024 11:45PM UTC coverage: 75.174% (+0.05%) from 75.121%
9457158389

push

github

web-flow
Add constructing TinyAsciiStr from utf16 (#5025)

Introduces TinyAsciiStr constructors from utf16 and converges on the
consensus from #4931.

---------

Co-authored-by: Robert Bastian <4706271+robertbastian@users.noreply.github.com>

65 of 82 new or added lines in 14 files covered. (79.27%)

3441 existing lines in 141 files now uncovered.

52850 of 70304 relevant lines covered (75.17%)

563298.06 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.67
/components/collections/src/codepointinvliststringlist/mod.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2✔
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
//! This module provides functionality for querying sets of Unicode code points and strings.
6
//!
7
//! It depends on [`CodePointInversionList`] to efficiently represent Unicode code points, while
8
//! it also maintains a list of strings in the set.
9
//!
10
//! It is an implementation of the existing [ICU4C UnicodeSet API](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UnicodeSet.html).
11

12
use crate::codepointinvlist::{
13
    CodePointInversionList, CodePointInversionListBuilder, CodePointInversionListULE,
14
};
15
use alloc::string::{String, ToString};
16
use alloc::vec::Vec;
17
use displaydoc::Display;
18
use yoke::Yokeable;
19
use zerofrom::ZeroFrom;
20
use zerovec::{VarZeroSlice, VarZeroVec};
21

22
/// A data structure providing a concrete implementation of a `UnicodeSet`
23
/// (which represents a set of code points and strings) using an inversion list for the code points and a simple
24
/// list-like structure to store and iterate over the strings.
25
#[zerovec::make_varule(CodePointInversionListAndStringListULE)]
961,770✔
26
#[zerovec::skip_derive(Ord)]
27
#[zerovec::derive(Debug)]
28
#[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)]
407✔
29
// Valid to auto-derive Deserialize because the invariants are weakly held
30
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
273✔
31
#[cfg_attr(feature = "serde", zerovec::derive(Serialize, Deserialize, Debug))]
32
pub struct CodePointInversionListAndStringList<'data> {
33
    #[cfg_attr(feature = "serde", serde(borrow))]
34
    #[zerovec::varule(CodePointInversionListULE)]
35
    cp_inv_list: CodePointInversionList<'data>,
313✔
36
    // Invariants (weakly held):
37
    //   - no input string is length 1 (a length 1 string should be a single code point)
38
    //   - the string list is sorted
39
    //   - the elements in the string list are unique
40
    #[cfg_attr(feature = "serde", serde(borrow))]
41
    str_list: VarZeroVec<'data, str>,
313✔
42
}
43

44
#[cfg(feature = "databake")]
impl databake::Bake for CodePointInversionListAndStringList<'_> {
    /// Produces a token stream that reconstructs this set by calling
    /// `from_parts_unchecked` on the baked forms of its two fields.
    ///
    /// `icu_collections` is inserted into `env` before the fields are baked.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        env.insert("icu_collections");
        let baked_code_points = self.cp_inv_list.bake(env);
        let baked_strings = self.str_list.bake(env);
        // Skipping validation is fine: both parts come from an existing value.
        databake::quote! {
            icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList::from_parts_unchecked(#baked_code_points, #baked_strings)
        }
    }
}
56

57
impl<'data> CodePointInversionListAndStringList<'data> {
58
    /// Returns a new [`CodePointInversionListAndStringList`] from both a [`CodePointInversionList`] for the
59
    /// code points and a [`VarZeroVec`]`<`[`str`]`>` of strings.
60
    pub fn try_from(
765✔
61
        cp_inv_list: CodePointInversionList<'data>,
62
        str_list: VarZeroVec<'data, str>,
63
    ) -> Result<Self, InvalidStringList> {
64
        // Verify invariants:
65
        // Do so by using the equivalent of str_list.iter().windows(2) to get
66
        // overlapping windows of size 2. The above putative code is not possible
67
        // because `.windows()` exists on a slice, but VarZeroVec cannot return a slice
68
        // because the non-fixed size elements necessitate at least some type
69
        // of allocation.
70
        {
765✔
71
            let mut it = str_list.iter();
765✔
72
            if let Some(mut x) = it.next() {
639✔
73
                if x.len() == 1 {
66✔
74
                    return Err(InvalidStringList::InvalidStringLength(x.to_string()));
1✔
75
                }
76
                for y in it {
501✔
77
                    if x.len() == 1 {
436✔
UNCOV
78
                        return Err(InvalidStringList::InvalidStringLength(x.to_string()));
×
79
                    } else if x == y {
436✔
80
                        return Err(InvalidStringList::StringListNotUnique(x.to_string()));
1✔
81
                    } else if x > y {
435✔
82
                        return Err(InvalidStringList::StringListNotSorted(
1✔
83
                            x.to_string(),
1✔
84
                            y.to_string(),
1✔
85
                        ));
×
86
                    }
87

88
                    // Next window begins. Update `x` here, `y` will be updated in next loop iteration.
89
                    x = y;
434✔
90
                }
91
            }
92
        }
635✔
93

94
        Ok(CodePointInversionListAndStringList {
636✔
95
            cp_inv_list,
636✔
96
            str_list,
636✔
97
        })
98
    }
761✔
99

100
    #[doc(hidden)] // databake internal
UNCOV
101
    pub const fn from_parts_unchecked(
×
102
        cp_inv_list: CodePointInversionList<'data>,
103
        str_list: VarZeroVec<'data, str>,
104
    ) -> Self {
UNCOV
105
        CodePointInversionListAndStringList {
×
106
            cp_inv_list,
107
            str_list,
108
        }
UNCOV
109
    }
×
110

111
    /// Returns the number of elements in this set (its cardinality).
112
    /// Note than the elements of a set may include both individual
113
    /// codepoints and strings.
114
    pub fn size(&self) -> usize {
147✔
115
        self.cp_inv_list.size() + self.str_list.len()
147✔
116
    }
147✔
117

118
    /// Return true if this set contains multi-code point strings or the empty string.
119
    pub fn has_strings(&self) -> bool {
78✔
120
        !self.str_list.is_empty()
78✔
121
    }
78✔
122

123
    ///
124
    /// # Examples
125
    /// ```
126
    /// use icu::collections::codepointinvlist::CodePointInversionList;
127
    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
128
    /// use zerovec::VarZeroVec;
129
    ///
130
    /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
131
    /// let cp_list =
132
    ///    CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
133
    /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
134
    /// let str_list = VarZeroVec::<str>::from(str_slice);
135
    ///
136
    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
137
    ///
138
    /// assert!(cpilsl.contains("bmp_max"));
139
    /// assert!(cpilsl.contains(""));
140
    /// assert!(cpilsl.contains("A"));
141
    /// assert!(cpilsl.contains("ቔ"));  // U+1254 ETHIOPIC SYLLABLE QHEE
142
    /// assert!(!cpilsl.contains("bazinga!"));
143
    /// ```
144
    pub fn contains(&self, s: &str) -> bool {
15,897✔
145
        let mut chars = s.chars();
15,897✔
146
        if let Some(first_char) = chars.next() {
15,897✔
147
            if chars.next().is_none() {
7,972✔
148
                return self.contains_char(first_char);
7,938✔
149
            }
150
        }
151
        self.str_list.binary_search(s).is_ok()
7,959✔
152
    }
15,897✔
153

154
    ///
155
    /// # Examples
156
    /// ```
157
    /// use icu::collections::codepointinvlist::CodePointInversionList;
158
    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
159
    /// use zerovec::VarZeroVec;
160
    ///
161
    /// let cp_slice = &[0, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
162
    /// let cp_list =
163
    ///     CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
164
    /// let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
165
    /// let str_list = VarZeroVec::<str>::from(str_slice);
166
    ///
167
    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
168
    ///
169
    /// assert!(cpilsl.contains32(0));
170
    /// assert!(cpilsl.contains32(0x0042));
171
    /// assert!(!cpilsl.contains32(0x0080));
172
    /// ```
173
    pub fn contains32(&self, cp: u32) -> bool {
240,279✔
174
        self.cp_inv_list.contains32(cp)
240,279✔
175
    }
240,279✔
176

177
    ///
178
    /// # Examples
179
    /// ```
180
    /// use icu::collections::codepointinvlist::CodePointInversionList;
181
    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
182
    /// use zerovec::VarZeroVec;
183
    ///
184
    /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
185
    /// let cp_list =
186
    ///    CodePointInversionList::try_clone_from_inversion_list_slice(cp_slice).unwrap();
187
    /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
188
    /// let str_list = VarZeroVec::<str>::from(str_slice);
189
    ///
190
    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
191
    ///
192
    /// assert!(cpilsl.contains_char('A'));
193
    /// assert!(cpilsl.contains_char('ቔ'));  // U+1254 ETHIOPIC SYLLABLE QHEE
194
    /// assert!(!cpilsl.contains_char('\u{1_0000}'));
195
    /// assert!(!cpilsl.contains_char('🨫'));  // U+1FA2B NEUTRAL CHESS TURNED QUEEN
196
    pub fn contains_char(&self, ch: char) -> bool {
240,274✔
197
        self.contains32(ch as u32)
240,274✔
198
    }
240,274✔
199

200
    /// Access the underlying [`CodePointInversionList`].
201
    pub fn code_points(&self) -> &CodePointInversionList<'data> {
566✔
202
        &self.cp_inv_list
203
    }
566✔
204

205
    /// Access the contained strings.
206
    pub fn strings(&self) -> &VarZeroSlice<str> {
232,516✔
207
        &self.str_list
232,516✔
208
    }
232,516✔
209
}
210

211
impl<'a> FromIterator<&'a str> for CodePointInversionListAndStringList<'_> {
212
    fn from_iter<I>(it: I) -> Self
94✔
213
    where
214
        I: IntoIterator<Item = &'a str>,
215
    {
216
        let mut builder = CodePointInversionListBuilder::new();
94✔
217
        let mut strings = Vec::<&str>::new();
94✔
218
        for s in it {
4,921✔
219
            let mut chars = s.chars();
4,827✔
220
            if let Some(first_char) = chars.next() {
4,827✔
221
                if chars.next().is_none() {
4,827✔
222
                    builder.add_char(first_char);
4,794✔
223
                    continue;
224
                }
225
            }
226
            strings.push(s);
33✔
227
        }
228

229
        // Ensure that the string list is sorted. If not, the binary search that
230
        // is used for `.contains(&str)` will return garbase otuput.
231
        strings.sort_unstable();
94✔
232
        strings.dedup();
94✔
233

234
        let cp_inv_list = builder.build();
94✔
235
        let str_list = VarZeroVec::<str>::from(&strings);
94✔
236

237
        CodePointInversionListAndStringList {
94✔
238
            cp_inv_list,
94✔
239
            str_list,
94✔
240
        }
241
    }
94✔
242
}
243

244
/// Custom Errors for [`CodePointInversionListAndStringList`].
UNCOV
245
#[derive(Display, Debug)]
×
246
pub enum InvalidStringList {
247
    /// A string in the string list had an invalid length
UNCOV
248
    #[displaydoc("Invalid string length for string: {0}")]
×
UNCOV
249
    InvalidStringLength(String),
×
250
    /// A string in the string list appears more than once
UNCOV
251
    #[displaydoc("String list has duplicate: {0}")]
×
UNCOV
252
    StringListNotUnique(String),
×
253
    /// Two strings in the string list compare to each other opposite of sorted order
UNCOV
254
    #[displaydoc("Strings in string list not in sorted order: ({0}, {1})")]
×
UNCOV
255
    StringListNotSorted(String, String),
×
256
}
257

258
#[cfg(test)]
mod tests {
    use super::*;

    /// Inversion list slice shared by the tests that need a non-trivial code point set.
    const WIDE_CP_SLICE: &[u32] = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];

    /// Builds the code point list and string list from plain slices and runs them
    /// through `try_from`, returning its result unchanged.
    fn try_build(
        inv_list: &[u32],
        strs: &[&str],
    ) -> Result<CodePointInversionListAndStringList<'static>, InvalidStringList> {
        let cp_list =
            CodePointInversionList::try_clone_from_inversion_list_slice(inv_list).unwrap();
        let str_list = VarZeroVec::<str>::from(strs);
        CodePointInversionListAndStringList::try_from(cp_list, str_list)
    }

    #[test]
    fn test_size_has_strings() {
        let set = try_build(WIDE_CP_SLICE, &["ascii_max", "bmp_max", "unicode_max", "zero"])
            .unwrap();

        assert!(set.has_strings());
        assert_eq!(8, set.size());
    }

    #[test]
    fn test_empty_string_allowed() {
        let set = try_build(
            WIDE_CP_SLICE,
            &["", "ascii_max", "bmp_max", "unicode_max", "zero"],
        )
        .unwrap();

        assert!(set.has_strings());
        assert_eq!(9, set.size());
    }

    #[test]
    fn test_invalid_string() {
        let outcome = try_build(&[0, 1], &["a"]);

        assert!(matches!(
            outcome,
            Err(InvalidStringList::InvalidStringLength(_))
        ));
    }

    #[test]
    fn test_invalid_string_list_has_duplicate() {
        let outcome = try_build(&[0, 1], &["abc", "abc"]);

        assert!(matches!(
            outcome,
            Err(InvalidStringList::StringListNotUnique(_))
        ));
    }

    #[test]
    fn test_invalid_string_list_not_sorted() {
        let outcome = try_build(&[0, 1], &["xyz", "abc"]);

        assert!(matches!(
            outcome,
            Err(InvalidStringList::StringListNotSorted(_, _))
        ));
    }

    #[test]
    fn test_from_iter_invariants() {
        // The same members in two different orders, each with a duplicate.
        let first_input = ["a", "abc", "xyz", "abc"];
        let second_input = ["xyz", "abc", "a", "abc"];

        let first_set = CodePointInversionListAndStringList::from_iter(first_input);
        let second_set = CodePointInversionListAndStringList::from_iter(second_input);

        // Input order and duplicates must not affect the resulting set.
        assert_eq!(first_set, second_set);

        assert!(first_set.has_strings());
        assert!(first_set.contains("abc"));
        assert!(first_set.contains("xyz"));
        assert!(!first_set.contains("def"));

        assert_eq!(1, first_set.cp_inv_list.size());
        assert!(first_set.contains_char('a'));
        assert!(!first_set.contains_char('0'));
        assert!(!first_set.contains_char('q'));

        assert_eq!(3, first_set.size());
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc