• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 13958601093

19 Mar 2025 04:17PM UTC coverage: 74.164% (-1.5%) from 75.71%
13958601093

push

github

web-flow
Clean up properties docs (#6315)

58056 of 78281 relevant lines covered (74.16%)

819371.32 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.0
/components/properties/src/code_point_set.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::provider::*;
6
use core::ops::RangeInclusive;
7
use icu_collections::codepointinvlist::CodePointInversionList;
8
use icu_provider::marker::ErasedMarker;
9
use icu_provider::prelude::*;
10

11
/// A set of Unicode code points. Access its data via the borrowed version,
12
/// [`CodePointSetDataBorrowed`].
13
///
14
/// # Example
15
/// ```rust
16
/// use icu::properties::CodePointSetData;
17
/// use icu::properties::props::Alphabetic;
18
///
19
/// let alphabetic = CodePointSetData::new::<Alphabetic>();
20
///
21
/// assert!(!alphabetic.contains('3'));
22
/// assert!(!alphabetic.contains('੩'));  // U+0A69 GURMUKHI DIGIT THREE
23
/// assert!(alphabetic.contains('A'));
24
/// assert!(alphabetic.contains('Ä'));  // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
25
/// ```
26
#[derive(Debug)]
×
27
pub struct CodePointSetData {
28
    data: DataPayload<ErasedMarker<PropertyCodePointSet<'static>>>,
×
29
}
30

31
impl CodePointSetData {
32
    /// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`].
33
    ///
34
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
35
    ///
36
    /// [📚 Help choosing a constructor](icu_provider::constructors)
37
    #[allow(clippy::new_ret_no_self)]
38
    #[cfg(feature = "compiled_data")]
39
    pub const fn new<P: BinaryProperty>() -> CodePointSetDataBorrowed<'static> {
789✔
40
        CodePointSetDataBorrowed::new::<P>()
789✔
41
    }
789✔
42

43
    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
44
    pub fn try_new_unstable<P: BinaryProperty>(
1,550✔
45
        provider: &(impl DataProvider<P::DataMarker> + ?Sized),
46
    ) -> Result<CodePointSetData, DataError> {
47
        Ok(CodePointSetData::from_data(
1,550✔
48
            provider.load(Default::default())?.payload,
1,550✔
49
        ))
50
    }
1,550✔
51

52
    /// Construct a borrowed version of this type that can be queried.
53
    ///
54
    /// This owned version is returned by functions that use a runtime data provider.
55
    #[inline]
56
    pub fn as_borrowed(&self) -> CodePointSetDataBorrowed<'_> {
10,921✔
57
        CodePointSetDataBorrowed {
58
            set: self.data.get(),
10,921✔
59
        }
60
    }
10,921✔
61

62
    /// Construct a new one from loaded data
63
    ///
64
    /// Typically it is preferable to use getters like [`load_ascii_hex_digit()`] instead
65
    pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
12,547✔
66
    where
67
        M: DynamicDataMarker<DataStruct = PropertyCodePointSet<'static>>,
68
    {
69
        Self { data: data.cast() }
12,547✔
70
    }
12,547✔
71

72
    /// Construct a new [`CodePointSetData`] from an owned [`CodePointInversionList`]
73
    pub fn from_code_point_inversion_list(set: CodePointInversionList<'static>) -> Self {
10,997✔
74
        let set = PropertyCodePointSet::from_code_point_inversion_list(set);
10,997✔
75
        CodePointSetData::from_data(
10,997✔
76
            DataPayload::<ErasedMarker<PropertyCodePointSet<'static>>>::from_owned(set),
10,997✔
77
        )
78
    }
10,997✔
79

80
    /// Convert this type to a [`CodePointInversionList`] as a borrowed value.
81
    ///
82
    /// The data backing this is extensible and supports multiple implementations.
83
    /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
84
    /// added, and users may select which at data generation time.
85
    ///
86
    /// This method returns an `Option` in order to return `None` when the backing data provider
87
    /// cannot return a [`CodePointInversionList`], or cannot do so within the expected constant time
88
    /// constraint.
89
    pub fn as_code_point_inversion_list(&self) -> Option<&CodePointInversionList<'_>> {
8✔
90
        self.data.get().as_code_point_inversion_list()
8✔
91
    }
8✔
92

93
    /// Convert this type to a [`CodePointInversionList`], borrowing if possible,
94
    /// otherwise allocating a new [`CodePointInversionList`].
95
    ///
96
    /// The data backing this is extensible and supports multiple implementations.
97
    /// Currently it is always [`CodePointInversionList`]; however in the future more backends may be
98
    /// added, and users may select which at data generation time.
99
    ///
100
    /// The performance of the conversion to this specific return type will vary
101
    /// depending on the data structure that is backing `self`.
102
    pub fn to_code_point_inversion_list(&self) -> CodePointInversionList<'_> {
2,213✔
103
        self.data.get().to_code_point_inversion_list()
2,213✔
104
    }
2,213✔
105
}
106

107
/// A borrowed wrapper around code point set data, returned by
108
/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
109
#[derive(Clone, Copy, Debug)]
×
110
pub struct CodePointSetDataBorrowed<'a> {
111
    set: &'a PropertyCodePointSet<'a>,
×
112
}
113

114
impl CodePointSetDataBorrowed<'static> {
115
    /// Creates a new [`CodePointSetDataBorrowed`] for a [`BinaryProperty`].
116
    ///
117
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
118
    ///
119
    /// [📚 Help choosing a constructor](icu_provider::constructors)
120
    #[inline]
121
    #[cfg(feature = "compiled_data")]
122
    pub const fn new<P: BinaryProperty>() -> Self {
789✔
123
        CodePointSetDataBorrowed { set: P::SINGLETON }
124
    }
789✔
125
    /// Cheaply converts a [`CodePointSetDataBorrowed<'static>`] into a [`CodePointSetData`].
126
    ///
127
    /// Note: Due to branching and indirection, using [`CodePointSetData`] might inhibit some
128
    /// compile-time optimizations that are possible with [`CodePointSetDataBorrowed`].
129
    pub const fn static_to_owned(self) -> CodePointSetData {
603✔
130
        CodePointSetData {
603✔
131
            data: DataPayload::from_static_ref(self.set),
603✔
132
        }
133
    }
603✔
134
}
135

136
impl<'a> CodePointSetDataBorrowed<'a> {
137
    /// Check if the set contains a character
138
    ///
139
    /// ```rust
140
    /// use icu::properties::CodePointSetData;
141
    /// use icu::properties::props::Alphabetic;
142
    ///
143
    /// let alphabetic = CodePointSetData::new::<Alphabetic>();
144
    ///
145
    /// assert!(!alphabetic.contains('3'));
146
    /// assert!(!alphabetic.contains('੩'));  // U+0A69 GURMUKHI DIGIT THREE
147
    /// assert!(alphabetic.contains('A'));
148
    /// assert!(alphabetic.contains('Ä'));  // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS
149
    /// ```
150
    #[inline]
151
    pub fn contains(self, ch: char) -> bool {
712,895✔
152
        self.set.contains(ch)
712,895✔
153
    }
712,895✔
154

155
    /// See [`Self::contains`].
156
    #[inline]
157
    pub fn contains32(self, ch: u32) -> bool {
3✔
158
        self.set.contains32(ch)
3✔
159
    }
3✔
160

161
    /// Yields an [`Iterator`] returning the ranges of the code points that are
162
    /// included in the [`CodePointSetData`]
163
    ///
164
    /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
165
    /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
166
    /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
167
    ///
168
    /// # Example
169
    ///
170
    /// ```
171
    /// use icu::properties::props::Alphabetic;
172
    /// use icu::properties::CodePointSetData;
173
    ///
174
    /// let alphabetic = CodePointSetData::new::<Alphabetic>();
175
    /// let mut ranges = alphabetic.iter_ranges();
176
    ///
177
    /// assert_eq!(Some(0x0041..=0x005A), ranges.next()); // 'A'..'Z'
178
    /// assert_eq!(Some(0x0061..=0x007A), ranges.next()); // 'a'..'z'
179
    /// ```
180
    #[inline]
181
    pub fn iter_ranges(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
95✔
182
        self.set.iter_ranges()
95✔
183
    }
95✔
184

185
    /// Yields an [`Iterator`] returning the ranges of the code points that are
186
    /// *not* included in the [`CodePointSetData`]
187
    ///
188
    /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
189
    /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
190
    /// behavior of ranges, ex: `UnicodeSet::contains(UChar32 start, UChar32 end)`.
191
    ///
192
    /// # Example
193
    ///
194
    /// ```
195
    /// use icu::properties::props::Alphabetic;
196
    /// use icu::properties::CodePointSetData;
197
    ///
198
    /// let alphabetic = CodePointSetData::new::<Alphabetic>();
199
    /// let mut ranges = alphabetic.iter_ranges_complemented();
200
    ///
201
    /// assert_eq!(Some(0x0000..=0x0040), ranges.next()); // everything before 'A'
202
    /// assert_eq!(Some(0x005B..=0x0060), ranges.next()); // between 'Z' and 'a'
203
    /// ```
204
    #[inline]
205
    pub fn iter_ranges_complemented(self) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
65✔
206
        self.set.iter_ranges_complemented()
65✔
207
    }
65✔
208
}
209

210
/// A binary Unicode character property.
211
///
212
/// The descriptions of most properties are taken from [`TR44`], the documentation for the
213
/// Unicode Character Database.  Some properties are instead defined in [`TR18`], the
214
/// documentation for Unicode regular expressions. In particular, Annex C of this document
215
/// defines properties for POSIX compatibility.
216
///
217
/// <div class="stab unstable">
218
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
219
/// trait, please consider using a type from the implementors listed below.
220
/// </div>
221
///
222
/// [`TR44`]: https://www.unicode.org/reports/tr44
223
/// [`TR18`]: https://www.unicode.org/reports/tr18
224
pub trait BinaryProperty: crate::private::Sealed + Sized {
225
    #[doc(hidden)]
226
    type DataMarker: DataMarker<DataStruct = PropertyCodePointSet<'static>>;
227
    #[doc(hidden)]
228
    #[cfg(feature = "compiled_data")]
229
    const SINGLETON: &'static PropertyCodePointSet<'static>;
230
    /// The name of this property
231
    const NAME: &'static [u8];
232
    /// The abbreviated name of this property, if it exists, otherwise the name
233
    const SHORT_NAME: &'static [u8];
234

235
    /// Convenience method for `CodePointSetData::new().contains(ch)`
236
    ///
237
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
238
    #[cfg(feature = "compiled_data")]
239
    fn for_char(ch: char) -> bool {
×
240
        CodePointSetData::new::<Self>().contains(ch)
×
241
    }
×
242
}
243

244
#[cfg(test)]
245
mod tests {
246
    #[test]
247
    fn test_general_category() {
2✔
248
        use icu::properties::props::GeneralCategory;
249
        use icu::properties::props::GeneralCategoryGroup;
250
        use icu::properties::CodePointMapData;
251

252
        let digits_data = CodePointMapData::<GeneralCategory>::new()
1✔
253
            .get_set_for_value_group(GeneralCategoryGroup::Number);
254
        let digits = digits_data.as_borrowed();
1✔
255

256
        assert!(digits.contains('5'));
1✔
257
        assert!(digits.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE
1✔
258
        assert!(digits.contains('\u{096b}')); // U+096B DEVANAGARI DIGIT FIVE
1✔
259

260
        assert!(!digits.contains('A'));
1✔
261
    }
2✔
262

263
    #[test]
264
    fn test_script() {
2✔
265
        use icu::properties::props::Script;
266
        use icu::properties::CodePointMapData;
267

268
        let thai_data = CodePointMapData::<Script>::new().get_set_for_value(Script::Thai);
1✔
269
        let thai = thai_data.as_borrowed();
1✔
270

271
        assert!(thai.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI
1✔
272
        assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO
1✔
273

274
        assert!(!thai.contains('A'));
1✔
275
        assert!(!thai.contains('\u{0e3f}')); // U+0E3F THAI CURRENCY SYMBOL BAHT
1✔
276
    }
2✔
277

278
    #[test]
279
    fn test_gc_groupings() {
2✔
280
        use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
281
        use icu::properties::CodePointMapData;
282
        use icu_collections::codepointinvlist::CodePointInversionListBuilder;
283

284
        let test_group = |category: GeneralCategoryGroup, subcategories: &[GeneralCategory]| {
7✔
285
            let category_set =
7✔
286
                CodePointMapData::<GeneralCategory>::new().get_set_for_value_group(category);
7✔
287
            let category_set = category_set
7✔
288
                .as_code_point_inversion_list()
289
                .expect("The data should be valid");
290

291
            let mut builder = CodePointInversionListBuilder::new();
7✔
292
            for &subcategory in subcategories {
7✔
293
                let gc_set_data =
294
                    CodePointMapData::<GeneralCategory>::new().get_set_for_value(subcategory);
30✔
295
                let gc_set = gc_set_data.as_borrowed();
30✔
296
                for range in gc_set.iter_ranges() {
4,129✔
297
                    builder.add_range32(range);
4,099✔
298
                }
299
            }
30✔
300
            let combined_set = builder.build();
7✔
301
            println!("{category:?} {subcategories:?}");
7✔
302
            assert_eq!(
14✔
303
                category_set.get_inversion_list_vec(),
7✔
304
                combined_set.get_inversion_list_vec()
7✔
305
            );
306
        };
7✔
307

308
        test_group(
1✔
309
            GeneralCategoryGroup::Letter,
310
            &[
311
                GeneralCategory::UppercaseLetter,
312
                GeneralCategory::LowercaseLetter,
313
                GeneralCategory::TitlecaseLetter,
314
                GeneralCategory::ModifierLetter,
315
                GeneralCategory::OtherLetter,
316
            ],
317
        );
318
        test_group(
1✔
319
            GeneralCategoryGroup::Other,
320
            &[
321
                GeneralCategory::Control,
322
                GeneralCategory::Format,
323
                GeneralCategory::Unassigned,
324
                GeneralCategory::PrivateUse,
325
                GeneralCategory::Surrogate,
326
            ],
327
        );
328
        test_group(
1✔
329
            GeneralCategoryGroup::Mark,
330
            &[
331
                GeneralCategory::SpacingMark,
332
                GeneralCategory::EnclosingMark,
333
                GeneralCategory::NonspacingMark,
334
            ],
335
        );
336
        test_group(
1✔
337
            GeneralCategoryGroup::Number,
338
            &[
339
                GeneralCategory::DecimalNumber,
340
                GeneralCategory::LetterNumber,
341
                GeneralCategory::OtherNumber,
342
            ],
343
        );
344
        test_group(
1✔
345
            GeneralCategoryGroup::Punctuation,
346
            &[
347
                GeneralCategory::ConnectorPunctuation,
348
                GeneralCategory::DashPunctuation,
349
                GeneralCategory::ClosePunctuation,
350
                GeneralCategory::FinalPunctuation,
351
                GeneralCategory::InitialPunctuation,
352
                GeneralCategory::OtherPunctuation,
353
                GeneralCategory::OpenPunctuation,
354
            ],
355
        );
356
        test_group(
1✔
357
            GeneralCategoryGroup::Symbol,
358
            &[
359
                GeneralCategory::CurrencySymbol,
360
                GeneralCategory::ModifierSymbol,
361
                GeneralCategory::MathSymbol,
362
                GeneralCategory::OtherSymbol,
363
            ],
364
        );
365
        test_group(
1✔
366
            GeneralCategoryGroup::Separator,
367
            &[
368
                GeneralCategory::LineSeparator,
369
                GeneralCategory::ParagraphSeparator,
370
                GeneralCategory::SpaceSeparator,
371
            ],
372
        );
373
    }
2✔
374

375
    #[test]
376
    fn test_gc_surrogate() {
2✔
377
        use icu::properties::props::GeneralCategory;
378
        use icu::properties::CodePointMapData;
379

380
        let surrogates_data = CodePointMapData::<GeneralCategory>::new()
2✔
381
            .get_set_for_value(GeneralCategory::Surrogate);
1✔
382
        let surrogates = surrogates_data.as_borrowed();
1✔
383

384
        assert!(surrogates.contains32(0xd800));
1✔
385
        assert!(surrogates.contains32(0xd900));
1✔
386
        assert!(surrogates.contains32(0xdfff));
1✔
387

388
        assert!(!surrogates.contains('A'));
1✔
389
    }
2✔
390
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc