• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 13958601093

19 Mar 2025 04:17PM UTC coverage: 74.164% (-1.5%) from 75.71%
13958601093

push

github

web-flow
Clean up properties docs (#6315)

58056 of 78281 relevant lines covered (74.16%)

819371.32 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.73
/components/experimental/src/unicodeset_parse/parse.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use alloc::borrow::Cow;
6
use alloc::collections::{BTreeMap, BTreeSet};
7
use alloc::fmt::Display;
8
use alloc::format;
9
use alloc::string::{String, ToString};
10
use alloc::vec::Vec;
11
use core::{iter::Peekable, str::CharIndices};
12

13
use icu_collections::{
14
    codepointinvlist::{CodePointInversionList, CodePointInversionListBuilder},
15
    codepointinvliststringlist::CodePointInversionListAndStringList,
16
};
17
use icu_properties::script::ScriptWithExtensions;
18
use icu_properties::{
19
    props::{
20
        CanonicalCombiningClass, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
21
        GraphemeClusterBreak, Script, SentenceBreak, WordBreak,
22
    },
23
    CodePointMapData,
24
};
25
use icu_properties::{
26
    props::{PatternWhiteSpace, XidContinue, XidStart},
27
    CodePointSetData,
28
};
29
use icu_properties::{provider::*, PropertyParser};
30
use icu_provider::prelude::*;
31

32
/// The kind of error that occurred.
33
#[derive(Debug, Clone, Copy, PartialEq, Eq, displaydoc::Display)]
48✔
34
#[non_exhaustive]
35
pub enum ParseErrorKind {
36
    /// An unexpected character was encountered.
37
    ///
38
    /// This variant implies the other variants
39
    /// (notably `UnknownProperty` and `Unimplemented`) do not apply.
40
    #[displaydoc("An unexpected character was encountered")]
41
    UnexpectedChar(char),
×
42
    /// The property name or value is unknown.
43
    ///
44
    /// For property names, make sure you use the spelling
45
    /// defined in [ECMA-262](https://tc39.es/ecma262/#table-nonbinary-unicode-properties).
46
    #[displaydoc("The property name or value is unknown")]
47
    UnknownProperty,
48
    /// A reference to an unknown variable.
49
    UnknownVariable,
50
    /// A variable of a certain type occurring in an unexpected context.
51
    UnexpectedVariable,
52
    /// The source is an incomplete unicode set.
53
    Eof,
54
    /// Something unexpected went wrong with our code. Please file a bug report on GitHub.
55
    Internal,
56
    /// The provided syntax is not supported by us.
57
    ///
58
    /// Note that unknown properties will return the
59
    /// `UnknownProperty` variant, not this one.
60
    #[displaydoc("The provided syntax is not supported by us.")]
61
    Unimplemented,
62
    /// The provided escape sequence is not a valid Unicode code point or represents too many code points.
63
    InvalidEscape,
64
}
65
use zerovec::VarZeroVec;
66
use ParseErrorKind as PEK;
67

68
impl ParseErrorKind {
69
    fn with_offset(self, offset: usize) -> ParseError {
302✔
70
        ParseError {
302✔
71
            offset: Some(offset),
302✔
72
            kind: self,
73
        }
74
    }
302✔
75
}
76

77
impl From<ParseErrorKind> for ParseError {
78
    fn from(kind: ParseErrorKind) -> Self {
14,327✔
79
        ParseError { offset: None, kind }
14,327✔
80
    }
14,327✔
81
}
82

83
/// The error type returned by the `parse` functions in this crate.
84
///
85
/// See [`ParseError::fmt_with_source`] for pretty-printing and [`ParseErrorKind`] for the
86
/// different types of errors represented by this struct.
87
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
×
88
pub struct ParseError {
89
    // offset is the index to an arbitrary byte in the last character in the source that makes sense
90
    // to display as location for the error, e.g., the unexpected character itself or
91
    // for an unknown property name the last character of the name.
92
    offset: Option<usize>,
×
93
    kind: ParseErrorKind,
×
94
}
95

96
type Result<T, E = ParseError> = core::result::Result<T, E>;
97

98
impl ParseError {
99
    /// Pretty-prints this error and if applicable, shows where the error occurred in the source.
100
    ///
101
    /// Must be called with the same source that was used to parse the set.
102
    ///
103
    /// # Examples
104
    ///
105
    /// ```
106
    /// use icu::experimental::unicodeset_parse::*;
107
    ///
108
    /// let source = "[[abc]-x]";
109
    /// let set = parse(source);
110
    /// assert!(set.is_err());
111
    /// let err = set.unwrap_err();
112
    /// assert_eq!(
113
    ///     err.fmt_with_source(source).to_string(),
114
    ///     "[[abc]-x← error: unexpected character 'x'"
115
    /// );
116
    /// ```
117
    ///
118
    /// ```
119
    /// use icu::experimental::unicodeset_parse::*;
120
    ///
121
    /// let source = r"[\N{LATIN CAPITAL LETTER A}]";
122
    /// let set = parse(source);
123
    /// assert!(set.is_err());
124
    /// let err = set.unwrap_err();
125
    /// assert_eq!(
126
    ///     err.fmt_with_source(source).to_string(),
127
    ///     r"[\N← error: unimplemented"
128
    /// );
129
    /// ```
130
    pub fn fmt_with_source(&self, source: &str) -> impl Display {
48✔
131
        let ParseError { offset, kind } = *self;
48✔
132

133
        if kind == ParseErrorKind::Eof {
48✔
134
            return format!("{source}← error: unexpected end of input");
4✔
135
        }
136
        let mut s = String::new();
44✔
137
        if let Some(offset) = offset {
44✔
138
            if offset < source.len() {
44✔
139
                // offset points to any byte of the last character we want to display.
140
                // in the case of ASCII, this is easy - we just display bytes [..=offset].
141
                // however, if the last character is more than one byte in UTF-8
142
                // we cannot use ..=offset, because that would potentially include only partial
143
                // bytes of last character in our string. hence we must find the start of the
144
                // following character and use that as the (exclusive) end of our string.
145

146
                // offset points into the last character we want to include, hence the start of the
147
                // first character we want to exclude is at least offset + 1.
148
                let mut exclusive_end = offset + 1;
44✔
149
                // TODO: replace this loop with str::ceil_char_boundary once stable
150
                for _ in 0..3 {
46✔
151
                    // is_char_boundary returns true at the latest once exclusive_end == source.len()
152
                    if source.is_char_boundary(exclusive_end) {
46✔
153
                        break;
154
                    }
155
                    exclusive_end += 1;
2✔
156
                }
157

158
                // exclusive_end is at most source.len() due to str::is_char_boundary and at least 0 by type
159
                #[allow(clippy::indexing_slicing)]
160
                s.push_str(&source[..exclusive_end]);
44✔
161
                s.push_str("← ");
44✔
162
            }
163
        }
164
        s.push_str("error: ");
44✔
165
        match kind {
44✔
166
            ParseErrorKind::UnexpectedChar(c) => {
31✔
167
                s.push_str(&format!("unexpected character '{}'", c.escape_debug()));
31✔
168
            }
169
            ParseErrorKind::UnknownProperty => {
170
                s.push_str("unknown property");
4✔
171
            }
172
            ParseErrorKind::UnknownVariable => {
173
                s.push_str("unknown variable");
×
174
            }
175
            ParseErrorKind::UnexpectedVariable => {
176
                s.push_str("unexpected variable");
6✔
177
            }
178
            ParseErrorKind::Eof => {
179
                s.push_str("unexpected end of input");
×
180
            }
181
            ParseErrorKind::Internal => {
182
                s.push_str("internal error");
×
183
            }
184
            ParseErrorKind::Unimplemented => {
185
                s.push_str("unimplemented");
1✔
186
            }
187
            ParseErrorKind::InvalidEscape => {
188
                s.push_str("invalid escape sequence");
2✔
189
            }
190
        }
191

192
        s
44✔
193
    }
48✔
194

195
    /// Returns the [`ParseErrorKind`] of this error.
196
    pub fn kind(&self) -> ParseErrorKind {
×
197
        self.kind
×
198
    }
×
199

200
    /// Returns the offset of this error in the source string, if it was specified.
201
    pub fn offset(&self) -> Option<usize> {
×
202
        self.offset
×
203
    }
×
204

205
    fn or_with_offset(self, offset: usize) -> Self {
4✔
206
        match self.offset {
4✔
207
            Some(_) => self,
×
208
            None => ParseError {
4✔
209
                offset: Some(offset),
4✔
210
                ..self
211
            },
4✔
212
        }
213
    }
4✔
214
}
215

216
/// The value of a variable in a UnicodeSet. Used as value type in [`VariableMap`].
217
#[derive(Debug, Clone)]
46✔
218
#[non_exhaustive]
219
pub enum VariableValue<'a> {
220
    /// A UnicodeSet, represented as a [`CodePointInversionListAndStringList`](CodePointInversionListAndStringList).
221
    UnicodeSet(CodePointInversionListAndStringList<'a>),
19✔
222
    // in theory, a one-code-point string is always the same as a char, but we might want to keep
223
    // this variant for efficiency?
224
    /// A single code point.
225
    Char(char),
23✔
226
    /// A string. It is guaranteed that when returned from a VariableMap, this variant never contains exactly one code point.
227
    String(Cow<'a, str>),
4✔
228
}
229

230
/// The map used for parsing UnicodeSets with variable support. See [`parse_with_variables`].
231
#[derive(Debug, Clone, Default)]
732✔
232
pub struct VariableMap<'a>(BTreeMap<String, VariableValue<'a>>);
366✔
233

234
impl<'a> VariableMap<'a> {
235
    /// Creates a new empty map.
236
    pub fn new() -> Self {
1✔
237
        Self::default()
1✔
238
    }
1✔
239

240
    /// Removes a key from the map, returning the value at the key if the key
241
    /// was previously in the map.
242
    pub fn remove(&mut self, key: &str) -> Option<VariableValue<'a>> {
×
243
        self.0.remove(key)
×
244
    }
×
245

246
    /// Get a reference to the value associated with this key, if it exists.
247
    pub fn get(&self, key: &str) -> Option<&VariableValue<'a>> {
7✔
248
        self.0.get(key)
7✔
249
    }
7✔
250

251
    /// Insert a `VariableValue` into the `VariableMap`.
252
    ///
253
    /// Returns `Err` with the old value, if it exists, and does not update the map.
254
    pub fn insert(&mut self, key: String, value: VariableValue<'a>) -> Result<(), &VariableValue> {
51✔
255
        // borrow-checker shenanigans, otherwise we could use if let
256
        if self.0.contains_key(&key) {
51✔
257
            // we just checked that this key exists
258
            #[allow(clippy::indexing_slicing)]
259
            return Err(&self.0[&key]);
×
260
        }
261

262
        if let VariableValue::String(s) = &value {
51✔
263
            let mut chars = s.chars();
21✔
264
            if let (Some(c), None) = (chars.next(), chars.next()) {
21✔
265
                self.0.insert(key, VariableValue::Char(c));
16✔
266
                return Ok(());
16✔
267
            };
268
        }
269

270
        self.0.insert(key, value);
35✔
271
        Ok(())
35✔
272
    }
51✔
273

274
    /// Insert a `char` into the `VariableMap`.
275
    ///
276
    /// Returns `Err` with the old value, if it exists, and does not update the map.
277
    pub fn insert_char(&mut self, key: String, c: char) -> Result<(), &VariableValue> {
12✔
278
        // borrow-checker shenanigans, otherwise we could use if let
279
        if self.0.contains_key(&key) {
12✔
280
            // we just checked that this key exists
281
            #[allow(clippy::indexing_slicing)]
282
            return Err(&self.0[&key]);
1✔
283
        }
284

285
        self.0.insert(key, VariableValue::Char(c));
11✔
286
        Ok(())
11✔
287
    }
12✔
288

289
    /// Insert a `String` of any length into the `VariableMap`.
290
    ///
291
    /// Returns `Err` with the old value, if it exists, and does not update the map.
292
    pub fn insert_string(&mut self, key: String, s: String) -> Result<(), &VariableValue> {
3✔
293
        // borrow-checker shenanigans, otherwise we could use if let
294
        if self.0.contains_key(&key) {
3✔
295
            // we just checked that this key exists
296
            #[allow(clippy::indexing_slicing)]
297
            return Err(&self.0[&key]);
×
298
        }
299

300
        let mut chars = s.chars();
3✔
301
        let val = match (chars.next(), chars.next()) {
3✔
302
            (Some(c), None) => VariableValue::Char(c),
×
303
            _ => VariableValue::String(Cow::Owned(s)),
3✔
304
        };
305

306
        self.0.insert(key, val);
3✔
307
        Ok(())
3✔
308
    }
3✔
309

310
    /// Insert a `&str` of any length into the `VariableMap`.
311
    ///
312
    /// Returns `Err` with the old value, if it exists, and does not update the map.
313
    pub fn insert_str(&mut self, key: String, s: &'a str) -> Result<(), &VariableValue> {
×
314
        // borrow-checker shenanigans, otherwise we could use if let
315
        if self.0.contains_key(&key) {
×
316
            // we just checked that this key exists
317
            #[allow(clippy::indexing_slicing)]
318
            return Err(&self.0[&key]);
×
319
        }
320

321
        let mut chars = s.chars();
×
322
        let val = match (chars.next(), chars.next()) {
×
323
            (Some(c), None) => VariableValue::Char(c),
×
324
            _ => VariableValue::String(Cow::Borrowed(s)),
×
325
        };
326

327
        self.0.insert(key, val);
×
328
        Ok(())
×
329
    }
×
330

331
    /// Insert a [`CodePointInversionListAndStringList`](CodePointInversionListAndStringList) into the `VariableMap`.
332
    ///
333
    /// Returns `Err` with the old value, if it exists, and does not update the map.
334
    pub fn insert_set(
3✔
335
        &mut self,
336
        key: String,
337
        set: CodePointInversionListAndStringList<'a>,
338
    ) -> Result<(), &VariableValue> {
339
        // borrow-checker shenanigans, otherwise we could use if let
340
        if self.0.contains_key(&key) {
3✔
341
            // we just checked that this key exists
342
            #[allow(clippy::indexing_slicing)]
343
            return Err(&self.0[&key]);
×
344
        }
345
        self.0.insert(key, VariableValue::UnicodeSet(set));
3✔
346
        Ok(())
3✔
347
    }
3✔
348
}
349

350
// this ignores the ambiguity between \-escapes and \p{} perl properties. it assumes it is in a context where \p is just 'p'
351
// returns whether the provided char signifies the start of a literal char (raw or escaped - so \ is a legal char start)
352
// important: assumes c is not pattern_white_space
353
fn legal_char_start(c: char) -> bool {
1,648✔
354
    !(c == '&' || c == '-' || c == '$' || c == '^' || c == '[' || c == ']' || c == '{')
1,648✔
355
}
1,648✔
356

357
// same as `legal_char_start` but adapted to the charInString nonterminal. \ is allowed due to escapes.
358
// important: assumes c is not pattern_white_space
359
fn legal_char_in_string_start(c: char) -> bool {
283✔
360
    c != '}'
283✔
361
}
283✔
362

363
#[derive(Debug)]
×
364
enum SingleOrMultiChar {
365
    Single(char),
×
366
    // Multi is a marker that indicates parsing was paused and needs to be resumed using parse_multi_escape* when
367
    // this token is consumed. The contained char is the first char of the multi sequence.
368
    Multi(char),
×
369
}
370

371
// A char or a string. `SingleOrMultiChar::Multi` represents multi-escapes in the 2+ case.
372
// invariant: a String is either zero or 2+ chars long, a one-char-string is equivalent to a single char.
373
// invariant: a char is 1+ chars long
374
#[derive(Debug)]
×
375
enum Literal {
376
    String(String),
×
377
    CharKind(SingleOrMultiChar),
×
378
}
379

380
#[derive(Debug)]
×
381
enum MainToken<'data> {
382
    // to be interpreted as value
383
    Literal(Literal),
×
384
    // inner set
385
    UnicodeSet(CodePointInversionListAndStringList<'data>),
×
386
    // anchor, only at the end of a set ([... $])
387
    DollarSign,
388
    // intersection operator, only in between two sets ([[...] & [...]])
389
    Ampersand,
390
    // difference operator, only in between two sets ([[...] - [...]])
391
    // or
392
    // range operator, only in between two chars ([a-z], [a-{z}])
393
    Minus,
394
    // ] to indicate the end of a set
395
    ClosingBracket,
396
}
397

398
impl<'data> MainToken<'data> {
399
    fn from_variable_value(val: VariableValue<'data>) -> Self {
44✔
400
        match val {
44✔
401
            VariableValue::Char(c) => {
21✔
402
                MainToken::Literal(Literal::CharKind(SingleOrMultiChar::Single(c)))
21✔
403
            }
21✔
404
            VariableValue::String(s) => {
4✔
405
                // we know that the VariableMap only contains non-length-1 Strings.
406
                MainToken::Literal(Literal::String(s.into_owned()))
4✔
407
            }
4✔
408
            VariableValue::UnicodeSet(set) => MainToken::UnicodeSet(set),
19✔
409
        }
410
    }
44✔
411
}
412

413
#[derive(Debug, Clone, Copy)]
×
414
enum Operation {
415
    Union,
416
    Difference,
417
    Intersection,
418
}
419

420
// this builds the set on-the-fly while parsing it
421
struct UnicodeSetBuilder<'a, 'b, P: ?Sized> {
422
    single_set: CodePointInversionListBuilder,
423
    string_set: BTreeSet<String>,
424
    iter: &'a mut Peekable<CharIndices<'b>>,
425
    source: &'b str,
426
    inverted: bool,
427
    variable_map: &'a VariableMap<'a>,
428
    xid_start: &'a CodePointInversionList<'a>,
429
    xid_continue: &'a CodePointInversionList<'a>,
430
    pat_ws: &'a CodePointInversionList<'a>,
431
    property_provider: &'a P,
432
}
433

434
impl<'a, 'b, P> UnicodeSetBuilder<'a, 'b, P>
435
where
436
    P: ?Sized
437
        + DataProvider<PropertyBinaryAlphabeticV1>
438
        + DataProvider<PropertyBinaryAsciiHexDigitV1>
439
        + DataProvider<PropertyBinaryBidiControlV1>
440
        + DataProvider<PropertyBinaryBidiMirroredV1>
441
        + DataProvider<PropertyBinaryCasedV1>
442
        + DataProvider<PropertyBinaryCaseIgnorableV1>
443
        + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
444
        + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
445
        + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
446
        + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
447
        + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
448
        + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
449
        + DataProvider<PropertyBinaryDashV1>
450
        + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
451
        + DataProvider<PropertyBinaryDeprecatedV1>
452
        + DataProvider<PropertyBinaryDiacriticV1>
453
        + DataProvider<PropertyBinaryEmojiComponentV1>
454
        + DataProvider<PropertyBinaryEmojiModifierBaseV1>
455
        + DataProvider<PropertyBinaryEmojiModifierV1>
456
        + DataProvider<PropertyBinaryEmojiPresentationV1>
457
        + DataProvider<PropertyBinaryEmojiV1>
458
        + DataProvider<PropertyBinaryExtendedPictographicV1>
459
        + DataProvider<PropertyBinaryExtenderV1>
460
        + DataProvider<PropertyBinaryGraphemeBaseV1>
461
        + DataProvider<PropertyBinaryGraphemeExtendV1>
462
        + DataProvider<PropertyBinaryHexDigitV1>
463
        + DataProvider<PropertyBinaryIdContinueV1>
464
        + DataProvider<PropertyBinaryIdeographicV1>
465
        + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
466
        + DataProvider<PropertyBinaryIdStartV1>
467
        + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
468
        + DataProvider<PropertyBinaryJoinControlV1>
469
        + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
470
        + DataProvider<PropertyBinaryLowercaseV1>
471
        + DataProvider<PropertyBinaryMathV1>
472
        + DataProvider<PropertyBinaryNoncharacterCodePointV1>
473
        + DataProvider<PropertyBinaryPatternSyntaxV1>
474
        + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
475
        + DataProvider<PropertyBinaryQuotationMarkV1>
476
        + DataProvider<PropertyBinaryRadicalV1>
477
        + DataProvider<PropertyBinaryRegionalIndicatorV1>
478
        + DataProvider<PropertyBinarySentenceTerminalV1>
479
        + DataProvider<PropertyBinarySoftDottedV1>
480
        + DataProvider<PropertyBinaryTerminalPunctuationV1>
481
        + DataProvider<PropertyBinaryUnifiedIdeographV1>
482
        + DataProvider<PropertyBinaryUppercaseV1>
483
        + DataProvider<PropertyBinaryVariationSelectorV1>
484
        + DataProvider<PropertyBinaryWhiteSpaceV1>
485
        + DataProvider<PropertyBinaryXidContinueV1>
486
        + DataProvider<PropertyBinaryXidStartV1>
487
        + DataProvider<PropertyEnumCanonicalCombiningClassV1>
488
        + DataProvider<PropertyEnumGeneralCategoryV1>
489
        + DataProvider<PropertyEnumGraphemeClusterBreakV1>
490
        + DataProvider<PropertyEnumScriptV1>
491
        + DataProvider<PropertyEnumSentenceBreakV1>
492
        + DataProvider<PropertyEnumWordBreakV1>
493
        + DataProvider<PropertyNameParseCanonicalCombiningClassV1>
494
        + DataProvider<PropertyNameParseGeneralCategoryMaskV1>
495
        + DataProvider<PropertyNameParseGraphemeClusterBreakV1>
496
        + DataProvider<PropertyNameParseScriptV1>
497
        + DataProvider<PropertyNameParseSentenceBreakV1>
498
        + DataProvider<PropertyNameParseWordBreakV1>
499
        + DataProvider<PropertyScriptWithExtensionsV1>,
500
{
501
    fn new_internal(
687✔
502
        iter: &'a mut Peekable<CharIndices<'b>>,
503
        source: &'b str,
504
        variable_map: &'a VariableMap<'a>,
505
        xid_start: &'a CodePointInversionList<'a>,
506
        xid_continue: &'a CodePointInversionList<'a>,
507
        pat_ws: &'a CodePointInversionList<'a>,
508
        provider: &'a P,
509
    ) -> Self {
510
        UnicodeSetBuilder {
687✔
511
            single_set: CodePointInversionListBuilder::new(),
687✔
512
            string_set: Default::default(),
687✔
513
            iter,
514
            source,
515
            inverted: false,
516
            variable_map,
517
            xid_start,
518
            xid_continue,
519
            pat_ws,
520
            property_provider: provider,
521
        }
×
522
    }
687✔
523

524
    // the entry point, parses a full UnicodeSet. ignores remaining input
525
    fn parse_unicode_set(&mut self) -> Result<()> {
688✔
526
        match self.must_peek_char()? {
688✔
527
            '\\' => self.parse_property_perl(),
26✔
528
            '[' => {
529
                self.iter.next();
657✔
530
                if let Some(':') = self.peek_char() {
657✔
531
                    self.parse_property_posix()
85✔
532
                } else {
533
                    self.parse_unicode_set_inner()
572✔
534
                }
535
            }
536
            '$' => {
537
                // must be variable ref to a UnicodeSet
538
                let (offset, v) = self.parse_variable()?;
3✔
539
                match v {
2✔
540
                    Some(VariableValue::UnicodeSet(s)) => {
1✔
541
                        self.single_set.add_set(s.code_points());
1✔
542
                        self.string_set
2✔
543
                            .extend(s.strings().iter().map(ToString::to_string));
1✔
544
                        Ok(())
1✔
545
                    }
1✔
546
                    Some(_) => Err(PEK::UnexpectedVariable.with_offset(offset)),
1✔
547
                    None => Err(PEK::UnexpectedChar('$').with_offset(offset)),
×
548
                }
549
            }
550
            c => self.error_here(PEK::UnexpectedChar(c)),
1✔
551
        }
552
    }
688✔
553

554
    // beginning [ is already consumed
555
    fn parse_unicode_set_inner(&mut self) -> Result<()> {
594✔
556
        // special cases for the first chars after [
557
        if self.must_peek_char()? == '^' {
594✔
558
            self.iter.next();
111✔
559
            self.inverted = true;
111✔
560
        }
561
        // whitespace allowed between ^ and - in `[^ - ....]`
562
        self.skip_whitespace();
594✔
563
        if self.must_peek_char()? == '-' {
594✔
564
            self.iter.next();
7✔
565
            self.single_set.add_char('-');
7✔
566
        }
567

568
        // repeatedly parse the following:
569
        // char
570
        // char-char
571
        // {string}
572
        // unicodeset
573
        // & and - operators, but only between unicodesets
574
        // $variables in place of strings, chars, or unicodesets
575

576
        #[derive(Debug, Clone, Copy)]
×
577
        enum State {
578
            // a state equivalent to the beginning
579
            Begin,
580
            // a state after a char. implies `prev_char` is Some(_), because we need to buffer it
581
            // in case it is part of a range, e.g., a-z
582
            Char,
583
            // in the middle of parsing a range. implies `prev_char` is Some(_), and the next
584
            // element must be a char as well
585
            CharMinus,
586
            // state directly after parsing a recursive unicode set. operators are only allowed
587
            // in this state
588
            AfterUnicodeSet,
589
            // state directly after parsing an operator. forces the next element to be a recursive
590
            // unicode set
591
            AfterOp,
592
            // state after parsing a $ (that was not a variable reference)
593
            // the only valid next option is a closing bracket
594
            AfterDollar,
595
            // state after parsing a - in an otherwise invalid position
596
            // the only valid next option is a closing bracket
597
            AfterMinus,
598
        }
599
        use State::*;
600

601
        const DEFAULT_OP: Operation = Operation::Union;
602

603
        let mut state = Begin;
594✔
604
        let mut prev_char = None;
594✔
605
        let mut operation = Operation::Union;
594✔
606

607
        loop {
2,532✔
608
            self.skip_whitespace();
2,532✔
609

610
            // for error messages
611
            let (immediate_offset, immediate_char) = self.must_peek()?;
2,532✔
612

613
            let (tok_offset, from_var, tok) = self.parse_main_token()?;
2,532✔
614
            // warning: self.iter should not be advanced any more after this point on any path to
615
            // MT::Literal(Literal::CharKind(SingleOrMultiChar::Multi)), because that variant
616
            // expects a certain self.iter state
617

618
            use MainToken as MT;
619
            use SingleOrMultiChar as SMC;
620
            match (state, tok) {
2,518✔
621
                // the end of this unicode set
622
                (
623
                    Begin | Char | CharMinus | AfterUnicodeSet | AfterDollar | AfterMinus,
624
                    MT::ClosingBracket,
625
                ) => {
626
                    if let Some(prev) = prev_char.take() {
534✔
627
                        self.single_set.add_char(prev);
142✔
628
                    }
629
                    if matches!(state, CharMinus) {
534✔
630
                        self.single_set.add_char('-');
2✔
631
                    }
632

633
                    return Ok(());
534✔
634
                }
635
                // special case ends for -
636
                // [[a-z]-]
637
                (AfterOp, MT::ClosingBracket) if matches!(operation, Operation::Difference) => {
2✔
638
                    self.single_set.add_char('-');
1✔
639
                    return Ok(());
1✔
640
                }
641
                (Begin, MT::Minus) => {
9✔
642
                    self.single_set.add_char('-');
9✔
643
                    state = AfterMinus;
9✔
644
                }
645
                // inner unicode set
646
                (Begin | Char | AfterUnicodeSet | AfterOp, MT::UnicodeSet(set)) => {
209✔
647
                    if let Some(prev) = prev_char.take() {
209✔
648
                        self.single_set.add_char(prev);
3✔
649
                    }
650

651
                    self.process_chars(operation, set.code_points().clone());
209✔
652
                    self.process_strings(
209✔
653
                        operation,
209✔
654
                        set.strings().iter().map(ToString::to_string).collect(),
209✔
655
                    );
656

657
                    operation = DEFAULT_OP;
209✔
658
                    state = AfterUnicodeSet;
209✔
659
                }
209✔
660
                // a literal char (either individually or as the start of a range if char)
661
                (
662
                    Begin | Char | AfterUnicodeSet,
663
                    MT::Literal(Literal::CharKind(SMC::Single(c))),
1,074✔
664
                ) => {
665
                    if let Some(prev) = prev_char.take() {
1,074✔
666
                        self.single_set.add_char(prev);
613✔
667
                    }
668
                    prev_char = Some(c);
1,065✔
669
                    state = Char;
1,065✔
670
                }
1,065✔
671
                // a bunch of literal chars as part of a multi-escape sequence
672
                (
673
                    Begin | Char | AfterUnicodeSet,
674
                    MT::Literal(Literal::CharKind(SMC::Multi(first_c))),
6✔
675
                ) => {
676
                    if let Some(prev) = prev_char.take() {
6✔
677
                        self.single_set.add_char(prev);
×
678
                    }
679
                    self.single_set.add_char(first_c);
4✔
680
                    self.parse_multi_escape_into_set()?;
598✔
681

682
                    // Note we cannot go to the Char state, because a multi-escape sequence of
683
                    // length > 1 cannot initiate a range
684
                    state = Begin;
3✔
685
                }
3✔
686
                // a literal string (length != 1, by the `Literal` invariant)
687
                (Begin | Char | AfterUnicodeSet, MT::Literal(Literal::String(s))) => {
59✔
688
                    if let Some(prev) = prev_char.take() {
59✔
689
                        self.single_set.add_char(prev);
21✔
690
                    }
691

692
                    self.string_set.insert(s);
59✔
693
                    state = Begin;
59✔
694
                }
59✔
695
                // parse a literal char as the end of a range
696
                (CharMinus, MT::Literal(Literal::CharKind(SMC::Single(c)))) => {
259✔
697
                    let start = prev_char.ok_or(PEK::Internal.with_offset(tok_offset))?;
259✔
698
                    let end = c;
699
                    if start > end {
259✔
700
                        // TODO(#3558): Better error message (e.g., "start greater than end in range")?
701
                        return Err(PEK::UnexpectedChar(end).with_offset(tok_offset));
4✔
702
                    }
703

704
                    self.single_set.add_range(start..=end);
255✔
705
                    prev_char = None;
255✔
706
                    state = Begin;
255✔
707
                }
255✔
708
                // start parsing a char range
709
                (Char, MT::Minus) => {
266✔
710
                    state = CharMinus;
266✔
711
                }
712
                // start parsing a unicode set difference
713
                (AfterUnicodeSet, MT::Minus) => {
30✔
714
                    operation = Operation::Difference;
30✔
715
                    state = AfterOp;
30✔
716
                }
717
                // start parsing a unicode set intersection
718
                (AfterUnicodeSet, MT::Ampersand) => {
27✔
719
                    operation = Operation::Intersection;
27✔
720
                    state = AfterOp;
27✔
721
                }
722
                (Begin | Char | AfterUnicodeSet, MT::DollarSign) => {
37✔
723
                    if let Some(prev) = prev_char.take() {
28✔
724
                        self.single_set.add_char(prev);
21✔
725
                    }
726
                    self.single_set.add_char('\u{FFFF}');
37✔
727
                    state = AfterDollar;
37✔
728
                }
729
                _ => {
730
                    // TODO(#3558): We have precise knowledge about the following MainToken here,
731
                    //  should we make use of that?
732

733
                    if from_var {
18✔
734
                        // otherwise we get error messages such as
735
                        // [$a-$← error: unexpected character '$'
736
                        // for input [$a-$b], $a = 'a', $b = "string" ;
737
                        return Err(PEK::UnexpectedVariable.with_offset(tok_offset));
5✔
738
                    }
739
                    return Err(PEK::UnexpectedChar(immediate_char).with_offset(immediate_offset));
13✔
740
                }
741
            }
742
        }
2,496✔
743
    }
572✔
744

745
    fn parse_main_token(&mut self) -> Result<(usize, bool, MainToken<'a>)> {
2,532✔
746
        let (initial_offset, first) = self.must_peek()?;
2,532✔
747
        if first == ']' {
2,532✔
748
            self.iter.next();
536✔
749
            return Ok((initial_offset, false, MainToken::ClosingBracket));
536✔
750
        }
751
        let (_, second) = self.must_peek_double()?;
1,996✔
752
        match (first, second) {
2,330✔
753
            // variable or anchor
754
            ('$', _) => {
755
                let (offset, var_or_anchor) = self.parse_variable()?;
82✔
756
                match var_or_anchor {
81✔
757
                    None => Ok((offset, false, MainToken::DollarSign)),
37✔
758
                    Some(v) => Ok((offset, true, MainToken::from_variable_value(v.clone()))),
44✔
759
                }
760
            }
761
            // string
762
            ('{', _) => self
71✔
763
                .parse_string()
764
                .map(|(offset, l)| (offset, false, MainToken::Literal(l))),
70✔
765
            // inner set
766
            ('\\', 'p' | 'P') | ('[', _) => {
767
                let mut inner_builder = UnicodeSetBuilder::new_internal(
193✔
768
                    self.iter,
193✔
769
                    self.source,
193✔
770
                    self.variable_map,
193✔
771
                    self.xid_start,
193✔
772
                    self.xid_continue,
193✔
773
                    self.pat_ws,
193✔
774
                    self.property_provider,
193✔
775
                );
776
                inner_builder.parse_unicode_set()?;
2,725✔
777
                let (single, string_set) = inner_builder.finalize();
193✔
778
                // note: offset - 1, because we already consumed full set
779
                let offset = self.must_peek_index()? - 1;
193✔
780
                let mut strings = string_set.into_iter().collect::<Vec<_>>();
192✔
781
                strings.sort();
192✔
782
                let cpilasl = CodePointInversionListAndStringList::try_from(
192✔
783
                    single.build(),
192✔
784
                    VarZeroVec::from(&strings),
192✔
785
                )
192✔
786
                .map_err(|_| PEK::Internal.with_offset(offset))?;
×
787
                Ok((offset, false, MainToken::UnicodeSet(cpilasl)))
192✔
788
            }
193✔
789
            // note: c cannot be a whitespace, because we called skip_whitespace just before
790
            // (in the main parse loop), so it's safe to call this guard function
791
            (c, _) if legal_char_start(c) => self
1,648✔
792
                .parse_char()
793
                .map(|(offset, c)| (offset, false, MainToken::Literal(Literal::CharKind(c)))),
1,304✔
794
            ('-', _) => {
795
                self.iter.next();
306✔
796
                Ok((initial_offset, false, MainToken::Minus))
306✔
797
            }
798
            ('&', _) => {
799
                self.iter.next();
29✔
800
                Ok((initial_offset, false, MainToken::Ampersand))
29✔
801
            }
802
            (c, _) => Err(PEK::UnexpectedChar(c).with_offset(initial_offset)),
1✔
803
        }
804
    }
2,532✔
805

806
    // parses a variable or an anchor. expects '$' as next token.
807
    // if this is a single $ (eg `[... $ ]` or the invalid `$ a`), then this function returns Ok(None),
808
    // otherwise Ok(Some(variable_value)).
809
    fn parse_variable(&mut self) -> Result<(usize, Option<&'a VariableValue<'a>>)> {
85✔
810
        self.consume('$')?;
85✔
811

812
        let mut res = String::new();
85✔
813
        let (mut var_offset, first_c) = self.must_peek()?;
85✔
814

815
        if !self.xid_start.contains(first_c) {
84✔
816
            // -1 because we already consumed the '$'
817
            return Ok((var_offset - 1, None));
37✔
818
        }
819

820
        res.push(first_c);
47✔
821
        self.iter.next();
47✔
822
        // important: if we are parsing a root unicodeset as a variable, we might reach EOF as
823
        // a valid end of the variable name, so we cannot use must_peek here.
824
        while let Some(&(offset, c)) = self.iter.peek() {
238✔
825
            if !self.xid_continue.contains(c) {
236✔
826
                break;
827
            }
828
            // only update the offset if we're adding a new char to our variable
829
            var_offset = offset;
191✔
830
            self.iter.next();
191✔
831
            res.push(c);
191✔
832
        }
833

834
        if let Some(v) = self.variable_map.0.get(&res) {
47✔
835
            return Ok((var_offset, Some(v)));
46✔
836
        }
837

838
        Err(PEK::UnknownVariable.with_offset(var_offset))
1✔
839
    }
85✔
840

841
    // parses and consumes: '{' (s charInString)* s '}'
842
    fn parse_string(&mut self) -> Result<(usize, Literal)> {
71✔
843
        self.consume('{')?;
71✔
844

845
        let mut buffer = String::new();
71✔
846
        let mut last_offset;
847

848
        loop {
849
            self.skip_whitespace();
354✔
850
            last_offset = self.must_peek_index()?;
354✔
851
            match self.must_peek_char()? {
353✔
852
                '}' => {
853
                    self.iter.next();
70✔
854
                    break;
855
                }
856
                // note: c cannot be a whitespace, because we called skip_whitespace just before,
857
                // so it's safe to call this guard function
858
                c if legal_char_in_string_start(c) => {
283✔
859
                    // don't need the offset, because '}' will always be the last char
860
                    let (_, c) = self.parse_char()?;
283✔
861
                    match c {
283✔
862
                        SingleOrMultiChar::Single(c) => buffer.push(c),
282✔
863
                        SingleOrMultiChar::Multi(first) => {
1✔
864
                            buffer.push(first);
1✔
865
                            self.parse_multi_escape_into_string(&mut buffer)?;
72✔
866
                        }
867
                    }
868
                }
869
                c => return self.error_here(PEK::UnexpectedChar(c)),
×
870
            }
871
        }
872

873
        let mut chars = buffer.chars();
70✔
874
        let literal = match (chars.next(), chars.next()) {
70✔
875
            (Some(c), None) => Literal::CharKind(SingleOrMultiChar::Single(c)),
14✔
876
            _ => Literal::String(buffer),
56✔
877
        };
878
        Ok((last_offset, literal))
70✔
879
    }
71✔
880

881
    // finishes a partial multi escape parse. in case of a parse error, self.single_set
882
    // may be left in an inconsistent state
883
    fn parse_multi_escape_into_set(&mut self) -> Result<()> {
4✔
884
        // note: would be good to somehow merge the two multi_escape methods. splitting up the UnicodeSetBuilder into a more
885
        // conventional parser + lexer combo might allow this.
886
        // issue is that we cannot pass this method an argument that somehow mutates `self` in the current architecture.
887
        // self.lexer.parse_multi_into_charappendable(&mut self.single_set) should work because the lifetimes are separate
888

889
        // whitespace before first char of this loop (ie, second char in this multi_escape) must be
890
        // enforced when creating the SingleOrMultiChar::Multi.
891
        let mut first = true;
4✔
892
        loop {
4✔
893
            let skipped = self.skip_whitespace();
10✔
894
            match self.must_peek_char()? {
10✔
895
                '}' => {
896
                    self.iter.next();
3✔
897
                    return Ok(());
3✔
898
                }
899
                initial_c => {
900
                    if skipped == 0 && !first {
7✔
901
                        // bracketed hex code points must be separated by whitespace
902
                        return self.error_here(PEK::UnexpectedChar(initial_c));
1✔
903
                    }
904
                    first = false;
6✔
905

906
                    let (_, c) = self.parse_hex_digits_into_char(1, 6)?;
6✔
907
                    self.single_set.add_char(c);
6✔
908
                }
909
            }
910
        }
911
    }
4✔
912

913
    // finishes a partial multi escape parse. in case of a parse error, the caller must clean up the
914
    // string if necessary.
915
    fn parse_multi_escape_into_string(&mut self, s: &mut String) -> Result<()> {
1✔
916
        // whitespace before first char of this loop (ie, second char in this multi_escape) must be
917
        // enforced when creating the SingleOrMultiChar::Multi.
918
        let mut first = true;
1✔
919
        loop {
1✔
920
            let skipped = self.skip_whitespace();
3✔
921
            match self.must_peek_char()? {
3✔
922
                '}' => {
923
                    self.iter.next();
1✔
924
                    return Ok(());
1✔
925
                }
926
                initial_c => {
927
                    if skipped == 0 && !first {
2✔
928
                        // bracketed hex code points must be separated by whitespace
929
                        return self.error_here(PEK::UnexpectedChar(initial_c));
×
930
                    }
931
                    first = false;
2✔
932

933
                    let (_, c) = self.parse_hex_digits_into_char(1, 6)?;
2✔
934
                    s.push(c);
2✔
935
                }
936
            }
937
        }
938
    }
1✔
939

940
    // starts with \ and consumes the whole escape sequence if a single
941
    // char is escaped, otherwise pauses the parse after the first char
942
    fn parse_escaped_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
133✔
943
        self.consume('\\')?;
133✔
944

945
        let (offset, next_char) = self.must_next()?;
133✔
946

947
        match next_char {
133✔
948
            'u' | 'x' if self.peek_char() == Some('{') => {
60✔
949
                // bracketedHex
950
                self.iter.next();
20✔
951

952
                self.skip_whitespace();
20✔
953
                let (_, first_c) = self.parse_hex_digits_into_char(1, 6)?;
20✔
954
                let skipped = self.skip_whitespace();
16✔
955

956
                match self.must_peek()? {
16✔
957
                    (offset, '}') => {
7✔
958
                        self.iter.next();
7✔
959
                        Ok((offset, SingleOrMultiChar::Single(first_c)))
7✔
960
                    }
7✔
961
                    // note: enforcing whitespace after the first char here, because the parse_multi_escape functions
962
                    // won't have access to this information anymore
963
                    (offset, c) if c.is_ascii_hexdigit() && skipped > 0 => {
9✔
964
                        Ok((offset, SingleOrMultiChar::Multi(first_c)))
7✔
965
                    }
7✔
966
                    (_, c) => self.error_here(PEK::UnexpectedChar(c)),
2✔
967
                }
968
            }
969
            'u' => {
970
                // 'u' hex{4}
971
                self.parse_hex_digits_into_char(4, 4)
24✔
972
                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
24✔
973
            }
974
            'x' => {
975
                // 'x' hex{2}
976
                self.parse_hex_digits_into_char(2, 2)
16✔
977
                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
15✔
978
            }
979
            'U' => {
980
                // 'U00' ('0' hex{5} | '10' hex{4})
981
                self.consume('0')?;
3✔
982
                self.consume('0')?;
136✔
983
                self.parse_hex_digits_into_char(6, 6)
3✔
984
                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
3✔
985
            }
986
            'N' => {
987
                // parse code point with name in {}
988
                // tracking issue: https://github.com/unicode-org/icu4x/issues/1397
989
                Err(PEK::Unimplemented.with_offset(offset))
1✔
990
            }
991
            'a' => Ok((offset, SingleOrMultiChar::Single('\u{0007}'))),
×
992
            'b' => Ok((offset, SingleOrMultiChar::Single('\u{0008}'))),
×
993
            't' => Ok((offset, SingleOrMultiChar::Single('\u{0009}'))),
×
994
            'n' => Ok((offset, SingleOrMultiChar::Single('\u{000A}'))),
9✔
995
            'v' => Ok((offset, SingleOrMultiChar::Single('\u{000B}'))),
×
996
            'f' => Ok((offset, SingleOrMultiChar::Single('\u{000C}'))),
×
997
            'r' => Ok((offset, SingleOrMultiChar::Single('\u{000D}'))),
8✔
998
            _ => Ok((offset, SingleOrMultiChar::Single(next_char))),
52✔
999
        }
1000
    }
133✔
1001

1002
    // starts with :, consumes the trailing :]
1003
    fn parse_property_posix(&mut self) -> Result<()> {
85✔
1004
        self.consume(':')?;
85✔
1005
        if self.must_peek_char()? == '^' {
85✔
1006
            self.inverted = true;
3✔
1007
            self.iter.next();
3✔
1008
        }
1009

1010
        self.parse_property_inner(':')?;
85✔
1011

1012
        self.consume(']')?;
162✔
1013

1014
        Ok(())
77✔
1015
    }
85✔
1016

1017
    // starts with \p{ or \P{, consumes the trailing }
1018
    fn parse_property_perl(&mut self) -> Result<()> {
26✔
1019
        self.consume('\\')?;
26✔
1020
        match self.must_next()? {
26✔
1021
            (_, 'p') => {}
1022
            (_, 'P') => self.inverted = true,
1✔
1023
            (offset, c) => return Err(PEK::UnexpectedChar(c).with_offset(offset)),
×
1024
        }
1025
        self.consume('{')?;
26✔
1026

1027
        self.parse_property_inner('}')?;
51✔
1028

1029
        Ok(())
22✔
1030
    }
26✔
1031

1032
    fn parse_property_inner(&mut self, end: char) -> Result<()> {
114✔
1033
        // UnicodeSet spec ignores whitespace, '-', and '_',
1034
        // but ECMA-262 requires '_', so we'll allow that.
1035
        // TODO(#3559): support loose matching on property names (e.g., "AS  -_-  CII_Hex_ D-igit")
1036
        // TODO(#3559): support more properties than ECMA-262
1037

1038
        let property_offset;
1039

1040
        let mut key_buffer = String::new();
114✔
1041
        let mut value_buffer = String::new();
114✔
1042

1043
        enum State {
1044
            // initial state, nothing parsed yet
1045
            Begin,
1046
            // non-empty property name
1047
            PropertyName,
1048
            // property name parsed, '=' or '≠' parsed, no value parsed yet
1049
            PropertyValueBegin,
1050
            // non-empty property name, non-empty property value
1051
            PropertyValue,
1052
        }
1053
        use State::*;
1054

1055
        let mut state = Begin;
114✔
1056
        // whether '=' (true) or '≠' (false) was parsed
1057
        let mut equality = true;
114✔
1058

1059
        loop {
114✔
1060
            self.skip_whitespace();
684✔
1061
            match (state, self.must_peek_char()?) {
1,355✔
1062
                // parse the end of the property expression
1063
                (PropertyName | PropertyValue, c) if c == end => {
539✔
1064
                    // byte index of (full) property name/value is one back
1065
                    property_offset = self.must_peek_index()? - 1;
103✔
1066
                    self.iter.next();
103✔
1067
                    break;
1068
                }
1069
                // parse the property name
1070
                // NOTE: this might be too strict, because in the case of e.g. [:value:], we might want to
1071
                // allow [:lower-case-letter:] ([:gc=lower-case-letter:] works)
1072
                (Begin | PropertyName, c) if c.is_ascii_alphanumeric() || c == '_' => {
437✔
1073
                    key_buffer.push(c);
400✔
1074
                    self.iter.next();
400✔
1075
                    state = PropertyName;
400✔
1076
                }
400✔
1077
                // parse the name-value separator
1078
                (PropertyName, c @ ('=' | '≠')) => {
33✔
1079
                    equality = c == '=';
33✔
1080
                    self.iter.next();
33✔
1081
                    state = PropertyValueBegin;
31✔
1082
                }
31✔
1083
                // parse the property value
1084
                (PropertyValue | PropertyValueBegin, c) if c != end => {
140✔
1085
                    value_buffer.push(c);
139✔
1086
                    self.iter.next();
139✔
1087
                    state = PropertyValue;
139✔
1088
                }
139✔
1089
                (_, c) => return self.error_here(PEK::UnexpectedChar(c)),
5✔
1090
            }
1091
        }
1092

1093
        if !equality {
103✔
1094
            self.inverted = !self.inverted;
5✔
1095
        }
1096

1097
        let inverted = self
103✔
1098
            .load_property_codepoints(&key_buffer, &value_buffer)
103✔
1099
            // any error that does not already have an offset should use the appropriate property offset
1100
            .map_err(|e| e.or_with_offset(property_offset))?;
8✔
1101
        if inverted {
99✔
1102
            self.inverted = !self.inverted;
3✔
1103
        }
1104

1105
        Ok(())
99✔
1106
    }
110✔
1107

1108
    // returns whether the set needs to be inverted or not
1109
    fn load_property_codepoints(&mut self, key: &str, value: &str) -> Result<bool> {
103✔
1110
        // we support:
1111
        // [:gc = value:]
1112
        // [:sc = value:]
1113
        // [:scx = value:]
1114
        // [:Grapheme_Cluster_Break = value:]
1115
        // [:Sentence_Break = value:]
1116
        // [:Word_Break = value:]
1117
        // [:value:] - looks up value in gc, sc
1118
        // [:prop:] - binary property, returns codepoints that have the property
1119
        // [:prop = truthy/falsy:] - same as above
1120

1121
        let mut inverted = false;
103✔
1122

1123
        // contains a value for the General_Category property that needs to be tried
1124
        let mut try_gc = Err(PEK::UnknownProperty.into());
103✔
1125
        // contains a value for the Script property that needs to be tried
1126
        let mut try_sc = Err(PEK::UnknownProperty.into());
103✔
1127
        // contains a value for the Script_Extensions property that needs to be tried
1128
        let mut try_scx = Err(PEK::UnknownProperty.into());
103✔
1129
        // contains a value for the Grapheme_Cluster_Break property that needs to be tried
1130
        let mut try_gcb = Err(PEK::UnknownProperty.into());
103✔
1131
        // contains a value for the Sentence_Break property that needs to be tried
1132
        let mut try_sb = Err(PEK::UnknownProperty.into());
103✔
1133
        // contains a value for the Word_Break property that needs to be tried
1134
        let mut try_wb = Err(PEK::UnknownProperty.into());
103✔
1135
        // contains a supposed binary property name that needs to be tried
1136
        let mut try_binary = Err(PEK::UnknownProperty.into());
103✔
1137
        // contains a supposed canonical combining class property name that needs to be tried
1138
        let mut try_ccc: Result<&str, ParseError> = Err(PEK::UnknownProperty.into());
103✔
1139
        // contains a supposed block property name that needs to be tried
1140
        let mut try_block: Result<&str, ParseError> = Err(PEK::UnknownProperty.into());
103✔
1141

1142
        if !value.is_empty() {
103✔
1143
            // key is gc, sc, scx, grapheme cluster break, sentence break, word break
1144
            // value is a property value
1145
            // OR
1146
            // key is a binary property and value is a truthy/falsy value
1147

1148
            match key.as_bytes() {
47✔
1149
                GeneralCategory::NAME | GeneralCategory::SHORT_NAME => try_gc = Ok(value),
34✔
1150
                GraphemeClusterBreak::NAME | GraphemeClusterBreak::SHORT_NAME => {
20✔
1151
                    try_gcb = Ok(value)
1✔
1152
                }
1153
                Script::NAME | Script::SHORT_NAME => try_sc = Ok(value),
20✔
1154
                SentenceBreak::NAME | SentenceBreak::SHORT_NAME => try_sb = Ok(value),
16✔
1155
                WordBreak::NAME | WordBreak::SHORT_NAME => try_wb = Ok(value),
15✔
1156
                CanonicalCombiningClass::NAME | CanonicalCombiningClass::SHORT_NAME => {
13✔
1157
                    try_ccc = Ok(value)
×
1158
                }
1159
                b"Script_Extensions" | b"scx" => try_scx = Ok(value),
17✔
1160
                b"Block" | b"blk" => try_block = Ok(value),
13✔
1161
                _ => {
1162
                    let normalized_value = value.to_ascii_lowercase();
14✔
1163
                    let truthy = matches!(normalized_value.as_str(), "true" | "t" | "yes" | "y");
14✔
1164
                    let falsy = matches!(normalized_value.as_str(), "false" | "f" | "no" | "n");
14✔
1165
                    // value must either match truthy or falsy
1166
                    if truthy == falsy {
14✔
1167
                        return Err(PEK::UnknownProperty.into());
×
1168
                    }
1169
                    // correctness: if we reach this point, only `try_binary` can be Ok, hence
1170
                    // it does not matter that further down we unconditionally return `inverted`,
1171
                    // because only `try_binary` can enter that code path.
1172
                    inverted = falsy;
14✔
1173
                    try_binary = Ok(key);
14✔
1174
                }
14✔
1175
            }
1176
        } else {
1177
            // key is binary property
1178
            // OR a value of gc, sc (only gc or sc are supported as implicit keys by UTS35!)
1179
            try_gc = Ok(key);
73✔
1180
            try_sc = Ok(key);
73✔
1181
            try_binary = Ok(key);
73✔
1182
        }
1183

1184
        try_gc
206✔
1185
            .and_then(|value| self.try_load_general_category_set(value))
77✔
1186
            .or_else(|_| try_sc.and_then(|value| self.try_load_script_set(value)))
71✔
1187
            .or_else(|_| try_scx.and_then(|value| self.try_load_script_extensions_set(value)))
36✔
1188
            .or_else(|_| try_binary.and_then(|value| self.try_load_ecma262_binary_set(value)))
51✔
1189
            .or_else(|_| try_gcb.and_then(|value| self.try_load_grapheme_cluster_break_set(value)))
8✔
1190
            .or_else(|_| try_sb.and_then(|value| self.try_load_sentence_break_set(value)))
7✔
1191
            .or_else(|_| try_wb.and_then(|value| self.try_load_word_break_set(value)))
6✔
1192
            .or_else(|_| try_ccc.and_then(|value| self.try_load_ccc_set(value)))
4✔
1193
            .or_else(|_| try_block.and_then(|value| self.try_load_block_set(value)))?;
8✔
1194
        Ok(inverted)
99✔
1195
    }
103✔
1196

1197
    // Consumes the builder and returns the accumulated code point builder and string set.
    // If the set was marked as inverted, the code points are complemented and all strings
    // are dropped.
    fn finalize(mut self) -> (CodePointInversionListBuilder, BTreeSet<String>) {
        if !self.inverted {
            return (self.single_set, self.string_set);
        }

        // code point inversion; removes all strings
        #[cfg(feature = "log")]
        if !self.string_set.is_empty() {
            log::info!(
                "Inverting a unicode set with strings. This removes all strings entirely."
            );
        }
        self.string_set.clear();
        self.single_set.complement();

        (self.single_set, self.string_set)
    }
635✔
1212

1213
    // parses either a raw char or an escaped char. all chars are allowed, the caller must make sure to handle
1214
    // cases where some characters are not allowed
1215
    fn parse_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
1,595✔
1216
        let (offset, c) = self.must_peek()?;
1,595✔
1217
        match c {
1,595✔
1218
            '\\' => self.parse_escaped_char(),
133✔
1219
            _ => {
1220
                self.iter.next();
1,462✔
1221
                Ok((offset, SingleOrMultiChar::Single(c)))
1,462✔
1222
            }
1223
        }
1224
    }
1,595✔
1225

1226
    // note: could turn this from the current two-pass approach into a one-pass approach
1227
    // by manually parsing the digits instead of using u32::from_str_radix.
1228
    fn parse_hex_digits_into_char(&mut self, min: usize, max: usize) -> Result<(usize, char)> {
71✔
1229
        let first_offset = self.must_peek_index()?;
71✔
1230
        let end_offset = self.validate_hex_digits(min, max)?;
71✔
1231

1232
        // validate_hex_digits ensures that chars (including the last one) are ascii hex digits,
1233
        // which are all exactly one UTF-8 byte long, so slicing on these offsets always respects char boundaries
1234
        #[allow(clippy::indexing_slicing)]
1235
        let hex_source = &self.source[first_offset..=end_offset];
68✔
1236
        let num = u32::from_str_radix(hex_source, 16).map_err(|_| PEK::Internal)?;
68✔
1237
        char::try_from(num)
68✔
1238
            .map(|c| (end_offset, c))
66✔
1239
            .map_err(|_| PEK::InvalidEscape.with_offset(end_offset))
2✔
1240
    }
71✔
1241

1242
    // validates [0-9a-fA-F]{min,max}, returns the offset of the last digit, consuming everything in the process
1243
    fn validate_hex_digits(&mut self, min: usize, max: usize) -> Result<usize> {
71✔
1244
        let mut last_offset = 0;
71✔
1245
        for count in 0..max {
292✔
1246
            let (offset, c) = self.must_peek()?;
245✔
1247
            if !c.is_ascii_hexdigit() {
245✔
1248
                if count < min {
24✔
1249
                    return Err(PEK::UnexpectedChar(c).with_offset(offset));
3✔
1250
                } else {
1251
                    break;
1252
                }
1253
            }
1254
            self.iter.next();
221✔
1255
            last_offset = offset;
221✔
1256
        }
1257
        Ok(last_offset)
68✔
1258
    }
71✔
1259

1260
    // returns the number of skipped whitespace chars
1261
    fn skip_whitespace(&mut self) -> usize {
4,186✔
1262
        let mut num = 0;
4,186✔
1263
        while let Some(c) = self.peek_char() {
4,332✔
1264
            if !self.pat_ws.contains(c) {
4,332✔
1265
                break;
1266
            }
1267
            self.iter.next();
146✔
1268
            num += 1;
146✔
1269
        }
1270
        num
4,186✔
1271
    }
4,186✔
1272

1273
    fn consume(&mut self, expected: char) -> Result<()> {
509✔
1274
        match self.must_next()? {
509✔
1275
            (offset, c) if c != expected => Err(PEK::UnexpectedChar(c).with_offset(offset)),
509✔
1276
            _ => Ok(()),
508✔
1277
        }
1278
    }
509✔
1279

1280
    // use this whenever an empty iterator would imply an Eof error
1281
    fn must_next(&mut self) -> Result<(usize, char)> {
668✔
1282
        self.iter.next().ok_or(PEK::Eof.into())
668✔
1283
    }
668✔
1284

1285
    // use this whenever an empty iterator would imply an Eof error
1286
    fn must_peek(&mut self) -> Result<(usize, char)> {
10,687✔
1287
        self.iter.peek().copied().ok_or(PEK::Eof.into())
10,687✔
1288
    }
10,687✔
1289

1290
    // must_peek, but looks two chars ahead. use sparingly
1291
    fn must_peek_double(&mut self) -> Result<(usize, char)> {
1,996✔
1292
        let mut copy = self.iter.clone();
1,996✔
1293
        copy.next();
1,996✔
1294
        copy.next().ok_or(PEK::Eof.into())
1,996✔
1295
    }
1,996✔
1296

1297
    // see must_peek
1298
    fn must_peek_char(&mut self) -> Result<char> {
2,963✔
1299
        self.must_peek().map(|(_, c)| c)
5,924✔
1300
    }
2,963✔
1301

1302
    // see must_peek
1303
    fn must_peek_index(&mut self) -> Result<usize> {
721✔
1304
        self.must_peek().map(|(idx, _)| idx)
1,440✔
1305
    }
721✔
1306

1307
    fn peek_char(&mut self) -> Option<char> {
5,049✔
1308
        self.iter.peek().map(|&(_, c)| c)
10,098✔
1309
    }
5,049✔
1310

1311
    // TODO: return Result<!> once ! is stable
1312
    #[inline]
1313
    fn error_here<T>(&mut self, kind: ParseErrorKind) -> Result<T> {
11✔
1314
        match self.iter.peek() {
11✔
1315
            None => Err(kind.into()),
×
1316
            Some(&(offset, _)) => Err(kind.with_offset(offset)),
11✔
1317
        }
1318
    }
11✔
1319

1320
    fn process_strings(&mut self, op: Operation, other_strings: BTreeSet<String>) {
209✔
1321
        match op {
209✔
1322
            Operation::Union => self.string_set.extend(other_strings),
157✔
1323
            Operation::Difference => {
1324
                self.string_set = self
27✔
1325
                    .string_set
1326
                    .difference(&other_strings)
1327
                    .cloned()
1328
                    .collect()
×
1329
            }
27✔
1330
            Operation::Intersection => {
1331
                self.string_set = self
25✔
1332
                    .string_set
1333
                    .intersection(&other_strings)
1334
                    .cloned()
1335
                    .collect()
×
1336
            }
25✔
1337
        }
1338
    }
209✔
1339

1340
    fn process_chars(&mut self, op: Operation, other_chars: CodePointInversionList) {
209✔
1341
        match op {
209✔
1342
            Operation::Union => self.single_set.add_set(&other_chars),
157✔
1343
            Operation::Difference => self.single_set.remove_set(&other_chars),
27✔
1344
            Operation::Intersection => self.single_set.retain_set(&other_chars),
25✔
1345
        }
1346
    }
209✔
1347

1348
    fn try_load_general_category_set(&mut self, name: &str) -> Result<()> {
77✔
1349
        // TODO(#3550): This could be cached; does not depend on name.
1350
        let name_map =
1351
            PropertyParser::<GeneralCategoryGroup>::try_new_unstable(self.property_provider)
77✔
1352
                .map_err(|_| PEK::Internal)?;
×
1353
        let gc_value = name_map
77✔
1354
            .as_borrowed()
1355
            .get_loose(name)
1356
            .ok_or(PEK::UnknownProperty)?;
98✔
1357
        // TODO(#3550): This could be cached; does not depend on name.
1358
        let set = CodePointMapData::<GeneralCategory>::try_new_unstable(self.property_provider)
56✔
1359
            .map_err(|_| PEK::Internal)?
×
1360
            .as_borrowed()
1361
            .get_set_for_value_group(gc_value);
56✔
1362
        self.single_set.add_set(&set.to_code_point_inversion_list());
56✔
1363
        Ok(())
56✔
1364
    }
77✔
1365

1366
    fn try_get_script(&self, name: &str) -> Result<Script> {
28✔
1367
        // TODO(#3550): This could be cached; does not depend on name.
1368
        let name_map = PropertyParser::<Script>::try_new_unstable(self.property_provider)
28✔
1369
            .map_err(|_| PEK::Internal)?;
×
1370
        name_map
28✔
1371
            .as_borrowed()
1372
            .get_loose(name)
1373
            .ok_or(PEK::UnknownProperty.into())
28✔
1374
    }
28✔
1375

1376
    fn try_load_script_set(&mut self, name: &str) -> Result<()> {
24✔
1377
        let sc_value = self.try_get_script(name)?;
24✔
1378
        // TODO(#3550): This could be cached; does not depend on name.
1379
        let property_map = CodePointMapData::<Script>::try_new_unstable(self.property_provider)
15✔
1380
            .map_err(|_| PEK::Internal)?;
×
1381
        let set = property_map.as_borrowed().get_set_for_value(sc_value);
15✔
1382
        self.single_set.add_set(&set.to_code_point_inversion_list());
15✔
1383
        Ok(())
15✔
1384
    }
24✔
1385

1386
    fn try_load_script_extensions_set(&mut self, name: &str) -> Result<()> {
4✔
1387
        // TODO(#3550): This could be cached; does not depend on name.
1388
        let scx = ScriptWithExtensions::try_new_unstable(self.property_provider)
4✔
1389
            .map_err(|_| PEK::Internal)?;
×
1390
        let sc_value = self.try_get_script(name)?;
4✔
1391
        let set = scx.as_borrowed().get_script_extensions_set(sc_value);
4✔
1392
        self.single_set.add_set(&set);
4✔
1393
        Ok(())
4✔
1394
    }
4✔
1395

1396
    fn try_load_ecma262_binary_set(&mut self, name: &str) -> Result<()> {
23✔
1397
        let set =
1398
            CodePointSetData::try_new_for_ecma262_unstable(self.property_provider, name.as_bytes())
46✔
1399
                .ok_or(PEK::UnknownProperty)?
23✔
1400
                .map_err(|_data_error| PEK::Internal)?;
×
1401
        self.single_set.add_set(&set.to_code_point_inversion_list());
21✔
1402
        Ok(())
21✔
1403
    }
23✔
1404

1405
    fn try_load_grapheme_cluster_break_set(&mut self, name: &str) -> Result<()> {
1✔
1406
        let parser =
1407
            PropertyParser::<GraphemeClusterBreak>::try_new_unstable(self.property_provider)
1✔
1408
                .map_err(|_| PEK::Internal)?;
×
1409
        let gcb_value = parser
1✔
1410
            .as_borrowed()
1411
            .get_loose(name)
1412
            .ok_or(PEK::UnknownProperty)?;
1✔
1413
        // TODO(#3550): This could be cached; does not depend on name.
1414
        let property_map =
1415
            CodePointMapData::<GraphemeClusterBreak>::try_new_unstable(self.property_provider)
1✔
1416
                .map_err(|_| PEK::Internal)?;
×
1417
        let set = property_map.as_borrowed().get_set_for_value(gcb_value);
1✔
1418
        self.single_set.add_set(&set.to_code_point_inversion_list());
1✔
1419
        Ok(())
1✔
1420
    }
1✔
1421

1422
    fn try_load_sentence_break_set(&mut self, name: &str) -> Result<()> {
1✔
1423
        let parser = PropertyParser::<SentenceBreak>::try_new_unstable(self.property_provider)
1✔
1424
            .map_err(|_| PEK::Internal)?;
×
1425
        let sb_value = parser
1✔
1426
            .as_borrowed()
1427
            .get_loose(name)
1428
            .ok_or(PEK::UnknownProperty)?;
1✔
1429
        // TODO(#3550): This could be cached; does not depend on name.
1430
        let property_map =
1431
            CodePointMapData::<SentenceBreak>::try_new_unstable(self.property_provider)
1✔
1432
                .map_err(|_| PEK::Internal)?;
×
1433
        let set = property_map.as_borrowed().get_set_for_value(sb_value);
1✔
1434
        self.single_set.add_set(&set.to_code_point_inversion_list());
1✔
1435
        Ok(())
1✔
1436
    }
1✔
1437

1438
    fn try_load_word_break_set(&mut self, name: &str) -> Result<()> {
1✔
1439
        let parser = PropertyParser::<WordBreak>::try_new_unstable(self.property_provider)
1✔
1440
            .map_err(|_| PEK::Internal)?;
×
1441
        let wb_value = parser
1✔
1442
            .as_borrowed()
1443
            .get_loose(name)
1444
            .ok_or(PEK::UnknownProperty)?;
1✔
1445
        // TODO(#3550): This could be cached; does not depend on name.
1446
        let property_map = CodePointMapData::<WordBreak>::try_new_unstable(self.property_provider)
1✔
1447
            .map_err(|_| PEK::Internal)?;
×
1448
        let set = property_map.as_borrowed().get_set_for_value(wb_value);
1✔
1449
        self.single_set.add_set(&set.to_code_point_inversion_list());
1✔
1450
        Ok(())
1✔
1451
    }
1✔
1452

1453
    fn try_load_ccc_set(&mut self, name: &str) -> Result<()> {
×
1454
        let parser =
1455
            PropertyParser::<CanonicalCombiningClass>::try_new_unstable(self.property_provider)
×
1456
                .map_err(|_| PEK::Internal)?;
×
1457
        let value = parser
×
1458
            .as_borrowed()
1459
            .get_loose(name)
1460
            // TODO: make the property parser do this
1461
            .or_else(|| {
×
1462
                name.parse()
×
1463
                    .ok()
1464
                    .map(CanonicalCombiningClass::from_icu4c_value)
1465
            })
×
1466
            .ok_or(PEK::UnknownProperty)?;
×
1467
        // TODO(#3550): This could be cached; does not depend on name.
1468
        let property_map =
1469
            CodePointMapData::<CanonicalCombiningClass>::try_new_unstable(self.property_provider)
×
1470
                .map_err(|_| PEK::Internal)?;
×
1471
        let set = property_map.as_borrowed().get_set_for_value(value);
×
1472
        self.single_set.add_set(&set.to_code_point_inversion_list());
×
1473
        Ok(())
×
1474
    }
×
1475

1476
    fn try_load_block_set(&mut self, name: &str) -> Result<()> {
×
1477
        // TODO: source these from properties
1478
        self.single_set
×
1479
            .add_range(match name.to_ascii_lowercase().as_str() {
×
1480
                "arabic" => '\u{0600}'..'\u{06FF}',
×
1481
                "thaana" => '\u{0780}'..'\u{07BF}',
×
1482
                _ => {
1483
                    #[cfg(feature = "log")]
1484
                    log::warn!("Skipping :block={name}:");
×
1485
                    return Err(PEK::Unimplemented.into());
×
1486
                }
1487
            });
×
1488
        Ok(())
×
1489
    }
×
1490
}
1491

1492
/// Parses a UnicodeSet pattern and returns a UnicodeSet in the form of a
/// [`CodePointInversionListAndStringList`], together with the number of bytes consumed
/// from the source string.
///
/// Supports UnicodeSets as described in [UTS #35 - Unicode Sets](https://unicode.org/reports/tr35/#Unicode_Sets).
///
/// The error type of the returned Result can be pretty-printed with [`ParseError::fmt_with_source`].
///
/// # Variables
///
/// If you need support for variables inside UnicodeSets (e.g., `[$start-$end]`), use [`parse_with_variables`].
///
/// # Limitations
///
/// * Currently, we only support the [ECMA-262 properties](https://tc39.es/ecma262/#table-nonbinary-unicode-properties).
///   The property names must match the exact spelling listed in ECMA-262. Note that we do support UTS35 syntax for elided `General_Category`
///   and `Script` property names, i.e., `[:Latn:]` and `[:Ll:]` are both valid, with the former implying the `Script` property, and the latter the
///   `General_Category` property.
/// * We do not support `\N{Unicode code point name}` character escaping. Use any other escape method described in UTS35.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
///
/// # Examples
///
/// Parse ranges
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let source = "[a-zA-Z0-9]";
/// let (set, consumed) = parse(source).unwrap();
/// let code_points = set.code_points();
///
/// assert!(code_points.contains_range('a'..='z'));
/// assert!(code_points.contains_range('A'..='Z'));
/// assert!(code_points.contains_range('0'..='9'));
/// assert_eq!(consumed, source.len());
/// ```
///
/// Parse properties, set operations, inner sets
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) =
///     parse("[[:^ll:]-[^][:gc = Lowercase Letter:]&[^[[^]-[a-z]]]]").unwrap();
/// assert!(set.code_points().contains_range('a'..='z'));
/// assert_eq!(('a'..='z').count(), set.size());
/// ```
///
/// Inversions remove strings
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) =
///     parse(r"[[a-z{hello\ world}]&[^a-y{hello\ world}]]").unwrap();
/// assert!(set.contains('z'));
/// assert_eq!(set.size(), 1);
/// assert!(!set.has_strings());
/// ```
///
/// Set operators (including the implicit union) have the same precedence and are left-associative
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) = parse("[[ace][bdf] - [abc][def]]").unwrap();
/// assert!(set.code_points().contains_range('d'..='f'));
/// assert_eq!(set.size(), ('d'..='f').count());
/// ```
///
/// Supports partial parses
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, consumed) = parse("[a-c][x-z]").unwrap();
/// let code_points = set.code_points();
/// assert!(code_points.contains_range('a'..='c'));
/// assert!(!code_points.contains_range('x'..='z'));
/// assert_eq!(set.size(), ('a'..='c').count());
/// // only the first UnicodeSet is parsed
/// assert_eq!(consumed, "[a-c]".len());
/// ```
#[cfg(feature = "compiled_data")]
pub fn parse(source: &str) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    // Delegate to the provider-generic entry point, supplying `icu_properties`' `Baked` provider.
    let baked_provider = &icu_properties::provider::Baked;
    parse_unstable(source, baked_provider)
}
134✔
1577

1578
/// Parses a UnicodeSet pattern with support for variables enabled.
///
/// Variables referenced in `source` are looked up in `variable_map`.
/// See [`parse`] for more information.
///
/// # Examples
///
/// ```
/// use icu::experimental::unicodeset_parse::*;
///
/// let (my_set, _) = parse("[abc]").unwrap();
///
/// let mut variable_map = VariableMap::new();
/// variable_map.insert_char("start".into(), 'a').unwrap();
/// variable_map.insert_char("end".into(), 'z').unwrap();
/// variable_map.insert_string("str".into(), "Hello World".into()).unwrap();
/// variable_map.insert_set("the_set".into(), my_set).unwrap();
///
/// // If a variable already exists, `Err` is returned, and the map is not updated.
/// variable_map.insert_char("end".into(), 'Ω').unwrap_err();
///
/// let source = "[[$start-$end]-$the_set $str]";
/// let (set, consumed) = parse_with_variables(source, &variable_map).unwrap();
/// assert_eq!(consumed, source.len());
/// assert!(set.code_points().contains_range('d'..='z'));
/// assert!(set.contains_str("Hello World"));
/// assert_eq!(set.size(), 1 + ('d'..='z').count());
/// ```
#[cfg(feature = "compiled_data")]
pub fn parse_with_variables(
    source: &str,
    variable_map: &VariableMap<'_>,
) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    // Delegate to the provider-generic entry point, supplying `icu_properties`' `Baked` provider.
    parse_unstable_with_variables(source, variable_map, &icu_properties::provider::Baked)
}
74✔
1611

1612
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, parse_with_variables)]
1613
pub fn parse_unstable_with_variables<P>(
498✔
1614
    source: &str,
1615
    variable_map: &VariableMap<'_>,
1616
    provider: &P,
1617
) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
1618
where
1619
    P: ?Sized
1620
        + DataProvider<PropertyBinaryAlphabeticV1>
1621
        + DataProvider<PropertyBinaryAsciiHexDigitV1>
1622
        + DataProvider<PropertyBinaryBidiControlV1>
1623
        + DataProvider<PropertyBinaryBidiMirroredV1>
1624
        + DataProvider<PropertyBinaryCasedV1>
1625
        + DataProvider<PropertyBinaryCaseIgnorableV1>
1626
        + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
1627
        + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
1628
        + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
1629
        + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
1630
        + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
1631
        + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
1632
        + DataProvider<PropertyBinaryDashV1>
1633
        + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
1634
        + DataProvider<PropertyBinaryDeprecatedV1>
1635
        + DataProvider<PropertyBinaryDiacriticV1>
1636
        + DataProvider<PropertyBinaryEmojiComponentV1>
1637
        + DataProvider<PropertyBinaryEmojiModifierBaseV1>
1638
        + DataProvider<PropertyBinaryEmojiModifierV1>
1639
        + DataProvider<PropertyBinaryEmojiPresentationV1>
1640
        + DataProvider<PropertyBinaryEmojiV1>
1641
        + DataProvider<PropertyBinaryExtendedPictographicV1>
1642
        + DataProvider<PropertyBinaryExtenderV1>
1643
        + DataProvider<PropertyBinaryGraphemeBaseV1>
1644
        + DataProvider<PropertyBinaryGraphemeExtendV1>
1645
        + DataProvider<PropertyBinaryHexDigitV1>
1646
        + DataProvider<PropertyBinaryIdContinueV1>
1647
        + DataProvider<PropertyBinaryIdeographicV1>
1648
        + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
1649
        + DataProvider<PropertyBinaryIdStartV1>
1650
        + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
1651
        + DataProvider<PropertyBinaryJoinControlV1>
1652
        + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
1653
        + DataProvider<PropertyBinaryLowercaseV1>
1654
        + DataProvider<PropertyBinaryMathV1>
1655
        + DataProvider<PropertyBinaryNoncharacterCodePointV1>
1656
        + DataProvider<PropertyBinaryPatternSyntaxV1>
1657
        + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
1658
        + DataProvider<PropertyBinaryQuotationMarkV1>
1659
        + DataProvider<PropertyBinaryRadicalV1>
1660
        + DataProvider<PropertyBinaryRegionalIndicatorV1>
1661
        + DataProvider<PropertyBinarySentenceTerminalV1>
1662
        + DataProvider<PropertyBinarySoftDottedV1>
1663
        + DataProvider<PropertyBinaryTerminalPunctuationV1>
1664
        + DataProvider<PropertyBinaryUnifiedIdeographV1>
1665
        + DataProvider<PropertyBinaryUppercaseV1>
1666
        + DataProvider<PropertyBinaryVariationSelectorV1>
1667
        + DataProvider<PropertyBinaryWhiteSpaceV1>
1668
        + DataProvider<PropertyBinaryXidContinueV1>
1669
        + DataProvider<PropertyBinaryXidStartV1>
1670
        + DataProvider<PropertyEnumCanonicalCombiningClassV1>
1671
        + DataProvider<PropertyEnumGeneralCategoryV1>
1672
        + DataProvider<PropertyEnumGraphemeClusterBreakV1>
1673
        + DataProvider<PropertyEnumScriptV1>
1674
        + DataProvider<PropertyEnumSentenceBreakV1>
1675
        + DataProvider<PropertyEnumWordBreakV1>
1676
        + DataProvider<PropertyNameParseCanonicalCombiningClassV1>
1677
        + DataProvider<PropertyNameParseGeneralCategoryMaskV1>
1678
        + DataProvider<PropertyNameParseGraphemeClusterBreakV1>
1679
        + DataProvider<PropertyNameParseScriptV1>
1680
        + DataProvider<PropertyNameParseSentenceBreakV1>
1681
        + DataProvider<PropertyNameParseWordBreakV1>
1682
        + DataProvider<PropertyScriptWithExtensionsV1>,
1683
{
1684
    // TODO(#3550): Add function "parse_overescaped" that uses a custom iterator to de-overescape (i.e., maps \\ to \) on-the-fly?
1685
    // ^ will likely need a different iterator type on UnicodeSetBuilder
1686

1687
    let mut iter = source.char_indices().peekable();
498✔
1688

1689
    let xid_start =
1690
        CodePointSetData::try_new_unstable::<XidStart>(provider).map_err(|_| PEK::Internal)?;
498✔
1691
    let xid_start_list = xid_start.to_code_point_inversion_list();
498✔
1692
    let xid_continue =
1693
        CodePointSetData::try_new_unstable::<XidContinue>(provider).map_err(|_| PEK::Internal)?;
495✔
1694
    let xid_continue_list = xid_continue.to_code_point_inversion_list();
497✔
1695

1696
    let pat_ws = CodePointSetData::try_new_unstable::<PatternWhiteSpace>(provider)
495✔
1697
        .map_err(|_| PEK::Internal)?;
×
1698
    let pat_ws_list = pat_ws.to_code_point_inversion_list();
497✔
1699

1700
    let mut builder = UnicodeSetBuilder::new_internal(
500✔
1701
        &mut iter,
1702
        source,
1703
        variable_map,
1704
        &xid_start_list,
1705
        &xid_continue_list,
1706
        &pat_ws_list,
1707
        provider,
1708
    );
497✔
1709

1710
    builder.parse_unicode_set()?;
995✔
1711
    let (single, string_set) = builder.finalize();
442✔
1712
    let built_single = single.build();
444✔
1713

1714
    let mut strings = string_set.into_iter().collect::<Vec<_>>();
444✔
1715
    strings.sort();
444✔
1716
    let zerovec = (&strings).into();
442✔
1717

1718
    let cpinvlistandstrlist = CodePointInversionListAndStringList::try_from(built_single, zerovec)
442✔
1719
        .map_err(|_| PEK::Internal)?;
×
1720

1721
    let parsed_bytes = match iter.peek().copied() {
442✔
1722
        None => source.len(),
163✔
1723
        Some((offset, _)) => offset,
279✔
1724
    };
1725

1726
    Ok((cpinvlistandstrlist, parsed_bytes))
442✔
1727
}
495✔
1728

1729
#[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, parse)]
1730
pub fn parse_unstable<P>(
150✔
1731
    source: &str,
1732
    provider: &P,
1733
) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
1734
where
1735
    P: ?Sized
1736
        + DataProvider<PropertyBinaryAlphabeticV1>
1737
        + DataProvider<PropertyBinaryAsciiHexDigitV1>
1738
        + DataProvider<PropertyBinaryBidiControlV1>
1739
        + DataProvider<PropertyBinaryBidiMirroredV1>
1740
        + DataProvider<PropertyBinaryCasedV1>
1741
        + DataProvider<PropertyBinaryCaseIgnorableV1>
1742
        + DataProvider<PropertyBinaryChangesWhenCasefoldedV1>
1743
        + DataProvider<PropertyBinaryChangesWhenCasemappedV1>
1744
        + DataProvider<PropertyBinaryChangesWhenLowercasedV1>
1745
        + DataProvider<PropertyBinaryChangesWhenNfkcCasefoldedV1>
1746
        + DataProvider<PropertyBinaryChangesWhenTitlecasedV1>
1747
        + DataProvider<PropertyBinaryChangesWhenUppercasedV1>
1748
        + DataProvider<PropertyBinaryDashV1>
1749
        + DataProvider<PropertyBinaryDefaultIgnorableCodePointV1>
1750
        + DataProvider<PropertyBinaryDeprecatedV1>
1751
        + DataProvider<PropertyBinaryDiacriticV1>
1752
        + DataProvider<PropertyBinaryEmojiComponentV1>
1753
        + DataProvider<PropertyBinaryEmojiModifierBaseV1>
1754
        + DataProvider<PropertyBinaryEmojiModifierV1>
1755
        + DataProvider<PropertyBinaryEmojiPresentationV1>
1756
        + DataProvider<PropertyBinaryEmojiV1>
1757
        + DataProvider<PropertyBinaryExtendedPictographicV1>
1758
        + DataProvider<PropertyBinaryExtenderV1>
1759
        + DataProvider<PropertyBinaryGraphemeBaseV1>
1760
        + DataProvider<PropertyBinaryGraphemeExtendV1>
1761
        + DataProvider<PropertyBinaryHexDigitV1>
1762
        + DataProvider<PropertyBinaryIdContinueV1>
1763
        + DataProvider<PropertyBinaryIdeographicV1>
1764
        + DataProvider<PropertyBinaryIdsBinaryOperatorV1>
1765
        + DataProvider<PropertyBinaryIdStartV1>
1766
        + DataProvider<PropertyBinaryIdsTrinaryOperatorV1>
1767
        + DataProvider<PropertyBinaryJoinControlV1>
1768
        + DataProvider<PropertyBinaryLogicalOrderExceptionV1>
1769
        + DataProvider<PropertyBinaryLowercaseV1>
1770
        + DataProvider<PropertyBinaryMathV1>
1771
        + DataProvider<PropertyBinaryNoncharacterCodePointV1>
1772
        + DataProvider<PropertyBinaryPatternSyntaxV1>
1773
        + DataProvider<PropertyBinaryPatternWhiteSpaceV1>
1774
        + DataProvider<PropertyBinaryQuotationMarkV1>
1775
        + DataProvider<PropertyBinaryRadicalV1>
1776
        + DataProvider<PropertyBinaryRegionalIndicatorV1>
1777
        + DataProvider<PropertyBinarySentenceTerminalV1>
1778
        + DataProvider<PropertyBinarySoftDottedV1>
1779
        + DataProvider<PropertyBinaryTerminalPunctuationV1>
1780
        + DataProvider<PropertyBinaryUnifiedIdeographV1>
1781
        + DataProvider<PropertyBinaryUppercaseV1>
1782
        + DataProvider<PropertyBinaryVariationSelectorV1>
1783
        + DataProvider<PropertyBinaryWhiteSpaceV1>
1784
        + DataProvider<PropertyBinaryXidContinueV1>
1785
        + DataProvider<PropertyBinaryXidStartV1>
1786
        + DataProvider<PropertyEnumCanonicalCombiningClassV1>
1787
        + DataProvider<PropertyEnumGeneralCategoryV1>
1788
        + DataProvider<PropertyEnumGraphemeClusterBreakV1>
1789
        + DataProvider<PropertyEnumScriptV1>
1790
        + DataProvider<PropertyEnumSentenceBreakV1>
1791
        + DataProvider<PropertyEnumWordBreakV1>
1792
        + DataProvider<PropertyNameParseCanonicalCombiningClassV1>
1793
        + DataProvider<PropertyNameParseGeneralCategoryMaskV1>
1794
        + DataProvider<PropertyNameParseGraphemeClusterBreakV1>
1795
        + DataProvider<PropertyNameParseScriptV1>
1796
        + DataProvider<PropertyNameParseSentenceBreakV1>
1797
        + DataProvider<PropertyNameParseWordBreakV1>
1798
        + DataProvider<PropertyScriptWithExtensionsV1>,
1799
{
1800
    let dummy = Default::default();
150✔
1801
    parse_unstable_with_variables(source, &dummy, provider)
150✔
1802
}
150✔
1803

1804
#[cfg(test)]
1805
mod tests {
1806
    use core::ops::RangeInclusive;
1807
    use std::collections::HashSet;
1808

1809
    use super::*;
1810

1811
    // "aabxzz" => [a..=a, b..=x, z..=z]
1812
    fn range_iter_from_str(s: &str) -> impl Iterator<Item = RangeInclusive<u32>> {
        // Each range needs a start and an end, so the input must hold an even
        // number of code points.
        debug_assert_eq!(
            s.chars().count() % 2,
            0,
            "string \"{}\" does not contain an even number of code points",
            s.escape_debug()
        );
        // Consecutive pairs of code points become the bounds of one inclusive range;
        // a trailing unpaired code point is ignored.
        let code_points: Vec<u32> = s.chars().map(u32::from).collect();
        let ranges: Vec<RangeInclusive<u32>> = code_points
            .chunks_exact(2)
            .map(|bounds| bounds[0]..=bounds[1])
            .collect();

        ranges.into_iter()
    }
139✔
1834

1835
    fn assert_set_equality<'a>(
139✔
1836
        source: &str,
1837
        cpinvlistandstrlist: &CodePointInversionListAndStringList,
1838
        single: impl Iterator<Item = RangeInclusive<u32>>,
1839
        strings: impl Iterator<Item = &'a str>,
1840
    ) {
1841
        let expected_ranges: HashSet<_> = single.collect();
139✔
1842
        let actual_ranges: HashSet<_> = cpinvlistandstrlist.code_points().iter_ranges().collect();
139✔
1843
        assert_eq!(
139✔
1844
            actual_ranges,
1845
            expected_ranges,
1846
            "got unexpected ranges {:?}, expected {:?} for parsed set \"{}\"",
1847
            actual_ranges,
1848
            expected_ranges,
1849
            source.escape_debug()
1850
        );
1851
        let mut expected_size = cpinvlistandstrlist.code_points().size();
139✔
1852
        for s in strings {
160✔
1853
            expected_size += 1;
21✔
1854
            assert!(
×
1855
                cpinvlistandstrlist.contains_str(s),
21✔
1856
                "missing string \"{}\" from parsed set \"{}\"",
1857
                s.escape_debug(),
×
1858
                source.escape_debug()
×
1859
            );
1860
        }
139✔
1861
        let actual_size = cpinvlistandstrlist.size();
139✔
1862
        assert_eq!(
139✔
1863
            actual_size,
1864
            expected_size,
1865
            "got unexpected size {}, expected {} for parsed set \"{}\"",
1866
            actual_size,
1867
            expected_size,
1868
            source.escape_debug()
1869
        );
1870
    }
139✔
1871

1872
    fn assert_is_error_and_message_eq(source: &str, expected_err: &str, vm: &VariableMap<'_>) {
46✔
1873
        let result = parse_with_variables(source, vm);
46✔
1874
        assert!(result.is_err(), "{source} does not cause an error!");
46✔
1875
        let err = result.unwrap_err();
46✔
1876
        assert_eq!(err.fmt_with_source(source).to_string(), expected_err);
92✔
1877
    }
46✔
1878

1879
    #[test]
    fn test_semantics_with_variables() {
        // Two single-character variables.
        let mut vars_chars = VariableMap::default();
        vars_chars.insert_char("a".to_string(), 'a').unwrap();
        vars_chars.insert_char("var2".to_string(), 'z').unwrap();

        // A variable whose value is the range/difference operator character.
        let mut vars_hyphen = VariableMap::default();
        vars_hyphen.insert_char("hehe".to_string(), '-').unwrap();

        // One character variable and one string variable.
        let mut vars_string = VariableMap::default();
        vars_string.insert_char("a".to_string(), 'a').unwrap();
        vars_string
            .insert_string("var2".to_string(), "abc".to_string())
            .unwrap();

        // One character variable and one set variable.
        let (inner_set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
        let mut vars_set = VariableMap::default();
        vars_set.insert_char("a".to_string(), 'a').unwrap();
        vars_set.insert_set("set".to_string(), inner_set).unwrap();

        // (variables, pattern, expected ranges as pairs of code points, expected strings)
        let cases: Vec<(_, _, _, Vec<&str>)> = vec![
            // simple
            (&vars_chars, "[$a]", "aa", vec![]),
            (&vars_chars, "[ $a ]", "aa", vec![]),
            (&vars_chars, "[$a$]", "aa\u{ffff}\u{ffff}", vec![]),
            (&vars_chars, "[$a$ ]", "aa\u{ffff}\u{ffff}", vec![]),
            (&vars_chars, "[$a$var2]", "aazz", vec![]),
            (&vars_chars, "[$a - $var2]", "az", vec![]),
            (&vars_chars, "[$a-$var2]", "az", vec![]),
            (&vars_hyphen, "[a $hehe z]", "aazz--", vec![]),
            (
                &vars_chars,
                "[[$]var2]",
                "\u{ffff}\u{ffff}vvaarr22",
                vec![],
            ),
            // variable prefix escaping
            (&vars_chars, r"[\$var2]", "$$vvaarr22", vec![]),
            (&vars_chars, r"[\\$var2]", r"\\zz", vec![]),
            // no variable dereferencing in strings
            (&vars_chars, "[{$a}]", "", vec!["$a"]),
            // set operations
            (&vars_set, "[$set & [b-z]]", "bz", vec![]),
            (&vars_set, "[[a-z]-[b-z]]", "aa", vec![]),
            (&vars_set, "[$set-[b-z]]", "aa", vec!["Hello, World!"]),
            (&vars_set, "[$set-$set]", "", vec![]),
            (&vars_set, "[[a-zA]-$set]", "AA", vec![]),
            (&vars_set, "[$set[b-z]]", "az", vec!["Hello, World!"]),
            (&vars_set, "[[a-a]$set]", "az", vec!["Hello, World!"]),
            (&vars_set, "$set", "az", vec!["Hello, World!"]),
            // strings
            (&vars_string, "[$var2]", "", vec!["abc"]),
        ];
        for (variable_map, source, single, strings) in cases {
            let (set, consumed) = match parse_with_variables(source, variable_map) {
                Ok(parsed) => parsed,
                Err(err) => panic!(
                    "{source} results in an error: {}",
                    err.fmt_with_source(source)
                ),
            };
            assert_eq!(consumed, source.len(), "{source:?} is not fully consumed");
            assert_set_equality(
                source,
                &set,
                range_iter_from_str(single),
                strings.into_iter(),
            );
        }
    }
2✔
1950

1951
    #[test]
1952
    fn test_semantics() {
2✔
1953
        const ALL_CHARS: &str = "\x00\u{10FFFF}";
1954
        let cases: Vec<(_, _, Vec<&str>)> = vec![
2✔
1955
            // simple
1956
            ("[a]", "aa", vec![]),
1✔
1957
            ("[]", "", vec![]),
1✔
1958
            ("[qax]", "aaqqxx", vec![]),
1✔
1959
            ("[a-z]", "az", vec![]),
1✔
1960
            ("[--]", "--", vec![]),
1✔
1961
            ("[a-b-]", "ab--", vec![]),
1✔
1962
            ("[[a-b]-]", "ab--", vec![]),
1✔
1963
            ("[{ab}-]", "--", vec!["ab"]),
1✔
1964
            ("[-a-b]", "ab--", vec![]),
1✔
1965
            ("[-a]", "--aa", vec![]),
1✔
1966
            // whitespace escaping
1967
            (r"[\n]", "\n\n", vec![]),
1✔
1968
            ("[\\\n]", "\n\n", vec![]),
1✔
1969
            // empty - whitespace is skipped
1970
            ("[\n]", "", vec![]),
1✔
1971
            ("[\u{9}]", "", vec![]),
1✔
1972
            ("[\u{A}]", "", vec![]),
1✔
1973
            ("[\u{B}]", "", vec![]),
1✔
1974
            ("[\u{C}]", "", vec![]),
1✔
1975
            ("[\u{D}]", "", vec![]),
1✔
1976
            ("[\u{20}]", "", vec![]),
1✔
1977
            ("[\u{85}]", "", vec![]),
1✔
1978
            ("[\u{200E}]", "", vec![]),
1✔
1979
            ("[\u{200F}]", "", vec![]),
1✔
1980
            ("[\u{2028}]", "", vec![]),
1✔
1981
            ("[\u{2029}]", "", vec![]),
1✔
1982
            // whitespace significance:
1983
            ("[^[^$]]", "\u{ffff}\u{ffff}", vec![]),
1✔
1984
            ("[^[^ $]]", "\u{ffff}\u{ffff}", vec![]),
1✔
1985
            ("[^[^ $ ]]", "\u{ffff}\u{ffff}", vec![]),
1✔
1986
            ("[^[^a$]]", "aa\u{ffff}\u{ffff}", vec![]),
1✔
1987
            ("[^[^a$ ]]", "aa\u{ffff}\u{ffff}", vec![]),
1✔
1988
            ("[-]", "--", vec![]),
1✔
1989
            ("[  -  ]", "--", vec![]),
1✔
1990
            ("[  - -  ]", "--", vec![]),
1✔
1991
            ("[ a-b -  ]", "ab--", vec![]),
1✔
1992
            ("[ -a]", "--aa", vec![]),
1✔
1993
            ("[a-]", "--aa", vec![]),
1✔
1994
            ("[a- ]", "--aa", vec![]),
1✔
1995
            ("[ :]", "::", vec![]),
1✔
1996
            ("[ :L:]", "::LL", vec![]),
1✔
1997
            // but not all "whitespace", only Pattern_White_Space:
1998
            ("[\u{A0}]", "\u{A0}\u{A0}", vec![]), // non-breaking space
1✔
1999
            // anchor
2000
            ("[$]", "\u{ffff}\u{ffff}", vec![]),
1✔
2001
            (r"[\$]", "$$", vec![]),
1✔
2002
            ("[{$}]", "$$", vec![]),
1✔
2003
            // set operations
2004
            ("[[a-z]&[b-z]]", "bz", vec![]),
1✔
2005
            ("[[a-z]-[b-z]]", "aa", vec![]),
1✔
2006
            ("[[a-z][b-z]]", "az", vec![]),
1✔
2007
            ("[[a-a][b-z]]", "az", vec![]),
1✔
2008
            ("[[a-z{abc}]&[b-z{abc}{abx}]]", "bz", vec!["abc"]),
1✔
2009
            ("[[{abx}a-z{abc}]&[b-z{abc}]]", "bz", vec!["abc"]),
1✔
2010
            ("[[a-z{abx}]-[{abx}b-z{abc}]]", "aa", vec![]),
1✔
2011
            ("[[a-z{abx}{abc}]-[{abx}b-z]]", "aa", vec!["abc"]),
1✔
2012
            ("[[a-z{abc}][b-z{abx}]]", "az", vec!["abc", "abx"]),
1✔
2013
            // strings
2014
            ("[{this is a minus -}]", "", vec!["thisisaminus-"]),
1✔
2015
            // associativity
2016
            ("[[a-a][b-z] - [a-d][e-z]]", "ez", vec![]),
1✔
2017
            ("[[a-a][b-z] - [a-d]&[e-z]]", "ez", vec![]),
1✔
2018
            ("[[a-a][b-z] - [a-z][]]", "", vec![]),
1✔
2019
            ("[[a-a][b-z] - [a-z]&[]]", "", vec![]),
1✔
2020
            ("[[a-a][b-z] & [a-z]-[]]", "az", vec![]),
1✔
2021
            ("[[a-a][b-z] & []-[a-z]]", "", vec![]),
1✔
2022
            ("[[a-a][b-z] & [a-b][x-z]]", "abxz", vec![]),
1✔
2023
            ("[[a-z]-[a-b]-[y-z]]", "cx", vec![]),
1✔
2024
            // escape tests
2025
            (r"[\x61-\x63]", "ac", vec![]),
1✔
2026
            (r"[a-\x63]", "ac", vec![]),
1✔
2027
            (r"[\x61-c]", "ac", vec![]),
1✔
2028
            (r"[\u0061-\x63]", "ac", vec![]),
1✔
2029
            (r"[\U00000061-\x63]", "ac", vec![]),
1✔
2030
            (r"[\x{61}-\x63]", "ac", vec![]),
1✔
2031
            (r"[\u{61}-\x63]", "ac", vec![]),
1✔
2032
            (r"[\u{61}{hello\ world}]", "aa", vec!["hello world"]),
1✔
2033
            (r"[{hello\ world}\u{61}]", "aa", vec!["hello world"]),
1✔
2034
            (r"[{h\u{65}llo\ world}]", "", vec!["hello world"]),
1✔
2035
            // complement tests
2036
            (r"[^]", ALL_CHARS, vec![]),
1✔
2037
            (r"[[^]-[^a-z]]", "az", vec![]),
1✔
2038
            (r"[^{h\u{65}llo\ world}]", ALL_CHARS, vec![]),
1✔
2039
            (
1✔
2040
                r"[^[{h\u{65}llo\ world}]-[{hello\ world}]]",
2041
                ALL_CHARS,
2042
                vec![],
1✔
2043
            ),
2044
            (
1✔
2045
                r"[^[\x00-\U0010FFFF]-[\u0100-\U0010FFFF]]",
2046
                "\u{100}\u{10FFFF}",
2047
                vec![],
1✔
2048
            ),
2049
            (r"[^[^a-z]]", "az", vec![]),
1✔
2050
            (r"[^[^\^]]", "^^", vec![]),
1✔
2051
            (r"[{\x{61 0062   063}}]", "", vec!["abc"]),
1✔
2052
            (r"[\x{61 0062   063}]", "ac", vec![]),
1✔
2053
            // binary properties
2054
            (r"[:AHex:]", "09afAF", vec![]),
1✔
2055
            (r"[:AHex=True:]", "09afAF", vec![]),
1✔
2056
            (r"[:AHex=T:]", "09afAF", vec![]),
1✔
2057
            (r"[:AHex=Yes:]", "09afAF", vec![]),
1✔
2058
            (r"[:AHex=Y:]", "09afAF", vec![]),
1✔
2059
            (r"[:^AHex≠True:]", "09afAF", vec![]),
1✔
2060
            (r"[:AHex≠False:]", "09afAF", vec![]),
1✔
2061
            (r"[[:^AHex≠False:]&[\x00-\x10]]", "\0\x10", vec![]),
1✔
2062
            (r"\p{AHex}", "09afAF", vec![]),
1✔
2063
            (r"\p{AHex=True}", "09afAF", vec![]),
1✔
2064
            (r"\p{AHex=T}", "09afAF", vec![]),
1✔
2065
            (r"\p{AHex=Yes}", "09afAF", vec![]),
1✔
2066
            (r"\p{AHex=Y}", "09afAF", vec![]),
1✔
2067
            (r"\P{AHex≠True}", "09afAF", vec![]),
1✔
2068
            (r"\p{AHex≠False}", "09afAF", vec![]),
1✔
2069
            // general category
2070
            (r"[[:gc=lower-case-letter:]&[a-zA-Z]]", "az", vec![]),
1✔
2071
            (r"[[:lower case letter:]&[a-zA-Z]]", "az", vec![]),
1✔
2072
            // general category groups
2073
            // equivalence between L and the union of all the L* categories
2074
            (
1✔
2075
                r"[[[:L:]-[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]][[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]-[:L:]]]",
2076
                "",
2077
                vec![],
1✔
2078
            ),
2079
            // script
2080
            (r"[[:sc=latn:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2081
            (r"[[:sc=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2082
            (r"[[:Latin:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2083
            (r"[[:latn:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2084
            // script extensions
2085
            (r"[[:scx=latn:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2086
            (r"[[:scx=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2087
            (r"[[:scx=Hira:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
1✔
2088
            (r"[[:sc=Hira:]&[\u30FC]]", "", vec![]),
1✔
2089
            (r"[[:scx=Kana:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
1✔
2090
            (r"[[:sc=Kana:]&[\u30FC]]", "", vec![]),
1✔
2091
            (r"[[:sc=Common:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
1✔
2092
            // grapheme cluster break
2093
            (
1✔
2094
                r"\p{Grapheme_Cluster_Break=ZWJ}",
2095
                "\u{200D}\u{200D}",
2096
                vec![],
1✔
2097
            ),
2098
            // sentence break
2099
            (
1✔
2100
                r"\p{Sentence_Break=ATerm}",
2101
                "\u{002E}\u{002E}\u{2024}\u{2024}\u{FE52}\u{FE52}\u{FF0E}\u{FF0E}",
2102
                vec![],
1✔
2103
            ),
2104
            // word break
2105
            (r"\p{Word_Break=Single_Quote}", "\u{0027}\u{0027}", vec![]),
1✔
2106
            // more syntax edge cases from UTS35 directly
2107
            (r"[\^a]", "^^aa", vec![]),
1✔
2108
            (r"[{{}]", "{{", vec![]),
1✔
2109
            (r"[{}}]", "}}", vec![""]),
1✔
2110
            (r"[}]", "}}", vec![]),
1✔
2111
            (r"[{$var}]", "", vec!["$var"]),
1✔
2112
            (r"[{[a-z}]", "", vec!["[a-z"]),
1✔
2113
            (r"[ { [ a - z } ]", "", vec!["[a-z"]),
1✔
2114
            // TODO(#3556): Add more tests (specifically conformance tests if they exist)
2115
        ];
2116
        for (source, single, strings) in cases {
119✔
2117
            let parsed = parse(source);
118✔
2118
            if let Err(err) = parsed {
118✔
2119
                panic!(
×
2120
                    "{source} results in an error: {}",
2121
                    err.fmt_with_source(source)
×
2122
                );
2123
            }
2124
            let (set, consumed) = parsed.unwrap();
118✔
2125
            assert_eq!(consumed, source.len());
118✔
2126
            assert_set_equality(
118✔
2127
                source,
2128
                &set,
2129
                range_iter_from_str(single),
118✔
2130
                strings.into_iter(),
118✔
2131
            );
118✔
2132
        }
119✔
2133
    }
2✔
2134

2135
    /// Checks the rendered error messages for sources that reference variables.
    ///
    /// Three variable maps are prepared (two `char` variables; a `char` plus a string
    /// variable; a `char` plus a set variable). Every source listed for a map must be
    /// rejected with exactly the message next to it.
    #[test]
    fn test_error_messages_with_variables() {
        // `a` and `var2` are both bound to single characters.
        let mut chars_only = VariableMap::default();
        chars_only.insert_char("a".to_string(), 'a').unwrap();
        chars_only.insert_char("var2".to_string(), 'z').unwrap();

        // `a` is bound to a character, `var2` to a multi-character string.
        let mut char_and_string = VariableMap::default();
        char_and_string.insert_char("a".to_string(), 'a').unwrap();
        char_and_string
            .insert_string("var2".to_string(), "abc".to_string())
            .unwrap();

        // `a` is bound to a character, `set` to an already parsed set.
        let (parsed_set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
        let mut char_and_set = VariableMap::default();
        char_and_set.insert_char("a".to_string(), 'a').unwrap();
        char_and_set
            .insert_set("set".to_string(), parsed_set)
            .unwrap();

        // Sources checked against the map that only holds `char` variables.
        for (source, expected_err) in [
            ("[$$a]", r"[$$a← error: unexpected variable"),
            ("[$ a]", r"[$ a← error: unexpected character 'a'"),
            ("$a", r"$a← error: unexpected variable"),
            ("$", r"$← error: unexpected end of input"),
        ] {
            assert_is_error_and_message_eq(source, expected_err, &chars_only);
        }

        // Sources checked against the map holding a string variable.
        for (source, expected_err) in [
            ("[$var2-$a]", r"[$var2-$a← error: unexpected variable"),
            ("[$a-$var2]", r"[$a-$var2← error: unexpected variable"),
        ] {
            assert_is_error_and_message_eq(source, expected_err, &char_and_string);
        }

        // Sources checked against the map holding a set variable.
        for (source, expected_err) in [
            ("[$a-$set]", r"[$a-$set← error: unexpected variable"),
            ("[$set-$a]", r"[$set-$a← error: unexpected variable"),
            ("[$=]", "[$=← error: unexpected character '='"),
        ] {
            assert_is_error_and_message_eq(source, expected_err, &char_and_set);
        }
    }
2✔
2191

2192
    /// Checks the rendered error message for a table of invalid sources, parsed with an
    /// empty variable map.
    ///
    /// Each entry is `(source, expected message)`; the expected message shows the source up
    /// to the reported position, followed by `← error: …`.
    #[test]
    fn test_error_messages() {
        let invalid_sources = [
            (r"[a-z[\]]", r"[a-z[\]]← error: unexpected end of input"),
            (r"", r"← error: unexpected end of input"),
            (r"[{]", r"[{]← error: unexpected end of input"),
            // we match ECMA-262 strictly, so case matters
            (
                r"[:general_category:]",
                r"[:general_category← error: unknown property",
            ),
            (r"[:ll=true:]", r"[:ll=true← error: unknown property"),
            (r"[:=", r"[:=← error: unexpected character '='"),
            // property names may not be empty
            (r"[::]", r"[::← error: unexpected character ':'"),
            (r"[:=hello:]", r"[:=← error: unexpected character '='"),
            // property values may not be empty
            (r"[:gc=:]", r"[:gc=:← error: unexpected character ':'"),
            (r"[\xag]", r"[\xag← error: unexpected character 'g'"),
            (r"[a-b-z]", r"[a-b-z← error: unexpected character 'z'"),
            // TODO(#3558): Might be better as "[a-\p← error: unexpected character 'p'"?
            (r"[a-\p{ll}]", r"[a-\← error: unexpected character '\\'"),
            (r"[a-&]", r"[a-&← error: unexpected character '&'"),
            (r"[a&b]", r"[a&← error: unexpected character '&'"),
            (r"[[set]&b]", r"[[set]&b← error: unexpected character 'b'"),
            (r"[[set]&]", r"[[set]&]← error: unexpected character ']'"),
            (r"[a-\x60]", r"[a-\x60← error: unexpected character '`'"),
            (r"[a-`]", r"[a-`← error: unexpected character '`'"),
            (r"[\x{6g}]", r"[\x{6g← error: unexpected character 'g'"),
            (r"[\x{g}]", r"[\x{g← error: unexpected character 'g'"),
            (r"[\x{}]", r"[\x{}← error: unexpected character '}'"),
            (
                r"[\x{dabeef}]",
                r"[\x{dabeef← error: invalid escape sequence",
            ),
            (
                r"[\x{10ffff0}]",
                r"[\x{10ffff0← error: unexpected character '0'",
            ),
            (
                r"[\x{11ffff}]",
                r"[\x{11ffff← error: invalid escape sequence",
            ),
            (
                r"[\x{10ffff 1 10ffff0}]",
                r"[\x{10ffff 1 10ffff0← error: unexpected character '0'",
            ),
            // > 1 byte in UTF-8 edge case
            (r"ä", r"ä← error: unexpected character 'ä'"),
            // NOTE(review): the next two entries look identical here; presumably they differ
            // in Unicode normalization form (precomposed vs. combining) — confirm.
            (r"\p{gc=ä}", r"\p{gc=ä← error: unknown property"),
            (r"\p{gc=ä}", r"\p{gc=ä← error: unknown property"),
            (
                r"[\xe5-\xe4]",
                r"[\xe5-\xe4← error: unexpected character 'ä'",
            ),
            (r"[\xe5-ä]", r"[\xe5-ä← error: unexpected character 'ä'"),
            // whitespace significance
            (r"[ ^]", r"[ ^← error: unexpected character '^'"),
            (r"[:]", r"[:]← error: unexpected character ']'"),
            (r"[:L]", r"[:L]← error: unexpected character ']'"),
            (r"\p {L}", r"\p ← error: unexpected character ' '"),
            // multi-escapes are not allowed in ranges
            (
                r"[\x{61 62}-d]",
                r"[\x{61 62}-d← error: unexpected character 'd'",
            ),
            (
                r"[\x{61 63}-\x{62 64}]",
                r"[\x{61 63}-\← error: unexpected character '\\'",
            ),
            // TODO(#3558): This is a bad error message.
            (r"[a-\x{62 64}]", r"[a-\← error: unexpected character '\\'"),
        ];

        // No variables are defined for any of these sources.
        let no_variables = VariableMap::default();
        for &(source, expected_err) in invalid_sources.iter() {
            assert_is_error_and_message_eq(source, expected_err, &no_variables);
        }
    }
2✔
2270

2271
    /// Checks the consumed length reported by `parse` and `parse_with_variables` when the
    /// source carries extra text after the leading set.
    #[test]
    fn test_consumed() {
        // Each entry is `(expected consumed length, source)`. The expected value is written
        // as the length of a literal; only that literal's length is used, so it does not
        // always spell the same characters as the start of the source.
        let sources_with_trailing_text = [
            (r"[a-z\]{[}]".len(), r"[a-z\]{[}][]"),
            (r"[a-z\]{[}]".len(), r"[a-z\]{[}] []"),
            (r"[a-z\]{[}]".len(), r"[a-z\]{]}] []"),
            (r"[a-z\]{{[}]".len(), r"[a-z\]{{]}] []"),
            (r"[a-z\]{[}]".len(), r"[a-z\]{]}]\p{L}"),
            (r"[a-z\]{[}]".len(), r"[a-z\]{]}]$var"),
        ];

        // Empty variable map for the `parse_with_variables` half of each check.
        let no_variables = Default::default();
        for (expected_consumed, source) in sources_with_trailing_text {
            // Plain entry point first ...
            let consumed_plain = parse(source).unwrap().1;
            assert_eq!(expected_consumed, consumed_plain);

            // ... then the variable-aware entry point must report the same length.
            let consumed_with_variables = parse_with_variables(source, &no_variables).unwrap().1;
            assert_eq!(expected_consumed, consumed_with_variables);
        }
    }
2✔
2290
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc