• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 11904027177

19 Nov 2024 12:33AM UTC coverage: 75.477% (+0.3%) from 75.174%
11904027177

push

github

web-flow
Move DateTimePattern into pattern module (#5834)

#1317

Also removes `NeoNeverMarker` and fixes #5689

258 of 319 new or added lines in 6 files covered. (80.88%)

6967 existing lines in 278 files now uncovered.

54522 of 72237 relevant lines covered (75.48%)

655305.49 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.64
/components/experimental/src/unicodeset_parse/parse.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use alloc::borrow::Cow;
6
use alloc::collections::{BTreeMap, BTreeSet};
7
use alloc::fmt::Display;
8
use alloc::format;
9
use alloc::string::{String, ToString};
10
use alloc::vec::Vec;
11
use core::{iter::Peekable, str::CharIndices};
12

13
use icu_collections::{
14
    codepointinvlist::{CodePointInversionList, CodePointInversionListBuilder},
15
    codepointinvliststringlist::CodePointInversionListAndStringList,
16
};
17
use icu_properties::script::ScriptWithExtensions;
18
use icu_properties::{
19
    props::{
20
        CanonicalCombiningClass, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup,
21
        GraphemeClusterBreak, Script, SentenceBreak, WordBreak,
22
    },
23
    CodePointMapData,
24
};
25
use icu_properties::{
26
    props::{PatternWhiteSpace, XidContinue, XidStart},
27
    CodePointSetData,
28
};
29
use icu_properties::{provider::*, PropertyParser};
30
use icu_provider::prelude::*;
31

32
/// The kind of error that occurred.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ParseErrorKind {
    /// An unexpected character was encountered. This variant implies the other variants
    /// (notably `UnknownProperty` and `Unimplemented`) do not apply.
    UnexpectedChar(char),
    /// The property name or value is unknown. For property names, make sure you use the spelling
    /// defined in [ECMA-262](https://tc39.es/ecma262/#table-nonbinary-unicode-properties).
    UnknownProperty,
    /// A reference to an unknown variable.
    UnknownVariable,
    /// A variable of a certain type occurring in an unexpected context.
    UnexpectedVariable,
    /// The source is an incomplete unicode set.
    Eof,
    /// Something unexpected went wrong with our code. Please file a bug report on GitHub.
    Internal,
    /// The provided syntax is not supported by us. Note that unknown properties will return the
    /// `UnknownProperty` variant, not this one.
    Unimplemented,
    /// The provided escape sequence is not a valid Unicode code point or represents too many
    /// code points.
    InvalidEscape,
}
57
use zerovec::VarZeroVec;
58
use ParseErrorKind as PEK;
59

60
impl ParseErrorKind {
61
    fn with_offset(self, offset: usize) -> ParseError {
302✔
62
        ParseError {
302✔
63
            offset: Some(offset),
302✔
64
            kind: self,
65
        }
66
    }
302✔
67
}
68

69
impl From<ParseErrorKind> for ParseError {
70
    fn from(kind: ParseErrorKind) -> Self {
14,326✔
71
        ParseError { offset: None, kind }
14,326✔
72
    }
14,326✔
73
}
74

75
/// The error type returned by the `parse` functions in this crate.
76
///
77
/// See [`ParseError::fmt_with_source`] for pretty-printing and [`ParseErrorKind`] of the
78
/// different types of errors represented by this struct.
UNCOV
79
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
×
80
pub struct ParseError {
81
    // offset is the index to an arbitrary byte in the last character in the source that makes sense
82
    // to display as location for the error, e.g., the unexpected character itself or
83
    // for an unknown property name the last character of the name.
UNCOV
84
    offset: Option<usize>,
×
UNCOV
85
    kind: ParseErrorKind,
×
86
}
87

88
type Result<T, E = ParseError> = core::result::Result<T, E>;
89

90
impl ParseError {
91
    /// Pretty-prints this error and if applicable, shows where the error occurred in the source.
92
    ///
93
    /// Must be called with the same source that was used to parse the set.
94
    ///
95
    /// # Examples
96
    ///
97
    /// ```
98
    /// use icu::experimental::unicodeset_parse::*;
99
    ///
100
    /// let source = "[[abc]-x]";
101
    /// let set = parse(source);
102
    /// assert!(set.is_err());
103
    /// let err = set.unwrap_err();
104
    /// assert_eq!(
105
    ///     err.fmt_with_source(source).to_string(),
106
    ///     "[[abc]-x← error: unexpected character 'x'"
107
    /// );
108
    /// ```
109
    ///
110
    /// ```
111
    /// use icu::experimental::unicodeset_parse::*;
112
    ///
113
    /// let source = r"[\N{LATIN CAPITAL LETTER A}]";
114
    /// let set = parse(source);
115
    /// assert!(set.is_err());
116
    /// let err = set.unwrap_err();
117
    /// assert_eq!(
118
    ///     err.fmt_with_source(source).to_string(),
119
    ///     r"[\N← error: unimplemented"
120
    /// );
121
    /// ```
122
    pub fn fmt_with_source(&self, source: &str) -> impl Display {
48✔
123
        let ParseError { offset, kind } = *self;
48✔
124

125
        if kind == ParseErrorKind::Eof {
48✔
126
            return format!("{source}← error: unexpected end of input");
4✔
127
        }
128
        let mut s = String::new();
44✔
129
        if let Some(offset) = offset {
44✔
130
            if offset < source.len() {
44✔
131
                // offset points to any byte of the last character we want to display.
132
                // in the case of ASCII, this is easy - we just display bytes [..=offset].
133
                // however, if the last character is more than one byte in UTF-8
134
                // we cannot use ..=offset, because that would potentially include only partial
135
                // bytes of last character in our string. hence we must find the start of the
136
                // following character and use that as the (exclusive) end of our string.
137

138
                // offset points into the last character we want to include, hence the start of the
139
                // first character we want to exclude is at least offset + 1.
140
                let mut exclusive_end = offset + 1;
44✔
141
                // TODO: replace this loop with str::ceil_char_boundary once stable
142
                for _ in 0..3 {
46✔
143
                    // is_char_boundary returns true at the latest once exclusive_end == source.len()
144
                    if source.is_char_boundary(exclusive_end) {
46✔
145
                        break;
146
                    }
147
                    exclusive_end += 1;
2✔
148
                }
149

150
                // exclusive_end is at most source.len() due to str::is_char_boundary and at least 0 by type
151
                #[allow(clippy::indexing_slicing)]
152
                s.push_str(&source[..exclusive_end]);
44✔
153
                s.push_str("← ");
44✔
154
            }
155
        }
156
        s.push_str("error: ");
44✔
157
        match kind {
44✔
158
            ParseErrorKind::UnexpectedChar(c) => {
31✔
159
                s.push_str(&format!("unexpected character '{}'", c.escape_debug()));
31✔
160
            }
161
            ParseErrorKind::UnknownProperty => {
162
                s.push_str("unknown property");
4✔
163
            }
164
            ParseErrorKind::UnknownVariable => {
UNCOV
165
                s.push_str("unknown variable");
×
166
            }
167
            ParseErrorKind::UnexpectedVariable => {
168
                s.push_str("unexpected variable");
6✔
169
            }
170
            ParseErrorKind::Eof => {
171
                s.push_str("unexpected end of input");
×
172
            }
173
            ParseErrorKind::Internal => {
UNCOV
174
                s.push_str("internal error");
×
175
            }
176
            ParseErrorKind::Unimplemented => {
177
                s.push_str("unimplemented");
1✔
178
            }
179
            ParseErrorKind::InvalidEscape => {
180
                s.push_str("invalid escape sequence");
2✔
181
            }
182
        }
183

184
        s
44✔
185
    }
48✔
186

187
    /// Returns the [`ParseErrorKind`] of this error.
UNCOV
188
    pub fn kind(&self) -> ParseErrorKind {
×
UNCOV
189
        self.kind
×
190
    }
×
191

192
    /// Returns the offset of this error in the source string, if it was specified.
UNCOV
193
    pub fn offset(&self) -> Option<usize> {
×
UNCOV
194
        self.offset
×
UNCOV
195
    }
×
196

197
    fn or_with_offset(self, offset: usize) -> Self {
4✔
198
        match self.offset {
4✔
UNCOV
199
            Some(_) => self,
×
200
            None => ParseError {
4✔
201
                offset: Some(offset),
4✔
202
                ..self
203
            },
4✔
204
        }
205
    }
4✔
206
}
207

208
/// The value of a variable in a UnicodeSet. Used as value type in [`VariableMap`].
209
#[derive(Debug, Clone)]
46✔
210
#[non_exhaustive]
211
pub enum VariableValue<'a> {
212
    /// A UnicodeSet, represented as a [`CodePointInversionListAndStringList`](CodePointInversionListAndStringList).
213
    UnicodeSet(CodePointInversionListAndStringList<'a>),
19✔
214
    // in theory, a one-code-point string is always the same as a char, but we might want to keep
215
    // this variant for efficiency?
216
    /// A single code point.
217
    Char(char),
23✔
218
    /// A string. It is guaranteed that when returned from a VariableMap, this variant contains never exactly one code point.
219
    String(Cow<'a, str>),
4✔
220
}
221

222
/// The map used for parsing UnicodeSets with variable support. See [`parse_with_variables`].
223
#[derive(Debug, Clone, Default)]
730✔
224
pub struct VariableMap<'a>(BTreeMap<String, VariableValue<'a>>);
365✔
225

226
impl<'a> VariableMap<'a> {
227
    /// Creates a new empty map.
228
    pub fn new() -> Self {
1✔
229
        Self::default()
1✔
230
    }
1✔
231

232
    /// Removes a key from the map, returning the value at the key if the key
233
    /// was previously in the map.
UNCOV
234
    pub fn remove(&mut self, key: &str) -> Option<VariableValue<'a>> {
×
UNCOV
235
        self.0.remove(key)
×
UNCOV
236
    }
×
237

238
    /// Get a reference to the value associated with this key, if it exists.
239
    pub fn get(&self, key: &str) -> Option<&VariableValue<'a>> {
7✔
240
        self.0.get(key)
7✔
241
    }
7✔
242

243
    /// Insert a `VariableValue` into the `VariableMap`.
244
    ///
245
    /// Returns `Err` with the old value, if it exists, and does not update the map.
246
    pub fn insert(&mut self, key: String, value: VariableValue<'a>) -> Result<(), &VariableValue> {
51✔
247
        // borrow-checker shenanigans, otherwise we could use if let
248
        if self.0.contains_key(&key) {
51✔
249
            // we just checked that this key exists
250
            #[allow(clippy::indexing_slicing)]
UNCOV
251
            return Err(&self.0[&key]);
×
252
        }
253

254
        if let VariableValue::String(s) = &value {
51✔
255
            let mut chars = s.chars();
21✔
256
            if let (Some(c), None) = (chars.next(), chars.next()) {
21✔
257
                self.0.insert(key, VariableValue::Char(c));
16✔
258
                return Ok(());
16✔
259
            };
260
        }
261

262
        self.0.insert(key, value);
35✔
263
        Ok(())
35✔
264
    }
51✔
265

266
    /// Insert a `char` into the `VariableMap`.    
267
    ///
268
    /// Returns `Err` with the old value, if it exists, and does not update the map.
269
    pub fn insert_char(&mut self, key: String, c: char) -> Result<(), &VariableValue> {
12✔
270
        // borrow-checker shenanigans, otherwise we could use if let
271
        if self.0.contains_key(&key) {
12✔
272
            // we just checked that this key exists
273
            #[allow(clippy::indexing_slicing)]
274
            return Err(&self.0[&key]);
1✔
275
        }
276

277
        self.0.insert(key, VariableValue::Char(c));
11✔
278
        Ok(())
11✔
279
    }
12✔
280

281
    /// Insert a `String` of any length into the `VariableMap`.
282
    ///
283
    /// Returns `Err` with the old value, if it exists, and does not update the map.
284
    pub fn insert_string(&mut self, key: String, s: String) -> Result<(), &VariableValue> {
3✔
285
        // borrow-checker shenanigans, otherwise we could use if let
286
        if self.0.contains_key(&key) {
3✔
287
            // we just checked that this key exists
288
            #[allow(clippy::indexing_slicing)]
UNCOV
289
            return Err(&self.0[&key]);
×
290
        }
291

292
        let mut chars = s.chars();
3✔
293
        let val = match (chars.next(), chars.next()) {
3✔
UNCOV
294
            (Some(c), None) => VariableValue::Char(c),
×
295
            _ => VariableValue::String(Cow::Owned(s)),
3✔
296
        };
297

298
        self.0.insert(key, val);
3✔
299
        Ok(())
3✔
300
    }
3✔
301

302
    /// Insert a `&str` of any length into the `VariableMap`.
303
    ///
304
    /// Returns `Err` with the old value, if it exists, and does not update the map.
UNCOV
305
    pub fn insert_str(&mut self, key: String, s: &'a str) -> Result<(), &VariableValue> {
×
306
        // borrow-checker shenanigans, otherwise we could use if let
307
        if self.0.contains_key(&key) {
×
308
            // we just checked that this key exists
309
            #[allow(clippy::indexing_slicing)]
310
            return Err(&self.0[&key]);
×
311
        }
312

313
        let mut chars = s.chars();
×
UNCOV
314
        let val = match (chars.next(), chars.next()) {
×
UNCOV
315
            (Some(c), None) => VariableValue::Char(c),
×
316
            _ => VariableValue::String(Cow::Borrowed(s)),
×
317
        };
318

UNCOV
319
        self.0.insert(key, val);
×
UNCOV
320
        Ok(())
×
UNCOV
321
    }
×
322

323
    /// Insert a [`CodePointInversionListAndStringList`](CodePointInversionListAndStringList) into the `VariableMap`.
324
    ///
325
    /// Returns `Err` with the old value, if it exists, and does not update the map.
326
    pub fn insert_set(
3✔
327
        &mut self,
328
        key: String,
329
        set: CodePointInversionListAndStringList<'a>,
330
    ) -> Result<(), &VariableValue> {
331
        // borrow-checker shenanigans, otherwise we could use if let
332
        if self.0.contains_key(&key) {
3✔
333
            // we just checked that this key exists
334
            #[allow(clippy::indexing_slicing)]
UNCOV
335
            return Err(&self.0[&key]);
×
336
        }
337
        self.0.insert(key, VariableValue::UnicodeSet(set));
3✔
338
        Ok(())
3✔
339
    }
3✔
340
}
341

342
// this ignores the ambiguity between \-escapes and \p{} perl properties. it assumes it is in a context where \p is just 'p'
// returns whether the provided char signifies the start of a literal char (raw or escaped - so \ is a legal char start)
// important: assumes c is not pattern_white_space
fn legal_char_start(c: char) -> bool {
    // every char except the ones with syntactic meaning may start a literal char
    !matches!(c, '&' | '-' | '$' | '^' | '[' | ']' | '{')
}
1,647✔
348

349
// same as `legal_char_start` but adapted to the charInString nonterminal. \ is allowed due to escapes.
// the only char that may not start a char inside a string is the closing '}'.
// important: assumes c is not pattern_white_space
fn legal_char_in_string_start(c: char) -> bool {
    c != '}'
}
283✔
354

UNCOV
355
// A literal char token, which is either complete or the start of a multi-escape sequence.
#[derive(Debug)]
enum SingleOrMultiChar {
    // A complete, single char.
    Single(char),
    // Multi is a marker that indicates parsing was paused and needs to be resumed using parse_multi_escape* when
    // this token is consumed. The contained char is the first char of the multi sequence.
    Multi(char),
}
362

363
// A char or a string. The Vec<char> represents multi-escapes in the 2+ case.
364
// invariant: a String is either zero or 2+ chars long, a one-char-string is equivalent to a single char.
365
// invariant: a char is 1+ chars long
366
#[derive(Debug)]
×
367
enum Literal {
UNCOV
368
    String(String),
×
369
    CharKind(SingleOrMultiChar),
×
370
}
371

372
#[derive(Debug)]
×
373
enum MainToken<'data> {
374
    // to be interpreted as value
UNCOV
375
    Literal(Literal),
×
376
    // inner set
UNCOV
377
    UnicodeSet(CodePointInversionListAndStringList<'data>),
×
378
    // anchor, only at the end of a set ([... $])
379
    DollarSign,
380
    // intersection operator, only inbetween two sets ([[...] & [...]])
381
    Ampersand,
382
    // difference operator, only inbetween two sets ([[...] - [...]])
383
    // or
384
    // range operator, only inbetween two chars ([a-z], [a-{z}])
385
    Minus,
386
    // ] to indicate the end of a set
387
    ClosingBracket,
388
}
389

390
impl<'data> MainToken<'data> {
391
    fn from_variable_value(val: VariableValue<'data>) -> Self {
44✔
392
        match val {
44✔
393
            VariableValue::Char(c) => {
21✔
394
                MainToken::Literal(Literal::CharKind(SingleOrMultiChar::Single(c)))
21✔
395
            }
21✔
396
            VariableValue::String(s) => {
4✔
397
                // we know that the VariableMap only contains non-length-1 Strings.
398
                MainToken::Literal(Literal::String(s.into_owned()))
4✔
399
            }
4✔
400
            VariableValue::UnicodeSet(set) => MainToken::UnicodeSet(set),
19✔
401
        }
402
    }
44✔
403
}
404

UNCOV
405
// The operation used to combine the next inner set with the set built so far.
#[derive(Debug, Clone, Copy)]
enum Operation {
    Union,
    Difference,
    Intersection,
}
411

412
// this builds the set on-the-fly while parsing it
413
struct UnicodeSetBuilder<'a, 'b, P: ?Sized> {
414
    single_set: CodePointInversionListBuilder,
415
    string_set: BTreeSet<String>,
416
    iter: &'a mut Peekable<CharIndices<'b>>,
417
    source: &'b str,
418
    inverted: bool,
419
    variable_map: &'a VariableMap<'a>,
420
    xid_start: &'a CodePointInversionList<'a>,
421
    xid_continue: &'a CodePointInversionList<'a>,
422
    pat_ws: &'a CodePointInversionList<'a>,
423
    property_provider: &'a P,
424
}
425

426
impl<'a, 'b, P> UnicodeSetBuilder<'a, 'b, P>
427
where
428
    P: ?Sized
429
        + DataProvider<AsciiHexDigitV1Marker>
430
        + DataProvider<AlphabeticV1Marker>
431
        + DataProvider<BidiControlV1Marker>
432
        + DataProvider<BidiMirroredV1Marker>
433
        + DataProvider<CanonicalCombiningClassV1Marker>
434
        + DataProvider<CanonicalCombiningClassNameToValueV2Marker>
435
        + DataProvider<CaseIgnorableV1Marker>
436
        + DataProvider<CasedV1Marker>
437
        + DataProvider<ChangesWhenCasefoldedV1Marker>
438
        + DataProvider<ChangesWhenCasemappedV1Marker>
439
        + DataProvider<ChangesWhenLowercasedV1Marker>
440
        + DataProvider<ChangesWhenNfkcCasefoldedV1Marker>
441
        + DataProvider<ChangesWhenTitlecasedV1Marker>
442
        + DataProvider<ChangesWhenUppercasedV1Marker>
443
        + DataProvider<DashV1Marker>
444
        + DataProvider<DefaultIgnorableCodePointV1Marker>
445
        + DataProvider<DeprecatedV1Marker>
446
        + DataProvider<DiacriticV1Marker>
447
        + DataProvider<EmojiV1Marker>
448
        + DataProvider<EmojiComponentV1Marker>
449
        + DataProvider<EmojiModifierV1Marker>
450
        + DataProvider<EmojiModifierBaseV1Marker>
451
        + DataProvider<EmojiPresentationV1Marker>
452
        + DataProvider<ExtendedPictographicV1Marker>
453
        + DataProvider<ExtenderV1Marker>
454
        + DataProvider<GraphemeBaseV1Marker>
455
        + DataProvider<GraphemeClusterBreakV1Marker>
456
        + DataProvider<GraphemeClusterBreakNameToValueV2Marker>
457
        + DataProvider<GraphemeExtendV1Marker>
458
        + DataProvider<HexDigitV1Marker>
459
        + DataProvider<IdsBinaryOperatorV1Marker>
460
        + DataProvider<IdsTrinaryOperatorV1Marker>
461
        + DataProvider<IdContinueV1Marker>
462
        + DataProvider<IdStartV1Marker>
463
        + DataProvider<IdeographicV1Marker>
464
        + DataProvider<JoinControlV1Marker>
465
        + DataProvider<LogicalOrderExceptionV1Marker>
466
        + DataProvider<LowercaseV1Marker>
467
        + DataProvider<MathV1Marker>
468
        + DataProvider<NoncharacterCodePointV1Marker>
469
        + DataProvider<PatternSyntaxV1Marker>
470
        + DataProvider<PatternWhiteSpaceV1Marker>
471
        + DataProvider<QuotationMarkV1Marker>
472
        + DataProvider<RadicalV1Marker>
473
        + DataProvider<RegionalIndicatorV1Marker>
474
        + DataProvider<SentenceBreakV1Marker>
475
        + DataProvider<SentenceBreakNameToValueV2Marker>
476
        + DataProvider<SentenceTerminalV1Marker>
477
        + DataProvider<SoftDottedV1Marker>
478
        + DataProvider<TerminalPunctuationV1Marker>
479
        + DataProvider<UnifiedIdeographV1Marker>
480
        + DataProvider<UppercaseV1Marker>
481
        + DataProvider<VariationSelectorV1Marker>
482
        + DataProvider<WhiteSpaceV1Marker>
483
        + DataProvider<WordBreakV1Marker>
484
        + DataProvider<WordBreakNameToValueV2Marker>
485
        + DataProvider<XidContinueV1Marker>
486
        + DataProvider<GeneralCategoryMaskNameToValueV2Marker>
487
        + DataProvider<GeneralCategoryV1Marker>
488
        + DataProvider<ScriptNameToValueV2Marker>
489
        + DataProvider<ScriptV1Marker>
490
        + DataProvider<ScriptWithExtensionsPropertyV1Marker>
491
        + DataProvider<XidStartV1Marker>,
492
{
493
    fn new_internal(
687✔
494
        iter: &'a mut Peekable<CharIndices<'b>>,
495
        source: &'b str,
496
        variable_map: &'a VariableMap<'a>,
497
        xid_start: &'a CodePointInversionList<'a>,
498
        xid_continue: &'a CodePointInversionList<'a>,
499
        pat_ws: &'a CodePointInversionList<'a>,
500
        provider: &'a P,
501
    ) -> Self {
502
        UnicodeSetBuilder {
687✔
503
            single_set: CodePointInversionListBuilder::new(),
687✔
504
            string_set: Default::default(),
687✔
505
            iter,
506
            source,
507
            inverted: false,
508
            variable_map,
509
            xid_start,
510
            xid_continue,
511
            pat_ws,
512
            property_provider: provider,
UNCOV
513
        }
×
514
    }
687✔
515

516
    // the entry point, parses a full UnicodeSet. ignores remaining input
517
    fn parse_unicode_set(&mut self) -> Result<()> {
688✔
518
        match self.must_peek_char()? {
688✔
519
            '\\' => self.parse_property_perl(),
26✔
520
            '[' => {
521
                self.iter.next();
657✔
522
                if let Some(':') = self.peek_char() {
657✔
523
                    self.parse_property_posix()
85✔
524
                } else {
525
                    self.parse_unicode_set_inner()
572✔
526
                }
527
            }
528
            '$' => {
529
                // must be variable ref to a UnicodeSet
530
                let (offset, v) = self.parse_variable()?;
3✔
531
                match v {
2✔
532
                    Some(VariableValue::UnicodeSet(s)) => {
1✔
533
                        self.single_set.add_set(s.code_points());
1✔
534
                        self.string_set
2✔
535
                            .extend(s.strings().iter().map(ToString::to_string));
1✔
536
                        Ok(())
1✔
537
                    }
1✔
538
                    Some(_) => Err(PEK::UnexpectedVariable.with_offset(offset)),
1✔
UNCOV
539
                    None => Err(PEK::UnexpectedChar('$').with_offset(offset)),
×
540
                }
541
            }
542
            c => self.error_here(PEK::UnexpectedChar(c)),
1✔
543
        }
544
    }
688✔
545

546
    // beginning [ is already consumed
547
    fn parse_unicode_set_inner(&mut self) -> Result<()> {
600✔
548
        // special cases for the first chars after [
549
        if self.must_peek_char()? == '^' {
600✔
550
            self.iter.next();
111✔
551
            self.inverted = true;
111✔
552
        }
553
        // whitespace allowed between ^ and - in `[^ - ....]`
554
        self.skip_whitespace();
600✔
555
        if self.must_peek_char()? == '-' {
600✔
556
            self.iter.next();
7✔
557
            self.single_set.add_char('-');
7✔
558
        }
559

560
        // repeatedly parse the following:
561
        // char
562
        // char-char
563
        // {string}
564
        // unicodeset
565
        // & and - operators, but only between unicodesets
566
        // $variables in place of strings, chars, or unicodesets
567

UNCOV
568
        #[derive(Debug, Clone, Copy)]
×
569
        enum State {
570
            // a state equivalent to the beginning
571
            Begin,
572
            // a state after a char. implies `prev_char` is Some(_), because we need to buffer it
573
            // in case it is part of a range, e.g., a-z
574
            Char,
575
            // in the middle of parsing a range. implies `prev_char` is Some(_), and the next
576
            // element must be a char as well
577
            CharMinus,
578
            // state directly after parsing a recursive unicode set. operators are only allowed
579
            // in this state
580
            AfterUnicodeSet,
581
            // state directly after parsing an operator. forces the next element to be a recursive
582
            // unicode set
583
            AfterOp,
584
            // state after parsing a $ (that was not a variable reference)
585
            // the only valid next option is a closing bracket
586
            AfterDollar,
587
            // state after parsing a - in an otherwise invalid position
588
            // the only valid next option is a closing bracket
589
            AfterMinus,
590
        }
591
        use State::*;
592

593
        const DEFAULT_OP: Operation = Operation::Union;
594

595
        let mut state = Begin;
600✔
596
        let mut prev_char = None;
600✔
597
        let mut operation = Operation::Union;
600✔
598

599
        loop {
2,532✔
600
            self.skip_whitespace();
2,532✔
601

602
            // for error messages
603
            let (immediate_offset, immediate_char) = self.must_peek()?;
2,532✔
604

605
            let (tok_offset, from_var, tok) = self.parse_main_token()?;
2,532✔
606
            // warning: self.iter should not be advanced any more after this point on any path to
607
            // MT::Literal(Literal::CharKind(SingleOrMultiChar::Multi)), because that variant
608
            // expects a certain self.iter state
609

610
            use MainToken as MT;
611
            use SingleOrMultiChar as SMC;
612
            match (state, tok) {
2,518✔
613
                // the end of this unicode set
614
                (
615
                    Begin | Char | CharMinus | AfterUnicodeSet | AfterDollar | AfterMinus,
616
                    MT::ClosingBracket,
617
                ) => {
618
                    if let Some(prev) = prev_char.take() {
534✔
619
                        self.single_set.add_char(prev);
142✔
620
                    }
621
                    if matches!(state, CharMinus) {
534✔
622
                        self.single_set.add_char('-');
2✔
623
                    }
624

625
                    return Ok(());
534✔
626
                }
627
                // special case ends for -
628
                // [[a-z]-]
629
                (AfterOp, MT::ClosingBracket) if matches!(operation, Operation::Difference) => {
2✔
630
                    self.single_set.add_char('-');
1✔
631
                    return Ok(());
1✔
632
                }
633
                (Begin, MT::Minus) => {
9✔
634
                    self.single_set.add_char('-');
9✔
635
                    state = AfterMinus;
9✔
636
                }
637
                // inner unicode set
638
                (Begin | Char | AfterUnicodeSet | AfterOp, MT::UnicodeSet(set)) => {
209✔
639
                    if let Some(prev) = prev_char.take() {
209✔
640
                        self.single_set.add_char(prev);
3✔
641
                    }
642

643
                    self.process_chars(operation, set.code_points().clone());
209✔
644
                    self.process_strings(
209✔
645
                        operation,
209✔
646
                        set.strings().iter().map(ToString::to_string).collect(),
209✔
647
                    );
648

649
                    operation = DEFAULT_OP;
209✔
650
                    state = AfterUnicodeSet;
209✔
651
                }
209✔
652
                // a literal char (either individually or as the start of a range if char)
653
                (
654
                    Begin | Char | AfterUnicodeSet,
655
                    MT::Literal(Literal::CharKind(SMC::Single(c))),
1,074✔
656
                ) => {
657
                    if let Some(prev) = prev_char.take() {
1,074✔
658
                        self.single_set.add_char(prev);
613✔
659
                    }
660
                    prev_char = Some(c);
1,064✔
661
                    state = Char;
1,064✔
662
                }
1,064✔
663
                // a bunch of literal chars as part of a multi-escape sequence
664
                (
665
                    Begin | Char | AfterUnicodeSet,
666
                    MT::Literal(Literal::CharKind(SMC::Multi(first_c))),
6✔
667
                ) => {
668
                    if let Some(prev) = prev_char.take() {
6✔
UNCOV
669
                        self.single_set.add_char(prev);
×
670
                    }
671
                    self.single_set.add_char(first_c);
4✔
672
                    self.parse_multi_escape_into_set()?;
604✔
673

674
                    // Note we cannot go to the Char state, because a multi-escape sequence of
675
                    // length > 1 cannot initiate a range
676
                    state = Begin;
3✔
677
                }
3✔
678
                // a literal string (length != 1, by CharOrString invariant)
679
                (Begin | Char | AfterUnicodeSet, MT::Literal(Literal::String(s))) => {
59✔
680
                    if let Some(prev) = prev_char.take() {
59✔
681
                        self.single_set.add_char(prev);
21✔
682
                    }
683

684
                    self.string_set.insert(s);
59✔
685
                    state = Begin;
59✔
686
                }
59✔
687
                // parse a literal char as the end of a range
688
                (CharMinus, MT::Literal(Literal::CharKind(SMC::Single(c)))) => {
259✔
689
                    let start = prev_char.ok_or(PEK::Internal.with_offset(tok_offset))?;
259✔
690
                    let end = c;
691
                    if start > end {
259✔
692
                        // TODO(#3558): Better error message (e.g., "start greater than end in range")?
693
                        return Err(PEK::UnexpectedChar(end).with_offset(tok_offset));
4✔
694
                    }
695

696
                    self.single_set.add_range(start..=end);
255✔
697
                    prev_char = None;
255✔
698
                    state = Begin;
255✔
699
                }
255✔
700
                // start parsing a char range
701
                (Char, MT::Minus) => {
266✔
702
                    state = CharMinus;
266✔
703
                }
704
                // start parsing a unicode set difference
705
                (AfterUnicodeSet, MT::Minus) => {
30✔
706
                    operation = Operation::Difference;
30✔
707
                    state = AfterOp;
30✔
708
                }
709
                // start parsing a unicode set difference
710
                (AfterUnicodeSet, MT::Ampersand) => {
27✔
711
                    operation = Operation::Intersection;
27✔
712
                    state = AfterOp;
27✔
713
                }
714
                (Begin | Char | AfterUnicodeSet, MT::DollarSign) => {
37✔
715
                    if let Some(prev) = prev_char.take() {
28✔
716
                        self.single_set.add_char(prev);
21✔
717
                    }
718
                    self.single_set.add_char('\u{FFFF}');
37✔
719
                    state = AfterDollar;
37✔
720
                }
721
                _ => {
722
                    // TODO(#3558): We have precise knowledge about the following MainToken here,
723
                    //  should we make use of that?
724

725
                    if from_var {
18✔
726
                        // otherwise we get error messages such as
727
                        // [$a-$← error: unexpected character '$'
728
                        // for input [$a-$b], $a = 'a', $b = "string" ;
729
                        return Err(PEK::UnexpectedVariable.with_offset(tok_offset));
5✔
730
                    }
731
                    return Err(PEK::UnexpectedChar(immediate_char).with_offset(immediate_offset));
13✔
732
                }
733
            }
734
        }
2,490✔
735
    }
572✔
736

737
    fn parse_main_token(&mut self) -> Result<(usize, bool, MainToken<'a>)> {
2,532✔
738
        let (initial_offset, first) = self.must_peek()?;
2,532✔
739
        if first == ']' {
2,532✔
740
            self.iter.next();
536✔
741
            return Ok((initial_offset, false, MainToken::ClosingBracket));
536✔
742
        }
743
        let (_, second) = self.must_peek_double()?;
1,996✔
744
        match (first, second) {
2,330✔
745
            // variable or anchor
746
            ('$', _) => {
747
                let (offset, var_or_anchor) = self.parse_variable()?;
82✔
748
                match var_or_anchor {
81✔
749
                    None => Ok((offset, false, MainToken::DollarSign)),
37✔
750
                    Some(v) => Ok((offset, true, MainToken::from_variable_value(v.clone()))),
44✔
751
                }
752
            }
753
            // string
754
            ('{', _) => self
71✔
755
                .parse_string()
756
                .map(|(offset, l)| (offset, false, MainToken::Literal(l))),
70✔
757
            // inner set
758
            ('\\', 'p' | 'P') | ('[', _) => {
759
                let mut inner_builder = UnicodeSetBuilder::new_internal(
193✔
760
                    self.iter,
193✔
761
                    self.source,
193✔
762
                    self.variable_map,
193✔
763
                    self.xid_start,
193✔
764
                    self.xid_continue,
193✔
765
                    self.pat_ws,
193✔
766
                    self.property_provider,
193✔
767
                );
768
                inner_builder.parse_unicode_set()?;
2,725✔
769
                let (single, string_set) = inner_builder.finalize();
193✔
770
                // note: offset - 1, because we already consumed full set
771
                let offset = self.must_peek_index()? - 1;
193✔
772
                let mut strings = string_set.into_iter().collect::<Vec<_>>();
192✔
773
                strings.sort();
192✔
774
                let cpilasl = CodePointInversionListAndStringList::try_from(
192✔
775
                    single.build(),
192✔
776
                    VarZeroVec::from(&strings),
192✔
777
                )
192✔
UNCOV
778
                .map_err(|_| PEK::Internal.with_offset(offset))?;
×
779
                Ok((offset, false, MainToken::UnicodeSet(cpilasl)))
192✔
780
            }
193✔
781
            // note: c cannot be a whitespace, because we called skip_whitespace just before
782
            // (in the main parse loop), so it's safe to call this guard function
783
            (c, _) if legal_char_start(c) => self
1,648✔
784
                .parse_char()
785
                .map(|(offset, c)| (offset, false, MainToken::Literal(Literal::CharKind(c)))),
1,304✔
786
            ('-', _) => {
787
                self.iter.next();
306✔
788
                Ok((initial_offset, false, MainToken::Minus))
306✔
789
            }
790
            ('&', _) => {
791
                self.iter.next();
29✔
792
                Ok((initial_offset, false, MainToken::Ampersand))
29✔
793
            }
794
            (c, _) => Err(PEK::UnexpectedChar(c).with_offset(initial_offset)),
1✔
795
        }
796
    }
2,532✔
797

798
    // parses a variable or an anchor. expects '$' as next token.
799
    // if this is a single $ (eg `[... $ ]` or the invalid `$ a`), then this function returns Ok(None),
800
    // otherwise Ok(Some(variable_value)).
801
    fn parse_variable(&mut self) -> Result<(usize, Option<&'a VariableValue<'a>>)> {
85✔
802
        self.consume('$')?;
85✔
803

804
        let mut res = String::new();
85✔
805
        let (mut var_offset, first_c) = self.must_peek()?;
85✔
806

807
        if !self.xid_start.contains(first_c) {
84✔
808
            // -1 because we already consumed the '$'
809
            return Ok((var_offset - 1, None));
37✔
810
        }
811

812
        res.push(first_c);
47✔
813
        self.iter.next();
47✔
814
        // important: if we are parsing a root unicodeset as a variable, we might reach EOF as
815
        // a valid end of the variable name, so we cannot use must_peek here.
816
        while let Some(&(offset, c)) = self.iter.peek() {
238✔
817
            if !self.xid_continue.contains(c) {
236✔
818
                break;
819
            }
820
            // only update the offset if we're adding a new char to our variable
821
            var_offset = offset;
191✔
822
            self.iter.next();
191✔
823
            res.push(c);
191✔
824
        }
825

826
        if let Some(v) = self.variable_map.0.get(&res) {
47✔
827
            return Ok((var_offset, Some(v)));
46✔
828
        }
829

830
        Err(PEK::UnknownVariable.with_offset(var_offset))
1✔
831
    }
85✔
832

833
    // parses and consumes: '{' (s charInString)* s '}'
834
    fn parse_string(&mut self) -> Result<(usize, Literal)> {
71✔
835
        self.consume('{')?;
71✔
836

837
        let mut buffer = String::new();
71✔
838
        let mut last_offset;
839

840
        loop {
841
            self.skip_whitespace();
354✔
842
            last_offset = self.must_peek_index()?;
354✔
843
            match self.must_peek_char()? {
353✔
844
                '}' => {
845
                    self.iter.next();
70✔
846
                    break;
847
                }
848
                // note: c cannot be a whitespace, because we called skip_whitespace just before,
849
                // so it's safe to call this guard function
850
                c if legal_char_in_string_start(c) => {
283✔
851
                    // don't need the offset, because '}' will always be the last char
852
                    let (_, c) = self.parse_char()?;
283✔
853
                    match c {
283✔
854
                        SingleOrMultiChar::Single(c) => buffer.push(c),
282✔
855
                        SingleOrMultiChar::Multi(first) => {
1✔
856
                            buffer.push(first);
1✔
857
                            self.parse_multi_escape_into_string(&mut buffer)?;
72✔
858
                        }
859
                    }
860
                }
UNCOV
861
                c => return self.error_here(PEK::UnexpectedChar(c)),
×
862
            }
863
        }
864

865
        let mut chars = buffer.chars();
70✔
866
        let literal = match (chars.next(), chars.next()) {
70✔
867
            (Some(c), None) => Literal::CharKind(SingleOrMultiChar::Single(c)),
14✔
868
            _ => Literal::String(buffer),
56✔
869
        };
870
        Ok((last_offset, literal))
70✔
871
    }
71✔
872

873
    // finishes a partial multi escape parse. in case of a parse error, self.single_set
874
    // may be left in an inconsistent state
875
    fn parse_multi_escape_into_set(&mut self) -> Result<()> {
4✔
876
        // note: would be good to somehow merge the two multi_escape methods. splitting up the UnicodeSetBuilder into a more
877
        // conventional parser + lexer combo might allow this.
878
        // issue is that we cannot pass this method an argument that somehow mutates `self` in the current architecture.
879
        // self.lexer.parse_multi_into_charappendable(&mut self.single_set) should work because the lifetimes are separate
880

881
        // whitespace before first char of this loop (ie, second char in this multi_escape) must be
882
        // enforced when creating the SingleOrMultiChar::Multi.
883
        let mut first = true;
4✔
884
        loop {
4✔
885
            let skipped = self.skip_whitespace();
10✔
886
            match self.must_peek_char()? {
10✔
887
                '}' => {
888
                    self.iter.next();
3✔
889
                    return Ok(());
3✔
890
                }
891
                initial_c => {
892
                    if skipped == 0 && !first {
7✔
893
                        // bracketed hex code points must be separated by whitespace
894
                        return self.error_here(PEK::UnexpectedChar(initial_c));
1✔
895
                    }
896
                    first = false;
6✔
897

898
                    let (_, c) = self.parse_hex_digits_into_char(1, 6)?;
6✔
899
                    self.single_set.add_char(c);
6✔
900
                }
901
            }
902
        }
903
    }
4✔
904

905
    // finishes a partial multi escape parse. in case of a parse error, the caller must clean up the
906
    // string if necessary.
907
    fn parse_multi_escape_into_string(&mut self, s: &mut String) -> Result<()> {
1✔
908
        // whitespace before first char of this loop (ie, second char in this multi_escape) must be
909
        // enforced when creating the SingleOrMultiChar::Multi.
910
        let mut first = true;
1✔
911
        loop {
1✔
912
            let skipped = self.skip_whitespace();
3✔
913
            match self.must_peek_char()? {
3✔
914
                '}' => {
915
                    self.iter.next();
1✔
916
                    return Ok(());
1✔
917
                }
918
                initial_c => {
919
                    if skipped == 0 && !first {
2✔
920
                        // bracketed hex code points must be separated by whitespace
UNCOV
921
                        return self.error_here(PEK::UnexpectedChar(initial_c));
×
922
                    }
923
                    first = false;
2✔
924

925
                    let (_, c) = self.parse_hex_digits_into_char(1, 6)?;
2✔
926
                    s.push(c);
2✔
927
                }
928
            }
929
        }
930
    }
1✔
931

932
    // starts with \ and consumes the whole escape sequence if a single
933
    // char is escaped, otherwise pauses the parse after the first char
934
    fn parse_escaped_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
133✔
935
        self.consume('\\')?;
133✔
936

937
        let (offset, next_char) = self.must_next()?;
133✔
938

939
        match next_char {
133✔
940
            'u' | 'x' if self.peek_char() == Some('{') => {
60✔
941
                // bracketedHex
942
                self.iter.next();
20✔
943

944
                self.skip_whitespace();
20✔
945
                let (_, first_c) = self.parse_hex_digits_into_char(1, 6)?;
20✔
946
                let skipped = self.skip_whitespace();
16✔
947

948
                match self.must_peek()? {
16✔
949
                    (offset, '}') => {
7✔
950
                        self.iter.next();
7✔
951
                        Ok((offset, SingleOrMultiChar::Single(first_c)))
7✔
952
                    }
7✔
953
                    // note: enforcing whitespace after the first char here, because the parse_multi_escape functions
954
                    // won't have access to this information anymore
955
                    (offset, c) if c.is_ascii_hexdigit() && skipped > 0 => {
9✔
956
                        Ok((offset, SingleOrMultiChar::Multi(first_c)))
7✔
957
                    }
7✔
958
                    (_, c) => self.error_here(PEK::UnexpectedChar(c)),
2✔
959
                }
960
            }
961
            'u' => {
962
                // 'u' hex{4}
963
                self.parse_hex_digits_into_char(4, 4)
24✔
964
                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
24✔
965
            }
966
            'x' => {
967
                // 'x' hex{2}
968
                self.parse_hex_digits_into_char(2, 2)
16✔
969
                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
15✔
970
            }
971
            'U' => {
972
                // 'U00' ('0' hex{5} | '10' hex{4})
973
                self.consume('0')?;
3✔
974
                self.consume('0')?;
136✔
975
                self.parse_hex_digits_into_char(6, 6)
3✔
976
                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
3✔
977
            }
978
            'N' => {
979
                // parse code point with name in {}
980
                // tracking issue: https://github.com/unicode-org/icu4x/issues/1397
981
                Err(PEK::Unimplemented.with_offset(offset))
1✔
982
            }
983
            'a' => Ok((offset, SingleOrMultiChar::Single('\u{0007}'))),
×
UNCOV
984
            'b' => Ok((offset, SingleOrMultiChar::Single('\u{0008}'))),
×
UNCOV
985
            't' => Ok((offset, SingleOrMultiChar::Single('\u{0009}'))),
×
986
            'n' => Ok((offset, SingleOrMultiChar::Single('\u{000A}'))),
9✔
UNCOV
987
            'v' => Ok((offset, SingleOrMultiChar::Single('\u{000B}'))),
×
UNCOV
988
            'f' => Ok((offset, SingleOrMultiChar::Single('\u{000C}'))),
×
989
            'r' => Ok((offset, SingleOrMultiChar::Single('\u{000D}'))),
8✔
990
            _ => Ok((offset, SingleOrMultiChar::Single(next_char))),
52✔
991
        }
992
    }
133✔
993

994
    // starts with :, consumes the trailing :]
995
    fn parse_property_posix(&mut self) -> Result<()> {
85✔
996
        self.consume(':')?;
85✔
997
        if self.must_peek_char()? == '^' {
85✔
998
            self.inverted = true;
3✔
999
            self.iter.next();
3✔
1000
        }
1001

1002
        self.parse_property_inner(':')?;
85✔
1003

1004
        self.consume(']')?;
162✔
1005

1006
        Ok(())
77✔
1007
    }
85✔
1008

1009
    // starts with \p{ or \P{, consumes the trailing }
1010
    fn parse_property_perl(&mut self) -> Result<()> {
26✔
1011
        self.consume('\\')?;
26✔
1012
        match self.must_next()? {
26✔
1013
            (_, 'p') => {}
1014
            (_, 'P') => self.inverted = true,
1✔
UNCOV
1015
            (offset, c) => return Err(PEK::UnexpectedChar(c).with_offset(offset)),
×
1016
        }
1017
        self.consume('{')?;
26✔
1018

1019
        self.parse_property_inner('}')?;
51✔
1020

1021
        Ok(())
22✔
1022
    }
26✔
1023

1024
    fn parse_property_inner(&mut self, end: char) -> Result<()> {
114✔
1025
        // UnicodeSet spec ignores whitespace, '-', and '_',
1026
        // but ECMA-262 requires '_', so we'll allow that.
1027
        // TODO(#3559): support loose matching on property names (e.g., "AS  -_-  CII_Hex_ D-igit")
1028
        // TODO(#3559): support more properties than ECMA-262
1029

1030
        let property_offset;
1031

1032
        let mut key_buffer = String::new();
114✔
1033
        let mut value_buffer = String::new();
114✔
1034

1035
        enum State {
1036
            // initial state, nothing parsed yet
1037
            Begin,
1038
            // non-empty property name
1039
            PropertyName,
1040
            // property name parsed, '=' or '≠' parsed, no value parsed yet
1041
            PropertyValueBegin,
1042
            // non-empty property name, non-empty property value
1043
            PropertyValue,
1044
        }
1045
        use State::*;
1046

1047
        let mut state = Begin;
114✔
1048
        // whether '=' (true) or '≠' (false) was parsed
1049
        let mut equality = true;
114✔
1050

1051
        loop {
114✔
1052
            self.skip_whitespace();
684✔
1053
            match (state, self.must_peek_char()?) {
1,355✔
1054
                // parse the end of the property expression
1055
                (PropertyName | PropertyValue, c) if c == end => {
539✔
1056
                    // byte index of (full) property name/value is one back
1057
                    property_offset = self.must_peek_index()? - 1;
103✔
1058
                    self.iter.next();
103✔
1059
                    break;
1060
                }
1061
                // parse the property name
1062
                // NOTE: this might be too strict, because in the case of e.g. [:value:], we might want to
1063
                // allow [:lower-case-letter:] ([:gc=lower-case-letter:] works)
1064
                (Begin | PropertyName, c) if c.is_ascii_alphanumeric() || c == '_' => {
437✔
1065
                    key_buffer.push(c);
400✔
1066
                    self.iter.next();
400✔
1067
                    state = PropertyName;
400✔
1068
                }
400✔
1069
                // parse the name-value separator
1070
                (PropertyName, c @ ('=' | '≠')) => {
33✔
1071
                    equality = c == '=';
33✔
1072
                    self.iter.next();
33✔
1073
                    state = PropertyValueBegin;
31✔
1074
                }
31✔
1075
                // parse the property value
1076
                (PropertyValue | PropertyValueBegin, c) if c != end => {
140✔
1077
                    value_buffer.push(c);
139✔
1078
                    self.iter.next();
139✔
1079
                    state = PropertyValue;
139✔
1080
                }
139✔
1081
                (_, c) => return self.error_here(PEK::UnexpectedChar(c)),
5✔
1082
            }
1083
        }
1084

1085
        if !equality {
103✔
1086
            self.inverted = !self.inverted;
5✔
1087
        }
1088

1089
        let inverted = self
103✔
1090
            .load_property_codepoints(&key_buffer, &value_buffer)
103✔
1091
            // any error that does not already have an offset should use the appropriate property offset
1092
            .map_err(|e| e.or_with_offset(property_offset))?;
8✔
1093
        if inverted {
99✔
1094
            self.inverted = !self.inverted;
3✔
1095
        }
1096

1097
        Ok(())
99✔
1098
    }
110✔
1099

1100
    // returns whether the set needs to be inverted or not
1101
    fn load_property_codepoints(&mut self, key: &str, value: &str) -> Result<bool> {
103✔
1102
        // we support:
1103
        // [:gc = value:]
1104
        // [:sc = value:]
1105
        // [:scx = value:]
1106
        // [:Grapheme_Cluster_Break = value:]
1107
        // [:Sentence_Break = value:]
1108
        // [:Word_Break = value:]
1109
        // [:value:] - looks up value in gc, sc
1110
        // [:prop:] - binary property, returns codepoints that have the property
1111
        // [:prop = truthy/falsy:] - same as above
1112

1113
        let mut inverted = false;
103✔
1114

1115
        // contains a value for the General_Category property that needs to be tried
1116
        let mut try_gc = Err(PEK::UnknownProperty.into());
103✔
1117
        // contains a value for the Script property that needs to be tried
1118
        let mut try_sc = Err(PEK::UnknownProperty.into());
103✔
1119
        // contains a value for the Script_Extensions property that needs to be tried
1120
        let mut try_scx = Err(PEK::UnknownProperty.into());
103✔
1121
        // contains a value for the Grapheme_Cluster_Break property that needs to be tried
1122
        let mut try_gcb = Err(PEK::UnknownProperty.into());
103✔
1123
        // contains a value for the Sentence_Break property that needs to be tried
1124
        let mut try_sb = Err(PEK::UnknownProperty.into());
103✔
1125
        // contains a value for the Word_Break property that needs to be tried
1126
        let mut try_wb = Err(PEK::UnknownProperty.into());
103✔
1127
        // contains a supposed binary property name that needs to be tried
1128
        let mut try_binary = Err(PEK::UnknownProperty.into());
103✔
1129
        // contains a supposed canonical combining class property name that needs to be tried
1130
        let mut try_ccc: Result<&str, ParseError> = Err(PEK::UnknownProperty.into());
103✔
1131
        // contains a supposed block property name that needs to be tried
1132
        let mut try_block: Result<&str, ParseError> = Err(PEK::UnknownProperty.into());
103✔
1133

1134
        if !value.is_empty() {
103✔
1135
            // key is gc, sc, scx, grapheme cluster break, sentence break, word break
1136
            // value is a property value
1137
            // OR
1138
            // key is a binary property and value is a truthy/falsy value
1139

1140
            match key.as_bytes() {
47✔
1141
                GeneralCategory::NAME | GeneralCategory::SHORT_NAME => try_gc = Ok(value),
34✔
1142
                GraphemeClusterBreak::NAME | GraphemeClusterBreak::SHORT_NAME => {
20✔
1143
                    try_gcb = Ok(value)
1✔
1144
                }
1145
                Script::NAME | Script::SHORT_NAME => try_sc = Ok(value),
20✔
1146
                SentenceBreak::NAME | SentenceBreak::SHORT_NAME => try_sb = Ok(value),
16✔
1147
                WordBreak::NAME | WordBreak::SHORT_NAME => try_wb = Ok(value),
15✔
1148
                CanonicalCombiningClass::NAME | CanonicalCombiningClass::SHORT_NAME => {
13✔
UNCOV
1149
                    try_ccc = Ok(value)
×
1150
                }
1151
                b"Script_Extensions" | b"scx" => try_scx = Ok(value),
17✔
1152
                b"Block" | b"blk" => try_block = Ok(value),
13✔
1153
                _ => {
1154
                    let normalized_value = value.to_ascii_lowercase();
14✔
1155
                    let truthy = matches!(normalized_value.as_str(), "true" | "t" | "yes" | "y");
14✔
1156
                    let falsy = matches!(normalized_value.as_str(), "false" | "f" | "no" | "n");
14✔
1157
                    // value must either match truthy or falsy
1158
                    if truthy == falsy {
14✔
UNCOV
1159
                        return Err(PEK::UnknownProperty.into());
×
1160
                    }
1161
                    // correctness: if we reach this point, only `try_binary` can be Ok, hence
1162
                    // it does not matter that further down we unconditionally return `inverted`,
1163
                    // because only `try_binary` can enter that code path.
1164
                    inverted = falsy;
14✔
1165
                    try_binary = Ok(key);
14✔
1166
                }
14✔
1167
            }
1168
        } else {
1169
            // key is binary property
1170
            // OR a value of gc, sc (only gc or sc are supported as implicit keys by UTS35!)
1171
            try_gc = Ok(key);
73✔
1172
            try_sc = Ok(key);
73✔
1173
            try_binary = Ok(key);
73✔
1174
        }
1175

1176
        try_gc
206✔
1177
            .and_then(|value| self.try_load_general_category_set(value))
77✔
1178
            .or_else(|_| try_sc.and_then(|value| self.try_load_script_set(value)))
71✔
1179
            .or_else(|_| try_scx.and_then(|value| self.try_load_script_extensions_set(value)))
36✔
1180
            .or_else(|_| try_binary.and_then(|value| self.try_load_ecma262_binary_set(value)))
51✔
1181
            .or_else(|_| try_gcb.and_then(|value| self.try_load_grapheme_cluster_break_set(value)))
8✔
1182
            .or_else(|_| try_sb.and_then(|value| self.try_load_sentence_break_set(value)))
7✔
1183
            .or_else(|_| try_wb.and_then(|value| self.try_load_word_break_set(value)))
6✔
1184
            .or_else(|_| try_ccc.and_then(|value| self.try_load_ccc_set(value)))
4✔
1185
            .or_else(|_| try_block.and_then(|value| self.try_load_block_set(value)))?;
8✔
1186
        Ok(inverted)
99✔
1187
    }
103✔
1188

1189
    // consumes the builder and returns the code point builder together with the string
    // set. if the set is inverted, the code points are complemented and all strings
    // are dropped.
    fn finalize(mut self) -> (CodePointInversionListBuilder, BTreeSet<String>) {
        if !self.inverted {
            return (self.single_set, self.string_set);
        }

        // code point inversion; removes all strings
        #[cfg(feature = "log")]
        if !self.string_set.is_empty() {
            log::info!(
                "Inverting a unicode set with strings. This removes all strings entirely."
            );
        }
        self.single_set.complement();
        self.string_set.clear();

        (self.single_set, self.string_set)
    }
635✔
1204

1205
    // parses either a raw char or an escaped char. all chars are allowed, the caller must make sure to handle
1206
    // cases where some characters are not allowed
1207
    fn parse_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
1,595✔
1208
        let (offset, c) = self.must_peek()?;
1,595✔
1209
        match c {
1,595✔
1210
            '\\' => self.parse_escaped_char(),
133✔
1211
            _ => {
1212
                self.iter.next();
1,462✔
1213
                Ok((offset, SingleOrMultiChar::Single(c)))
1,462✔
1214
            }
1215
        }
1216
    }
1,595✔
1217

1218
    // note: could turn this from the current two-pass approach into a one-pass approach
1219
    // by manually parsing the digits instead of using u32::from_str_radix.
1220
    fn parse_hex_digits_into_char(&mut self, min: usize, max: usize) -> Result<(usize, char)> {
71✔
1221
        let first_offset = self.must_peek_index()?;
71✔
1222
        let end_offset = self.validate_hex_digits(min, max)?;
71✔
1223

1224
        // validate_hex_digits ensures that chars (including the last one) are ascii hex digits,
1225
        // which are all exactly one UTF-8 byte long, so slicing on these offsets always respects char boundaries
1226
        #[allow(clippy::indexing_slicing)]
1227
        let hex_source = &self.source[first_offset..=end_offset];
68✔
1228
        let num = u32::from_str_radix(hex_source, 16).map_err(|_| PEK::Internal)?;
68✔
1229
        char::try_from(num)
68✔
1230
            .map(|c| (end_offset, c))
66✔
1231
            .map_err(|_| PEK::InvalidEscape.with_offset(end_offset))
2✔
1232
    }
71✔
1233

1234
    // validates [0-9a-fA-F]{min,max}, returns the offset of the last digit, consuming everything in the process
1235
    fn validate_hex_digits(&mut self, min: usize, max: usize) -> Result<usize> {
71✔
1236
        let mut last_offset = 0;
71✔
1237
        for count in 0..max {
292✔
1238
            let (offset, c) = self.must_peek()?;
245✔
1239
            if !c.is_ascii_hexdigit() {
245✔
1240
                if count < min {
24✔
1241
                    return Err(PEK::UnexpectedChar(c).with_offset(offset));
3✔
1242
                } else {
1243
                    break;
1244
                }
1245
            }
1246
            self.iter.next();
221✔
1247
            last_offset = offset;
221✔
1248
        }
1249
        Ok(last_offset)
68✔
1250
    }
71✔
1251

1252
    // returns the number of skipped whitespace chars
1253
    fn skip_whitespace(&mut self) -> usize {
4,187✔
1254
        let mut num = 0;
4,187✔
1255
        while let Some(c) = self.peek_char() {
4,333✔
1256
            if !self.pat_ws.contains(c) {
4,331✔
1257
                break;
1258
            }
1259
            self.iter.next();
146✔
1260
            num += 1;
146✔
1261
        }
1262
        num
4,187✔
1263
    }
4,187✔
1264

1265
    fn consume(&mut self, expected: char) -> Result<()> {
509✔
1266
        match self.must_next()? {
509✔
1267
            (offset, c) if c != expected => Err(PEK::UnexpectedChar(c).with_offset(offset)),
509✔
1268
            _ => Ok(()),
508✔
1269
        }
1270
    }
509✔
1271

1272
    // use this whenever an empty iterator would imply an Eof error
1273
    fn must_next(&mut self) -> Result<(usize, char)> {
668✔
1274
        self.iter.next().ok_or(PEK::Eof.into())
668✔
1275
    }
668✔
1276

1277
    // use this whenever an empty iterator would imply an Eof error
1278
    fn must_peek(&mut self) -> Result<(usize, char)> {
10,686✔
1279
        self.iter.peek().copied().ok_or(PEK::Eof.into())
10,686✔
1280
    }
10,686✔
1281

1282
    // must_peek, but looks two chars ahead. use sparingly
1283
    fn must_peek_double(&mut self) -> Result<(usize, char)> {
1,996✔
1284
        let mut copy = self.iter.clone();
1,996✔
1285
        copy.next();
1,996✔
1286
        copy.next().ok_or(PEK::Eof.into())
1,996✔
1287
    }
1,996✔
1288

1289
    // see must_peek
1290
    fn must_peek_char(&mut self) -> Result<char> {
2,963✔
1291
        self.must_peek().map(|(_, c)| c)
5,925✔
1292
    }
2,963✔
1293

1294
    // see must_peek
1295
    fn must_peek_index(&mut self) -> Result<usize> {
721✔
1296
        self.must_peek().map(|(idx, _)| idx)
1,440✔
1297
    }
721✔
1298

1299
    fn peek_char(&mut self) -> Option<char> {
5,050✔
1300
        self.iter.peek().map(|&(_, c)| c)
10,098✔
1301
    }
5,050✔
1302

1303
    // TODO: return Result<!> once ! is stable
1304
    #[inline]
1305
    fn error_here<T>(&mut self, kind: ParseErrorKind) -> Result<T> {
11✔
1306
        match self.iter.peek() {
11✔
UNCOV
1307
            None => Err(kind.into()),
×
1308
            Some(&(offset, _)) => Err(kind.with_offset(offset)),
11✔
1309
        }
1310
    }
11✔
1311

1312
    fn process_strings(&mut self, op: Operation, other_strings: BTreeSet<String>) {
209✔
1313
        match op {
209✔
1314
            Operation::Union => self.string_set.extend(other_strings),
157✔
1315
            Operation::Difference => {
1316
                self.string_set = self
27✔
1317
                    .string_set
1318
                    .difference(&other_strings)
1319
                    .cloned()
UNCOV
1320
                    .collect()
×
1321
            }
27✔
1322
            Operation::Intersection => {
1323
                self.string_set = self
25✔
1324
                    .string_set
1325
                    .intersection(&other_strings)
1326
                    .cloned()
UNCOV
1327
                    .collect()
×
1328
            }
25✔
1329
        }
1330
    }
209✔
1331

1332
    fn process_chars(&mut self, op: Operation, other_chars: CodePointInversionList) {
209✔
1333
        match op {
209✔
1334
            Operation::Union => self.single_set.add_set(&other_chars),
157✔
1335
            Operation::Difference => self.single_set.remove_set(&other_chars),
27✔
1336
            Operation::Intersection => self.single_set.retain_set(&other_chars),
25✔
1337
        }
1338
    }
209✔
1339

1340
    fn try_load_general_category_set(&mut self, name: &str) -> Result<()> {
77✔
1341
        // TODO(#3550): This could be cached; does not depend on name.
1342
        let name_map =
1343
            PropertyParser::<GeneralCategoryGroup>::try_new_unstable(self.property_provider)
77✔
UNCOV
1344
                .map_err(|_| PEK::Internal)?;
×
1345
        let gc_value = name_map
77✔
1346
            .as_borrowed()
1347
            .get_loose(name)
1348
            .ok_or(PEK::UnknownProperty)?;
98✔
1349
        // TODO(#3550): This could be cached; does not depend on name.
1350
        let set = CodePointMapData::<GeneralCategory>::try_new_unstable(self.property_provider)
56✔
UNCOV
1351
            .map_err(|_| PEK::Internal)?
×
1352
            .as_borrowed()
1353
            .get_set_for_value_group(gc_value);
56✔
1354
        self.single_set.add_set(&set.to_code_point_inversion_list());
56✔
1355
        Ok(())
56✔
1356
    }
77✔
1357

1358
    fn try_get_script(&self, name: &str) -> Result<Script> {
28✔
1359
        // TODO(#3550): This could be cached; does not depend on name.
1360
        let name_map = PropertyParser::<Script>::try_new_unstable(self.property_provider)
28✔
UNCOV
1361
            .map_err(|_| PEK::Internal)?;
×
1362
        name_map
28✔
1363
            .as_borrowed()
1364
            .get_loose(name)
1365
            .ok_or(PEK::UnknownProperty.into())
28✔
1366
    }
28✔
1367

1368
    fn try_load_script_set(&mut self, name: &str) -> Result<()> {
24✔
1369
        let sc_value = self.try_get_script(name)?;
24✔
1370
        // TODO(#3550): This could be cached; does not depend on name.
1371
        let property_map = CodePointMapData::<Script>::try_new_unstable(self.property_provider)
15✔
UNCOV
1372
            .map_err(|_| PEK::Internal)?;
×
1373
        let set = property_map.as_borrowed().get_set_for_value(sc_value);
15✔
1374
        self.single_set.add_set(&set.to_code_point_inversion_list());
15✔
1375
        Ok(())
15✔
1376
    }
24✔
1377

1378
    fn try_load_script_extensions_set(&mut self, name: &str) -> Result<()> {
4✔
1379
        // TODO(#3550): This could be cached; does not depend on name.
1380
        let scx = ScriptWithExtensions::try_new_unstable(self.property_provider)
4✔
UNCOV
1381
            .map_err(|_| PEK::Internal)?;
×
1382
        let sc_value = self.try_get_script(name)?;
4✔
1383
        let set = scx.as_borrowed().get_script_extensions_set(sc_value);
4✔
1384
        self.single_set.add_set(&set);
4✔
1385
        Ok(())
4✔
1386
    }
4✔
1387

1388
    fn try_load_ecma262_binary_set(&mut self, name: &str) -> Result<()> {
23✔
1389
        let set =
1390
            CodePointSetData::try_new_for_ecma262_unstable(self.property_provider, name.as_bytes())
46✔
1391
                .ok_or(PEK::UnknownProperty)?
23✔
UNCOV
1392
                .map_err(|_data_error| PEK::Internal)?;
×
1393
        self.single_set.add_set(&set.to_code_point_inversion_list());
21✔
1394
        Ok(())
21✔
1395
    }
23✔
1396

1397
    fn try_load_grapheme_cluster_break_set(&mut self, name: &str) -> Result<()> {
1✔
1398
        let parser =
1399
            PropertyParser::<GraphemeClusterBreak>::try_new_unstable(self.property_provider)
1✔
UNCOV
1400
                .map_err(|_| PEK::Internal)?;
×
1401
        let gcb_value = parser
1✔
1402
            .as_borrowed()
1403
            .get_loose(name)
UNCOV
1404
            .or_else(|| name.parse().ok().map(GraphemeClusterBreak))
×
1405
            .ok_or(PEK::UnknownProperty)?;
1✔
1406
        // TODO(#3550): This could be cached; does not depend on name.
1407
        let property_map =
1408
            CodePointMapData::<GraphemeClusterBreak>::try_new_unstable(self.property_provider)
1✔
UNCOV
1409
                .map_err(|_| PEK::Internal)?;
×
1410
        let set = property_map.as_borrowed().get_set_for_value(gcb_value);
1✔
1411
        self.single_set.add_set(&set.to_code_point_inversion_list());
1✔
1412
        Ok(())
1✔
1413
    }
1✔
1414

1415
    fn try_load_sentence_break_set(&mut self, name: &str) -> Result<()> {
1✔
1416
        let parser = PropertyParser::<SentenceBreak>::try_new_unstable(self.property_provider)
1✔
UNCOV
1417
            .map_err(|_| PEK::Internal)?;
×
1418
        let sb_value = parser
1✔
1419
            .as_borrowed()
1420
            .get_loose(name)
UNCOV
1421
            .or_else(|| name.parse().ok().map(SentenceBreak))
×
1422
            .ok_or(PEK::UnknownProperty)?;
1✔
1423
        // TODO(#3550): This could be cached; does not depend on name.
1424
        let property_map =
1425
            CodePointMapData::<SentenceBreak>::try_new_unstable(self.property_provider)
1✔
UNCOV
1426
                .map_err(|_| PEK::Internal)?;
×
1427
        let set = property_map.as_borrowed().get_set_for_value(sb_value);
1✔
1428
        self.single_set.add_set(&set.to_code_point_inversion_list());
1✔
1429
        Ok(())
1✔
1430
    }
1✔
1431

1432
    fn try_load_word_break_set(&mut self, name: &str) -> Result<()> {
1✔
1433
        let parser = PropertyParser::<WordBreak>::try_new_unstable(self.property_provider)
1✔
UNCOV
1434
            .map_err(|_| PEK::Internal)?;
×
1435
        let wb_value = parser
1✔
1436
            .as_borrowed()
1437
            .get_loose(name)
UNCOV
1438
            .or_else(|| name.parse().ok().map(WordBreak))
×
1439
            .ok_or(PEK::UnknownProperty)?;
1✔
1440
        // TODO(#3550): This could be cached; does not depend on name.
1441
        let property_map = CodePointMapData::<WordBreak>::try_new_unstable(self.property_provider)
1✔
UNCOV
1442
            .map_err(|_| PEK::Internal)?;
×
1443
        let set = property_map.as_borrowed().get_set_for_value(wb_value);
1✔
1444
        self.single_set.add_set(&set.to_code_point_inversion_list());
1✔
1445
        Ok(())
1✔
1446
    }
1✔
1447

UNCOV
1448
    fn try_load_ccc_set(&mut self, name: &str) -> Result<()> {
×
1449
        let parser =
UNCOV
1450
            PropertyParser::<CanonicalCombiningClass>::try_new_unstable(self.property_provider)
×
UNCOV
1451
                .map_err(|_| PEK::Internal)?;
×
UNCOV
1452
        let value = parser
×
1453
            .as_borrowed()
1454
            .get_loose(name)
UNCOV
1455
            .or_else(|| name.parse().ok().map(CanonicalCombiningClass))
×
UNCOV
1456
            .ok_or(PEK::UnknownProperty)?;
×
1457
        // TODO(#3550): This could be cached; does not depend on name.
1458
        let property_map =
UNCOV
1459
            CodePointMapData::<CanonicalCombiningClass>::try_new_unstable(self.property_provider)
×
UNCOV
1460
                .map_err(|_| PEK::Internal)?;
×
UNCOV
1461
        let set = property_map.as_borrowed().get_set_for_value(value);
×
UNCOV
1462
        self.single_set.add_set(&set.to_code_point_inversion_list());
×
UNCOV
1463
        Ok(())
×
UNCOV
1464
    }
×
1465

UNCOV
1466
    fn try_load_block_set(&mut self, name: &str) -> Result<()> {
×
1467
        // TODO: source these from properties
UNCOV
1468
        self.single_set
×
UNCOV
1469
            .add_range(match name.to_ascii_lowercase().as_str() {
×
UNCOV
1470
                "arabic" => '\u{0600}'..'\u{06FF}',
×
UNCOV
1471
                "thaana" => '\u{0780}'..'\u{07BF}',
×
1472
                _ => {
1473
                    #[cfg(feature = "log")]
UNCOV
1474
                    log::warn!("Skipping :block={name}:");
×
UNCOV
1475
                    return Err(PEK::Unimplemented.into());
×
1476
                }
UNCOV
1477
            });
×
UNCOV
1478
        Ok(())
×
UNCOV
1479
    }
×
1480
}
1481

1482
/// Parses a UnicodeSet pattern into a [`CodePointInversionListAndStringList`](CodePointInversionListAndStringList)
/// and reports how many bytes of the source string were consumed.
///
/// The accepted syntax is the one described in [UTS #35 - Unicode Sets](https://unicode.org/reports/tr35/#Unicode_Sets).
///
/// On failure, the returned error can be pretty-printed with [`ParseError::fmt_with_source`].
///
/// # Variables
///
/// Variables inside UnicodeSets (e.g., `[$start-$end]`) are not available through this function;
/// use [`parse_with_variables`] for those.
///
/// # Limitations
///
/// * Currently, we only support the [ECMA-262 properties](https://tc39.es/ecma262/#table-nonbinary-unicode-properties).
///   The property names must match the exact spelling listed in ECMA-262. Note that we do support UTS35 syntax for elided `General_Category`
///   and `Script` property names, i.e., `[:Latn:]` and `[:Ll:]` are both valid, with the former implying the `Script` property, and the latter the
///   `General_Category` property.
/// * We do not support `\N{Unicode code point name}` character escaping. Use any other escape method described in UTS35.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
///
/// # Examples
///
/// Parse ranges
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let source = "[a-zA-Z0-9]";
/// let (set, consumed) = parse(source).unwrap();
/// let code_points = set.code_points();
///
/// assert!(code_points.contains_range('a'..='z'));
/// assert!(code_points.contains_range('A'..='Z'));
/// assert!(code_points.contains_range('0'..='9'));
/// assert_eq!(consumed, source.len());
/// ```
///
/// Parse properties, set operations, inner sets
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) =
///     parse("[[:^ll:]-[^][:gc = Lowercase Letter:]&[^[[^]-[a-z]]]]").unwrap();
/// assert!(set.code_points().contains_range('a'..='z'));
/// assert_eq!(('a'..='z').count(), set.size());
/// ```
///
/// Inversions remove strings
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) =
///     parse(r"[[a-z{hello\ world}]&[^a-y{hello\ world}]]").unwrap();
/// assert!(set.contains('z'));
/// assert_eq!(set.size(), 1);
/// assert!(!set.has_strings());
/// ```
///
/// Set operators (including the implicit union) have the same precedence and are left-associative
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) = parse("[[ace][bdf] - [abc][def]]").unwrap();
/// assert!(set.code_points().contains_range('d'..='f'));
/// assert_eq!(set.size(), ('d'..='f').count());
/// ```
///
/// Supports partial parses
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, consumed) = parse("[a-c][x-z]").unwrap();
/// let code_points = set.code_points();
/// assert!(code_points.contains_range('a'..='c'));
/// assert!(!code_points.contains_range('x'..='z'));
/// assert_eq!(set.size(), ('a'..='c').count());
/// // only the first UnicodeSet is parsed
/// assert_eq!(consumed, "[a-c]".len());
/// ```
#[cfg(feature = "compiled_data")]
pub fn parse(source: &str) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    // Compiled data is served by the baked provider of `icu_properties`.
    let baked = icu_properties::provider::Baked;
    parse_unstable(source, &baked)
}
134✔
1567

1568
/// Parses a UnicodeSet pattern with support for variables enabled.
///
/// Variables referenced in `source` (e.g., `$start`) are looked up in `variable_map`.
///
/// See [`parse`] for more information.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::experimental::unicodeset_parse::*;
///
/// let (my_set, _) = parse("[abc]").unwrap();
///
/// let mut variable_map = VariableMap::new();
/// variable_map.insert_char("start".into(), 'a').unwrap();
/// variable_map.insert_char("end".into(), 'z').unwrap();
/// variable_map.insert_string("str".into(), "Hello World".into()).unwrap();
/// variable_map.insert_set("the_set".into(), my_set).unwrap();
///
/// // If a variable already exists, `Err` is returned, and the map is not updated.
/// variable_map.insert_char("end".into(), 'Ω').unwrap_err();
///
/// let source = "[[$start-$end]-$the_set $str]";
/// let (set, consumed) = parse_with_variables(source, &variable_map).unwrap();
/// assert_eq!(consumed, source.len());
/// assert!(set.code_points().contains_range('d'..='z'));
/// assert!(set.contains_str("Hello World"));
/// assert_eq!(set.size(), 1 + ('d'..='z').count());
/// ```
#[cfg(feature = "compiled_data")]
pub fn parse_with_variables(
    source: &str,
    variable_map: &VariableMap<'_>,
) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    // Compiled data is served by the baked provider of `icu_properties`.
    parse_unstable_with_variables(source, variable_map, &icu_properties::provider::Baked)
}
74✔
1601

1602
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, parse_with_variables)]
1603
pub fn parse_unstable_with_variables<P>(
501✔
1604
    source: &str,
1605
    variable_map: &VariableMap<'_>,
1606
    provider: &P,
1607
) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
1608
where
1609
    P: ?Sized
1610
        + DataProvider<AsciiHexDigitV1Marker>
1611
        + DataProvider<AlphabeticV1Marker>
1612
        + DataProvider<BidiControlV1Marker>
1613
        + DataProvider<BidiMirroredV1Marker>
1614
        + DataProvider<CanonicalCombiningClassV1Marker>
1615
        + DataProvider<CanonicalCombiningClassNameToValueV2Marker>
1616
        + DataProvider<CaseIgnorableV1Marker>
1617
        + DataProvider<CasedV1Marker>
1618
        + DataProvider<ChangesWhenCasefoldedV1Marker>
1619
        + DataProvider<ChangesWhenCasemappedV1Marker>
1620
        + DataProvider<ChangesWhenLowercasedV1Marker>
1621
        + DataProvider<ChangesWhenNfkcCasefoldedV1Marker>
1622
        + DataProvider<ChangesWhenTitlecasedV1Marker>
1623
        + DataProvider<ChangesWhenUppercasedV1Marker>
1624
        + DataProvider<DashV1Marker>
1625
        + DataProvider<DefaultIgnorableCodePointV1Marker>
1626
        + DataProvider<DeprecatedV1Marker>
1627
        + DataProvider<DiacriticV1Marker>
1628
        + DataProvider<EmojiV1Marker>
1629
        + DataProvider<EmojiComponentV1Marker>
1630
        + DataProvider<EmojiModifierV1Marker>
1631
        + DataProvider<EmojiModifierBaseV1Marker>
1632
        + DataProvider<EmojiPresentationV1Marker>
1633
        + DataProvider<ExtendedPictographicV1Marker>
1634
        + DataProvider<ExtenderV1Marker>
1635
        + DataProvider<GraphemeBaseV1Marker>
1636
        + DataProvider<GraphemeClusterBreakV1Marker>
1637
        + DataProvider<GraphemeClusterBreakNameToValueV2Marker>
1638
        + DataProvider<GraphemeExtendV1Marker>
1639
        + DataProvider<HexDigitV1Marker>
1640
        + DataProvider<IdsBinaryOperatorV1Marker>
1641
        + DataProvider<IdsTrinaryOperatorV1Marker>
1642
        + DataProvider<IdContinueV1Marker>
1643
        + DataProvider<IdStartV1Marker>
1644
        + DataProvider<IdeographicV1Marker>
1645
        + DataProvider<JoinControlV1Marker>
1646
        + DataProvider<LogicalOrderExceptionV1Marker>
1647
        + DataProvider<LowercaseV1Marker>
1648
        + DataProvider<MathV1Marker>
1649
        + DataProvider<NoncharacterCodePointV1Marker>
1650
        + DataProvider<PatternSyntaxV1Marker>
1651
        + DataProvider<PatternWhiteSpaceV1Marker>
1652
        + DataProvider<QuotationMarkV1Marker>
1653
        + DataProvider<RadicalV1Marker>
1654
        + DataProvider<RegionalIndicatorV1Marker>
1655
        + DataProvider<SentenceBreakV1Marker>
1656
        + DataProvider<SentenceBreakNameToValueV2Marker>
1657
        + DataProvider<SentenceTerminalV1Marker>
1658
        + DataProvider<SoftDottedV1Marker>
1659
        + DataProvider<TerminalPunctuationV1Marker>
1660
        + DataProvider<UnifiedIdeographV1Marker>
1661
        + DataProvider<UppercaseV1Marker>
1662
        + DataProvider<VariationSelectorV1Marker>
1663
        + DataProvider<WhiteSpaceV1Marker>
1664
        + DataProvider<WordBreakV1Marker>
1665
        + DataProvider<WordBreakNameToValueV2Marker>
1666
        + DataProvider<XidContinueV1Marker>
1667
        + DataProvider<GeneralCategoryMaskNameToValueV2Marker>
1668
        + DataProvider<GeneralCategoryV1Marker>
1669
        + DataProvider<ScriptNameToValueV2Marker>
1670
        + DataProvider<ScriptV1Marker>
1671
        + DataProvider<ScriptWithExtensionsPropertyV1Marker>
1672
        + DataProvider<XidStartV1Marker>,
1673
{
1674
    // TODO(#3550): Add function "parse_overescaped" that uses a custom iterator to de-overescape (i.e., maps \\ to \) on-the-fly?
1675
    // ^ will likely need a different iterator type on UnicodeSetBuilder
1676

1677
    let mut iter = source.char_indices().peekable();
501✔
1678

1679
    let xid_start =
1680
        CodePointSetData::try_new_unstable::<XidStart>(provider).map_err(|_| PEK::Internal)?;
501✔
1681
    let xid_start_list = xid_start.to_code_point_inversion_list();
501✔
1682
    let xid_continue =
1683
        CodePointSetData::try_new_unstable::<XidContinue>(provider).map_err(|_| PEK::Internal)?;
495✔
1684
    let xid_continue_list = xid_continue.to_code_point_inversion_list();
501✔
1685

1686
    let pat_ws = CodePointSetData::try_new_unstable::<PatternWhiteSpace>(provider)
495✔
UNCOV
1687
        .map_err(|_| PEK::Internal)?;
×
1688
    let pat_ws_list = pat_ws.to_code_point_inversion_list();
500✔
1689

1690
    let mut builder = UnicodeSetBuilder::new_internal(
503✔
1691
        &mut iter,
1692
        source,
1693
        variable_map,
1694
        &xid_start_list,
1695
        &xid_continue_list,
1696
        &pat_ws_list,
1697
        provider,
1698
    );
499✔
1699

1700
    builder.parse_unicode_set()?;
1,000✔
1701
    let (single, string_set) = builder.finalize();
442✔
1702
    let built_single = single.build();
446✔
1703

1704
    let mut strings = string_set.into_iter().collect::<Vec<_>>();
446✔
1705
    strings.sort();
446✔
1706
    let zerovec = (&strings).into();
443✔
1707

1708
    let cpinvlistandstrlist = CodePointInversionListAndStringList::try_from(built_single, zerovec)
442✔
UNCOV
1709
        .map_err(|_| PEK::Internal)?;
×
1710

1711
    let parsed_bytes = match iter.peek().copied() {
442✔
1712
        None => source.len(),
163✔
1713
        Some((offset, _)) => offset,
279✔
1714
    };
1715

1716
    Ok((cpinvlistandstrlist, parsed_bytes))
442✔
1717
}
495✔
1718

1719
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, parse)]
1720
pub fn parse_unstable<P>(
150✔
1721
    source: &str,
1722
    provider: &P,
1723
) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
1724
where
1725
    P: ?Sized
1726
        + DataProvider<AsciiHexDigitV1Marker>
1727
        + DataProvider<AlphabeticV1Marker>
1728
        + DataProvider<BidiControlV1Marker>
1729
        + DataProvider<BidiMirroredV1Marker>
1730
        + DataProvider<CanonicalCombiningClassV1Marker>
1731
        + DataProvider<CanonicalCombiningClassNameToValueV2Marker>
1732
        + DataProvider<CaseIgnorableV1Marker>
1733
        + DataProvider<CasedV1Marker>
1734
        + DataProvider<ChangesWhenCasefoldedV1Marker>
1735
        + DataProvider<ChangesWhenCasemappedV1Marker>
1736
        + DataProvider<ChangesWhenLowercasedV1Marker>
1737
        + DataProvider<ChangesWhenNfkcCasefoldedV1Marker>
1738
        + DataProvider<ChangesWhenTitlecasedV1Marker>
1739
        + DataProvider<ChangesWhenUppercasedV1Marker>
1740
        + DataProvider<DashV1Marker>
1741
        + DataProvider<DefaultIgnorableCodePointV1Marker>
1742
        + DataProvider<DeprecatedV1Marker>
1743
        + DataProvider<DiacriticV1Marker>
1744
        + DataProvider<EmojiV1Marker>
1745
        + DataProvider<EmojiComponentV1Marker>
1746
        + DataProvider<EmojiModifierV1Marker>
1747
        + DataProvider<EmojiModifierBaseV1Marker>
1748
        + DataProvider<EmojiPresentationV1Marker>
1749
        + DataProvider<ExtendedPictographicV1Marker>
1750
        + DataProvider<ExtenderV1Marker>
1751
        + DataProvider<GraphemeBaseV1Marker>
1752
        + DataProvider<GraphemeClusterBreakV1Marker>
1753
        + DataProvider<GraphemeClusterBreakNameToValueV2Marker>
1754
        + DataProvider<GraphemeExtendV1Marker>
1755
        + DataProvider<HexDigitV1Marker>
1756
        + DataProvider<IdsBinaryOperatorV1Marker>
1757
        + DataProvider<IdsTrinaryOperatorV1Marker>
1758
        + DataProvider<IdContinueV1Marker>
1759
        + DataProvider<IdStartV1Marker>
1760
        + DataProvider<IdeographicV1Marker>
1761
        + DataProvider<JoinControlV1Marker>
1762
        + DataProvider<LogicalOrderExceptionV1Marker>
1763
        + DataProvider<LowercaseV1Marker>
1764
        + DataProvider<MathV1Marker>
1765
        + DataProvider<NoncharacterCodePointV1Marker>
1766
        + DataProvider<PatternSyntaxV1Marker>
1767
        + DataProvider<PatternWhiteSpaceV1Marker>
1768
        + DataProvider<QuotationMarkV1Marker>
1769
        + DataProvider<RadicalV1Marker>
1770
        + DataProvider<RegionalIndicatorV1Marker>
1771
        + DataProvider<SentenceBreakV1Marker>
1772
        + DataProvider<SentenceBreakNameToValueV2Marker>
1773
        + DataProvider<SentenceTerminalV1Marker>
1774
        + DataProvider<SoftDottedV1Marker>
1775
        + DataProvider<TerminalPunctuationV1Marker>
1776
        + DataProvider<UnifiedIdeographV1Marker>
1777
        + DataProvider<UppercaseV1Marker>
1778
        + DataProvider<VariationSelectorV1Marker>
1779
        + DataProvider<WhiteSpaceV1Marker>
1780
        + DataProvider<WordBreakV1Marker>
1781
        + DataProvider<WordBreakNameToValueV2Marker>
1782
        + DataProvider<XidContinueV1Marker>
1783
        + DataProvider<GeneralCategoryMaskNameToValueV2Marker>
1784
        + DataProvider<GeneralCategoryV1Marker>
1785
        + DataProvider<ScriptNameToValueV2Marker>
1786
        + DataProvider<ScriptV1Marker>
1787
        + DataProvider<ScriptWithExtensionsPropertyV1Marker>
1788
        + DataProvider<XidStartV1Marker>,
1789
{
1790
    let dummy = Default::default();
150✔
1791
    parse_unstable_with_variables(source, &dummy, provider)
150✔
1792
}
150✔
1793

1794
#[cfg(test)]
1795
mod tests {
1796
    use core::ops::RangeInclusive;
1797
    use std::collections::HashSet;
1798

1799
    use super::*;
1800

1801
    // Reads the string as consecutive (start, end) code point pairs and yields one
    // inclusive range per pair:
    // "aabxzz" => [a..=a, b..=x, z..=z]
    fn range_iter_from_str(s: &str) -> impl Iterator<Item = RangeInclusive<u32>> {
        debug_assert_eq!(
            s.chars().count() % 2,
            0,
            "string \"{}\" does not contain an even number of code points",
            s.escape_debug()
        );
        // Code points at even positions open a range, the ones at odd positions close it.
        let starts = s.chars().step_by(2);
        let ends = s.chars().skip(1).step_by(2);
        let ranges: Vec<_> = starts
            .zip(ends)
            .map(|(start, end)| (start as u32)..=(end as u32))
            .collect();

        ranges.into_iter()
    }
139✔
1824

1825
    fn assert_set_equality<'a>(
139✔
1826
        source: &str,
1827
        cpinvlistandstrlist: &CodePointInversionListAndStringList,
1828
        single: impl Iterator<Item = RangeInclusive<u32>>,
1829
        strings: impl Iterator<Item = &'a str>,
1830
    ) {
1831
        let expected_ranges: HashSet<_> = single.collect();
139✔
1832
        let actual_ranges: HashSet<_> = cpinvlistandstrlist.code_points().iter_ranges().collect();
139✔
1833
        assert_eq!(
139✔
1834
            actual_ranges,
1835
            expected_ranges,
1836
            "got unexpected ranges {:?}, expected {:?} for parsed set \"{}\"",
1837
            actual_ranges,
1838
            expected_ranges,
1839
            source.escape_debug()
1840
        );
1841
        let mut expected_size = cpinvlistandstrlist.code_points().size();
139✔
1842
        for s in strings {
160✔
1843
            expected_size += 1;
21✔
UNCOV
1844
            assert!(
×
1845
                cpinvlistandstrlist.contains_str(s),
21✔
1846
                "missing string \"{}\" from parsed set \"{}\"",
UNCOV
1847
                s.escape_debug(),
×
UNCOV
1848
                source.escape_debug()
×
1849
            );
1850
        }
139✔
1851
        let actual_size = cpinvlistandstrlist.size();
139✔
1852
        assert_eq!(
139✔
1853
            actual_size,
1854
            expected_size,
1855
            "got unexpected size {}, expected {} for parsed set \"{}\"",
1856
            actual_size,
1857
            expected_size,
1858
            source.escape_debug()
1859
        );
1860
    }
139✔
1861

1862
    fn assert_is_error_and_message_eq(source: &str, expected_err: &str, vm: &VariableMap<'_>) {
46✔
1863
        let result = parse_with_variables(source, vm);
46✔
1864
        assert!(result.is_err(), "{source} does not cause an error!");
46✔
1865
        let err = result.unwrap_err();
46✔
1866
        assert_eq!(err.fmt_with_source(source).to_string(), expected_err);
92✔
1867
    }
46✔
1868

1869
    // Checks the set produced for patterns that reference variables, for each kind of
    // variable value: chars, a char that is also an operator, strings, and sets.
    #[test]
    fn test_semantics_with_variables() {
        let mut map_char_char = VariableMap::default();
        map_char_char.insert_char(String::from("a"), 'a').unwrap();
        map_char_char
            .insert_char(String::from("var2"), 'z')
            .unwrap();

        let mut map_headache = VariableMap::default();
        map_headache
            .insert_char(String::from("hehe"), '-')
            .unwrap();

        let mut map_char_string = VariableMap::default();
        map_char_string
            .insert_char(String::from("a"), 'a')
            .unwrap();
        map_char_string
            .insert_string(String::from("var2"), String::from("abc"))
            .unwrap();

        let (set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
        let mut map_char_set = VariableMap::default();
        map_char_set.insert_char(String::from("a"), 'a').unwrap();
        map_char_set.insert_set(String::from("set"), set).unwrap();

        // (variable map, pattern, expected ranges as code point pairs, expected strings)
        let cases: Vec<(_, _, _, Vec<&str>)> = vec![
            // simple
            (&map_char_char, "[$a]", "aa", vec![]),
            (&map_char_char, "[ $a ]", "aa", vec![]),
            (&map_char_char, "[$a$]", "aa\u{ffff}\u{ffff}", vec![]),
            (&map_char_char, "[$a$ ]", "aa\u{ffff}\u{ffff}", vec![]),
            (&map_char_char, "[$a$var2]", "aazz", vec![]),
            (&map_char_char, "[$a - $var2]", "az", vec![]),
            (&map_char_char, "[$a-$var2]", "az", vec![]),
            (&map_headache, "[a $hehe z]", "aazz--", vec![]),
            (
                &map_char_char,
                "[[$]var2]",
                "\u{ffff}\u{ffff}vvaarr22",
                vec![],
            ),
            // variable prefix escaping
            (&map_char_char, r"[\$var2]", "$$vvaarr22", vec![]),
            (&map_char_char, r"[\\$var2]", r"\\zz", vec![]),
            // no variable dereferencing in strings
            (&map_char_char, "[{$a}]", "", vec!["$a"]),
            // set operations
            (&map_char_set, "[$set & [b-z]]", "bz", vec![]),
            (&map_char_set, "[[a-z]-[b-z]]", "aa", vec![]),
            (&map_char_set, "[$set-[b-z]]", "aa", vec!["Hello, World!"]),
            (&map_char_set, "[$set-$set]", "", vec![]),
            (&map_char_set, "[[a-zA]-$set]", "AA", vec![]),
            (&map_char_set, "[$set[b-z]]", "az", vec!["Hello, World!"]),
            (&map_char_set, "[[a-a]$set]", "az", vec!["Hello, World!"]),
            (&map_char_set, "$set", "az", vec!["Hello, World!"]),
            // strings
            (&map_char_string, "[$var2]", "", vec!["abc"]),
        ];
        for (variable_map, source, single, strings) in cases {
            let (set, consumed) = match parse_with_variables(source, variable_map) {
                Ok(parsed) => parsed,
                Err(err) => panic!(
                    "{source} results in an error: {}",
                    err.fmt_with_source(source)
                ),
            };
            assert_eq!(consumed, source.len(), "{source:?} is not fully consumed");
            assert_set_equality(
                source,
                &set,
                range_iter_from_str(single),
                strings.into_iter(),
            );
        }
    }
2✔
1940

1941
    #[test]
1942
    fn test_semantics() {
2✔
1943
        const ALL_CHARS: &str = "\x00\u{10FFFF}";
1944
        let cases: Vec<(_, _, Vec<&str>)> = vec![
2✔
1945
            // simple
1946
            ("[a]", "aa", vec![]),
1✔
1947
            ("[]", "", vec![]),
1✔
1948
            ("[qax]", "aaqqxx", vec![]),
1✔
1949
            ("[a-z]", "az", vec![]),
1✔
1950
            ("[--]", "--", vec![]),
1✔
1951
            ("[a-b-]", "ab--", vec![]),
1✔
1952
            ("[[a-b]-]", "ab--", vec![]),
1✔
1953
            ("[{ab}-]", "--", vec!["ab"]),
1✔
1954
            ("[-a-b]", "ab--", vec![]),
1✔
1955
            ("[-a]", "--aa", vec![]),
1✔
1956
            // whitespace escaping
1957
            (r"[\n]", "\n\n", vec![]),
1✔
1958
            ("[\\\n]", "\n\n", vec![]),
1✔
1959
            // empty - whitespace is skipped
1960
            ("[\n]", "", vec![]),
1✔
1961
            ("[\u{9}]", "", vec![]),
1✔
1962
            ("[\u{A}]", "", vec![]),
1✔
1963
            ("[\u{B}]", "", vec![]),
1✔
1964
            ("[\u{C}]", "", vec![]),
1✔
1965
            ("[\u{D}]", "", vec![]),
1✔
1966
            ("[\u{20}]", "", vec![]),
1✔
1967
            ("[\u{85}]", "", vec![]),
1✔
1968
            ("[\u{200E}]", "", vec![]),
1✔
1969
            ("[\u{200F}]", "", vec![]),
1✔
1970
            ("[\u{2028}]", "", vec![]),
1✔
1971
            ("[\u{2029}]", "", vec![]),
1✔
1972
            // whitespace significance:
1973
            ("[^[^$]]", "\u{ffff}\u{ffff}", vec![]),
1✔
1974
            ("[^[^ $]]", "\u{ffff}\u{ffff}", vec![]),
1✔
1975
            ("[^[^ $ ]]", "\u{ffff}\u{ffff}", vec![]),
1✔
1976
            ("[^[^a$]]", "aa\u{ffff}\u{ffff}", vec![]),
1✔
1977
            ("[^[^a$ ]]", "aa\u{ffff}\u{ffff}", vec![]),
1✔
1978
            ("[-]", "--", vec![]),
1✔
1979
            ("[  -  ]", "--", vec![]),
1✔
1980
            ("[  - -  ]", "--", vec![]),
1✔
1981
            ("[ a-b -  ]", "ab--", vec![]),
1✔
1982
            ("[ -a]", "--aa", vec![]),
1✔
1983
            ("[a-]", "--aa", vec![]),
1✔
1984
            ("[a- ]", "--aa", vec![]),
1✔
1985
            ("[ :]", "::", vec![]),
1✔
1986
            ("[ :L:]", "::LL", vec![]),
1✔
1987
            // but not all "whitespace", only Pattern_White_Space:
1988
            ("[\u{A0}]", "\u{A0}\u{A0}", vec![]), // non-breaking space
1✔
1989
            // anchor
1990
            ("[$]", "\u{ffff}\u{ffff}", vec![]),
1✔
1991
            (r"[\$]", "$$", vec![]),
1✔
1992
            ("[{$}]", "$$", vec![]),
1✔
1993
            // set operations
1994
            ("[[a-z]&[b-z]]", "bz", vec![]),
1✔
1995
            ("[[a-z]-[b-z]]", "aa", vec![]),
1✔
1996
            ("[[a-z][b-z]]", "az", vec![]),
1✔
1997
            ("[[a-a][b-z]]", "az", vec![]),
1✔
1998
            ("[[a-z{abc}]&[b-z{abc}{abx}]]", "bz", vec!["abc"]),
1✔
1999
            ("[[{abx}a-z{abc}]&[b-z{abc}]]", "bz", vec!["abc"]),
1✔
2000
            ("[[a-z{abx}]-[{abx}b-z{abc}]]", "aa", vec![]),
1✔
2001
            ("[[a-z{abx}{abc}]-[{abx}b-z]]", "aa", vec!["abc"]),
1✔
2002
            ("[[a-z{abc}][b-z{abx}]]", "az", vec!["abc", "abx"]),
1✔
2003
            // strings
2004
            ("[{this is a minus -}]", "", vec!["thisisaminus-"]),
1✔
2005
            // associativity
2006
            ("[[a-a][b-z] - [a-d][e-z]]", "ez", vec![]),
1✔
2007
            ("[[a-a][b-z] - [a-d]&[e-z]]", "ez", vec![]),
1✔
2008
            ("[[a-a][b-z] - [a-z][]]", "", vec![]),
1✔
2009
            ("[[a-a][b-z] - [a-z]&[]]", "", vec![]),
1✔
2010
            ("[[a-a][b-z] & [a-z]-[]]", "az", vec![]),
1✔
2011
            ("[[a-a][b-z] & []-[a-z]]", "", vec![]),
1✔
2012
            ("[[a-a][b-z] & [a-b][x-z]]", "abxz", vec![]),
1✔
2013
            ("[[a-z]-[a-b]-[y-z]]", "cx", vec![]),
1✔
2014
            // escape tests
2015
            (r"[\x61-\x63]", "ac", vec![]),
1✔
2016
            (r"[a-\x63]", "ac", vec![]),
1✔
2017
            (r"[\x61-c]", "ac", vec![]),
1✔
2018
            (r"[\u0061-\x63]", "ac", vec![]),
1✔
2019
            (r"[\U00000061-\x63]", "ac", vec![]),
1✔
2020
            (r"[\x{61}-\x63]", "ac", vec![]),
1✔
2021
            (r"[\u{61}-\x63]", "ac", vec![]),
1✔
2022
            (r"[\u{61}{hello\ world}]", "aa", vec!["hello world"]),
1✔
2023
            (r"[{hello\ world}\u{61}]", "aa", vec!["hello world"]),
1✔
2024
            (r"[{h\u{65}llo\ world}]", "", vec!["hello world"]),
1✔
2025
            // complement tests
2026
            (r"[^]", ALL_CHARS, vec![]),
1✔
2027
            (r"[[^]-[^a-z]]", "az", vec![]),
1✔
2028
            (r"[^{h\u{65}llo\ world}]", ALL_CHARS, vec![]),
1✔
2029
            (
1✔
2030
                r"[^[{h\u{65}llo\ world}]-[{hello\ world}]]",
2031
                ALL_CHARS,
2032
                vec![],
1✔
2033
            ),
2034
            (
1✔
2035
                r"[^[\x00-\U0010FFFF]-[\u0100-\U0010FFFF]]",
2036
                "\u{100}\u{10FFFF}",
2037
                vec![],
1✔
2038
            ),
2039
            (r"[^[^a-z]]", "az", vec![]),
1✔
2040
            (r"[^[^\^]]", "^^", vec![]),
1✔
2041
            (r"[{\x{61 0062   063}}]", "", vec!["abc"]),
1✔
2042
            (r"[\x{61 0062   063}]", "ac", vec![]),
1✔
2043
            // binary properties
2044
            (r"[:AHex:]", "09afAF", vec![]),
1✔
2045
            (r"[:AHex=True:]", "09afAF", vec![]),
1✔
2046
            (r"[:AHex=T:]", "09afAF", vec![]),
1✔
2047
            (r"[:AHex=Yes:]", "09afAF", vec![]),
1✔
2048
            (r"[:AHex=Y:]", "09afAF", vec![]),
1✔
2049
            (r"[:^AHex≠True:]", "09afAF", vec![]),
1✔
2050
            (r"[:AHex≠False:]", "09afAF", vec![]),
1✔
2051
            (r"[[:^AHex≠False:]&[\x00-\x10]]", "\0\x10", vec![]),
1✔
2052
            (r"\p{AHex}", "09afAF", vec![]),
1✔
2053
            (r"\p{AHex=True}", "09afAF", vec![]),
1✔
2054
            (r"\p{AHex=T}", "09afAF", vec![]),
1✔
2055
            (r"\p{AHex=Yes}", "09afAF", vec![]),
1✔
2056
            (r"\p{AHex=Y}", "09afAF", vec![]),
1✔
2057
            (r"\P{AHex≠True}", "09afAF", vec![]),
1✔
2058
            (r"\p{AHex≠False}", "09afAF", vec![]),
1✔
2059
            // general category
2060
            (r"[[:gc=lower-case-letter:]&[a-zA-Z]]", "az", vec![]),
1✔
2061
            (r"[[:lower case letter:]&[a-zA-Z]]", "az", vec![]),
1✔
2062
            // general category groups
2063
            // equivalence between L and the union of all the L* categories
2064
            (
1✔
2065
                r"[[[:L:]-[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]][[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]-[:L:]]]",
2066
                "",
2067
                vec![],
1✔
2068
            ),
2069
            // script
2070
            (r"[[:sc=latn:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2071
            (r"[[:sc=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2072
            (r"[[:Latin:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2073
            (r"[[:latn:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2074
            // script extensions
2075
            (r"[[:scx=latn:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2076
            (r"[[:scx=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
1✔
2077
            (r"[[:scx=Hira:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
1✔
2078
            (r"[[:sc=Hira:]&[\u30FC]]", "", vec![]),
1✔
2079
            (r"[[:scx=Kana:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
1✔
2080
            (r"[[:sc=Kana:]&[\u30FC]]", "", vec![]),
1✔
2081
            (r"[[:sc=Common:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
1✔
2082
            // grapheme cluster break
2083
            (
1✔
2084
                r"\p{Grapheme_Cluster_Break=ZWJ}",
2085
                "\u{200D}\u{200D}",
2086
                vec![],
1✔
2087
            ),
2088
            // sentence break
2089
            (
1✔
2090
                r"\p{Sentence_Break=ATerm}",
2091
                "\u{002E}\u{002E}\u{2024}\u{2024}\u{FE52}\u{FE52}\u{FF0E}\u{FF0E}",
2092
                vec![],
1✔
2093
            ),
2094
            // word break
2095
            (r"\p{Word_Break=Single_Quote}", "\u{0027}\u{0027}", vec![]),
1✔
2096
            // more syntax edge cases from UTS35 directly
2097
            (r"[\^a]", "^^aa", vec![]),
1✔
2098
            (r"[{{}]", "{{", vec![]),
1✔
2099
            (r"[{}}]", "}}", vec![""]),
1✔
2100
            (r"[}]", "}}", vec![]),
1✔
2101
            (r"[{$var}]", "", vec!["$var"]),
1✔
2102
            (r"[{[a-z}]", "", vec!["[a-z"]),
1✔
2103
            (r"[ { [ a - z } ]", "", vec!["[a-z"]),
1✔
2104
            // TODO(#3556): Add more tests (specifically conformance tests if they exist)
2105
        ];
2106
        for (source, single, strings) in cases {
119✔
2107
            let parsed = parse(source);
118✔
2108
            if let Err(err) = parsed {
118✔
UNCOV
2109
                panic!(
×
2110
                    "{source} results in an error: {}",
UNCOV
2111
                    err.fmt_with_source(source)
×
2112
                );
2113
            }
2114
            let (set, consumed) = parsed.unwrap();
118✔
2115
            assert_eq!(consumed, source.len());
118✔
2116
            assert_set_equality(
118✔
2117
                source,
2118
                &set,
2119
                range_iter_from_str(single),
118✔
2120
                strings.into_iter(),
118✔
2121
            );
118✔
2122
        }
119✔
2123
    }
2✔
2124

2125
    // Error rendering when variables are in play. Every case hands a variable map, a source
    // string and the expected rendered message to `assert_is_error_and_message_eq`.
    #[test]
    fn test_error_messages_with_variables() {
        // "a" -> 'a', "var2" -> 'z'
        let chars_only = {
            let mut vm = VariableMap::default();
            vm.insert_char("a".to_string(), 'a').unwrap();
            vm.insert_char("var2".to_string(), 'z').unwrap();
            vm
        };

        // "a" -> 'a', "var2" -> "abc"
        let char_and_string = {
            let mut vm = VariableMap::default();
            vm.insert_char("a".to_string(), 'a').unwrap();
            vm.insert_string("var2".to_string(), "abc".to_string())
                .unwrap();
            vm
        };

        // "a" -> 'a', "set" -> the set obtained by parsing the pattern below
        let char_and_set = {
            let (parsed_set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
            let mut vm = VariableMap::default();
            vm.insert_char("a".to_string(), 'a').unwrap();
            vm.insert_set("set".to_string(), parsed_set).unwrap();
            vm
        };

        // (variable map, source, expected message)
        let cases = [
            // character-only map
            (&chars_only, "[$$a]", r"[$$a← error: unexpected variable"),
            (&chars_only, "[$ a]", r"[$ a← error: unexpected character 'a'"),
            (&chars_only, "$a", r"$a← error: unexpected variable"),
            (&chars_only, "$", r"$← error: unexpected end of input"),
            // map in which `var2` is a string
            (
                &char_and_string,
                "[$var2-$a]",
                r"[$var2-$a← error: unexpected variable",
            ),
            (
                &char_and_string,
                "[$a-$var2]",
                r"[$a-$var2← error: unexpected variable",
            ),
            // map containing a set variable
            (
                &char_and_set,
                "[$a-$set]",
                r"[$a-$set← error: unexpected variable",
            ),
            (
                &char_and_set,
                "[$set-$a]",
                r"[$set-$a← error: unexpected variable",
            ),
            (&char_and_set, "[$=]", "[$=← error: unexpected character '='"),
        ];

        for &(variables, input, message) in &cases {
            assert_is_error_and_message_eq(input, message, variables);
        }
    }
2✔
2181

2182
    // Error rendering without variables. Every `(source, expected message)` row is handed to
    // `assert_is_error_and_message_eq` together with a default-constructed variable map.
    #[test]
    fn test_error_messages() {
        let vm = Default::default();

        let cases = [
            (r"[a-z[\]]", r"[a-z[\]]← error: unexpected end of input"),
            (r"", r"← error: unexpected end of input"),
            (r"[{]", r"[{]← error: unexpected end of input"),
            // we match ECMA-262 strictly, so case matters
            (
                r"[:general_category:]",
                r"[:general_category← error: unknown property",
            ),
            (r"[:ll=true:]", r"[:ll=true← error: unknown property"),
            (r"[:=", r"[:=← error: unexpected character '='"),
            // property names may not be empty
            (r"[::]", r"[::← error: unexpected character ':'"),
            (r"[:=hello:]", r"[:=← error: unexpected character '='"),
            // property values may not be empty
            (r"[:gc=:]", r"[:gc=:← error: unexpected character ':'"),
            (r"[\xag]", r"[\xag← error: unexpected character 'g'"),
            (r"[a-b-z]", r"[a-b-z← error: unexpected character 'z'"),
            // TODO(#3558): Might be better as "[a-\p← error: unexpected character 'p'"?
            (r"[a-\p{ll}]", r"[a-\← error: unexpected character '\\'"),
            (r"[a-&]", r"[a-&← error: unexpected character '&'"),
            (r"[a&b]", r"[a&← error: unexpected character '&'"),
            (r"[[set]&b]", r"[[set]&b← error: unexpected character 'b'"),
            (r"[[set]&]", r"[[set]&]← error: unexpected character ']'"),
            (r"[a-\x60]", r"[a-\x60← error: unexpected character '`'"),
            (r"[a-`]", r"[a-`← error: unexpected character '`'"),
            (r"[\x{6g}]", r"[\x{6g← error: unexpected character 'g'"),
            (r"[\x{g}]", r"[\x{g← error: unexpected character 'g'"),
            (r"[\x{}]", r"[\x{}← error: unexpected character '}'"),
            (
                r"[\x{dabeef}]",
                r"[\x{dabeef← error: invalid escape sequence",
            ),
            (
                r"[\x{10ffff0}]",
                r"[\x{10ffff0← error: unexpected character '0'",
            ),
            (
                r"[\x{11ffff}]",
                r"[\x{11ffff← error: invalid escape sequence",
            ),
            (
                r"[\x{10ffff 1 10ffff0}]",
                r"[\x{10ffff 1 10ffff0← error: unexpected character '0'",
            ),
            // > 1 byte in UTF-8 edge case
            (r"ä", r"ä← error: unexpected character 'ä'"),
            // NOTE(review): the next two rows read identically here; confirm whether the second
            // was meant to differ (another normalization form, or the `[:gc=ä:]` syntax).
            (r"\p{gc=ä}", r"\p{gc=ä← error: unknown property"),
            (r"\p{gc=ä}", r"\p{gc=ä← error: unknown property"),
            (
                r"[\xe5-\xe4]",
                r"[\xe5-\xe4← error: unexpected character 'ä'",
            ),
            (r"[\xe5-ä]", r"[\xe5-ä← error: unexpected character 'ä'"),
            // whitespace significance
            (r"[ ^]", r"[ ^← error: unexpected character '^'"),
            (r"[:]", r"[:]← error: unexpected character ']'"),
            (r"[:L]", r"[:L]← error: unexpected character ']'"),
            (r"\p {L}", r"\p ← error: unexpected character ' '"),
            // multi-escapes are not allowed in ranges
            (
                r"[\x{61 62}-d]",
                r"[\x{61 62}-d← error: unexpected character 'd'",
            ),
            (
                r"[\x{61 63}-\x{62 64}]",
                r"[\x{61 63}-\← error: unexpected character '\\'",
            ),
            // TODO(#3558): This is a bad error message.
            (r"[a-\x{62 64}]", r"[a-\← error: unexpected character '\\'"),
        ];

        for &(input, message) in cases.iter() {
            assert_is_error_and_message_eq(input, message, &vm);
        }
    }
2✔
2260

2261
    // Checks the `consumed` count returned next to the parsed set: parsing must stop right
    // after the first complete set and leave the trailing input untouched. `parse` and
    // `parse_with_variables` (given a default variable map) must report the same count.
    #[test]
    fn test_consumed() {
        // (prefix expected to be consumed, full source)
        //
        // The prefix is written out as the literal start of its source, so the expected
        // length is derived from the text actually being parsed rather than from a
        // different string that merely has the same length.
        let cases = [
            (r"[a-z\]{[}]", r"[a-z\]{[}][]"),
            (r"[a-z\]{[}]", r"[a-z\]{[}] []"),
            (r"[a-z\]{]}]", r"[a-z\]{]}] []"),
            (r"[a-z\]{{]}]", r"[a-z\]{{]}] []"),
            (r"[a-z\]{]}]", r"[a-z\]{]}]\p{L}"),
            (r"[a-z\]{]}]", r"[a-z\]{]}]$var"),
        ];

        let vm = Default::default();
        for (expected_prefix, source) in cases {
            // Guard against typos in the table itself.
            assert!(
                source.starts_with(expected_prefix),
                "{expected_prefix} is not a prefix of {source}"
            );
            let expected_consumed = expected_prefix.len();

            let (_, consumed) = parse(source).unwrap();
            assert_eq!(expected_consumed, consumed);
            let (_, consumed) = parse_with_variables(source, &vm).unwrap();
            assert_eq!(expected_consumed, consumed);
        }
    }
2✔
2280
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc