• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 9357137046

03 Jun 2024 08:51PM UTC coverage: 75.121% (-1.1%) from 76.254%
9357137046

push

github

web-flow
Switch locid Value to use Subtag (#4941)

This is part of #1833 switching Value API to use Subtag.

61 of 71 new or added lines in 11 files covered. (85.92%)

3224 existing lines in 178 files now uncovered.

52958 of 70497 relevant lines covered (75.12%)

572757.08 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.5
/components/experimental/src/unicodeset_parse/parse.rs
1
// This file is part of ICU4X. For terms of use, please see the file
1,266✔
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use alloc::borrow::Cow;
6
use alloc::collections::{BTreeMap, BTreeSet};
7
use alloc::fmt::Display;
8
use alloc::format;
9
use alloc::string::{String, ToString};
10
use alloc::vec::Vec;
11
use core::{iter::Peekable, str::CharIndices};
12

13
use icu_collections::{
14
    codepointinvlist::{CodePointInversionList, CodePointInversionListBuilder},
15
    codepointinvliststringlist::CodePointInversionListAndStringList,
16
};
17
use icu_properties::maps::{
18
    load_grapheme_cluster_break, load_script, load_sentence_break, load_word_break,
19
};
20
use icu_properties::script::load_script_with_extensions_unstable;
21
use icu_properties::sets::{
22
    load_for_ecma262_unstable, load_for_general_category_group, load_pattern_white_space,
23
    load_xid_continue, load_xid_start,
24
};
25
use icu_properties::{provider::*, GeneralCategoryGroup};
26
use icu_properties::{GraphemeClusterBreak, Script, SentenceBreak, WordBreak};
27
use icu_provider::prelude::*;
28

29
/// The kind of error that occurred.
30
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
47✔
31
#[non_exhaustive]
32
pub enum ParseErrorKind {
33
    /// An unexpected character was encountered. This variant implies the other variants
34
    /// (notably `UnknownProperty` and `Unimplemented`) do not apply.
35
    UnexpectedChar(char),
×
36
    /// The property name or value is unknown. For property names, make sure you use the spelling
37
    /// defined in [ECMA-262](https://tc39.es/ecma262/#table-nonbinary-unicode-properties).
38
    UnknownProperty,
39
    /// A reference to an unknown variable.
40
    UnknownVariable,
41
    /// A variable of a certain type occurring in an unexpected context.
42
    UnexpectedVariable,
43
    /// The source is an incomplete unicode set.
44
    Eof,
45
    /// Something unexpected went wrong with our code. Please file a bug report on GitHub.
46
    Internal,
47
    /// The provided syntax is not supported by us. Note that unknown properties will return the
48
    /// `UnknownProperty` variant, not this one.
49
    Unimplemented,
50
    /// The provided escape sequence is not a valid Unicode code point or represents too many
51
    /// code points.
52
    InvalidEscape,
53
}
54
use zerovec::VarZeroVec;
55
use ParseErrorKind as PEK;
56

57
impl ParseErrorKind {
58
    fn with_offset(self, offset: usize) -> ParseError {
301✔
59
        ParseError {
301✔
60
            offset: Some(offset),
301✔
61
            kind: self,
62
        }
63
    }
301✔
64
}
65

66
impl From<ParseErrorKind> for ParseError {
67
    fn from(kind: ParseErrorKind) -> Self {
13,970✔
68
        ParseError { offset: None, kind }
13,970✔
69
    }
13,970✔
70
}
71

72
/// The error type returned by the `parse` functions in this crate.
73
///
74
/// See [`ParseError::fmt_with_source`] for pretty-printing and [`ParseErrorKind`] for the
75
/// different types of errors represented by this struct.
76
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
×
77
pub struct ParseError {
78
    // offset is the index to an arbitrary byte in the last character in the source that makes sense
79
    // to display as location for the error, e.g., the unexpected character itself or
80
    // for an unknown property name the last character of the name.
81
    offset: Option<usize>,
×
82
    kind: ParseErrorKind,
×
83
}
84

85
type Result<T, E = ParseError> = core::result::Result<T, E>;
86

87
impl ParseError {
88
    /// Pretty-prints this error and if applicable, shows where the error occurred in the source.
89
    ///
90
    /// Must be called with the same source that was used to parse the set.
91
    ///
92
    /// # Examples
93
    ///
94
    /// ```
95
    /// use icu::experimental::unicodeset_parse::*;
96
    ///
97
    /// let source = "[[abc]-x]";
98
    /// let set = parse(source);
99
    /// assert!(set.is_err());
100
    /// let err = set.unwrap_err();
101
    /// assert_eq!(
102
    ///     err.fmt_with_source(source).to_string(),
103
    ///     "[[abc]-x← error: unexpected character 'x'"
104
    /// );
105
    /// ```
106
    ///
107
    /// ```
108
    /// use icu::experimental::unicodeset_parse::*;
109
    ///
110
    /// let source = r"[\N{LATIN CAPITAL LETTER A}]";
111
    /// let set = parse(source);
112
    /// assert!(set.is_err());
113
    /// let err = set.unwrap_err();
114
    /// assert_eq!(
115
    ///     err.fmt_with_source(source).to_string(),
116
    ///     r"[\N← error: unimplemented"
117
    /// );
118
    /// ```
119
    pub fn fmt_with_source(&self, source: &str) -> impl Display {
47✔
120
        let ParseError { offset, kind } = *self;
47✔
121

122
        if kind == ParseErrorKind::Eof {
47✔
123
            return format!("{source}← error: unexpected end of input");
4✔
124
        }
125
        let mut s = String::new();
43✔
126
        if let Some(offset) = offset {
43✔
127
            if offset < source.len() {
43✔
128
                // offset points to any byte of the last character we want to display.
129
                // in the case of ASCII, this is easy - we just display bytes [..=offset].
130
                // however, if the last character is more than one byte in UTF-8
131
                // we cannot use ..=offset, because that would potentially include only partial
132
                // bytes of last character in our string. hence we must find the start of the
133
                // following character and use that as the (exclusive) end of our string.
134

135
                // offset points into the last character we want to include, hence the start of the
136
                // first character we want to exclude is at least offset + 1.
137
                let mut exclusive_end = offset + 1;
43✔
138
                // TODO: replace this loop with str::ceil_char_boundary once stable
139
                for _ in 0..3 {
45✔
140
                    // is_char_boundary returns true at the latest once exclusive_end == source.len()
141
                    if source.is_char_boundary(exclusive_end) {
45✔
142
                        break;
143
                    }
144
                    exclusive_end += 1;
2✔
145
                }
146

147
                // exclusive_end is at most source.len() due to str::is_char_boundary and at least 0 by type
148
                #[allow(clippy::indexing_slicing)]
149
                s.push_str(&source[..exclusive_end]);
43✔
150
                s.push_str("← ");
43✔
151
            }
152
        }
153
        s.push_str("error: ");
43✔
154
        match kind {
43✔
155
            ParseErrorKind::UnexpectedChar(c) => {
30✔
156
                s.push_str(&format!("unexpected character '{}'", c.escape_debug()));
30✔
157
            }
158
            ParseErrorKind::UnknownProperty => {
159
                s.push_str("unknown property");
4✔
160
            }
161
            ParseErrorKind::UnknownVariable => {
162
                s.push_str("unknown variable");
×
163
            }
164
            ParseErrorKind::UnexpectedVariable => {
165
                s.push_str("unexpected variable");
6✔
166
            }
167
            ParseErrorKind::Eof => {
168
                s.push_str("unexpected end of input");
×
169
            }
170
            ParseErrorKind::Internal => {
171
                s.push_str("internal error");
×
172
            }
173
            ParseErrorKind::Unimplemented => {
174
                s.push_str("unimplemented");
1✔
175
            }
176
            ParseErrorKind::InvalidEscape => {
177
                s.push_str("invalid escape sequence");
2✔
178
            }
179
        }
180

181
        s
43✔
182
    }
47✔
183

184
    /// Returns the [`ParseErrorKind`] of this error.
185
    pub fn kind(&self) -> ParseErrorKind {
×
186
        self.kind
×
187
    }
×
188

189
    /// Returns the offset of this error in the source string, if it was specified.
190
    pub fn offset(&self) -> Option<usize> {
×
191
        self.offset
×
192
    }
×
193

194
    fn or_with_offset(self, offset: usize) -> Self {
4✔
195
        match self.offset {
4✔
196
            Some(_) => self,
×
197
            None => ParseError {
4✔
198
                offset: Some(offset),
4✔
199
                ..self
200
            },
4✔
201
        }
202
    }
4✔
203
}
204

205
/// The value of a variable in a UnicodeSet. Used as value type in [`VariableMap`].
206
#[derive(Debug, Clone)]
46✔
207
#[non_exhaustive]
208
pub enum VariableValue<'a> {
209
    /// A UnicodeSet, represented as a [`CodePointInversionListAndStringList`](CodePointInversionListAndStringList).
210
    UnicodeSet(CodePointInversionListAndStringList<'a>),
19✔
211
    // in theory, a one-code-point string is always the same as a char, but we might want to keep
212
    // this variant for efficiency?
213
    /// A single code point.
214
    Char(char),
23✔
215
    /// A string. It is guaranteed that when returned from a VariableMap, this variant never contains exactly one code point.
216
    String(Cow<'a, str>),
4✔
217
}
218

219
/// The map used for parsing UnicodeSets with variable support. See [`parse_with_variables`].
220
#[derive(Debug, Clone, Default)]
730✔
221
pub struct VariableMap<'a>(BTreeMap<String, VariableValue<'a>>);
365✔
222

223
impl<'a> VariableMap<'a> {
224
    /// Creates a new empty map.
225
    pub fn new() -> Self {
1✔
226
        Self::default()
1✔
227
    }
1✔
228

229
    /// Removes a key from the map, returning the value at the key if the key
230
    /// was previously in the map.
231
    pub fn remove(&mut self, key: &str) -> Option<VariableValue<'a>> {
×
232
        self.0.remove(key)
×
233
    }
×
234

235
    /// Get a reference to the value associated with this key, if it exists.
236
    pub fn get(&self, key: &str) -> Option<&VariableValue<'a>> {
7✔
237
        self.0.get(key)
7✔
238
    }
7✔
239

240
    /// Insert a `VariableValue` into the `VariableMap`.
241
    ///
242
    /// Returns `Err` with the old value, if it exists, and does not update the map.
243
    pub fn insert(&mut self, key: String, value: VariableValue<'a>) -> Result<(), &VariableValue> {
51✔
244
        // borrow-checker shenanigans, otherwise we could use if let
245
        if self.0.contains_key(&key) {
51✔
246
            // we just checked that this key exists
247
            #[allow(clippy::indexing_slicing)]
248
            return Err(&self.0[&key]);
×
249
        }
250

251
        if let VariableValue::String(s) = &value {
51✔
252
            let mut chars = s.chars();
21✔
253
            if let (Some(c), None) = (chars.next(), chars.next()) {
21✔
254
                self.0.insert(key, VariableValue::Char(c));
16✔
255
                return Ok(());
16✔
256
            };
257
        }
258

259
        self.0.insert(key, value);
35✔
260
        Ok(())
35✔
261
    }
51✔
262

263
    /// Insert a `char` into the `VariableMap`.
264
    ///
265
    /// Returns `Err` with the old value, if it exists, and does not update the map.
266
    pub fn insert_char(&mut self, key: String, c: char) -> Result<(), &VariableValue> {
12✔
267
        // borrow-checker shenanigans, otherwise we could use if let
268
        if self.0.contains_key(&key) {
12✔
269
            // we just checked that this key exists
270
            #[allow(clippy::indexing_slicing)]
271
            return Err(&self.0[&key]);
1✔
272
        }
273

274
        self.0.insert(key, VariableValue::Char(c));
11✔
275
        Ok(())
11✔
276
    }
12✔
277

278
    /// Insert a `String` of any length into the `VariableMap`.
279
    ///
280
    /// Returns `Err` with the old value, if it exists, and does not update the map.
281
    pub fn insert_string(&mut self, key: String, s: String) -> Result<(), &VariableValue> {
3✔
282
        // borrow-checker shenanigans, otherwise we could use if let
283
        if self.0.contains_key(&key) {
3✔
284
            // we just checked that this key exists
285
            #[allow(clippy::indexing_slicing)]
286
            return Err(&self.0[&key]);
×
287
        }
288

289
        let mut chars = s.chars();
3✔
290
        let val = match (chars.next(), chars.next()) {
3✔
291
            (Some(c), None) => VariableValue::Char(c),
×
292
            _ => VariableValue::String(Cow::Owned(s)),
3✔
293
        };
294

295
        self.0.insert(key, val);
3✔
296
        Ok(())
3✔
297
    }
3✔
298

299
    /// Insert a `&str` of any length into the `VariableMap`.
300
    ///
301
    /// Returns `Err` with the old value, if it exists, and does not update the map.
302
    pub fn insert_str(&mut self, key: String, s: &'a str) -> Result<(), &VariableValue> {
×
303
        // borrow-checker shenanigans, otherwise we could use if let
304
        if self.0.contains_key(&key) {
×
305
            // we just checked that this key exists
306
            #[allow(clippy::indexing_slicing)]
307
            return Err(&self.0[&key]);
×
308
        }
309

310
        let mut chars = s.chars();
×
311
        let val = match (chars.next(), chars.next()) {
×
312
            (Some(c), None) => VariableValue::Char(c),
×
313
            _ => VariableValue::String(Cow::Borrowed(s)),
×
314
        };
315

316
        self.0.insert(key, val);
×
317
        Ok(())
×
318
    }
×
319

320
    /// Insert a [`CodePointInversionListAndStringList`](CodePointInversionListAndStringList) into the `VariableMap`.
321
    ///
322
    /// Returns `Err` with the old value, if it exists, and does not update the map.
323
    pub fn insert_set(
3✔
324
        &mut self,
325
        key: String,
326
        set: CodePointInversionListAndStringList<'a>,
327
    ) -> Result<(), &VariableValue> {
328
        // borrow-checker shenanigans, otherwise we could use if let
329
        if self.0.contains_key(&key) {
3✔
330
            // we just checked that this key exists
331
            #[allow(clippy::indexing_slicing)]
332
            return Err(&self.0[&key]);
×
333
        }
334
        self.0.insert(key, VariableValue::UnicodeSet(set));
3✔
335
        Ok(())
3✔
336
    }
3✔
337
}
338

339
// this ignores the ambiguity between \-escapes and \p{} perl properties. it assumes it is in a context where \p is just 'p'
340
// returns whether the provided char signifies the start of a literal char (raw or escaped - so \ is a legal char start)
341
// important: assumes c is not pattern_white_space
342
fn legal_char_start(c: char) -> bool {
1,644✔
343
    !(c == '&' || c == '-' || c == '$' || c == '^' || c == '[' || c == ']' || c == '{')
1,644✔
344
}
1,644✔
345

346
// same as `legal_char_start` but adapted to the charInString nonterminal. \ is allowed due to escapes.
347
// important: assumes c is not pattern_white_space
348
fn legal_char_in_string_start(c: char) -> bool {
277✔
349
    c != '}'
277✔
350
}
277✔
351

352
#[derive(Debug)]
×
353
enum SingleOrMultiChar {
354
    Single(char),
×
355
    // Multi is a marker that indicates parsing was paused and needs to be resumed using parse_multi_escape* when
356
    // this token is consumed. The contained char is the first char of the multi sequence.
357
    Multi(char),
×
358
}
359

360
// A char or a string. Multi-escape sequences (the 2+ case) are represented by `CharKind(SingleOrMultiChar::Multi)`.
361
// invariant: a String is either zero or 2+ chars long, a one-char-string is equivalent to a single char.
362
// invariant: a char is 1+ chars long
363
#[derive(Debug)]
×
364
enum Literal {
365
    String(String),
×
366
    CharKind(SingleOrMultiChar),
×
367
}
368

369
#[derive(Debug)]
×
370
enum MainToken<'data> {
371
    // to be interpreted as value
372
    Literal(Literal),
×
373
    // inner set
374
    UnicodeSet(CodePointInversionListAndStringList<'data>),
×
375
    // anchor, only at the end of a set ([... $])
376
    DollarSign,
377
    // intersection operator, only between two sets ([[...] & [...]])
378
    Ampersand,
379
    // difference operator, only between two sets ([[...] - [...]])
380
    // or
381
    // range operator, only between two chars ([a-z], [a-{z}])
382
    Minus,
383
    // ] to indicate the end of a set
384
    ClosingBracket,
385
}
386

387
impl<'data> MainToken<'data> {
388
    fn from_variable_value(val: VariableValue<'data>) -> Self {
44✔
389
        match val {
44✔
390
            VariableValue::Char(c) => {
21✔
391
                MainToken::Literal(Literal::CharKind(SingleOrMultiChar::Single(c)))
21✔
392
            }
21✔
393
            VariableValue::String(s) => {
4✔
394
                // we know that the VariableMap only contains non-length-1 Strings.
395
                MainToken::Literal(Literal::String(s.into_owned()))
4✔
396
            }
4✔
397
            VariableValue::UnicodeSet(set) => MainToken::UnicodeSet(set),
19✔
398
        }
399
    }
44✔
400
}
401

402
#[derive(Debug, Clone, Copy)]
×
403
enum Operation {
404
    Union,
405
    Difference,
406
    Intersection,
407
}
408

409
// this builds the set on-the-fly while parsing it
410
struct UnicodeSetBuilder<'a, 'b, P: ?Sized> {
411
    single_set: CodePointInversionListBuilder,
412
    string_set: BTreeSet<String>,
413
    iter: &'a mut Peekable<CharIndices<'b>>,
414
    source: &'b str,
415
    inverted: bool,
416
    variable_map: &'a VariableMap<'a>,
417
    xid_start: &'a CodePointInversionList<'a>,
418
    xid_continue: &'a CodePointInversionList<'a>,
419
    pat_ws: &'a CodePointInversionList<'a>,
420
    property_provider: &'a P,
421
}
422

423
impl<'a, 'b, P> UnicodeSetBuilder<'a, 'b, P>
424
where
425
    P: ?Sized
426
        + DataProvider<AsciiHexDigitV1Marker>
427
        + DataProvider<AlphabeticV1Marker>
428
        + DataProvider<BidiControlV1Marker>
429
        + DataProvider<BidiMirroredV1Marker>
430
        + DataProvider<CaseIgnorableV1Marker>
431
        + DataProvider<CasedV1Marker>
432
        + DataProvider<ChangesWhenCasefoldedV1Marker>
433
        + DataProvider<ChangesWhenCasemappedV1Marker>
434
        + DataProvider<ChangesWhenLowercasedV1Marker>
435
        + DataProvider<ChangesWhenNfkcCasefoldedV1Marker>
436
        + DataProvider<ChangesWhenTitlecasedV1Marker>
437
        + DataProvider<ChangesWhenUppercasedV1Marker>
438
        + DataProvider<DashV1Marker>
439
        + DataProvider<DefaultIgnorableCodePointV1Marker>
440
        + DataProvider<DeprecatedV1Marker>
441
        + DataProvider<DiacriticV1Marker>
442
        + DataProvider<EmojiV1Marker>
443
        + DataProvider<EmojiComponentV1Marker>
444
        + DataProvider<EmojiModifierV1Marker>
445
        + DataProvider<EmojiModifierBaseV1Marker>
446
        + DataProvider<EmojiPresentationV1Marker>
447
        + DataProvider<ExtendedPictographicV1Marker>
448
        + DataProvider<ExtenderV1Marker>
449
        + DataProvider<GraphemeBaseV1Marker>
450
        + DataProvider<GraphemeClusterBreakV1Marker>
451
        + DataProvider<GraphemeClusterBreakNameToValueV1Marker>
452
        + DataProvider<GraphemeExtendV1Marker>
453
        + DataProvider<HexDigitV1Marker>
454
        + DataProvider<IdsBinaryOperatorV1Marker>
455
        + DataProvider<IdsTrinaryOperatorV1Marker>
456
        + DataProvider<IdContinueV1Marker>
457
        + DataProvider<IdStartV1Marker>
458
        + DataProvider<IdeographicV1Marker>
459
        + DataProvider<JoinControlV1Marker>
460
        + DataProvider<LogicalOrderExceptionV1Marker>
461
        + DataProvider<LowercaseV1Marker>
462
        + DataProvider<MathV1Marker>
463
        + DataProvider<NoncharacterCodePointV1Marker>
464
        + DataProvider<PatternSyntaxV1Marker>
465
        + DataProvider<PatternWhiteSpaceV1Marker>
466
        + DataProvider<QuotationMarkV1Marker>
467
        + DataProvider<RadicalV1Marker>
468
        + DataProvider<RegionalIndicatorV1Marker>
469
        + DataProvider<SentenceBreakV1Marker>
470
        + DataProvider<SentenceBreakNameToValueV1Marker>
471
        + DataProvider<SentenceTerminalV1Marker>
472
        + DataProvider<SoftDottedV1Marker>
473
        + DataProvider<TerminalPunctuationV1Marker>
474
        + DataProvider<UnifiedIdeographV1Marker>
475
        + DataProvider<UppercaseV1Marker>
476
        + DataProvider<VariationSelectorV1Marker>
477
        + DataProvider<WhiteSpaceV1Marker>
478
        + DataProvider<WordBreakV1Marker>
479
        + DataProvider<WordBreakNameToValueV1Marker>
480
        + DataProvider<XidContinueV1Marker>
481
        + DataProvider<GeneralCategoryMaskNameToValueV1Marker>
482
        + DataProvider<GeneralCategoryV1Marker>
483
        + DataProvider<ScriptNameToValueV1Marker>
484
        + DataProvider<ScriptV1Marker>
485
        + DataProvider<ScriptWithExtensionsPropertyV1Marker>
486
        + DataProvider<XidStartV1Marker>,
487
{
488
    fn new_internal(
682✔
489
        iter: &'a mut Peekable<CharIndices<'b>>,
490
        source: &'b str,
491
        variable_map: &'a VariableMap<'a>,
492
        xid_start: &'a CodePointInversionList<'a>,
493
        xid_continue: &'a CodePointInversionList<'a>,
494
        pat_ws: &'a CodePointInversionList<'a>,
495
        provider: &'a P,
496
    ) -> Self {
497
        UnicodeSetBuilder {
682✔
498
            single_set: CodePointInversionListBuilder::new(),
682✔
499
            string_set: Default::default(),
682✔
500
            iter,
501
            source,
502
            inverted: false,
503
            variable_map,
504
            xid_start,
505
            xid_continue,
506
            pat_ws,
507
            property_provider: provider,
508
        }
×
509
    }
682✔
510

511
    // the entry point, parses a full UnicodeSet. ignores remaining input
512
    fn parse_unicode_set(&mut self) -> Result<()> {
682✔
513
        match self.must_peek_char()? {
682✔
514
            '\\' => self.parse_property_perl(),
26✔
515
            '[' => {
516
                self.iter.next();
651✔
517
                if let Some(':') = self.peek_char() {
651✔
518
                    self.parse_property_posix()
83✔
519
                } else {
520
                    self.parse_unicode_set_inner()
568✔
521
                }
522
            }
523
            '$' => {
524
                // must be variable ref to a UnicodeSet
525
                let (offset, v) = self.parse_variable()?;
3✔
526
                match v {
2✔
527
                    Some(VariableValue::UnicodeSet(s)) => {
1✔
528
                        self.single_set.add_set(s.code_points());
1✔
529
                        self.string_set
2✔
530
                            .extend(s.strings().iter().map(ToString::to_string));
1✔
531
                        Ok(())
1✔
532
                    }
1✔
533
                    Some(_) => Err(PEK::UnexpectedVariable.with_offset(offset)),
1✔
534
                    None => Err(PEK::UnexpectedChar('$').with_offset(offset)),
×
535
                }
536
            }
537
            c => self.error_here(PEK::UnexpectedChar(c)),
1✔
538
        }
539
    }
682✔
540

541
    // beginning [ is already consumed
542
    fn parse_unicode_set_inner(&mut self) -> Result<()> {
568✔
543
        // special cases for the first chars after [
544
        if self.must_peek_char()? == '^' {
568✔
545
            self.iter.next();
111✔
546
            self.inverted = true;
111✔
547
        }
548
        // whitespace allowed between ^ and - in `[^ - ....]`
549
        self.skip_whitespace();
568✔
550
        if self.must_peek_char()? == '-' {
568✔
551
            self.iter.next();
7✔
552
            self.single_set.add_char('-');
7✔
553
        }
554

555
        // repeatedly parse the following:
556
        // char
557
        // char-char
558
        // {string}
559
        // unicodeset
560
        // & and - operators, but only between unicodesets
561
        // $variables in place of strings, chars, or unicodesets
562

563
        #[derive(Debug, Clone, Copy)]
×
564
        enum State {
565
            // a state equivalent to the beginning
566
            Begin,
567
            // a state after a char. implies `prev_char` is Some(_), because we need to buffer it
568
            // in case it is part of a range, e.g., a-z
569
            Char,
570
            // in the middle of parsing a range. implies `prev_char` is Some(_), and the next
571
            // element must be a char as well
572
            CharMinus,
573
            // state directly after parsing a recursive unicode set. operators are only allowed
574
            // in this state
575
            AfterUnicodeSet,
576
            // state directly after parsing an operator. forces the next element to be a recursive
577
            // unicode set
578
            AfterOp,
579
            // state after parsing a $ (that was not a variable reference)
580
            // the only valid next option is a closing bracket
581
            AfterDollar,
582
            // state after parsing a - in an otherwise invalid position
583
            // the only valid next option is a closing bracket
584
            AfterMinus,
585
        }
586
        use State::*;
587

588
        const DEFAULT_OP: Operation = Operation::Union;
589

590
        let mut state = Begin;
568✔
591
        let mut prev_char = None;
568✔
592
        let mut operation = Operation::Union;
568✔
593

594
        loop {
2,521✔
595
            self.skip_whitespace();
2,521✔
596

597
            // for error messages
598
            let (immediate_offset, immediate_char) = self.must_peek()?;
2,521✔
599

600
            let (tok_offset, from_var, tok) = self.parse_main_token()?;
2,521✔
601
            // warning: self.iter should not be advanced any more after this point on any path to
602
            // MT::Literal(Literal::CharKind(SingleOrMultiChar::Multi)), because that variant
603
            // expects a certain self.iter state
604

605
            use MainToken as MT;
606
            use SingleOrMultiChar as SMC;
607
            match (state, tok) {
2,507✔
608
                // the end of this unicode set
609
                (
610
                    Begin | Char | CharMinus | AfterUnicodeSet | AfterDollar | AfterMinus,
611
                    MT::ClosingBracket,
612
                ) => {
613
                    if let Some(prev) = prev_char.take() {
531✔
614
                        self.single_set.add_char(prev);
142✔
615
                    }
616
                    if matches!(state, CharMinus) {
531✔
617
                        self.single_set.add_char('-');
2✔
618
                    }
619

620
                    return Ok(());
531✔
621
                }
622
                // special case ends for -
623
                // [[a-z]-]
624
                (AfterOp, MT::ClosingBracket) if matches!(operation, Operation::Difference) => {
2✔
625
                    self.single_set.add_char('-');
1✔
626
                    return Ok(());
1✔
627
                }
628
                (Begin, MT::Minus) => {
9✔
629
                    self.single_set.add_char('-');
9✔
630
                    state = AfterMinus;
9✔
631
                }
632
                // inner unicode set
633
                (Begin | Char | AfterUnicodeSet | AfterOp, MT::UnicodeSet(set)) => {
209✔
634
                    if let Some(prev) = prev_char.take() {
209✔
635
                        self.single_set.add_char(prev);
3✔
636
                    }
637

638
                    self.process_chars(operation, set.code_points().clone());
209✔
639
                    self.process_strings(
209✔
640
                        operation,
209✔
641
                        set.strings().iter().map(ToString::to_string).collect(),
209✔
642
                    );
643

644
                    operation = DEFAULT_OP;
209✔
645
                    state = AfterUnicodeSet;
209✔
646
                }
209✔
647
                // a literal char (either on its own or as the start of a range)
648
                (
649
                    Begin | Char | AfterUnicodeSet,
650
                    MT::Literal(Literal::CharKind(SMC::Single(c))),
1,062✔
651
                ) => {
652
                    if let Some(prev) = prev_char.take() {
1,062✔
653
                        self.single_set.add_char(prev);
613✔
654
                    }
655
                    prev_char = Some(c);
1,062✔
656
                    state = Char;
1,062✔
657
                }
1,062✔
658
                // a bunch of literal chars as part of a multi-escape sequence
659
                (
660
                    Begin | Char | AfterUnicodeSet,
661
                    MT::Literal(Literal::CharKind(SMC::Multi(first_c))),
4✔
662
                ) => {
663
                    if let Some(prev) = prev_char.take() {
4✔
664
                        self.single_set.add_char(prev);
×
665
                    }
666
                    self.single_set.add_char(first_c);
4✔
667
                    self.parse_multi_escape_into_set()?;
572✔
668

669
                    // Note we cannot go to the Char state, because a multi-escape sequence of
670
                    // length > 1 cannot initiate a range
671
                    state = Begin;
3✔
672
                }
3✔
673
                // a literal string (length != 1, by the `Literal` invariant)
674
                (Begin | Char | AfterUnicodeSet, MT::Literal(Literal::String(s))) => {
56✔
675
                    if let Some(prev) = prev_char.take() {
56✔
676
                        self.single_set.add_char(prev);
18✔
677
                    }
678

679
                    self.string_set.insert(s);
56✔
680
                    state = Begin;
56✔
681
                }
56✔
682
                // parse a literal char as the end of a range
683
                (CharMinus, MT::Literal(Literal::CharKind(SMC::Single(c)))) => {
259✔
684
                    let start = prev_char.ok_or(PEK::Internal.with_offset(tok_offset))?;
259✔
685
                    let end = c;
686
                    if start > end {
259✔
687
                        // TODO(#3558): Better error message (e.g., "start greater than end in range")?
688
                        return Err(PEK::UnexpectedChar(end).with_offset(tok_offset));
4✔
689
                    }
690

691
                    self.single_set.add_range(&(start..=end));
255✔
692
                    prev_char = None;
255✔
693
                    state = Begin;
255✔
694
                }
255✔
695
                // start parsing a char range
696
                (Char, MT::Minus) => {
266✔
697
                    state = CharMinus;
266✔
698
                }
699
                // start parsing a unicode set difference
700
                (AfterUnicodeSet, MT::Minus) => {
30✔
701
                    operation = Operation::Difference;
30✔
702
                    state = AfterOp;
30✔
703
                }
704
                // start parsing a unicode set intersection
705
                (AfterUnicodeSet, MT::Ampersand) => {
27✔
706
                    operation = Operation::Intersection;
27✔
707
                    state = AfterOp;
27✔
708
                }
709
                (Begin | Char | AfterUnicodeSet, MT::DollarSign) => {
36✔
710
                    if let Some(prev) = prev_char.take() {
36✔
711
                        self.single_set.add_char(prev);
21✔
712
                    }
713
                    self.single_set.add_char('\u{FFFF}');
36✔
714
                    state = AfterDollar;
36✔
715
                }
716
                _ => {
717
                    // TODO(#3558): We have precise knowledge about the following MainToken here,
718
                    //  should we make use of that?
719

720
                    if from_var {
17✔
721
                        // otherwise we get error messages such as
722
                        // [$a-$← error: unexpected character '$'
723
                        // for input [$a-$b], $a = 'a', $b = "string" ;
724
                        return Err(PEK::UnexpectedVariable.with_offset(tok_offset));
5✔
725
                    }
726
                    return Err(PEK::UnexpectedChar(immediate_char).with_offset(immediate_offset));
12✔
727
                }
728
            }
729
        }
2,507✔
730
    }
568✔
731

732
    fn parse_main_token(&mut self) -> Result<(usize, bool, MainToken<'a>)> {
2,521✔
733
        let (initial_offset, first) = self.must_peek()?;
2,521✔
734
        if first == ']' {
2,521✔
735
            self.iter.next();
533✔
736
            return Ok((initial_offset, false, MainToken::ClosingBracket));
533✔
737
        }
738
        let (_, second) = self.must_peek_double()?;
1,988✔
739
        match (first, second) {
2,322✔
740
            // variable or anchor
741
            ('$', _) => {
742
                let (offset, var_or_anchor) = self.parse_variable()?;
81✔
743
                match var_or_anchor {
80✔
744
                    None => Ok((offset, false, MainToken::DollarSign)),
36✔
745
                    Some(v) => Ok((offset, true, MainToken::from_variable_value(v.clone()))),
44✔
746
                }
747
            }
748
            // string
749
            ('{', _) => self
68✔
750
                .parse_string()
751
                .map(|(offset, l)| (offset, false, MainToken::Literal(l))),
67✔
752
            // inner set
753
            ('\\', 'p' | 'P') | ('[', _) => {
754
                let mut inner_builder = UnicodeSetBuilder::new_internal(
193✔
755
                    self.iter,
193✔
756
                    self.source,
193✔
757
                    self.variable_map,
193✔
758
                    self.xid_start,
193✔
759
                    self.xid_continue,
193✔
760
                    self.pat_ws,
193✔
761
                    self.property_provider,
193✔
762
                );
763
                inner_builder.parse_unicode_set()?;
2,714✔
764
                let (single, string_set) = inner_builder.finalize();
193✔
765
                // note: offset - 1, because we already consumed full set
766
                let offset = self.must_peek_index()? - 1;
193✔
767
                let mut strings = string_set.into_iter().collect::<Vec<_>>();
192✔
768
                strings.sort();
192✔
769
                let cpilasl = CodePointInversionListAndStringList::try_from(
192✔
770
                    single.build(),
192✔
771
                    VarZeroVec::from(&strings),
192✔
772
                )
192✔
773
                .map_err(|_| PEK::Internal.with_offset(offset))?;
192✔
774
                Ok((offset, false, MainToken::UnicodeSet(cpilasl)))
192✔
775
            }
193✔
776
            // note: c cannot be a whitespace, because we called skip_whitespace just before
777
            // (in the main parse loop), so it's safe to call this guard function
778
            (c, _) if legal_char_start(c) => self
1,644✔
779
                .parse_char()
780
                .map(|(offset, c)| (offset, false, MainToken::Literal(Literal::CharKind(c)))),
1,300✔
781
            ('-', _) => {
782
                self.iter.next();
306✔
783
                Ok((initial_offset, false, MainToken::Minus))
306✔
784
            }
785
            ('&', _) => {
786
                self.iter.next();
29✔
787
                Ok((initial_offset, false, MainToken::Ampersand))
29✔
788
            }
789
            (c, _) => Err(PEK::UnexpectedChar(c).with_offset(initial_offset)),
1✔
790
        }
791
    }
2,521✔
792

793
    // parses a variable or an anchor. expects '$' as next token.
794
    // if this is a single $ (eg `[... $ ]` or the invalid `$ a`), then this function returns Ok(None),
795
    // otherwise Ok(Some(variable_value)).
796
    fn parse_variable(&mut self) -> Result<(usize, Option<&'a VariableValue<'a>>)> {
84✔
797
        self.consume('$')?;
84✔
798

799
        let mut res = String::new();
84✔
800
        let (mut var_offset, first_c) = self.must_peek()?;
84✔
801

802
        if !self.xid_start.contains(first_c) {
83✔
803
            // -1 because we already consumed the '$'
804
            return Ok((var_offset - 1, None));
36✔
805
        }
806

807
        res.push(first_c);
47✔
808
        self.iter.next();
47✔
809
        // important: if we are parsing a root unicodeset as a variable, we might reach EOF as
810
        // a valid end of the variable name, so we cannot use must_peek here.
811
        while let Some(&(offset, c)) = self.iter.peek() {
238✔
812
            if !self.xid_continue.contains(c) {
236✔
813
                break;
814
            }
815
            // only update the offset if we're adding a new char to our variable
816
            var_offset = offset;
191✔
817
            self.iter.next();
191✔
818
            res.push(c);
191✔
819
        }
820

821
        if let Some(v) = self.variable_map.0.get(&res) {
47✔
822
            return Ok((var_offset, Some(v)));
46✔
823
        }
824

825
        Err(PEK::UnknownVariable.with_offset(var_offset))
1✔
826
    }
84✔
827

828
    // parses and consumes: '{' (s charInString)* s '}'
829
    fn parse_string(&mut self) -> Result<(usize, Literal)> {
68✔
830
        self.consume('{')?;
68✔
831

832
        let mut buffer = String::new();
68✔
833
        let mut last_offset;
834

835
        loop {
836
            self.skip_whitespace();
345✔
837
            last_offset = self.must_peek_index()?;
345✔
838
            match self.must_peek_char()? {
344✔
839
                '}' => {
840
                    self.iter.next();
67✔
841
                    break;
842
                }
843
                // note: c cannot be a whitespace, because we called skip_whitespace just before,
844
                // so it's safe to call this guard function
845
                c if legal_char_in_string_start(c) => {
277✔
846
                    // don't need the offset, because '}' will always be the last char
847
                    let (_, c) = self.parse_char()?;
277✔
848
                    match c {
277✔
849
                        SingleOrMultiChar::Single(c) => buffer.push(c),
276✔
850
                        SingleOrMultiChar::Multi(first) => {
1✔
851
                            buffer.push(first);
1✔
852
                            self.parse_multi_escape_into_string(&mut buffer)?;
69✔
853
                        }
854
                    }
855
                }
856
                c => return self.error_here(PEK::UnexpectedChar(c)),
×
857
            }
858
        }
859

860
        let mut chars = buffer.chars();
67✔
861
        let literal = match (chars.next(), chars.next()) {
67✔
862
            (Some(c), None) => Literal::CharKind(SingleOrMultiChar::Single(c)),
14✔
863
            _ => Literal::String(buffer),
53✔
864
        };
865
        Ok((last_offset, literal))
67✔
866
    }
68✔
867

868
    // finishes a partial multi escape parse. in case of a parse error, self.single_set
869
    // may be left in an inconsistent state
870
    fn parse_multi_escape_into_set(&mut self) -> Result<()> {
4✔
871
        // note: would be good to somehow merge the two multi_escape methods. splitting up the UnicodeSetBuilder into a more
872
        // conventional parser + lexer combo might allow this.
873
        // issue is that we cannot pass this method an argument that somehow mutates `self` in the current architecture.
874
        // self.lexer.parse_multi_into_charappendable(&mut self.single_set) should work because the lifetimes are separate
875

876
        // whitespace before first char of this loop (ie, second char in this multi_escape) must be
877
        // enforced when creating the SingleOrMultiChar::Multi.
878
        let mut first = true;
4✔
879
        loop {
4✔
880
            let skipped = self.skip_whitespace();
10✔
881
            match self.must_peek_char()? {
10✔
882
                '}' => {
883
                    self.iter.next();
3✔
884
                    return Ok(());
3✔
885
                }
886
                initial_c => {
887
                    if skipped == 0 && !first {
7✔
888
                        // bracketed hex code points must be separated by whitespace
889
                        return self.error_here(PEK::UnexpectedChar(initial_c));
1✔
890
                    }
891
                    first = false;
6✔
892

893
                    let (_, c) = self.parse_hex_digits_into_char(1, 6)?;
6✔
894
                    self.single_set.add_char(c);
6✔
895
                }
896
            }
897
        }
898
    }
4✔
899

900
    // finishes a partial multi escape parse. in case of a parse error, the caller must clean up the
901
    // string if necessary.
902
    fn parse_multi_escape_into_string(&mut self, s: &mut String) -> Result<()> {
1✔
903
        // whitespace before first char of this loop (ie, second char in this multi_escape) must be
904
        // enforced when creating the SingleOrMultiChar::Multi.
905
        let mut first = true;
1✔
906
        loop {
1✔
907
            let skipped = self.skip_whitespace();
3✔
908
            match self.must_peek_char()? {
3✔
909
                '}' => {
910
                    self.iter.next();
1✔
911
                    return Ok(());
1✔
912
                }
913
                initial_c => {
914
                    if skipped == 0 && !first {
2✔
915
                        // bracketed hex code points must be separated by whitespace
916
                        return self.error_here(PEK::UnexpectedChar(initial_c));
×
917
                    }
918
                    first = false;
2✔
919

920
                    let (_, c) = self.parse_hex_digits_into_char(1, 6)?;
2✔
921
                    s.push(c);
2✔
922
                }
923
            }
924
        }
925
    }
1✔
926

927
    // starts with \ and consumes the whole escape sequence if a single
928
    // char is escaped, otherwise pauses the parse after the first char
929
    fn parse_escaped_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
130✔
930
        self.consume('\\')?;
130✔
931

932
        let (offset, next_char) = self.must_next()?;
130✔
933

934
        match next_char {
130✔
935
            'u' | 'x' if self.peek_char() == Some('{') => {
57✔
936
                // bracketedHex
937
                self.iter.next();
20✔
938

939
                self.skip_whitespace();
20✔
940
                let (_, first_c) = self.parse_hex_digits_into_char(1, 6)?;
20✔
941
                let skipped = self.skip_whitespace();
16✔
942

943
                match self.must_peek()? {
16✔
944
                    (offset, '}') => {
7✔
945
                        self.iter.next();
7✔
946
                        Ok((offset, SingleOrMultiChar::Single(first_c)))
7✔
947
                    }
7✔
948
                    // note: enforcing whitespace after the first char here, because the parse_multi_escape functions
949
                    // won't have access to this information anymore
950
                    (offset, c) if c.is_ascii_hexdigit() && skipped > 0 => {
9✔
951
                        Ok((offset, SingleOrMultiChar::Multi(first_c)))
7✔
952
                    }
7✔
953
                    (_, c) => self.error_here(PEK::UnexpectedChar(c)),
2✔
954
                }
955
            }
956
            'u' => {
957
                // 'u' hex{4}
958
                self.parse_hex_digits_into_char(4, 4)
21✔
959
                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
21✔
960
            }
961
            'x' => {
962
                // 'x' hex{2}
963
                self.parse_hex_digits_into_char(2, 2)
16✔
964
                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
15✔
965
            }
966
            'U' => {
967
                // 'U00' ('0' hex{5} | '10' hex{4})
968
                self.consume('0')?;
3✔
969
                self.consume('0')?;
133✔
970
                self.parse_hex_digits_into_char(6, 6)
3✔
971
                    .map(|(offset, c)| (offset, SingleOrMultiChar::Single(c)))
3✔
972
            }
973
            'N' => {
974
                // parse code point with name in {}
975
                // tracking issue: https://github.com/unicode-org/icu4x/issues/1397
976
                Err(PEK::Unimplemented.with_offset(offset))
1✔
977
            }
978
            'a' => Ok((offset, SingleOrMultiChar::Single('\u{0007}'))),
×
979
            'b' => Ok((offset, SingleOrMultiChar::Single('\u{0008}'))),
×
980
            't' => Ok((offset, SingleOrMultiChar::Single('\u{0009}'))),
×
981
            'n' => Ok((offset, SingleOrMultiChar::Single('\u{000A}'))),
9✔
982
            'v' => Ok((offset, SingleOrMultiChar::Single('\u{000B}'))),
×
983
            'f' => Ok((offset, SingleOrMultiChar::Single('\u{000C}'))),
×
984
            'r' => Ok((offset, SingleOrMultiChar::Single('\u{000D}'))),
8✔
985
            _ => Ok((offset, SingleOrMultiChar::Single(next_char))),
52✔
986
        }
987
    }
130✔
988

989
    // starts with :, consumes the trailing :]
990
    fn parse_property_posix(&mut self) -> Result<()> {
83✔
991
        self.consume(':')?;
83✔
992
        if self.must_peek_char()? == '^' {
83✔
993
            self.inverted = true;
3✔
994
            self.iter.next();
3✔
995
        }
996

997
        self.parse_property_inner(':')?;
83✔
998

999
        self.consume(']')?;
158✔
1000

1001
        Ok(())
75✔
1002
    }
83✔
1003

1004
    // starts with \p{ or \P{, consumes the trailing }
1005
    fn parse_property_perl(&mut self) -> Result<()> {
26✔
1006
        self.consume('\\')?;
26✔
1007
        match self.must_next()? {
26✔
1008
            (_, 'p') => {}
1009
            (_, 'P') => self.inverted = true,
1✔
1010
            (offset, c) => return Err(PEK::UnexpectedChar(c).with_offset(offset)),
×
1011
        }
1012
        self.consume('{')?;
26✔
1013

1014
        self.parse_property_inner('}')?;
51✔
1015

1016
        Ok(())
22✔
1017
    }
26✔
1018

1019
    fn parse_property_inner(&mut self, end: char) -> Result<()> {
108✔
1020
        // UnicodeSet spec ignores whitespace, '-', and '_',
1021
        // but ECMA-262 requires '_', so we'll allow that.
1022
        // TODO(#3559): support loose matching on property names (e.g., "AS  -_-  CII_Hex_ D-igit")
1023
        // TODO(#3559): support more properties than ECMA-262
1024

1025
        let property_offset;
1026

1027
        let mut key_buffer = String::new();
108✔
1028
        let mut value_buffer = String::new();
108✔
1029

1030
        enum State {
1031
            // initial state, nothing parsed yet
1032
            Begin,
1033
            // non-empty property name
1034
            PropertyName,
1035
            // property name parsed, '=' or '≠' parsed, no value parsed yet
1036
            PropertyValueBegin,
1037
            // non-empty property name, non-empty property value
1038
            PropertyValue,
1039
        }
1040
        use State::*;
1041

1042
        let mut state = Begin;
108✔
1043
        // whether '=' (true) or '≠' (false) was parsed
1044
        let mut equality = true;
108✔
1045

1046
        loop {
108✔
1047
            self.skip_whitespace();
649✔
1048
            match (state, self.must_peek_char()?) {
1,293✔
1049
                // parse the end of the property expression
1050
                (PropertyName | PropertyValue, c) if c == end => {
510✔
1051
                    // byte index of (full) property name/value is one back
1052
                    property_offset = self.must_peek_index()? - 1;
101✔
1053
                    self.iter.next();
101✔
1054
                    break;
1055
                }
1056
                // parse the property name
1057
                // NOTE: this might be too strict, because in the case of e.g. [:value:], we might want to
1058
                // allow [:lower-case-letter:] ([:gc=lower-case-letter:] works)
1059
                (Begin | PropertyName, c) if c.is_ascii_alphanumeric() || c == '_' => {
408✔
1060
                    key_buffer.push(c);
371✔
1061
                    self.iter.next();
371✔
1062
                    state = PropertyName;
371✔
1063
                }
371✔
1064
                // parse the name-value separator
1065
                (PropertyName, c @ ('=' | '≠')) => {
31✔
1066
                    equality = c == '=';
31✔
1067
                    self.iter.next();
31✔
1068
                    state = PropertyValueBegin;
31✔
1069
                }
31✔
1070
                // parse the property value
1071
                (PropertyValue | PropertyValueBegin, c) if c != end => {
140✔
1072
                    value_buffer.push(c);
139✔
1073
                    self.iter.next();
139✔
1074
                    state = PropertyValue;
139✔
1075
                }
139✔
1076
                (_, c) => return self.error_here(PEK::UnexpectedChar(c)),
7✔
1077
            }
1078
        }
1079

1080
        if !equality {
101✔
1081
            self.inverted = !self.inverted;
5✔
1082
        }
1083

1084
        let inverted = self
101✔
1085
            .load_property_codepoints(&key_buffer, &value_buffer)
101✔
1086
            // any error that does not already have an offset should use the appropriate property offset
1087
            .map_err(|e| e.or_with_offset(property_offset))?;
109✔
1088
        if inverted {
97✔
1089
            self.inverted = !self.inverted;
3✔
1090
        }
1091

1092
        Ok(())
97✔
1093
    }
108✔
1094

1095
    // returns whether the set needs to be inverted or not
1096
    fn load_property_codepoints(&mut self, key: &str, value: &str) -> Result<bool> {
101✔
1097
        // we support:
1098
        // [:gc = value:]
1099
        // [:sc = value:]
1100
        // [:scx = value:]
1101
        // [:Grapheme_Cluster_Break = value:]
1102
        // [:Sentence_Break = value:]
1103
        // [:Word_Break = value:]
1104
        // [:value:] - looks up value in gc, sc
1105
        // [:prop:] - binary property, returns codepoints that have the property
1106
        // [:prop = truthy/falsy:] - same as above
1107

1108
        let mut inverted = false;
101✔
1109

1110
        // contains a value for the General_Category property that needs to be tried
1111
        let mut try_gc = Err(PEK::UnknownProperty.into());
101✔
1112
        // contains a value for the Script property that needs to be tried
1113
        let mut try_sc = Err(PEK::UnknownProperty.into());
101✔
1114
        // contains a value for the Script_Extensions property that needs to be tried
1115
        let mut try_scx = Err(PEK::UnknownProperty.into());
101✔
1116
        // contains a value for the Grapheme_Cluster_Break property that needs to be tried
1117
        let mut try_gcb = Err(PEK::UnknownProperty.into());
101✔
1118
        // contains a value for the Sentence_Break property that needs to be tried
1119
        let mut try_sb = Err(PEK::UnknownProperty.into());
101✔
1120
        // contains a value for the Word_Break property that needs to be tried
1121
        let mut try_wb = Err(PEK::UnknownProperty.into());
101✔
1122
        // contains a supposed binary property name that needs to be tried
1123
        let mut try_binary = Err(PEK::UnknownProperty.into());
101✔
1124

1125
        if !value.is_empty() {
101✔
1126
            // key is gc, sc, scx, grapheme cluster break, sentence break, word break
1127
            // value is a property value
1128
            // OR
1129
            // key is a binary property and value is a truthy/falsy value
1130

1131
            match key {
1132
                "General_Category" | "gc" => try_gc = Ok(value),
30✔
1133
                "Grapheme_Cluster_Break" => try_gcb = Ok(value),
26✔
1134
                "Script" | "sc" => try_sc = Ok(value),
25✔
1135
                "Script_Extensions" | "scx" => try_scx = Ok(value),
20✔
1136
                "Sentence_Break" => try_sb = Ok(value),
16✔
1137
                "Word_Break" => try_wb = Ok(value),
15✔
1138
                _ => {
1139
                    let normalized_value = value.to_ascii_lowercase();
14✔
1140
                    let truthy = matches!(normalized_value.as_str(), "true" | "t" | "yes" | "y");
14✔
1141
                    let falsy = matches!(normalized_value.as_str(), "false" | "f" | "no" | "n");
14✔
1142
                    // value must either match truthy or falsy
1143
                    if truthy == falsy {
14✔
1144
                        return Err(PEK::UnknownProperty.into());
×
1145
                    }
1146
                    // correctness: if we reach this point, only `try_binary` can be Ok, hence
1147
                    // it does not matter that further down we unconditionally return `inverted`,
1148
                    // because only `try_binary` can enter that code path.
1149
                    inverted = falsy;
14✔
1150
                    try_binary = Ok(key);
14✔
1151
                }
14✔
1152
            }
1153
        } else {
1154
            // key is binary property
1155
            // OR a value of gc, sc (only gc or sc are supported as implicit keys by UTS35!)
1156
            try_gc = Ok(key);
71✔
1157
            try_sc = Ok(key);
71✔
1158
            try_binary = Ok(key);
71✔
1159
        }
1160

1161
        try_gc
909✔
1162
            .and_then(|value| self.try_load_general_category_set(value))
176✔
1163
            .or_else(|_| try_sc.and_then(|value| self.try_load_script_set(value)))
172✔
1164
            .or_else(|_| try_scx.and_then(|value| self.try_load_script_extensions_set(value)))
137✔
1165
            .or_else(|_| try_binary.and_then(|value| self.try_load_ecma262_binary_set(value)))
152✔
1166
            .or_else(|_| try_gcb.and_then(|value| self.try_load_grapheme_cluster_break_set(value)))
109✔
1167
            .or_else(|_| try_sb.and_then(|value| self.try_load_sentence_break_set(value)))
108✔
1168
            .or_else(|_| try_wb.and_then(|value| self.try_load_word_break_set(value)))?;
107✔
1169
        Ok(inverted)
97✔
1170
    }
101✔
1171

1172
    // Consumes the builder and returns the collected code points and strings. For an
    // inverted set the code points are complemented and all strings are dropped.
    fn finalize(mut self) -> (CodePointInversionListBuilder, BTreeSet<String>) {
        if !self.inverted {
            return (self.single_set, self.string_set);
        }

        // code point inversion; removes all strings
        #[cfg(feature = "log")]
        if !self.string_set.is_empty() {
            log::info!(
                "Inverting a unicode set with strings. This removes all strings entirely."
            );
        }
        self.string_set.clear();
        self.single_set.complement();

        (self.single_set, self.string_set)
    }
630✔
1187

1188
    // parses either a raw char or an escaped char. all chars are allowed, the caller must make sure to handle
1189
    // cases where some characters are not allowed
1190
    fn parse_char(&mut self) -> Result<(usize, SingleOrMultiChar)> {
1,585✔
1191
        let (offset, c) = self.must_peek()?;
1,585✔
1192
        match c {
1,585✔
1193
            '\\' => self.parse_escaped_char(),
130✔
1194
            _ => {
1195
                self.iter.next();
1,455✔
1196
                Ok((offset, SingleOrMultiChar::Single(c)))
1,455✔
1197
            }
1198
        }
1199
    }
1,585✔
1200

1201
    // note: could turn this from the current two-pass approach into a one-pass approach
1202
    // by manually parsing the digits instead of using u32::from_str_radix.
1203
    fn parse_hex_digits_into_char(&mut self, min: usize, max: usize) -> Result<(usize, char)> {
68✔
1204
        let first_offset = self.must_peek_index()?;
68✔
1205
        let end_offset = self.validate_hex_digits(min, max)?;
68✔
1206

1207
        // validate_hex_digits ensures that chars (including the last one) are ascii hex digits,
1208
        // which are all exactly one UTF-8 byte long, so slicing on these offsets always respects char boundaries
1209
        #[allow(clippy::indexing_slicing)]
1210
        let hex_source = &self.source[first_offset..=end_offset];
65✔
1211
        let num = u32::from_str_radix(hex_source, 16).map_err(|_| PEK::Internal)?;
65✔
1212
        char::try_from(num)
195✔
1213
            .map(|c| (end_offset, c))
128✔
1214
            .map_err(|_| PEK::InvalidEscape.with_offset(end_offset))
67✔
1215
    }
68✔
1216

1217
    // validates [0-9a-fA-F]{min,max}, returns the offset of the last digit, consuming everything in the process
1218
    fn validate_hex_digits(&mut self, min: usize, max: usize) -> Result<usize> {
68✔
1219
        let mut last_offset = 0;
68✔
1220
        for count in 0..max {
277✔
1221
            let (offset, c) = self.must_peek()?;
233✔
1222
            if !c.is_ascii_hexdigit() {
233✔
1223
                if count < min {
24✔
1224
                    return Err(PEK::UnexpectedChar(c).with_offset(offset));
3✔
1225
                } else {
1226
                    break;
1227
                }
1228
            }
1229
            self.iter.next();
209✔
1230
            last_offset = offset;
209✔
1231
        }
1232
        Ok(last_offset)
65✔
1233
    }
68✔
1234

1235
    // returns the number of skipped whitespace chars
1236
    fn skip_whitespace(&mut self) -> usize {
4,132✔
1237
        let mut num = 0;
4,132✔
1238
        while let Some(c) = self.peek_char() {
4,271✔
1239
            if !self.pat_ws.contains(c) {
4,270✔
1240
                break;
1241
            }
1242
            self.iter.next();
139✔
1243
            num += 1;
139✔
1244
        }
1245
        num
4,132✔
1246
    }
4,132✔
1247

1248
    fn consume(&mut self, expected: char) -> Result<()> {
498✔
1249
        match self.must_next()? {
498✔
1250
            (offset, c) if c != expected => Err(PEK::UnexpectedChar(c).with_offset(offset)),
498✔
1251
            _ => Ok(()),
497✔
1252
        }
1253
    }
498✔
1254

1255
    // use this whenever an empty iterator would imply an Eof error
1256
    fn must_next(&mut self) -> Result<(usize, char)> {
654✔
1257
        self.iter.next().ok_or(PEK::Eof.into())
654✔
1258
    }
654✔
1259

1260
    // use this whenever an empty iterator would imply an Eof error
1261
    fn must_peek(&mut self) -> Result<(usize, char)> {
10,573✔
1262
        self.iter.peek().copied().ok_or(PEK::Eof.into())
10,573✔
1263
    }
10,573✔
1264

1265
    // must_peek, but looks two chars ahead. use sparingly
1266
    fn must_peek_double(&mut self) -> Result<(usize, char)> {
1,988✔
1267
        let mut copy = self.iter.clone();
1,988✔
1268
        copy.next();
1,988✔
1269
        copy.next().ok_or(PEK::Eof.into())
1,988✔
1270
    }
1,988✔
1271

1272
    // see must_peek
1273
    fn must_peek_char(&mut self) -> Result<char> {
2,907✔
1274
        self.must_peek().map(|(_, c)| c)
5,813✔
1275
    }
2,907✔
1276

1277
    // see must_peek
1278
    fn must_peek_index(&mut self) -> Result<usize> {
707✔
1279
        self.must_peek().map(|(idx, _)| idx)
1,412✔
1280
    }
707✔
1281

1282
    fn peek_char(&mut self) -> Option<char> {
4,979✔
1283
        self.iter.peek().map(|&(_, c)| c)
9,957✔
1284
    }
4,979✔
1285

1286
    // TODO: return Result<!> once ! is stable
1287
    #[inline]
1288
    fn error_here<T>(&mut self, kind: ParseErrorKind) -> Result<T> {
11✔
1289
        match self.iter.peek() {
11✔
1290
            None => Err(kind.into()),
×
1291
            Some(&(offset, _)) => Err(kind.with_offset(offset)),
11✔
1292
        }
1293
    }
11✔
1294

1295
    fn process_strings(&mut self, op: Operation, other_strings: BTreeSet<String>) {
209✔
1296
        match op {
209✔
1297
            Operation::Union => self.string_set.extend(other_strings),
157✔
1298
            Operation::Difference => {
1299
                self.string_set = self
27✔
1300
                    .string_set
1301
                    .difference(&other_strings)
1302
                    .cloned()
UNCOV
1303
                    .collect()
×
1304
            }
27✔
1305
            Operation::Intersection => {
1306
                self.string_set = self
25✔
1307
                    .string_set
1308
                    .intersection(&other_strings)
1309
                    .cloned()
UNCOV
1310
                    .collect()
×
1311
            }
25✔
1312
        }
1313
    }
209✔
1314

1315
    fn process_chars(&mut self, op: Operation, other_chars: CodePointInversionList) {
209✔
1316
        match op {
209✔
1317
            Operation::Union => self.single_set.add_set(&other_chars),
157✔
1318
            Operation::Difference => self.single_set.remove_set(&other_chars),
27✔
1319
            Operation::Intersection => self.single_set.retain_set(&other_chars),
25✔
1320
        }
1321
    }
209✔
1322

1323
    fn try_load_general_category_set(&mut self, name: &str) -> Result<()> {
75✔
1324
        // TODO(#3550): This could be cached; does not depend on name.
1325
        let name_map = GeneralCategoryGroup::get_name_to_enum_mapper(self.property_provider)
75✔
1326
            .map_err(|_| PEK::Internal)?;
×
1327
        let gc_value = name_map
75✔
1328
            .as_borrowed()
1329
            .get_loose(name)
1330
            .ok_or(PEK::UnknownProperty)?;
96✔
1331
        // TODO(#3550): This could be cached; does not depend on name.
1332
        let set = load_for_general_category_group(self.property_provider, gc_value)
54✔
1333
            .map_err(|_| PEK::Internal)?;
×
1334
        self.single_set.add_set(&set.to_code_point_inversion_list());
54✔
1335
        Ok(())
54✔
1336
    }
75✔
1337

1338
    fn try_get_script(&self, name: &str) -> Result<Script> {
28✔
1339
        // TODO(#3550): This could be cached; does not depend on name.
1340
        let name_map =
1341
            Script::get_name_to_enum_mapper(self.property_provider).map_err(|_| PEK::Internal)?;
28✔
1342
        name_map
28✔
1343
            .as_borrowed()
1344
            .get_loose(name)
1345
            .ok_or(PEK::UnknownProperty.into())
28✔
1346
    }
28✔
1347

1348
    fn try_load_script_set(&mut self, name: &str) -> Result<()> {
24✔
1349
        let sc_value = self.try_get_script(name)?;
24✔
1350
        // TODO(#3550): This could be cached; does not depend on name.
1351
        let property_map = load_script(self.property_provider).map_err(|_| PEK::Internal)?;
15✔
1352
        let set = property_map.as_borrowed().get_set_for_value(sc_value);
15✔
1353
        self.single_set.add_set(&set.to_code_point_inversion_list());
15✔
1354
        Ok(())
15✔
1355
    }
24✔
1356

1357
    fn try_load_script_extensions_set(&mut self, name: &str) -> Result<()> {
4✔
1358
        // TODO(#3550): This could be cached; does not depend on name.
1359
        let scx = load_script_with_extensions_unstable(self.property_provider)
4✔
1360
            .map_err(|_| PEK::Internal)?;
×
1361
        let sc_value = self.try_get_script(name)?;
4✔
1362
        let set = scx.as_borrowed().get_script_extensions_set(sc_value);
4✔
1363
        self.single_set.add_set(&set);
4✔
1364
        Ok(())
4✔
1365
    }
4✔
1366

1367
    fn try_load_ecma262_binary_set(&mut self, name: &str) -> Result<()> {
23✔
1368
        let set = load_for_ecma262_unstable(self.property_provider, name)
23✔
1369
            .map_err(|_| PEK::UnknownProperty)?;
4✔
1370
        self.single_set.add_set(&set.to_code_point_inversion_list());
21✔
1371
        Ok(())
21✔
1372
    }
23✔
1373

1374
    fn try_load_grapheme_cluster_break_set(&mut self, name: &str) -> Result<()> {
1✔
1375
        let name_map = GraphemeClusterBreak::get_name_to_enum_mapper(self.property_provider)
1✔
1376
            .map_err(|_| PEK::Internal)?;
×
1377
        let gcb_value = name_map
1✔
1378
            .as_borrowed()
1379
            .get_loose(name)
1380
            .ok_or(PEK::UnknownProperty)?;
1✔
1381
        // TODO(#3550): This could be cached; does not depend on name.
1382
        let property_map =
1383
            load_grapheme_cluster_break(self.property_provider).map_err(|_| PEK::Internal)?;
1✔
1384
        let set = property_map.as_borrowed().get_set_for_value(gcb_value);
1✔
1385
        self.single_set.add_set(&set.to_code_point_inversion_list());
1✔
1386
        Ok(())
1✔
1387
    }
1✔
1388

1389
    fn try_load_sentence_break_set(&mut self, name: &str) -> Result<()> {
1✔
1390
        let name_map = SentenceBreak::get_name_to_enum_mapper(self.property_provider)
1✔
1391
            .map_err(|_| PEK::Internal)?;
×
1392
        let sb_value = name_map
1✔
1393
            .as_borrowed()
1394
            .get_loose(name)
1395
            .ok_or(PEK::UnknownProperty)?;
1✔
1396
        // TODO(#3550): This could be cached; does not depend on name.
1397
        let property_map =
1398
            load_sentence_break(self.property_provider).map_err(|_| PEK::Internal)?;
1✔
1399
        let set = property_map.as_borrowed().get_set_for_value(sb_value);
1✔
1400
        self.single_set.add_set(&set.to_code_point_inversion_list());
1✔
1401
        Ok(())
1✔
1402
    }
1✔
1403

1404
    fn try_load_word_break_set(&mut self, name: &str) -> Result<()> {
1✔
1405
        let name_map = WordBreak::get_name_to_enum_mapper(self.property_provider)
1✔
1406
            .map_err(|_| PEK::Internal)?;
×
1407
        let wb_value = name_map
1✔
1408
            .as_borrowed()
1409
            .get_loose(name)
1410
            .ok_or(PEK::UnknownProperty)?;
1✔
1411
        // TODO(#3550): This could be cached; does not depend on name.
1412
        let property_map = load_word_break(self.property_provider).map_err(|_| PEK::Internal)?;
1✔
1413
        let set = property_map.as_borrowed().get_set_for_value(wb_value);
1✔
1414
        self.single_set.add_set(&set.to_code_point_inversion_list());
1✔
1415
        Ok(())
1✔
1416
    }
1✔
1417
}
1418

1419
/// Parses a UnicodeSet pattern and returns a UnicodeSet in the form of a [`CodePointInversionListAndStringList`],
/// as well as the number of bytes consumed from the source string.
///
/// Supports UnicodeSets as described in [UTS #35 - Unicode Sets](https://unicode.org/reports/tr35/#Unicode_Sets).
///
/// The error type of the returned Result can be pretty-printed with [`ParseError::fmt_with_source`].
///
/// # Variables
///
/// If you need support for variables inside UnicodeSets (e.g., `[$start-$end]`), use [`parse_with_variables`].
///
/// # Limitations
///
/// * Currently, we only support the [ECMA-262 properties](https://tc39.es/ecma262/#table-nonbinary-unicode-properties),
///   plus the `Grapheme_Cluster_Break`, `Sentence_Break`, and `Word_Break` properties.
///   The property names must match the exact spelling listed in ECMA-262. Note that we do support UTS35 syntax for elided `General_Category`
///   and `Script` property names, i.e., `[:Latn:]` and `[:Ll:]` are both valid, with the former implying the `Script` property, and the latter the
///   `General_Category` property.
/// * We do not support `\N{Unicode code point name}` character escaping. Use any other escape method described in UTS35.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
///
/// # Examples
///
/// Parse ranges
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let source = "[a-zA-Z0-9]";
/// let (set, consumed) = parse(source).unwrap();
/// let code_points = set.code_points();
///
/// assert!(code_points.contains_range(&('a'..='z')));
/// assert!(code_points.contains_range(&('A'..='Z')));
/// assert!(code_points.contains_range(&('0'..='9')));
/// assert_eq!(consumed, source.len());
/// ```
///
/// Parse properties, set operations, inner sets
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) =
///     parse("[[:^ll:]-[^][:gc = Lowercase Letter:]&[^[[^]-[a-z]]]]").unwrap();
/// let elements = 'a'..='z';
/// assert!(set.code_points().contains_range(&elements));
/// assert_eq!(elements.count(), set.size());
/// ```
///
/// Inversions remove strings
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) =
///     parse(r"[[a-z{hello\ world}]&[^a-y{hello\ world}]]").unwrap();
/// assert!(set.contains_char('z'));
/// assert_eq!(set.size(), 1);
/// assert!(!set.has_strings());
/// ```
///
/// Set operators (including the implicit union) have the same precedence and are left-associative
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, _) = parse("[[ace][bdf] - [abc][def]]").unwrap();
/// let elements = 'd'..='f';
/// assert!(set.code_points().contains_range(&elements));
/// assert_eq!(set.size(), elements.count());
/// ```
///
/// Supports partial parses
/// ```
/// use icu::experimental::unicodeset_parse::parse;
///
/// let (set, consumed) = parse("[a-c][x-z]").unwrap();
/// let code_points = set.code_points();
/// let elements = 'a'..='c';
/// let elements_unparsed = 'x'..='z';
/// assert!(code_points.contains_range(&elements));
/// assert!(!code_points.contains_range(&elements_unparsed));
/// assert_eq!(set.size(), elements.count());
/// // only the first UnicodeSet is parsed
/// assert_eq!(consumed, "[a-c]".len());
/// ```
#[cfg(feature = "compiled_data")]
pub fn parse(source: &str) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    // Delegate to the provider-generic entry point using the compiled (baked) property data.
    let baked = &icu_properties::provider::Baked;
    parse_unstable(source, baked)
}
134✔
1508

1509
/// Parses a UnicodeSet pattern with support for variables enabled.
///
/// Variables are looked up in `variable_map`; the parse is otherwise performed against the
/// compiled property data.
///
/// See [`parse`] for more information.
///
/// # Examples
///
/// ```
/// use icu::experimental::unicodeset_parse::*;
///
/// let (my_set, _) = parse("[abc]").unwrap();
///
/// let mut variable_map = VariableMap::new();
/// variable_map.insert_char("start".into(), 'a').unwrap();
/// variable_map.insert_char("end".into(), 'z').unwrap();
/// variable_map.insert_string("str".into(), "Hello World".into()).unwrap();
/// variable_map.insert_set("the_set".into(), my_set).unwrap();
///
/// // If a variable already exists, `Err` is returned, and the map is not updated.
/// variable_map.insert_char("end".into(), 'Ω').unwrap_err();
///
/// let source = "[[$start-$end]-$the_set $str]";
/// let (set, consumed) = parse_with_variables(source, &variable_map).unwrap();
/// assert_eq!(consumed, source.len());
/// assert!(set.code_points().contains_range(&('d'..='z')));
/// assert!(set.contains("Hello World"));
/// assert_eq!(set.size(), 1 + ('d'..='z').count());
/// ```
#[cfg(feature = "compiled_data")]
pub fn parse_with_variables(
    source: &str,
    variable_map: &VariableMap<'_>,
) -> Result<(CodePointInversionListAndStringList<'static>, usize)> {
    parse_unstable_with_variables(source, variable_map, &icu_properties::provider::Baked)
}
73✔
1542

1543
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, parse_with_variables)]
1544
pub fn parse_unstable_with_variables<P>(
491✔
1545
    source: &str,
1546
    variable_map: &VariableMap<'_>,
1547
    provider: &P,
1548
) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
1549
where
1550
    P: ?Sized
1551
        + DataProvider<AsciiHexDigitV1Marker>
1552
        + DataProvider<AlphabeticV1Marker>
1553
        + DataProvider<BidiControlV1Marker>
1554
        + DataProvider<BidiMirroredV1Marker>
1555
        + DataProvider<CaseIgnorableV1Marker>
1556
        + DataProvider<CasedV1Marker>
1557
        + DataProvider<ChangesWhenCasefoldedV1Marker>
1558
        + DataProvider<ChangesWhenCasemappedV1Marker>
1559
        + DataProvider<ChangesWhenLowercasedV1Marker>
1560
        + DataProvider<ChangesWhenNfkcCasefoldedV1Marker>
1561
        + DataProvider<ChangesWhenTitlecasedV1Marker>
1562
        + DataProvider<ChangesWhenUppercasedV1Marker>
1563
        + DataProvider<DashV1Marker>
1564
        + DataProvider<DefaultIgnorableCodePointV1Marker>
1565
        + DataProvider<DeprecatedV1Marker>
1566
        + DataProvider<DiacriticV1Marker>
1567
        + DataProvider<EmojiV1Marker>
1568
        + DataProvider<EmojiComponentV1Marker>
1569
        + DataProvider<EmojiModifierV1Marker>
1570
        + DataProvider<EmojiModifierBaseV1Marker>
1571
        + DataProvider<EmojiPresentationV1Marker>
1572
        + DataProvider<ExtendedPictographicV1Marker>
1573
        + DataProvider<ExtenderV1Marker>
1574
        + DataProvider<GraphemeBaseV1Marker>
1575
        + DataProvider<GraphemeClusterBreakV1Marker>
1576
        + DataProvider<GraphemeClusterBreakNameToValueV1Marker>
1577
        + DataProvider<GraphemeExtendV1Marker>
1578
        + DataProvider<HexDigitV1Marker>
1579
        + DataProvider<IdsBinaryOperatorV1Marker>
1580
        + DataProvider<IdsTrinaryOperatorV1Marker>
1581
        + DataProvider<IdContinueV1Marker>
1582
        + DataProvider<IdStartV1Marker>
1583
        + DataProvider<IdeographicV1Marker>
1584
        + DataProvider<JoinControlV1Marker>
1585
        + DataProvider<LogicalOrderExceptionV1Marker>
1586
        + DataProvider<LowercaseV1Marker>
1587
        + DataProvider<MathV1Marker>
1588
        + DataProvider<NoncharacterCodePointV1Marker>
1589
        + DataProvider<PatternSyntaxV1Marker>
1590
        + DataProvider<PatternWhiteSpaceV1Marker>
1591
        + DataProvider<QuotationMarkV1Marker>
1592
        + DataProvider<RadicalV1Marker>
1593
        + DataProvider<RegionalIndicatorV1Marker>
1594
        + DataProvider<SentenceBreakV1Marker>
1595
        + DataProvider<SentenceBreakNameToValueV1Marker>
1596
        + DataProvider<SentenceTerminalV1Marker>
1597
        + DataProvider<SoftDottedV1Marker>
1598
        + DataProvider<TerminalPunctuationV1Marker>
1599
        + DataProvider<UnifiedIdeographV1Marker>
1600
        + DataProvider<UppercaseV1Marker>
1601
        + DataProvider<VariationSelectorV1Marker>
1602
        + DataProvider<WhiteSpaceV1Marker>
1603
        + DataProvider<WordBreakV1Marker>
1604
        + DataProvider<WordBreakNameToValueV1Marker>
1605
        + DataProvider<XidContinueV1Marker>
1606
        + DataProvider<GeneralCategoryMaskNameToValueV1Marker>
1607
        + DataProvider<GeneralCategoryV1Marker>
1608
        + DataProvider<ScriptNameToValueV1Marker>
1609
        + DataProvider<ScriptV1Marker>
1610
        + DataProvider<ScriptWithExtensionsPropertyV1Marker>
1611
        + DataProvider<XidStartV1Marker>,
1612
{
1613
    // TODO(#3550): Add function "parse_overescaped" that uses a custom iterator to de-overescape (i.e., maps \\ to \) on-the-fly?
1614
    // ^ will likely need a different iterator type on UnicodeSetBuilder
1615

1616
    let mut iter = source.char_indices().peekable();
491✔
1617

1618
    let xid_start = load_xid_start(provider).map_err(|_| PEK::Internal)?;
491✔
1619
    let xid_start_list = xid_start.to_code_point_inversion_list();
491✔
1620
    let xid_continue = load_xid_continue(provider).map_err(|_| PEK::Internal)?;
489✔
1621
    let xid_continue_list = xid_continue.to_code_point_inversion_list();
489✔
1622

1623
    let pat_ws = load_pattern_white_space(provider).map_err(|_| PEK::Internal)?;
489✔
1624
    let pat_ws_list = pat_ws.to_code_point_inversion_list();
489✔
1625

1626
    let mut builder = UnicodeSetBuilder::new_internal(
489✔
1627
        &mut iter,
1628
        source,
1629
        variable_map,
1630
        &xid_start_list,
1631
        &xid_continue_list,
1632
        &pat_ws_list,
1633
        provider,
1634
    );
489✔
1635

1636
    builder.parse_unicode_set()?;
980✔
1637
    let (single, string_set) = builder.finalize();
437✔
1638
    let built_single = single.build();
437✔
1639

1640
    let mut strings = string_set.into_iter().collect::<Vec<_>>();
437✔
1641
    strings.sort();
437✔
1642
    let zerovec = (&strings).into();
437✔
1643

1644
    let cpinvlistandstrlist = CodePointInversionListAndStringList::try_from(built_single, zerovec)
437✔
1645
        .map_err(|_| PEK::Internal)?;
×
1646

1647
    let parsed_bytes = match iter.peek().copied() {
437✔
1648
        None => source.len(),
163✔
1649
        Some((offset, _)) => offset,
274✔
1650
    };
1651

1652
    Ok((cpinvlistandstrlist, parsed_bytes))
437✔
1653
}
489✔
1654

1655
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, parse)]
1656
pub fn parse_unstable<P>(
150✔
1657
    source: &str,
1658
    provider: &P,
1659
) -> Result<(CodePointInversionListAndStringList<'static>, usize)>
1660
where
1661
    P: ?Sized
1662
        + DataProvider<AsciiHexDigitV1Marker>
1663
        + DataProvider<AlphabeticV1Marker>
1664
        + DataProvider<BidiControlV1Marker>
1665
        + DataProvider<BidiMirroredV1Marker>
1666
        + DataProvider<CaseIgnorableV1Marker>
1667
        + DataProvider<CasedV1Marker>
1668
        + DataProvider<ChangesWhenCasefoldedV1Marker>
1669
        + DataProvider<ChangesWhenCasemappedV1Marker>
1670
        + DataProvider<ChangesWhenLowercasedV1Marker>
1671
        + DataProvider<ChangesWhenNfkcCasefoldedV1Marker>
1672
        + DataProvider<ChangesWhenTitlecasedV1Marker>
1673
        + DataProvider<ChangesWhenUppercasedV1Marker>
1674
        + DataProvider<DashV1Marker>
1675
        + DataProvider<DefaultIgnorableCodePointV1Marker>
1676
        + DataProvider<DeprecatedV1Marker>
1677
        + DataProvider<DiacriticV1Marker>
1678
        + DataProvider<EmojiV1Marker>
1679
        + DataProvider<EmojiComponentV1Marker>
1680
        + DataProvider<EmojiModifierV1Marker>
1681
        + DataProvider<EmojiModifierBaseV1Marker>
1682
        + DataProvider<EmojiPresentationV1Marker>
1683
        + DataProvider<ExtendedPictographicV1Marker>
1684
        + DataProvider<ExtenderV1Marker>
1685
        + DataProvider<GraphemeBaseV1Marker>
1686
        + DataProvider<GraphemeClusterBreakV1Marker>
1687
        + DataProvider<GraphemeClusterBreakNameToValueV1Marker>
1688
        + DataProvider<GraphemeExtendV1Marker>
1689
        + DataProvider<HexDigitV1Marker>
1690
        + DataProvider<IdsBinaryOperatorV1Marker>
1691
        + DataProvider<IdsTrinaryOperatorV1Marker>
1692
        + DataProvider<IdContinueV1Marker>
1693
        + DataProvider<IdStartV1Marker>
1694
        + DataProvider<IdeographicV1Marker>
1695
        + DataProvider<JoinControlV1Marker>
1696
        + DataProvider<LogicalOrderExceptionV1Marker>
1697
        + DataProvider<LowercaseV1Marker>
1698
        + DataProvider<MathV1Marker>
1699
        + DataProvider<NoncharacterCodePointV1Marker>
1700
        + DataProvider<PatternSyntaxV1Marker>
1701
        + DataProvider<PatternWhiteSpaceV1Marker>
1702
        + DataProvider<QuotationMarkV1Marker>
1703
        + DataProvider<RadicalV1Marker>
1704
        + DataProvider<RegionalIndicatorV1Marker>
1705
        + DataProvider<SentenceBreakV1Marker>
1706
        + DataProvider<SentenceBreakNameToValueV1Marker>
1707
        + DataProvider<SentenceTerminalV1Marker>
1708
        + DataProvider<SoftDottedV1Marker>
1709
        + DataProvider<TerminalPunctuationV1Marker>
1710
        + DataProvider<UnifiedIdeographV1Marker>
1711
        + DataProvider<UppercaseV1Marker>
1712
        + DataProvider<VariationSelectorV1Marker>
1713
        + DataProvider<WhiteSpaceV1Marker>
1714
        + DataProvider<WordBreakV1Marker>
1715
        + DataProvider<WordBreakNameToValueV1Marker>
1716
        + DataProvider<XidContinueV1Marker>
1717
        + DataProvider<GeneralCategoryMaskNameToValueV1Marker>
1718
        + DataProvider<GeneralCategoryV1Marker>
1719
        + DataProvider<ScriptNameToValueV1Marker>
1720
        + DataProvider<ScriptV1Marker>
1721
        + DataProvider<ScriptWithExtensionsPropertyV1Marker>
1722
        + DataProvider<XidStartV1Marker>,
1723
{
1724
    let dummy = Default::default();
150✔
1725
    parse_unstable_with_variables(source, &dummy, provider)
150✔
1726
}
150✔
1727

1728
#[cfg(test)]
1729
mod tests {
1730
    use core::ops::RangeInclusive;
1731
    use std::collections::HashSet;
1732

1733
    use super::*;
1734

1735
    /// Interprets consecutive pairs of code points in `s` as inclusive ranges.
    ///
    /// "aabxzz" => [a..=a, b..=x, z..=z]
    ///
    /// In debug builds, panics if `s` does not contain an even number of code points.
    fn range_iter_from_str(s: &str) -> impl Iterator<Item = RangeInclusive<u32>> {
        debug_assert_eq!(
            s.chars().count() % 2,
            0,
            "string \"{}\" does not contain an even number of code points",
            s.escape_debug()
        );
        let code_points: Vec<u32> = s.chars().map(u32::from).collect();
        // Every two adjacent code points form one (start, end) pair; a trailing unpaired
        // code point is ignored.
        let ranges: Vec<RangeInclusive<u32>> = code_points
            .chunks_exact(2)
            .map(|pair| pair[0]..=pair[1])
            .collect();

        ranges.into_iter()
    }
139✔
1758

1759
    fn assert_set_equality<'a>(
139✔
1760
        source: &str,
1761
        cpinvlistandstrlist: &CodePointInversionListAndStringList,
1762
        single: impl Iterator<Item = RangeInclusive<u32>>,
1763
        strings: impl Iterator<Item = &'a str>,
1764
    ) {
1765
        let expected_ranges: HashSet<_> = single.collect();
139✔
1766
        let actual_ranges: HashSet<_> = cpinvlistandstrlist.code_points().iter_ranges().collect();
139✔
1767
        assert_eq!(
139✔
1768
            actual_ranges,
1769
            expected_ranges,
1770
            "got unexpected ranges {:?}, expected {:?} for parsed set \"{}\"",
1771
            actual_ranges,
1772
            expected_ranges,
1773
            source.escape_debug()
×
1774
        );
1775
        let mut expected_size = cpinvlistandstrlist.code_points().size();
139✔
1776
        for s in strings {
160✔
1777
            expected_size += 1;
21✔
1778
            assert!(
21✔
1779
                cpinvlistandstrlist.contains(s),
21✔
1780
                "missing string \"{}\" from parsed set \"{}\"",
1781
                s.escape_debug(),
×
1782
                source.escape_debug()
×
1783
            );
1784
        }
1785
        let actual_size = cpinvlistandstrlist.size();
139✔
1786
        assert_eq!(
139✔
1787
            actual_size,
1788
            expected_size,
1789
            "got unexpected size {}, expected {} for parsed set \"{}\"",
1790
            actual_size,
1791
            expected_size,
1792
            source.escape_debug()
×
1793
        );
1794
    }
139✔
1795

1796
    fn assert_is_error_and_message_eq(source: &str, expected_err: &str, vm: &VariableMap<'_>) {
45✔
1797
        let result = parse_with_variables(source, vm);
45✔
1798
        assert!(result.is_err(), "{source} does not cause an error!");
45✔
1799
        let err = result.unwrap_err();
45✔
1800
        assert_eq!(err.fmt_with_source(source).to_string(), expected_err);
45✔
1801
    }
45✔
1802

1803
    /// Checks the parsed contents of patterns that reference variables.
    ///
    /// Each case is `(variable map, pattern, expected code point ranges, expected strings)`,
    /// where the ranges are encoded as consecutive pairs of code points.
    #[test]
    fn test_semantics_with_variables() {
        let mut map_char_char = VariableMap::default();
        map_char_char.insert_char("a".into(), 'a').unwrap();
        map_char_char.insert_char("var2".into(), 'z').unwrap();

        let mut map_headache = VariableMap::default();
        map_headache.insert_char("hehe".into(), '-').unwrap();

        let mut map_char_string = VariableMap::default();
        map_char_string.insert_char("a".into(), 'a').unwrap();
        map_char_string
            .insert_string("var2".into(), "abc".into())
            .unwrap();

        let (parsed_set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
        let mut map_char_set = VariableMap::default();
        map_char_set.insert_char("a".into(), 'a').unwrap();
        map_char_set.insert_set("set".into(), parsed_set).unwrap();

        let cases: Vec<(_, _, _, Vec<&str>)> = vec![
            // simple
            (&map_char_char, "[$a]", "aa", vec![]),
            (&map_char_char, "[ $a ]", "aa", vec![]),
            (&map_char_char, "[$a$]", "aa\u{ffff}\u{ffff}", vec![]),
            (&map_char_char, "[$a$ ]", "aa\u{ffff}\u{ffff}", vec![]),
            (&map_char_char, "[$a$var2]", "aazz", vec![]),
            (&map_char_char, "[$a - $var2]", "az", vec![]),
            (&map_char_char, "[$a-$var2]", "az", vec![]),
            (&map_headache, "[a $hehe z]", "aazz--", vec![]),
            (
                &map_char_char,
                "[[$]var2]",
                "\u{ffff}\u{ffff}vvaarr22",
                vec![],
            ),
            // variable prefix escaping
            (&map_char_char, r"[\$var2]", "$$vvaarr22", vec![]),
            (&map_char_char, r"[\\$var2]", r"\\zz", vec![]),
            // no variable dereferencing in strings
            (&map_char_char, "[{$a}]", "", vec!["$a"]),
            // set operations
            (&map_char_set, "[$set & [b-z]]", "bz", vec![]),
            (&map_char_set, "[[a-z]-[b-z]]", "aa", vec![]),
            (&map_char_set, "[$set-[b-z]]", "aa", vec!["Hello, World!"]),
            (&map_char_set, "[$set-$set]", "", vec![]),
            (&map_char_set, "[[a-zA]-$set]", "AA", vec![]),
            (&map_char_set, "[$set[b-z]]", "az", vec!["Hello, World!"]),
            (&map_char_set, "[[a-a]$set]", "az", vec!["Hello, World!"]),
            (&map_char_set, "$set", "az", vec!["Hello, World!"]),
            // strings
            (&map_char_string, "[$var2]", "", vec!["abc"]),
        ];
        for (variable_map, source, single, strings) in cases {
            // A parse failure is reported with the error rendered against the pattern.
            let (set, consumed) =
                parse_with_variables(source, variable_map).unwrap_or_else(|err| {
                    panic!(
                        "{source} results in an error: {}",
                        err.fmt_with_source(source)
                    )
                });
            assert_eq!(consumed, source.len(), "{source:?} is not fully consumed");
            assert_set_equality(
                source,
                &set,
                range_iter_from_str(single),
                strings.into_iter(),
            );
        }
    }
2✔
1874

1875
    /// Checks the parsed contents of patterns that do not use variables.
    ///
    /// Each case is `(pattern, expected code point ranges, expected strings)`, where the
    /// ranges are encoded as consecutive pairs of code points. Every pattern must be
    /// consumed completely.
    #[test]
    fn test_semantics() {
        const ALL_CHARS: &str = "\x00\u{10FFFF}";
        let cases: Vec<(_, _, Vec<&str>)> = vec![
            // simple
            ("[a]", "aa", vec![]),
            ("[]", "", vec![]),
            ("[qax]", "aaqqxx", vec![]),
            ("[a-z]", "az", vec![]),
            ("[--]", "--", vec![]),
            ("[a-b-]", "ab--", vec![]),
            ("[[a-b]-]", "ab--", vec![]),
            ("[{ab}-]", "--", vec!["ab"]),
            ("[-a-b]", "ab--", vec![]),
            ("[-a]", "--aa", vec![]),
            // whitespace escaping
            (r"[\n]", "\n\n", vec![]),
            ("[\\\n]", "\n\n", vec![]),
            // empty - whitespace is skipped
            ("[\n]", "", vec![]),
            ("[\u{9}]", "", vec![]),
            ("[\u{A}]", "", vec![]),
            ("[\u{B}]", "", vec![]),
            ("[\u{C}]", "", vec![]),
            ("[\u{D}]", "", vec![]),
            ("[\u{20}]", "", vec![]),
            ("[\u{85}]", "", vec![]),
            ("[\u{200E}]", "", vec![]),
            ("[\u{200F}]", "", vec![]),
            ("[\u{2028}]", "", vec![]),
            ("[\u{2029}]", "", vec![]),
            // whitespace significance:
            ("[^[^$]]", "\u{ffff}\u{ffff}", vec![]),
            ("[^[^ $]]", "\u{ffff}\u{ffff}", vec![]),
            ("[^[^ $ ]]", "\u{ffff}\u{ffff}", vec![]),
            ("[^[^a$]]", "aa\u{ffff}\u{ffff}", vec![]),
            ("[^[^a$ ]]", "aa\u{ffff}\u{ffff}", vec![]),
            ("[-]", "--", vec![]),
            ("[  -  ]", "--", vec![]),
            ("[  - -  ]", "--", vec![]),
            ("[ a-b -  ]", "ab--", vec![]),
            ("[ -a]", "--aa", vec![]),
            ("[a-]", "--aa", vec![]),
            ("[a- ]", "--aa", vec![]),
            ("[ :]", "::", vec![]),
            ("[ :L:]", "::LL", vec![]),
            // but not all "whitespace", only Pattern_White_Space:
            ("[\u{A0}]", "\u{A0}\u{A0}", vec![]), // non-breaking space
            // anchor
            ("[$]", "\u{ffff}\u{ffff}", vec![]),
            (r"[\$]", "$$", vec![]),
            ("[{$}]", "$$", vec![]),
            // set operations
            ("[[a-z]&[b-z]]", "bz", vec![]),
            ("[[a-z]-[b-z]]", "aa", vec![]),
            ("[[a-z][b-z]]", "az", vec![]),
            ("[[a-a][b-z]]", "az", vec![]),
            ("[[a-z{abc}]&[b-z{abc}{abx}]]", "bz", vec!["abc"]),
            ("[[{abx}a-z{abc}]&[b-z{abc}]]", "bz", vec!["abc"]),
            ("[[a-z{abx}]-[{abx}b-z{abc}]]", "aa", vec![]),
            ("[[a-z{abx}{abc}]-[{abx}b-z]]", "aa", vec!["abc"]),
            ("[[a-z{abc}][b-z{abx}]]", "az", vec!["abc", "abx"]),
            // strings
            ("[{this is a minus -}]", "", vec!["thisisaminus-"]),
            // associativity
            ("[[a-a][b-z] - [a-d][e-z]]", "ez", vec![]),
            ("[[a-a][b-z] - [a-d]&[e-z]]", "ez", vec![]),
            ("[[a-a][b-z] - [a-z][]]", "", vec![]),
            ("[[a-a][b-z] - [a-z]&[]]", "", vec![]),
            ("[[a-a][b-z] & [a-z]-[]]", "az", vec![]),
            ("[[a-a][b-z] & []-[a-z]]", "", vec![]),
            ("[[a-a][b-z] & [a-b][x-z]]", "abxz", vec![]),
            ("[[a-z]-[a-b]-[y-z]]", "cx", vec![]),
            // escape tests
            (r"[\x61-\x63]", "ac", vec![]),
            (r"[a-\x63]", "ac", vec![]),
            (r"[\x61-c]", "ac", vec![]),
            (r"[\u0061-\x63]", "ac", vec![]),
            (r"[\U00000061-\x63]", "ac", vec![]),
            (r"[\x{61}-\x63]", "ac", vec![]),
            (r"[\u{61}-\x63]", "ac", vec![]),
            (r"[\u{61}{hello\ world}]", "aa", vec!["hello world"]),
            (r"[{hello\ world}\u{61}]", "aa", vec!["hello world"]),
            (r"[{h\u{65}llo\ world}]", "", vec!["hello world"]),
            // complement tests
            (r"[^]", ALL_CHARS, vec![]),
            (r"[[^]-[^a-z]]", "az", vec![]),
            (r"[^{h\u{65}llo\ world}]", ALL_CHARS, vec![]),
            (
                r"[^[{h\u{65}llo\ world}]-[{hello\ world}]]",
                ALL_CHARS,
                vec![],
            ),
            (
                r"[^[\x00-\U0010FFFF]-[\u0100-\U0010FFFF]]",
                "\u{100}\u{10FFFF}",
                vec![],
            ),
            (r"[^[^a-z]]", "az", vec![]),
            (r"[^[^\^]]", "^^", vec![]),
            (r"[{\x{61 0062   063}}]", "", vec!["abc"]),
            (r"[\x{61 0062   063}]", "ac", vec![]),
            // binary properties
            (r"[:AHex:]", "09afAF", vec![]),
            (r"[:AHex=True:]", "09afAF", vec![]),
            (r"[:AHex=T:]", "09afAF", vec![]),
            (r"[:AHex=Yes:]", "09afAF", vec![]),
            (r"[:AHex=Y:]", "09afAF", vec![]),
            (r"[:^AHex≠True:]", "09afAF", vec![]),
            (r"[:AHex≠False:]", "09afAF", vec![]),
            (r"[[:^AHex≠False:]&[\x00-\x10]]", "\0\x10", vec![]),
            (r"\p{AHex}", "09afAF", vec![]),
            (r"\p{AHex=True}", "09afAF", vec![]),
            (r"\p{AHex=T}", "09afAF", vec![]),
            (r"\p{AHex=Yes}", "09afAF", vec![]),
            (r"\p{AHex=Y}", "09afAF", vec![]),
            (r"\P{AHex≠True}", "09afAF", vec![]),
            (r"\p{AHex≠False}", "09afAF", vec![]),
            // general category
            (r"[[:gc=lower-case-letter:]&[a-zA-Z]]", "az", vec![]),
            (r"[[:lower case letter:]&[a-zA-Z]]", "az", vec![]),
            // general category groups
            // equivalence between L and the union of all the L* categories
            (
                r"[[[:L:]-[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]][[\p{Ll}\p{Lt}\p{Lu}\p{Lo}\p{Lm}]-[:L:]]]",
                "",
                vec![],
            ),
            // script
            (r"[[:sc=latn:]&[a-zA-Z]]", "azAZ", vec![]),
            (r"[[:sc=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
            (r"[[:Latin:]&[a-zA-Z]]", "azAZ", vec![]),
            (r"[[:latn:]&[a-zA-Z]]", "azAZ", vec![]),
            // script extensions
            (r"[[:scx=latn:]&[a-zA-Z]]", "azAZ", vec![]),
            (r"[[:scx=Latin:]&[a-zA-Z]]", "azAZ", vec![]),
            (r"[[:scx=Hira:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
            (r"[[:sc=Hira:]&[\u30FC]]", "", vec![]),
            (r"[[:scx=Kana:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
            (r"[[:sc=Kana:]&[\u30FC]]", "", vec![]),
            (r"[[:sc=Common:]&[\u30FC]]", "\u{30FC}\u{30FC}", vec![]),
            // grapheme cluster break
            (
                r"\p{Grapheme_Cluster_Break=ZWJ}",
                "\u{200D}\u{200D}",
                vec![],
            ),
            // sentence break
            (
                r"\p{Sentence_Break=ATerm}",
                "\u{002E}\u{002E}\u{2024}\u{2024}\u{FE52}\u{FE52}\u{FF0E}\u{FF0E}",
                vec![],
            ),
            // word break
            (r"\p{Word_Break=Single_Quote}", "\u{0027}\u{0027}", vec![]),
            // more syntax edge cases from UTS35 directly
            (r"[\^a]", "^^aa", vec![]),
            (r"[{{}]", "{{", vec![]),
            (r"[{}}]", "}}", vec![""]),
            (r"[}]", "}}", vec![]),
            (r"[{$var}]", "", vec!["$var"]),
            (r"[{[a-z}]", "", vec!["[a-z"]),
            (r"[ { [ a - z } ]", "", vec!["[a-z"]),
            // TODO(#3556): Add more tests (specifically conformance tests if they exist)
        ];
        for (source, single, strings) in cases {
            let parsed = parse(source);
            if let Err(err) = parsed {
                panic!(
                    "{source} results in an error: {}",
                    err.fmt_with_source(source)
                );
            }
            let (set, consumed) = parsed.unwrap();
            // Name the offending pattern on failure; with this many cases a bare
            // left/right mismatch does not identify which one was only partially parsed.
            assert_eq!(consumed, source.len(), "{source:?} is not fully consumed");
            assert_set_equality(
                source,
                &set,
                range_iter_from_str(single),
                strings.into_iter(),
            );
        }
    }
2✔
2058

2059
    /// Checks the rendered error messages for invalid patterns that reference variables.
    ///
    /// Each case is `(variable map, pattern, expected rendered error)`.
    #[test]
    fn test_error_messages_with_variables() {
        let mut map_char_char = VariableMap::default();
        map_char_char.insert_char("a".into(), 'a').unwrap();
        map_char_char.insert_char("var2".into(), 'z').unwrap();

        let mut map_char_string = VariableMap::default();
        map_char_string.insert_char("a".into(), 'a').unwrap();
        map_char_string
            .insert_string("var2".into(), "abc".into())
            .unwrap();

        let (parsed_set, _) = parse(r"[a-z {Hello,\ World!}]").unwrap();
        let mut map_char_set = VariableMap::default();
        map_char_set.insert_char("a".into(), 'a').unwrap();
        map_char_set.insert_set("set".into(), parsed_set).unwrap();

        let cases = [
            (&map_char_char, "[$$a]", r"[$$a← error: unexpected variable"),
            (
                &map_char_char,
                "[$ a]",
                r"[$ a← error: unexpected character 'a'",
            ),
            (&map_char_char, "$a", r"$a← error: unexpected variable"),
            (&map_char_char, "$", r"$← error: unexpected end of input"),
            (
                &map_char_string,
                "[$var2-$a]",
                r"[$var2-$a← error: unexpected variable",
            ),
            (
                &map_char_string,
                "[$a-$var2]",
                r"[$a-$var2← error: unexpected variable",
            ),
            (
                &map_char_set,
                "[$a-$set]",
                r"[$a-$set← error: unexpected variable",
            ),
            (
                &map_char_set,
                "[$set-$a]",
                r"[$set-$a← error: unexpected variable",
            ),
        ];
        for case in cases {
            let (variable_map, source, expected_err) = case;
            assert_is_error_and_message_eq(source, expected_err, variable_map);
        }
    }
2✔
2110

2111
    // Verifies the error text reported for malformed inputs when no variables
    // are defined. Each expected message is the input up to the reported error
    // position, followed by `← error: …`.
    #[test]
    fn test_error_messages() {
        let no_vars = VariableMap::default();
        // Runs one case against the empty variable map.
        let check = |source: &str, expected: &str| {
            assert_is_error_and_message_eq(source, expected, &no_vars);
        };

        check(r"[a-z[\]]", r"[a-z[\]]← error: unexpected end of input");
        check(r"", r"← error: unexpected end of input");
        check(r"[{]", r"[{]← error: unexpected end of input");

        // ECMA-262 is matched strictly, so property names are case-sensitive.
        check(
            r"[:general_category:]",
            r"[:general_category← error: unknown property",
        );
        check(r"[:ll=true:]", r"[:ll=true← error: unknown property");
        check(r"[:=", r"[:=← error: unexpected character '='");

        // An empty property name is rejected.
        check(r"[::]", r"[::← error: unexpected character ':'");
        check(r"[:=hello:]", r"[:=← error: unexpected character '='");

        // An empty property value is rejected.
        check(r"[:gc=:]", r"[:gc=:← error: unexpected character ':'");
        check(r"[\xag]", r"[\xag← error: unexpected character 'g'");
        check(r"[a-b-z]", r"[a-b-z← error: unexpected character 'z'");

        // TODO(#3558): Might be better as "[a-\p← error: unexpected character 'p'"?
        check(r"[a-\p{ll}]", r"[a-\← error: unexpected character '\\'");
        check(r"[a-&]", r"[a-&← error: unexpected character '&'");
        check(r"[a&b]", r"[a&← error: unexpected character '&'");
        check(r"[[set]&b]", r"[[set]&b← error: unexpected character 'b'");
        check(r"[[set]&]", r"[[set]&]← error: unexpected character ']'");
        check(r"[a-\x60]", r"[a-\x60← error: unexpected character '`'");
        check(r"[a-`]", r"[a-`← error: unexpected character '`'");
        check(r"[\x{6g}]", r"[\x{6g← error: unexpected character 'g'");
        check(r"[\x{g}]", r"[\x{g← error: unexpected character 'g'");
        check(r"[\x{}]", r"[\x{}← error: unexpected character '}'");
        check(
            r"[\x{dabeef}]",
            r"[\x{dabeef← error: invalid escape sequence",
        );
        check(
            r"[\x{10ffff0}]",
            r"[\x{10ffff0← error: unexpected character '0'",
        );
        check(
            r"[\x{11ffff}]",
            r"[\x{11ffff← error: invalid escape sequence",
        );
        check(
            r"[\x{10ffff 1 10ffff0}]",
            r"[\x{10ffff 1 10ffff0← error: unexpected character '0'",
        );

        // Characters that take more than one byte in UTF-8.
        check(r"ä", r"ä← error: unexpected character 'ä'");
        // NOTE(review): the next two cases look identical here; presumably they
        // differ only in the Unicode normalization form of `ä` — confirm.
        check(r"\p{gc=ä}", r"\p{gc=ä← error: unknown property");
        check(r"\p{gc=ä}", r"\p{gc=ä← error: unknown property");
        check(
            r"[\xe5-\xe4]",
            r"[\xe5-\xe4← error: unexpected character 'ä'",
        );
        check(r"[\xe5-ä]", r"[\xe5-ä← error: unexpected character 'ä'");

        // Places where whitespace is significant.
        check(r"[ ^]", r"[ ^← error: unexpected character '^'");
        check(r"[:]", r"[:]← error: unexpected character ']'");
        check(r"[:L]", r"[:L]← error: unexpected character ']'");
        check(r"\p {L}", r"\p ← error: unexpected character ' '");

        // Multi-escapes may not be used as range endpoints.
        check(
            r"[\x{61 62}-d]",
            r"[\x{61 62}-d← error: unexpected character 'd'",
        );
        check(
            r"[\x{61 63}-\x{62 64}]",
            r"[\x{61 63}-\← error: unexpected character '\\'",
        );

        // TODO(#3558): This is a bad error message.
        check(r"[a-\x{62 64}]", r"[a-\← error: unexpected character '\\'");
    }
2✔
2189

2190
    // Verifies the "consumed" count returned by `parse` and
    // `parse_with_variables` when the input has trailing content after the
    // first set.
    #[test]
    fn test_consumed() {
        // Each row: (string whose byte length is the expected consumed count, full input).
        let table = [
            (r"[a-z\]{[}]", r"[a-z\]{[}][]"),
            (r"[a-z\]{[}]", r"[a-z\]{[}] []"),
            (r"[a-z\]{[}]", r"[a-z\]{]}] []"),
            (r"[a-z\]{{[}]", r"[a-z\]{{]}] []"),
            (r"[a-z\]{[}]", r"[a-z\]{]}]\p{L}"),
            (r"[a-z\]{[}]", r"[a-z\]{]}]$var"),
        ];

        let vm = Default::default();
        for &(prefix, input) in table.iter() {
            let want = prefix.len();

            // Entry point without variables.
            let consumed_plain = parse(input).unwrap().1;
            assert_eq!(want, consumed_plain);

            // Entry point with an empty variable map must report the same count.
            let consumed_with_vars = parse_with_variables(input, &vm).unwrap().1;
            assert_eq!(want, consumed_with_vars);
        }
    }
2✔
2209
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc