• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pomsky-lang / pomsky / 12019379442

25 Nov 2024 09:47PM UTC coverage: 82.942% (-0.007%) from 82.949%
12019379442

push

github

Aloso
fix e2e test

4269 of 5147 relevant lines covered (82.94%)

417486.1 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.1
/pomsky-lib/src/exprs/char_class.rs
1
//! Implements _character classes_. The analogue in the regex world are
2
//! [character classes](https://www.regular-expressions.info/charclass.html),
3
//! [shorthand character classes](https://www.regular-expressions.info/shorthand.html),
4
//! [non-printable characters](https://www.regular-expressions.info/nonprint.html),
5
//! [Unicode categories/scripts/blocks](https://www.regular-expressions.info/unicode.html#category),
6
//! [POSIX classes](https://www.regular-expressions.info/posixbrackets.html#class) and the
7
//! [dot](https://www.regular-expressions.info/dot.html).
8
//!
9
//! All kinds of character classes mentioned above require `[` square brackets
10
//! `]` in Pomsky. A character class can be negated by putting a `!` in front
11
//! of the opening bracket. For example, `![.]` compiles to `\n`.
12
//!
13
//! ## Items
14
//!
15
//! A character class can contain multiple _items_, which can be
16
//!
17
//! - A __code point__, e.g. `['a']` or `[U+107]`
18
//!
19
//!   - This includes [non-printable characters](https://www.regular-expressions.info/nonprint.html).\
20
//!     Supported are `[n]`, `[r]`, `[t]`, `[a]`, `[e]` and `[f]`.
21
//!
22
//! - A __range of code points__. For example, `[U+10 - U+200]` matches any code
23
//!   point P where `U+10 ≤ P ≤ U+200`
24
//!
25
//! - A __named character class__, which can be one of
26
//!
27
//!   - a [shorthand character class](https://www.regular-expressions.info/shorthand.html).\
28
//!     Supported are `[w]`, `[d]`, `[s]`, `[h]`, `[v]` and `[R]`.
29
//!
30
//!   - a [POSIX class](https://www.regular-expressions.info/posixbrackets.html#class).\
31
//!     Supported are `[ascii_alnum]`, `[ascii_alpha]`, `[ascii]`,
32
//!     `[ascii_blank]`, `[ascii_cntrl]`, `[ascii_digit]`, `[ascii_graph]`,
33
//!     `[ascii_lower]`, `[ascii_print]`, `[ascii_punct]`, `[ascii_space]`,
34
//!     `[ascii_upper]`, `[ascii_word]` and `[ascii_xdigit]`.\ _Note_: POSIX
35
//!     classes are not Unicode aware!\ _Note_: They're converted to ranges,
36
//!     e.g. `[ascii_alpha]` = `[a-zA-Z]`.
37
//!
38
//!   - a [Unicode category, script or block](https://www.regular-expressions.info/unicode.html#category).\
39
//!     For example: `[Letter]` compiles to `\p{Letter}`. Pomsky currently
40
//!     treats any uppercase identifier except `R` as Unicode class.
41
//!
42
//! ## Compilation
43
//!
44
//! When a character class contains only a single item (e.g. `[w]`), the
45
//! character class is "flattened":
46
//!
47
//! - `['a']` = `a`
48
//! - `[w]` = `\w`
49
//! - `[Letter]` = `\p{Letter}`
50
//!
51
//! When there is more than one item or a range (e.g. `['a'-'z' '!']`), a regex
52
//! character class is created:
53
//!
54
//! - `['a'-'z' '!']` = `[a-z!]`
55
//! - `[w e Punctuation]` = `[\w\e\p{Punctuation}]`
56
//!
57
//! ### Negation
58
//!
59
//! Negation is implemented as follows:
60
//!
61
//! - Ranges and chars such as `!['a'-'z' '!' e]` are wrapped in a negative
62
//!   character class, e.g. `[^a-z!\e]`.
63
//!
64
//! - The `h`, `v` and `R` shorthands are also wrapped in a negative character
65
//!   class.
66
//!
67
//! - The `w`, `d` and `s` shorthands are negated by making them uppercase
68
//!   (`![w]` = `\W`), except when there is more than one item in the class
69
//!   (`![w '-']` = `[^\w\-]`)
70
//!
71
//! - `w`, `s`, `d` and Unicode categories/scripts/blocks can be negated
72
//!   individually _within a character class_, e.g. `[s !s]` = `[\s\S]`,
73
//!   `![!Latin 'a']` = `[^\P{Latin}a]`.
74
//!
75
//!   When a negated character class only contains 1 item, which is also
76
//!   negated, the class is removed and the negations cancel each other out:
77
//!   `![!w]` = `\w`, `![!L]` = `\p{L}`.
78

79
use std::fmt;
80

81
use crate::{
82
    compile::{CompileResult, CompileState},
83
    diagnose::{CompileError, CompileErrorKind, Feature},
84
    exprs::literal,
85
    options::{CompileOptions, RegexFlavor},
86
    regex::{Regex, RegexProperty, RegexShorthand},
87
    unicode_set::UnicodeSet,
88
};
89

90
use pomsky_syntax::{
91
    exprs::{Category, CharClass, CodeBlock, GroupItem, GroupName, OtherProperties, Script},
92
    Span,
93
};
94

95
use super::Compile;
96

97
impl Compile for CharClass {
98
    fn compile(&self, options: CompileOptions, _state: &mut CompileState<'_>) -> CompileResult {
224✔
99
        // when single, a `[!w]` can be turned into `![w]`
224✔
100
        let is_single = self.inner.len() == 1;
224✔
101
        let mut group_negative = false;
224✔
102

224✔
103
        let mut set = UnicodeSet::new();
224✔
104
        for item in &self.inner {
579✔
105
            match *item {
363✔
106
                GroupItem::Char(c) => {
204✔
107
                    if !is_single {
204✔
108
                        validate_char_in_class(c, options.flavor, self.span)?;
140✔
109
                    }
64✔
110
                    set.add_char(c)
204✔
111
                }
112
                GroupItem::Range { first, last } => {
30✔
113
                    validate_char_in_class(first, options.flavor, self.span)?;
30✔
114
                    validate_char_in_class(last, options.flavor, self.span)?;
30✔
115
                    set.add_range(first..=last);
30✔
116
                }
117
                GroupItem::Named { name, negative } => {
129✔
118
                    if self.unicode_aware {
129✔
119
                        named_class_to_regex_unicode(
102✔
120
                            name,
102✔
121
                            negative,
102✔
122
                            &mut group_negative,
102✔
123
                            is_single,
102✔
124
                            options.flavor,
102✔
125
                            self.span,
102✔
126
                            &mut set,
102✔
127
                        )?;
102✔
128
                    } else {
129
                        named_class_to_regex_ascii(
27✔
130
                            name,
27✔
131
                            negative,
27✔
132
                            options.flavor,
27✔
133
                            self.span,
27✔
134
                            &mut set,
27✔
135
                        )?;
27✔
136
                    }
137
                }
138
            }
139
        }
140

141
        // this makes it possible to use code points outside the BMP in .NET,
142
        // as long as there is only one in the character set
143
        if let Some(only_char) = set.try_into_char() {
216✔
144
            return Ok(Regex::Literal(only_char.to_string()));
64✔
145
        }
152✔
146

152✔
147
        Ok(Regex::CharSet(RegexCharSet { negative: group_negative, set }))
152✔
148
    }
224✔
149
}
150

151
fn validate_char_in_class(char: char, flavor: RegexFlavor, span: Span) -> Result<(), CompileError> {
200✔
152
    if flavor == RegexFlavor::DotNet && char > '\u{FFFF}' {
200✔
153
        Err(CompileErrorKind::Unsupported(Feature::LargeCodePointInCharClass(char), flavor)
×
154
            .at(span))
×
155
    } else {
156
        Ok(())
200✔
157
    }
158
}
200✔
159

160
pub(crate) fn check_char_class_empty(
46✔
161
    char_set: &RegexCharSet,
46✔
162
    span: Span,
46✔
163
) -> Result<(), CompileError> {
46✔
164
    if char_set.negative {
46✔
165
        if let Some((group1, group2)) = char_set.set.full_props() {
45✔
166
            return Err(CompileErrorKind::EmptyClassNegated { group1, group2 }.at(span));
3✔
167
        }
42✔
168
    }
1✔
169
    Ok(())
43✔
170
}
46✔
171

172
fn named_class_to_regex_ascii(
27✔
173
    group: GroupName,
27✔
174
    negative: bool,
27✔
175
    flavor: RegexFlavor,
27✔
176
    span: Span,
27✔
177
    set: &mut UnicodeSet,
27✔
178
) -> Result<(), CompileError> {
27✔
179
    if negative
27✔
180
        // In JS, \W and \D can be used for negation because they're ascii-only
181
        && (flavor != RegexFlavor::JavaScript
1✔
182
            || (group != GroupName::Digit && group != GroupName::Word))
×
183
    {
184
        return Err(CompileErrorKind::NegativeShorthandInAsciiMode.at(span));
1✔
185
    }
26✔
186

26✔
187
    match group {
26✔
188
        GroupName::Word => {
189
            if flavor == RegexFlavor::JavaScript {
7✔
190
                let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };
1✔
191
                set.add_prop(RegexCharSetItem::Shorthand(s));
1✔
192
            } else {
6✔
193
                // we already checked above if negative
6✔
194
                set.add_range('a'..='z');
6✔
195
                set.add_range('A'..='Z');
6✔
196
                set.add_range('0'..='9');
6✔
197
                set.add_char('_');
6✔
198
            }
6✔
199
        }
200
        GroupName::Digit => {
201
            if flavor == RegexFlavor::JavaScript {
11✔
202
                let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };
1✔
203
                set.add_prop(RegexCharSetItem::Shorthand(s));
1✔
204
            } else {
10✔
205
                // we already checked above if negative
10✔
206
                set.add_range('0'..='9');
10✔
207
            }
10✔
208
        }
209
        GroupName::Space => {
7✔
210
            set.add_char(' ');
7✔
211
            set.add_range('\x09'..='\x0D'); // \t\n\v\f\r
7✔
212
        }
7✔
213
        GroupName::HorizSpace => set.add_char('\t'),
×
214
        GroupName::VertSpace => set.add_range('\x0A'..='\x0D'),
×
215
        _ => return Err(CompileErrorKind::UnicodeInAsciiMode.at(span)),
1✔
216
    }
217
    Ok(())
25✔
218
}
27✔
219

220
fn named_class_to_regex_unicode(
102✔
221
    group: GroupName,
102✔
222
    negative: bool,
102✔
223
    group_negative: &mut bool,
102✔
224
    is_single: bool,
102✔
225
    flavor: RegexFlavor,
102✔
226
    span: Span,
102✔
227
    set: &mut UnicodeSet,
102✔
228
) -> Result<(), CompileError> {
102✔
229
    match group {
5✔
230
        GroupName::Word => {
231
            if flavor == RegexFlavor::JavaScript {
22✔
232
                if negative {
5✔
233
                    if is_single {
2✔
234
                        *group_negative ^= true;
1✔
235
                    } else {
1✔
236
                        return Err(CompileErrorKind::Unsupported(
1✔
237
                            Feature::NegativeShorthandW,
1✔
238
                            flavor,
1✔
239
                        )
1✔
240
                        .at(span));
1✔
241
                    }
242
                }
3✔
243
                set.add_prop(
4✔
244
                    RegexProperty::Other(OtherProperties::Alphabetic).negative_item(false),
4✔
245
                );
4✔
246
                set.add_prop(RegexProperty::Category(Category::Mark).negative_item(false));
4✔
247
                set.add_prop(
4✔
248
                    RegexProperty::Category(Category::Decimal_Number).negative_item(false),
4✔
249
                );
4✔
250
                set.add_prop(
4✔
251
                    RegexProperty::Category(Category::Connector_Punctuation).negative_item(false),
4✔
252
                );
4✔
253
            } else {
254
                let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };
17✔
255
                set.add_prop(RegexCharSetItem::Shorthand(s));
17✔
256
            }
257
        }
258
        GroupName::Digit => {
259
            if flavor == RegexFlavor::JavaScript {
16✔
260
                set.add_prop(
4✔
261
                    RegexProperty::Category(Category::Decimal_Number).negative_item(negative),
4✔
262
                );
4✔
263
            } else {
4✔
264
                let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };
12✔
265
                set.add_prop(RegexCharSetItem::Shorthand(s));
12✔
266
            }
267
        }
268

269
        GroupName::Space => set.add_prop(RegexCharSetItem::Shorthand(if negative {
12✔
270
            RegexShorthand::NotSpace
3✔
271
        } else {
272
            RegexShorthand::Space
9✔
273
        })),
274

275
        GroupName::HorizSpace | GroupName::VertSpace if negative => {
×
276
            return Err(CompileErrorKind::NegatedHorizVertSpace.at(span));
×
277
        }
278

279
        GroupName::HorizSpace | GroupName::VertSpace
280
            if matches!(flavor, RegexFlavor::Pcre | RegexFlavor::Java) =>
5✔
281
        {
282
            set.add_prop(RegexCharSetItem::Shorthand(if group == GroupName::HorizSpace {
6✔
283
                RegexShorthand::HorizSpace
3✔
284
            } else {
285
                RegexShorthand::VertSpace
3✔
286
            }));
287
        }
288
        GroupName::HorizSpace => {
289
            set.add_char('\t');
2✔
290
            if flavor == RegexFlavor::Python {
2✔
291
                return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
×
292
            } else {
2✔
293
                set.add_prop(
2✔
294
                    RegexProperty::Category(Category::Space_Separator).negative_item(false),
2✔
295
                );
2✔
296
            }
2✔
297
        }
298
        GroupName::VertSpace => {
2✔
299
            set.add_range('\x0A'..='\x0D');
2✔
300
            set.add_char('\u{85}');
2✔
301
            set.add_char('\u{2028}');
2✔
302
            set.add_char('\u{2029}');
2✔
303
        }
2✔
304

305
        _ if flavor == RegexFlavor::Python => {
42✔
306
            return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
2✔
307
        }
308
        GroupName::Category(c) => {
5✔
309
            if let (RegexFlavor::Rust, Category::Surrogate)
5✔
310
            | (RegexFlavor::DotNet, Category::Cased_Letter) = (flavor, c)
5✔
311
            {
312
                return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
×
313
            }
5✔
314
            set.add_prop(RegexProperty::Category(c).negative_item(negative));
5✔
315
        }
316
        GroupName::Script(s) => {
21✔
317
            if flavor == RegexFlavor::DotNet {
21✔
318
                return Err(CompileErrorKind::Unsupported(Feature::UnicodeScript, flavor).at(span));
1✔
319
            }
20✔
320
            if let (
20✔
321
                RegexFlavor::Pcre | RegexFlavor::Ruby | RegexFlavor::Java,
20✔
322
                Script::Kawi | Script::Nag_Mundari,
20✔
323
            )
20✔
324
            | (RegexFlavor::Rust, Script::Unknown) = (flavor, s)
20✔
325
            {
326
                return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
×
327
            }
20✔
328
            set.add_prop(RegexProperty::Script(s).negative_item(negative));
20✔
329
        }
330
        GroupName::CodeBlock(b) => match flavor {
7✔
331
            RegexFlavor::DotNet | RegexFlavor::Java | RegexFlavor::Ruby => {
332
                match (flavor, b) {
6✔
333
                    (
334
                        RegexFlavor::Java,
335
                        CodeBlock::Arabic_Extended_C
336
                        | CodeBlock::CJK_Unified_Ideographs_Extension_H
337
                        | CodeBlock::Combining_Diacritical_Marks_For_Symbols
338
                        | CodeBlock::Cyrillic_Extended_D
339
                        | CodeBlock::Cyrillic_Supplement
340
                        | CodeBlock::Devanagari_Extended_A
341
                        | CodeBlock::Greek_And_Coptic
342
                        | CodeBlock::Kaktovik_Numerals
343
                        | CodeBlock::No_Block,
344
                    )
345
                    | (
346
                        RegexFlavor::Ruby,
347
                        CodeBlock::Arabic_Extended_C
348
                        | CodeBlock::CJK_Unified_Ideographs_Extension_H
349
                        | CodeBlock::Cyrillic_Extended_D
350
                        | CodeBlock::Devanagari_Extended_A
351
                        | CodeBlock::Kaktovik_Numerals,
352
                    ) => {
353
                        return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
×
354
                    }
355
                    (RegexFlavor::DotNet, _) => {
356
                        let dotnet_name = b.as_str().replace("_And_", "_and_").replace('_', "");
2✔
357
                        if pomsky_syntax::blocks_supported_in_dotnet()
2✔
358
                            .binary_search(&dotnet_name.as_str())
2✔
359
                            .is_err()
2✔
360
                        {
361
                            return Err(
×
362
                                CompileErrorKind::unsupported_specific_prop_in(flavor).at(span)
×
363
                            );
×
364
                        }
2✔
365
                    }
366
                    _ => {}
4✔
367
                }
368

369
                set.add_prop(RegexProperty::Block(b).negative_item(negative));
6✔
370
            }
371
            _ => return Err(CompileErrorKind::Unsupported(Feature::UnicodeBlock, flavor).at(span)),
1✔
372
        },
373
        GroupName::OtherProperties(o) => {
7✔
374
            use OtherProperties as OP;
375
            use RegexFlavor as RF;
376

377
            if let RF::JavaScript | RF::Rust | RF::Pcre | RF::Ruby = flavor {
7✔
378
                match (flavor, o) {
7✔
379
                    (RF::JavaScript, _) => {}
4✔
380
                    (_, OP::Changes_When_NFKC_Casefolded)
381
                    | (RF::Pcre, OP::Assigned)
382
                    | (RF::Ruby, OP::Bidi_Mirrored) => {
383
                        return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
1✔
384
                    }
385
                    _ => {}
2✔
386
                }
387
                set.add_prop(RegexProperty::Other(o).negative_item(negative));
6✔
388
            } else {
389
                return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
×
390
            }
391
        }
392
    }
393
    Ok(())
96✔
394
}
102✔
395

396
#[cfg_attr(feature = "dbg", derive(Debug))]
397
pub(crate) struct RegexCharSet {
398
    negative: bool,
399
    set: UnicodeSet,
400
}
401

402
impl RegexCharSet {
403
    pub(crate) fn new(items: UnicodeSet) -> Self {
157✔
404
        Self { negative: false, set: items }
157✔
405
    }
157✔
406

407
    pub(crate) fn negate(mut self) -> Self {
46✔
408
        self.negative = !self.negative;
46✔
409
        self
46✔
410
    }
46✔
411

412
    pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor) {
304✔
413
        if self.set.len() == 1 {
304✔
414
            if let Some(range) = self.set.ranges().next() {
217✔
415
                let (first, last) = range.as_chars();
157✔
416
                if first == last && !self.negative {
157✔
417
                    return literal::codegen_char_esc(first, buf, flavor);
1✔
418
                }
156✔
419
            } else if let Some(prop) = self.set.props().next() {
60✔
420
                match prop {
60✔
421
                    RegexCharSetItem::Shorthand(s) => {
24✔
422
                        let shorthand = if self.negative { s.negate() } else { Some(s) };
24✔
423
                        if let Some(shorthand) = shorthand {
24✔
424
                            return shorthand.codegen(buf);
22✔
425
                        }
2✔
426
                    }
427
                    RegexCharSetItem::Property { negative, value } => {
36✔
428
                        return value.codegen(buf, negative ^ self.negative, flavor);
36✔
429
                    }
430
                }
431
            }
×
432
        }
87✔
433

434
        if self.negative {
245✔
435
            buf.push_str("[^");
27✔
436
        } else {
218✔
437
            buf.push('[');
218✔
438
        }
218✔
439

440
        let mut is_first = true;
245✔
441
        for prop in self.set.props() {
245✔
442
            match prop {
69✔
443
                RegexCharSetItem::Shorthand(s) => s.codegen(buf),
49✔
444
                RegexCharSetItem::Property { negative, value } => {
20✔
445
                    value.codegen(buf, negative, flavor);
20✔
446
                }
20✔
447
            }
448
            is_first = false;
69✔
449
        }
450
        for range in self.set.ranges() {
329✔
451
            let (first, last) = range.as_chars();
329✔
452
            if first == last {
329✔
453
                literal::compile_char_esc_in_class(first, buf, is_first, flavor);
90✔
454
            } else {
90✔
455
                literal::compile_char_esc_in_class(first, buf, is_first, flavor);
239✔
456
                if range.first + 1 < range.last {
239✔
457
                    buf.push('-');
199✔
458
                }
199✔
459
                literal::compile_char_esc_in_class(last, buf, false, flavor);
239✔
460
            }
461
            is_first = false;
329✔
462
        }
463

464
        buf.push(']');
245✔
465
    }
304✔
466
}
467

468
#[derive(Clone, Copy, PartialEq, Eq)]
469
pub(crate) enum RegexCharSetItem {
470
    Shorthand(RegexShorthand),
471
    Property { negative: bool, value: RegexProperty },
472
}
473

474
impl RegexCharSetItem {
475
    pub(crate) fn negate(self) -> Option<Self> {
43✔
476
        match self {
43✔
477
            RegexCharSetItem::Shorthand(s) => s.negate().map(RegexCharSetItem::Shorthand),
19✔
478
            RegexCharSetItem::Property { negative, value } => {
24✔
479
                Some(RegexCharSetItem::Property { negative: !negative, value })
24✔
480
            }
481
        }
482
    }
43✔
483
}
484

485
impl fmt::Debug for RegexCharSetItem {
486
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
6✔
487
        match self {
6✔
488
            Self::Shorthand(s) => f.write_str(s.as_str()),
4✔
489
            &Self::Property { value, negative } => {
2✔
490
                if negative {
2✔
491
                    f.write_str("!")?;
1✔
492
                }
1✔
493
                f.write_str(value.as_str())
2✔
494
            }
495
        }
496
    }
6✔
497
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc