• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pomsky-lang / pomsky / 12301483439

12 Dec 2024 05:19PM UTC coverage: 80.275% (-0.2%) from 80.471%
12301483439

push

github

Aloso
feat: test command

360 of 593 new or added lines in 11 files covered. (60.71%)

20 existing lines in 7 files now uncovered.

4607 of 5739 relevant lines covered (80.28%)

374427.68 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

67.03
/pomsky-lib/src/regex/mod.rs
1
use std::borrow::Borrow;
2

3
use pomsky_syntax::{
4
    exprs::{
5
        BoundaryKind, Category, CodeBlock, LookaroundKind, OtherProperties, RepetitionKind, Script,
6
        ScriptExtension,
7
    },
8
    Span,
9
};
10

11
use crate::{
12
    compile::CompileResult,
13
    diagnose::{CompileErrorKind, Feature, IllegalNegationKind},
14
    exprs::{
15
        alternation::RegexAlternation,
16
        boundary::boundary_kind_codegen,
17
        char_class::{RegexCharSet, RegexCharSetItem, RegexCompoundCharSet},
18
        group::{RegexGroup, RegexGroupKind},
19
        literal,
20
        lookaround::RegexLookaround,
21
        recursion,
22
        reference::RegexReference,
23
        repetition::RegexRepetition,
24
    },
25
    options::RegexFlavor,
26
};
27

28
mod optimize;
29

30
pub(super) use optimize::Count;
31

32
#[cfg_attr(feature = "dbg", derive(Debug))]
33
pub(crate) enum Regex {
34
    /// A literal string
35
    Literal(String),
36
    /// A regex string that is inserted verbatim into the output
37
    Unescaped(String),
38
    /// A character class, delimited with square brackets
39
    CharSet(RegexCharSet),
40
    /// A character class, delimited with square brackets
41
    CompoundCharSet(RegexCompoundCharSet),
42
    /// A Unicode grapheme
43
    Grapheme,
44
    /// The dot, matching anything except `\n`
45
    Dot,
46
    /// A group, i.e. a sequence of rules, possibly wrapped in parentheses.
47
    Group(RegexGroup),
48
    /// An alternation, i.e. a list of alternatives; at least one of them has to
49
    /// match.
50
    Alternation(RegexAlternation),
51
    /// A repetition, i.e. a expression that must be repeated. The number of
52
    /// required repetitions is constrained by a lower and possibly an upper
53
    /// bound.
54
    Repetition(Box<RegexRepetition>),
55
    /// A boundary (start of string, end of string or word boundary).
56
    Boundary(BoundaryKind),
57
    /// A (positive or negative) lookahead or lookbehind.
58
    Lookaround(Box<RegexLookaround>),
59
    /// A backreference or forward reference.
60
    Reference(RegexReference),
61
    /// Recursively matches the entire regex.
62
    Recursion,
63
}
64

65
impl Regex {
66
    pub(super) fn validate_in_lookbehind_py(&self) -> Result<Option<u32>, CompileErrorKind> {
1✔
67
        match self {
1✔
68
            Regex::Literal(str) => Ok(Some(str.chars().count() as u32)),
1✔
69
            Regex::Unescaped(_) => Ok(None),
×
70
            Regex::CharSet(_) | Regex::CompoundCharSet(_) => Ok(Some(1)),
×
71
            Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind {
×
72
                flavor: RegexFlavor::Python,
×
73
                feature: Feature::Grapheme,
×
74
            }),
×
75
            Regex::Dot => Ok(Some(1)),
×
76
            Regex::Group(g) => g.parts.iter().try_fold(Some(0), |acc, part| {
×
77
                Ok(match (acc, part.validate_in_lookbehind_py()?) {
×
78
                    (Some(a), Some(b)) => Some(a + b),
×
79
                    _ => None,
×
80
                })
81
            }),
×
82
            Regex::Alternation(alt) => {
×
83
                let mut count = None;
×
84
                for part in &alt.parts {
×
85
                    let c = part.validate_in_lookbehind_py()?;
×
86
                    count = match (count, c) {
×
87
                        (Some(a), Some(b)) if a == b => Some(a),
×
88
                        (Some(_), Some(_)) => {
89
                            return Err(CompileErrorKind::LookbehindNotConstantLength {
×
90
                                flavor: RegexFlavor::Python,
×
91
                            })
×
92
                        }
93
                        (Some(a), None) | (None, Some(a)) => Some(a),
×
94
                        _ => None,
×
95
                    };
96
                }
97
                Ok(count)
×
98
            }
99
            Regex::Repetition(r) => {
×
100
                if let RepetitionKind { lower_bound, upper_bound: Some(upper) } = r.kind {
×
101
                    if lower_bound == upper {
×
102
                        return Ok(Some(upper));
×
103
                    }
×
104
                }
×
105
                Err(CompileErrorKind::LookbehindNotConstantLength { flavor: RegexFlavor::Python })
×
106
            }
107
            Regex::Boundary(_) => Ok(Some(0)),
×
108
            Regex::Lookaround(_) => Ok(Some(0)),
×
109
            Regex::Reference(_) => Ok(None), // TODO: somehow get the length of the referenced group
×
110
            Regex::Recursion => unreachable!("not supported in python"),
×
111
        }
112
    }
1✔
113

114
    pub(super) fn validate_in_lookbehind_pcre(&self) -> Result<(), CompileErrorKind> {
9✔
115
        match self {
9✔
116
            Regex::Literal(_) => Ok(()),
9✔
117
            Regex::Unescaped(_) => Ok(()),
×
118
            Regex::CharSet(_) | Regex::CompoundCharSet(_) => Ok(()),
×
119
            Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind {
×
120
                flavor: RegexFlavor::Pcre,
×
121
                feature: Feature::Grapheme,
×
122
            }),
×
123
            Regex::Dot => Ok(()),
×
124
            Regex::Group(g) => {
×
125
                for part in &g.parts {
×
126
                    part.validate_in_lookbehind_pcre()?;
×
127
                }
128
                Ok(())
×
129
            }
130
            Regex::Alternation(alt) => {
×
131
                for part in &alt.parts {
×
132
                    part.validate_in_lookbehind_pcre()?;
×
133
                }
134
                Ok(())
×
135
            }
136
            Regex::Repetition(r) => match r.kind.upper_bound {
×
137
                Some(_) => Ok(()),
×
138
                _ => {
139
                    Err(CompileErrorKind::LookbehindNotConstantLength { flavor: RegexFlavor::Pcre })
×
140
                }
141
            },
142
            Regex::Boundary(_) => Ok(()),
×
143
            Regex::Lookaround(_) => Ok(()),
×
144
            Regex::Reference(_) => Ok(()), // TODO: somehow check the referenced group
×
145
            Regex::Recursion => Err(CompileErrorKind::UnsupportedInLookbehind {
×
146
                flavor: RegexFlavor::Pcre,
×
147
                feature: Feature::Recursion,
×
148
            }),
×
149
        }
150
    }
9✔
151

152
    pub(super) fn is_single_char(&self) -> bool {
152✔
153
        if let Regex::Literal(l) = self {
152✔
154
            !l.is_empty() && l.chars().nth(1).is_none()
31✔
155
        } else {
156
            matches!(self, Regex::CharSet(_))
121✔
157
        }
158
    }
152✔
159
}
160

161
impl Default for Regex {
162
    fn default() -> Self {
10✔
163
        Regex::Literal("".into())
10✔
164
    }
10✔
165
}
166

167
/// A shorthand character class.
#[derive(Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "dbg", derive(Debug))]
pub(crate) enum RegexShorthand {
    Word,
    Digit,
    Space,
    NotWord,
    NotDigit,
    NotSpace,
    VertSpace,
    HorizSpace,
}

impl RegexShorthand {
    /// Returns the opposite shorthand, or `None` for the shorthands that have
    /// no negated counterpart (`VertSpace` and `HorizSpace`).
    pub(crate) fn negate(&self) -> Option<RegexShorthand> {
        match *self {
            Self::Word => Some(Self::NotWord),
            Self::NotWord => Some(Self::Word),
            Self::Digit => Some(Self::NotDigit),
            Self::NotDigit => Some(Self::Digit),
            Self::Space => Some(Self::NotSpace),
            Self::NotSpace => Some(Self::Space),
            Self::VertSpace | Self::HorizSpace => None,
        }
    }

    /// Returns the name of the shorthand; negated shorthands are prefixed
    /// with `!`.
    pub(crate) fn as_str(&self) -> &'static str {
        match *self {
            Self::Word => "word",
            Self::NotWord => "!word",
            Self::Digit => "digit",
            Self::NotDigit => "!digit",
            Self::Space => "space",
            Self::NotSpace => "!space",
            Self::VertSpace => "vert_space",
            Self::HorizSpace => "horiz_space",
        }
    }
}
207

208
#[derive(Clone, Copy, PartialEq, Eq)]
209
#[cfg_attr(feature = "dbg", derive(Debug))]
210
pub(crate) enum RegexProperty {
211
    Category(Category),
212
    Script(Script, ScriptExtension),
213
    Block(CodeBlock),
214
    Other(OtherProperties),
215
}
216

217
impl RegexProperty {
218
    pub fn as_str(&self) -> &'static str {
2✔
219
        match self {
2✔
220
            RegexProperty::Category(c) => c.as_str(),
×
221
            RegexProperty::Script(s, _) => s.as_str(),
×
222
            RegexProperty::Block(b) => b.as_str(),
×
223
            RegexProperty::Other(o) => o.as_str(),
2✔
224
        }
225
    }
2✔
226

227
    pub fn prefix_as_str(&self) -> &'static str {
2✔
228
        match self {
×
229
            RegexProperty::Script(_, ScriptExtension::No) => "sc:",
×
230
            RegexProperty::Script(_, ScriptExtension::Yes) => "scx:",
×
231
            _ => "",
2✔
232
        }
233
    }
2✔
234

235
    pub(crate) fn negative_item(self, negative: bool) -> RegexCharSetItem {
74✔
236
        RegexCharSetItem::Property { negative, value: self }
74✔
237
    }
74✔
238
}
239

240
impl Regex {
241
    pub(crate) fn negate(self, not_span: Span, flavor: RegexFlavor) -> CompileResult {
82✔
242
        match self {
2✔
243
            Regex::Literal(l) => {
14✔
244
                let mut iter = l.chars();
14✔
245
                let Some(c) = iter.next().and_then(|c| iter.next().is_none().then_some(c)) else {
14✔
246
                    return Err(CompileErrorKind::IllegalNegation {
2✔
247
                        kind: IllegalNegationKind::Literal(l.to_string()),
2✔
248
                    }
2✔
249
                    .at(not_span));
2✔
250
                };
251
                if flavor == RegexFlavor::DotNet && c.len_utf16() > 1 {
12✔
252
                    return Err(CompileErrorKind::IllegalNegation {
2✔
253
                        kind: IllegalNegationKind::DotNetChar(c),
2✔
254
                    }
2✔
255
                    .at(not_span));
2✔
256
                }
10✔
257
                Ok(Regex::CharSet(RegexCharSet::new(c.into()).negate()))
10✔
258
            }
259
            Regex::CharSet(s) => Ok(Regex::CharSet(s.negate())),
43✔
260
            Regex::CompoundCharSet(s) => Ok(Regex::CompoundCharSet(s.negate())),
×
261
            Regex::Boundary(b) => match b {
13✔
262
                BoundaryKind::Word => Ok(Regex::Boundary(BoundaryKind::NotWord)),
10✔
263
                BoundaryKind::NotWord => Ok(Regex::Boundary(BoundaryKind::Word)),
1✔
264
                _ => Err(CompileErrorKind::IllegalNegation { kind: IllegalNegationKind::Boundary }
2✔
265
                    .at(not_span)),
2✔
266
            },
267
            Regex::Lookaround(mut l) => {
9✔
268
                l.kind = match l.kind {
9✔
269
                    LookaroundKind::Ahead => LookaroundKind::AheadNegative,
1✔
270
                    LookaroundKind::Behind => LookaroundKind::BehindNegative,
7✔
271
                    LookaroundKind::AheadNegative => LookaroundKind::Ahead,
×
272
                    LookaroundKind::BehindNegative => LookaroundKind::Behind,
1✔
273
                };
274
                Ok(Regex::Lookaround(l))
9✔
275
            }
276
            Regex::Group(mut g)
2✔
277
                if matches!(g.kind, RegexGroupKind::Normal) && g.parts.len() == 1 =>
2✔
278
            {
2✔
279
                g.parts.pop().unwrap().negate(not_span, flavor)
2✔
280
            }
281

282
            Regex::Unescaped(_)
283
            | Regex::Grapheme
284
            | Regex::Dot
285
            | Regex::Group(_)
286
            | Regex::Alternation(_)
287
            | Regex::Repetition(_)
288
            | Regex::Reference(_)
289
            | Regex::Recursion => Err(CompileErrorKind::IllegalNegation {
290
                kind: match self {
1✔
291
                    Regex::Unescaped(_) => IllegalNegationKind::Unescaped,
2,147,483,647✔
292
                    Regex::Grapheme => IllegalNegationKind::Grapheme,
1✔
293
                    Regex::Dot => IllegalNegationKind::Dot,
×
294
                    Regex::Group(_) => IllegalNegationKind::Group,
×
295
                    Regex::Alternation(_) => IllegalNegationKind::Alternation,
1✔
296
                    Regex::Repetition(_) => IllegalNegationKind::Repetition,
×
297
                    Regex::Reference(_) => IllegalNegationKind::Reference,
×
298
                    Regex::Recursion => IllegalNegationKind::Recursion,
×
UNCOV
299
                    _ => unreachable!(),
×
300
                },
301
            }
302
            .at(not_span)),
1✔
303
        }
304
    }
82✔
305

306
    pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor) {
1,358✔
307
        match self {
1,358✔
308
            Regex::Literal(l) => {
250✔
309
                // normalize line breaks: within string literals, \r, \n and \r\n should be
250✔
310
                // converted to \n
250✔
311
                let mut chars = l.chars();
250✔
312
                while let Some(c) = chars.next() {
944✔
313
                    if c == '\r' {
694✔
314
                        literal::codegen_char_esc('\n', buf, flavor);
×
315
                        match chars.next() {
×
316
                            Some('\n') | None => {}
×
317
                            Some(c) => literal::codegen_char_esc(c, buf, flavor),
×
318
                        }
319
                    } else {
694✔
320
                        literal::codegen_char_esc(c, buf, flavor);
694✔
321
                    }
694✔
322
                }
323
            }
324
            Regex::Unescaped(u) => {
9✔
325
                buf.push_str(u);
9✔
326
            }
9✔
327
            Regex::CharSet(c) => c.codegen(buf, flavor, false),
413✔
328
            Regex::CompoundCharSet(c) => c.codegen(buf, flavor),
5✔
329
            Regex::Grapheme => buf.push_str("\\X"),
6✔
330
            Regex::Dot => buf.push('.'),
24✔
331
            Regex::Group(g) => g.codegen(buf, flavor),
300✔
332
            Regex::Alternation(a) => a.codegen(buf, flavor),
62✔
333
            Regex::Repetition(r) => r.codegen(buf, flavor),
172✔
334
            Regex::Boundary(b) => boundary_kind_codegen(*b, buf, flavor),
52✔
335
            Regex::Lookaround(l) => l.codegen(buf, flavor),
30✔
336
            Regex::Reference(r) => r.codegen(buf),
33✔
337
            Regex::Recursion => recursion::codegen(buf, flavor),
2✔
338
        }
339
    }
1,358✔
340

341
    pub(crate) fn needs_parens_in_sequence(&self) -> bool {
732✔
342
        match self {
732✔
343
            Regex::Alternation(_) => true,
26✔
344
            Regex::Literal(_)
345
            | Regex::Unescaped(_)
346
            | Regex::Group(_)
347
            | Regex::CharSet(_)
348
            | Regex::CompoundCharSet(_)
349
            | Regex::Grapheme
350
            | Regex::Repetition(_)
351
            | Regex::Boundary(_)
352
            | Regex::Lookaround(_)
353
            | Regex::Reference(_)
354
            | Regex::Dot
355
            | Regex::Recursion => false,
706✔
356
        }
357
    }
732✔
358

359
    pub(crate) fn needs_parens_before_repetition(&self, flavor: RegexFlavor) -> bool {
172✔
360
        match self {
172✔
361
            Regex::Literal(l) => literal::needs_parens_before_repetition(l.borrow()),
43✔
362
            Regex::Group(g) => g.needs_parens_before_repetition(flavor),
26✔
363
            Regex::Repetition(_)
364
            | Regex::Alternation(_)
365
            | Regex::Boundary(_)
366
            | Regex::Unescaped(_) => true,
13✔
367
            Regex::Lookaround(_) => matches!(flavor, RegexFlavor::JavaScript),
12✔
368
            Regex::CharSet(_)
369
            | Regex::CompoundCharSet(_)
370
            | Regex::Grapheme
371
            | Regex::Reference(_)
372
            | Regex::Dot
373
            | Regex::Recursion => false,
78✔
374
        }
375
    }
172✔
376

377
    pub(crate) fn result_is_empty(&self) -> bool {
13✔
378
        match self {
13✔
379
            Regex::Literal(l) => l.is_empty(),
2✔
380
            Regex::Group(g) => g.parts.iter().all(Regex::result_is_empty),
4✔
381
            Regex::Unescaped(r) => r.is_empty(),
×
382
            Regex::Repetition(r) => r.content.result_is_empty(),
×
383
            _ => false,
7✔
384
        }
385
    }
13✔
386

387
    pub(crate) fn is_assertion(&self) -> bool {
13✔
388
        match self {
7✔
389
            Regex::Lookaround(_) | Regex::Boundary(_) => true,
6✔
390
            Regex::Group(g) if matches!(g.kind, RegexGroupKind::Normal) => {
7✔
391
                let mut iter = g.parts.iter().filter(|part| !part.result_is_empty());
8✔
392
                iter.next().is_some_and(Regex::is_assertion) && iter.next().is_none()
7✔
393
            }
394
            Regex::Alternation(g) => g.parts.iter().any(Regex::is_assertion),
×
395
            _ => false,
×
396
        }
397
    }
13✔
398
}
399

400
impl RegexShorthand {
401
    pub(crate) fn codegen(self, buf: &mut String) {
88✔
402
        match self {
88✔
403
            RegexShorthand::Word => buf.push_str("\\w"),
14✔
404
            RegexShorthand::Digit => buf.push_str("\\d"),
16✔
405
            RegexShorthand::Space => buf.push_str("\\s"),
25✔
406
            RegexShorthand::NotWord => buf.push_str("\\W"),
7✔
407
            RegexShorthand::NotDigit => buf.push_str("\\D"),
2✔
408
            RegexShorthand::NotSpace => buf.push_str("\\S"),
18✔
409
            RegexShorthand::VertSpace => buf.push_str("\\v"),
3✔
410
            RegexShorthand::HorizSpace => buf.push_str("\\h"),
3✔
411
        }
412
    }
88✔
413
}
414

415
impl RegexProperty {
416
    pub(crate) fn codegen(self, buf: &mut String, negative: bool, flavor: RegexFlavor) {
71✔
417
        let is_single = matches!(
71✔
418
            (self, flavor),
71✔
419
            (
420
                RegexProperty::Category(
421
                    Category::Letter
422
                        | Category::Mark
423
                        | Category::Number
424
                        | Category::Punctuation
425
                        | Category::Symbol
426
                        | Category::Separator
427
                        | Category::Other
428
                ),
429
                RegexFlavor::Java | RegexFlavor::Pcre | RegexFlavor::Rust | RegexFlavor::Ruby,
430
            )
431
        );
432
        if negative {
71✔
433
            buf.push_str("\\P");
20✔
434
        } else {
51✔
435
            buf.push_str("\\p");
51✔
436
        }
51✔
437
        if !is_single {
71✔
438
            buf.push('{');
65✔
439
        }
65✔
440

441
        match self {
71✔
442
            RegexProperty::Category(c) => {
29✔
443
                buf.push_str(c.as_str());
29✔
444
            }
29✔
445
            RegexProperty::Script(s, e) => {
26✔
446
                if matches!(flavor, RegexFlavor::JavaScript | RegexFlavor::Java)
26✔
447
                    || e != ScriptExtension::Unspecified
16✔
448
                {
449
                    buf.push_str(if let ScriptExtension::Yes = e { "scx=" } else { "sc=" });
13✔
450
                }
13✔
451
                buf.push_str(s.as_str());
26✔
452
            }
453
            RegexProperty::Block(b) => match flavor {
8✔
454
                RegexFlavor::DotNet => {
3✔
455
                    buf.push_str("Is");
3✔
456
                    buf.push_str(&b.as_str().replace("_And_", "_and_").replace('_', ""));
3✔
457
                }
3✔
458
                RegexFlavor::Java => {
459
                    buf.push_str("In");
2✔
460
                    match b {
2✔
461
                        // Java, for whatever reason, chose to only support the *2nd alias* for these blocks
462
                        // (see PropertyValueAliases.txt)
463
                        CodeBlock::Cyrillic_Supplement => buf.push_str("Cyrillic_Supplementary"),
×
464
                        CodeBlock::Combining_Diacritical_Marks_For_Symbols => {
465
                            buf.push_str("Combining_Marks_For_Symbols")
×
466
                        }
467
                        _ => buf.push_str(&b.as_str().replace('-', "_")),
2✔
468
                    };
469
                }
470
                RegexFlavor::Ruby => {
3✔
471
                    buf.push_str("In");
3✔
472
                    buf.push_str(b.as_str());
3✔
473
                }
3✔
474
                _ => panic!("No other flavors support Unicode blocks"),
×
475
            },
476
            RegexProperty::Other(o) => {
8✔
477
                if flavor == RegexFlavor::Java {
8✔
478
                    buf.push_str("Is");
×
479
                }
8✔
480
                buf.push_str(o.as_str());
8✔
481
            }
482
        }
483

484
        if !is_single {
71✔
485
            buf.push('}');
65✔
486
        }
65✔
487
    }
71✔
488
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc