12019379442

Committed 25 Nov 2024 09:47PM UTC coverage: 82.942% (-0.007%) from 82.949%

Build # 12019379442

Build Type

push

github

Committed by

Aloso

Commit Message

fix e2e test

Run Details

4269 of 5147 relevant lines covered (82.94%)

417486.1 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.1

/pomsky-lib/src/exprs/char_class.rs

//! Implements _character classes_. The analogues in the regex world are
//! [character classes](https://www.regular-expressions.info/charclass.html),
//! [shorthand character classes](https://www.regular-expressions.info/shorthand.html),
//! [non-printable characters](https://www.regular-expressions.info/nonprint.html),
//! [Unicode categories/scripts/blocks](https://www.regular-expressions.info/unicode.html#category),
//! [POSIX classes](https://www.regular-expressions.info/posixbrackets.html#class) and the
//! [dot](https://www.regular-expressions.info/dot.html).
//!
//! All kinds of character classes mentioned above require `[` square brackets
//! `]` in Pomsky. A character class can be negated by putting a `!` in front
//! of the opening bracket. For example, `![.]` compiles to `\n`.
//!
//! ## Items
//!
//! A character class can contain multiple _items_, which can be
//!
//! - A __code point__, e.g. `['a']` or `[U+107]`
//!
//!   - This includes [non-printable characters](https://www.regular-expressions.info/nonprint.html).\
//!     Supported are `[n]`, `[r]`, `[t]`, `[a]`, `[e]` and `[f]`.
//!
//! - A __range of code points__. For example, `[U+10 - U+200]` matches any code
//!   point P where `U+10 ≤ P ≤ U+200`
//!
//! - A __named character class__, which can be one of
//!
//!   - a [shorthand character class](https://www.regular-expressions.info/shorthand.html).\
//!     Supported are `[w]`, `[d]`, `[s]`, `[h]`, `[v]` and `[R]`.
//!
//!   - a [POSIX class](https://www.regular-expressions.info/posixbrackets.html#class).\
//!     Supported are `[ascii_alnum]`, `[ascii_alpha]`, `[ascii]`,
//!     `[ascii_blank]`, `[ascii_cntrl]`, `[ascii_digit]`, `[ascii_graph]`,
//!     `[ascii_lower]`, `[ascii_print]`, `[ascii_punct]`, `[ascii_space]`,
//!     `[ascii_upper]`, `[ascii_word]` and `[ascii_xdigit]`.\
//!     _Note_: POSIX classes are not Unicode aware!\
//!     _Note_: They're converted to ranges, e.g. `[ascii_alpha]` = `[a-zA-Z]`.
//!
//!   - a [Unicode category, script or block](https://www.regular-expressions.info/unicode.html#category).\
//!     For example: `[Letter]` compiles to `\p{Letter}`. Pomsky currently
//!     treats any uppercase identifier except `R` as Unicode class.
//!
//! ## Compilation
//!
//! When a character class contains only a single item (e.g. `[w]`), the
//! character class is "flattened":
//!
//! - `['a']` = `a`
//! - `[w]` = `\w`
//! - `[Letter]` = `\p{Letter}`
//!
//! When there is more than one item or a range (e.g. `['a'-'z' '!']`), a regex
//! character class is created:
//!
//! - `['a'-'z' '!']` = `[a-z!]`
//! - `[w e Punctuation]` = `[\w\e\p{Punctuation}]`
//!
//! ### Negation
//!
//! Negation is implemented as follows:
//!
//! - Ranges and chars such as `!['a'-'z' '!' e]` are wrapped in a negative
//!   character class, e.g. `[^a-z!\e]`.
//!
//! - The `h`, `v` and `R` shorthands are also wrapped in a negative character
//!   class.
//!
//! - The `w`, `d` and `s` shorthands are negated by making them uppercase
//!   (`![w]` = `\W`), except when there is more than one item in the class
//!   (`![w '-']` = `[^\w\-]`)
//!
//! - `w`, `s`, `d` and Unicode categories/scripts/blocks can be negated
//!   individually _within a character class_, e.g. `[s !s]` = `[\s\S]`,
//!   `![!Latin 'a']` = `[^\P{Latin}a]`.
//!
//!   When a negated character class only contains 1 item, which is also
//!   negated, the class is removed and the negations cancel each other out:
//!   `![!w]` = `\w`, `![!L]` = `\p{L}`.

use std::fmt;

use crate::{
    compile::{CompileResult, CompileState},
    diagnose::{CompileError, CompileErrorKind, Feature},
    exprs::literal,
    options::{CompileOptions, RegexFlavor},
    regex::{Regex, RegexProperty, RegexShorthand},
    unicode_set::UnicodeSet,
};

use pomsky_syntax::{
    exprs::{Category, CharClass, CodeBlock, GroupItem, GroupName, OtherProperties, Script},
    Span,
};

use super::Compile;

impl Compile for CharClass {
    /// Lowers this character class into a [`Regex`].
    ///
    /// Every item of the class is folded into a single [`UnicodeSet`]. If that set can be
    /// turned into a single `char`, a [`Regex::Literal`] is returned; otherwise the result is a
    /// [`Regex::CharSet`].
    ///
    /// # Errors
    ///
    /// Propagates the errors of [`validate_char_in_class`],
    /// [`named_class_to_regex_unicode`] and [`named_class_to_regex_ascii`].
    fn compile(&self, options: CompileOptions, _state: &mut CompileState<'_>) -> CompileResult {
        let flavor = options.flavor;
        let span = self.span;

        // with exactly one item, a `[!w]` can be turned into `![w]`
        let is_single = self.inner.len() == 1;
        let mut group_negative = false;
        let mut set = UnicodeSet::new();

        for item in &self.inner {
            match *item {
                // a lone char is not validated: it is emitted as a literal below
                GroupItem::Char(c) if is_single => set.add_char(c),
                GroupItem::Char(c) => {
                    validate_char_in_class(c, flavor, span)?;
                    set.add_char(c);
                }
                GroupItem::Range { first, last } => {
                    for bound in [first, last] {
                        validate_char_in_class(bound, flavor, span)?;
                    }
                    set.add_range(first..=last);
                }
                GroupItem::Named { name, negative } if self.unicode_aware => {
                    named_class_to_regex_unicode(
                        name,
                        negative,
                        &mut group_negative,
                        is_single,
                        flavor,
                        span,
                        &mut set,
                    )?;
                }
                GroupItem::Named { name, negative } => {
                    named_class_to_regex_ascii(name, negative, flavor, span, &mut set)?;
                }
            }
        }

        // this makes it possible to use code points outside the BMP in .NET,
        // as long as there is only one in the character set
        if let Some(only_char) = set.try_into_char() {
            return Ok(Regex::Literal(only_char.to_string()));
        }

        let char_set = RegexCharSet { negative: group_negative, set };
        Ok(Regex::CharSet(char_set))
    }
}

/// Checks that `char` may appear inside a character class for the given `flavor`.
///
/// # Errors
///
/// Returns `Unsupported(LargeCodePointInCharClass)` located at `span` when the flavor is
/// .NET and the code point lies above U+FFFF. Every other combination is accepted.
fn validate_char_in_class(char: char, flavor: RegexFlavor, span: Span) -> Result<(), CompileError> {
    let is_dotnet = flavor == RegexFlavor::DotNet;
    if !is_dotnet || char <= '\u{FFFF}' {
        return Ok(());
    }

    let kind = CompileErrorKind::Unsupported(Feature::LargeCodePointInCharClass(char), flavor);
    Err(kind.at(span))
}

/// Rejects a negated character set for which `full_props()` reports a pair of groups.
///
/// Non-negated sets are always accepted.
///
/// # Errors
///
/// Returns `EmptyClassNegated { group1, group2 }` located at `span`, carrying the pair
/// reported by `full_props()`.
pub(crate) fn check_char_class_empty(
    char_set: &RegexCharSet,
    span: Span,
) -> Result<(), CompileError> {
    if !char_set.negative {
        return Ok(());
    }

    match char_set.set.full_props() {
        Some((group1, group2)) => {
            Err(CompileErrorKind::EmptyClassNegated { group1, group2 }.at(span))
        }
        None => Ok(()),
    }
}

/// Adds the ASCII-only interpretation of the named class `group` to `set`.
///
/// This is the counterpart of `named_class_to_regex_unicode` for character classes that are
/// not Unicode aware. Shorthands are expanded to explicit ASCII ranges, except for `Word` and
/// `Digit` in the JavaScript flavor, which are added as shorthand items.
///
/// # Errors
///
/// - `NegativeShorthandInAsciiMode` if `negative` is set, unless the flavor is JavaScript and
///   `group` is `Word` or `Digit`.
/// - `UnicodeInAsciiMode` if `group` is not one of the five shorthands handled here.
///
/// Both errors are located at `span`.
fn named_class_to_regex_ascii(
    group: GroupName,
    negative: bool,
    flavor: RegexFlavor,
    span: Span,
    set: &mut UnicodeSet,
) -> Result<(), CompileError> {
    if negative
        // In JS, \W and \D can be used for negation because they're ascii-only
        && (flavor != RegexFlavor::JavaScript
            || (group != GroupName::Digit && group != GroupName::Word))
    {
        return Err(CompileErrorKind::NegativeShorthandInAsciiMode.at(span));
    }

    match group {
        GroupName::Word => {
            if flavor == RegexFlavor::JavaScript {
                let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };
                set.add_prop(RegexCharSetItem::Shorthand(s));
            } else {
                // `negative` is known to be false here, see the check above
                set.add_range('a'..='z');
                set.add_range('A'..='Z');
                set.add_range('0'..='9');
                set.add_char('_');
            }
        }
        GroupName::Digit => {
            if flavor == RegexFlavor::JavaScript {
                let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };
                set.add_prop(RegexCharSetItem::Shorthand(s));
            } else {
                // `negative` is known to be false here, see the check above
                set.add_range('0'..='9');
            }
        }
        GroupName::Space => {
            set.add_char(' ');
            set.add_range('\x09'..='\x0D'); // \t\n\v\f\r
        }
        GroupName::HorizSpace => {
            // Horizontal ASCII whitespace is the tab *and* the space. The Unicode variant
            // covers U+0020 through `Space_Separator`, so it has to be added explicitly here.
            set.add_char('\t');
            set.add_char(' ');
        }
        GroupName::VertSpace => set.add_range('\x0A'..='\x0D'), // \n\v\f\r
        _ => return Err(CompileErrorKind::UnicodeInAsciiMode.at(span)),
    }
    Ok(())
}

/// Adds the Unicode-aware interpretation of the named class `group` to `set`.
///
/// - `negative`: whether this individual item is negated (e.g. `!w`).
/// - `group_negative`: negation flag of the whole class. It is toggled only for a negated
///   `Word` in the JavaScript flavor when the class has a single item.
/// - `is_single`: whether the class consists of exactly one item.
/// - `flavor`: target regex flavor, which decides how (and whether) the class can be expressed.
/// - `span`: location attached to every error returned from here.
///
/// # Errors
///
/// Returns an error when the requested class (or its negation) is not handled for `flavor`;
/// see the individual match arms for the exact conditions.
///
/// The order of the match arms matters: guarded arms must come before the unguarded arms
/// for the same variants.
fn named_class_to_regex_unicode(
    group: GroupName,
    negative: bool,
    group_negative: &mut bool,
    is_single: bool,
    flavor: RegexFlavor,
    span: Span,
    set: &mut UnicodeSet,
) -> Result<(), CompileError> {
    match group {
        GroupName::Word => {
            if flavor == RegexFlavor::JavaScript {
                // For JavaScript the word class is spelled out as a union of four properties.
                // A union cannot be negated item by item, so a negated `w` is only accepted
                // when it is the sole item; in that case the whole class is negated instead.
                if negative {
                    if is_single {
                        *group_negative ^= true;
                    } else {
                        return Err(CompileErrorKind::Unsupported(
                            Feature::NegativeShorthandW,
                            flavor,
                        )
                        .at(span));
                    }
                }
                set.add_prop(
                    RegexProperty::Other(OtherProperties::Alphabetic).negative_item(false),
                );
                set.add_prop(RegexProperty::Category(Category::Mark).negative_item(false));
                set.add_prop(
                    RegexProperty::Category(Category::Decimal_Number).negative_item(false),
                );
                set.add_prop(
                    RegexProperty::Category(Category::Connector_Punctuation).negative_item(false),
                );
            } else {
                let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };
                set.add_prop(RegexCharSetItem::Shorthand(s));
            }
        }
        GroupName::Digit => {
            if flavor == RegexFlavor::JavaScript {
                // JavaScript: use the `Decimal_Number` category instead of the shorthand
                set.add_prop(
                    RegexProperty::Category(Category::Decimal_Number).negative_item(negative),
                );
            } else {
                let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };
                set.add_prop(RegexCharSetItem::Shorthand(s));
            }
        }

        GroupName::Space => set.add_prop(RegexCharSetItem::Shorthand(if negative {
            RegexShorthand::NotSpace
        } else {
            RegexShorthand::Space
        })),

        // `!h` and `!v` are rejected in every flavor
        GroupName::HorizSpace | GroupName::VertSpace if negative => {
            return Err(CompileErrorKind::NegatedHorizVertSpace.at(span));
        }

        // PCRE and Java get the dedicated shorthand items
        GroupName::HorizSpace | GroupName::VertSpace
            if matches!(flavor, RegexFlavor::Pcre | RegexFlavor::Java) =>
        {
            set.add_prop(RegexCharSetItem::Shorthand(if group == GroupName::HorizSpace {
                RegexShorthand::HorizSpace
            } else {
                RegexShorthand::VertSpace
            }));
        }
        // remaining flavors: tab plus the `Space_Separator` category
        GroupName::HorizSpace => {
            set.add_char('\t');
            if flavor == RegexFlavor::Python {
                return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
            } else {
                set.add_prop(
                    RegexProperty::Category(Category::Space_Separator).negative_item(false),
                );
            }
        }
        // remaining flavors: explicit list of code points
        GroupName::VertSpace => {
            set.add_range('\x0A'..='\x0D');
            set.add_char('\u{85}');
            set.add_char('\u{2028}');
            set.add_char('\u{2029}');
        }

        // every arm below is a Unicode property, all of which are rejected for Python
        _ if flavor == RegexFlavor::Python => {
            return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
        }
        GroupName::Category(c) => {
            // specific flavor/category pairs that are rejected
            if let (RegexFlavor::Rust, Category::Surrogate)
            | (RegexFlavor::DotNet, Category::Cased_Letter) = (flavor, c)
            {
                return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
            }
            set.add_prop(RegexProperty::Category(c).negative_item(negative));
        }
        GroupName::Script(s) => {
            // scripts are rejected for .NET altogether
            if flavor == RegexFlavor::DotNet {
                return Err(CompileErrorKind::Unsupported(Feature::UnicodeScript, flavor).at(span));
            }
            // specific flavor/script pairs that are rejected
            if let (
                RegexFlavor::Pcre | RegexFlavor::Ruby | RegexFlavor::Java,
                Script::Kawi | Script::Nag_Mundari,
            )
            | (RegexFlavor::Rust, Script::Unknown) = (flavor, s)
            {
                return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
            }
            set.add_prop(RegexProperty::Script(s).negative_item(negative));
        }
        // blocks are only accepted for .NET, Java and Ruby
        GroupName::CodeBlock(b) => match flavor {
            RegexFlavor::DotNet | RegexFlavor::Java | RegexFlavor::Ruby => {
                match (flavor, b) {
                    // specific blocks that are rejected for Java or Ruby
                    (
                        RegexFlavor::Java,
                        CodeBlock::Arabic_Extended_C
                        | CodeBlock::CJK_Unified_Ideographs_Extension_H
                        | CodeBlock::Combining_Diacritical_Marks_For_Symbols
                        | CodeBlock::Cyrillic_Extended_D
                        | CodeBlock::Cyrillic_Supplement
                        | CodeBlock::Devanagari_Extended_A
                        | CodeBlock::Greek_And_Coptic
                        | CodeBlock::Kaktovik_Numerals
                        | CodeBlock::No_Block,
                    )
                    | (
                        RegexFlavor::Ruby,
                        CodeBlock::Arabic_Extended_C
                        | CodeBlock::CJK_Unified_Ideographs_Extension_H
                        | CodeBlock::Cyrillic_Extended_D
                        | CodeBlock::Devanagari_Extended_A
                        | CodeBlock::Kaktovik_Numerals,
                    ) => {
                        return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
                    }
                    (RegexFlavor::DotNet, _) => {
                        // Convert the block name to the spelling used in the .NET list
                        // (`_And_` becomes `and`, underscores are dropped) and look it up.
                        // NOTE(review): `binary_search` requires the list returned by
                        // `blocks_supported_in_dotnet()` to be sorted — confirm.
                        let dotnet_name = b.as_str().replace("_And_", "_and_").replace('_', "");
                        if pomsky_syntax::blocks_supported_in_dotnet()
                            .binary_search(&dotnet_name.as_str())
                            .is_err()
                        {
                            return Err(
                                CompileErrorKind::unsupported_specific_prop_in(flavor).at(span)
                            );
                        }
                    }
                    _ => {}
                }

                set.add_prop(RegexProperty::Block(b).negative_item(negative));
            }
            _ => return Err(CompileErrorKind::Unsupported(Feature::UnicodeBlock, flavor).at(span)),
        },
        // other properties are only accepted for JavaScript, Rust, PCRE and Ruby
        GroupName::OtherProperties(o) => {
            use OtherProperties as OP;
            use RegexFlavor as RF;

            if let RF::JavaScript | RF::Rust | RF::Pcre | RF::Ruby = flavor {
                match (flavor, o) {
                    // JavaScript accepts every property; this arm must stay first so that
                    // the rejections below only apply to the other flavors
                    (RF::JavaScript, _) => {}
                    (_, OP::Changes_When_NFKC_Casefolded)
                    | (RF::Pcre, OP::Assigned)
                    | (RF::Ruby, OP::Bidi_Mirrored) => {
                        return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
                    }
                    _ => {}
                }
                set.add_prop(RegexProperty::Other(o).negative_item(negative));
            } else {
                return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
            }
        }
    }
    Ok(())
}

/// A character set ready for code generation: a [`UnicodeSet`] together with a flag that
/// negates the set as a whole.
#[cfg_attr(feature = "dbg", derive(Debug))]
pub(crate) struct RegexCharSet {
    /// Whether the whole set is negated (emitted as `[^...]`, or folded into the single
    /// shorthand/property where `codegen` can omit the brackets).
    negative: bool,
    /// The ranges and shorthand/property items contained in the set.
    set: UnicodeSet,
}

impl RegexCharSet {
    pub(crate) fn new(items: UnicodeSet) -> Self {
        Self { negative: false, set: items }
    }

    pub(crate) fn negate(mut self) -> Self {
        self.negative = !self.negative;
        self
    }

    pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor) {
        if self.set.len() == 1 {
            if let Some(range) = self.set.ranges().next() {
                let (first, last) = range.as_chars();
                if first == last && !self.negative {
                    return literal::codegen_char_esc(first, buf, flavor);
                }
            } else if let Some(prop) = self.set.props().next() {
                match prop {
                    RegexCharSetItem::Shorthand(s) => {
                        let shorthand = if self.negative { s.negate() } else { Some(s) };
                        if let Some(shorthand) = shorthand {
                            return shorthand.codegen(buf);
                        }
                    }
                    RegexCharSetItem::Property { negative, value } => {
                        return value.codegen(buf, negative ^ self.negative, flavor);
                    }
                }
            }
        }

        if self.negative {
            buf.push_str("[^");
        } else {
            buf.push('[');
        }

        let mut is_first = true;
        for prop in self.set.props() {
            match prop {
                RegexCharSetItem::Shorthand(s) => s.codegen(buf),
                RegexCharSetItem::Property { negative, value } => {
                    value.codegen(buf, negative, flavor);
                }
            }
            is_first = false;
        }
        for range in self.set.ranges() {
            let (first, last) = range.as_chars();
            if first == last {
                literal::compile_char_esc_in_class(first, buf, is_first, flavor);
            } else {
                literal::compile_char_esc_in_class(first, buf, is_first, flavor);
                if range.first + 1 < range.last {
                    buf.push('-');
                }
                literal::compile_char_esc_in_class(last, buf, false, flavor);
            }
            is_first = false;
        }

        buf.push(']');
    }
}

/// An item of a character set that is not a code point range: either a shorthand or a
/// Unicode property.
#[derive(Clone, Copy, PartialEq, Eq)]
pub(crate) enum RegexCharSetItem {
    /// A shorthand class, see [`RegexShorthand`].
    Shorthand(RegexShorthand),
    /// The Unicode property `value`; `negative` marks this individual item as negated.
    Property { negative: bool, value: RegexProperty },
}

impl RegexCharSetItem {
    pub(crate) fn negate(self) -> Option<Self> {
        match self {
            RegexCharSetItem::Shorthand(s) => s.negate().map(RegexCharSetItem::Shorthand),
            RegexCharSetItem::Property { negative, value } => {
                Some(RegexCharSetItem::Property { negative: !negative, value })
            }
        }
    }
}

impl fmt::Debug for RegexCharSetItem {
    /// Writes the item's name as returned by `as_str()`, prefixed with `!` for a negated
    /// property.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let (negated, name) = match self {
            Self::Shorthand(s) => (false, s.as_str()),
            Self::Property { negative, value } => (*negative, value.as_str()),
        };

        if negated {
            f.write_str("!")?;
        }
        f.write_str(name)
    }
}

1	//! Implements _character classes_. The analogue in the regex world are
2	//! [character classes](https://www.regular-expressions.info/charclass.html),
3	//! [shorthand character classes](https://www.regular-expressions.info/shorthand.html),
4	//! [non-printable characters](https://www.regular-expressions.info/nonprint.html),
5	//! [Unicode categories/scripts/blocks](https://www.regular-expressions.info/unicode.html#category),
6	//! [POSIX classes](https://www.regular-expressions.info/posixbrackets.html#class) and the
7	//! [dot](https://www.regular-expressions.info/dot.html).
8	//!
9	//! All kinds of character classes mentioned above require `[` square brackets
10	//! `]` in Pomsky. A character class can be negated by putting the keyword `not`
11	//! after the opening bracket. For example, `![.]` compiles to `\n`.
12	//!
13	//! ## Items
14	//!
15	//! A character class can contain multiple _items_, which can be
16	//!
17	//! - A __code point__, e.g. `['a']` or `[U+107]`
18	//!
19	//! - This includes [non-printable characters](https://www.regular-expressions.info/nonprint.html).\
20	//! Supported are `[n]`, `[r]`, `[t]`, `[a]`, `[e]` and `[f]`.
21	//!
22	//! - A __range of code points__. For example, `[U+10 - U+200]` matches any code
23	//! point P where `U+10 ≤ P ≤ U+200`
24	//!
25	//! - A __named character class__, which can be one of
26	//!
27	//! - a [shorthand character class](https://www.regular-expressions.info/shorthand.html).\
28	//! Supported are `[w]`, `[d]`, `[s]`, `[h]`, `[v]` and `[R]`.
29	//!
30	//! - a [POSIX class](https://www.regular-expressions.info/posixbrackets.html#class).\
31	//! Supported are `[ascii_alnum]`, `[ascii_alpha]`, `[ascii]`,
32	//! `[ascii_blank]`, `[ascii_cntrl]`, `[ascii_digit]`, `[ascii_graph]`,
33	//! `[ascii_lower]`, `[ascii_print]`, `[ascii_punct]`, ´ `[ascii_space]`,
34	//! `[ascii_upper]`, `[ascii_word]` and `[ascii_xdigit]`.\ _Note_: POSIX
35	//! classes are not Unicode aware!\ _Note_: They're converted to ranges,
36	//! e.g. `[ascii_alpha]` = `[a-zA-Z]`.
37	//!
38	//! - a [Unicode category, script or block](https://www.regular-expressions.info/unicode.html#category).\
39	//! For example: `[Letter]` compiles to `\p{Letter}`. Pomsky currently
40	//! treats any uppercase identifier except `R` as Unicode class.
41	//!
42	//! ## Compilation
43	//!
44	//! When a character class contains only a single item (e.g. `[w]`), the
45	//! character class is "flattened":
46	//!
47	//! - `['a']` = `a`
48	//! - `[w]` = `\w`
49	//! - `[Letter]` = `\p{Letter}`
50	//!
51	//! When there is more than one item or a range (e.g. `['a'-'z' '!']`), a regex
52	//! character class is created:
53	//!
54	//! - `['a'-'z' '!']` = `[a-z!]`
55	//! - `[w e Punctuation]` = `[\w\e\p{Punctuation}]`
56	//!
57	//! ### Negation
58	//!
59	//! Negation is implemented as follows:
60	//!
61	//! - Ranges and chars such as `!['a'-'z' '!' e]` are wrapped in a negative
62	//! character class, e.g. `[^a-z!\e]`.
63	//!
64	//! - The `h`, `v` and `R` shorthands are also wrapped in a negative character
65	//! class.
66	//!
67	//! - The `w`, `d` and `s` shorthands are negated by making them uppercase
68	//! (`![w]` = `\W`), except when there is more than one item in the class
69	//! (`![w '-']` = `[^\w\-]`)
70	//!
71	//! - `w`, `s`, `d` and Unicode categories/scripts/blocks can be negated
72	//! individually _within a character class_, e.g. `[s !s]` = `[\s\S]`,
73	//! `![!Latin 'a']` = `[^\P{Latin}a]`.
74	//!
75	//! When a negated character class only contains 1 item, which is also
76	//! negated, the class is removed and the negations cancel each other out:
77	//! `![!w]` = `\w`, `![!L]` = `\p{L}`.
78
79	use std::fmt;
80
81	use crate::{
82	compile::{CompileResult, CompileState},
83	diagnose::{CompileError, CompileErrorKind, Feature},
84	exprs::literal,
85	options::{CompileOptions, RegexFlavor},
86	regex::{Regex, RegexProperty, RegexShorthand},
87	unicode_set::UnicodeSet,
88	};
89
90	use pomsky_syntax::{
91	exprs::{Category, CharClass, CodeBlock, GroupItem, GroupName, OtherProperties, Script},
92	Span,
93	};
94
95	use super::Compile;
96
97	impl Compile for CharClass {
98	fn compile(&self, options: CompileOptions, _state: &mut CompileState<'_>) -> CompileResult {	224✔
99	// when single, a `[!w]` can be turned into `![w]`	224✔
100	let is_single = self.inner.len() == 1;	224✔
101	let mut group_negative = false;	224✔
102		224✔
103	let mut set = UnicodeSet::new();	224✔
104	for item in &self.inner {	579✔
105	match *item {	363✔
106	GroupItem::Char(c) => {	204✔
107	if !is_single {	204✔
108	validate_char_in_class(c, options.flavor, self.span)?;	140✔
109	}	64✔
110	set.add_char(c)	204✔
111	}
112	GroupItem::Range { first, last } => {	30✔
113	validate_char_in_class(first, options.flavor, self.span)?;	30✔
114	validate_char_in_class(last, options.flavor, self.span)?;	30✔
115	set.add_range(first..=last);	30✔
116	}
117	GroupItem::Named { name, negative } => {	129✔
118	if self.unicode_aware {	129✔
119	named_class_to_regex_unicode(	102✔
120	name,	102✔
121	negative,	102✔
122	&mut group_negative,	102✔
123	is_single,	102✔
124	options.flavor,	102✔
125	self.span,	102✔
126	&mut set,	102✔
127	)?;	102✔
128	} else {
129	named_class_to_regex_ascii(	27✔
130	name,	27✔
131	negative,	27✔
132	options.flavor,	27✔
133	self.span,	27✔
134	&mut set,	27✔
135	)?;	27✔
136	}
137	}
138	}
139	}
140
141	// this makes it possible to use code points outside the BMP in .NET,
142	// as long as there is only one in the character set
143	if let Some(only_char) = set.try_into_char() {	216✔
144	return Ok(Regex::Literal(only_char.to_string()));	64✔
145	}	152✔
146		152✔
147	Ok(Regex::CharSet(RegexCharSet { negative: group_negative, set }))	152✔
148	}	224✔
149	}
150
151	fn validate_char_in_class(char: char, flavor: RegexFlavor, span: Span) -> Result<(), CompileError> {	200✔
152	if flavor == RegexFlavor::DotNet && char > '\u{FFFF}' {	200✔
153	Err(CompileErrorKind::Unsupported(Feature::LargeCodePointInCharClass(char), flavor)	×
154	.at(span))	×
155	} else {
156	Ok(())	200✔
157	}
158	}	200✔
159
160	pub(crate) fn check_char_class_empty(	46✔
161	char_set: &RegexCharSet,	46✔
162	span: Span,	46✔
163	) -> Result<(), CompileError> {	46✔
164	if char_set.negative {	46✔
165	if let Some((group1, group2)) = char_set.set.full_props() {	45✔
166	return Err(CompileErrorKind::EmptyClassNegated { group1, group2 }.at(span));	3✔
167	}	42✔
168	}	1✔
169	Ok(())	43✔
170	}	46✔
171
172	fn named_class_to_regex_ascii(	27✔
173	group: GroupName,	27✔
174	negative: bool,	27✔
175	flavor: RegexFlavor,	27✔
176	span: Span,	27✔
177	set: &mut UnicodeSet,	27✔
178	) -> Result<(), CompileError> {	27✔
179	if negative	27✔
180	// In JS, \W and \D can be used for negation because they're ascii-only
181	&& (flavor != RegexFlavor::JavaScript	1✔
182	\|\| (group != GroupName::Digit && group != GroupName::Word))	×
183	{
184	return Err(CompileErrorKind::NegativeShorthandInAsciiMode.at(span));	1✔
185	}	26✔
186		26✔
187	match group {	26✔
188	GroupName::Word => {
189	if flavor == RegexFlavor::JavaScript {	7✔
190	let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };	1✔
191	set.add_prop(RegexCharSetItem::Shorthand(s));	1✔
192	} else {	6✔
193	// we already checked above if negative	6✔
194	set.add_range('a'..='z');	6✔
195	set.add_range('A'..='Z');	6✔
196	set.add_range('0'..='9');	6✔
197	set.add_char('_');	6✔
198	}	6✔
199	}
200	GroupName::Digit => {
201	if flavor == RegexFlavor::JavaScript {	11✔
202	let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };	1✔
203	set.add_prop(RegexCharSetItem::Shorthand(s));	1✔
204	} else {	10✔
205	// we already checked above if negative	10✔
206	set.add_range('0'..='9');	10✔
207	}	10✔
208	}
209	GroupName::Space => {	7✔
210	set.add_char(' ');	7✔
211	set.add_range('\x09'..='\x0D'); // \t\n\v\f\r	7✔
212	}	7✔
213	GroupName::HorizSpace => set.add_char('\t'),	×
214	GroupName::VertSpace => set.add_range('\x0A'..='\x0D'),	×
215	_ => return Err(CompileErrorKind::UnicodeInAsciiMode.at(span)),	1✔
216	}
217	Ok(())	25✔
218	}	27✔
219
220	fn named_class_to_regex_unicode(	102✔
221	group: GroupName,	102✔
222	negative: bool,	102✔
223	group_negative: &mut bool,	102✔
224	is_single: bool,	102✔
225	flavor: RegexFlavor,	102✔
226	span: Span,	102✔
227	set: &mut UnicodeSet,	102✔
228	) -> Result<(), CompileError> {	102✔
229	match group {	5✔
230	GroupName::Word => {
231	if flavor == RegexFlavor::JavaScript {	22✔
232	if negative {	5✔
233	if is_single {	2✔
234	*group_negative ^= true;	1✔
235	} else {	1✔
236	return Err(CompileErrorKind::Unsupported(	1✔
237	Feature::NegativeShorthandW,	1✔
238	flavor,	1✔
239	)	1✔
240	.at(span));	1✔
241	}
242	}	3✔
243	set.add_prop(	4✔
244	RegexProperty::Other(OtherProperties::Alphabetic).negative_item(false),	4✔
245	);	4✔
246	set.add_prop(RegexProperty::Category(Category::Mark).negative_item(false));	4✔
247	set.add_prop(	4✔
248	RegexProperty::Category(Category::Decimal_Number).negative_item(false),	4✔
249	);	4✔
250	set.add_prop(	4✔
251	RegexProperty::Category(Category::Connector_Punctuation).negative_item(false),	4✔
252	);	4✔
253	} else {
254	let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };	17✔
255	set.add_prop(RegexCharSetItem::Shorthand(s));	17✔
256	}
257	}
258	GroupName::Digit => {
259	if flavor == RegexFlavor::JavaScript {	16✔
260	set.add_prop(	4✔
261	RegexProperty::Category(Category::Decimal_Number).negative_item(negative),	4✔
262	);	4✔
263	} else {	4✔
264	let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };	12✔
265	set.add_prop(RegexCharSetItem::Shorthand(s));	12✔
266	}
267	}
268
269	GroupName::Space => set.add_prop(RegexCharSetItem::Shorthand(if negative {	12✔
270	RegexShorthand::NotSpace	3✔
271	} else {
272	RegexShorthand::Space	9✔
273	})),
274
275	GroupName::HorizSpace \| GroupName::VertSpace if negative => {	×
276	return Err(CompileErrorKind::NegatedHorizVertSpace.at(span));	×
277	}
278
279	GroupName::HorizSpace \| GroupName::VertSpace
280	if matches!(flavor, RegexFlavor::Pcre \| RegexFlavor::Java) =>	5✔
281	{
282	set.add_prop(RegexCharSetItem::Shorthand(if group == GroupName::HorizSpace {	6✔
283	RegexShorthand::HorizSpace	3✔
284	} else {
285	RegexShorthand::VertSpace	3✔
286	}));
287	}
288	GroupName::HorizSpace => {
289	set.add_char('\t');	2✔
290	if flavor == RegexFlavor::Python {	2✔
291	return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));	×
292	} else {	2✔
293	set.add_prop(	2✔
294	RegexProperty::Category(Category::Space_Separator).negative_item(false),	2✔
295	);	2✔
296	}	2✔
297	}
298	GroupName::VertSpace => {	2✔
299	set.add_range('\x0A'..='\x0D');	2✔
300	set.add_char('\u{85}');	2✔
301	set.add_char('\u{2028}');	2✔
302	set.add_char('\u{2029}');	2✔
303	}	2✔
304
305	_ if flavor == RegexFlavor::Python => {	42✔
306	return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));	2✔
307	}
308	GroupName::Category(c) => {	5✔
309	if let (RegexFlavor::Rust, Category::Surrogate)	5✔
310	\| (RegexFlavor::DotNet, Category::Cased_Letter) = (flavor, c)	5✔
311	{
312	return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));	×
313	}	5✔
314	set.add_prop(RegexProperty::Category(c).negative_item(negative));	5✔
315	}
316	GroupName::Script(s) => {	21✔
317	if flavor == RegexFlavor::DotNet {	21✔
318	return Err(CompileErrorKind::Unsupported(Feature::UnicodeScript, flavor).at(span));	1✔
319	}	20✔
320	if let (	20✔
321	RegexFlavor::Pcre \| RegexFlavor::Ruby \| RegexFlavor::Java,	20✔
322	Script::Kawi \| Script::Nag_Mundari,	20✔
323	)	20✔
324	\| (RegexFlavor::Rust, Script::Unknown) = (flavor, s)	20✔
325	{
326	return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));	×
327	}	20✔
328	set.add_prop(RegexProperty::Script(s).negative_item(negative));	20✔
329	}
330	GroupName::CodeBlock(b) => match flavor {	7✔
331	RegexFlavor::DotNet \| RegexFlavor::Java \| RegexFlavor::Ruby => {
332	match (flavor, b) {	6✔
333	(
334	RegexFlavor::Java,
335	CodeBlock::Arabic_Extended_C
336	\| CodeBlock::CJK_Unified_Ideographs_Extension_H
337	\| CodeBlock::Combining_Diacritical_Marks_For_Symbols
338	\| CodeBlock::Cyrillic_Extended_D
339	\| CodeBlock::Cyrillic_Supplement
340	\| CodeBlock::Devanagari_Extended_A
341	\| CodeBlock::Greek_And_Coptic
342	\| CodeBlock::Kaktovik_Numerals
343	\| CodeBlock::No_Block,
344	)
345	\| (
346	RegexFlavor::Ruby,
347	CodeBlock::Arabic_Extended_C
348	\| CodeBlock::CJK_Unified_Ideographs_Extension_H
349	\| CodeBlock::Cyrillic_Extended_D
350	\| CodeBlock::Devanagari_Extended_A
351	\| CodeBlock::Kaktovik_Numerals,
352	) => {
353	return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));	×
354	}
355	(RegexFlavor::DotNet, _) => {
356	let dotnet_name = b.as_str().replace("_And_", "_and_").replace('_', "");	2✔
357	if pomsky_syntax::blocks_supported_in_dotnet()	2✔
358	.binary_search(&dotnet_name.as_str())	2✔
359	.is_err()	2✔
360	{
361	return Err(	×
362	CompileErrorKind::unsupported_specific_prop_in(flavor).at(span)	×
363	);	×
364	}	2✔
365	}
366	_ => {}	4✔
367	}
368
369	set.add_prop(RegexProperty::Block(b).negative_item(negative));	6✔
370	}
371	_ => return Err(CompileErrorKind::Unsupported(Feature::UnicodeBlock, flavor).at(span)),	1✔
372	},
373	GroupName::OtherProperties(o) => {	7✔
374	use OtherProperties as OP;
375	use RegexFlavor as RF;
376
377	if let RF::JavaScript \| RF::Rust \| RF::Pcre \| RF::Ruby = flavor {	7✔
378	match (flavor, o) {	7✔
379	(RF::JavaScript, _) => {}	4✔
380	(_, OP::Changes_When_NFKC_Casefolded)
381	\| (RF::Pcre, OP::Assigned)
382	\| (RF::Ruby, OP::Bidi_Mirrored) => {
383	return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));	1✔
384	}
385	_ => {}	2✔
386	}
387	set.add_prop(RegexProperty::Other(o).negative_item(negative));	6✔
388	} else {
389	return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));	×
390	}
391	}
392	}
393	Ok(())	96✔
394	}	102✔
395
396	#[cfg_attr(feature = "dbg", derive(Debug))]
397	pub(crate) struct RegexCharSet {
398	negative: bool,
399	set: UnicodeSet,
400	}
401
402	impl RegexCharSet {
403	pub(crate) fn new(items: UnicodeSet) -> Self {	157✔
404	Self { negative: false, set: items }	157✔
405	}	157✔
406
407	pub(crate) fn negate(mut self) -> Self {	46✔
408	self.negative = !self.negative;	46✔
409	self	46✔
410	}	46✔
411
412	pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor) {	304✔
413	if self.set.len() == 1 {	304✔
414	if let Some(range) = self.set.ranges().next() {	217✔
415	let (first, last) = range.as_chars();	157✔
416	if first == last && !self.negative {	157✔
417	return literal::codegen_char_esc(first, buf, flavor);	1✔
418	}	156✔
419	} else if let Some(prop) = self.set.props().next() {	60✔
420	match prop {	60✔
421	RegexCharSetItem::Shorthand(s) => {	24✔
422	let shorthand = if self.negative { s.negate() } else { Some(s) };	24✔
423	if let Some(shorthand) = shorthand {	24✔
424	return shorthand.codegen(buf);	22✔
425	}	2✔
426	}
427	RegexCharSetItem::Property { negative, value } => {	36✔
428	return value.codegen(buf, negative ^ self.negative, flavor);	36✔
429	}
430	}
431	}	×
432	}	87✔
433
434	if self.negative {	245✔
435	buf.push_str("[^");	27✔
436	} else {	218✔
437	buf.push('[');	218✔
438	}	218✔
439
440	let mut is_first = true;	245✔
441	for prop in self.set.props() {	245✔
442	match prop {	69✔
443	RegexCharSetItem::Shorthand(s) => s.codegen(buf),	49✔
444	RegexCharSetItem::Property { negative, value } => {	20✔
445	value.codegen(buf, negative, flavor);	20✔
446	}	20✔
447	}
448	is_first = false;	69✔
449	}
450	for range in self.set.ranges() {	329✔
451	let (first, last) = range.as_chars();	329✔
452	if first == last {	329✔
453	literal::compile_char_esc_in_class(first, buf, is_first, flavor);	90✔
454	} else {	90✔
455	literal::compile_char_esc_in_class(first, buf, is_first, flavor);	239✔
456	if range.first + 1 < range.last {	239✔
457	buf.push('-');	199✔
458	}	199✔
459	literal::compile_char_esc_in_class(last, buf, false, flavor);	239✔
460	}
461	is_first = false;	329✔
462	}
463
464	buf.push(']');	245✔
465	}	304✔
466	}
467
468	#[derive(Clone, Copy, PartialEq, Eq)]
469	pub(crate) enum RegexCharSetItem {
470	Shorthand(RegexShorthand),
471	Property { negative: bool, value: RegexProperty },
472	}
473
474	impl RegexCharSetItem {
475	pub(crate) fn negate(self) -> Option<Self> {	43✔
476	match self {	43✔
477	RegexCharSetItem::Shorthand(s) => s.negate().map(RegexCharSetItem::Shorthand),	19✔
478	RegexCharSetItem::Property { negative, value } => {	24✔
479	Some(RegexCharSetItem::Property { negative: !negative, value })	24✔
480	}
481	}
482	}	43✔
483	}
484
485	impl fmt::Debug for RegexCharSetItem {
486	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {	6✔
487	match self {	6✔
488	Self::Shorthand(s) => f.write_str(s.as_str()),	4✔
489	&Self::Property { value, negative } => {	2✔
490	if negative {	2✔
491	f.write_str("!")?;	1✔
492	}	1✔
493	f.write_str(value.as_str())	2✔
494	}
495	}
496	}	6✔
497	}

pomsky-lang / pomsky / 12019379442

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous