12359588212

Committed 16 Dec 2024 07:10PM UTC coverage: 77.979% (-0.004%) from 77.983%

Build # 12359588212

Build Type

push

github

Committed by

Aloso

Commit Message

fix: adjust Unicode support for Ruby, fix typo in DotNetSupportedBlocks

Run Details

1 of 1 new or added line in 1 file covered. (100.0%)

1 existing line in 1 file now uncovered.

4777 of 6126 relevant lines covered (77.98%)

350773.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.5

/pomsky-lib/src/exprs/char_class/mod.rs

//! Implements _character classes_. The analogues in the regex world are
//! [character classes](https://www.regular-expressions.info/charclass.html),
//! [shorthand character classes](https://www.regular-expressions.info/shorthand.html),
//! [non-printable characters](https://www.regular-expressions.info/nonprint.html),
//! [Unicode categories/scripts/blocks](https://www.regular-expressions.info/unicode.html#category),
//! [POSIX classes](https://www.regular-expressions.info/posixbrackets.html#class) and the
//! [dot](https://www.regular-expressions.info/dot.html).
//!
//! All kinds of character classes mentioned above require `[` square brackets
//! `]` in Pomsky. A character class can be negated by putting a `!` in front
//! of the opening bracket. For example, `![.]` compiles to `\n`.
//!
//! ## Items
//!
//! A character class can contain multiple _items_, which can be
//!
//! - A __code point__, e.g. `['a']` or `[U+107]`
//!
//!   - This includes [non-printable characters](https://www.regular-expressions.info/nonprint.html).\
//!     Supported are `[n]`, `[r]`, `[t]`, `[a]`, `[e]` and `[f]`.
//!
//! - A __range of code points__. For example, `[U+10 - U+200]` matches any code
//!   point P where `U+10 ≤ P ≤ U+200`
//!
//! - A __named character class__, which can be one of
//!
//!   - a [shorthand character class](https://www.regular-expressions.info/shorthand.html).\
//!     Supported are `[w]`, `[d]`, `[s]`, `[h]`, `[v]` and `[R]`.
//!
//!   - a [POSIX class](https://www.regular-expressions.info/posixbrackets.html#class).\
//!     Supported are `[ascii_alnum]`, `[ascii_alpha]`, `[ascii]`,
//!     `[ascii_blank]`, `[ascii_cntrl]`, `[ascii_digit]`, `[ascii_graph]`,
//!     `[ascii_lower]`, `[ascii_print]`, `[ascii_punct]`, `[ascii_space]`,
//!     `[ascii_upper]`, `[ascii_word]` and `[ascii_xdigit]`.\ _Note_: POSIX
//!     classes are not Unicode aware!\ _Note_: They're converted to ranges,
//!     e.g. `[ascii_alpha]` = `[a-zA-Z]`.
//!
//!   - a [Unicode category, script or block](https://www.regular-expressions.info/unicode.html#category).\
//!     For example: `[Letter]` compiles to `\p{Letter}`. Pomsky currently
//!     treats any uppercase identifier except `R` as Unicode class.
//!
//! ## Compilation
//!
//! When a character class contains only a single item (e.g. `[w]`), the
//! character class is "flattened":
//!
//! - `['a']` = `a`
//! - `[w]` = `\w`
//! - `[Letter]` = `\p{Letter}`
//!
//! When there is more than one item or a range (e.g. `['a'-'z' '!']`), a regex
//! character class is created:
//!
//! - `['a'-'z' '!']` = `[a-z!]`
//! - `[w e Punctuation]` = `[\w\e\p{Punctuation}]`
//!
//! ### Negation
//!
//! Negation is implemented as follows:
//!
//! - Ranges and chars such as `!['a'-'z' '!' e]` are wrapped in a negative
//!   character class, e.g. `[^a-z!\e]`.
//!
//! - The `h`, `v` and `R` shorthands are also wrapped in a negative character
//!   class.
//!
//! - The `w`, `d` and `s` shorthands are negated by making them uppercase
//!   (`![w]` = `\W`), except when there is more than one item in the class
//!   (`![w '-']` = `[^\w\-]`)
//!
//! - `w`, `s`, `d` and Unicode categories/scripts/blocks can be negated
//!   individually _within a character class_, e.g. `[s !s]` = `[\s\S]`,
//!   `![!Latin 'a']` = `[^\P{Latin}a]`.
//!
//!   When a negated character class only contains 1 item, which is also
//!   negated, the class is removed and the negations cancel each other out:
//!   `![!w]` = `\w`, `![!L]` = `\p{L}`.

use pomsky_syntax::exprs::{
    Category, CharClass, CodeBlock, GroupItem, GroupName, OtherProperties, Script, ScriptExtension,
};
use pomsky_syntax::Span;

use crate::{
    compile::{CompileResult, CompileState},
    diagnose::{CompileError, CompileErrorKind, Feature},
    exprs::literal,
    options::{CompileOptions, RegexFlavor},
    regex::{Regex, RegexProperty, RegexShorthand},
    unicode_set::UnicodeSet,
};
pub(crate) use char_set_item::{RegexCharSet, RegexCharSetItem, RegexCompoundCharSet};

use super::Compile;

mod char_set_item;

impl Compile for CharClass {
    /// Compiles this character class into a [`Regex`].
    ///
    /// Every item of the class is collected into a [`UnicodeSet`]. If the
    /// resulting set can be turned into a single char, that char is emitted
    /// as a literal; otherwise a (possibly negated) character set is emitted.
    ///
    /// # Errors
    ///
    /// Propagates any error produced while validating code points or while
    /// translating a named class for the selected flavor.
    fn compile(&self, options: CompileOptions, _state: &mut CompileState<'_>) -> CompileResult {
        let flavor = options.flavor;

        // A class with exactly one item allows turning `[!w]` into `![w]`.
        let is_single = self.inner.len() == 1;
        let mut group_negative = false;
        let mut set = UnicodeSet::new();

        for item in &self.inner {
            match *item {
                GroupItem::Char(c) => {
                    // A lone char is not validated here; see the literal
                    // shortcut at the end of this function.
                    if !is_single {
                        validate_char_in_class(c, flavor, self.span)?;
                    }
                    set.add_char(c);
                }
                GroupItem::Range { first, last } => {
                    // Both bounds are checked, lower bound first.
                    for bound in [first, last] {
                        validate_char_in_class(bound, flavor, self.span)?;
                    }
                    set.add_range(first..=last);
                }
                GroupItem::Named { name, negative, span } if self.unicode_aware => {
                    named_class_to_regex_unicode(
                        name,
                        negative,
                        &mut group_negative,
                        is_single,
                        flavor,
                        span,
                        &mut set,
                    )?;
                }
                GroupItem::Named { name, negative, span } => {
                    named_class_to_regex_ascii(name, negative, flavor, span, &mut set)?;
                }
            }
        }

        // Emitting a single char as a literal makes it possible to use code
        // points outside the BMP in .NET, as long as it is the only one in
        // the character set.
        match set.try_into_char() {
            Some(only_char) => Ok(Regex::Literal(only_char.to_string())),
            None => Ok(Regex::CharSet(RegexCharSet { negative: group_negative, set })),
        }
    }
}

/// Checks whether `char` may appear inside a character class for `flavor`.
///
/// # Errors
///
/// For the .NET flavor, a code point above `U+FFFF` is rejected with an
/// `Unsupported(LargeCodePointInCharClass)` error located at `span`.
fn validate_char_in_class(char: char, flavor: RegexFlavor, span: Span) -> Result<(), CompileError> {
    let is_above_bmp = char > '\u{FFFF}';
    if !(is_above_bmp && flavor == RegexFlavor::DotNet) {
        return Ok(());
    }

    let kind = CompileErrorKind::Unsupported(Feature::LargeCodePointInCharClass(char), flavor);
    Err(kind.at(span))
}

/// Rejects a negated character set for which `full_props()` reports a pair
/// of groups.
///
/// Sets that are not negated are always accepted, and `full_props()` is not
/// consulted for them.
///
/// # Errors
///
/// Returns `EmptyClassNegated` (carrying the two reported groups) at `span`
/// when the set is negated and `full_props()` yields a pair.
pub(crate) fn check_char_class_empty(
    char_set: &RegexCharSet,
    span: Span,
) -> Result<(), CompileError> {
    if !char_set.negative {
        return Ok(());
    }

    match char_set.set.full_props() {
        Some((group1, group2)) => {
            Err(CompileErrorKind::EmptyClassNegated { group1, group2 }.at(span))
        }
        None => Ok(()),
    }
}

/// Returns `true` if the shorthand `group` is treated as ASCII-only in
/// `flavor`.
///
/// This holds for `Word` and `Digit` in JavaScript, and for `Word`, `Digit`
/// and `Space` in RE2. Every other combination yields `false`.
pub fn is_ascii_only_in_flavor(group: GroupName, flavor: RegexFlavor) -> bool {
    matches!(
        (flavor, group),
        (RegexFlavor::JavaScript | RegexFlavor::RE2, GroupName::Word | GroupName::Digit)
            | (RegexFlavor::RE2, GroupName::Space)
    )
}

/// Adds the named class `group` to `set` for a character class that is
/// _not_ Unicode-aware.
///
/// - `group`: the shorthand to translate; only `Word`, `Digit`, `Space`,
///   `HorizSpace` and `VertSpace` are accepted here.
/// - `negative`: whether the item itself is negated (e.g. `[!w]`).
/// - `flavor`: the targeted regex flavor.
/// - `span`: location used for any reported error.
/// - `set`: the set the resulting chars, ranges or shorthands are added to.
///
/// # Errors
///
/// - `NegativeShorthandInAsciiMode` if `negative` is set and the shorthand
///   is not ASCII-only in `flavor` (see [`is_ascii_only_in_flavor`]).
/// - `UnicodeInAsciiMode` for any group other than the five shorthands
///   listed above.
fn named_class_to_regex_ascii(
    group: GroupName,
    negative: bool,
    flavor: RegexFlavor,
    span: Span,
    set: &mut UnicodeSet,
) -> Result<(), CompileError> {
    // In JS, \W and \D can be used for negation because they're ascii-only
    // Same goes for \W, \D and \S in RE2
    if negative && !is_ascii_only_in_flavor(group, flavor) {
        return Err(CompileErrorKind::NegativeShorthandInAsciiMode.at(span));
    }

    match group {
        GroupName::Word => {
            if let RegexFlavor::JavaScript | RegexFlavor::RE2 = flavor {
                let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };
                set.add_prop(RegexCharSetItem::Shorthand(s));
            } else {
                // we already checked above if negative
                set.add_range('a'..='z');
                set.add_range('A'..='Z');
                set.add_range('0'..='9');
                set.add_char('_');
            }
        }
        GroupName::Digit => {
            if let RegexFlavor::JavaScript | RegexFlavor::RE2 = flavor {
                let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };
                set.add_prop(RegexCharSetItem::Shorthand(s));
            } else {
                // we already checked above if negative
                set.add_range('0'..='9');
            }
        }
        GroupName::Space => {
            if let RegexFlavor::RE2 = flavor {
                let s = if negative { RegexShorthand::NotSpace } else { RegexShorthand::Space };
                set.add_prop(RegexCharSetItem::Shorthand(s));
            } else {
                set.add_char(' ');
                set.add_range('\x09'..='\x0D'); // \t\n\v\f\r
            }
        }
        GroupName::HorizSpace => {
            // ASCII horizontal whitespace is the tab _and_ the space. The
            // Unicode-aware path covers the space through `Space_Separator`;
            // here it has to be added explicitly, otherwise `[h]` would not
            // match U+0020.
            set.add_char('\t');
            set.add_char(' ');
        }
        GroupName::VertSpace => set.add_range('\x0A'..='\x0D'), // \n\v\f\r
        _ => return Err(CompileErrorKind::UnicodeInAsciiMode.at(span)),
    }
    Ok(())
}

/// Adds the named class `group` to `set` for a Unicode-aware character
/// class, or reports why `flavor` cannot express it.
///
/// - `group`: the shorthand, category, script, block or other property to
///   translate.
/// - `negative`: whether the item itself is negated (e.g. `[!w]`).
/// - `group_negative`: negation flag of the enclosing class. It is toggled
///   when a negated item can only be expressed by negating the whole class
///   (JavaScript `Word`, RE2 `Space`), which is only done if `is_single`.
/// - `is_single`: whether the enclosing class consists of exactly one item.
/// - `flavor`: the targeted regex flavor.
/// - `span`: location used for any reported error.
/// - `set`: the set the resulting chars, ranges and properties are added to.
///
/// # Errors
///
/// Returns an `Unsupported` / unsupported-specific-property error when the
/// requested class (or its negation) is not available in `flavor`, and
/// `NegatedHorizVertSpace` when `HorizSpace` or `VertSpace` is negated.
///
/// The order of the match arms matters: guarded arms intercept cases before
/// the more general arms that follow them.
fn named_class_to_regex_unicode(
    group: GroupName,
    negative: bool,
    group_negative: &mut bool,
    is_single: bool,
    flavor: RegexFlavor,
    span: Span,
    set: &mut UnicodeSet,
) -> Result<(), CompileError> {
    match group {
        GroupName::Word => {
            if flavor == RegexFlavor::RE2 {
                return Err(CompileErrorKind::Unsupported(Feature::ShorthandW, flavor).at(span));
            } else if flavor == RegexFlavor::JavaScript {
                // For JavaScript the shorthand is spelled out as a union of
                // properties (added below, always non-negated). A union can't
                // be negated item by item, so a negated `Word` is only
                // accepted when it is the sole item: then the whole class is
                // negated instead.
                if negative {
                    if is_single {
                        *group_negative ^= true;
                    } else {
                        return Err(CompileErrorKind::Unsupported(
                            Feature::NegativeShorthandW,
                            flavor,
                        )
                        .at(span));
                    }
                }
                set.add_prop(
                    RegexProperty::Other(OtherProperties::Alphabetic).negative_item(false),
                );
                set.add_prop(RegexProperty::Category(Category::Mark).negative_item(false));
                set.add_prop(
                    RegexProperty::Category(Category::Decimal_Number).negative_item(false),
                );
                set.add_prop(
                    RegexProperty::Category(Category::Connector_Punctuation).negative_item(false),
                );
            } else {
                let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };
                set.add_prop(RegexCharSetItem::Shorthand(s));
            }
        }
        GroupName::Digit => {
            // JavaScript and RE2 get the `Decimal_Number` category instead of
            // the digit shorthand; a single property can carry the negation
            // itself.
            if matches!(flavor, RegexFlavor::JavaScript | RegexFlavor::RE2) {
                set.add_prop(
                    RegexProperty::Category(Category::Decimal_Number).negative_item(negative),
                );
            } else {
                let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };
                set.add_prop(RegexCharSetItem::Shorthand(s));
            }
        }

        GroupName::Space => {
            if flavor == RegexFlavor::RE2 {
                // For RE2 the space shorthand is extended with additional
                // code points (see below). As with JavaScript `Word`, such a
                // union can only be negated by negating the whole class.
                if negative {
                    if is_single {
                        *group_negative ^= true;
                    } else {
                        return Err(CompileErrorKind::Unsupported(
                            Feature::NegativeShorthandS,
                            flavor,
                        )
                        .at(span));
                    }
                }

                // [ \f\n\r\t\u000b\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]
                set.add_prop(RegexCharSetItem::Shorthand(RegexShorthand::Space));
                set.add_char('\x0b');
                set.add_char('\u{a0}');
                set.add_char('\u{1680}');
                set.add_range('\u{2000}'..='\u{200a}');
                set.add_range('\u{2028}'..='\u{2029}');
                set.add_char('\u{202f}');
                set.add_char('\u{205f}');
                set.add_char('\u{3000}');
                set.add_char('\u{feff}');
            } else {
                set.add_prop(RegexCharSetItem::Shorthand(if negative {
                    RegexShorthand::NotSpace
                } else {
                    RegexShorthand::Space
                }))
            }
        }

        // Negating `h` or `v` individually is never allowed; this guarded arm
        // must come before the non-negated `HorizSpace`/`VertSpace` arms.
        GroupName::HorizSpace | GroupName::VertSpace if negative => {
            return Err(CompileErrorKind::NegatedHorizVertSpace.at(span));
        }

        // PCRE and Java get the dedicated shorthands.
        GroupName::HorizSpace | GroupName::VertSpace
            if matches!(flavor, RegexFlavor::Pcre | RegexFlavor::Java) =>
        {
            set.add_prop(RegexCharSetItem::Shorthand(if group == GroupName::HorizSpace {
                RegexShorthand::HorizSpace
            } else {
                RegexShorthand::VertSpace
            }));
        }
        // Remaining flavors: horizontal space is spelled out as the tab plus
        // the `Space_Separator` category. Python is rejected because this
        // spelling needs a Unicode property.
        GroupName::HorizSpace => {
            set.add_char('\t');
            if flavor == RegexFlavor::Python {
                return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
            } else {
                set.add_prop(
                    RegexProperty::Category(Category::Space_Separator).negative_item(false),
                );
            }
        }
        // Remaining flavors: vertical space is spelled out as explicit code
        // points, so no Unicode property is needed.
        GroupName::VertSpace => {
            set.add_range('\x0A'..='\x0D');
            set.add_char('\u{85}');
            set.add_char('\u{2028}');
            set.add_char('\u{2029}');
        }

        // All shorthands were handled above; everything that is left
        // (categories, scripts, blocks, other properties) is rejected for
        // Python as a whole.
        _ if flavor == RegexFlavor::Python => {
            return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
        }
        GroupName::Category(c) => {
            // Flavor-specific gaps: `Surrogate` in Rust, `Cased_Letter` in
            // .NET and RE2.
            if let (RegexFlavor::Rust, Category::Surrogate)
            | (RegexFlavor::DotNet | RegexFlavor::RE2, Category::Cased_Letter) = (flavor, c)
            {
                return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
            }
            set.add_prop(RegexProperty::Category(c).negative_item(negative));
        }
        GroupName::Script(s, e) => {
            if flavor == RegexFlavor::DotNet {
                return Err(CompileErrorKind::Unsupported(Feature::UnicodeScript, flavor).at(span));
            }
            if let (RegexFlavor::Rust, Script::Unknown) = (flavor, s) {
                return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
            }

            // Decide which script-extension marker is passed on with the
            // property, based on what was requested (`e`) and on the flavor.
            let set_extensions = match e {
                // Explicitly requested script extensions are only accepted
                // for Rust, PCRE and JavaScript.
                ScriptExtension::Yes => match flavor {
                    RegexFlavor::Rust | RegexFlavor::Pcre | RegexFlavor::JavaScript => {
                        ScriptExtension::Yes
                    }
                    RegexFlavor::Java
                    | RegexFlavor::DotNet
                    | RegexFlavor::Ruby
                    | RegexFlavor::Python
                    | RegexFlavor::RE2 => {
                        return Err(CompileErrorKind::Unsupported(
                            Feature::ScriptExtensions,
                            flavor,
                        )
                        .at(span))
                    }
                },
                ScriptExtension::No => match flavor {
                    // PCRE is currently the only flavor where `\p{Greek}` is the same as `\p{scx=Greek}`
                    RegexFlavor::Pcre => ScriptExtension::No,
                    _ => ScriptExtension::Unspecified,
                },
                _ => ScriptExtension::Unspecified,
            };

            set.add_prop(RegexProperty::Script(s, set_extensions).negative_item(negative));
        }
        // Blocks are only accepted for .NET, Java and Ruby, each with its own
        // list of exceptions.
        GroupName::CodeBlock(b) => match flavor {
            RegexFlavor::DotNet | RegexFlavor::Java | RegexFlavor::Ruby => {
                match (flavor, b) {
                    (RegexFlavor::Java, CodeBlock::No_Block)
                    | (
                        // These should work since Oniguruma updated to Unicode 15.1
                        // ... but our C bindings for Oniguruma are unmaintained!
                        RegexFlavor::Ruby,
                        CodeBlock::Arabic_Extended_C
                        | CodeBlock::CJK_Unified_Ideographs_Extension_H
                        | CodeBlock::Cyrillic_Extended_D
                        | CodeBlock::Devanagari_Extended_A
                        | CodeBlock::Kaktovik_Numerals,
                    ) => {
                        return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
                    }
                    (RegexFlavor::DotNet, _) => {
                        // Derive the name to look up: lowercase the `And` in
                        // `_And_`, then drop all underscores. The block must
                        // appear in the list of blocks supported in .NET
                        // (binary search, so that list is expected to be
                        // sorted).
                        let dotnet_name = b.as_str().replace("_And_", "_and_").replace('_', "");
                        if pomsky_syntax::blocks_supported_in_dotnet()
                            .binary_search(&dotnet_name.as_str())
                            .is_err()
                        {
                            return Err(
                                CompileErrorKind::unsupported_specific_prop_in(flavor).at(span)
                            );
                        }
                    }
                    _ => {}
                }

                set.add_prop(RegexProperty::Block(b).negative_item(negative));
            }
            _ => return Err(CompileErrorKind::Unsupported(Feature::UnicodeBlock, flavor).at(span)),
        },
        GroupName::OtherProperties(o) => {
            use OtherProperties as OP;
            use RegexFlavor as RF;

            if let RF::JavaScript | RF::Rust | RF::Pcre | RF::Ruby = flavor {
                match (flavor, o) {
                    // JavaScript accepts every property; this arm must stay
                    // first so that the exceptions below don't apply to it.
                    (RF::JavaScript, _) => {}
                    (_, OP::Changes_When_NFKC_Casefolded)
                    | (RF::Pcre, OP::Assigned)
                    | (RF::Ruby, OP::Bidi_Mirrored) => {
                        return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));
                    }
                    _ => {}
                }
                set.add_prop(RegexProperty::Other(o).negative_item(negative));
            } else if flavor == RF::Java {
                // Java only accepts the properties on an explicit list
                // (binary search, so that list is expected to be sorted).
                if pomsky_syntax::props_supported_in_java().binary_search(&o.as_str()).is_ok() {
                    set.add_prop(RegexProperty::Other(o).negative_item(negative));
                } else {
                    return Err(CompileErrorKind::Unsupported(
                        Feature::SpecificUnicodeProp,
                        flavor,
                    )
                    .at(span));
                }
            } else {
                return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));
            }
        }
    }
    Ok(())
}

1	//! Implements _character classes_. The analogue in the regex world are
2	//! [character classes](https://www.regular-expressions.info/charclass.html),
3	//! [shorthand character classes](https://www.regular-expressions.info/shorthand.html),
4	//! [non-printable characters](https://www.regular-expressions.info/nonprint.html),
5	//! [Unicode categories/scripts/blocks](https://www.regular-expressions.info/unicode.html#category),
6	//! [POSIX classes](https://www.regular-expressions.info/posixbrackets.html#class) and the
7	//! [dot](https://www.regular-expressions.info/dot.html).
8	//!
9	//! All kinds of character classes mentioned above require `[` square brackets
10	//! `]` in Pomsky. A character class can be negated by putting the keyword `not`
11	//! after the opening bracket. For example, `![.]` compiles to `\n`.
12	//!
13	//! ## Items
14	//!
15	//! A character class can contain multiple _items_, which can be
16	//!
17	//! - A __code point__, e.g. `['a']` or `[U+107]`
18	//!
19	//! - This includes [non-printable characters](https://www.regular-expressions.info/nonprint.html).\
20	//! Supported are `[n]`, `[r]`, `[t]`, `[a]`, `[e]` and `[f]`.
21	//!
22	//! - A __range of code points__. For example, `[U+10 - U+200]` matches any code
23	//! point P where `U+10 ≤ P ≤ U+200`
24	//!
25	//! - A __named character class__, which can be one of
26	//!
27	//! - a [shorthand character class](https://www.regular-expressions.info/shorthand.html).\
28	//! Supported are `[w]`, `[d]`, `[s]`, `[h]`, `[v]` and `[R]`.
29	//!
30	//! - a [POSIX class](https://www.regular-expressions.info/posixbrackets.html#class).\
31	//! Supported are `[ascii_alnum]`, `[ascii_alpha]`, `[ascii]`,
32	//! `[ascii_blank]`, `[ascii_cntrl]`, `[ascii_digit]`, `[ascii_graph]`,
33	//! `[ascii_lower]`, `[ascii_print]`, `[ascii_punct]`, ´ `[ascii_space]`,
34	//! `[ascii_upper]`, `[ascii_word]` and `[ascii_xdigit]`.\ _Note_: POSIX
35	//! classes are not Unicode aware!\ _Note_: They're converted to ranges,
36	//! e.g. `[ascii_alpha]` = `[a-zA-Z]`.
37	//!
38	//! - a [Unicode category, script or block](https://www.regular-expressions.info/unicode.html#category).\
39	//! For example: `[Letter]` compiles to `\p{Letter}`. Pomsky currently
40	//! treats any uppercase identifier except `R` as Unicode class.
41	//!
42	//! ## Compilation
43	//!
44	//! When a character class contains only a single item (e.g. `[w]`), the
45	//! character class is "flattened":
46	//!
47	//! - `['a']` = `a`
48	//! - `[w]` = `\w`
49	//! - `[Letter]` = `\p{Letter}`
50	//!
51	//! When there is more than one item or a range (e.g. `['a'-'z' '!']`), a regex
52	//! character class is created:
53	//!
54	//! - `['a'-'z' '!']` = `[a-z!]`
55	//! - `[w e Punctuation]` = `[\w\e\p{Punctuation}]`
56	//!
57	//! ### Negation
58	//!
59	//! Negation is implemented as follows:
60	//!
61	//! - Ranges and chars such as `!['a'-'z' '!' e]` are wrapped in a negative
62	//! character class, e.g. `[^a-z!\e]`.
63	//!
64	//! - The `h`, `v` and `R` shorthands are also wrapped in a negative character
65	//! class.
66	//!
67	//! - The `w`, `d` and `s` shorthands are negated by making them uppercase
68	//! (`![w]` = `\W`), except when there is more than one item in the class
69	//! (`![w '-']` = `[^\w\-]`)
70	//!
71	//! - `w`, `s`, `d` and Unicode categories/scripts/blocks can be negated
72	//! individually _within a character class_, e.g. `[s !s]` = `[\s\S]`,
73	//! `![!Latin 'a']` = `[^\P{Latin}a]`.
74	//!
75	//! When a negated character class only contains 1 item, which is also
76	//! negated, the class is removed and the negations cancel each other out:
77	//! `![!w]` = `\w`, `![!L]` = `\p{L}`.
78
79	use pomsky_syntax::exprs::{
80	Category, CharClass, CodeBlock, GroupItem, GroupName, OtherProperties, Script, ScriptExtension,
81	};
82	use pomsky_syntax::Span;
83
84	use crate::{
85	compile::{CompileResult, CompileState},
86	diagnose::{CompileError, CompileErrorKind, Feature},
87	exprs::literal,
88	options::{CompileOptions, RegexFlavor},
89	regex::{Regex, RegexProperty, RegexShorthand},
90	unicode_set::UnicodeSet,
91	};
92	pub(crate) use char_set_item::{RegexCharSet, RegexCharSetItem, RegexCompoundCharSet};
93
94	use super::Compile;
95
96	mod char_set_item;
97
98	impl Compile for CharClass {
99	fn compile(&self, options: CompileOptions, _state: &mut CompileState<'_>) -> CompileResult {	260✔
100	// when single, a `[!w]` can be turned into `![w]`	260✔
101	let is_single = self.inner.len() == 1;	260✔
102	let mut group_negative = false;	260✔
103		260✔
104	let mut set = UnicodeSet::new();	260✔
105	for item in &self.inner {	652✔
106	match *item {	401✔
107	GroupItem::Char(c) => {	205✔
108	if !is_single {	205✔
109	validate_char_in_class(c, options.flavor, self.span)?;	141✔
110	}	64✔
111	set.add_char(c)	205✔
112	}
113	GroupItem::Range { first, last } => {	34✔
114	validate_char_in_class(first, options.flavor, self.span)?;	34✔
115	validate_char_in_class(last, options.flavor, self.span)?;	34✔
116	set.add_range(first..=last);	34✔
117	}
118	GroupItem::Named { name, negative, span } => {	162✔
119	if self.unicode_aware {	162✔
120	named_class_to_regex_unicode(	132✔
121	name,	132✔
122	negative,	132✔
123	&mut group_negative,	132✔
124	is_single,	132✔
125	options.flavor,	132✔
126	span,	132✔
127	&mut set,	132✔
128	)?;	132✔
129	} else {
130	named_class_to_regex_ascii(name, negative, options.flavor, span, &mut set)?;	30✔
131	}
132	}
133	}
134	}
135
136	// this makes it possible to use code points outside the BMP in .NET,
137	// as long as there is only one in the character set
138	if let Some(only_char) = set.try_into_char() {	251✔
139	return Ok(Regex::Literal(only_char.to_string()));	63✔
140	}	188✔
141		188✔
142	Ok(Regex::CharSet(RegexCharSet { negative: group_negative, set }))	188✔
143	}	260✔
144	}
145
146	fn validate_char_in_class(char: char, flavor: RegexFlavor, span: Span) -> Result<(), CompileError> {	209✔
147	if flavor == RegexFlavor::DotNet && char > '\u{FFFF}' {	209✔
148	Err(CompileErrorKind::Unsupported(Feature::LargeCodePointInCharClass(char), flavor)	×
149	.at(span))	×
150	} else {
151	Ok(())	209✔
152	}
153	}	209✔
154
155	pub(crate) fn check_char_class_empty(	51✔
156	char_set: &RegexCharSet,	51✔
157	span: Span,	51✔
158	) -> Result<(), CompileError> {	51✔
159	if char_set.negative {	51✔
160	if let Some((group1, group2)) = char_set.set.full_props() {	50✔
161	return Err(CompileErrorKind::EmptyClassNegated { group1, group2 }.at(span));	3✔
162	}	47✔
163	}	1✔
164	Ok(())	48✔
165	}	51✔
166
167	pub fn is_ascii_only_in_flavor(group: GroupName, flavor: RegexFlavor) -> bool {	1✔
168	match flavor {	1✔
169	RegexFlavor::JavaScript => matches!(group, GroupName::Word \| GroupName::Digit),	×
170	RegexFlavor::RE2 => matches!(group, GroupName::Word \| GroupName::Digit \| GroupName::Space),	×
171	_ => false,	1✔
172	}
173	}	1✔
174
175	fn named_class_to_regex_ascii(	30✔
176	group: GroupName,	30✔
177	negative: bool,	30✔
178	flavor: RegexFlavor,	30✔
179	span: Span,	30✔
180	set: &mut UnicodeSet,	30✔
181	) -> Result<(), CompileError> {	30✔
182	// In JS, \W and \D can be used for negation because they're ascii-only	30✔
183	// Same goes for \W, \D and \S in RE2	30✔
184	if negative && !is_ascii_only_in_flavor(group, flavor) {	30✔
185	return Err(CompileErrorKind::NegativeShorthandInAsciiMode.at(span));	1✔
186	}	29✔
187		29✔
188	match group {	29✔
189	GroupName::Word => {
190	if let RegexFlavor::JavaScript \| RegexFlavor::RE2 = flavor {	8✔
191	let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };	2✔
192	set.add_prop(RegexCharSetItem::Shorthand(s));	2✔
193	} else {	6✔
194	// we already checked above if negative	6✔
195	set.add_range('a'..='z');	6✔
196	set.add_range('A'..='Z');	6✔
197	set.add_range('0'..='9');	6✔
198	set.add_char('_');	6✔
199	}	6✔
200	}
201	GroupName::Digit => {
202	if let RegexFlavor::JavaScript \| RegexFlavor::RE2 = flavor {	12✔
203	let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };	2✔
204	set.add_prop(RegexCharSetItem::Shorthand(s));	2✔
205	} else {	10✔
206	// we already checked above if negative	10✔
207	set.add_range('0'..='9');	10✔
208	}	10✔
209	}
210	GroupName::Space => {
211	if let RegexFlavor::RE2 = flavor {	8✔
212	let s = if negative { RegexShorthand::NotSpace } else { RegexShorthand::Space };	1✔
213	set.add_prop(RegexCharSetItem::Shorthand(s));	1✔
214	} else {	7✔
215	set.add_char(' ');	7✔
216	set.add_range('\x09'..='\x0D'); // \t\n\v\f\r	7✔
217	}	7✔
218	}
219	GroupName::HorizSpace => set.add_char('\t'),	×
220	GroupName::VertSpace => set.add_range('\x0A'..='\x0D'),	×
221	_ => return Err(CompileErrorKind::UnicodeInAsciiMode.at(span)),	1✔
222	}
223	Ok(())	28✔
224	}	30✔
225
226	fn named_class_to_regex_unicode(	132✔
227	group: GroupName,	132✔
228	negative: bool,	132✔
229	group_negative: &mut bool,	132✔
230	is_single: bool,	132✔
231	flavor: RegexFlavor,	132✔
232	span: Span,	132✔
233	set: &mut UnicodeSet,	132✔
234	) -> Result<(), CompileError> {	132✔
235	match group {	5✔
236	GroupName::Word => {
237	if flavor == RegexFlavor::RE2 {	27✔
238	return Err(CompileErrorKind::Unsupported(Feature::ShorthandW, flavor).at(span));	×
239	} else if flavor == RegexFlavor::JavaScript {	27✔
240	if negative {	5✔
241	if is_single {	2✔
242	*group_negative ^= true;	1✔
243	} else {	1✔
244	return Err(CompileErrorKind::Unsupported(	1✔
245	Feature::NegativeShorthandW,	1✔
246	flavor,	1✔
247	)	1✔
248	.at(span));	1✔
249	}
250	}	3✔
251	set.add_prop(	4✔
252	RegexProperty::Other(OtherProperties::Alphabetic).negative_item(false),	4✔
253	);	4✔
254	set.add_prop(RegexProperty::Category(Category::Mark).negative_item(false));	4✔
255	set.add_prop(	4✔
256	RegexProperty::Category(Category::Decimal_Number).negative_item(false),	4✔
257	);	4✔
258	set.add_prop(	4✔
259	RegexProperty::Category(Category::Connector_Punctuation).negative_item(false),	4✔
260	);	4✔
261	} else {
262	let s = if negative { RegexShorthand::NotWord } else { RegexShorthand::Word };	22✔
263	set.add_prop(RegexCharSetItem::Shorthand(s));	22✔
264	}
265	}
266	GroupName::Digit => {
267	if matches!(flavor, RegexFlavor::JavaScript \| RegexFlavor::RE2) {	25✔
268	set.add_prop(	8✔
269	RegexProperty::Category(Category::Decimal_Number).negative_item(negative),	8✔
270	);	8✔
271	} else {	8✔
272	let s = if negative { RegexShorthand::NotDigit } else { RegexShorthand::Digit };	17✔
273	set.add_prop(RegexCharSetItem::Shorthand(s));	17✔
274	}
275	}
276
277	GroupName::Space => {
278	if flavor == RegexFlavor::RE2 {	16✔
279	if negative {	4✔
280	if is_single {	1✔
281	*group_negative ^= true;	1✔
282	} else {	1✔
283	return Err(CompileErrorKind::Unsupported(	×
284	Feature::NegativeShorthandS,	×
285	flavor,	×
286	)	×
287	.at(span));	×
288	}
289	}	3✔
290
291	// [ \f\n\r\t\u000b\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]
292	set.add_prop(RegexCharSetItem::Shorthand(RegexShorthand::Space));	4✔
293	set.add_char('\x0b');	4✔
294	set.add_char('\u{a0}');	4✔
295	set.add_char('\u{1680}');	4✔
296	set.add_range('\u{2000}'..='\u{200a}');	4✔
297	set.add_range('\u{2028}'..='\u{2029}');	4✔
298	set.add_char('\u{202f}');	4✔
299	set.add_char('\u{205f}');	4✔
300	set.add_char('\u{3000}');	4✔
301	set.add_char('\u{feff}');	4✔
302	} else {
303	set.add_prop(RegexCharSetItem::Shorthand(if negative {	12✔
304	RegexShorthand::NotSpace	3✔
305	} else {
306	RegexShorthand::Space	9✔
307	}))
308	}
309	}
310
311	GroupName::HorizSpace \| GroupName::VertSpace if negative => {	×
312	return Err(CompileErrorKind::NegatedHorizVertSpace.at(span));	×
313	}
314
315	GroupName::HorizSpace \| GroupName::VertSpace
316	if matches!(flavor, RegexFlavor::Pcre \| RegexFlavor::Java) =>	5✔
317	{
318	set.add_prop(RegexCharSetItem::Shorthand(if group == GroupName::HorizSpace {	6✔
319	RegexShorthand::HorizSpace	3✔
320	} else {
321	RegexShorthand::VertSpace	3✔
322	}));
323	}
324	GroupName::HorizSpace => {
325	set.add_char('\t');	2✔
326	if flavor == RegexFlavor::Python {	2✔
327	return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));	×
328	} else {	2✔
329	set.add_prop(	2✔
330	RegexProperty::Category(Category::Space_Separator).negative_item(false),	2✔
331	);	2✔
332	}	2✔
333	}
334	GroupName::VertSpace => {	2✔
335	set.add_range('\x0A'..='\x0D');	2✔
336	set.add_char('\u{85}');	2✔
337	set.add_char('\u{2028}');	2✔
338	set.add_char('\u{2029}');	2✔
339	}	2✔
340
341	_ if flavor == RegexFlavor::Python => {	54✔
342	return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));	2✔
343	}
344	GroupName::Category(c) => {	8✔
345	if let (RegexFlavor::Rust, Category::Surrogate)	8✔
346	\| (RegexFlavor::DotNet \| RegexFlavor::RE2, Category::Cased_Letter) = (flavor, c)	8✔
347	{
348	return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));	×
349	}	8✔
350	set.add_prop(RegexProperty::Category(c).negative_item(negative));	8✔
351	}
352	GroupName::Script(s, e) => {	28✔
353	if flavor == RegexFlavor::DotNet {	28✔
354	return Err(CompileErrorKind::Unsupported(Feature::UnicodeScript, flavor).at(span));	1✔
355	}	27✔
356	if let (RegexFlavor::Rust, Script::Unknown) = (flavor, s) {	27✔
UNCOV 357	return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));	×
358	}	27✔
359
360	let set_extensions = match e {	27✔
361	ScriptExtension::Yes => match flavor {	3✔
362	RegexFlavor::Rust \| RegexFlavor::Pcre \| RegexFlavor::JavaScript => {
363	ScriptExtension::Yes	2✔
364	}
365	RegexFlavor::Java
366	\| RegexFlavor::DotNet
367	\| RegexFlavor::Ruby
368	\| RegexFlavor::Python
369	\| RegexFlavor::RE2 => {
370	return Err(CompileErrorKind::Unsupported(	1✔
371	Feature::ScriptExtensions,	1✔
372	flavor,	1✔
373	)	1✔
374	.at(span))	1✔
375	}
376	},
377	ScriptExtension::No => match flavor {	4✔
378	// PCRE is currently the only flavor when `\p{Greek}` is the same as `\p{scx=Greek}`
379	RegexFlavor::Pcre => ScriptExtension::No,	1✔
380	_ => ScriptExtension::Unspecified,	3✔
381	},
382	_ => ScriptExtension::Unspecified,	20✔
383	};
384
385	set.add_prop(RegexProperty::Script(s, set_extensions).negative_item(negative));	26✔
386	}
387	GroupName::CodeBlock(b) => match flavor {	9✔
388	RegexFlavor::DotNet \| RegexFlavor::Java \| RegexFlavor::Ruby => {
389	match (flavor, b) {	8✔
390	(RegexFlavor::Java, CodeBlock::No_Block)
391	\| (
392	// These should work since Oniguruma updated to Unicode 15.1
393	// ... but our C bindings for Oniguruma are unmaintained!
394	RegexFlavor::Ruby,
395	CodeBlock::Arabic_Extended_C
396	\| CodeBlock::CJK_Unified_Ideographs_Extension_H
397	\| CodeBlock::Cyrillic_Extended_D
398	\| CodeBlock::Devanagari_Extended_A
399	\| CodeBlock::Kaktovik_Numerals,
400	) => {
401	return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));	×
402	}
403	(RegexFlavor::DotNet, _) => {
404	let dotnet_name = b.as_str().replace("_And_", "_and_").replace('_', "");	3✔
405	if pomsky_syntax::blocks_supported_in_dotnet()	3✔
406	.binary_search(&dotnet_name.as_str())	3✔
407	.is_err()	3✔
408	{
409	return Err(	×
410	CompileErrorKind::unsupported_specific_prop_in(flavor).at(span)	×
411	);	×
412	}	3✔
413	}
414	_ => {}	5✔
415	}
416
417	set.add_prop(RegexProperty::Block(b).negative_item(negative));	8✔
418	}
419	_ => return Err(CompileErrorKind::Unsupported(Feature::UnicodeBlock, flavor).at(span)),	1✔
420	},
421	GroupName::OtherProperties(o) => {	7✔
422	use OtherProperties as OP;
423	use RegexFlavor as RF;
424
425	if let RF::JavaScript \| RF::Rust \| RF::Pcre \| RF::Ruby = flavor {	7✔
426	match (flavor, o) {	7✔
427	(RF::JavaScript, _) => {}	4✔
428	(_, OP::Changes_When_NFKC_Casefolded)
429	\| (RF::Pcre, OP::Assigned)
430	\| (RF::Ruby, OP::Bidi_Mirrored) => {
431	return Err(CompileErrorKind::unsupported_specific_prop_in(flavor).at(span));	1✔
432	}
433	_ => {}	2✔
434	}
435	set.add_prop(RegexProperty::Other(o).negative_item(negative));	6✔
436	} else if flavor == RF::Java {	×
437	if pomsky_syntax::props_supported_in_java().binary_search(&o.as_str()).is_ok() {	×
438	set.add_prop(RegexProperty::Other(o).negative_item(negative));	×
439	} else {	×
440	return Err(CompileErrorKind::Unsupported(	×
441	Feature::SpecificUnicodeProp,	×
442	flavor,	×
443	)	×
444	.at(span));	×
445	}
446	} else {
447	return Err(CompileErrorKind::Unsupported(Feature::UnicodeProp, flavor).at(span));	×
448	}
449	}
450	}
451	Ok(())	125✔
452	}	132✔

pomsky-lang / pomsky / 12359588212

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous