16083134029

Committed 05 Jul 2025 01:22AM UTC coverage: 94.358% (-0.2%) from 94.535%

Build # 16083134029

Build Type

Pull #27

github

Committed by

kaidokert

Commit Message

Review comments

Pull Request Pull Request #27: Panic free for SliceParser

Run Details

198 of 211 new or added lines in 11 files covered. (93.84%)

1 existing line in 1 file now uncovered.

2191 of 2322 relevant lines covered (94.36%)

132.6 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.84

/picojson/src/escape_processor.rs

// SPDX-License-Identifier: Apache-2.0

use crate::{shared::ParserErrorHandler, ParseError};

/// Shared utilities for processing JSON escape sequences.
///
/// `EscapeProcessor` is a unit struct that carries no data and acts purely as a
/// namespace: all of its operations are associated functions that do not take `self`.
/// These pure functions can be used by both CopyOnEscape and StreamingBuffer components.
pub struct EscapeProcessor;
use crate::ujson;
use ujson::EventToken;

impl EscapeProcessor {
    /// Convert an escape token from the tokenizer to the corresponding escape character.
    ///
    /// The result is the byte that follows the backslash in the escaped source text
    /// (for example `b'n'` for `\n`), not the unescaped value. Use
    /// `process_simple_escape` to turn that byte into the unescaped value.
    ///
    /// # Arguments
    /// * `escape_token` - The escape token from the tokenizer
    ///
    /// # Returns
    /// The character that follows the backslash, or `None` if the token is not a simple escape.
    ///
    /// # Examples
    /// ```ignore
    /// // Internal API - see unit tests for usage examples
    /// assert_eq!(EscapeProcessor::token_to_escape_char(&EventToken::EscapeNewline).unwrap(), b'n');
    /// ```
    pub fn token_to_escape_char(escape_token: &ujson::EventToken) -> Option<u8> {
        match escape_token {
            EventToken::EscapeQuote => Some(b'"'),
            EventToken::EscapeBackslash => Some(b'\\'),
            EventToken::EscapeSlash => Some(b'/'),
            EventToken::EscapeBackspace => Some(b'b'),
            EventToken::EscapeFormFeed => Some(b'f'),
            EventToken::EscapeNewline => Some(b'n'),
            EventToken::EscapeCarriageReturn => Some(b'r'),
            EventToken::EscapeTab => Some(b't'),
            // Every other token (including Unicode escapes) is not a simple escape.
            _ => None,
        }
    }

    /// Process an escape token directly to the unescaped byte value.
    ///
    /// This is a convenience method that combines `token_to_escape_char` and
    /// `process_simple_escape`.
    ///
    /// # Arguments
    /// * `escape_token` - The escape token from the tokenizer
    ///
    /// # Returns
    /// The unescaped byte value, or an error if the token is invalid or not a simple escape.
    ///
    /// # Examples
    /// ```ignore
    /// // Internal API - see unit tests for usage examples
    /// assert_eq!(EscapeProcessor::process_escape_token(&EventToken::EscapeNewline).unwrap(), b'\n');
    /// ```
    pub fn process_escape_token(escape_token: &ujson::EventToken) -> Result<u8, ParseError> {
        // The error is built lazily so the common (valid token) path never constructs it.
        let escape_char = Self::token_to_escape_char(escape_token)
            .ok_or_else(|| ParserErrorHandler::unexpected_state("Invalid escape token"))?;
        Self::process_simple_escape(escape_char)
    }

    /// Process a simple escape sequence character and return the unescaped byte.
    ///
    /// # Arguments
    /// * `escape_char` - The character following the backslash in an escape sequence
    ///
    /// # Returns
    /// The unescaped byte value, or `ParseError::InvalidEscapeSequence` if the
    /// character does not name one of the eight simple escapes.
    ///
    /// # Examples
    /// ```ignore
    /// // Internal API - see unit tests for usage examples
    /// assert_eq!(EscapeProcessor::process_simple_escape(b'n').unwrap(), b'\n');
    /// ```
    pub fn process_simple_escape(escape_char: u8) -> Result<u8, ParseError> {
        match escape_char {
            b'n' => Ok(b'\n'),
            b't' => Ok(b'\t'),
            b'r' => Ok(b'\r'),
            b'\\' => Ok(b'\\'),
            b'"' => Ok(b'"'),
            b'/' => Ok(b'/'),
            b'b' => Ok(0x08), // Backspace
            b'f' => Ok(0x0C), // Form feed
            _ => Err(ParseError::InvalidEscapeSequence),
        }
    }

    /// Validate that a byte represents a valid hexadecimal digit.
    ///
    /// Both lowercase (`a`-`f`) and uppercase (`A`-`F`) letters are accepted.
    ///
    /// # Arguments
    /// * `byte` - The byte to validate
    ///
    /// # Returns
    /// The numeric value (0-15) of the hex digit, or `ParseError::InvalidUnicodeHex`
    /// if the byte is not an ASCII hex digit.
    pub fn validate_hex_digit(byte: u8) -> Result<u32, ParseError> {
        // Each arm's range pattern guarantees the subtraction cannot underflow; the
        // wrapping operations only keep every arm free of a potential panic path.
        let value = match byte {
            b'0'..=b'9' => byte.wrapping_sub(b'0'),
            b'a'..=b'f' => byte.wrapping_sub(b'a').wrapping_add(10),
            b'A'..=b'F' => byte.wrapping_sub(b'A').wrapping_add(10),
            _ => return Err(ParseError::InvalidUnicodeHex),
        };
        Ok(u32::from(value))
    }

    /// Process a Unicode escape sequence (\uXXXX) and return the UTF-8 encoded bytes.
    ///
    /// Four hex digits can express at most U+FFFF, which needs at most three UTF-8
    /// bytes, so a 4-byte buffer is always large enough. A smaller buffer works as
    /// long as the encoded character fits; if it does not, an error is returned
    /// instead of panicking.
    ///
    /// # Arguments
    /// * `hex_slice` - A 4-byte slice containing the hexadecimal digits
    /// * `utf8_buffer` - A buffer to write the UTF-8 encoded result (4 bytes always suffice)
    ///
    /// # Returns
    /// A slice of `utf8_buffer` containing the UTF-8 encoded bytes.
    ///
    /// # Errors
    /// * `ParseError::InvalidUnicodeHex` - `hex_slice` is not exactly 4 bytes long or
    ///   contains a byte that is not a hex digit
    /// * `ParseError::InvalidUnicodeCodepoint` - the value is not a Unicode scalar
    ///   value (i.e. it is a surrogate, U+D800..=U+DFFF)
    /// * the `ParserErrorHandler::unexpected_state` error - `utf8_buffer` is too small
    ///   to hold the encoded character
    ///
    /// # Examples
    /// ```ignore
    /// // Internal API - see unit tests for usage examples
    /// let mut buffer = [0u8; 4];
    /// let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer).unwrap();
    /// assert_eq!(result, b"A");
    /// ```
    pub fn process_unicode_escape<'a>(
        hex_slice: &[u8],
        utf8_buffer: &'a mut [u8],
    ) -> Result<&'a [u8], ParseError> {
        if hex_slice.len() != 4 {
            return Err(ParseError::InvalidUnicodeHex);
        }

        // Fold the hex digits into a code point, most significant digit first.
        // The first invalid digit aborts the fold with its error.
        let codepoint = hex_slice.iter().try_fold(0u32, |acc, &byte| {
            Self::validate_hex_digit(byte).map(|digit| (acc << 4) | digit)
        })?;

        // `char::from_u32` returns `None` for surrogate values.
        let ch = char::from_u32(codepoint).ok_or(ParseError::InvalidUnicodeCodepoint)?;

        // `char::encode_utf8` panics if the destination is too short, so carve out
        // exactly the required number of bytes with a checked slice first.
        let destination = utf8_buffer.get_mut(..ch.len_utf8()).ok_or_else(|| {
            ParserErrorHandler::unexpected_state("UTF-8 buffer too small for Unicode escape")
        })?;
        let encoded = ch.encode_utf8(destination);
        Ok(encoded.as_bytes())
    }
}

/// Shared Unicode escape hex digit collector for both parsers.
///
/// Provides a common interface for collecting the 4 hex digits in \uXXXX sequences
/// one at a time and then converting them to UTF-8.
#[derive(Debug)]
pub struct UnicodeEscapeCollector {
    /// Buffer holding the collected hex digits as the raw ASCII bytes that were
    /// passed to `add_hex_digit` (not their numeric values)
    hex_buffer: [u8; 4],
    /// Number of digits stored so far, which is also the next write position in
    /// `hex_buffer` (0-4); 4 means the sequence is complete
    hex_pos: usize,
}

impl UnicodeEscapeCollector {
    /// Create a new, empty Unicode escape collector.
    pub fn new() -> Self {
        Self {
            hex_buffer: [0u8; 4],
            hex_pos: 0,
        }
    }

    /// Reset the collector for a new Unicode escape sequence.
    ///
    /// Only the write position is rewound. Stale digits stay in the buffer, but
    /// they cannot be observed: `process_to_utf8` refuses to run until four new
    /// digits have overwritten them.
    pub fn reset(&mut self) {
        self.hex_pos = 0;
    }

    /// Add a hex digit to the collector.
    ///
    /// # Arguments
    /// * `digit` - The ASCII byte of the hex digit (`0`-`9`, `a`-`f`, `A`-`F`)
    ///
    /// # Returns
    /// `Ok(true)` if this digit completes the 4-digit sequence, `Ok(false)` otherwise.
    ///
    /// # Errors
    /// * the error from `EscapeProcessor::validate_hex_digit` if `digit` is not a hex
    ///   digit; this is checked first, so it takes precedence over the next error
    /// * the `ParserErrorHandler::unexpected_state` error if four digits are already
    ///   stored
    ///
    /// The collector is left unchanged whenever an error is returned.
    pub fn add_hex_digit(&mut self, digit: u8) -> Result<bool, ParseError> {
        // Validate the hex digit first
        EscapeProcessor::validate_hex_digit(digit)?;

        // A checked lookup doubles as the "already full" test: the write position
        // is out of range exactly when all four slots have been filled.
        let slot = self.hex_buffer.get_mut(self.hex_pos).ok_or_else(|| {
            ParserErrorHandler::unexpected_state("Too many hex digits in Unicode escape")
        })?;
        *slot = digit;

        // Cannot overflow (the position is below 4 here); saturating keeps this panic-free.
        self.hex_pos = self.hex_pos.saturating_add(1);

        Ok(self.hex_pos == self.hex_buffer.len())
    }

    /// Process the collected hex digits and return UTF-8 bytes.
    ///
    /// Should only be called once `add_hex_digit` has returned `Ok(true)`, i.e. after
    /// all four digits have been collected.
    ///
    /// # Arguments
    /// * `utf8_buffer` - A buffer to write the UTF-8 encoded result into
    ///
    /// # Errors
    /// * the `ParserErrorHandler::incomplete_unicode_escape` error if fewer than four
    ///   digits have been collected
    /// * any error returned by `EscapeProcessor::process_unicode_escape`
    pub fn process_to_utf8<'a>(&self, utf8_buffer: &'a mut [u8]) -> Result<&'a [u8], ParseError> {
        if self.hex_pos != 4 {
            return Err(ParserErrorHandler::incomplete_unicode_escape());
        }

        EscapeProcessor::process_unicode_escape(&self.hex_buffer, utf8_buffer)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every simple escape character maps to its unescaped byte.
    #[test]
    fn test_simple_escapes() {
        // (character after the backslash, expected unescaped byte)
        let cases: [(u8, u8); 8] = [
            (b'n', b'\n'),
            (b't', b'\t'),
            (b'r', b'\r'),
            (b'\\', b'\\'),
            (b'"', b'"'),
            (b'/', b'/'),
            (b'b', 0x08),
            (b'f', 0x0C),
        ];
        for &(escape_char, expected) in cases.iter() {
            assert_eq!(
                EscapeProcessor::process_simple_escape(escape_char).unwrap(),
                expected
            );
        }
    }

    /// Characters that do not name a simple escape are rejected.
    #[test]
    fn test_invalid_simple_escape() {
        for &escape_char in [b'x', b'z', b'1'].iter() {
            assert!(EscapeProcessor::process_simple_escape(escape_char).is_err());
        }
    }

    /// Hex digits of either case are accepted; everything else is rejected.
    #[test]
    fn test_hex_digit_validation() {
        // Valid digits
        let valid: [(u8, u32); 6] = [
            (b'0', 0),
            (b'9', 9),
            (b'a', 10),
            (b'f', 15),
            (b'A', 10),
            (b'F', 15),
        ];
        for &(byte, expected) in valid.iter() {
            assert_eq!(EscapeProcessor::validate_hex_digit(byte).unwrap(), expected);
        }

        // Invalid digits
        for &byte in [b'g', b'G', b'z', b' '].iter() {
            assert!(EscapeProcessor::validate_hex_digit(byte).is_err());
        }
    }

    #[test]
    fn test_unicode_escape_basic() {
        let mut buffer = [0u8; 4];

        // Test basic ASCII character \u0041 -> 'A'
        let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer).unwrap();
        assert_eq!(result, b"A");

        // Test another ASCII character \u0048 -> 'H'
        let result = EscapeProcessor::process_unicode_escape(b"0048", &mut buffer).unwrap();
        assert_eq!(result, b"H");
    }

    #[test]
    fn test_unicode_escape_multibyte() {
        let mut buffer = [0u8; 4];

        // Test Greek alpha \u03B1 -> 'α' (2 bytes in UTF-8: 0xCE, 0xB1)
        let result = EscapeProcessor::process_unicode_escape(b"03B1", &mut buffer).unwrap();
        assert_eq!(result, "α".as_bytes());

        // Lowercase hex digits decode to the same character
        let result = EscapeProcessor::process_unicode_escape(b"03b1", &mut buffer).unwrap();
        assert_eq!(result, "α".as_bytes());

        // \u1F60 is U+1F60 (3 bytes in UTF-8). A JSON escape carries exactly four
        // hex digits, so this is NOT the emoji U+1F60A; code points above U+FFFF
        // are written as a surrogate pair of two \uXXXX escapes in JSON.
        let result = EscapeProcessor::process_unicode_escape(b"1F60", &mut buffer).unwrap();
        assert_eq!(result, "\u{1F60}".as_bytes());
    }

    #[test]
    fn test_unicode_escape_invalid_hex() {
        let mut buffer = [0u8; 4];

        // Invalid hex characters
        assert!(EscapeProcessor::process_unicode_escape(b"00GG", &mut buffer).is_err());
        assert!(EscapeProcessor::process_unicode_escape(b"ZZZZ", &mut buffer).is_err());

        // Wrong length
        assert!(EscapeProcessor::process_unicode_escape(b"123", &mut buffer).is_err());
        assert!(EscapeProcessor::process_unicode_escape(b"12345", &mut buffer).is_err());
    }

    #[test]
    fn test_unicode_escape_invalid_codepoint() {
        let mut buffer = [0u8; 4];

        // Surrogates (U+D800..=U+DFFF) are not Unicode scalar values, so a lone
        // surrogate escape must be rejected. Check both ends of each half.
        let surrogates: [&[u8]; 4] = [b"D800", b"DBFF", b"DC00", b"DFFF"];
        for hex in surrogates.iter() {
            assert!(matches!(
                EscapeProcessor::process_unicode_escape(hex, &mut buffer),
                Err(ParseError::InvalidUnicodeCodepoint)
            ));
        }

        // The values bordering the surrogate range, and both ends of the BMP, are valid
        let result = EscapeProcessor::process_unicode_escape(b"0000", &mut buffer).unwrap();
        assert_eq!(result, "\0".as_bytes());
        let result = EscapeProcessor::process_unicode_escape(b"D7FF", &mut buffer).unwrap();
        assert_eq!(result, "\u{D7FF}".as_bytes());
        let result = EscapeProcessor::process_unicode_escape(b"E000", &mut buffer).unwrap();
        assert_eq!(result, "\u{E000}".as_bytes());
        let result = EscapeProcessor::process_unicode_escape(b"FFFF", &mut buffer).unwrap();
        assert_eq!(result, "\u{FFFF}".as_bytes());
    }

    #[test]
    fn test_token_to_escape_char() {
        use crate::ujson::EventToken;

        // Test all valid escape tokens: (token, character after the backslash)
        let cases = [
            (EventToken::EscapeQuote, b'"'),
            (EventToken::EscapeBackslash, b'\\'),
            (EventToken::EscapeSlash, b'/'),
            (EventToken::EscapeBackspace, b'b'),
            (EventToken::EscapeFormFeed, b'f'),
            (EventToken::EscapeNewline, b'n'),
            (EventToken::EscapeCarriageReturn, b'r'),
            (EventToken::EscapeTab, b't'),
        ];
        for (token, expected) in cases.iter() {
            assert_eq!(
                EscapeProcessor::token_to_escape_char(token).unwrap(),
                *expected
            );
        }

        // Test invalid token
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::String),
            None
        );
    }

    #[test]
    fn test_process_escape_token() {
        use crate::ujson::EventToken;

        // Test valid escape tokens that produce correct unescaped bytes
        let cases = [
            (EventToken::EscapeQuote, b'"'),
            (EventToken::EscapeBackslash, b'\\'),
            (EventToken::EscapeSlash, b'/'),
            (EventToken::EscapeBackspace, 0x08),
            (EventToken::EscapeFormFeed, 0x0C),
            (EventToken::EscapeNewline, b'\n'),
            (EventToken::EscapeCarriageReturn, b'\r'),
            (EventToken::EscapeTab, b'\t'),
        ];
        for (token, expected) in cases.iter() {
            assert_eq!(
                EscapeProcessor::process_escape_token(token).unwrap(),
                *expected
            );
        }

        // Test invalid token
        assert!(EscapeProcessor::process_escape_token(&EventToken::String).is_err());
    }

    #[test]
    fn test_unicode_escape_collector_basic() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Add hex digits for \u0041 -> 'A'
        assert!(!collector.add_hex_digit(b'0').unwrap()); // Not complete yet
        assert!(!collector.add_hex_digit(b'0').unwrap()); // Not complete yet
        assert!(!collector.add_hex_digit(b'4').unwrap()); // Not complete yet
        assert!(collector.add_hex_digit(b'1').unwrap()); // Complete!

        // Process to UTF-8
        let result = collector.process_to_utf8(&mut utf8_buffer).unwrap();
        assert_eq!(result, b"A");
    }

    #[test]
    fn test_unicode_escape_collector_invalid_hex() {
        let mut collector = UnicodeEscapeCollector::new();

        // Valid digits first
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'0').unwrap());

        // Invalid hex digit should fail
        assert!(collector.add_hex_digit(b'G').is_err());
    }

    #[test]
    fn test_unicode_escape_collector_too_many_digits() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Fill the collector with \u0041
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'4').unwrap());
        assert!(collector.add_hex_digit(b'1').unwrap());

        // A fifth (valid) digit must be rejected...
        assert!(collector.add_hex_digit(b'2').is_err());

        // ...without disturbing the four digits already collected
        let result = collector.process_to_utf8(&mut utf8_buffer).unwrap();
        assert_eq!(result, b"A");
    }

    #[test]
    fn test_unicode_escape_collector_reset() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Add some digits
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'1').unwrap());

        // Reset should clear state
        collector.reset();

        // Should be able to start fresh: a full new sequence is needed, and the
        // digits collected before the reset must not leak into the result
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'4').unwrap());
        assert!(collector.add_hex_digit(b'8').unwrap());

        let result = collector.process_to_utf8(&mut utf8_buffer).unwrap();
        assert_eq!(result, b"H");
    }

    #[test]
    fn test_unicode_escape_collector_multibyte() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Add hex digits for \u03B1 -> 'α' (Greek alpha)
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'3').unwrap());
        assert!(!collector.add_hex_digit(b'B').unwrap());
        assert!(collector.add_hex_digit(b'1').unwrap());

        let result = collector.process_to_utf8(&mut utf8_buffer).unwrap();
        assert_eq!(result, "α".as_bytes());
    }

    #[test]
    fn test_unicode_escape_collector_incomplete_processing() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Add only 2 digits
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'0').unwrap());

        // Should fail to process incomplete sequence
        assert!(collector.process_to_utf8(&mut utf8_buffer).is_err());
    }
}

/// Shared implementation for processing a Unicode escape sequence.
///
/// This function centralizes the logic for handling `\uXXXX` escapes, which is
/// common to both the pull-based and stream-based parsers. It uses a generic
/// `hex_slice_provider` to remain independent of the underlying buffer implementation
/// (`SliceInputBuffer` vs. `DirectBuffer`).
///
/// The collector is reset before the digits are fed into it, so the same collector
/// can be reused for consecutive escapes without the caller having to reset it.
///
/// # Arguments
/// * `current_pos` - The parser's current position in the input buffer, right after the 4 hex digits.
/// * `unicode_escape_collector` - A mutable reference to the shared `UnicodeEscapeCollector`.
/// * `hex_slice_provider` - A closure that takes a start and end position and returns the hex digit slice.
/// * `utf8_buf` - A buffer to write the UTF-8 encoded result into.
///
/// # Returns
/// A tuple containing the resulting UTF-8 byte slice and the start position of the escape sequence (`\uXXXX`).
///
/// # Errors
/// * any error returned by `hex_slice_provider`
/// * the `ParserErrorHandler::invalid_unicode_length` error if the provided slice is
///   not exactly 4 bytes long
/// * any error returned by the collector while adding the digits or converting them
///   to UTF-8
pub(crate) fn process_unicode_escape_sequence<'a, F>(
    current_pos: usize,
    unicode_escape_collector: &mut UnicodeEscapeCollector,
    mut hex_slice_provider: F,
    utf8_buf: &'a mut [u8; 4],
) -> Result<(&'a [u8], usize), ParseError>
where
    F: FnMut(usize, usize) -> Result<&'a [u8], ParseError>,
{
    let (hex_start, hex_end, escape_start_pos) =
        crate::shared::ContentRange::unicode_escape_bounds(current_pos);

    // Extract the 4 hex digits from the buffer using the provider
    let hex_slice = hex_slice_provider(hex_start, hex_end)?;

    if hex_slice.len() != 4 {
        return Err(ParserErrorHandler::invalid_unicode_length());
    }

    // Start from an empty collector: a previous successful call leaves it full, and a
    // previous failed call may leave it partially filled. Either would make the
    // digits below overflow it or be rejected.
    unicode_escape_collector.reset();

    // Feed hex digits to the shared collector
    for &hex_digit in hex_slice {
        unicode_escape_collector.add_hex_digit(hex_digit)?;
    }

    // Process the complete sequence to UTF-8
    let utf8_bytes = unicode_escape_collector.process_to_utf8(utf8_buf)?;

    Ok((utf8_bytes, escape_start_pos))
}

1	// SPDX-License-Identifier: Apache-2.0
2
3	use crate::{shared::ParserErrorHandler, ParseError};
4
5	/// Shared utilities for processing JSON escape sequences.
6	/// This module contains pure functions for escape processing that can be used
7	/// by both CopyOnEscape and StreamingBuffer components.
8	pub struct EscapeProcessor;
9	use crate::ujson;
10	use ujson::EventToken;
11
12	impl EscapeProcessor {
13	/// Convert an escape token from the tokenizer to the corresponding escape character.
14	/// This extracts the character that follows the backslash in the escape sequence.
15	///
16	/// # Arguments
17	/// * `escape_token` - The escape token from the tokenizer
18	///
19	/// # Returns
20	/// The character that follows the backslash, or None if the token is not a simple escape.
21	///
22	/// # Examples
23	/// ```ignore
24	/// // Internal API - see unit tests for usage examples
25	/// assert_eq!(EscapeProcessor::token_to_escape_char(&EventToken::EscapeNewline).unwrap(), b'n');
26	/// ```
27	pub fn token_to_escape_char(escape_token: &ujson::EventToken) -> Option<u8> {	42✔
28	match escape_token {	42✔
29	EventToken::EscapeQuote => Some(b'"'),	3✔
30	EventToken::EscapeBackslash => Some(b'\\'),	2✔
31	EventToken::EscapeSlash => Some(b'/'),	2✔
32	EventToken::EscapeBackspace => Some(b'b'),	2✔
33	EventToken::EscapeFormFeed => Some(b'f'),	2✔
34	EventToken::EscapeNewline => Some(b'n'),	17✔
35	EventToken::EscapeCarriageReturn => Some(b'r'),	4✔
36	EventToken::EscapeTab => Some(b't'),	8✔
37	_ => None,	2✔
38	}
39	}	42✔
40
41	/// Process an escape token directly to the unescaped byte value.
42	/// This is a convenience method that combines token_to_escape_char and process_simple_escape.
43	///
44	/// # Arguments
45	/// * `escape_token` - The escape token from the tokenizer
46	///
47	/// # Returns
48	/// The unescaped byte value, or an error if the token is invalid or not a simple escape.
49	///
50	/// # Examples
51	/// ```ignore
52	/// // Internal API - see unit tests for usage examples
53	/// assert_eq!(EscapeProcessor::process_escape_token(&EventToken::EscapeNewline).unwrap(), b'\n');
54	/// ```
55	pub fn process_escape_token(escape_token: &ujson::EventToken) -> Result<u8, ParseError> {	33✔
56	let escape_char = Self::token_to_escape_char(escape_token)	33✔
57	.ok_or(ParserErrorHandler::unexpected_state("Invalid escape token"))?;	33✔
58	Self::process_simple_escape(escape_char)	32✔
59	}	33✔
60
61	/// Process a simple escape sequence character and return the unescaped byte.
62	///
63	/// # Arguments
64	/// * `escape_char` - The character following the backslash in an escape sequence
65	///
66	/// # Returns
67	/// The unescaped byte value, or an error if the escape sequence is invalid.
68	///
69	/// # Examples
70	/// ```ignore
71	/// // Internal API - see unit tests for usage examples
72	/// assert_eq!(EscapeProcessor::process_simple_escape(b'n').unwrap(), b'\n');
73	/// ```
74	pub fn process_simple_escape(escape_char: u8) -> Result<u8, ParseError> {	43✔
75	match escape_char {	43✔
76	b'n' => Ok(b'\n'),	17✔
77	b't' => Ok(b'\t'),	8✔
78	b'r' => Ok(b'\r'),	4✔
79	b'\\' => Ok(b'\\'),	2✔
80	b'"' => Ok(b'"'),	3✔
81	b'/' => Ok(b'/'),	2✔
82	b'b' => Ok(0x08), // Backspace	2✔
83	b'f' => Ok(0x0C), // Form feed	2✔
84	_ => Err(ParseError::InvalidEscapeSequence),	3✔
85	}
86	}	43✔
87
88	/// Validate that a byte represents a valid hexadecimal digit.
89	///
90	/// # Arguments
91	/// * `byte` - The byte to validate
92	///
93	/// # Returns
94	/// The numeric value (0-15) of the hex digit, or an error if invalid.
95	pub fn validate_hex_digit(byte: u8) -> Result<u32, ParseError> {	82✔
96	match byte {	82✔
97	b'0'..=b'9' => Ok((byte - b'0') as u32),	81✔
98	b'a'..=b'f' => Ok(byte.wrapping_sub(b'a').wrapping_add(10) as u32),	4✔
99	b'A'..=b'F' => Ok(byte.wrapping_sub(b'A').wrapping_add(10) as u32),	15✔
100	_ => Err(ParseError::InvalidUnicodeHex),	7✔
101	}
102	}	82✔
103
104	/// Process a Unicode escape sequence (\uXXXX) and return the UTF-8 encoded bytes.
105	///
106	/// # Arguments
107	/// * `hex_slice` - A 4-byte slice containing the hexadecimal digits
108	/// * `utf8_buffer` - A buffer to write the UTF-8 encoded result (must be at least 4 bytes)
109	///
110	/// # Returns
111	/// A slice containing the UTF-8 encoded bytes, or an error if the escape is invalid.
112	///
113	/// # Examples
114	/// ```ignore
115	/// // Internal API - see unit tests for usage examples
116	/// let mut buffer = [0u8; 4];
117	/// let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer).unwrap();
118	/// assert_eq!(result, b"A");
119	/// ```
120	pub fn process_unicode_escape<'a>(	14✔
121	hex_slice: &[u8],	14✔
122	utf8_buffer: &'a mut [u8],	14✔
123	) -> Result<&'a [u8], ParseError> {	14✔
124	if hex_slice.len() != 4 {	14✔
125	return Err(ParseError::InvalidUnicodeHex);	2✔
126	}	12✔
127
128	// Convert hex bytes to Unicode codepoint
129	let mut codepoint = 0u32;	12✔
130	for &byte in hex_slice {	54✔
131	let digit = Self::validate_hex_digit(byte)?;	44✔
132	codepoint = (codepoint << 4) \| digit;	42✔
133	}
134
135	// Convert codepoint to character and encode as UTF-8
136	let ch = char::from_u32(codepoint).ok_or(ParseError::InvalidUnicodeCodepoint)?;	10✔
137	let utf8_str = ch.encode_utf8(utf8_buffer);	10✔
138	Ok(utf8_str.as_bytes())	10✔
139	}	14✔
140	}
141
142	/// Shared Unicode escape hex digit collector for both parsers.
143	/// Provides a common interface for collecting the 4 hex digits in \uXXXX sequences.
144	#[derive(Debug)]
145	pub struct UnicodeEscapeCollector {
146	/// Buffer to collect the 4 hex digits
147	hex_buffer: [u8; 4],
148	/// Current position in the hex buffer (0-4)
149	hex_pos: usize,
150	}
151
152	impl UnicodeEscapeCollector {
153	/// Create a new Unicode escape collector
154	pub fn new() -> Self {	91✔
155	Self {	91✔
156	hex_buffer: [0u8; 4],	91✔
157	hex_pos: 0,	91✔
158	}	91✔
159	}	91✔
160
161	/// Reset the collector for a new Unicode escape sequence
162	pub fn reset(&mut self) {	4✔
163	self.hex_pos = 0;	4✔
164	}	4✔
165
166	/// Add a hex digit to the collector
167	/// Returns true if this completes the 4-digit sequence
168	pub fn add_hex_digit(&mut self, digit: u8) -> Result<bool, ParseError> {	28✔
169	// Validate the hex digit first
170	EscapeProcessor::validate_hex_digit(digit)?;	28✔
171
172	if self.hex_pos >= 4 {	27✔
173	return Err(ParserErrorHandler::unexpected_state(	×
174	"Too many hex digits in Unicode escape",	×
175	));	×
176	}	27✔
177
178	if let Some(slot) = self.hex_buffer.get_mut(self.hex_pos) {	27✔
179	*slot = digit;	27✔
180	} else {	27✔
NEW 181	return Err(ParseError::InvalidUnicodeHex);	×
182	}
183
184	self.hex_pos = self.hex_pos.saturating_add(1);	27✔
185
186	Ok(self.hex_pos == 4)	27✔
187	}	28✔
188
189	/// Process the collected hex digits and return UTF-8 bytes
190	/// Should only be called when is_complete() returns true
191	pub fn process_to_utf8<'a>(&self, utf8_buffer: &'a mut [u8]) -> Result<&'a [u8], ParseError> {	6✔
192	if self.hex_pos != 4 {	6✔
193	return Err(ParserErrorHandler::incomplete_unicode_escape());	1✔
194	}	5✔
195
196	EscapeProcessor::process_unicode_escape(&self.hex_buffer, utf8_buffer)	5✔
197	}	6✔
198	}
199
200	#[cfg(test)]
201	mod tests {
202	use super::*;
203
204	#[test]
205	fn test_simple_escapes() {	1✔
206	assert_eq!(EscapeProcessor::process_simple_escape(b'n').unwrap(), b'\n');	1✔
207	assert_eq!(EscapeProcessor::process_simple_escape(b't').unwrap(), b'\t');	1✔
208	assert_eq!(EscapeProcessor::process_simple_escape(b'r').unwrap(), b'\r');	1✔
209	assert_eq!(	1✔
210	EscapeProcessor::process_simple_escape(b'\\').unwrap(),	1✔
211	b'\\'
212	);
213	assert_eq!(EscapeProcessor::process_simple_escape(b'"').unwrap(), b'"');	1✔
214	assert_eq!(EscapeProcessor::process_simple_escape(b'/').unwrap(), b'/');	1✔
215	assert_eq!(EscapeProcessor::process_simple_escape(b'b').unwrap(), 0x08);	1✔
216	assert_eq!(EscapeProcessor::process_simple_escape(b'f').unwrap(), 0x0C);	1✔
217	}	1✔
218
219	#[test]
220	fn test_invalid_simple_escape() {	1✔
221	assert!(EscapeProcessor::process_simple_escape(b'x').is_err());	1✔
222	assert!(EscapeProcessor::process_simple_escape(b'z').is_err());	1✔
223	assert!(EscapeProcessor::process_simple_escape(b'1').is_err());	1✔
224	}	1✔
225
226	#[test]
227	fn test_hex_digit_validation() {	1✔
228	// Valid digits
229	assert_eq!(EscapeProcessor::validate_hex_digit(b'0').unwrap(), 0);	1✔
230	assert_eq!(EscapeProcessor::validate_hex_digit(b'9').unwrap(), 9);	1✔
231	assert_eq!(EscapeProcessor::validate_hex_digit(b'a').unwrap(), 10);	1✔
232	assert_eq!(EscapeProcessor::validate_hex_digit(b'f').unwrap(), 15);	1✔
233	assert_eq!(EscapeProcessor::validate_hex_digit(b'A').unwrap(), 10);	1✔
234	assert_eq!(EscapeProcessor::validate_hex_digit(b'F').unwrap(), 15);	1✔
235
236	// Invalid digits
237	assert!(EscapeProcessor::validate_hex_digit(b'g').is_err());	1✔
238	assert!(EscapeProcessor::validate_hex_digit(b'G').is_err());	1✔
239	assert!(EscapeProcessor::validate_hex_digit(b'z').is_err());	1✔
240	assert!(EscapeProcessor::validate_hex_digit(b' ').is_err());	1✔
241	}	1✔
242
243	#[test]
244	fn test_unicode_escape_basic() {	1✔
245	let mut buffer = [0u8; 4];	1✔
246
247	// Test basic ASCII character \u0041 -> 'A'
248	let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer).unwrap();	1✔
249	assert_eq!(result, b"A");	1✔
250
251	// Test another ASCII character \u0048 -> 'H'
252	let result = EscapeProcessor::process_unicode_escape(b"0048", &mut buffer).unwrap();	1✔
253	assert_eq!(result, b"H");	1✔
254	}	1✔
255
256	#[test]
257	fn test_unicode_escape_multibyte() {	1✔
258	let mut buffer = [0u8; 4];	1✔
259
260	// Test Greek alpha \u03B1 -> 'α' (2 bytes in UTF-8: 0xCE, 0xB1)
261	let result = EscapeProcessor::process_unicode_escape(b"03B1", &mut buffer).unwrap();	1✔
262	assert_eq!(result, "α".as_bytes());	1✔
263
264	// Test emoji \u1F60A -> '😊' (4 bytes in UTF-8)
265	let _result = EscapeProcessor::process_unicode_escape(b"1F60", &mut buffer).unwrap();	1✔
266	// Note: This is actually incomplete - \u1F60A requires surrogate pairs
267	// But for basic testing this verifies the hex parsing works
268	}	1✔
269
270	#[test]
271	fn test_unicode_escape_invalid_hex() {	1✔
272	let mut buffer = [0u8; 4];	1✔
273
274	// Invalid hex characters
275	assert!(EscapeProcessor::process_unicode_escape(b"00GG", &mut buffer).is_err());	1✔
276	assert!(EscapeProcessor::process_unicode_escape(b"ZZZZ", &mut buffer).is_err());	1✔
277
278	// Wrong length
279	assert!(EscapeProcessor::process_unicode_escape(b"123", &mut buffer).is_err());	1✔
280	assert!(EscapeProcessor::process_unicode_escape(b"12345", &mut buffer).is_err());	1✔
281	}	1✔
282
283	#[test]
284	fn test_unicode_escape_invalid_codepoint() {	1✔
285	let mut buffer = [0u8; 4];	1✔
286
287	// Note: Most values in the BMP are valid Unicode codepoints
288	// Invalid surrogate codepoints would be D800-DFFF but they're complex to test
289	// For now, test basic valid cases to ensure the function works
290	let result = EscapeProcessor::process_unicode_escape(b"0000", &mut buffer).unwrap();	1✔
291	assert_eq!(result, "\0".as_bytes());	1✔
292	}	1✔
293
294	#[test]
295	fn test_token_to_escape_char() {	1✔
296	use crate::ujson::EventToken;
297
298	// Test all valid escape tokens
299	assert_eq!(	1✔
300	EscapeProcessor::token_to_escape_char(&EventToken::EscapeQuote).unwrap(),	1✔
301	b'"'
302	);
303	assert_eq!(	1✔
304	EscapeProcessor::token_to_escape_char(&EventToken::EscapeBackslash).unwrap(),	1✔
305	b'\\'
306	);
307	assert_eq!(	1✔
308	EscapeProcessor::token_to_escape_char(&EventToken::EscapeSlash).unwrap(),	1✔
309	b'/'
310	);
311	assert_eq!(	1✔
312	EscapeProcessor::token_to_escape_char(&EventToken::EscapeBackspace).unwrap(),	1✔
313	b'b'
314	);
315	assert_eq!(	1✔
316	EscapeProcessor::token_to_escape_char(&EventToken::EscapeFormFeed).unwrap(),	1✔
317	b'f'
318	);
319	assert_eq!(	1✔
320	EscapeProcessor::token_to_escape_char(&EventToken::EscapeNewline).unwrap(),	1✔
321	b'n'
322	);
323	assert_eq!(	1✔
324	EscapeProcessor::token_to_escape_char(&EventToken::EscapeCarriageReturn).unwrap(),	1✔
325	b'r'
326	);
327	assert_eq!(	1✔
328	EscapeProcessor::token_to_escape_char(&EventToken::EscapeTab).unwrap(),	1✔
329	b't'
330	);
331
332	// Test invalid token
333	assert_eq!(	1✔
334	EscapeProcessor::token_to_escape_char(&EventToken::String),	1✔
335	None
336	);
337	}	1✔
338
#[test]
fn test_process_escape_token() {
    use crate::ujson::EventToken;

    // Every simple escape token paired with the unescaped byte it must yield.
    let expected_bytes = [
        (EventToken::EscapeQuote, b'"'),
        (EventToken::EscapeBackslash, b'\\'),
        (EventToken::EscapeSlash, b'/'),
        (EventToken::EscapeBackspace, 0x08),
        (EventToken::EscapeFormFeed, 0x0C),
        (EventToken::EscapeNewline, b'\n'),
        (EventToken::EscapeCarriageReturn, b'\r'),
        (EventToken::EscapeTab, b'\t'),
    ];
    for (token, byte) in &expected_bytes {
        let unescaped = EscapeProcessor::process_escape_token(token).unwrap();
        assert_eq!(unescaped, *byte);
    }

    // A token that is not an escape token must be rejected.
    let rejected = EscapeProcessor::process_escape_token(&EventToken::String);
    assert!(rejected.is_err());
}
380
#[test]
fn test_unicode_escape_collector_basic() {
    let mut collector = UnicodeEscapeCollector::new();
    let mut utf8_buffer = [0u8; 4];

    // Feed the digits of \u0041 ('A'); only the fourth digit completes the sequence.
    for (index, &digit) in b"0041".iter().enumerate() {
        let complete = collector.add_hex_digit(digit).unwrap();
        assert_eq!(complete, index == 3);
    }

    // The completed sequence encodes to the single byte 'A'.
    let encoded = collector.process_to_utf8(&mut utf8_buffer).unwrap();
    assert_eq!(encoded, b"A");
}
396
#[test]
fn test_unicode_escape_collector_invalid_hex() {
    let mut collector = UnicodeEscapeCollector::new();

    // Two valid digits are accepted without completing the sequence.
    for _ in 0..2 {
        let complete = collector.add_hex_digit(b'0').unwrap();
        assert!(!complete);
    }

    // 'G' is not a hexadecimal digit, so it must be reported as an error.
    let outcome = collector.add_hex_digit(b'G');
    assert!(outcome.is_err());
}
408
#[test]
fn test_unicode_escape_collector_reset() {
    let mut collector = UnicodeEscapeCollector::new();
    let mut utf8_buffer = [0u8; 4];

    // Leave a partial sequence (two digits) in the collector.
    assert!(!collector.add_hex_digit(b'0').unwrap());
    assert!(!collector.add_hex_digit(b'1').unwrap());

    // Reset should clear state
    collector.reset();

    // After the reset a full four-digit sequence must be required again.
    // Adding a single digit would not prove anything: three digits are
    // "not complete" whether or not the reset worked. With four digits, stale
    // state would make the collector report completion too early (or fail),
    // tripping one of the assertions below.
    assert!(!collector.add_hex_digit(b'0').unwrap());
    assert!(!collector.add_hex_digit(b'0').unwrap());
    assert!(!collector.add_hex_digit(b'4').unwrap());
    assert!(collector.add_hex_digit(b'1').unwrap());

    // The decoded value must come only from the post-reset digits: \u0041 -> 'A'.
    let result = collector.process_to_utf8(&mut utf8_buffer).unwrap();
    assert_eq!(result, b"A");
}
423
#[test]
fn test_unicode_escape_collector_multibyte() {
    let mut collector = UnicodeEscapeCollector::new();
    let mut utf8_buffer = [0u8; 4];

    // \u03B1 is 'α' (Greek alpha); the sequence completes on the last digit only.
    let digits = b"03B1";
    for (index, &digit) in digits.iter().enumerate() {
        let complete = collector.add_hex_digit(digit).unwrap();
        assert_eq!(complete, index + 1 == digits.len());
    }

    // The encoded output must match the UTF-8 bytes of the character.
    let encoded = collector.process_to_utf8(&mut utf8_buffer).unwrap();
    assert_eq!(encoded, "α".as_bytes());
}
438
#[test]
fn test_unicode_escape_collector_incomplete_processing() {
    let mut collector = UnicodeEscapeCollector::new();
    let mut utf8_buffer = [0u8; 4];

    // Supply only two of the four required digits.
    for _ in 0..2 {
        let complete = collector.add_hex_digit(b'0').unwrap();
        assert!(!complete);
    }

    // Encoding must be refused while the sequence is still incomplete.
    let outcome = collector.process_to_utf8(&mut utf8_buffer);
    assert!(outcome.is_err());
}
451	}
452
453	/// Shared implementation for processing a Unicode escape sequence.
454	///
455	/// This function centralizes the logic for handling `\uXXXX` escapes, which is
456	/// common to both the pull-based and stream-based parsers. It uses a generic
457	/// `hex_slice_provider` to remain independent of the underlying buffer implementation
458	/// (`SliceInputBuffer` vs. `DirectBuffer`).
459	///
460	/// # Arguments
461	/// * `current_pos` - The parser's current position in the input buffer, right after the 4 hex digits.
462	/// * `unicode_escape_collector` - A mutable reference to the shared `UnicodeEscapeCollector`.
463	/// * `hex_slice_provider` - A closure that takes a start and end position and returns the hex digit slice.
464	/// * `utf8_buf` - A buffer to write the UTF-8 encoded result into.
465	///
466	/// # Returns
467	/// A tuple containing the resulting UTF-8 byte slice and the start position of the escape sequence (`\uXXXX`).
468	pub(crate) fn process_unicode_escape_sequence<'a, F>(	3✔
469	current_pos: usize,	3✔
470	unicode_escape_collector: &mut UnicodeEscapeCollector,	3✔
471	mut hex_slice_provider: F,	3✔
472	utf8_buf: &'a mut [u8; 4],	3✔
473	) -> Result<(&'a [u8], usize), ParseError>	3✔
474	where	3✔
475	F: FnMut(usize, usize) -> Result<&'a [u8], ParseError>,	3✔
476	{
477	let (hex_start, hex_end, escape_start_pos) =	3✔
478	crate::shared::ContentRange::unicode_escape_bounds(current_pos);	3✔
479
480	// Extract the 4 hex digits from the buffer using the provider
481	let hex_slice = hex_slice_provider(hex_start, hex_end)?;	3✔
482
483	if hex_slice.len() != 4 {	3✔
484	return Err(ParserErrorHandler::invalid_unicode_length());	×
485	}	3✔
486
487	// Feed hex digits to the shared collector
488	for &hex_digit in hex_slice {	15✔
489	unicode_escape_collector.add_hex_digit(hex_digit)?;	12✔
490	}
491
492	// Process the complete sequence to UTF-8
493	let utf8_bytes = unicode_escape_collector.process_to_utf8(utf8_buf)?;	3✔
494
495	Ok((utf8_bytes, escape_start_pos))	3✔
496	}	3✔

kaidokert / picojson-rs / 16083134029

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous