• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

kaidokert / picojson-rs / 16313593500

16 Jul 2025 07:47AM UTC coverage: 94.318% (+0.5%) from 93.864%
16313593500

Pull #62

github

web-flow
Merge 7ef4af978 into d7962e604
Pull Request #62: Big old refactor

513 of 558 new or added lines in 9 files covered. (91.94%)

3 existing lines in 2 files now uncovered.

4598 of 4875 relevant lines covered (94.32%)

648.19 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.35
/picojson/src/escape_processor.rs
1
// SPDX-License-Identifier: Apache-2.0
2

3
use crate::parse_error::ParseError;
4
use crate::shared::UnexpectedState;
5

6
/// Namespace type for the shared JSON escape-sequence helpers.
///
/// The type carries no data: every helper is an associated function that
/// depends only on its arguments, so it can be used by both the CopyOnEscape
/// and StreamingBuffer components.
///
/// The common traits are derived so the public type can be formatted, copied
/// and default-constructed like any other zero-sized marker.
#[derive(Debug, Clone, Copy, Default)]
pub struct EscapeProcessor;
10
use crate::ujson;
11
use ujson::EventToken;
12

13
impl EscapeProcessor {
14
    /// Convert an escape token from the tokenizer to the corresponding escape character.
15
    /// This extracts the character that follows the backslash in the escape sequence.
16
    ///
17
    /// # Arguments
18
    /// * `escape_token` - The escape token from the tokenizer
19
    ///
20
    /// # Returns
21
    /// The character that follows the backslash, or None if the token is not a simple escape.
22
    ///
23
    /// # Examples
24
    /// ```ignore
25
    /// // Internal API - see unit tests for usage examples
26
    /// assert_eq!(EscapeProcessor::token_to_escape_char(&EventToken::EscapeNewline).unwrap(), b'n');
27
    /// ```
28
    pub fn token_to_escape_char(escape_token: &ujson::EventToken) -> Option<u8> {
706✔
29
        match escape_token {
706✔
30
            EventToken::EscapeQuote => Some(b'"'),
139✔
31
            EventToken::EscapeBackslash => Some(b'\\'),
212✔
32
            EventToken::EscapeSlash => Some(b'/'),
2✔
33
            EventToken::EscapeBackspace => Some(b'b'),
2✔
34
            EventToken::EscapeFormFeed => Some(b'f'),
2✔
35
            EventToken::EscapeNewline => Some(b'n'),
184✔
36
            EventToken::EscapeCarriageReturn => Some(b'r'),
7✔
37
            EventToken::EscapeTab => Some(b't'),
156✔
38
            _ => None,
2✔
39
        }
40
    }
706✔
41

42
    /// Process an escape token directly to the unescaped byte value.
43
    /// This is a convenience method that combines token_to_escape_char and process_simple_escape.
44
    ///
45
    /// # Arguments
46
    /// * `escape_token` - The escape token from the tokenizer
47
    ///
48
    /// # Returns
49
    /// The unescaped byte value, or an error if the token is invalid or not a simple escape.
50
    ///
51
    /// # Examples
52
    /// ```ignore
53
    /// // Internal API - see unit tests for usage examples
54
    /// assert_eq!(EscapeProcessor::process_escape_token(&EventToken::EscapeNewline).unwrap(), b'\n');
55
    /// ```
56
    pub fn process_escape_token(escape_token: &ujson::EventToken) -> Result<u8, ParseError> {
697✔
57
        let escape_char =
696✔
58
            Self::token_to_escape_char(escape_token).ok_or(UnexpectedState::InvalidEscapeToken)?;
697✔
59
        Self::process_simple_escape(escape_char)
696✔
60
    }
697✔
61

62
    /// Process a simple escape sequence character and return the unescaped byte.
63
    ///
64
    /// # Arguments
65
    /// * `escape_char` - The character following the backslash in an escape sequence
66
    ///
67
    /// # Returns
68
    /// The unescaped byte value, or an error if the escape sequence is invalid.
69
    ///
70
    /// # Examples
71
    /// ```ignore
72
    /// // Internal API - see unit tests for usage examples
73
    /// assert_eq!(EscapeProcessor::process_simple_escape(b'n').unwrap(), b'\n');
74
    /// ```
75
    pub fn process_simple_escape(escape_char: u8) -> Result<u8, ParseError> {
707✔
76
        match escape_char {
707✔
77
            b'n' => Ok(b'\n'),
184✔
78
            b't' => Ok(b'\t'),
156✔
79
            b'r' => Ok(b'\r'),
7✔
80
            b'\\' => Ok(b'\\'),
212✔
81
            b'"' => Ok(b'"'),
139✔
82
            b'/' => Ok(b'/'),
2✔
83
            b'b' => Ok(0x08), // Backspace
2✔
84
            b'f' => Ok(0x0C), // Form feed
2✔
85
            _ => Err(ParseError::InvalidEscapeSequence),
3✔
86
        }
87
    }
707✔
88

89
    /// Validate that a byte represents a valid hexadecimal digit.
90
    ///
91
    /// # Arguments
92
    /// * `byte` - The byte to validate
93
    ///
94
    /// # Returns
95
    /// The numeric value (0-15) of the hex digit, or an error if invalid.
96
    pub fn validate_hex_digit(byte: u8) -> Result<u32, ParseError> {
1,702✔
97
        match byte {
1,702✔
98
            b'0'..=b'9' => Ok((byte - b'0') as u32),
1,701✔
99
            b'a'..=b'f' => Ok(byte.wrapping_sub(b'a').wrapping_add(10) as u32),
4✔
100
            b'A'..=b'F' => Ok(byte.wrapping_sub(b'A').wrapping_add(10) as u32),
386✔
101
            _ => Err(ParseError::InvalidUnicodeHex),
7✔
102
        }
103
    }
1,702✔
104

105
    /// Report whether `codepoint` is a UTF-16 high (leading) surrogate,
    /// i.e. lies in the inclusive range 0xD800-0xDBFF.
    pub fn is_high_surrogate(codepoint: u32) -> bool {
        matches!(codepoint, 0xD800..=0xDBFF)
    }
217✔
109

110
    /// Report whether `codepoint` is a UTF-16 low (trailing) surrogate,
    /// i.e. lies in the inclusive range 0xDC00-0xDFFF.
    pub fn is_low_surrogate(codepoint: u32) -> bool {
        matches!(codepoint, 0xDC00..=0xDFFF)
    }
199✔
114

115
    /// Combine a high and low surrogate pair into a single Unicode codepoint
116
    pub fn combine_surrogate_pair(high: u32, low: u32) -> Result<u32, ParseError> {
38✔
117
        if !Self::is_high_surrogate(high) || !Self::is_low_surrogate(low) {
38✔
118
            return Err(ParseError::InvalidUnicodeCodepoint);
2✔
119
        }
36✔
120

121
        // Combine surrogates according to UTF-16 specification
122
        let codepoint = 0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF);
36✔
123
        Ok(codepoint)
36✔
124
    }
38✔
125

126
    /// Process a Unicode escape sequence with surrogate pair support.
127
    /// This function handles both individual Unicode escapes and surrogate pairs.
128
    ///
129
    /// # Arguments
130
    /// * `hex_slice` - A 4-byte slice containing the hexadecimal digits
131
    /// * `utf8_buffer` - A buffer to write the UTF-8 encoded result (must be at least 4 bytes)
132
    /// * `pending_high_surrogate` - Optional high surrogate from previous escape
133
    ///
134
    /// # Returns
135
    /// A tuple containing:
136
    /// - Optional UTF-8 encoded bytes (None if this is a high surrogate waiting for low)
137
    /// - Optional high surrogate to save for next escape (Some if this is a high surrogate)
138
    pub fn process_unicode_escape<'a>(
219✔
139
        hex_slice: &[u8],
219✔
140
        utf8_buffer: &'a mut [u8],
219✔
141
        pending_high_surrogate: Option<u32>,
219✔
142
    ) -> Result<(Option<&'a [u8]>, Option<u32>), ParseError> {
219✔
143
        if hex_slice.len() != 4 {
219✔
144
            return Err(ParseError::InvalidUnicodeHex);
2✔
145
        }
217✔
146

147
        // Convert hex bytes to Unicode codepoint
148
        let mut codepoint = 0u32;
217✔
149
        for &byte in hex_slice {
1,079✔
150
            let digit = Self::validate_hex_digit(byte)?;
864✔
151
            codepoint = (codepoint << 4) | digit;
862✔
152
        }
153

154
        // Check if we have a pending high surrogate
155
        if let Some(high) = pending_high_surrogate {
215✔
156
            // We should have a low surrogate now
157
            if Self::is_low_surrogate(codepoint) {
41✔
158
                // Combine the surrogate pair
159
                let combined = Self::combine_surrogate_pair(high, codepoint)?;
34✔
160
                let ch = char::from_u32(combined).ok_or(ParseError::InvalidUnicodeCodepoint)?;
34✔
161
                let utf8_str = ch.encode_utf8(utf8_buffer);
34✔
162
                Ok((Some(utf8_str.as_bytes()), None))
34✔
163
            } else {
164
                // Error: high surrogate not followed by low surrogate
165
                Err(ParseError::InvalidUnicodeCodepoint)
7✔
166
            }
167
        } else {
168
            // No pending high surrogate
169
            if Self::is_high_surrogate(codepoint) {
174✔
170
                // Save this high surrogate for the next escape
171
                Ok((None, Some(codepoint)))
58✔
172
            } else if Self::is_low_surrogate(codepoint) {
116✔
173
                // Error: low surrogate without preceding high surrogate
174
                Err(ParseError::InvalidUnicodeCodepoint)
19✔
175
            } else {
176
                // Regular Unicode character
177
                let ch = char::from_u32(codepoint).ok_or(ParseError::InvalidUnicodeCodepoint)?;
97✔
178
                let utf8_str = ch.encode_utf8(utf8_buffer);
97✔
179
                Ok((Some(utf8_str.as_bytes()), None))
97✔
180
            }
181
        }
182
    }
219✔
183
}
184

185
/// Accumulator for the four hex digits of a `\uXXXX` escape, shared by both parsers.
///
/// Digits are pushed one at a time; once four have been collected the
/// sequence can be converted to UTF-8. A high surrogate seen in one escape is
/// remembered so that it can be combined with the low surrogate of the next.
#[derive(Debug)]
pub struct UnicodeEscapeCollector {
    /// The raw ASCII hex digits gathered so far.
    hex_buffer: [u8; 4],
    /// Number of digits currently stored in `hex_buffer` (0-4).
    hex_pos: usize,
    /// High surrogate from a previous escape that still awaits its low surrogate.
    pending_high_surrogate: Option<u32>,
}
197

198
impl UnicodeEscapeCollector {
199
    /// Create a new Unicode escape collector
200
    pub fn new() -> Self {
1,137✔
201
        Self {
1,137✔
202
            hex_buffer: [0u8; 4],
1,137✔
203
            hex_pos: 0,
1,137✔
204
            pending_high_surrogate: None,
1,137✔
205
        }
1,137✔
206
    }
1,137✔
207

208
    /// Reset the collector for a new Unicode escape sequence
209
    pub fn reset(&mut self) {
221✔
210
        self.hex_pos = 0;
221✔
211
        // Note: We don't reset pending_high_surrogate here since it needs to persist
212
        // across Unicode escape sequences to properly handle surrogate pairs
213
    }
221✔
214

215
    /// Reset the collector completely, including any pending surrogate state
216
    pub fn reset_all(&mut self) {
688✔
217
        self.hex_pos = 0;
688✔
218
        self.pending_high_surrogate = None;
688✔
219
    }
688✔
220

221
    /// Add a hex digit to the collector
222
    /// Returns true if this completes the 4-digit sequence
223
    pub fn add_hex_digit(&mut self, digit: u8) -> Result<bool, ParseError> {
828✔
224
        // Validate the hex digit first
225
        EscapeProcessor::validate_hex_digit(digit)?;
828✔
226

227
        if self.hex_pos >= 4 {
827✔
228
            return Err(UnexpectedState::InvalidUnicodeEscape.into());
×
229
        }
827✔
230

231
        if let Some(slot) = self.hex_buffer.get_mut(self.hex_pos) {
827✔
232
            *slot = digit;
827✔
233
        } else {
827✔
234
            return Err(ParseError::InvalidUnicodeHex);
×
235
        }
236

237
        self.hex_pos = self.hex_pos.saturating_add(1);
827✔
238

239
        Ok(self.hex_pos == 4)
827✔
240
    }
828✔
241

242
    /// Process the collected hex digits with surrogate pair support
243
    /// Should only be called when is_complete() returns true
244
    /// Returns (optional UTF-8 bytes, whether surrogate state changed)
245
    pub fn process_to_utf8<'a>(
206✔
246
        &mut self,
206✔
247
        utf8_buffer: &'a mut [u8],
206✔
248
    ) -> Result<(Option<&'a [u8]>, bool), ParseError> {
206✔
249
        if self.hex_pos != 4 {
206✔
250
            return Err(UnexpectedState::InvalidUnicodeEscape.into());
1✔
251
        }
205✔
252

253
        let (result, new_pending) = EscapeProcessor::process_unicode_escape(
205✔
254
            &self.hex_buffer,
205✔
255
            utf8_buffer,
205✔
256
            self.pending_high_surrogate,
205✔
257
        )?;
24✔
258

259
        let surrogate_state_changed = self.pending_high_surrogate != new_pending;
181✔
260
        self.pending_high_surrogate = new_pending;
181✔
261

262
        Ok((result, surrogate_state_changed))
181✔
263
    }
206✔
264

265
    /// Check if there's a pending high surrogate waiting for a low surrogate
266
    pub fn has_pending_high_surrogate(&self) -> bool {
1,338✔
267
        self.pending_high_surrogate.is_some()
1,338✔
268
    }
1,338✔
269
}
270

271
impl Default for UnicodeEscapeCollector {
NEW
272
    fn default() -> Self {
×
NEW
273
        Self::new()
×
NEW
274
    }
×
275
}
276

277
#[cfg(test)]
mod tests {
    //! Unit tests for the escape helpers and the Unicode escape collector.

    use super::*;
    use crate::ujson::EventToken;

    #[test]
    fn test_simple_escapes() {
        assert_eq!(EscapeProcessor::process_simple_escape(b'n').unwrap(), b'\n');
        assert_eq!(EscapeProcessor::process_simple_escape(b't').unwrap(), b'\t');
        assert_eq!(EscapeProcessor::process_simple_escape(b'r').unwrap(), b'\r');
        assert_eq!(
            EscapeProcessor::process_simple_escape(b'\\').unwrap(),
            b'\\'
        );
        assert_eq!(EscapeProcessor::process_simple_escape(b'"').unwrap(), b'"');
        assert_eq!(EscapeProcessor::process_simple_escape(b'/').unwrap(), b'/');
        assert_eq!(EscapeProcessor::process_simple_escape(b'b').unwrap(), 0x08);
        assert_eq!(EscapeProcessor::process_simple_escape(b'f').unwrap(), 0x0C);
    }

    #[test]
    fn test_invalid_simple_escape() {
        assert!(EscapeProcessor::process_simple_escape(b'x').is_err());
        assert!(EscapeProcessor::process_simple_escape(b'z').is_err());
        assert!(EscapeProcessor::process_simple_escape(b'1').is_err());
    }

    #[test]
    fn test_hex_digit_validation() {
        // Valid digits
        assert_eq!(EscapeProcessor::validate_hex_digit(b'0').unwrap(), 0);
        assert_eq!(EscapeProcessor::validate_hex_digit(b'9').unwrap(), 9);
        assert_eq!(EscapeProcessor::validate_hex_digit(b'a').unwrap(), 10);
        assert_eq!(EscapeProcessor::validate_hex_digit(b'f').unwrap(), 15);
        assert_eq!(EscapeProcessor::validate_hex_digit(b'A').unwrap(), 10);
        assert_eq!(EscapeProcessor::validate_hex_digit(b'F').unwrap(), 15);

        // Invalid digits
        assert!(EscapeProcessor::validate_hex_digit(b'g').is_err());
        assert!(EscapeProcessor::validate_hex_digit(b'G').is_err());
        assert!(EscapeProcessor::validate_hex_digit(b'z').is_err());
        assert!(EscapeProcessor::validate_hex_digit(b' ').is_err());
    }

    #[test]
    fn test_unicode_escape_basic() {
        let mut buffer = [0u8; 4];

        // Test basic ASCII character \u0041 -> 'A'
        let (result, pending) =
            EscapeProcessor::process_unicode_escape(b"0041", &mut buffer, None).unwrap();
        assert_eq!(result.unwrap(), b"A");
        assert_eq!(pending, None);

        // Test another ASCII character \u0048 -> 'H'
        let (result, pending) =
            EscapeProcessor::process_unicode_escape(b"0048", &mut buffer, None).unwrap();
        assert_eq!(result.unwrap(), b"H");
        assert_eq!(pending, None);
    }

    #[test]
    fn test_unicode_escape_multibyte() {
        let mut buffer = [0u8; 4];

        // Test Greek alpha \u03B1 -> 'α' (2 bytes in UTF-8: 0xCE, 0xB1)
        let (result, pending) =
            EscapeProcessor::process_unicode_escape(b"03B1", &mut buffer, None).unwrap();
        assert_eq!(result.unwrap(), "α".as_bytes());
        assert_eq!(pending, None);

        // \u1F60 is a single BMP code point (3 bytes in UTF-8: 0xE1, 0xBD, 0xA0).
        // It is NOT the emoji U+1F60A: a \u escape carries exactly four hex digits.
        let (result, pending) =
            EscapeProcessor::process_unicode_escape(b"1F60", &mut buffer, None).unwrap();
        assert_eq!(result.unwrap(), [0xE1, 0xBD, 0xA0]);
        assert_eq!(pending, None);

        // The emoji U+1F60A '😊' lies outside the BMP and must be written as
        // the surrogate pair \uD83D\uDE0A (4 bytes in UTF-8).
        let (result, pending) =
            EscapeProcessor::process_unicode_escape(b"D83D", &mut buffer, None).unwrap();
        assert_eq!(result, None);
        assert_eq!(pending, Some(0xD83D));

        let (result, pending) =
            EscapeProcessor::process_unicode_escape(b"DE0A", &mut buffer, pending).unwrap();
        assert_eq!(result.unwrap(), "😊".as_bytes());
        assert_eq!(pending, None);
    }

    #[test]
    fn test_unicode_escape_invalid_hex() {
        let mut buffer = [0u8; 4];

        // Invalid hex characters
        assert!(EscapeProcessor::process_unicode_escape(b"00GG", &mut buffer, None).is_err());
        assert!(EscapeProcessor::process_unicode_escape(b"ZZZZ", &mut buffer, None).is_err());

        // Wrong length
        assert!(EscapeProcessor::process_unicode_escape(b"123", &mut buffer, None).is_err());
        assert!(EscapeProcessor::process_unicode_escape(b"12345", &mut buffer, None).is_err());
    }

    #[test]
    fn test_unicode_escape_invalid_codepoint() {
        let mut buffer = [0u8; 4];

        // Every four-digit value outside the surrogate range D800-DFFF is a
        // valid code point, including U+0000.
        let (result, pending) =
            EscapeProcessor::process_unicode_escape(b"0000", &mut buffer, None).unwrap();
        assert_eq!(result.unwrap(), "\0".as_bytes());
        assert_eq!(pending, None);

        // A lone low surrogate (either end of the range) is rejected.
        assert!(matches!(
            EscapeProcessor::process_unicode_escape(b"DC00", &mut buffer, None),
            Err(ParseError::InvalidUnicodeCodepoint)
        ));
        assert!(matches!(
            EscapeProcessor::process_unicode_escape(b"DFFF", &mut buffer, None),
            Err(ParseError::InvalidUnicodeCodepoint)
        ));

        // A high surrogate followed by another high surrogate is rejected.
        assert!(matches!(
            EscapeProcessor::process_unicode_escape(b"D800", &mut buffer, Some(0xD800)),
            Err(ParseError::InvalidUnicodeCodepoint)
        ));
    }

    #[test]
    fn test_token_to_escape_char() {
        // Test all valid escape tokens
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeQuote).unwrap(),
            b'"'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeBackslash).unwrap(),
            b'\\'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeSlash).unwrap(),
            b'/'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeBackspace).unwrap(),
            b'b'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeFormFeed).unwrap(),
            b'f'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeNewline).unwrap(),
            b'n'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeCarriageReturn).unwrap(),
            b'r'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeTab).unwrap(),
            b't'
        );

        // Test invalid token
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::String),
            None
        );
    }

    #[test]
    fn test_process_escape_token() {
        // Test valid escape tokens that produce correct unescaped bytes
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeQuote).unwrap(),
            b'"'
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeBackslash).unwrap(),
            b'\\'
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeSlash).unwrap(),
            b'/'
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeBackspace).unwrap(),
            0x08
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeFormFeed).unwrap(),
            0x0C
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeNewline).unwrap(),
            b'\n'
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeCarriageReturn).unwrap(),
            b'\r'
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeTab).unwrap(),
            b'\t'
        );

        // Test invalid token
        assert!(EscapeProcessor::process_escape_token(&EventToken::String).is_err());
    }

    #[test]
    fn test_unicode_escape_collector_basic() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Add hex digits for \u0041 -> 'A'
        assert!(!collector.add_hex_digit(b'0').unwrap()); // Not complete yet
        assert!(!collector.add_hex_digit(b'0').unwrap()); // Not complete yet
        assert!(!collector.add_hex_digit(b'4').unwrap()); // Not complete yet
        assert!(collector.add_hex_digit(b'1').unwrap()); // Complete!

        // Process to UTF-8
        let (result, _surrogate_state_changed) =
            collector.process_to_utf8(&mut utf8_buffer).unwrap();
        assert_eq!(result.unwrap(), b"A");
    }

    #[test]
    fn test_unicode_escape_collector_invalid_hex() {
        let mut collector = UnicodeEscapeCollector::new();

        // Valid digits first
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'0').unwrap());

        // Invalid hex digit should fail
        assert!(collector.add_hex_digit(b'G').is_err());
    }

    #[test]
    fn test_unicode_escape_collector_reset() {
        let mut collector = UnicodeEscapeCollector::new();

        // Add some digits
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'1').unwrap());

        // Reset should clear hex position but not surrogate state
        collector.reset();

        // Should be able to start fresh
        assert!(!collector.add_hex_digit(b'A').unwrap());
    }

    #[test]
    fn test_unicode_escape_collector_surrogate_support() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Process high surrogate \uD801
        assert!(!collector.add_hex_digit(b'D').unwrap());
        assert!(!collector.add_hex_digit(b'8').unwrap());
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(collector.add_hex_digit(b'1').unwrap());

        let (result, state_changed) = collector.process_to_utf8(&mut utf8_buffer).unwrap();
        assert_eq!(result, None); // No UTF-8 output yet
        assert!(state_changed); // Surrogate state changed
        assert!(collector.has_pending_high_surrogate());

        // Reset for next escape sequence
        collector.reset();

        // Process low surrogate \uDC37
        assert!(!collector.add_hex_digit(b'D').unwrap());
        assert!(!collector.add_hex_digit(b'C').unwrap());
        assert!(!collector.add_hex_digit(b'3').unwrap());
        assert!(collector.add_hex_digit(b'7').unwrap());

        let (result, state_changed) = collector.process_to_utf8(&mut utf8_buffer).unwrap();
        assert!(result.is_some()); // Should have UTF-8 output
        assert!(state_changed); // Surrogate state changed (cleared)
        assert!(!collector.has_pending_high_surrogate());

        // Verify it's the correct UTF-8 encoding for U+10437
        assert_eq!(result.unwrap(), [0xF0, 0x90, 0x90, 0xB7]);
    }

    #[test]
    fn test_unicode_escape_collector_multibyte() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Add hex digits for \u03B1 -> 'α' (Greek alpha)
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'3').unwrap());
        assert!(!collector.add_hex_digit(b'B').unwrap());
        assert!(collector.add_hex_digit(b'1').unwrap());

        let (result, _surrogate_state_changed) =
            collector.process_to_utf8(&mut utf8_buffer).unwrap();
        assert_eq!(result.unwrap(), "α".as_bytes());
    }

    #[test]
    fn test_unicode_escape_collector_incomplete_processing() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Add only 2 digits
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'0').unwrap());

        // Should fail to process incomplete sequence
        assert!(collector.process_to_utf8(&mut utf8_buffer).is_err());
    }

    #[test]
    fn test_surrogate_pair_detection() {
        // Test high surrogate detection
        assert!(EscapeProcessor::is_high_surrogate(0xD800));
        assert!(EscapeProcessor::is_high_surrogate(0xD801));
        assert!(EscapeProcessor::is_high_surrogate(0xDBFF));
        assert!(!EscapeProcessor::is_high_surrogate(0xD7FF));
        assert!(!EscapeProcessor::is_high_surrogate(0xDC00));

        // Test low surrogate detection
        assert!(EscapeProcessor::is_low_surrogate(0xDC00));
        assert!(EscapeProcessor::is_low_surrogate(0xDC37));
        assert!(EscapeProcessor::is_low_surrogate(0xDFFF));
        assert!(!EscapeProcessor::is_low_surrogate(0xDBFF));
        assert!(!EscapeProcessor::is_low_surrogate(0xE000));
    }

    #[test]
    fn test_surrogate_pair_combination() {
        // Test valid surrogate pair: \uD801\uDC37 -> U+10437
        let combined = EscapeProcessor::combine_surrogate_pair(0xD801, 0xDC37).unwrap();
        assert_eq!(combined, 0x10437);

        // Test another valid pair: \uD834\uDD1E -> U+1D11E (musical symbol)
        let combined = EscapeProcessor::combine_surrogate_pair(0xD834, 0xDD1E).unwrap();
        assert_eq!(combined, 0x1D11E);

        // Test invalid combinations
        // Not high surrogate
        assert!(EscapeProcessor::combine_surrogate_pair(0x0041, 0xDC37).is_err());
        // Not low surrogate
        assert!(EscapeProcessor::combine_surrogate_pair(0xD801, 0x0041).is_err());
    }

    #[test]
    fn test_unicode_escape_with_surrogate_support() {
        let mut buffer = [0u8; 4];

        // Test regular Unicode character (not surrogate)
        let (result, pending) =
            EscapeProcessor::process_unicode_escape(b"0041", &mut buffer, None).unwrap();
        assert_eq!(result, Some(b"A".as_slice()));
        assert_eq!(pending, None);

        // Test high surrogate - should return None and save the high surrogate
        let (result, pending) =
            EscapeProcessor::process_unicode_escape(b"D801", &mut buffer, None).unwrap();
        assert_eq!(result, None);
        assert_eq!(pending, Some(0xD801));

        // Test low surrogate following high surrogate - should combine
        let (result, pending) =
            EscapeProcessor::process_unicode_escape(b"DC37", &mut buffer, Some(0xD801)).unwrap();
        assert!(result.is_some());
        assert_eq!(pending, None);
        // The result should be the UTF-8 encoding of U+10437
        assert_eq!(result.unwrap(), [0xF0, 0x90, 0x90, 0xB7]);
    }

    #[test]
    fn test_unicode_escape_surrogate_error_cases() {
        let mut buffer = [0u8; 4];

        // Test low surrogate without preceding high surrogate - should error
        let result = EscapeProcessor::process_unicode_escape(b"DC37", &mut buffer, None);
        assert!(result.is_err());

        // Test high surrogate followed by non-low-surrogate - should error
        let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer, Some(0xD801));
        assert!(result.is_err());
    }
}
644

645
/// Shared implementation for processing a Unicode escape sequence WITH surrogate pair support.
646
///
647
/// This function centralizes the logic for handling `\uXXXX` escapes, which is
648
/// common to both the pull-based and stream-based parsers. It uses a generic
649
/// `hex_slice_provider` to remain independent of the underlying buffer implementation
650
/// (`SliceInputBuffer` vs. `StreamBuffer`).
651
///
652
/// # Arguments
653
/// * `current_pos` - The parser's current position in the input buffer, right after the 4 hex digits.
654
/// * `unicode_escape_collector` - A mutable reference to the shared `UnicodeEscapeCollector`.
655
/// * `hex_slice_provider` - A closure that takes a start and end position and returns the hex digit slice.
656
/// * `utf8_buf` - A buffer to write the UTF-8 encoded result into.
657
///
658
/// # Returns
659
/// A tuple containing:
660
/// - Optional UTF-8 byte slice (None if this is a high surrogate waiting for low surrogate)
661
/// - The start position of the escape sequence (`\uXXXX`)
662
pub(crate) fn process_unicode_escape_sequence<'a, F>(
201✔
663
    current_pos: usize,
201✔
664
    unicode_escape_collector: &mut UnicodeEscapeCollector,
201✔
665
    mut hex_slice_provider: F,
201✔
666
) -> Result<(Option<([u8; 4], usize)>, usize), ParseError>
201✔
667
where
201✔
668
    F: FnMut(usize, usize) -> Result<&'a [u8], ParseError>,
201✔
669
{
670
    let (hex_start, hex_end, escape_start_pos) =
201✔
671
        crate::shared::ContentRange::unicode_escape_bounds(current_pos);
201✔
672

673
    // Extract the 4 hex digits from the buffer using the provider
674
    let hex_slice = hex_slice_provider(hex_start, hex_end)?;
201✔
675

676
    if hex_slice.len() != 4 {
201✔
677
        return Err(UnexpectedState::InvalidUnicodeEscape.into());
×
678
    }
201✔
679

680
    // Feed hex digits to the shared collector
681
    for &hex_digit in hex_slice {
1,005✔
682
        unicode_escape_collector.add_hex_digit(hex_digit)?;
804✔
683
    }
684

685
    // Check if we had a pending high surrogate before processing
686
    let had_pending_high_surrogate = unicode_escape_collector.has_pending_high_surrogate();
201✔
687

688
    // Create a local buffer for the UTF-8 result
689
    let mut utf8_buf = [0u8; 4];
201✔
690

691
    // Process the complete sequence to UTF-8 with surrogate support
692
    let (utf8_bytes_opt, _surrogate_state_changed) =
177✔
693
        unicode_escape_collector.process_to_utf8(&mut utf8_buf)?;
201✔
694

695
    // If we have a result, copy it to a new array to return by value
696
    let result_by_value = utf8_bytes_opt.map(|bytes| {
177✔
697
        let mut value_buf = [0u8; 4];
121✔
698
        let len = bytes.len();
121✔
699
        value_buf[..len].copy_from_slice(bytes);
121✔
700
        (value_buf, len)
121✔
701
    });
121✔
702

703
    // If we're completing a surrogate pair (had pending high surrogate and now have UTF-8 bytes),
704
    // return the position of the high surrogate start instead of the low surrogate start
705
    let final_escape_start_pos = if had_pending_high_surrogate && result_by_value.is_some() {
177✔
706
        // High surrogate started 6 bytes before the current low surrogate
707
        escape_start_pos.saturating_sub(6)
32✔
708
    } else {
709
        escape_start_pos
145✔
710
    };
711

712
    Ok((result_by_value, final_escape_start_pos))
177✔
713
}
201✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc