• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

kaidokert / picojson-rs / 16083134029

05 Jul 2025 01:22AM UTC coverage: 94.358% (-0.2%) from 94.535%
16083134029

Pull #27

github

kaidokert
Review comments
Pull Request #27: Panic free for SliceParser

198 of 211 new or added lines in 11 files covered. (93.84%)

1 existing line in 1 file now uncovered.

2191 of 2322 relevant lines covered (94.36%)

132.6 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.84
/picojson/src/escape_processor.rs
1
// SPDX-License-Identifier: Apache-2.0
2

3
use crate::{shared::ParserErrorHandler, ParseError};
4

5
/// Namespace type for the JSON escape-sequence helpers.
///
/// It carries no state: every operation is a pure associated function, so the
/// same logic can be shared by the CopyOnEscape and StreamingBuffer components.
pub struct EscapeProcessor;
9
use crate::ujson;
10
use ujson::EventToken;
11

12
impl EscapeProcessor {
13
    /// Convert an escape token from the tokenizer to the corresponding escape character.
14
    /// This extracts the character that follows the backslash in the escape sequence.
15
    ///
16
    /// # Arguments
17
    /// * `escape_token` - The escape token from the tokenizer
18
    ///
19
    /// # Returns
20
    /// The character that follows the backslash, or None if the token is not a simple escape.
21
    ///
22
    /// # Examples
23
    /// ```ignore
24
    /// // Internal API - see unit tests for usage examples
25
    /// assert_eq!(EscapeProcessor::token_to_escape_char(&EventToken::EscapeNewline).unwrap(), b'n');
26
    /// ```
27
    pub fn token_to_escape_char(escape_token: &ujson::EventToken) -> Option<u8> {
42✔
28
        match escape_token {
42✔
29
            EventToken::EscapeQuote => Some(b'"'),
3✔
30
            EventToken::EscapeBackslash => Some(b'\\'),
2✔
31
            EventToken::EscapeSlash => Some(b'/'),
2✔
32
            EventToken::EscapeBackspace => Some(b'b'),
2✔
33
            EventToken::EscapeFormFeed => Some(b'f'),
2✔
34
            EventToken::EscapeNewline => Some(b'n'),
17✔
35
            EventToken::EscapeCarriageReturn => Some(b'r'),
4✔
36
            EventToken::EscapeTab => Some(b't'),
8✔
37
            _ => None,
2✔
38
        }
39
    }
42✔
40

41
    /// Process an escape token directly to the unescaped byte value.
42
    /// This is a convenience method that combines token_to_escape_char and process_simple_escape.
43
    ///
44
    /// # Arguments
45
    /// * `escape_token` - The escape token from the tokenizer
46
    ///
47
    /// # Returns
48
    /// The unescaped byte value, or an error if the token is invalid or not a simple escape.
49
    ///
50
    /// # Examples
51
    /// ```ignore
52
    /// // Internal API - see unit tests for usage examples
53
    /// assert_eq!(EscapeProcessor::process_escape_token(&EventToken::EscapeNewline).unwrap(), b'\n');
54
    /// ```
55
    pub fn process_escape_token(escape_token: &ujson::EventToken) -> Result<u8, ParseError> {
33✔
56
        let escape_char = Self::token_to_escape_char(escape_token)
33✔
57
            .ok_or(ParserErrorHandler::unexpected_state("Invalid escape token"))?;
33✔
58
        Self::process_simple_escape(escape_char)
32✔
59
    }
33✔
60

61
    /// Process a simple escape sequence character and return the unescaped byte.
62
    ///
63
    /// # Arguments
64
    /// * `escape_char` - The character following the backslash in an escape sequence
65
    ///
66
    /// # Returns
67
    /// The unescaped byte value, or an error if the escape sequence is invalid.
68
    ///
69
    /// # Examples
70
    /// ```ignore
71
    /// // Internal API - see unit tests for usage examples
72
    /// assert_eq!(EscapeProcessor::process_simple_escape(b'n').unwrap(), b'\n');
73
    /// ```
74
    pub fn process_simple_escape(escape_char: u8) -> Result<u8, ParseError> {
43✔
75
        match escape_char {
43✔
76
            b'n' => Ok(b'\n'),
17✔
77
            b't' => Ok(b'\t'),
8✔
78
            b'r' => Ok(b'\r'),
4✔
79
            b'\\' => Ok(b'\\'),
2✔
80
            b'"' => Ok(b'"'),
3✔
81
            b'/' => Ok(b'/'),
2✔
82
            b'b' => Ok(0x08), // Backspace
2✔
83
            b'f' => Ok(0x0C), // Form feed
2✔
84
            _ => Err(ParseError::InvalidEscapeSequence),
3✔
85
        }
86
    }
43✔
87

88
    /// Validate that a byte represents a valid hexadecimal digit.
89
    ///
90
    /// # Arguments
91
    /// * `byte` - The byte to validate
92
    ///
93
    /// # Returns
94
    /// The numeric value (0-15) of the hex digit, or an error if invalid.
95
    pub fn validate_hex_digit(byte: u8) -> Result<u32, ParseError> {
82✔
96
        match byte {
82✔
97
            b'0'..=b'9' => Ok((byte - b'0') as u32),
81✔
98
            b'a'..=b'f' => Ok(byte.wrapping_sub(b'a').wrapping_add(10) as u32),
4✔
99
            b'A'..=b'F' => Ok(byte.wrapping_sub(b'A').wrapping_add(10) as u32),
15✔
100
            _ => Err(ParseError::InvalidUnicodeHex),
7✔
101
        }
102
    }
82✔
103

104
    /// Process a Unicode escape sequence (\uXXXX) and return the UTF-8 encoded bytes.
105
    ///
106
    /// # Arguments
107
    /// * `hex_slice` - A 4-byte slice containing the hexadecimal digits
108
    /// * `utf8_buffer` - A buffer to write the UTF-8 encoded result (must be at least 4 bytes)
109
    ///
110
    /// # Returns
111
    /// A slice containing the UTF-8 encoded bytes, or an error if the escape is invalid.
112
    ///
113
    /// # Examples
114
    /// ```ignore
115
    /// // Internal API - see unit tests for usage examples
116
    /// let mut buffer = [0u8; 4];
117
    /// let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer).unwrap();
118
    /// assert_eq!(result, b"A");
119
    /// ```
120
    pub fn process_unicode_escape<'a>(
14✔
121
        hex_slice: &[u8],
14✔
122
        utf8_buffer: &'a mut [u8],
14✔
123
    ) -> Result<&'a [u8], ParseError> {
14✔
124
        if hex_slice.len() != 4 {
14✔
125
            return Err(ParseError::InvalidUnicodeHex);
2✔
126
        }
12✔
127

128
        // Convert hex bytes to Unicode codepoint
129
        let mut codepoint = 0u32;
12✔
130
        for &byte in hex_slice {
54✔
131
            let digit = Self::validate_hex_digit(byte)?;
44✔
132
            codepoint = (codepoint << 4) | digit;
42✔
133
        }
134

135
        // Convert codepoint to character and encode as UTF-8
136
        let ch = char::from_u32(codepoint).ok_or(ParseError::InvalidUnicodeCodepoint)?;
10✔
137
        let utf8_str = ch.encode_utf8(utf8_buffer);
10✔
138
        Ok(utf8_str.as_bytes())
10✔
139
    }
14✔
140
}
141

142
/// Shared Unicode escape hex digit collector for both parsers.
///
/// Provides a common interface for collecting the 4 hex digits of a `\uXXXX`
/// sequence one byte at a time. The derived `Default` value is an empty
/// collector (zeroed buffer, position 0).
#[derive(Debug, Default)]
pub struct UnicodeEscapeCollector {
    /// Buffer to collect the 4 hex digits (stored as the raw ASCII bytes)
    hex_buffer: [u8; 4],
    /// Current position in the hex buffer (0-4); 4 means the sequence is complete
    hex_pos: usize,
}
151

152
impl UnicodeEscapeCollector {
153
    /// Create a new Unicode escape collector
154
    pub fn new() -> Self {
91✔
155
        Self {
91✔
156
            hex_buffer: [0u8; 4],
91✔
157
            hex_pos: 0,
91✔
158
        }
91✔
159
    }
91✔
160

161
    /// Reset the collector for a new Unicode escape sequence
162
    pub fn reset(&mut self) {
4✔
163
        self.hex_pos = 0;
4✔
164
    }
4✔
165

166
    /// Add a hex digit to the collector
167
    /// Returns true if this completes the 4-digit sequence
168
    pub fn add_hex_digit(&mut self, digit: u8) -> Result<bool, ParseError> {
28✔
169
        // Validate the hex digit first
170
        EscapeProcessor::validate_hex_digit(digit)?;
28✔
171

172
        if self.hex_pos >= 4 {
27✔
173
            return Err(ParserErrorHandler::unexpected_state(
×
174
                "Too many hex digits in Unicode escape",
×
175
            ));
×
176
        }
27✔
177

178
        if let Some(slot) = self.hex_buffer.get_mut(self.hex_pos) {
27✔
179
            *slot = digit;
27✔
180
        } else {
27✔
NEW
181
            return Err(ParseError::InvalidUnicodeHex);
×
182
        }
183

184
        self.hex_pos = self.hex_pos.saturating_add(1);
27✔
185

186
        Ok(self.hex_pos == 4)
27✔
187
    }
28✔
188

189
    /// Process the collected hex digits and return UTF-8 bytes
190
    /// Should only be called when is_complete() returns true
191
    pub fn process_to_utf8<'a>(&self, utf8_buffer: &'a mut [u8]) -> Result<&'a [u8], ParseError> {
6✔
192
        if self.hex_pos != 4 {
6✔
193
            return Err(ParserErrorHandler::incomplete_unicode_escape());
1✔
194
        }
5✔
195

196
        EscapeProcessor::process_unicode_escape(&self.hex_buffer, utf8_buffer)
5✔
197
    }
6✔
198
}
199

200
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simple_escapes() {
        assert_eq!(EscapeProcessor::process_simple_escape(b'n').unwrap(), b'\n');
        assert_eq!(EscapeProcessor::process_simple_escape(b't').unwrap(), b'\t');
        assert_eq!(EscapeProcessor::process_simple_escape(b'r').unwrap(), b'\r');
        assert_eq!(
            EscapeProcessor::process_simple_escape(b'\\').unwrap(),
            b'\\'
        );
        assert_eq!(EscapeProcessor::process_simple_escape(b'"').unwrap(), b'"');
        assert_eq!(EscapeProcessor::process_simple_escape(b'/').unwrap(), b'/');
        assert_eq!(EscapeProcessor::process_simple_escape(b'b').unwrap(), 0x08);
        assert_eq!(EscapeProcessor::process_simple_escape(b'f').unwrap(), 0x0C);
    }

    #[test]
    fn test_invalid_simple_escape() {
        assert!(EscapeProcessor::process_simple_escape(b'x').is_err());
        assert!(EscapeProcessor::process_simple_escape(b'z').is_err());
        assert!(EscapeProcessor::process_simple_escape(b'1').is_err());
    }

    #[test]
    fn test_hex_digit_validation() {
        // Valid digits
        assert_eq!(EscapeProcessor::validate_hex_digit(b'0').unwrap(), 0);
        assert_eq!(EscapeProcessor::validate_hex_digit(b'9').unwrap(), 9);
        assert_eq!(EscapeProcessor::validate_hex_digit(b'a').unwrap(), 10);
        assert_eq!(EscapeProcessor::validate_hex_digit(b'f').unwrap(), 15);
        assert_eq!(EscapeProcessor::validate_hex_digit(b'A').unwrap(), 10);
        assert_eq!(EscapeProcessor::validate_hex_digit(b'F').unwrap(), 15);

        // Invalid digits
        assert!(EscapeProcessor::validate_hex_digit(b'g').is_err());
        assert!(EscapeProcessor::validate_hex_digit(b'G').is_err());
        assert!(EscapeProcessor::validate_hex_digit(b'z').is_err());
        assert!(EscapeProcessor::validate_hex_digit(b' ').is_err());
    }

    #[test]
    fn test_unicode_escape_basic() {
        let mut buffer = [0u8; 4];

        // Test basic ASCII character \u0041 -> 'A'
        let result = EscapeProcessor::process_unicode_escape(b"0041", &mut buffer).unwrap();
        assert_eq!(result, b"A");

        // Test another ASCII character \u0048 -> 'H'
        let result = EscapeProcessor::process_unicode_escape(b"0048", &mut buffer).unwrap();
        assert_eq!(result, b"H");
    }

    #[test]
    fn test_unicode_escape_multibyte() {
        let mut buffer = [0u8; 4];

        // Test Greek alpha \u03B1 -> 'α' (2 bytes in UTF-8: 0xCE, 0xB1)
        let result = EscapeProcessor::process_unicode_escape(b"03B1", &mut buffer).unwrap();
        assert_eq!(result, "α".as_bytes());

        // Test a 3-byte character: \u1F60 is U+1F60 (UTF-8: 0xE1, 0xBD, 0xA0).
        // Note: this is NOT the emoji U+1F60A, which lies outside the BMP and
        // cannot be expressed with a single \uXXXX escape.
        let result = EscapeProcessor::process_unicode_escape(b"1F60", &mut buffer).unwrap();
        assert_eq!(result, "\u{1F60}".as_bytes());
    }

    #[test]
    fn test_unicode_escape_invalid_hex() {
        let mut buffer = [0u8; 4];

        // Invalid hex characters
        assert!(EscapeProcessor::process_unicode_escape(b"00GG", &mut buffer).is_err());
        assert!(EscapeProcessor::process_unicode_escape(b"ZZZZ", &mut buffer).is_err());

        // Wrong length
        assert!(EscapeProcessor::process_unicode_escape(b"123", &mut buffer).is_err());
        assert!(EscapeProcessor::process_unicode_escape(b"12345", &mut buffer).is_err());
    }

    #[test]
    fn test_unicode_escape_invalid_codepoint() {
        let mut buffer = [0u8; 4];

        // U+0000 is a valid scalar value and encodes as a single NUL byte
        let result = EscapeProcessor::process_unicode_escape(b"0000", &mut buffer).unwrap();
        assert_eq!(result, "\0".as_bytes());

        // The code points directly around the surrogate range are valid
        let result = EscapeProcessor::process_unicode_escape(b"D7FF", &mut buffer).unwrap();
        assert_eq!(result, "\u{D7FF}".as_bytes());
        let result = EscapeProcessor::process_unicode_escape(b"E000", &mut buffer).unwrap();
        assert_eq!(result, "\u{E000}".as_bytes());

        // Surrogates (D800-DFFF) are not Unicode scalar values and must be rejected
        assert!(matches!(
            EscapeProcessor::process_unicode_escape(b"D800", &mut buffer),
            Err(ParseError::InvalidUnicodeCodepoint)
        ));
        assert!(matches!(
            EscapeProcessor::process_unicode_escape(b"DBFF", &mut buffer),
            Err(ParseError::InvalidUnicodeCodepoint)
        ));
        assert!(matches!(
            EscapeProcessor::process_unicode_escape(b"DC00", &mut buffer),
            Err(ParseError::InvalidUnicodeCodepoint)
        ));
        assert!(matches!(
            EscapeProcessor::process_unicode_escape(b"DFFF", &mut buffer),
            Err(ParseError::InvalidUnicodeCodepoint)
        ));
    }

    #[test]
    fn test_token_to_escape_char() {
        use crate::ujson::EventToken;

        // Test all valid escape tokens
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeQuote).unwrap(),
            b'"'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeBackslash).unwrap(),
            b'\\'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeSlash).unwrap(),
            b'/'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeBackspace).unwrap(),
            b'b'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeFormFeed).unwrap(),
            b'f'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeNewline).unwrap(),
            b'n'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeCarriageReturn).unwrap(),
            b'r'
        );
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::EscapeTab).unwrap(),
            b't'
        );

        // Test invalid token
        assert_eq!(
            EscapeProcessor::token_to_escape_char(&EventToken::String),
            None
        );
    }

    #[test]
    fn test_process_escape_token() {
        use crate::ujson::EventToken;

        // Test valid escape tokens that produce correct unescaped bytes
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeQuote).unwrap(),
            b'"'
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeBackslash).unwrap(),
            b'\\'
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeSlash).unwrap(),
            b'/'
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeBackspace).unwrap(),
            0x08
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeFormFeed).unwrap(),
            0x0C
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeNewline).unwrap(),
            b'\n'
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeCarriageReturn).unwrap(),
            b'\r'
        );
        assert_eq!(
            EscapeProcessor::process_escape_token(&EventToken::EscapeTab).unwrap(),
            b'\t'
        );

        // Test invalid token
        assert!(EscapeProcessor::process_escape_token(&EventToken::String).is_err());
    }

    #[test]
    fn test_unicode_escape_collector_basic() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Add hex digits for \u0041 -> 'A'
        assert!(!collector.add_hex_digit(b'0').unwrap()); // Not complete yet
        assert!(!collector.add_hex_digit(b'0').unwrap()); // Not complete yet
        assert!(!collector.add_hex_digit(b'4').unwrap()); // Not complete yet
        assert!(collector.add_hex_digit(b'1').unwrap()); // Complete!

        // Process to UTF-8
        let result = collector.process_to_utf8(&mut utf8_buffer).unwrap();
        assert_eq!(result, b"A");
    }

    #[test]
    fn test_unicode_escape_collector_invalid_hex() {
        let mut collector = UnicodeEscapeCollector::new();

        // Valid digits first
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'0').unwrap());

        // Invalid hex digit should fail
        assert!(collector.add_hex_digit(b'G').is_err());
    }

    #[test]
    fn test_unicode_escape_collector_too_many_digits() {
        let mut collector = UnicodeEscapeCollector::new();

        // Fill the collector with a complete sequence
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'4').unwrap());
        assert!(collector.add_hex_digit(b'1').unwrap());

        // A fifth digit without a reset must be rejected
        assert!(collector.add_hex_digit(b'2').is_err());

        // After a reset the collector accepts digits again
        collector.reset();
        assert!(!collector.add_hex_digit(b'2').unwrap());
    }

    #[test]
    fn test_unicode_escape_collector_reset() {
        let mut collector = UnicodeEscapeCollector::new();

        // Add some digits
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'1').unwrap());

        // Reset should clear state
        collector.reset();

        // Should be able to start fresh
        assert!(!collector.add_hex_digit(b'A').unwrap());
    }

    #[test]
    fn test_unicode_escape_collector_multibyte() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Add hex digits for \u03B1 -> 'α' (Greek alpha)
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'3').unwrap());
        assert!(!collector.add_hex_digit(b'B').unwrap());
        assert!(collector.add_hex_digit(b'1').unwrap());

        let result = collector.process_to_utf8(&mut utf8_buffer).unwrap();
        assert_eq!(result, "α".as_bytes());
    }

    #[test]
    fn test_unicode_escape_collector_incomplete_processing() {
        let mut collector = UnicodeEscapeCollector::new();
        let mut utf8_buffer = [0u8; 4];

        // Add only 2 digits
        assert!(!collector.add_hex_digit(b'0').unwrap());
        assert!(!collector.add_hex_digit(b'0').unwrap());

        // Should fail to process incomplete sequence
        assert!(collector.process_to_utf8(&mut utf8_buffer).is_err());
    }
}
452

453
/// Shared implementation for processing a Unicode escape sequence.
454
///
455
/// This function centralizes the logic for handling `\uXXXX` escapes, which is
456
/// common to both the pull-based and stream-based parsers. It uses a generic
457
/// `hex_slice_provider` to remain independent of the underlying buffer implementation
458
/// (`SliceInputBuffer` vs. `DirectBuffer`).
459
///
460
/// # Arguments
461
/// * `current_pos` - The parser's current position in the input buffer, right after the 4 hex digits.
462
/// * `unicode_escape_collector` - A mutable reference to the shared `UnicodeEscapeCollector`.
463
/// * `hex_slice_provider` - A closure that takes a start and end position and returns the hex digit slice.
464
/// * `utf8_buf` - A buffer to write the UTF-8 encoded result into.
465
///
466
/// # Returns
467
/// A tuple containing the resulting UTF-8 byte slice and the start position of the escape sequence (`\uXXXX`).
468
pub(crate) fn process_unicode_escape_sequence<'a, F>(
3✔
469
    current_pos: usize,
3✔
470
    unicode_escape_collector: &mut UnicodeEscapeCollector,
3✔
471
    mut hex_slice_provider: F,
3✔
472
    utf8_buf: &'a mut [u8; 4],
3✔
473
) -> Result<(&'a [u8], usize), ParseError>
3✔
474
where
3✔
475
    F: FnMut(usize, usize) -> Result<&'a [u8], ParseError>,
3✔
476
{
477
    let (hex_start, hex_end, escape_start_pos) =
3✔
478
        crate::shared::ContentRange::unicode_escape_bounds(current_pos);
3✔
479

480
    // Extract the 4 hex digits from the buffer using the provider
481
    let hex_slice = hex_slice_provider(hex_start, hex_end)?;
3✔
482

483
    if hex_slice.len() != 4 {
3✔
484
        return Err(ParserErrorHandler::invalid_unicode_length());
×
485
    }
3✔
486

487
    // Feed hex digits to the shared collector
488
    for &hex_digit in hex_slice {
15✔
489
        unicode_escape_collector.add_hex_digit(hex_digit)?;
12✔
490
    }
491

492
    // Process the complete sequence to UTF-8
493
    let utf8_bytes = unicode_escape_collector.process_to_utf8(utf8_buf)?;
3✔
494

495
    Ok((utf8_bytes, escape_start_pos))
3✔
496
}
3✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc