• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

moonbitlang / x / 398

20 Mar 2025 04:51AM UTC coverage: 88.379% (-0.1%) from 88.491%
398

Pull #117

github

web-flow
Merge 3bb12d10d into 990216913
Pull Request #117: feat(encoding): add {encode, decode}_to functions

9 of 12 new or added lines in 2 files covered. (75.0%)

22 existing lines in 1 file now uncovered.

1270 of 1437 relevant lines covered (88.38%)

417.57 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

77.14
/encoding/decoding.mbt
1
// Copyright 2024 International Digital Economy Academy
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14

15
///|
/// The Unicode Replacement Character (U+FFFD), which is used to replace
/// invalid or unrecognized sequences during lossy decoding.
/// https://unicode.org/charts/nameslist/n_FFF0.html
pub const U_REP = '\u{FFFD}'
18

19
///|
/// Expected UTF-8 sequence length, indexed by the value of the first byte.
/// An entry of 0 marks a byte that the decoder reports as malformed when it
/// appears in the leading position.
let utf_8_len = [
  // 0x00..0x7F: single-byte sequences
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70
  // 0x80..0xC1: not accepted as a first byte
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0
  // 0xC2..0xDF: two-byte sequences
  0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0
  // 0xE0..0xEF: three-byte sequences
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0
  // 0xF0..0xF4: four-byte sequences; 0xF5..0xFF: not accepted
  4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0
]
32

33
///|
/// Create and return a `Decoder` for the specified character encoding.
///
/// The `Decoder` consumes byte sequences and decodes them into the original string format.
///
/// # Parameters
///
/// - `encoding`: The character encoding format to be used for decoding the input byte sequences.
///   `UTF16` selects the same little-endian decoding step as `UTF16LE`.
///
/// # Returns
///
/// A `Decoder` instance that can be used to decode byte sequences into strings.
///
/// # Examples
///
/// ```moonbit
/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
/// let decoder = decoder(UTF8)
/// inspect!(decoder.consume!(inputs[0]), content="abc")
/// inspect!(decoder.consume!(inputs[1]), content="")
/// inspect!(decoder.consume!(inputs[2]), content="🐰")
/// assert_true!(decoder.finish!().is_empty())
/// ```
pub fn decoder(encoding : Encoding) -> Decoder {
  // `i` is the current input buffer and `i_pos` the read position inside it
  let i = FixedArray::default()
  let i_pos = 0
  // `t` is a 4-byte scratch buffer for a sequence split across inputs;
  // `t_len` counts the bytes stored so far, `t_need` the bytes required
  let t = FixedArray::make(4, Byte::default())
  let t_len = 0
  let t_need = 0
  // `k` is the decoding step that runs on the next `decode_` call
  let k = match encoding {
    UTF8 => decode_utf_8
    UTF16 => decode_utf_16le
    UTF16LE => decode_utf_16le
    UTF16BE => decode_utf_16be
  }
  { i, i_pos, t, t_len, t_need, k }
}
69

70
///|
71
/// Decode the given byte sequence using the specified `Decoder` and return the resulting string.
72
///
73
/// This function can work in streaming mode where bytes are consumed incrementally.
74
/// When `stream` is `false`, it indicates the end of the input and triggers the final decoding step.
75
///
76
/// # Parameters
77
///
78
/// - `self`: The `Decoder` instance used to decode the byte sequence.
79
/// - `input`: The byte sequence to be decoded.
80
/// - `stream~`: A boolean indicating whether more bytes will be supplied for decoding. It defaults to `false`.
81
///
82
/// # Returns
83
///
84
/// A `String` representing the decoded content from the input byte sequence.
85
///
86
/// # Errors
87
///
88
/// `MalformedError`: when the byte sequence is not properly formatted according to the specified encoding.
89
/// `TruncatedError`: when the byte sequence ends prematurely, implying that more data is expected for complete decoding.
90
///
91
/// # Examples
92
///
93
/// ```moonbit
94
/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
95
/// let decoder = @encoding.decoder(UTF8)
96
/// inspect!(decoder.decode!(inputs[0], stream=true), content="abc")
97
/// inspect!(decoder.decode!(inputs[1], stream=true), content="")
98
/// inspect!(decoder.decode!(inputs[2], stream=false), content="🐰")
99
/// ```
100
pub fn decode!(self : Decoder, input : Bytes, stream~ : Bool = false) -> String {
  // append the new bytes (if any) to whatever is still undecoded
  if input.length() > 0 {
    self.i_cont(input)
  }
  // the buffer is exactly exhausted: there is nothing to decode
  guard self.i_rem() != 0 else { return String::default() }

  // TODO: Estimate size_hint based on input and encoding more accurately
  let out = StringBuilder::new(size_hint=input.length())

  // run decoding steps until one of them ends this call
  while true {
    match self.decode_() {
      Uchar(ch) => out.write_char(ch)
      Malformed(bad) =>
        // in streaming mode, once `t_need` is positive the call stops
        // quietly with what was decoded so far; otherwise the error is raised
        if stream && self.t_need > 0 {
          break
        } else {
          raise MalformedError(bad)
        }
      End => break
      Refill(pending) =>
        // more bytes are required: acceptable only when streaming
        if stream {
          break
        } else {
          raise TruncatedError(pending)
        }
    }
  }
  out.to_string()
}
132

133
///|
/// Decodes the given byte sequence using the specified decoder and writes the
/// result directly to a StringBuilder.
/// Similar to `decode!` with `stream=true`, but appends the decoded characters
/// to an existing StringBuilder instead of creating a new String.
///
/// Parameters:
///
/// * `self` : The decoder instance used to decode the byte sequence.
/// * `input` : The byte sequence to be decoded.
/// * `output` : The StringBuilder where the decoded content will be written to.
///
/// Throws a `MalformedError` when the byte sequence is not properly formatted
/// according to the specified encoding. Characters decoded before the error
/// have already been written to `output`.
///
/// An incomplete trailing sequence does not raise: the function simply
/// returns, as `decode!` does in streaming mode.
///
/// Example:
///
/// ```moonbit
/// test "decode_to" {
///   let decoder = decoder(UTF8)
///   let buf = StringBuilder::new()
///   decoder.decode_to!(b"Hello", buf)
///   inspect!(buf.to_string(), content="Hello")
/// }
/// ```
pub fn Decoder::decode_to(
  self : Decoder,
  input : Bytes,
  output : StringBuilder
) -> Unit! {
  // append the new bytes (if any) to whatever is still undecoded
  if input.length() > 0 {
    self.i_cont(input)
  }
  // the buffer is exactly exhausted: there is nothing to decode
  if self.i_rem() == 0 {
    return
  }
  // drive decoder to decode
  loop self.decode_() {
    Uchar(u) => {
      output.write_char(u)
      continue self.decode_()
    }
    // NOTE(review): the error is only raised while `t_need` is not positive;
    // otherwise decoding stops silently, mirroring `decode!` with `stream=true`.
    Malformed(bs) => if self.t_need <= 0 { raise MalformedError(bs) }
    End => return
    // more bytes are required; wait for the next call
    Refill(_) => return
  }
}
180

181
///|
/// One-shot variant of `Decoder::decode_to`: builds a fresh decoder for
/// `encoding`, decodes `input` with it and appends the characters to `output`.
///
/// Errors raised by `Decoder::decode_to` are propagated unchanged.
pub fn decode_to(
  input : Bytes,
  output : StringBuilder,
  encoding~ : Encoding
) -> Unit! {
  let fresh = decoder(encoding)
  fresh.decode_to!(input, output)
}
189

190
///|
191
/// Consume the given byte sequence using the specified `Decoder` and return the resulting string incrementally.
192
///
193
/// This function calls `decode!` with the `stream` parameter set to `true`, indicating that more bytes will follow for decoding.
194
///
195
/// # Parameters
196
///
197
/// - `self`: The `Decoder` instance used to consume the byte sequence.
198
/// - `input`: The byte sequence to be consumed and decoded incrementally.
199
///
200
/// # Returns
201
///
202
/// A `String` representing the partially decoded content from the input byte sequence, as more bytes are expected.
203
///
204
/// # Errors
205
///
206
/// `MalformedError`: when the byte sequence is not properly formatted according to the specified encoding.
207
/// `TruncatedError`: listed for parity with `decode!`; because this function always passes `stream=true`, an incomplete trailing sequence makes the call return what was decoded so far instead of raising.
208
pub fn consume!(self : Decoder, input : Bytes) -> String {
  // streaming mode: more input is expected after this chunk
  let chunk = self.decode!(input, stream=true)
  chunk
}
211

212
///|
213
/// Finalize the decoding process and return the remaining decoded string.
214
///
215
/// This function calls `decode!` with the `stream` parameter set to `false`, indicating that no more bytes will be supplied
216
/// and triggering the final decoding step to produce the remaining output.
217
///
218
/// # Parameters
219
///
220
/// - `self`: The `Decoder` instance used to finalize the decoding process.
221
///
222
/// # Returns
223
///
224
/// A `String` representing the final part of the decoded content, after all byte sequences have been processed.
225
///
226
/// # Errors
227
///
228
/// `MalformedError`: This error is raised if the remaining byte sequence is not properly formatted according to the specified encoding.
229
/// `TruncatedError`: This error is raised if the remaining byte sequence ends prematurely, implying that more data was expected for complete decoding.
230
pub fn finish!(self : Decoder) -> String {
  // no new bytes and `stream=false`: decode whatever is still pending
  let tail = self.decode!(b"", stream=false)
  tail
}
233

234
///|
235
/// Decode the given byte sequence using the specified `Decoder` and return the resulting string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
236
///
237
/// This function can work in streaming mode where bytes are consumed incrementally.
238
/// When `stream` is `false`, it indicates the end of the input and triggers the final decoding step.
239
///
240
/// # Parameters
241
///
242
/// - `self`: The `Decoder` instance used to decode the byte sequence.
243
/// - `input`: The byte sequence to be decoded.
244
/// - `stream~`: A boolean indicating whether more bytes will be supplied for decoding. It defaults to `false`.
245
///
246
/// # Returns
247
///
248
/// A `String` representing the decoded content from the input byte sequence, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`).
249
pub fn decode_lossy(
  self : Decoder,
  input : Bytes,
  stream~ : Bool = false
) -> String {
  // append the new bytes (if any) to whatever is still undecoded
  if input.length() > 0 {
    self.i_cont(input)
  }
  // the buffer is exactly exhausted: there is nothing to decode
  guard self.i_rem() != 0 else { return String::default() }

  // run decoding steps, collecting the characters produced
  let decoded = []
  while true {
    match self.decode_() {
      Uchar(ch) => decoded.push(ch)
      Malformed(_) =>
        // in streaming mode, once `t_need` is positive the call stops here;
        // otherwise the bad sequence becomes U+FFFD and decoding goes on
        if stream && self.t_need > 0 {
          break
        } else {
          decoded.push(U_REP)
        }
      End => break
      // more bytes are required: stop when streaming, otherwise keep
      // stepping the decoder
      Refill(_) => if stream { break }
    }
  }
  String::from_array(decoded)
}
284

285
///|
286
/// Consume the given byte sequence using the specified `Decoder` and return the resulting string incrementally, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
287
///
288
/// This function calls `decode_lossy` with the `stream` parameter set to `true`, indicating that more bytes will follow for decoding.
289
///
290
/// # Parameters
291
///
292
/// - `self`: The `Decoder` instance used to consume and decode the byte sequence.
293
/// - `input`: The byte sequence to be consumed and decoded incrementally.
294
///
295
/// # Returns
296
///
297
/// A `String` representing the partially decoded content from the input byte sequence, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`), as more bytes are expected.
298
pub fn lossy_consume(self : Decoder, input : Bytes) -> String {
  // streaming mode: more input is expected after this chunk
  let chunk = self.decode_lossy(input, stream=true)
  chunk
}
301

302
///|
303
/// Finalize the lossy decoding process and return the remaining decoded string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
304
///
305
/// This function calls `decode_lossy` with the `stream` parameter set to `false`, indicating that no more bytes will be supplied
306
/// and triggering the final decoding step to produce the remaining output.
307
///
308
/// # Parameters
309
///
310
/// - `self`: The `Decoder` instance used to finalize the lossy decoding process.
311
///
312
/// # Returns
313
///
314
/// A `String` representing the final part of the decoded content, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`), after all byte sequences have been processed.
315
pub fn lossy_finish(self : Decoder) -> String {
  // no new bytes and `stream=false`: decode whatever is still pending
  let tail = self.decode_lossy(b"", stream=false)
  tail
}
318

319
///|
/// Replace the input buffer with the not-yet-decoded rest of the old buffer
/// followed by `input`, and rewind the read position to 0.
fn i_cont(self : Decoder, input : Bytes) -> Unit {
  // clamp at 0: `t_fill` treats a negative remainder as end of input
  let carried = @math.maximum(self.i_rem(), 0)
  let added = input.length()
  let merged = FixedArray::make(carried + added, Byte::default())
  if carried > 0 {
    // keep the unread tail of the old buffer at the front
    self.i.blit_to(merged, len=carried, src_offset=self.i_pos)
  }
  // the new bytes go right after the carried tail
  merged.blit_from_bytes(carried, input, 0, added)
  self.i_pos = 0
  self.i = merged
}
336

337
// Implementations
338

339
///|
/// Run the decoder's current step `k` once and return its result.
fn decode_(self : Decoder) -> Decode {
  let step = self.k
  step(self)
}
343

344
///|
/// Install `k` as the step for the next `decode_` call, then hand back `v`.
fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
  self.k = k
  return v
}
349

350
///|
/// Length of the input buffer minus the read position. `t_fill` treats a
/// negative result as end of input.
fn i_rem(self : Decoder) -> Int {
  let size = self.i.length()
  size - self.i_pos
}
354

355
///|
/// Start collecting a new sequence in the scratch buffer: record that `need`
/// bytes are required and that none have been stored yet.
fn t_need(self : Decoder, need : Int) -> Unit {
  self.t_need = need
  self.t_len = 0
}
360

361
///|
/// Drop the current input buffer by replacing it with a default one.
/// `i_pos` is left untouched.
fn eoi(self : Decoder) -> Unit {
  let blank = FixedArray::default()
  self.i = blank
}
365

366
///|
/// Signal that more input is required: drop the input buffer, install `k` as
/// the next step and return `Refill` carrying the bytes of the scratch buffer.
fn refill(self : Decoder, k : Cont) -> Decode {
  self.eoi()
  let pending = Bytes::from_fixedarray(self.t)
  self.ret(k, Decode::Refill(pending))
}
371

372
///|
/// Move bytes from the input buffer into the scratch buffer `t` until
/// `t_need` bytes are stored, then continue with `k`.
///
/// When the input runs out first, everything available is moved and a
/// `Refill` is returned whose continuation resumes this same filling step.
/// When the remainder is negative (end of input) `k` runs straight away.
fn t_fill(k : Cont, decoder : Decoder) -> Decode {
  // copy `count` bytes from the read position of `i` to the end of `t`
  fn move_to_t(d : Decoder, count : Int) -> Unit {
    d.i.blit_to(d.t, len=count, dst_offset=d.t_len, src_offset=d.i_pos)
    d.i_pos += count
    d.t_len += count
  }

  let available = decoder.i_rem()
  if available < 0 {
    // eoi
    return k(decoder)
  }
  let missing = decoder.t_need - decoder.t_len
  if available < missing {
    move_to_t(decoder, available)
    decoder.refill(@tuple.curry(t_fill)(k))
  } else {
    move_to_t(decoder, missing)
    k(decoder)
  }
}
399

400
// UTF8
401

402
///|
/// UTF-8 decoding step: decode the sequence at the read position.
///
/// Returns `End` when no input is left. When the sequence announced by the
/// first byte is longer than the remaining input, the bytes are collected
/// through `t_fill`. A first byte whose `utf_8_len` entry is 0 is consumed
/// on its own and reported as malformed.
fn decode_utf_8(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  let lead = self.i[self.i_pos].to_int()
  let width = utf_8_len[lead]
  if available < width {
    // the sequence is split across inputs: gather it in the scratch buffer
    self.t_need(width)
    return t_fill(t_decode_utf_8, self)
  }
  let start = self.i_pos
  if width == 0 {
    self.i_pos += 1
    self.ret(decode_utf_8, malformed(self.i, start, 1))
  } else {
    self.i_pos += width
    self.ret(decode_utf_8, r_utf_8(self.i, start, width))
  }
}
425

426
///|
/// Continuation run after `t_fill` for UTF-8: decode the sequence held in the
/// scratch buffer, or report the stored bytes as malformed when fewer than
/// `t_need` bytes were collected. The next step is `decode_utf_8` either way.
fn t_decode_utf_8(self : Decoder) -> Decode {
  let outcome = if self.t_len >= self.t_need {
    r_utf_8(self.t, 0, self.t_len)
  } else {
    malformed(self.t, 0, self.t_len)
  }
  self.ret(decode_utf_8, outcome)
}
434

435
///|
/// Decode one UTF-8 sequence of `length` bytes (1 to 4) starting at `offset`
/// in `bytes`. Returns `Uchar` on success and the result of `malformed` for
/// the whole sequence when a byte fails the checks below. Any other `length`
/// panics.
fn r_utf_8(bytes : FixedArray[Byte], offset : Int, length : Int) -> Decode {
  // byte number `k` of the sequence, as an Int
  fn at(k : Int) -> Int {
    bytes[offset + k].to_int()
  }

  // true for bytes of the form 0b10xxxxxx
  fn is_cont(b : Int) -> Bool {
    (b >> 6) == 0b10
  }

  fn within(b : Int, lo : Int, hi : Int) -> Bool {
    lo <= b && b <= hi
  }

  // produce the character `c` when `ok`, otherwise a malformed result
  fn accept(ok : Bool, c : Int) -> Decode {
    if ok {
      Uchar(Char::from_int(c))
    } else {
      malformed(bytes, offset, length)
    }
  }

  match length {
    1 => accept(true, at(0))
    2 => {
      let b0 = at(0)
      let b1 = at(1)
      accept(is_cont(b1), ((b0 & 0x1F) << 6) | (b1 & 0x3F))
    }
    3 => {
      let b0 = at(0)
      let b1 = at(1)
      let b2 = at(2)
      // 0xE0 and 0xED restrict the range of the second byte
      let second_ok = match b0 {
        0xE0 => within(b1, 0xA0, 0xBF)
        0xED => within(b1, 0x80, 0x9F)
        _ => is_cont(b1)
      }
      accept(
        is_cont(b2) && second_ok,
        ((b0 & 0x0F) << 12) | (((b1 & 0x3F) << 6) | (b2 & 0x3F)),
      )
    }
    4 => {
      let b0 = at(0)
      let b1 = at(1)
      let b2 = at(2)
      let b3 = at(3)
      // 0xF0 and 0xF4 restrict the range of the second byte
      let second_ok = match b0 {
        0xF0 => within(b1, 0x90, 0xBF)
        0xF4 => within(b1, 0x80, 0x8F)
        _ => is_cont(b1)
      }
      accept(
        is_cont(b3) && is_cont(b2) && second_ok,
        ((b0 & 0x07) << 18) |
        ((b1 & 0x3F) << 12) |
        ((b2 & 0x3F) << 6) |
        (b3 & 0x3F),
      )
    }
    _ => panic()
  }
}
519

520
// UTF16LE
521

522
///|
/// Result of reading a single 16-bit unit with `r_utf_16`.
priv enum UTF16Decode {
  // a unit in 0xD800..0xDBFF; the decoder goes on to read a low surrogate
  Hi(Int)
  // a unit in 0xDC00..0xDFFF found where no high surrogate preceded it;
  // carries the two offending bytes
  UTF16Malformed(Bytes)
  // any other unit, already converted to a character
  UTF16Uchar(Char)
}
528

529
///|
/// UTF-16LE decoding step: read the 16-bit unit at the read position (low
/// byte first) and pass it to `decode_utf_16le_lo`.
///
/// Returns `End` when no input is left; a single leftover byte is collected
/// through `t_fill`.
fn decode_utf_16le(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  if available < 2 {
    self.t_need(2)
    return t_fill(t_decode_utf_16le, self)
  }
  let start = self.i_pos
  self.i_pos += 2
  // little-endian: the high-order byte is the second one
  self.decode_utf_16le_lo(r_utf_16(self.i, start + 1, start))
}
543

544
///|
/// Continuation run after `t_fill` for a UTF-16LE unit: read the unit from
/// the scratch buffer, or report the stored bytes as malformed when fewer
/// than `t_need` bytes were collected.
fn t_decode_utf_16le(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    return self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0))
  }
  self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len))
}
552

553
///|
/// Finish a UTF-16LE step given the unit `v` that was just read.
///
/// Characters and malformed units are returned directly. After a high
/// surrogate the following unit is read as the low surrogate; its two bytes
/// are consumed only when the pair decodes to a character.
fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
  match v {
    Hi(lead) =>
      if self.i_rem() < 2 {
        // the low surrogate is split across inputs: gather it first
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16le_lo)(lead), self)
      } else {
        let at = self.i_pos
        let outcome = r_utf_16_lo(lead, self.i, at + 1, at)
        match outcome {
          Uchar(_) => self.i_pos += 2
          // leave the bytes in place so the next step reads them again
          _ => ()
        }
        self.ret(decode_utf_16le, outcome)
      }
    UTF16Malformed(raw) => self.ret(decode_utf_16le, Malformed(raw))
    UTF16Uchar(ch) => self.ret(decode_utf_16le, Uchar(ch))
  }
}
575

576
///|
/// Continuation run after `t_fill` for the low surrogate of a UTF-16LE pair
/// whose high surrogate is `hi`: combine the pair from the scratch buffer, or
/// build a `malformed_pair` result when fewer than `t_need` bytes were
/// collected. The next step is `decode_utf_16le` either way.
fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
  let outcome = if decoder.t_len >= decoder.t_need {
    r_utf_16_lo(hi, decoder.t, 1, 0)
  } else {
    malformed_pair(false, hi, decoder.t, 0, decoder.t_len)
  }
  decoder.ret(decode_utf_16le, outcome)
}
587

588
///|
/// Read the low surrogate that follows the high surrogate `hi` and combine
/// the pair into a character.
///
/// `offset0` is the index of the high-order byte of the unit and `offset1`
/// the index of its low-order byte, so the same code serves both byte orders.
///
/// Returns `Uchar` for a valid pair. When the unit is not in 0xDC00..0xDFFF
/// the result is `Malformed` carrying the two bytes that were read.
fn r_utf_16_lo(
  hi : Int,
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> Decode {
  let b0 = bytes[offset0].to_int()
  let b1 = bytes[offset1].to_int()
  let lo = (b0 << 8) | b1
  if lo < 0xDC00 || lo > 0xDFFF {
    // NOTE(jinser): only hi malformed, skip lo if lo is illegal
    //
    // For example, b"\xD8\x00\x00\x48" (BE)
    // Since \xD8\x00 is *legal* hi, here will try to parse lo next,
    // however the whole \xD8\x00\x00\x48 is *illegal* so the result will be a `Malformed[b"\xD8\x00\x00\x48"]`
    //
    // But \x00\x48 itself is a *legal* UTF16 code point with a value of `H`,
    // the ideal result should be: `[Malformed(b"\xD8\x00"), Uchar('H')]`
    //
    // > printf '\xD8\x00\x00\x48' | uconv --from-code UTF16BE --to-code UTF8 --from-callback substitute
    // �H
    Malformed([bytes[offset0], bytes[offset1]])
  } else {
    // Assemble the 20-bit offset first and only then add 0x10000. OR-ing
    // 0x10000 in instead loses the carry whenever bit 16 of the offset is
    // already set (e.g. D840 DC00 must give U+20000, not U+10000).
    let scalar = (((hi & 0x3FF) << 10) | (lo & 0x3FF)) + 0x10000
    Uchar(Char::from_int(scalar))
  }
}
615

616
///|
/// Read one 16-bit unit from `bytes` and classify it.
///
/// `offset0` is the index of the high-order byte and `offset1` the index of
/// the low-order byte. A unit in 0xD800..0xDBFF yields `Hi`, a unit in
/// 0xDC00..0xDFFF yields `UTF16Malformed` with its two bytes, and every
/// other unit yields `UTF16Uchar`.
fn r_utf_16(
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> UTF16Decode {
  let high_byte = bytes[offset0].to_int()
  let low_byte = bytes[offset1].to_int()
  let unit = (high_byte << 8) | low_byte
  if unit >= 0xD800 && unit <= 0xDBFF {
    Hi(unit)
  } else if unit >= 0xDC00 && unit <= 0xDFFF {
    // the two bytes start at the smaller of the two offsets
    UTF16Malformed(slice(bytes, @math.minimum(offset0, offset1), 2))
  } else {
    UTF16Uchar(Char::from_int(unit))
  }
}
633

634
// UTF16BE
635

636
///|
/// UTF-16BE decoding step: read the 16-bit unit at the read position (high
/// byte first) and pass it to `decode_utf_16be_lo`.
///
/// Returns `End` when no input is left; a single leftover byte is collected
/// through `t_fill`.
fn decode_utf_16be(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  if available < 2 {
    self.t_need(2)
    return t_fill(t_decode_utf_16be, self)
  }
  let start = self.i_pos
  self.i_pos += 2
  // big-endian: the high-order byte is the first one
  self.decode_utf_16be_lo(r_utf_16(self.i, start, start + 1))
}
650

651
///|
/// Continuation run after `t_fill` for a UTF-16BE unit: read the unit from
/// the scratch buffer, or report the stored bytes as malformed when fewer
/// than `t_need` bytes were collected.
fn t_decode_utf_16be(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    return self.decode_utf_16be_lo(r_utf_16(self.t, 0, 1))
  }
  self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len))
}
659

660
///|
/// Finish a UTF-16BE step given the unit `decode` that was just read.
///
/// Characters and malformed units are returned directly. After a high
/// surrogate the following unit is read as the low surrogate; its two bytes
/// are consumed only when the pair decodes to a character.
fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
  match decode {
    Hi(lead) =>
      if self.i_rem() < 2 {
        // the low surrogate is split across inputs: gather it first
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16be_lo)(lead), self)
      } else {
        let at = self.i_pos
        let outcome = r_utf_16_lo(lead, self.i, at, at + 1)
        match outcome {
          Uchar(_) => self.i_pos += 2
          // leave the bytes in place so the next step reads them again
          _ => ()
        }
        self.ret(decode_utf_16be, outcome)
      }
    UTF16Malformed(raw) => self.ret(decode_utf_16be, Malformed(raw))
    UTF16Uchar(ch) => self.ret(decode_utf_16be, Uchar(ch))
  }
}
682

683
///|
/// Continuation run after `t_fill` for the low surrogate of a UTF-16BE pair
/// whose high surrogate is `hi`: combine the pair from the scratch buffer, or
/// build a `malformed_pair` result when fewer than `t_need` bytes were
/// collected. The next step is `decode_utf_16be` either way.
fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
  let outcome = if self.t_len >= self.t_need {
    r_utf_16_lo(hi, self.t, 0, 1)
  } else {
    malformed_pair(true, hi, self.t, 0, self.t_len)
  }
  self.ret(decode_utf_16be, outcome)
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc