• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

moonbitlang / x / 516

18 Jun 2025 08:32AM UTC coverage: 92.153% (-0.1%) from 92.25%
516

Pull #156

github

web-flow
Merge 46c19f46c into e90d6e8c7
Pull Request #156: minor: remove test {} in doc test

9 of 10 new or added lines in 3 files covered. (90.0%)

2 existing lines in 1 file now uncovered.

1832 of 1988 relevant lines covered (92.15%)

408.99 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.02
/encoding/decoding.mbt
1
// Copyright 2024 International Digital Economy Academy
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14

15
///|
/// The Unicode Replacement Character (`U+FFFD`).
///
/// The lossy decoding functions in this file emit it in place of every
/// malformed byte sequence they encounter.
/// See https://unicode.org/charts/nameslist/n_FFF0.html
pub const U_REP = '\u{FFFD}'
18

19
///|
/// Expected length of a UTF-8 sequence, indexed by the value of its first byte.
///
/// An entry of `0` marks a byte value that cannot start a sequence;
/// `decode_utf_8` reports such a byte as a one-byte malformed sequence.
/// The table has 256 entries, laid out 16 per row.
let utf_8_len = [
  // 0x00-0x7F: one-byte sequences
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70
  // 0x80-0xBF: never valid as a first byte
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0
  // 0xC0-0xC1 invalid, 0xC2-0xDF: two-byte sequences
  0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0
  // 0xE0-0xEF: three-byte sequences
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0
  // 0xF0-0xF4: four-byte sequences, 0xF5-0xFF invalid
  4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0
]
32

33
///|
/// Create and return a `Decoder` for the specified character encoding.
///
/// The `Decoder` consumes byte sequences and decodes them into the original string format.
///
/// # Parameters
///
/// - `encoding`: The character encoding format to be used for decoding the input byte sequences.
///   `UTF16` is decoded with the same routine as `UTF16LE`.
///
/// # Returns
///
/// A `Decoder` instance that can be used to decode byte sequences into strings.
///
/// # Examples
///
/// ```moonbit
/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
/// let decoder = decoder(UTF8)
/// inspect(decoder.consume(inputs[0]), content="abc")
/// inspect(decoder.consume(inputs[1]), content="")
/// inspect(decoder.consume(inputs[2]), content="🐰")
/// assert_true(decoder.finish().is_empty())
/// ```
pub fn decoder(encoding : Encoding) -> Decoder {
  // Pick the entry-point continuation for the requested encoding.
  let k = match encoding {
    UTF8 => Decoder::decode_utf_8
    UTF16 | UTF16LE => Decoder::decode_utf_16le
    UTF16BE => Decoder::decode_utf_16be
  }
  {
    // `i` holds the pending input and `i_pos` the read offset into it.
    i: FixedArray::default(),
    i_pos: 0,
    // `t` is a 4-byte scratch buffer that `t_fill` uses to reassemble a
    // sequence split across inputs; `t_len` bytes of the `t_need` wanted
    // are present.
    t: FixedArray::make(4, Byte::default()),
    t_len: 0,
    t_need: 0,
    k,
  }
}
69

70
///|
/// Decode `input` with this `Decoder` and return the decoded text.
///
/// The decoder can be driven incrementally: pass `stream=true` while more
/// bytes will follow, and `stream=false` (the default) for the last piece so
/// that an incomplete trailing sequence is reported instead of being kept.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to decode the byte sequence.
/// - `input`: The byte sequence to be decoded.
/// - `stream~`: Whether more bytes will be supplied later. Defaults to `false`.
///
/// # Returns
///
/// A `String` with the characters decoded by this call. The result is empty
/// when the decoder has no bytes left to process.
///
/// # Errors
///
/// `MalformedError`: when the byte sequence is not properly formatted according to the specified encoding.
/// `TruncatedError`: when the byte sequence ends prematurely and `stream` is `false`.
///
/// # Examples
///
/// ```moonbit
/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
/// let decoder = @encoding.decoder(UTF8)
/// inspect(decoder.decode(inputs[0], stream=true), content="abc")
/// inspect(decoder.decode(inputs[1], stream=true), content="")
/// inspect(decoder.decode(inputs[2], stream=false), content="🐰")
/// ```
pub fn decode(
  self : Decoder,
  input : @bytes.View,
  stream~ : Bool = false
) -> String raise Error {
  // Queue the new bytes behind whatever is still pending.
  if input.length() != 0 {
    self.i_cont(input)
  }
  guard self.i_rem() != 0 else { return String::default() }

  // TODO: Estimate size_hint based on input and encoding more accurately
  let out = StringBuilder::new(size_hint=input.length())

  // Step the decoder until it stops producing characters.
  loop self.decode_() {
    End => out.to_string()
    Refill(pending) =>
      if stream {
        // More input is expected; hand back what was decoded so far.
        out.to_string()
      } else {
        raise TruncatedError(pending)
      }
    Malformed(bad) =>
      if stream && self.t_need > 0 {
        // In streaming mode with a sequence length recorded, the malformed
        // result is not raised; the text decoded so far is returned instead.
        out.to_string()
      } else {
        raise MalformedError(bad)
      }
    Uchar(ch) => {
      out.write_char(ch)
      continue self.decode_()
    }
  }
}
136

137
///|
/// Decode `input` with this decoder and append the decoded characters to
/// `output`.
///
/// Behaves like `decode`, except that the text is written into an existing
/// `StringBuilder` rather than returned as a fresh `String`.
///
/// Parameters:
///
/// * `self` : The decoder instance used to decode the byte sequence.
/// * `input` : The byte sequence to be decoded.
/// * `output` : The StringBuilder that receives the decoded characters.
/// * `stream~` : Whether more bytes will be supplied later. Defaults to `false`.
///
/// Throws a `MalformedError` when the byte sequence is not properly formatted
/// according to the specified encoding, and a `TruncatedError` when the input
/// ends in the middle of a sequence while `stream` is `false`.
///
/// Example:
///
/// ```moonbit
///   let decoder = decoder(UTF8)
///   let buf = StringBuilder::new()
///   decoder.decode_to(b"Hello", buf)
///   inspect(buf.to_string(), content="Hello")
/// ```
pub fn Decoder::decode_to(
  self : Decoder,
  input : @bytes.View,
  output : StringBuilder,
  stream~ : Bool = false
) -> Unit raise {
  // Queue the new bytes behind whatever is still pending.
  if input.length() != 0 {
    self.i_cont(input)
  }
  guard self.i_rem() != 0 else { return }
  // Step the decoder until it stops producing characters.
  loop self.decode_() {
    End => return
    Refill(pending) =>
      if stream {
        return
      } else {
        raise TruncatedError(pending)
      }
    Malformed(bad) =>
      if stream && self.t_need > 0 {
        // Streaming with a sequence length recorded: stop without raising.
        return
      } else {
        raise MalformedError(bad)
      }
    Uchar(ch) => {
      output.write_char(ch)
      continue self.decode_()
    }
  }
}
188

189
///|
/// One-shot helper: decode `input` as `encoding` and append the result to
/// `output`.
///
/// A fresh decoder is created for the call and run in non-streaming mode, so
/// errors raised by `Decoder::decode_to` propagate to the caller.
pub fn decode_to(
  input : @bytes.View,
  output : StringBuilder,
  encoding~ : Encoding
) -> Unit raise {
  let one_shot = decoder(encoding)
  one_shot.decode_to(input, output)
}
197

198
///|
/// Feed `input` to the decoder in streaming mode and return the text decoded
/// so far.
///
/// Equivalent to calling `decode` with `stream=true`, i.e. more bytes are
/// expected to follow.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to consume the byte sequence.
/// - `input`: The byte sequence to be consumed and decoded incrementally.
///
/// # Returns
///
/// A `String` with the characters that could be decoded from the bytes
/// received up to this point.
///
/// # Errors
///
/// Any error raised by `decode` when called with `stream=true`.
pub fn consume(self : Decoder, input : @bytes.View) -> String raise Error {
  let more_to_come = true
  self.decode(input, stream=more_to_come)
}
219

220
///|
/// Finalize the decoding process and return the remaining decoded string.
///
/// Equivalent to calling `decode` with an empty input and `stream=false`,
/// which signals that no more bytes will be supplied.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to finalize the decoding process.
///
/// # Returns
///
/// A `String` with whatever could still be decoded from the bytes the
/// decoder was holding.
///
/// # Errors
///
/// `MalformedError`: raised if the remaining byte sequence is not properly formatted according to the specified encoding.
/// `TruncatedError`: raised if the remaining byte sequence ends prematurely.
pub fn finish(self : Decoder) -> String raise Error {
  let no_more_input : @bytes.View = b""
  self.decode(no_more_input, stream=false)
}
241

242
///|
/// Decode the given byte sequence using the specified `Decoder` and return the resulting string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
///
/// This function can work in streaming mode where bytes are consumed incrementally.
/// When `stream` is `false`, it indicates the end of the input and triggers the final decoding step.
///
/// The decoding itself is delegated to `Decoder::decode_lossy_to`, so both
/// entry points share a single driver loop and cannot drift apart.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to decode the byte sequence.
/// - `input`: The byte sequence to be decoded.
/// - `stream~`: A boolean indicating whether more bytes will be supplied for decoding. It defaults to `false`.
///
/// # Returns
///
/// A `String` representing the decoded content from the input byte sequence, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`).
/// The result is empty when the decoder has no bytes left to process.
pub fn decode_lossy(
  self : Decoder,
  input : @bytes.View,
  stream~ : Bool = false
) -> String {
  // Write straight into a builder instead of collecting an Array[Char] first.
  let builder = StringBuilder::new(size_hint=input.length())
  self.decode_lossy_to(input, builder, stream=stream)
  builder.to_string()
}
292

293
///|
/// Lossy counterpart of `Decoder::decode_to`: decode `input` and append the
/// result to `output`, writing `U_REP` (`U+FFFD`) for every malformed
/// sequence instead of raising.
///
/// Parameters:
///
/// * `self` : The decoder instance used to decode the byte sequence.
/// * `input` : The byte sequence to be decoded.
/// * `output` : The StringBuilder that receives the decoded characters.
/// * `stream~` : Whether more bytes will be supplied later. Defaults to `false`.
pub fn Decoder::decode_lossy_to(
  self : Decoder,
  input : @bytes.View,
  output : StringBuilder,
  stream~ : Bool = false
) -> Unit {
  // Queue the new bytes behind whatever is still pending.
  if input.length() != 0 {
    self.i_cont(input)
  }
  guard self.i_rem() != 0 else { return }

  // Step the decoder until it stops producing characters.
  loop self.decode_() {
    End => return
    Refill(_) =>
      if stream {
        // Wait for the caller to supply the rest of the sequence.
        return
      } else {
        // No more input will come: keep stepping so the incomplete tail is
        // surfaced by the decoder.
        continue self.decode_()
      }
    Malformed(_) =>
      if stream && self.t_need > 0 {
        return
      } else {
        output.write_char(U_REP)
        continue self.decode_()
      }
    Uchar(ch) => {
      output.write_char(ch)
      continue self.decode_()
    }
  }
}
324

325
///|
/// Feed `input` to the decoder in streaming mode and return the text decoded
/// so far, with invalid sequences replaced by the Unicode Replacement
/// Character (`U+FFFD`).
///
/// Equivalent to calling `decode_lossy` with `stream=true`, i.e. more bytes
/// are expected to follow.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to consume and decode the byte sequence.
/// - `input`: The byte sequence to be consumed and decoded incrementally.
///
/// # Returns
///
/// A `String` with the characters that could be decoded from the bytes
/// received up to this point.
pub fn lossy_consume(self : Decoder, input : @bytes.View) -> String {
  let more_to_come = true
  self.decode_lossy(input, stream=more_to_come)
}
341

342
///|
/// Finalize the lossy decoding process and return the remaining decoded
/// string, with invalid sequences replaced by the Unicode Replacement
/// Character (`U+FFFD`).
///
/// Equivalent to calling `decode_lossy` with an empty input and
/// `stream=false`, which signals that no more bytes will be supplied.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to finalize the lossy decoding process.
///
/// # Returns
///
/// A `String` with whatever could still be decoded from the bytes the
/// decoder was holding.
pub fn lossy_finish(self : Decoder) -> String {
  let no_more_input : @bytes.View = b""
  self.decode_lossy(no_more_input, stream=false)
}
358

359
///|
/// Replaces the input buffer `i` with the not-yet-consumed tail of the old
/// buffer followed by `input`, and rewinds `i_pos` to 0.
fn i_cont(self : Decoder, input : @bytes.View) -> Unit {
  // `i_rem()` can be negative, so clamp the carried-over count at zero.
  let carried = @math.maximum(self.i_rem(), 0)
  let merged = FixedArray::make(carried + input.length(), Byte::default())
  if carried > 0 {
    // Unconsumed bytes of the old buffer go first.
    self.i.blit_to(merged, len=carried, src_offset=self.i_pos)
  }
  // The new bytes follow directly after the carried-over ones.
  merged.blit_from_bytesview(carried, input)
  self.i = merged
  self.i_pos = 0
}
376

377
// Implementations
378

379
///|
/// Runs one decoding step by invoking the continuation currently stored in
/// `k` and returns its result.
fn decode_(self : Decoder) -> Decode {
  let step = self.k
  step(self)
}
383

384
///|
/// Installs `k` as the continuation for the next step and returns `v`
/// unchanged.
fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
  let result = v
  self.k = k
  result
}
389

390
///|
/// Length of the input buffer `i` minus the read position `i_pos`.
///
/// The result can be negative; `t_fill` treats a negative value as end of
/// input.
fn i_rem(self : Decoder) -> Int {
  let buffered = self.i.length()
  buffered - self.i_pos
}
394

395
///|
/// Prepares the scratch buffer for a new sequence: records that `need` bytes
/// are wanted and resets the count of bytes collected so far.
fn t_need(self : Decoder, need : Int) -> Unit {
  self.t_need = need
  self.t_len = 0
}
400

401
///|
/// Drops the current input buffer by replacing `i` with a default
/// `FixedArray`. `i_pos` is left untouched.
fn eoi(self : Decoder) -> Unit {
  let drained = FixedArray::default()
  self.i = drained
}
405

406
///|
/// Signals that more input is required: drops the input buffer, installs `k`
/// as the continuation to resume with, and returns `Refill` carrying a copy
/// of the scratch buffer `t`.
fn refill(self : Decoder, k : Cont) -> Decode {
  self.eoi()
  let request = Decode::Refill(Bytes::from_fixedarray(self.t))
  self.ret(k, request)
}
411

412
///|
/// Moves bytes from the input buffer `i` into the scratch buffer `t` until
/// `t_need` bytes are collected, then resumes with `k`.
///
/// - If `i_rem()` is negative (end of input), `k` is invoked immediately with
///   whatever `t` holds.
/// - If the input cannot supply all missing bytes, everything available is
///   copied and a `Refill` is returned whose continuation re-enters `t_fill`.
/// - Otherwise exactly the missing bytes are copied and `k` is invoked.
fn t_fill(k : Cont, decoder : Decoder) -> Decode {
  // Copy `count` bytes from `i` to the end of `t` and advance both cursors.
  fn stash(count : Int) -> Unit {
    decoder.i.blit_to(
      decoder.t,
      len=count,
      dst_offset=decoder.t_len,
      src_offset=decoder.i_pos,
    )
    decoder.i_pos += count
    decoder.t_len += count
  }

  let available = decoder.i_rem()
  guard available >= 0 else { return k(decoder) } // eoi
  let missing = decoder.t_need - decoder.t_len
  if available >= missing {
    stash(missing)
    k(decoder)
  } else {
    stash(available)
    decoder.refill(curry(t_fill)(k))
  }
}
439

440
// UTF8
441

442
///|
/// UTF-8 decoding step.
///
/// Returns `End` when no input remains. Otherwise looks up the sequence
/// length for the next byte in `utf_8_len`:
/// - if the input holds fewer bytes than required, collection continues
///   through the scratch buffer via `t_fill`;
/// - a length of `0` consumes one byte and reports it as malformed;
/// - otherwise the whole sequence is consumed and decoded by `r_utf_8`.
fn decode_utf_8(self : Decoder) -> Decode {
  let rem = self.i_rem()
  guard rem > 0 else { return Decode::End }
  let start = self.i_pos
  let need = utf_8_len[self.i[start].to_int()]
  if rem < need {
    // The sequence is split across inputs: gather it in the scratch buffer.
    self.t_need(need)
    return t_fill(Decoder::t_decode_utf_8, self)
  }
  if need == 0 {
    // Not a valid first byte: skip just this byte.
    self.i_pos += 1
    self.ret(Decoder::decode_utf_8, malformed(self.i, start, 1))
  } else {
    self.i_pos += need
    self.ret(Decoder::decode_utf_8, r_utf_8(self.i, start, need))
  }
}
465

466
///|
/// Continuation run after `t_fill` has gathered a UTF-8 sequence in `t`.
///
/// If fewer than `t_need` bytes were collected, the bytes in `t` are reported
/// as malformed; otherwise they are decoded with `r_utf_8`. Either way the
/// decoder resumes with `decode_utf_8`.
fn t_decode_utf_8(self : Decoder) -> Decode {
  let outcome = if self.t_len >= self.t_need {
    r_utf_8(self.t, 0, self.t_len)
  } else {
    malformed(self.t, 0, self.t_len)
  }
  self.ret(Decoder::decode_utf_8, outcome)
}
474

475
///|
/// Decodes a single UTF-8 sequence of `length` bytes (1 to 4) starting at
/// `bytes[offset]`.
///
/// Returns `Uchar` with the decoded character when every trailing byte has
/// the `0b10xxxxxx` form and the second byte lies in the range permitted for
/// the lead bytes 0xE0, 0xED, 0xF0 and 0xF4; otherwise returns the result of
/// `malformed(bytes, offset, length)`. Any other `length` panics.
fn r_utf_8(bytes : FixedArray[Byte], offset : Int, length : Int) -> Decode {
  // Byte `k` of the sequence, widened to Int.
  fn at(k : Int) -> Int {
    bytes[offset + k].to_int()
  }

  // True for bytes of the form 0b10xxxxxx.
  fn is_cont(b : Int) -> Bool {
    b >> 6 == 0b10
  }

  // Wrap `c` as a character when the sequence validated, else report it.
  fn emit(valid : Bool, c : Int) -> Decode {
    if valid {
      Uchar(Int::unsafe_to_char(c))
    } else {
      malformed(bytes, offset, length)
    }
  }

  match length {
    1 => emit(true, at(0))
    2 => {
      let b0 = at(0)
      let b1 = at(1)
      emit(is_cont(b1), ((b0 & 0x1F) << 6) | (b1 & 0x3F))
    }
    3 => {
      let b0 = at(0)
      let b1 = at(1)
      let b2 = at(2)
      let c = ((b0 & 0x0F) << 12) | (((b1 & 0x3F) << 6) | (b2 & 0x3F))
      // The second byte has a narrower range after 0xE0 and 0xED.
      let b1_ok = match b0 {
        0xE0 => 0xA0 <= b1 && b1 <= 0xBF
        0xED => 0x80 <= b1 && b1 <= 0x9F
        _ => is_cont(b1)
      }
      emit(is_cont(b2) && b1_ok, c)
    }
    4 => {
      let b0 = at(0)
      let b1 = at(1)
      let b2 = at(2)
      let b3 = at(3)
      let c = ((b0 & 0x07) << 18) |
        ((b1 & 0x3F) << 12) |
        ((b2 & 0x3F) << 6) |
        (b3 & 0x3F)
      // The second byte has a narrower range after 0xF0 and 0xF4.
      let b1_ok = match b0 {
        0xF0 => 0x90 <= b1 && b1 <= 0xBF
        0xF4 => 0x80 <= b1 && b1 <= 0x8F
        _ => is_cont(b1)
      }
      emit(is_cont(b3) && is_cont(b2) && b1_ok, c)
    }
    _ => panic()
  }
}
559

560
// UTF16LE
561

562
///|
/// Result of reading one 16-bit code unit with `r_utf_16`.
priv enum UTF16Decode {
  /// A unit in 0xD800..=0xDBFF; a second unit must be read to complete it.
  Hi(Int)
  /// A unit in 0xDC00..=0xDFFF read where a first unit was expected; carries
  /// the two offending bytes.
  UTF16Malformed(Bytes)
  /// A unit outside 0xD800..=0xDFFF, already converted to a character.
  UTF16Uchar(Char)
}
568

569
///|
/// UTF-16 little-endian decoding step.
///
/// Returns `End` when no input remains. With a single byte left, the code
/// unit is gathered through the scratch buffer via `t_fill`. Otherwise two
/// bytes are consumed, read as a code unit (second byte is the high-order
/// one) and passed on to `decode_utf_16le_lo`.
fn decode_utf_16le(self : Decoder) -> Decode {
  let rem = self.i_rem()
  guard rem > 0 else { return Decode::End }
  guard rem >= 2 else {
    self.t_need(2)
    return t_fill(Decoder::t_decode_utf_16le, self)
  }
  let low_at = self.i_pos
  self.i_pos += 2
  let unit = r_utf_16(self.i, low_at + 1, low_at)
  self.decode_utf_16le_lo(unit)
}
583

584
///|
/// Continuation run after `t_fill` has gathered a little-endian code unit in
/// `t`.
///
/// If fewer than `t_need` bytes were collected, the bytes in `t` are reported
/// as malformed and decoding resumes with `decode_utf_16le`; otherwise the
/// unit is read from `t` and passed on to `decode_utf_16le_lo`.
fn t_decode_utf_16le(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    let unit = r_utf_16(self.t, 1, 0)
    self.decode_utf_16le_lo(unit)
  } else {
    self.ret(Decoder::decode_utf_16le, malformed(self.t, 0, self.t_len))
  }
}
592

593
///|
/// Finishes a little-endian decoding step given the first code unit `v`.
///
/// A complete character or a malformed unit is returned as is. For a `Hi`
/// unit the second unit is read: through `t_fill` when fewer than two bytes
/// remain, directly from the input otherwise. The two bytes of the second
/// unit are only consumed when `r_utf_16_lo` yields a character, so a
/// rejected second unit is decoded again on the next step.
fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
  match v {
    Hi(hi) =>
      if self.i_rem() < 2 {
        self.t_need(2)
        t_fill(curry(t_decode_utf_16le_lo)(hi), self)
      } else {
        let low_at = self.i_pos
        let outcome = r_utf_16_lo(hi, self.i, low_at + 1, low_at)
        match outcome {
          Uchar(_) => self.i_pos += 2
          _ => ()
        }
        self.ret(Decoder::decode_utf_16le, outcome)
      }
    UTF16Malformed(bad) => self.ret(Decoder::decode_utf_16le, Malformed(bad))
    UTF16Uchar(ch) => self.ret(Decoder::decode_utf_16le, Uchar(ch))
  }
}
615

616
///|
/// Continuation run after `t_fill` has gathered the second little-endian code
/// unit in `t`, following the first unit `hi`.
///
/// If fewer than `t_need` bytes were collected, the result of
/// `malformed_pair` for `hi` and the bytes in `t` is returned; otherwise the
/// pair is combined by `r_utf_16_lo`. Decoding resumes with
/// `decode_utf_16le`.
fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
  let outcome = if decoder.t_len >= decoder.t_need {
    r_utf_16_lo(hi, decoder.t, 1, 0)
  } else {
    malformed_pair(false, hi, decoder.t, 0, decoder.t_len)
  }
  decoder.ret(Decoder::decode_utf_16le, outcome)
}
627

628
///|
/// Reads the second code unit of a UTF-16 surrogate pair and combines it with
/// the first unit `hi`.
///
/// # Parameters
///
/// - `hi`: The first (high surrogate) code unit, as produced by `r_utf_16`.
/// - `bytes`: The buffer holding the second code unit.
/// - `offset0`: Index of the high-order byte of the second code unit.
/// - `offset1`: Index of the low-order byte of the second code unit.
///
/// # Returns
///
/// `Uchar` with the combined code point when the second unit lies in
/// 0xDC00..=0xDFFF, otherwise `Malformed` carrying the two bytes at
/// `offset0` and `offset1`.
fn r_utf_16_lo(
  hi : Int,
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> Decode {
  let b0 = bytes[offset0].to_int()
  let b1 = bytes[offset1].to_int()
  let lo = (b0 << 8) | b1
  if lo < 0xDC00 || lo > 0xDFFF {
    // NOTE(jinser): only hi malformed, skip lo if lo is illegal
    //
    // For example, b"\xD8\x00\x00\x48" (BE)
    // Since \xD8\x00 is *legal* hi, here will try to parse lo next,
    // however the whole \xD8\x00\x00\x48 is *illegal* so the result will be a `Malformed[b"\xD8\x00\x00\x48"]`
    //
    // But \x00\x48 itself is a *legal* UTF16 code point with a value of `H`,
    // the ideal result should be: `[Malformed(b"\xD8\x00"), Uchar('H')]`
    //
    // > printf '\xD8\x00\x00\x48' | uconv --from-code UTF16BE --to-code UTF8 --from-callback substitute
    // �H
    Malformed([bytes[offset0], bytes[offset1]])
  } else {
    // Join the two 10-bit payloads first and only then add the 0x10000
    // offset. OR-ing 0x10000 in before the join drops the carry whenever
    // `(hi & 0x3FF) << 10` already has bit 16 set, which corrupted every
    // code point from U+20000 upwards (e.g. D840 DC00 became U+10000
    // instead of U+20000).
    let payload = ((hi & 0x3FF) << 10) | (lo & 0x3FF)
    Uchar(Int::unsafe_to_char(payload + 0x10000))
  }
}
655

656
///|
/// Reads one 16-bit code unit from `bytes` and classifies it.
///
/// `offset0` is the index of the high-order byte and `offset1` the index of
/// the low-order byte, so the same routine serves both byte orders.
///
/// - 0xD800..=0xDBFF yields `Hi` with the unit value.
/// - 0xDC00..=0xDFFF yields `UTF16Malformed` with the two bytes starting at
///   the smaller of the two offsets.
/// - Any other value yields `UTF16Uchar`.
fn r_utf_16(
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> UTF16Decode {
  let high_byte = bytes[offset0].to_int()
  let low_byte = bytes[offset1].to_int()
  let unit = (high_byte << 8) | low_byte
  if 0xD800 <= unit && unit <= 0xDBFF {
    Hi(unit)
  } else if 0xDC00 <= unit && unit <= 0xDFFF {
    UTF16Malformed(slice(bytes, @math.minimum(offset0, offset1), 2))
  } else {
    UTF16Uchar(Int::unsafe_to_char(unit))
  }
}
673

674
// UTF16BE
675

676
///|
/// UTF-16 big-endian decoding step.
///
/// Returns `End` when no input remains. With a single byte left, the code
/// unit is gathered through the scratch buffer via `t_fill`. Otherwise two
/// bytes are consumed, read as a code unit (first byte is the high-order
/// one) and passed on to `decode_utf_16be_lo`.
fn decode_utf_16be(self : Decoder) -> Decode {
  let rem = self.i_rem()
  guard rem > 0 else { return Decode::End }
  guard rem >= 2 else {
    self.t_need(2)
    return t_fill(Decoder::t_decode_utf_16be, self)
  }
  let high_at = self.i_pos
  self.i_pos += 2
  let unit = r_utf_16(self.i, high_at, high_at + 1)
  self.decode_utf_16be_lo(unit)
}
690

691
///|
/// Continuation run after `t_fill` has gathered a big-endian code unit in
/// `t`.
///
/// If fewer than `t_need` bytes were collected, the bytes in `t` are reported
/// as malformed and decoding resumes with `decode_utf_16be`; otherwise the
/// unit is read from `t` and passed on to `decode_utf_16be_lo`.
fn t_decode_utf_16be(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    let unit = r_utf_16(self.t, 0, 1)
    self.decode_utf_16be_lo(unit)
  } else {
    self.ret(Decoder::decode_utf_16be, malformed(self.t, 0, self.t_len))
  }
}
699

700
///|
/// Finishes a big-endian decoding step given the first code unit `decode`.
///
/// A complete character or a malformed unit is returned as is. For a `Hi`
/// unit the second unit is read: through `t_fill` when fewer than two bytes
/// remain, directly from the input otherwise. The two bytes of the second
/// unit are only consumed when `r_utf_16_lo` yields a character, so a
/// rejected second unit is decoded again on the next step.
fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
  match decode {
    Hi(hi) =>
      if self.i_rem() < 2 {
        self.t_need(2)
        t_fill(curry(t_decode_utf_16be_lo)(hi), self)
      } else {
        let high_at = self.i_pos
        let outcome = r_utf_16_lo(hi, self.i, high_at, high_at + 1)
        match outcome {
          Uchar(_) => self.i_pos += 2
          _ => ()
        }
        self.ret(Decoder::decode_utf_16be, outcome)
      }
    UTF16Malformed(bad) => self.ret(Decoder::decode_utf_16be, Malformed(bad))
    UTF16Uchar(ch) => self.ret(Decoder::decode_utf_16be, Uchar(ch))
  }
}
722

723
///|
/// Turns a two-argument function into a chain of one-argument functions, so
/// that `curry(f)(x)(y)` evaluates `f(x, y)`.
///
/// Used in this file to bind the first argument of a continuation before
/// handing it to `t_fill`.
fn[T, U, V] curry(f : (T, U) -> V) -> (T) -> (U) -> V {
  fn(first : T) { fn(second : U) -> V { f(first, second) } }
}
727

728
///|
/// Continuation run after `t_fill` has gathered the second big-endian code
/// unit in `t`, following the first unit `hi`.
///
/// If fewer than `t_need` bytes were collected, the result of
/// `malformed_pair` for `hi` and the bytes in `t` is returned; otherwise the
/// pair is combined by `r_utf_16_lo`. Decoding resumes with
/// `decode_utf_16be`.
fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
  let outcome = if self.t_len >= self.t_need {
    r_utf_16_lo(hi, self.t, 0, 1)
  } else {
    malformed_pair(true, hi, self.t, 0, self.t_len)
  }
  self.ret(Decoder::decode_utf_16be, outcome)
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc