• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

moonbitlang / x / 372

25 Feb 2025 06:09AM UTC coverage: 89.592% (+1.6%) from 88.028%
372

Pull #107

github

web-flow
Merge f2397b1e0 into 71e08287c
Pull Request #107: feat: streaming input decoding

53 of 66 new or added lines in 2 files covered. (80.3%)

16 existing lines in 1 file now uncovered.

1274 of 1422 relevant lines covered (89.59%)

434.77 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

78.03
/encoding/decoding.mbt
1
// Copyright 2024 International Digital Economy Academy
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14

15
///|
/// U+FFFD, the character `decode_lossy` pushes in place of each malformed
/// sequence it encounters.
16
pub const U_REP = '\u{FFFD}'
17

18
///|
/// Lookup table with 256 entries, indexed by a byte value.
///
/// `decode_utf_8` reads the entry for the first byte of a sequence as the
/// number of bytes that sequence occupies, and treats an entry of 0 as a
/// malformed lead byte.
///
/// Layout: 0x00-0x7F -> 1, 0x80-0xC1 -> 0, 0xC2-0xDF -> 2, 0xE0-0xEF -> 3,
/// 0xF0-0xF4 -> 4, 0xF5-0xFF -> 0.
19
let utf_8_len = [
20
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
21
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
22
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
23
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
24
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
25
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
26
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27
  0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
28
  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
29
  4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
30
]
31

32
///|
/// Creates a decoder for `encoding`.
///
/// The returned `Decoder` starts with a default (`FixedArray::default()`)
/// input buffer `i`, a 4-element scratch buffer `t`, every position and
/// counter at zero, and `k` set to the decoding routine for `encoding`.
/// `UTF16` and `UTF16LE` both select `decode_utf_16le`.
pub fn decoder(encoding : Encoding) -> Decoder {
  let routine = match encoding {
    UTF8 => decode_utf_8
    UTF16 | UTF16LE => decode_utf_16le
    UTF16BE => decode_utf_16be
  }
  {
    i: FixedArray::default(),
    i_pos: 0,
    t: FixedArray::make(4, Byte::default()),
    t_len: 0,
    t_need: 0,
    k: routine,
  }
}
47

48
///|
/// Decodes `input` and returns the characters decoded by this call.
///
/// Non-empty `input` is first appended to the not-yet-decoded remainder via
/// `i_cont`. If no unread bytes remain afterwards, an empty string is
/// returned without running the decoder.
///
/// - `stream=false` (default): raises `MalformedError` on a malformed
///   sequence and `TruncatedError` when the decoder asks for more input.
/// - `stream=true`: a request for more input ends the call and returns what
///   has been decoded so far.
49
pub fn decode!(self : Decoder, input : Bytes, stream~ : Bool = false) -> String {
50
  if input.length() > 0 {
73✔
51
    self.i_cont(input)
70✔
52
  }
53
  if self.i_rem() == 0 {
54
    return String::default()
3✔
55
  }
56

57
  // drive decoder to decode
58
  let chars = []
59
  loop self.decode_() {
60
    Uchar(u) => {
278✔
61
      chars.push(u)
62
      continue self.decode_()
63
    }
64
    Malformed(bs) =>
65
      // Streaming with a nonzero `t_need`: return what was decoded instead of
      // raising.
      // NOTE(review): in the visible source `t_need` is only ever assigned by
      // `t_need(need)` and is not cleared afterwards, so this branch may also
      // swallow malformed input unrelated to a chunk boundary - confirm intent.
      if stream && self.t_need > 0 {
4✔
NEW
66
        String::from_array(chars)
×
67
      } else {
68
        raise MalformedError(bs)
4✔
69
      }
70
    End => String::from_array(chars)
42✔
71
    Refill(t) =>
72
      // The decoder needs more bytes: fine while streaming, an error otherwise.
      if stream {
24✔
73
        String::from_array(chars)
24✔
74
      } else {
NEW
UNCOV
75
        raise TruncatedError(t)
×
76
      }
77
  }
78
}
79

80
///|
/// Streaming variant of `decode`: forwards `input` with `stream=true`.
81
pub fn decode_continue!(self : Decoder, input : Bytes) -> String {
82
  self.decode!(input, stream=true)
47✔
83
}
84

85
///|
/// Final call of a streaming decode: forwards `input` (empty by default) to
/// `decode` with `stream=false`.
86
pub fn decode_finish!(self : Decoder, input~ : Bytes = b"") -> String {
87
  self.decode!(input, stream=false)
6✔
88
}
89

90
///|
/// Decodes `input` without raising: malformed sequences become `U_REP`.
///
/// Non-empty `input` is first appended to the not-yet-decoded remainder via
/// `i_cont`. If no unread bytes remain afterwards, an empty string is
/// returned without running the decoder.
///
/// With `stream=true`, a request for more input ends the call and returns
/// what has been decoded so far. With `stream=false` (default) the decoder
/// keeps being driven until it reports `End`.
91
pub fn decode_lossy(
92
  self : Decoder,
93
  input : Bytes,
94
  stream~ : Bool = false
95
) -> String {
96
  if input.length() > 0 {
6✔
97
    self.i_cont(input)
6✔
98
  }
99
  if self.i_rem() == 0 {
NEW
100
    return String::default()
×
101
  }
102

103
  // drive decoder to decode
104
  let chars = []
105
  loop self.decode_() {
106
    Uchar(u) => {
6✔
107
      chars.push(u)
108
      continue self.decode_()
109
    }
110
    Malformed(_) =>
111
      // Streaming with a nonzero `t_need`: stop and return what was decoded.
      // NOTE(review): same condition as in `decode`; `t_need` is not cleared
      // in the visible source once set - confirm this early return is intended.
      if stream && self.t_need > 0 {
9✔
NEW
112
        String::from_array(chars)
×
113
      } else {
114
        chars.push(U_REP)
9✔
115
        continue self.decode_()
116
      }
117
    End => String::from_array(chars)
6✔
118
    Refill(_) =>
119
      if stream {
3✔
NEW
120
        String::from_array(chars)
×
121
      } else {
122
        // not streaming: keep driving the decoder instead of waiting for input
        continue self.decode_()
3✔
123
      }
124
  }
125
}
126

127
///|
/// Streaming variant of `decode_lossy`: forwards `input` with `stream=true`.
128
pub fn decode_lossy_continue(self : Decoder, input : Bytes) -> String {
NEW
UNCOV
129
  self.decode_lossy(input, stream=true)
×
130
}
131

132
///|
/// Final call of a lossy streaming decode: forwards `input` (empty by
/// default) to `decode_lossy` with `stream=false`.
133
pub fn decode_lossy_finish(self : Decoder, input~ : Bytes = b"") -> String {
NEW
134
  self.decode_lossy(input, stream=false)
×
135
}
136

137
///|
/// Replaces the input buffer `i` with the unread tail of the old buffer
/// followed by all of `input`, and rewinds `i_pos` to 0.
///
/// A negative `i_rem()` is treated as "nothing left to carry over".
fn i_cont(self : Decoder, input : Bytes) -> Unit {
  let carried = @math.maximum(self.i_rem(), 0)
  let added = input.length()
  let merged = FixedArray::make(carried + added, Byte::default())
  if carried > 0 {
    // unread bytes of the old buffer go first
    self.i.blit_to(merged, len=carried, src_offset=self.i_pos)
  }
  // the new input follows directly after the carried-over bytes
  merged.blit_from_bytes(carried, input, 0, added)
  self.i_pos = 0
  self.i = merged
}
154

155
// Implementations
156

157
///|
/// Runs one decoding step by invoking the decoder's current continuation `k`.
158
fn decode_(self : Decoder) -> Decode {
159
  (self.k)(self)
372✔
160
}
161

162
///|
/// Stores `k` as the continuation for the next step and returns `v` unchanged.
163
fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
164
  self.k = k
324✔
165
  v
166
}
167

168
///|
/// Number of bytes of `i` not yet consumed: `i.length() - i_pos`.
///
/// The result can be negative when `i` has been replaced by `eoi` while
/// `i_pos` kept its value; `t_fill` treats `rem < 0` as end of input.
169
fn i_rem(self : Decoder) -> Int {
170
  self.i.length() - self.i_pos
570✔
171
}
172

173
///|
/// Prepares the scratch buffer for a new sequence: resets `t_len` to 0 and
/// records `need` as the number of bytes to collect.
174
fn t_need(self : Decoder, need : Int) -> Unit {
175
  self.t_len = 0
27✔
176
  self.t_need = need
177
}
178

179
///|
/// Discards the current input by replacing `i` with `FixedArray::default()`.
/// `i_pos` is left untouched.
180
fn eoi(self : Decoder) -> Unit {
181
  self.i = FixedArray::default()
27✔
182
}
183

184
///|
/// Asks the caller for more input: discards the current input via `eoi`,
/// installs `k` as the next continuation and returns `Decode::Refill`
/// carrying `Bytes` built from the scratch buffer `t`.
185
fn refill(self : Decoder, k : Cont) -> Decode {
186
  self.eoi()
27✔
187
  self.ret(k, Decode::Refill(Bytes::from_fixedarray(self.t)))
188
}
189

190
///|
/// Moves bytes from the input buffer `i` into the scratch buffer `t` until
/// `t` holds `t_need` bytes, then hands control to `k`.
///
/// - If `i_rem()` is negative (end of input), `k` runs immediately with
///   whatever `t` already holds.
/// - If the input cannot supply all missing bytes, everything available is
///   copied and a refill is requested; the decoder resumes in `t_fill` with
///   the same `k`.
/// - Otherwise exactly the missing bytes are copied and `k` runs.
fn t_fill(k : Cont, decoder : Decoder) -> Decode {
  let available = decoder.i_rem()
  if available < 0 {
    // eoi
    return k(decoder)
  }
  let missing = decoder.t_need - decoder.t_len
  let short = available < missing
  let count = if short { available } else { missing }
  decoder.i.blit_to(
    decoder.t,
    len=count,
    dst_offset=decoder.t_len,
    src_offset=decoder.i_pos,
  )
  decoder.i_pos += count
  decoder.t_len += count
  if short {
    decoder.refill(@tuple.curry(t_fill)(k))
  } else {
    k(decoder)
  }
}
217

218
// UTF8
219

220
///|
/// Decodes the next UTF-8 sequence from the input buffer.
///
/// Returns `Decode::End` when no input is left. When the sequence announced
/// by the lead byte is longer than the remaining input, the bytes are
/// collected through the scratch buffer (`t_fill`). A lead byte whose
/// `utf_8_len` entry is 0 is consumed alone and reported as malformed.
fn decode_utf_8(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  let start = self.i_pos
  let need = utf_8_len[self.i[start].to_int()]
  if available < need {
    // the sequence straddles the end of the input
    self.t_need(need)
    return t_fill(t_decode_utf_8, self)
  }
  let outcome = if need == 0 {
    self.i_pos = start + 1
    malformed(self.i, start, 1)
  } else {
    self.i_pos = start + need
    r_utf_8(self.i, start, need)
  }
  self.ret(decode_utf_8, outcome)
}
243

244
///|
/// Continuation run after `t_fill` for a UTF-8 sequence held in the scratch
/// buffer `t`.
///
/// If fewer than `t_need` bytes were collected, the collected bytes are
/// reported as malformed; otherwise they are decoded with `r_utf_8`. Either
/// way the decoder resumes with `decode_utf_8`.
fn t_decode_utf_8(self : Decoder) -> Decode {
  let filled = self.t_len
  let outcome = if filled < self.t_need {
    malformed(self.t, 0, filled)
  } else {
    r_utf_8(self.t, 0, filled)
  }
  self.ret(decode_utf_8, outcome)
}
252

253
///|
/// Decodes one UTF-8 sequence of `length` bytes starting at `bytes[offset]`.
///
/// Returns `Uchar` for a sequence that passes the checks below and the result
/// of `malformed(bytes, offset, length)` otherwise. `length` must be 1 to 4;
/// any other value panics.
fn r_utf_8(bytes : FixedArray[Byte], offset : Int, length : Int) -> Decode {
  // true when `b` has the `10xxxxxx` continuation-byte shape
  fn is_cont(b : Int) -> Bool {
    (b >> 6) == 0b10
  }

  // true when `lo <= b <= hi`
  fn within(b : Int, lo : Int, hi : Int) -> Bool {
    lo <= b && b <= hi
  }

  fn scalar(c : Int) -> Decode {
    Uchar(Char::from_int(c))
  }

  match length {
    1 => scalar(bytes[offset].to_int())
    2 => {
      let lead = bytes[offset].to_int()
      let c1 = bytes[offset + 1].to_int()
      if is_cont(c1) {
        scalar(((lead & 0x1F) << 6) | (c1 & 0x3F))
      } else {
        malformed(bytes, offset, length)
      }
    }
    3 => {
      let lead = bytes[offset].to_int()
      let c1 = bytes[offset + 1].to_int()
      let c2 = bytes[offset + 2].to_int()
      // the second byte has a narrower valid range after 0xE0 and 0xED
      let c1_ok = match lead {
        0xE0 => within(c1, 0xA0, 0xBF)
        0xED => within(c1, 0x80, 0x9F)
        _ => is_cont(c1)
      }
      if is_cont(c2) && c1_ok {
        scalar(((lead & 0x0F) << 12) | (((c1 & 0x3F) << 6) | (c2 & 0x3F)))
      } else {
        malformed(bytes, offset, length)
      }
    }
    4 => {
      let lead = bytes[offset].to_int()
      let c1 = bytes[offset + 1].to_int()
      let c2 = bytes[offset + 2].to_int()
      let c3 = bytes[offset + 3].to_int()
      // the second byte has a narrower valid range after 0xF0 and 0xF4
      let c1_ok = match lead {
        0xF0 => within(c1, 0x90, 0xBF)
        0xF4 => within(c1, 0x80, 0x8F)
        _ => is_cont(c1)
      }
      if is_cont(c3) && is_cont(c2) && c1_ok {
        scalar(
          ((lead & 0x07) << 18) |
          ((c1 & 0x3F) << 12) |
          ((c2 & 0x3F) << 6) |
          (c3 & 0x3F),
        )
      } else {
        malformed(bytes, offset, length)
      }
    }
    _ => panic()
  }
}
337

338
// UTF16LE
339

340
///|
/// Result of reading a single 16-bit code unit (produced by `r_utf_16`).
341
priv enum UTF16Decode {
342
  // code unit in 0xD800-0xDBFF: first half of a surrogate pair, the second
  // half still has to be read
  Hi(Int)
343
  // code unit in 0xDC00-0xDFFF with no preceding high surrogate; carries the
  // offending bytes
  UTF16Malformed(Bytes)
344
  // any other code unit, already converted to a character
  UTF16Uchar(Char)
345
}
346

347
///|
/// Decodes the next UTF-16LE code unit from the input buffer.
///
/// Returns `Decode::End` when no input is left. With a single byte left, the
/// unit is collected through the scratch buffer (`t_fill`). Otherwise two
/// bytes are consumed and passed on to `decode_utf_16le_lo`.
fn decode_utf_16le(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  if available < 2 {
    self.t_need(2)
    return t_fill(t_decode_utf_16le, self)
  }
  let start = self.i_pos
  self.i_pos = start + 2
  // little endian: the high-order byte is the second one
  let unit = r_utf_16(self.i, start + 1, start)
  self.decode_utf_16le_lo(unit)
}
361

362
///|
/// Continuation run after `t_fill` for a UTF-16LE code unit held in the
/// scratch buffer `t`.
///
/// A complete unit is read with `r_utf_16` and passed to
/// `decode_utf_16le_lo`; an incomplete one is reported as malformed and the
/// decoder resumes with `decode_utf_16le`.
fn t_decode_utf_16le(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    let unit = r_utf_16(self.t, 1, 0)
    self.decode_utf_16le_lo(unit)
  } else {
    let bad = malformed(self.t, 0, self.t_len)
    self.ret(decode_utf_16le, bad)
  }
}
370

371
///|
/// Finishes decoding after one UTF-16LE code unit `v` has been read.
///
/// Plain characters and malformed units are returned as they are. For a high
/// surrogate the following unit is read as the low surrogate: directly from
/// the input when two bytes are available, through the scratch buffer
/// otherwise. The two bytes of the second unit are only consumed when the
/// pair decodes to a character.
fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
  match v {
    Hi(hi) =>
      if self.i_rem() >= 2 {
        let start = self.i_pos
        let outcome = r_utf_16_lo(hi, self.i, start + 1, start)
        match outcome {
          Uchar(_) => self.i_pos += 2
          _ => ()
        }
        self.ret(decode_utf_16le, outcome)
      } else {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self)
      }
    UTF16Malformed(raw) => self.ret(decode_utf_16le, Malformed(raw))
    UTF16Uchar(c) => self.ret(decode_utf_16le, Uchar(c))
  }
}
393

394
///|
/// Continuation run after `t_fill` for the low surrogate of a UTF-16LE pair
/// whose high surrogate is `hi`.
///
/// If fewer than `t_need` bytes were collected, the result of
/// `malformed_pair` is returned; otherwise the pair is combined with
/// `r_utf_16_lo`. The decoder resumes with `decode_utf_16le`.
fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
  let outcome = if decoder.t_len < decoder.t_need {
    malformed_pair(false, hi, decoder.t, 0, decoder.t_len)
  } else {
    r_utf_16_lo(hi, decoder.t, 1, 0)
  }
  decoder.ret(decode_utf_16le, outcome)
}
405

406
///|
/// Reads the second code unit of a surrogate pair and combines it with `hi`.
///
/// - `hi`: the high surrogate already read (0xD800-0xDBFF).
/// - `bytes[offset0]`: high-order byte of the second unit.
/// - `bytes[offset1]`: low-order byte of the second unit.
///
/// Returns `Uchar` with the supplementary-plane character when the second
/// unit is a low surrogate (0xDC00-0xDFFF). Otherwise returns `Malformed`
/// carrying the two bytes read here, in `offset0`, `offset1` order.
fn r_utf_16_lo(
  hi : Int,
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> Decode {
  let b0 = bytes[offset0].to_int()
  let b1 = bytes[offset1].to_int()
  let lo = (b0 << 8) | b1
  if lo < 0xDC00 || lo > 0xDFFF {
    // NOTE(jinser): only hi malformed, skip lo if lo is illegal
    //
    // For example, b"\xD8\x00\x00\x48" (BE)
    // Since \xD8\x00 is *legal* hi, here will try to parse lo next,
    // however the whole \xD8\x00\x00\x48 is *illegal* so the result will be a `Malformed[b"\xD8\x00\x00\x48"]`
    //
    // But \x00\x48 itself is a *legal* UTF16 code point with a value of `H`,
    // the ideal result should be: `[Malformed(b"\xD8\x00"), Uchar('H')]`
    //
    // > printf '\xD8\x00\x00\x48' | uconv --from-code UTF16BE --to-code UTF8 --from-callback substitute
    // �H
    Malformed([bytes[offset0], bytes[offset1]])
  } else {
    // Join the two 10-bit payloads first, then add the 0x10000 offset.
    // OR-ing 0x10000 into the low half instead loses the offset whenever
    // bit 6 of the high payload is set (e.g. D840 DC00 must give U+20000).
    let payload = ((hi & 0x3FF) << 10) | (lo & 0x3FF)
    Uchar(Char::from_int(payload + 0x10000))
  }
}
433

434
///|
/// Reads one 16-bit code unit and classifies it.
///
/// `bytes[offset0]` is the high-order byte and `bytes[offset1]` the low-order
/// byte. Returns `Hi` for a high surrogate (0xD800-0xDBFF), `UTF16Malformed`
/// with the two input bytes for a lone low surrogate (0xDC00-0xDFFF), and
/// `UTF16Uchar` for every other value.
fn r_utf_16(
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> UTF16Decode {
  let unit = (bytes[offset0].to_int() << 8) | bytes[offset1].to_int()
  if 0xD800 <= unit && unit <= 0xDBFF {
    Hi(unit)
  } else if 0xDC00 <= unit && unit <= 0xDFFF {
    UTF16Malformed(slice(bytes, @math.minimum(offset0, offset1), 2))
  } else {
    UTF16Uchar(Char::from_int(unit))
  }
}
451

452
// UTF16BE
453

454
///|
/// Decodes the next UTF-16BE code unit from the input buffer.
///
/// Returns `Decode::End` when no input is left. With a single byte left, the
/// unit is collected through the scratch buffer (`t_fill`). Otherwise two
/// bytes are consumed and passed on to `decode_utf_16be_lo`.
fn decode_utf_16be(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  if available < 2 {
    self.t_need(2)
    return t_fill(t_decode_utf_16be, self)
  }
  let start = self.i_pos
  self.i_pos = start + 2
  // big endian: the high-order byte comes first
  let unit = r_utf_16(self.i, start, start + 1)
  self.decode_utf_16be_lo(unit)
}
468

469
///|
/// Continuation run after `t_fill` for a UTF-16BE code unit held in the
/// scratch buffer `t`.
///
/// A complete unit is read with `r_utf_16` and passed to
/// `decode_utf_16be_lo`; an incomplete one is reported as malformed and the
/// decoder resumes with `decode_utf_16be`.
fn t_decode_utf_16be(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    let unit = r_utf_16(self.t, 0, 1)
    self.decode_utf_16be_lo(unit)
  } else {
    let bad = malformed(self.t, 0, self.t_len)
    self.ret(decode_utf_16be, bad)
  }
}
477

478
///|
/// Finishes decoding after one UTF-16BE code unit `decode` has been read.
///
/// Plain characters and malformed units are returned as they are. For a high
/// surrogate the following unit is read as the low surrogate: directly from
/// the input when two bytes are available, through the scratch buffer
/// otherwise. The two bytes of the second unit are only consumed when the
/// pair decodes to a character.
fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
  match decode {
    Hi(hi) =>
      if self.i_rem() >= 2 {
        let start = self.i_pos
        let outcome = r_utf_16_lo(hi, self.i, start, start + 1)
        match outcome {
          Uchar(_) => self.i_pos += 2
          _ => ()
        }
        self.ret(decode_utf_16be, outcome)
      } else {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16be_lo)(hi), self)
      }
    UTF16Malformed(raw) => self.ret(decode_utf_16be, Malformed(raw))
    UTF16Uchar(c) => self.ret(decode_utf_16be, Uchar(c))
  }
}
500

501
///|
/// Continuation run after `t_fill` for the low surrogate of a UTF-16BE pair
/// whose high surrogate is `hi`.
///
/// If fewer than `t_need` bytes were collected, the result of
/// `malformed_pair` is returned; otherwise the pair is combined with
/// `r_utf_16_lo`. The decoder resumes with `decode_utf_16be`.
fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
  let outcome = if self.t_len < self.t_need {
    malformed_pair(true, hi, self.t, 0, self.t_len)
  } else {
    r_utf_16_lo(hi, self.t, 0, 1)
  }
  self.ret(decode_utf_16be, outcome)
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc