• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

moonbitlang / x / 364

20 Feb 2025 06:26AM UTC coverage: 89.563% (+1.5%) from 88.028%
364

Pull #107

github

web-flow
Merge c0ac962e5 into 31f412bf8
Pull Request #107: feat: streaming input decoding

48 of 61 new or added lines in 2 files covered. (78.69%)

2 existing lines in 1 file now uncovered.

1270 of 1418 relevant lines covered (89.56%)

436.01 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

77.34
/encoding/decoding.mbt
1
// Copyright 2024 International Digital Economy Academy
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14

15
///|
16
/// The Unicode replacement character (U+FFFD). `decode_lossy` pushes it in
/// place of a malformed sequence instead of raising an error.
pub const U_REP = '\u{FFFD}'
17

18
///|
/// Expected UTF-8 sequence length for every possible leading byte.
///
/// The table is indexed by the byte value (0x00-0xFF) and laid out with one
/// row per high nibble. An entry of `0` marks a byte that cannot start a
/// sequence; `decode_utf_8` reports such a byte as malformed.
let utf_8_len = [
  // 0x00-0x7F: single-byte sequences
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x80-0xBF: not a valid leading byte
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xC0-0xC1: rejected, 0xC2-0xDF: two-byte sequences
  0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  // 0xE0-0xEF: three-byte sequences
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  // 0xF0-0xF4: four-byte sequences, 0xF5-0xFF: rejected
  4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]
31

32
///|
/// Creates a fresh `Decoder` for `encoding`.
///
/// The decoder starts with an empty input buffer, a 4-byte scratch buffer
/// and the entry continuation matching `encoding`. `UTF16` is decoded as
/// little-endian, exactly like `UTF16LE`.
pub fn decoder(encoding : Encoding) -> Decoder {
  // Pick the decoding routine that handles the first code unit.
  let k = match encoding {
    UTF8 => decode_utf_8
    UTF16 | UTF16LE => decode_utf_16le
    UTF16BE => decode_utf_16be
  }
  {
    i: FixedArray::default(),
    i_pos: 0,
    t: FixedArray::make(4, Byte::default()),
    t_len: 0,
    t_need: 0,
    k,
  }
}
47

48
///|
/// Decodes `input` (appended to any bytes still pending in the decoder) into
/// a `String`.
///
/// - `stream`: when `true`, a trailing incomplete sequence is kept for a later
///   call and the characters decoded so far are returned; when `false`
///   (default) it raises `TruncatedError`.
///
/// Raises `MalformedError` on a malformed sequence, except in streaming mode
/// while `t_need` is positive, where the characters decoded so far are
/// returned instead.
pub fn decode!(self : Decoder, input : Bytes, stream~ : Bool = false) -> String {
  if input.length() > 0 {
    self.i_cont(input)
  }
  if self.i_rem() == 0 {
    return String::default()
  }

  // Pull results from the decoder until it stops producing characters.
  let decoded = []
  while true {
    match self.decode_() {
      Uchar(c) => decoded.push(c)
      End => break
      Refill(pending) =>
        if stream {
          break
        } else {
          raise TruncatedError(pending)
        }
      Malformed(bad) =>
        if stream && self.t_need > 0 {
          break
        } else {
          raise MalformedError(bad)
        }
    }
  }
  String::from_array(decoded)
}
79

80
///|
81
/// Streaming form of `decode`: forwards `input` with `stream=true`, so an
/// incomplete trailing sequence does not raise `TruncatedError` and the
/// characters decoded so far are returned.
pub fn decode_continue!(self : Decoder, input : Bytes) -> String {
82
  self.decode!(input, stream=true)
47✔
83
}
84

85
///|
86
/// Final step of a streaming decode: forwards `input` (empty by default) to
/// `decode` with `stream=false`, so an incomplete trailing sequence raises
/// `TruncatedError`.
pub fn decode_finish!(self : Decoder, input~ : Bytes = b"") -> String {
87
  self.decode!(input, stream=false)
6✔
88
}
89

90
///|
/// Decodes `input` (appended to any bytes still pending in the decoder) into
/// a `String` without raising: malformed sequences become `U_REP`.
///
/// - `stream`: when `true`, decoding stops and returns what was decoded so far
///   as soon as more input is requested, or on a malformed result while
///   `t_need` is positive. When `false` (default) a refill request is simply
///   followed by another decoding step.
pub fn decode_lossy(
  self : Decoder,
  input : Bytes,
  stream~ : Bool = false
) -> String {
  if input.length() > 0 {
    self.i_cont(input)
  }
  if self.i_rem() == 0 {
    return String::default()
  }

  // Pull results from the decoder until it stops producing characters.
  let decoded = []
  while true {
    match self.decode_() {
      Uchar(c) => decoded.push(c)
      End => break
      Malformed(_) => {
        if stream && self.t_need > 0 {
          break
        }
        decoded.push(U_REP)
      }
      Refill(_) =>
        if stream {
          break
        }
    }
  }
  String::from_array(decoded)
}
126

127
///|
128
/// Streaming form of `decode_lossy`: forwards `input` with `stream=true`.
pub fn decode_lossy_continue(self : Decoder, input : Bytes) -> String {
NEW
129
  self.decode_lossy(input, stream=true)
×
130
}
131

132
///|
133
/// Final step of a lossy streaming decode: forwards `input` (empty by
/// default) to `decode_lossy` with `stream=false`.
pub fn decode_lossy_finish(self : Decoder, input~ : Bytes = b"") -> String {
NEW
134
  self.decode_lossy(input, stream=false)
×
135
}
136

137
///|
/// Replaces the input buffer `i` with the not-yet-consumed tail of the old
/// buffer followed by `input`, and rewinds `i_pos` to the start.
fn i_cont(self : Decoder, input : Bytes) -> Unit {
  // A negative remainder is treated as "nothing left to keep".
  let leftover = self.i_rem()
  let kept = if leftover > 0 { leftover } else { 0 }
  let added = input.length()
  let merged = FixedArray::make(kept + added, Byte::default())
  if kept > 0 {
    // Carry the unread bytes over to the front of the new buffer.
    self.i.blit_to(merged, len=kept, src_offset=self.i_pos)
  }
  // Append the new input right after the carried-over bytes.
  merged.blit_from_bytes(kept, input, 0, added)
  self.i = merged
  self.i_pos = 0
}
154

155
// Implementations
156

157
///|
158
/// Runs one decoding step by invoking the decoder's current continuation `k`.
fn decode_(self : Decoder) -> Decode {
159
  (self.k)(self)
371✔
160
}
161

162
///|
163
/// Stores `k` as the continuation for the next decoding step and returns `v`
/// unchanged.
fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
164
  self.k = k
323✔
165
  v
166
}
167

168
///|
169
/// Number of input bytes not yet consumed: the length of `i` minus `i_pos`.
/// Callers in this file also handle a negative result (see `t_fill`).
fn i_rem(self : Decoder) -> Int {
170
  self.i.length() - self.i_pos
569✔
171
}
172

173
///|
174
/// Prepares the scratch buffer for a new sequence: resets the fill level
/// `t_len` to 0 and records that `need` bytes are required.
fn t_need(self : Decoder, need : Int) -> Unit {
175
  self.t_len = 0
27✔
176
  self.t_need = need
177
}
178

179
///|
180
/// Marks the end of the current input by replacing `i` with a default
/// (empty) array. `i_pos` is left as it is.
fn eoi(self : Decoder) -> Unit {
181
  self.i = FixedArray::default()
27✔
182
}
183

184
///|
185
/// Signals that more input is needed: drops the current input via `eoi`,
/// installs `k` as the next continuation and returns `Refill` carrying a
/// `Bytes` built from the scratch buffer `t`.
fn refill(self : Decoder, k : Cont) -> Decode {
186
  self.eoi()
27✔
187
  // NOTE(review): passes all of `t`, not just the first `t_len` bytes — confirm intended.
  self.ret(k, Decode::Refill(Bytes::from_fixedarray(self.t)))
188
}
189

190
///|
/// Moves bytes from the input buffer `i` into the scratch buffer `t` until
/// `t_need` bytes are collected, then hands control to `k`.
///
/// If the input holds fewer bytes than are still missing, everything
/// available is copied and a refill is requested, with `t_fill` itself
/// (bound to `k`) as the continuation. A negative remainder means there is
/// no input left, so `k` is called with whatever `t` currently holds.
fn t_fill(k : Cont, decoder : Decoder) -> Decode {
  let available = decoder.i_rem()
  if available < 0 {
    // End of input: let `k` deal with the partially filled scratch buffer.
    return k(decoder)
  }
  let missing = decoder.t_need - decoder.t_len
  let short = available < missing
  let count = if short { available } else { missing }
  // Copy `count` bytes and advance both cursors.
  decoder.i.blit_to(
    decoder.t,
    len=count,
    dst_offset=decoder.t_len,
    src_offset=decoder.i_pos,
  )
  decoder.i_pos += count
  decoder.t_len += count
  if short {
    decoder.refill(@tuple.curry(t_fill)(k))
  } else {
    k(decoder)
  }
}
217

218
// UTF8
219

220
///|
/// Decodes the next UTF-8 sequence from the input buffer.
///
/// Returns `End` when no input remains. When the sequence announced by the
/// leading byte is not fully available, the bytes are gathered through the
/// scratch buffer (`t_fill`). A leading byte whose `utf_8_len` entry is `0`
/// is consumed on its own and reported as malformed.
fn decode_utf_8(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  let start = self.i_pos
  let width = utf_8_len[self.i[start].to_int()]
  if available < width {
    // The sequence straddles the end of the input: assemble it in `t`.
    self.t_need(width)
    return t_fill(t_decode_utf_8, self)
  }
  if width == 0 {
    self.i_pos = start + 1
    self.ret(decode_utf_8, malformed(self.i, start, 1))
  } else {
    self.i_pos = start + width
    self.ret(decode_utf_8, r_utf_8(self.i, start, width))
  }
}
243

244
///|
/// Continuation run after `t_fill` for UTF-8: decodes the sequence held in
/// the scratch buffer, or reports it as malformed when fewer than `t_need`
/// bytes were collected. Decoding then resumes with `decode_utf_8`.
fn t_decode_utf_8(self : Decoder) -> Decode {
  let filled = self.t_len
  let outcome = if filled < self.t_need {
    malformed(self.t, 0, filled)
  } else {
    r_utf_8(self.t, 0, filled)
  }
  self.ret(decode_utf_8, outcome)
}
252

253
///|
/// Decodes one UTF-8 sequence of `length` bytes (1 to 4) starting at
/// `offset` in `bytes`.
///
/// Returns `Uchar` for a valid sequence. Otherwise returns the result of
/// `malformed(bytes, offset, length)`: when a trailing byte is not of the form
/// `10xxxxxx`, or when the second byte is outside the range allowed for the
/// leading byte (`E0`: A0-BF, `ED`: 80-9F, `F0`: 90-BF, `F4`: 80-8F).
/// Panics for any other `length`.
fn r_utf_8(bytes : FixedArray[Byte], offset : Int, length : Int) -> Decode {
  fn scalar(code : Int) {
    Uchar(Char::from_int(code))
  }

  // A continuation byte has its two top bits equal to `10`.
  fn is_tail(b : Int) -> Bool {
    (b >> 6) == 0b10
  }

  match length {
    1 => scalar(bytes[offset].to_int())
    2 => {
      let lead = bytes[offset].to_int()
      let t1 = bytes[offset + 1].to_int()
      if is_tail(t1) {
        scalar(((lead & 0x1F) << 6) | (t1 & 0x3F))
      } else {
        malformed(bytes, offset, length)
      }
    }
    3 => {
      let lead = bytes[offset].to_int()
      let t1 = bytes[offset + 1].to_int()
      let t2 = bytes[offset + 2].to_int()
      // Allowed range of the second byte depends on the leading byte.
      let (min1, max1) = match lead {
        0xE0 => (0xA0, 0xBF)
        0xED => (0x80, 0x9F)
        _ => (0x80, 0xBF)
      }
      if is_tail(t2) && min1 <= t1 && t1 <= max1 {
        scalar(((lead & 0x0F) << 12) | ((t1 & 0x3F) << 6) | (t2 & 0x3F))
      } else {
        malformed(bytes, offset, length)
      }
    }
    4 => {
      let lead = bytes[offset].to_int()
      let t1 = bytes[offset + 1].to_int()
      let t2 = bytes[offset + 2].to_int()
      let t3 = bytes[offset + 3].to_int()
      // Allowed range of the second byte depends on the leading byte.
      let (min1, max1) = match lead {
        0xF0 => (0x90, 0xBF)
        0xF4 => (0x80, 0x8F)
        _ => (0x80, 0xBF)
      }
      if is_tail(t3) && is_tail(t2) && min1 <= t1 && t1 <= max1 {
        scalar(
          ((lead & 0x07) << 18) |
          ((t1 & 0x3F) << 12) |
          ((t2 & 0x3F) << 6) |
          (t3 & 0x3F),
        )
      } else {
        malformed(bytes, offset, length)
      }
    }
    _ => panic()
  }
}
337

338
// UTF16LE
339

340
///|
341
/// Result of reading a single 16-bit code unit (see `r_utf_16`).
priv enum UTF16Decode {
342
  // A code unit in 0xD800-0xDBFF; a second unit must follow to complete it.
  Hi(Int)
343
  // A code unit in 0xDC00-0xDFFF where a first unit was expected; carries its bytes.
  UTF16Malformed(Bytes)
344
  // A code unit outside 0xD800-0xDFFF, already converted to a character.
  UTF16Uchar(Char)
345
}
346

347
///|
/// Decodes the next UTF-16LE code unit from the input buffer.
///
/// Returns `End` when no input remains. With a single byte left, the unit is
/// gathered through the scratch buffer (`t_fill`). Otherwise the two bytes
/// are consumed and interpreted by `decode_utf_16le_lo`.
fn decode_utf_16le(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  if available < 2 {
    self.t_need(2)
    return t_fill(t_decode_utf_16le, self)
  }
  let start = self.i_pos
  self.i_pos = start + 2
  // Little-endian: the most significant byte is the second one.
  self.decode_utf_16le_lo(r_utf_16(self.i, start + 1, start))
}
361

362
///|
/// Continuation run after `t_fill` for a UTF-16LE code unit: interprets the
/// unit held in the scratch buffer, or reports the collected bytes as
/// malformed when fewer than `t_need` bytes were gathered.
fn t_decode_utf_16le(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    return self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0))
  }
  self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len))
}
370

371
///|
/// Acts on the result `v` of reading a UTF-16LE code unit.
///
/// A character or a malformed unit is returned directly. For `Hi`, the
/// following code unit is read from the input, or gathered through the
/// scratch buffer when fewer than two bytes remain, and combined with it by
/// `r_utf_16_lo`.
fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
  match v {
    Hi(lead) =>
      if self.i_rem() >= 2 {
        let start = self.i_pos
        self.i_pos = start + 2
        self.ret(decode_utf_16le, r_utf_16_lo(lead, self.i, start + 1, start))
      } else {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16le_lo)(lead), self)
      }
    UTF16Malformed(raw) => self.ret(decode_utf_16le, Malformed(raw))
    UTF16Uchar(c) => self.ret(decode_utf_16le, Uchar(c))
  }
}
389

390
///|
/// Continuation run after `t_fill` for the second code unit of a UTF-16LE
/// pair whose first unit is `hi`: combines the pair when the scratch buffer
/// is complete, otherwise reports `hi` plus the collected bytes through
/// `malformed_pair`. Decoding then resumes with `decode_utf_16le`.
fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
  let outcome = if decoder.t_len >= decoder.t_need {
    r_utf_16_lo(hi, decoder.t, 1, 0)
  } else {
    malformed_pair(false, hi, decoder.t, 0, decoder.t_len)
  }
  decoder.ret(decode_utf_16le, outcome)
}
401

402
///|
/// Reads the second code unit of a UTF-16 surrogate pair and combines it with
/// the first unit `hi`.
///
/// - `hi`: the first (high surrogate) code unit, already read by the caller.
/// - `bytes`: buffer holding the second code unit.
/// - `offset0`: index of the most significant byte of that unit.
/// - `offset1`: index of the least significant byte of that unit.
///
/// Returns `Uchar` with the combined code point when the second unit lies in
/// 0xDC00-0xDFFF; otherwise returns the result of `malformed_pair` for `hi`
/// and the two bytes.
fn r_utf_16_lo(
  hi : Int,
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> Decode {
  let b0 = bytes[offset0].to_int()
  let b1 = bytes[offset1].to_int()
  let lo = (b0 << 8) | b1
  if lo < 0xDC00 || lo > 0xDFFF {
    // TODO(jinser): try to skip lo and then parse the following
    //
    // For example, b"\xD8\x00\x00\x48" (BE)
    // Since \xD8\x00 is *legal* hi, here will try to parse lo next,
    // however the whole \xD8\x00\x00\x48 is *illegal* so the result will be a `Malformed[b"\xD8\x00\x00\x48"]`
    //
    // But \x00\x48 itself is a *legal* UTF16 code point with a value of `H`,
    // the ideal result should be: `[Malformed(b"\xD8\x00"), Uchar('H')]`
    //
    // > printf '\xD8\x00\x00\x48' | uconv --from-code UTF16BE --to-code UTF8 --from-callback substitute
    // �H
    malformed_pair(
      // The most significant byte comes first exactly in the big-endian layout.
      offset0 < offset1,
      hi,
      bytes,
      @math.minimum(offset0, offset1),
      2,
    )
  } else {
    // Join the two 10-bit payloads first, then add the 0x10000 offset.
    // OR-ing the offset in instead loses it whenever bit 16 of the joined
    // payload is already set (e.g. D840 DC00 must give U+20000, not U+10000).
    Uchar(Char::from_int((((hi & 0x3FF) << 10) | (lo & 0x3FF)) + 0x10000))
  }
}
435

436
///|
/// Reads one 16-bit code unit from `bytes` and classifies it.
///
/// - `offset0`: index of the most significant byte.
/// - `offset1`: index of the least significant byte.
///
/// Returns `Hi` for 0xD800-0xDBFF, `UTF16Malformed` (carrying the two bytes)
/// for 0xDC00-0xDFFF, and `UTF16Uchar` for every other value.
fn r_utf_16(
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> UTF16Decode {
  let unit = (bytes[offset0].to_int() << 8) | bytes[offset1].to_int()
  if unit >= 0xD800 && unit <= 0xDBFF {
    Hi(unit)
  } else if unit >= 0xDC00 && unit <= 0xDFFF {
    UTF16Malformed(slice(bytes, @math.minimum(offset0, offset1), 2))
  } else {
    UTF16Uchar(Char::from_int(unit))
  }
}
453

454
// UTF16BE
455

456
///|
/// Decodes the next UTF-16BE code unit from the input buffer.
///
/// Returns `End` when no input remains. With a single byte left, the unit is
/// gathered through the scratch buffer (`t_fill`). Otherwise the two bytes
/// are consumed and interpreted by `decode_utf_16be_lo`.
fn decode_utf_16be(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  if available < 2 {
    self.t_need(2)
    return t_fill(t_decode_utf_16be, self)
  }
  let start = self.i_pos
  self.i_pos = start + 2
  // Big-endian: the most significant byte is the first one.
  self.decode_utf_16be_lo(r_utf_16(self.i, start, start + 1))
}
470

471
///|
/// Continuation run after `t_fill` for a UTF-16BE code unit: interprets the
/// unit held in the scratch buffer, or reports the collected bytes as
/// malformed when fewer than `t_need` bytes were gathered.
fn t_decode_utf_16be(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    return self.decode_utf_16be_lo(r_utf_16(self.t, 0, 1))
  }
  self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len))
}
479

480
///|
/// Acts on the result `decode` of reading a UTF-16BE code unit.
///
/// A character or a malformed unit is returned directly. For `Hi`, the
/// following code unit is read from the input, or gathered through the
/// scratch buffer when fewer than two bytes remain, and combined with it by
/// `r_utf_16_lo`.
fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
  match decode {
    Hi(lead) =>
      if self.i_rem() >= 2 {
        let start = self.i_pos
        self.i_pos = start + 2
        self.ret(decode_utf_16be, r_utf_16_lo(lead, self.i, start, start + 1))
      } else {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16be_lo)(lead), self)
      }
    UTF16Malformed(raw) => self.ret(decode_utf_16be, Malformed(raw))
    UTF16Uchar(c) => self.ret(decode_utf_16be, Uchar(c))
  }
}
498

499
///|
/// Continuation run after `t_fill` for the second code unit of a UTF-16BE
/// pair whose first unit is `hi`: combines the pair when the scratch buffer
/// is complete, otherwise reports `hi` plus the collected bytes through
/// `malformed_pair`. Decoding then resumes with `decode_utf_16be`.
fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
  let outcome = if self.t_len >= self.t_need {
    r_utf_16_lo(hi, self.t, 0, 1)
  } else {
    malformed_pair(true, hi, self.t, 0, self.t_len)
  }
  self.ret(decode_utf_16be, outcome)
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc