• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

moonbitlang / x / 361

15 Feb 2025 10:25PM UTC coverage: 89.716% (+1.7%) from 88.028%
361

Pull #107

github

web-flow
Merge 6149b6661 into f97e25539
Pull Request #107: feat: streaming input decoding

43 of 53 new or added lines in 2 files covered. (81.13%)

18 existing lines in 1 file now uncovered.

1265 of 1410 relevant lines covered (89.72%)

438.33 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

78.33
/encoding/decoding.mbt
1
// Copyright 2024 International Digital Economy Academy
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14

15
///|
16
/// U+FFFD, the Unicode replacement character. `decode_lossy` pushes it in
/// place of each malformed sequence reported by the decoder.
pub const U_REP = '\u{FFFD}'
17

18
///|
/// Expected length in bytes of a UTF-8 sequence, indexed by the value of its
/// first byte (256 entries). An entry of 0 marks a byte that cannot start a
/// sequence; `decode_utf_8` reports such a byte as a 1-byte malformed input.
let utf_8_len = [
  // 0x00-0x7F: single-byte sequences
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x80-0xBF: continuation bytes, never a valid first byte
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xC0-0xC1 rejected, 0xC2-0xDF: two-byte sequences
  0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  // 0xE0-0xEF: three-byte sequences
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  // 0xF0-0xF4: four-byte sequences, 0xF5-0xFF rejected
  4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]
31

32
///|
/// Creates a fresh `Decoder` for `encoding`.
///
/// The decoder starts with an empty input buffer, a 4-byte scratch buffer
/// (used to assemble sequences split across inputs) and the decoding step
/// matching the requested encoding. `UTF16` uses the little-endian step.
pub fn decoder(encoding : Encoding) -> Decoder {
  // Pick the initial decoding step for the requested encoding.
  let step = match encoding {
    UTF8 => decode_utf_8
    UTF16 | UTF16LE => decode_utf_16le
    UTF16BE => decode_utf_16be
  }
  {
    i: FixedArray::default(),
    i_pos: 0,
    t: FixedArray::make(4, Byte::default()),
    t_len: 0,
    t_need: 0,
    k: step,
  }
}
47

48
///|
/// Appends `input` to the decoder's pending bytes and decodes as many
/// characters as possible, returning them as a `String`.
///
/// - On a malformed sequence: if `stream` is true and `self.t_need > 0`, the
///   characters decoded so far are returned; otherwise `MalformedError` is
///   raised with the offending bytes.
/// - When more input is required: if `stream` is true, the characters decoded
///   so far are returned (a later call can supply the rest); otherwise
///   `TruncatedError` is raised.
pub fn decode!(self : Decoder, input : Bytes, stream~ : Bool = true) -> String {
  self.i_cont(input)
  let decoded : Array[Char] = []
  // Drive the decoder until it reports the end, an error or a refill request.
  while true {
    match self.decode_() {
      Uchar(c) => decoded.push(c)
      Malformed(bytes) =>
        if stream && self.t_need > 0 {
          break
        } else {
          raise MalformedError(bytes)
        }
      End => break
      Refill(pending) =>
        if stream {
          break
        } else {
          raise TruncatedError(pending)
        }
    }
  }
  String::from_array(decoded)
}
74

75
///|
/// Like `decode`, but never raises: appends `input` to the pending bytes and
/// decodes as much as possible, substituting `U_REP` for malformed sequences.
///
/// - On a malformed sequence: if `stream` is true and `self.t_need > 0`,
///   decoding stops and the characters decoded so far are returned; otherwise
///   `U_REP` is appended and decoding continues.
/// - When more input is required: if `stream` is true, the characters decoded
///   so far are returned; otherwise the decoder is driven again so the
///   truncated tail is resolved by the next step.
pub fn decode_lossy(
  self : Decoder,
  input : Bytes,
  stream~ : Bool = true
) -> String {
  self.i_cont(input)
  let decoded : Array[Char] = []
  // Drive the decoder until it reports the end or we decide to stop early.
  while true {
    match self.decode_() {
      Uchar(c) => decoded.push(c)
      Malformed(_) =>
        if stream && self.t_need > 0 {
          break
        } else {
          decoded.push(U_REP)
        }
      End => break
      Refill(_) => if stream { break }
    }
  }
  String::from_array(decoded)
}
106

107
///|
/// Replaces the input buffer `i` with the not-yet-consumed tail of the old
/// buffer followed by `input`, and resets `i_pos` to 0.
fn i_cont(self : Decoder, input : Bytes) -> Unit {
  // `i_rem()` can be negative once the buffer has been dropped by `eoi`;
  // treat that as "nothing left over".
  let leftover = self.i_rem()
  let carry = if leftover > 0 { leftover } else { 0 }
  let incoming = input.length()
  let buffer = FixedArray::make(carry + incoming, Byte::default())
  if carry > 0 {
    // Keep the unread bytes of the old buffer at the front.
    self.i.blit_to(buffer, len=carry, src_offset=self.i_pos)
  }
  // Place the new input right after the carried-over bytes.
  buffer.blit_from_bytes(carry, input, 0, incoming)
  self.i = buffer
  self.i_pos = 0
}
124

125
// Implementations
126

127
///|
/// Runs one decoding step by invoking the continuation currently stored in
/// `self.k`.
fn decode_(self : Decoder) -> Decode {
  let step = self.k
  step(self)
}
131

132
///|
/// Installs `k` as the continuation for the next decoding step and hands
/// back `v` unchanged as the result of the current step.
fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
  let result = v
  self.k = k
  result
}
137

138
///|
/// Number of bytes of the input buffer that have not been consumed yet
/// (`i.length() - i_pos`). The result is negative when `i_pos` lies beyond
/// the current buffer, which happens after `eoi` swaps in an empty buffer.
fn i_rem(self : Decoder) -> Int {
  let total = self.i.length()
  total - self.i_pos
}
142

143
///|
/// Prepares the scratch buffer for a sequence of `need` bytes: records the
/// required length and marks the scratch buffer as empty.
fn t_need(self : Decoder, need : Int) -> Unit {
  self.t_need = need
  self.t_len = 0
}
148

149
///|
/// Drops the current input buffer by replacing it with an empty one.
/// `i_pos` is left untouched, so `i_rem()` becomes negative whenever at
/// least one byte had been consumed.
fn eoi(self : Decoder) -> Unit {
  let empty : FixedArray[Byte] = FixedArray::default()
  self.i = empty
}
153

154
///|
/// Signals that more input is needed: drops the exhausted input buffer,
/// installs `k` as the continuation and returns `Refill` carrying a copy of
/// the scratch buffer `t`.
fn refill(self : Decoder, k : Cont) -> Decode {
  self.eoi()
  let pending = Bytes::from_fixedarray(self.t)
  self.ret(k, Decode::Refill(pending))
}
159

160
///|
/// Moves bytes from the input buffer into the scratch buffer `t` until it
/// holds `t_need` bytes, then continues with `k`.
///
/// - If the input buffer was dropped (`i_rem() < 0`), `k` is called right
///   away so it can deal with the incomplete scratch buffer.
/// - If the input runs out first, everything available is copied and a
///   refill is requested, resuming `t_fill` with the same `k` afterwards.
fn t_fill(k : Cont, decoder : Decoder) -> Decode {
  let available = decoder.i_rem()
  if available < 0 {
    // end of input
    return k(decoder)
  }
  let missing = decoder.t_need - decoder.t_len
  let enough = available >= missing
  let count = if enough { missing } else { available }
  // Copy `count` bytes from the input to the end of the scratch buffer.
  decoder.i.blit_to(
    decoder.t,
    len=count,
    dst_offset=decoder.t_len,
    src_offset=decoder.i_pos,
  )
  decoder.i_pos += count
  decoder.t_len += count
  if enough {
    k(decoder)
  } else {
    decoder.refill(@tuple.curry(t_fill)(k))
  }
}
187

188
// UTF8
189

190
///|
/// Decoding step for UTF-8 operating directly on the input buffer.
///
/// Returns `End` when no input is left. Otherwise the first byte selects the
/// expected sequence length via `utf_8_len`:
/// - not enough bytes left: switch to the scratch buffer via `t_fill`;
/// - length 0 (invalid first byte): consume that byte and report it malformed;
/// - otherwise: consume the whole sequence and decode it with `r_utf_8`.
fn decode_utf_8(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  let start = self.i_pos
  let expected = utf_8_len[self.i[start].to_int()]
  if available < expected {
    // The sequence straddles the end of the input: assemble it in `t`.
    self.t_need(expected)
    return t_fill(t_decode_utf_8, self)
  }
  if expected == 0 {
    self.i_pos = start + 1
    return self.ret(decode_utf_8, malformed(self.i, start, 1))
  }
  self.i_pos = start + expected
  self.ret(decode_utf_8, r_utf_8(self.i, start, expected))
}
213

214
///|
/// Continuation run after `t_fill` for UTF-8: decodes the sequence held in
/// the scratch buffer, or reports it as malformed when fewer than `t_need`
/// bytes could be gathered. Decoding then resumes with `decode_utf_8`.
fn t_decode_utf_8(self : Decoder) -> Decode {
  let outcome = if self.t_len >= self.t_need {
    r_utf_8(self.t, 0, self.t_len)
  } else {
    malformed(self.t, 0, self.t_len)
  }
  self.ret(decode_utf_8, outcome)
}
222

223
///|
/// Decodes the UTF-8 sequence of `length` bytes (1 to 4) that starts at
/// `bytes[offset]`.
///
/// Returns `Uchar` with the decoded character, or the result of
/// `malformed(bytes, offset, length)` when a trailing byte is not a
/// continuation byte or the second byte is outside the range allowed for the
/// first byte (0xE0, 0xED, 0xF0 and 0xF4 have restricted ranges). Panics for
/// any other `length`.
fn r_utf_8(bytes : FixedArray[Byte], offset : Int, length : Int) -> Decode {
  // Byte `k` of the sequence as an Int.
  fn byte_at(k : Int) -> Int {
    bytes[offset + k].to_int()
  }

  // True for continuation bytes (0b10xxxxxx).
  fn is_cont(b : Int) -> Bool {
    (b >> 6) == 0b10
  }

  match length {
    1 => Uchar(Char::from_int(byte_at(0)))
    2 => {
      let lead = byte_at(0)
      let second = byte_at(1)
      if is_cont(second) {
        Uchar(Char::from_int(((lead & 0x1F) << 6) | (second & 0x3F)))
      } else {
        malformed(bytes, offset, length)
      }
    }
    3 => {
      let lead = byte_at(0)
      let second = byte_at(1)
      let third = byte_at(2)
      // The allowed range of the second byte depends on the first byte.
      let second_ok = match lead {
        0xE0 => 0xA0 <= second && second <= 0xBF
        0xED => 0x80 <= second && second <= 0x9F
        _ => is_cont(second)
      }
      if is_cont(third) && second_ok {
        Uchar(
          Char::from_int(
            ((lead & 0x0F) << 12) | (((second & 0x3F) << 6) | (third & 0x3F)),
          ),
        )
      } else {
        malformed(bytes, offset, length)
      }
    }
    4 => {
      let lead = byte_at(0)
      let second = byte_at(1)
      let third = byte_at(2)
      let fourth = byte_at(3)
      // The allowed range of the second byte depends on the first byte.
      let second_ok = match lead {
        0xF0 => 0x90 <= second && second <= 0xBF
        0xF4 => 0x80 <= second && second <= 0x8F
        _ => is_cont(second)
      }
      if is_cont(fourth) && is_cont(third) && second_ok {
        Uchar(
          Char::from_int(
            ((lead & 0x07) << 18) |
            ((second & 0x3F) << 12) |
            ((third & 0x3F) << 6) |
            (fourth & 0x3F),
          ),
        )
      } else {
        malformed(bytes, offset, length)
      }
    }
    _ => panic()
  }
}
307

308
// UTF16LE
309

310
///|
311
/// Result of reading a single 16-bit code unit (produced by `r_utf_16`):
/// - `Hi`: the unit is a high surrogate (0xD800..=0xDBFF); a low surrogate
///   must follow to form a character.
/// - `UTF16Malformed`: the unit is a low surrogate with no preceding high
///   surrogate; carries the offending bytes.
/// - `UTF16Uchar`: the unit is a complete character on its own.
priv enum UTF16Decode {
312
  Hi(Int)
313
  UTF16Malformed(Bytes)
314
  UTF16Uchar(Char)
315
}
316

317
///|
/// Decoding step for little-endian UTF-16 operating on the input buffer.
///
/// Returns `End` when no input is left. With a single byte left, the code
/// unit is assembled in the scratch buffer via `t_fill`. Otherwise two bytes
/// are consumed, read as a code unit (high-order byte second) and passed to
/// `decode_utf_16le_lo`.
fn decode_utf_16le(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  if available < 2 {
    self.t_need(2)
    return t_fill(t_decode_utf_16le, self)
  }
  let start = self.i_pos
  self.i_pos = start + 2
  self.decode_utf_16le_lo(r_utf_16(self.i, start + 1, start))
}
331

332
///|
/// Continuation run after `t_fill` for a little-endian code unit: reads the
/// unit from the scratch buffer, or reports the gathered bytes as malformed
/// when fewer than `t_need` bytes were available.
fn t_decode_utf_16le(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0))
  } else {
    self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len))
  }
}
340

341
///|
/// Finishes a little-endian UTF-16 step given the first code unit `v`.
///
/// Complete characters and malformed units are returned directly. For a high
/// surrogate the following code unit is read: from the input buffer when two
/// bytes are available, otherwise through the scratch buffer via `t_fill`.
/// In every case decoding resumes with `decode_utf_16le`.
fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
  match v {
    Hi(hi) =>
      if self.i_rem() >= 2 {
        let start = self.i_pos
        self.i_pos = start + 2
        self.ret(decode_utf_16le, r_utf_16_lo(hi, self.i, start + 1, start))
      } else {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self)
      }
    UTF16Malformed(raw) => self.ret(decode_utf_16le, Malformed(raw))
    UTF16Uchar(c) => self.ret(decode_utf_16le, Uchar(c))
  }
}
359

360
///|
/// Continuation run after `t_fill` for the low surrogate of a little-endian
/// pair whose high surrogate is `hi`: combines the pair from the scratch
/// buffer, or reports a malformed pair when fewer than `t_need` bytes were
/// available. Decoding resumes with `decode_utf_16le`.
fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
  let outcome = if decoder.t_len >= decoder.t_need {
    r_utf_16_lo(hi, decoder.t, 1, 0)
  } else {
    malformed_pair(false, hi, decoder.t, 0, decoder.t_len)
  }
  decoder.ret(decode_utf_16le, outcome)
}
371

372
///|
/// Combines the high surrogate `hi` with the code unit stored in `bytes` and
/// returns the resulting supplementary-plane character.
///
/// - `offset0` is the index of the unit's high-order byte, `offset1` the index
///   of its low-order byte (the caller picks them according to endianness).
/// - If the unit is not a low surrogate (0xDC00..=0xDFFF), the result of
///   `malformed_pair` is returned; its first argument is `offset0 < offset1`
///   (presumably a big-endian flag, callers elsewhere pass `true` for BE and
///   `false` for LE).
fn r_utf_16_lo(
  hi : Int,
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> Decode {
  let b0 = bytes[offset0].to_int()
  let b1 = bytes[offset1].to_int()
  let lo = (b0 << 8) | b1
  if lo < 0xDC00 || lo > 0xDFFF {
    malformed_pair(
      offset0 < offset1,
      hi,
      bytes,
      @math.minimum(offset0, offset1),
      2,
    )
  } else {
    // Merge the two 10-bit payloads first, then add the 0x10000 offset.
    // Adding the offset to the low half before OR-ing loses it whenever the
    // high half already has bit 16 set (every code point >= U+20000).
    let payload = ((hi & 0x3FF) << 10) | (lo & 0x3FF)
    Uchar(Char::from_int(payload + 0x10000))
  }
}
394

395
///|
/// Reads one 16-bit code unit from `bytes` and classifies it.
///
/// `offset0` is the index of the high-order byte and `offset1` the index of
/// the low-order byte. High surrogates yield `Hi`, lone low surrogates yield
/// `UTF16Malformed` with the two bytes starting at the smaller offset, and
/// every other value yields `UTF16Uchar`.
fn r_utf_16(
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> UTF16Decode {
  let unit = (bytes[offset0].to_int() << 8) | bytes[offset1].to_int()
  if 0xD800 <= unit && unit <= 0xDBFF {
    Hi(unit)
  } else if 0xDC00 <= unit && unit <= 0xDFFF {
    UTF16Malformed(slice(bytes, @math.minimum(offset0, offset1), 2))
  } else {
    UTF16Uchar(Char::from_int(unit))
  }
}
412

413
// UTF16BE
414

415
///|
/// Decoding step for big-endian UTF-16 operating on the input buffer.
///
/// Returns `End` when no input is left. With a single byte left, the code
/// unit is assembled in the scratch buffer via `t_fill`. Otherwise two bytes
/// are consumed, read as a code unit (high-order byte first) and passed to
/// `decode_utf_16be_lo`.
fn decode_utf_16be(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  if available < 2 {
    self.t_need(2)
    return t_fill(t_decode_utf_16be, self)
  }
  let start = self.i_pos
  self.i_pos = start + 2
  self.decode_utf_16be_lo(r_utf_16(self.i, start, start + 1))
}
429

430
///|
/// Continuation run after `t_fill` for a big-endian code unit: reads the
/// unit from the scratch buffer, or reports the gathered bytes as malformed
/// when fewer than `t_need` bytes were available.
fn t_decode_utf_16be(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    self.decode_utf_16be_lo(r_utf_16(self.t, 0, 1))
  } else {
    self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len))
  }
}
438

439
///|
/// Finishes a big-endian UTF-16 step given the first code unit `decode`.
///
/// Complete characters and malformed units are returned directly. For a high
/// surrogate the following code unit is read: from the input buffer when two
/// bytes are available, otherwise through the scratch buffer via `t_fill`.
/// In every case decoding resumes with `decode_utf_16be`.
fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
  match decode {
    Hi(hi) =>
      if self.i_rem() >= 2 {
        let start = self.i_pos
        self.i_pos = start + 2
        self.ret(decode_utf_16be, r_utf_16_lo(hi, self.i, start, start + 1))
      } else {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16be_lo)(hi), self)
      }
    UTF16Malformed(raw) => self.ret(decode_utf_16be, Malformed(raw))
    UTF16Uchar(c) => self.ret(decode_utf_16be, Uchar(c))
  }
}
457

458
///|
/// Continuation run after `t_fill` for the low surrogate of a big-endian
/// pair whose high surrogate is `hi`: combines the pair from the scratch
/// buffer, or reports a malformed pair when fewer than `t_need` bytes were
/// available. Decoding resumes with `decode_utf_16be`.
fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
  let outcome = if self.t_len >= self.t_need {
    r_utf_16_lo(hi, self.t, 0, 1)
  } else {
    malformed_pair(true, hi, self.t, 0, self.t_len)
  }
  self.ret(decode_utf_16be, outcome)
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc