• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

moonbitlang / x / 301

10 Dec 2024 06:19AM UTC coverage: 85.204% (-2.6%) from 87.841%
301

Pull #78

github

web-flow
Merge b830031f4 into 91f0fdf48
Pull Request #78: feat: new package encoding

105 of 161 new or added lines in 3 files covered. (65.22%)

124 existing lines in 29 files now uncovered.

1169 of 1372 relevant lines covered (85.2%)

434.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

61.11
/encoding/decoding.mbt
1
// Copyright 2024 International Digital Economy Academy
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14

15
///|
16
/// The Unicode replacement character, U+FFFD.
/// NOTE(review): presumably what lossy decoding substitutes for malformed
/// input; its use is not visible in this section — confirm against callers.
const U_REP = '\u{FFFD}'
17

18
///|
/// Length in bytes of the UTF-8 sequence introduced by each possible first
/// byte, indexed by that byte's value (256 entries).
///
/// An entry of `0` marks a byte that cannot start a sequence; `decode_utf_8`
/// reports such a byte as a one-byte malformed sequence.
let utf_8_len = [
  // 0x00-0x7F: one-byte sequences
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x80-0xBF: not valid as a first byte
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xC0-0xC1: not valid; 0xC2-0xCF: two-byte sequences
  0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  // 0xD0-0xDF: two-byte sequences
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  // 0xE0-0xEF: three-byte sequences
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  // 0xF0-0xF4: four-byte sequences; 0xF5-0xFF: not valid
  4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]
31

32
///|
/// Lossily decodes `src`, interpreted in the given `encoding`, into characters.
///
/// # Parameters
///
/// - `encoding`: The character encoding of the input `src`.
/// - `src`: The `Bytes` holding the encoded string in that encoding.
///
/// # Returns
///
/// A `LossyChars` iterator over the decoded characters, in which every invalid
/// byte sequence has been replaced by a replacement character.
///
/// # Behavior
///
/// - Invalid sequences in `src` do not produce decoding errors; each one is
///   replaced with the replacement character (`\u{FFFD}`).
///
/// # Examples
///
/// ```moonbit
/// let buf = @buffer.T::new(size_hint=10)
/// buf.write_bytes(b"\xe4\xbd\xa0") // "你" in UTF8
/// buf.write_bytes(b"\xe5\xa5\xbd") // "好" in UTF8
/// buf.write_bytes(b"\xf0\x9f\x91\x80") // "👀" in UTF8
/// let chars = @encoding.decode_lossy(UTF8, buf.to_bytes())
/// let arr = chars.iter().collect() // Array of Unicode code points: `['你', '好', '👀']`
/// let str = String::from_array(arr) // MoonBit String, represented as UTF16LE: `"你好👀"`
/// // or
/// let str = chars.to_string()
/// ```
pub fn decode_lossy(encoding : Encoding, src : Bytes) -> LossyChars {
  decoder(encoding, src)
}
64

65
///|
/// Strictly decodes `src`, interpreted in the given `encoding`, into characters.
///
/// # Parameters
///
/// - `encoding`: The character encoding of the input `src`.
/// - `src`: The `Bytes` holding the encoded string in that encoding.
///
/// # Returns
///
/// A `StrictChars` iterator over the decoded characters.
///
/// # Behavior
///
/// - Assumes every sequence in `src` is valid; an invalid sequence is reported
///   as an error instead of being replaced.
///
/// # Examples
///
/// ```moonbit
/// let buf = @buffer.T::new(size_hint=10)
/// buf.write_bytes(b"\xe4\xbd\xa0") // "你" in UTF8
/// buf.write_bytes(b"\xe5\xa5\xbd") // "好" in UTF8
/// buf.write_bytes(b"\xf0\x9f\x91\x80") // "👀" in UTF8
/// let chars = @encoding.decode_strict(UTF8, buf.to_bytes())
/// let arr = chars.iter().collect() // Array of results per code point: `[Ok('你'), Ok('好'), Ok('👀')]`
/// let str = chars.to_string() // MoonBit String, represented as UTF16LE: `"你好👀"`
/// ```
pub fn decode_strict(encoding : Encoding, src : Bytes) -> StrictChars {
  decoder(encoding, src)
}
96

97
// Implementations
98

99
///|
/// Builds a `Decoder` positioned at the start of `src`.
///
/// The decoder starts with an empty four-byte scratch buffer `t` and with the
/// decoding step that matches `encoding` installed as its continuation `k`.
fn decoder(encoding : Encoding, src : Bytes) -> Decoder {
  // `UTF16` without an explicit byte order uses the little-endian step.
  let step = match encoding {
    UTF8 => decode_utf_8
    UTF16 | UTF16LE => decode_utf_16le
    UTF16BE => decode_utf_16be
  }
  {
    i: src,
    i_pos: 0,
    // Index of the last input byte; -1 for empty input.
    i_max: src.length() - 1,
    // NOTE(review): `t_fill` blits into this buffer; confirm that a bytes
    // literal yields a fresh buffer per decoder rather than a shared one.
    t: b"\x00\x00\x00\x00",
    t_len: 0,
    t_need: 0,
    k: step,
  }
}
115

116
///|
/// Runs one decoding step by invoking the continuation currently stored in
/// `self.k` on the decoder itself.
fn decode(self : Decoder) -> Decode {
  let step = self.k
  step(self)
}
120

121
///|
122
/// Installs `k` as the decoder's next continuation and passes `v` through
/// unchanged as the result of the current step.
fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
123
  self.k = k
77✔
124
  v
125
}
126

127
///|
/// Number of input bytes still unread, counting the byte at `i_pos`.
///
/// The result is negative once `eoi` has set `i_max` to the minimum `Int`,
/// which is how the decoding steps recognise end of input.
fn i_rem(self : Decoder) -> Int {
  let last_minus_pos = self.i_max - self.i_pos
  last_minus_pos + 1
}
131

132
///|
/// Puts the decoder into its end-of-input state: the input is replaced by the
/// default (empty) `Bytes`, the read position is reset, and `i_max` is set to
/// the minimum `Int` so that `i_rem` becomes negative.
fn eoi(self : Decoder) -> Unit {
  self.i_max = @int.min_value
  self.i_pos = 0
  self.i = @bytes.default()
}
138

139
///|
140
/// Called by the decoding steps when the current input is used up. It switches
/// the decoder to its end-of-input state and then resumes `k` on it.
fn refill(self : Decoder, k : Cont) -> Decode {
141
  // only Bytes: there is no further input to fetch, so signal end of input
142
  self.eoi()
18✔
143
  k(self)
144
}
145

146
///|
/// Prepares the scratch buffer `t` to collect a sequence of `need` bytes:
/// records the target length and marks the buffer as empty.
fn t_need(self : Decoder, need : Int) -> Unit {
  self.t_need = need
  self.t_len = 0
}
151

152
///|
/// Collects up to `decoder.t_need` bytes of the current sequence into the
/// scratch buffer `decoder.t`, then resumes `k`.
///
/// - At end of input (`i_rem() < 0`) `k` is resumed immediately; it can detect
///   a truncated sequence because `t_len < t_need`.
/// - If fewer bytes remain than are still needed, all of them are copied and
///   `refill` is asked for more input before trying again.
/// - Otherwise exactly the missing bytes are copied and `k` is resumed.
fn t_fill(k : Cont, decoder : Decoder) -> Decode {
  // Copy `l` bytes from the input to the end of the scratch buffer and advance
  // both cursors by the number of bytes actually copied. Advancing by a fixed
  // 1 would desynchronise `i_pos`/`t_len` from the data whenever `l != 1`,
  // including the `l == 0` case where nothing is copied.
  fn blit(decoder : Decoder, l : Int) -> Unit {
    decoder.i.blit(decoder.i_pos, decoder.t, decoder.t_len, l)
    decoder.i_pos = decoder.i_pos + l
    decoder.t_len = decoder.t_len + l
  }

  let rem = decoder.i_rem()
  if rem < 0 { // eoi
    k(decoder)
  } else {
    let need = decoder.t_need - decoder.t_len
    if rem < need {
      blit(decoder, rem)
      decoder.refill(@tuple.curry(t_fill)(k))
    } else {
      blit(decoder, need)
      k(decoder)
    }
  }
}
174

175
// UTF8
176

177
///|
/// UTF-8 decoding step: decodes the sequence starting at `i_pos`.
///
/// Returns `Decode::End` once the input is exhausted. A first byte whose
/// `utf_8_len` entry is `0` is consumed alone and reported as malformed. A
/// sequence cut short by the end of the input is completed through the scratch
/// buffer via `t_fill`.
fn decode_utf_8(self : Decoder) -> Decode {
  let rem = self.i_rem()
  if rem < 0 {
    // End of input was already signalled.
    Decode::End
  } else if rem == 0 {
    // Current input fully consumed.
    self.refill(decode_utf_8)
  } else {
    let lead = self.i[self.i_pos].to_int()
    let need = utf_8_len[lead]
    if rem < need {
      // The sequence runs past the available bytes.
      self.t_need(need)
      t_fill(t_decode_utf_8, self)
    } else {
      let start = self.i_pos
      if need == 0 {
        // Invalid first byte: skip exactly one byte.
        self.i_pos = start + 1
        self.ret(decode_utf_8, malformed(self.i, start, 1))
      } else {
        self.i_pos = start + need
        self.ret(decode_utf_8, r_utf_8(self.i, start, need))
      }
    }
  }
}
206

207
///|
/// Continuation run after `t_fill` when a UTF-8 sequence is decoded from the
/// scratch buffer `t`.
///
/// If the buffer holds fewer bytes than the sequence needs, the collected bytes
/// are reported as malformed; otherwise the buffered sequence is decoded.
/// Either way `decode_utf_8` is installed as the next step through `ret`, the
/// same way the UTF-16 `t_decode_*` continuations do.
fn t_decode_utf_8(self : Decoder) -> Decode {
  if self.t_len < self.t_need {
    self.ret(decode_utf_8, malformed(self.t, 0, self.t_len))
  } else {
    self.ret(decode_utf_8, r_utf_8(self.t, 0, self.t_len))
  }
}
215

216
///|
/// Decodes the UTF-8 sequence of `length` bytes (1 to 4) that starts at
/// `bytes[offset]`.
///
/// Returns `Uchar` for a well-formed sequence and `malformed(bytes, offset,
/// length)` otherwise. Every byte after the first must be a continuation byte
/// (`0b10xxxxxx`). For the first bytes 0xE0, 0xED, 0xF0 and 0xF4 the second
/// byte is limited to a narrower range, which excludes overlong forms,
/// surrogates and values above U+10FFFF. Any other `length` panics.
fn r_utf_8(bytes : Bytes, offset : Int, length : Int) -> Decode {
  // Byte `k` of the sequence, widened to Int.
  fn at(k : Int) -> Int {
    bytes[offset + k].to_int()
  }

  // True when `b` has the continuation-byte form 0b10xxxxxx.
  fn is_cont(b : Int) -> Bool {
    (b >> 6) == 0b10
  }

  fn uchar(c : Int) -> Decode {
    Uchar(Char::from_int(c))
  }

  fn bad() -> Decode {
    malformed(bytes, offset, length)
  }

  match length {
    1 => uchar(at(0))
    2 => {
      let b0 = at(0)
      let b1 = at(1)
      if is_cont(b1) {
        uchar(((b0 & 0x1F) << 6) | (b1 & 0x3F))
      } else {
        bad()
      }
    }
    3 => {
      let b0 = at(0)
      let b1 = at(1)
      let b2 = at(2)
      // Inclusive range allowed for the second byte, given the first byte.
      let (lo, hi) = match b0 {
        0xE0 => (0xA0, 0xBF)
        0xED => (0x80, 0x9F)
        _ => (0x80, 0xBF)
      }
      if is_cont(b2) && lo <= b1 && b1 <= hi {
        uchar(((b0 & 0x0F) << 12) | (((b1 & 0x3F) << 6) | (b2 & 0x3F)))
      } else {
        bad()
      }
    }
    4 => {
      let b0 = at(0)
      let b1 = at(1)
      let b2 = at(2)
      let b3 = at(3)
      // Inclusive range allowed for the second byte, given the first byte.
      let (lo, hi) = match b0 {
        0xF0 => (0x90, 0xBF)
        0xF4 => (0x80, 0x8F)
        _ => (0x80, 0xBF)
      }
      if is_cont(b3) && is_cont(b2) && lo <= b1 && b1 <= hi {
        uchar(
          ((b0 & 0x07) << 18) |
          ((b1 & 0x3F) << 12) |
          ((b2 & 0x3F) << 6) |
          (b3 & 0x3F),
        )
      } else {
        bad()
      }
    }
    _ => panic()
  }
}
300

301
// UTF16LE
302

303
///|
304
/// Result of reading a single 16-bit code unit (produced by `r_utf_16`).
priv enum UTF16Decode {
305
  // Code unit in 0xD800..0xDBFF (a high surrogate); a low surrogate must follow.
  Hi(Int)
306
  // Code unit in 0xDC00..0xDFFF with no preceding high surrogate; carries the
  // offending bytes (presumably the two bytes of the unit — see `slice`).
  UTF16Malformed(Bytes)
307
  // Code unit outside the surrogate range, already a complete character.
  UTF16Uchar(Char)
308
}
309

310
///|
/// UTF-16LE decoding step: reads the 16-bit unit at `i_pos` (low byte first)
/// and hands the outcome to `decode_utf_16le_lo`.
///
/// Returns `Decode::End` once the input is exhausted. When only one byte is
/// left, the unit is completed through the scratch buffer via `t_fill`.
fn decode_utf_16le(self : Decoder) -> Decode {
  let rem = self.i_rem()
  if rem < 0 {
    // End of input was already signalled.
    Decode::End
  } else if rem == 0 {
    // Current input fully consumed.
    self.refill(decode_utf_16le)
  } else if rem < 2 {
    self.t_need(2)
    t_fill(t_decode_utf_16le, self)
  } else {
    let start = self.i_pos
    self.i_pos = start + 2
    // Little endian: the high byte is the second one.
    self.decode_utf_16le_lo(r_utf_16(self.i, start + 1, start))
  }
}
332

333
///|
/// Continuation run after `t_fill` when a UTF-16LE unit is read from the
/// scratch buffer `t`.
///
/// A complete unit is passed on to `decode_utf_16le_lo`; an incomplete one is
/// reported as malformed, with `decode_utf_16le` installed as the next step.
fn t_decode_utf_16le(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0))
  } else {
    self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len))
  }
}
341

342
///|
/// Finishes a UTF-16LE step given the outcome `v` of reading the first unit.
///
/// Complete characters and malformed units are returned directly. After a high
/// surrogate, the next unit is read as the low surrogate: straight from the
/// input when two bytes are available, otherwise through the scratch buffer.
fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
  match v {
    Hi(hi) =>
      if self.i_rem() >= 2 {
        let start = self.i_pos
        self.i_pos = start + 2
        r_utf_16_lo(hi, self.i, start + 1, start)
      } else {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self)
      }
    UTF16Malformed(s) => Malformed(s)
    UTF16Uchar(u) => Uchar(u)
  }
}
360

361
///|
/// Continuation run after `t_fill` when the low surrogate following `hi` is
/// read from the scratch buffer (little-endian byte order).
///
/// An incomplete unit yields a malformed pair; a complete one is combined with
/// `hi`. In both cases `decode_utf_16le` becomes the next step.
fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
  if decoder.t_len >= decoder.t_need {
    decoder.ret(decode_utf_16le, r_utf_16_lo(hi, decoder.t, 1, 0))
  } else {
    decoder.ret(
      decode_utf_16le,
      malformed_pair(false, hi, decoder.t, 0, decoder.t_len),
    )
  }
}
372

373
///|
/// Reads a 16-bit unit from `bytes` and combines it, as a low surrogate, with
/// the high surrogate `hi`.
///
/// # Parameters
///
/// - `hi`: the high surrogate already read (0xD800..0xDBFF).
/// - `bytes`: the buffer holding the next unit.
/// - `offset0`: index of the unit's most significant byte.
/// - `offset1`: index of the unit's least significant byte.
///
/// # Returns
///
/// `Uchar` with the supplementary-plane character when the unit is a low
/// surrogate (0xDC00..0xDFFF); otherwise `malformed_pair` over the two bytes.
/// The first argument to `malformed_pair` is `offset0 < offset1`, i.e. true
/// when the bytes are in big-endian order.
fn r_utf_16_lo(hi : Int, bytes : Bytes, offset0 : Int, offset1 : Int) -> Decode {
  let b0 = bytes[offset0].to_int()
  let b1 = bytes[offset1].to_int()
  let lo = (b0 << 8) | b1
  if lo < 0xDC00 || lo > 0xDFFF {
    malformed_pair(
      offset0 < offset1,
      hi,
      bytes,
      @math.minimum(offset0, offset1),
      2,
    )
  } else {
    // Assemble the 20-bit value first, then add the 0x10000 offset. OR-ing the
    // offset into the low half loses it whenever bit 16 of the high half is
    // already set (e.g. D840 DC00 must give U+20000, not U+10000).
    Uchar(Char::from_int((((hi & 0x3FF) << 10) | (lo & 0x3FF)) + 0x10000))
  }
}
390

391
///|
/// Reads one 16-bit unit from `bytes` and classifies it.
///
/// `offset0` is the index of the most significant byte and `offset1` that of
/// the least significant byte, so the caller chooses the byte order.
///
/// - 0xD800..0xDBFF gives `Hi` (a low surrogate is expected next).
/// - 0xDC00..0xDFFF gives `UTF16Malformed` (low surrogate with no high one).
/// - Anything else gives `UTF16Uchar`.
fn r_utf_16(bytes : Bytes, offset0 : Int, offset1 : Int) -> UTF16Decode {
  let high_byte = bytes[offset0].to_int()
  let low_byte = bytes[offset1].to_int()
  let unit = (high_byte << 8) | low_byte
  if 0xD800 <= unit && unit <= 0xDBFF {
    Hi(unit)
  } else if 0xDC00 <= unit && unit <= 0xDFFF {
    UTF16Malformed(slice(bytes, @math.minimum(offset0, offset1), 2))
  } else {
    UTF16Uchar(Char::from_int(unit))
  }
}
404

405
// UTF16BE
406

407
///|
/// UTF-16BE decoding step: reads the 16-bit unit at `i_pos` (high byte first)
/// and hands the outcome to `decode_utf_16be_lo`.
///
/// Returns `Decode::End` once the input is exhausted. When only one byte is
/// left, the unit is completed through the scratch buffer via `t_fill`.
fn decode_utf_16be(self : Decoder) -> Decode {
  let rem = self.i_rem()
  if rem < 0 {
    // End of input was already signalled.
    Decode::End
  } else if rem == 0 {
    // Current input fully consumed.
    self.refill(decode_utf_16be)
  } else if rem < 2 {
    self.t_need(2)
    t_fill(t_decode_utf_16be, self)
  } else {
    let start = self.i_pos
    self.i_pos = start + 2
    // Big endian: the high byte comes first.
    self.decode_utf_16be_lo(r_utf_16(self.i, start, start + 1))
  }
}
428

429
///|
/// Continuation run after `t_fill` when a UTF-16BE unit is read from the
/// scratch buffer `t`.
///
/// A complete unit is passed on to `decode_utf_16be_lo`; an incomplete one is
/// reported as malformed, with `decode_utf_16be` installed as the next step.
fn t_decode_utf_16be(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    self.decode_utf_16be_lo(r_utf_16(self.t, 0, 1))
  } else {
    self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len))
  }
}
437

438
///|
/// Finishes a UTF-16BE step given the outcome `decode` of reading the first
/// unit.
///
/// Complete characters and malformed units are returned with `decode_utf_16be`
/// installed as the next step. After a high surrogate, the next unit is read as
/// the low surrogate: straight from the input when two bytes are available,
/// otherwise through the scratch buffer.
fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
  match decode {
    Hi(hi) =>
      if self.i_rem() >= 2 {
        let start = self.i_pos
        self.i_pos = start + 2
        self.ret(decode_utf_16be, r_utf_16_lo(hi, self.i, start, start + 1))
      } else {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16be_lo)(hi), self)
      }
    UTF16Malformed(bad) => self.ret(decode_utf_16be, Malformed(bad))
    UTF16Uchar(ch) => self.ret(decode_utf_16be, Uchar(ch))
  }
}
456

457
///|
/// Continuation run after `t_fill` when the low surrogate following `hi` is
/// read from the scratch buffer (big-endian byte order).
///
/// An incomplete unit yields a malformed pair; a complete one is combined with
/// `hi`. In both cases `decode_utf_16be` becomes the next step.
fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    self.ret(decode_utf_16be, r_utf_16_lo(hi, self.t, 0, 1))
  } else {
    self.ret(decode_utf_16be, malformed_pair(true, hi, self.t, 0, self.t_len))
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc