moonbitlang / x / 383

Committed 08 Mar 2025 12:31AM UTC coverage: 88.643% (-0.9%) from 89.592%

Build # 383

Build Type

Pull #112

github

Committed by

web-flow

Commit Message

Merge 1bab852be into 4fce8878f

Pull Request Pull Request #112: perf(encoding): optimize char accumulation

Run Details

10 of 29 new or added lines in 2 files covered. (34.48%)

28 existing lines in 1 file now uncovered.

1280 of 1444 relevant lines covered (88.64%)

428.35 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

78.99

/encoding/decoding.mbt

// Copyright 2024 International Digital Economy Academy
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

///|
/// The Unicode Replacement Character (`U+FFFD`). `decode_lossy` writes it to
/// the output in place of each malformed sequence.
/// https://unicode.org/charts/nameslist/n_FFF0.html
pub const U_REP = '\u{FFFD}'

///|
/// Lookup table indexed by the value of a UTF-8 lead byte (0..255) giving the
/// total number of bytes in the sequence that byte starts. `decode_utf_8`
/// treats an entry of `0` as "this byte cannot start a sequence".
///
/// Layout of the 256 entries:
/// - `0x00..=0x7F` -> 1
/// - `0x80..=0xC1` -> 0
/// - `0xC2..=0xDF` -> 2
/// - `0xE0..=0xEF` -> 3
/// - `0xF0..=0xF4` -> 4
/// - `0xF5..=0xFF` -> 0
let utf_8_len = [
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
  4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]

///|
/// Compute a size hint for the string decoded from a UTF-8 buffer.
///
/// The result is the number of bytes in `utf_8_bs` plus the number of those
/// bytes that are ASCII (`<= 0x7F`).
/// NOTE(review): presumably the hint is in bytes of UTF-16 output (an ASCII
/// byte becomes one 2-byte code unit) - confirm against `StringBuilder::new`.
fn estimate_utf_16_len(utf_8_bs : FixedArray[Byte]) -> Int {
  // start from the raw byte count and add one more per ASCII byte
  let mut estimate = utf_8_bs.length()
  for octet in utf_8_bs {
    if octet < 0x80 {
      estimate += 1
    }
  }
  estimate
}

///|
/// Build a fresh `Decoder` for the given character encoding.
///
/// The returned decoder starts with an empty input buffer, a 4-byte scratch
/// buffer for sequences split across chunks, and the decoding routine that
/// matches `encoding` (`UTF16` uses the little-endian routine).
///
/// # Parameters
///
/// - `encoding`: The character encoding of the byte sequences that will be fed to the decoder.
///
/// # Returns
///
/// A `Decoder` instance that can be used to decode byte sequences into strings.
///
/// # Examples
///
/// ```moonbit
/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
/// let decoder = decoder(UTF8)
/// inspect!(decoder.consume!(inputs[0]), content="abc")
/// inspect!(decoder.consume!(inputs[1]), content="")
/// inspect!(decoder.consume!(inputs[2]), content="🐰")
/// assert_true!(decoder.finish!().is_empty())
/// ```
pub fn decoder(encoding : Encoding) -> Decoder {
  // pick the initial continuation for this encoding
  let k = match encoding {
    UTF8 => decode_utf_8
    UTF16 | UTF16LE => decode_utf_16le
    UTF16BE => decode_utf_16be
  }
  {
    i: FixedArray::default(),
    i_pos: 0,
    t: FixedArray::make(4, Byte::default()),
    t_len: 0,
    t_need: 0,
    k,
    encoding,
  }
}

///|
/// Decode the given byte sequence using the specified `Decoder` and return the resulting string.
///
/// This function can work in streaming mode where bytes are consumed incrementally.
/// When `stream` is `false`, it indicates the end of the input and triggers the final decoding step.
///
/// If no input remains buffered after appending `input`, an empty string is
/// returned without running the decoder.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to decode the byte sequence.
/// - `input`: The byte sequence to be decoded.
/// - `stream~`: A boolean indicating whether more bytes will be supplied for decoding. It defaults to `false`.
///
/// # Returns
///
/// A `String` representing the decoded content from the input byte sequence.
///
/// # Errors
///
/// `MalformedError`: when the byte sequence is not properly formatted according to the specified encoding.
/// `TruncatedError`: when `stream` is `false` and the byte sequence ends prematurely, implying that more data is expected for complete decoding.
///
/// # Examples
///
/// ```moonbit
/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
/// let decoder = @encoding.decoder(UTF8)
/// inspect!(decoder.decode!(inputs[0], stream=true), content="abc")
/// inspect!(decoder.decode!(inputs[1], stream=true), content="")
/// inspect!(decoder.decode!(inputs[2], stream=false), content="🐰")
/// ```
pub fn decode!(self : Decoder, input : Bytes, stream~ : Bool = false) -> String {
  // append the new chunk to whatever is still undecoded
  if input.length() > 0 {
    self.i_cont(input)
  }
  if self.i_rem() == 0 {
    return String::default()
  }
  // pre-size the builder: remaining byte count for UTF-16 input, an estimate
  // derived from the buffer contents for UTF-8 input
  let size_hint = match self.encoding {
    UTF16 | UTF16LE | UTF16BE => self.i_rem()
    UTF8 => estimate_utf_16_len(self.i)
  }
  let builder = StringBuilder::new(size_hint~)

  // drive decoder to decode
  loop self.decode_() {
    Uchar(u) => {
      builder.write_char(u)
      continue self.decode_()
    }
    Malformed(bs) =>
      // NOTE(review): in the code visible here `t_need` is only assigned by
      // `Decoder::t_need` and never reset to 0, so once any partial sequence
      // has been seen this branch returns the text so far instead of raising
      // while streaming - confirm this is intended.
      if stream && self.t_need > 0 {
        builder.to_string()
      } else {
        raise MalformedError(bs)
      }
    End => builder.to_string()
    Refill(t) =>
      // input ended in the middle of a sequence: fine while streaming,
      // an error once the caller says there is no more input
      if stream {
        builder.to_string()
      } else {
        raise TruncatedError(t)
      }
  }
}

///|
/// Consume the given byte sequence using the specified `Decoder` and return the resulting string incrementally.
///
/// This function calls `decode!` with the `stream` parameter set to `true`, indicating that more bytes will follow for decoding.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to consume the byte sequence.
/// - `input`: The byte sequence to be consumed and decoded incrementally.
///
/// # Returns
///
/// A `String` representing the partially decoded content from the input byte sequence, as more bytes are expected.
///
/// # Errors
///
/// `MalformedError`: when the byte sequence is not properly formatted according to the specified encoding.
///
/// `TruncatedError` is not raised on this path: with `stream=true`, `decode!`
/// returns the text decoded so far when the input ends in the middle of a
/// sequence. Call `finish!` to have a dangling partial sequence reported.
pub fn consume!(self : Decoder, input : Bytes) -> String {
  self.decode!(input, stream=true)
}

///|
/// Finalize the decoding process and return the remaining decoded string.
///
/// This function calls `decode!` with an empty input and the `stream` parameter set to `false`, indicating that no more bytes will be supplied
/// and triggering the final decoding step to produce the remaining output.
/// When nothing is left in the decoder's input buffer, `decode!` returns an empty string.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to finalize the decoding process.
///
/// # Returns
///
/// A `String` representing the final part of the decoded content, after all byte sequences have been processed.
///
/// # Errors
///
/// `MalformedError`: This error is raised if the remaining byte sequence is not properly formatted according to the specified encoding.
/// `TruncatedError`: This error is raised if the remaining byte sequence ends prematurely, implying that more data was expected for complete decoding.
pub fn finish!(self : Decoder) -> String {
  self.decode!(b"", stream=false)
}

///|
/// Decode the given byte sequence using the specified `Decoder` and return the resulting string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
///
/// This function can work in streaming mode where bytes are consumed incrementally.
/// When `stream` is `false`, it indicates the end of the input and triggers the final decoding step.
///
/// Unlike `decode!`, this function never raises: malformed input becomes
/// `U_REP` in the output.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to decode the byte sequence.
/// - `input`: The byte sequence to be decoded.
/// - `stream~`: A boolean indicating whether more bytes will be supplied for decoding. It defaults to `false`.
///
/// # Returns
///
/// A `String` representing the decoded content from the input byte sequence, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`).
pub fn decode_lossy(
  self : Decoder,
  input : Bytes,
  stream~ : Bool = false
) -> String {
  // append the new chunk to whatever is still undecoded
  if input.length() > 0 {
    self.i_cont(input)
  }
  if self.i_rem() == 0 {
    return String::default()
  }
  // pre-size the builder: remaining byte count for UTF-16 input, an estimate
  // derived from the buffer contents for UTF-8 input
  let size_hint = match self.encoding {
    UTF16 | UTF16LE | UTF16BE => self.i_rem()
    UTF8 => estimate_utf_16_len(self.i)
  }
  let builder = StringBuilder::new(size_hint~)

  // drive decoder to decode
  loop self.decode_() {
    Uchar(u) => {
      builder.write_char(u)
      continue self.decode_()
    }
    Malformed(_) =>
      // NOTE(review): in the code visible here `t_need` is never reset to 0
      // after a partial sequence, so while streaming this branch may return
      // early without emitting `U_REP` - confirm this is intended.
      if stream && self.t_need > 0 {
        builder.to_string()
      } else {
        builder.write_char(U_REP)
        continue self.decode_()
      }
    End => builder.to_string()
    Refill(_) =>
      if stream {
        // wait for the next chunk to complete the pending sequence
        builder.to_string()
      } else {
        // no more input will arrive: run the decoder again so the
        // continuation installed by `refill` can deal with the leftover bytes
        continue self.decode_()
      }
  }
}

///|
/// Consume the given byte sequence using the specified `Decoder` and return the resulting string incrementally, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
///
/// This function calls `decode_lossy` with the `stream` parameter set to `true`, indicating that more bytes will follow for decoding.
/// It does not raise errors.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to consume and decode the byte sequence.
/// - `input`: The byte sequence to be consumed and decoded incrementally.
///
/// # Returns
///
/// A `String` representing the partially decoded content from the input byte sequence, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`), as more bytes are expected.
pub fn lossy_consume(self : Decoder, input : Bytes) -> String {
  self.decode_lossy(input, stream=true)
}

///|
/// Finalize the lossy decoding process and return the remaining decoded string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
///
/// This function calls `decode_lossy` with an empty input and the `stream` parameter set to `false`, indicating that no more bytes will be supplied
/// and triggering the final decoding step to produce the remaining output.
/// It does not raise errors.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to finalize the lossy decoding process.
///
/// # Returns
///
/// A `String` representing the final part of the decoded content, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`), after all byte sequences have been processed.
pub fn lossy_finish(self : Decoder) -> String {
  self.decode_lossy(b"", stream=false)
}

///|
/// Append `input` to the decoder's input buffer, discarding the part of the
/// old buffer that has already been consumed.
///
/// A fresh buffer is allocated holding the undecoded tail of the old buffer
/// followed by all of `input`; the read position is reset to 0.
fn i_cont(self : Decoder, input : Bytes) -> Unit {
  let added = input.length()
  // `i_rem()` can be negative once the buffer has been dropped; treat as 0
  let kept = @math.maximum(self.i_rem(), 0)
  let merged = FixedArray::make(kept + added, Byte::default())
  // carry over the unread tail of the previous buffer, if any
  if kept > 0 {
    self.i.blit_to(merged, len=kept, src_offset=self.i_pos)
  }
  // the new chunk goes right after the carried-over tail
  merged.blit_from_bytes(kept, input, 0, added)
  self.i = merged
  self.i_pos = 0
}

// Implementations

///|
/// Run one decoding step by invoking the decoder's current continuation `k`
/// on the decoder itself, and return whatever that step produces.
fn decode_(self : Decoder) -> Decode {
  (self.k)(self)
}

///|
/// Store `k` as the continuation to run on the next decoding step and hand
/// back `v` unchanged as the result of the current step.
fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
  self.k = k
  v
}

///|
/// Number of bytes left to read in the input buffer `i`.
///
/// The value is the buffer length minus the read position. It can be negative
/// after `eoi` has swapped in an empty buffer while `i_pos` keeps its old
/// value; `t_fill` treats a negative result as end of input.
fn i_rem(self : Decoder) -> Int {
  let total = self.i.length()
  total - self.i_pos
}

///|
/// Prepare the scratch buffer `t` to collect a sequence of `need` bytes:
/// records the target length and marks the buffer as empty.
fn t_need(self : Decoder, need : Int) -> Unit {
  self.t_need = need
  self.t_len = 0
}

///|
/// Drop the current input buffer by replacing `i` with a default (empty)
/// array. `i_pos` is left untouched.
fn eoi(self : Decoder) -> Unit {
  self.i = FixedArray::default()
}

///|
/// Signal that more input is needed: drops the input buffer, installs `k` as
/// the continuation to resume with, and returns `Decode::Refill` carrying a
/// copy of the scratch buffer `t`.
///
/// NOTE(review): the whole of `t` is copied, not just its first `t_len`
/// bytes - confirm callers expect the unfilled tail to be included.
fn refill(self : Decoder, k : Cont) -> Decode {
  self.eoi()
  self.ret(k, Decode::Refill(Bytes::from_fixedarray(self.t)))
}

///|
/// Move bytes from the input buffer into the scratch buffer `t` until `t`
/// holds `t_need` bytes, then continue with `k`.
///
/// - If the input has ended (`i_rem()` is negative), `k` is called right away
///   with whatever `t` already contains.
/// - If the input holds fewer bytes than are still missing, everything
///   available is copied and a `Refill` is returned whose continuation
///   re-enters `t_fill` with the same `k`.
/// - Otherwise exactly the missing bytes are copied and `k` is called.
fn t_fill(k : Cont, decoder : Decoder) -> Decode {
  // copy `l` bytes from `i` (at `i_pos`) to `t` (at `t_len`), advancing both
  fn blit(decoder : Decoder, l : Int) -> Unit {
    decoder.i.blit_to(
      decoder.t,
      len=l,
      dst_offset=decoder.t_len,
      src_offset=decoder.i_pos,
    )
    decoder.i_pos += l
    decoder.t_len += l
  }

  let rem = decoder.i_rem()
  if rem < 0 { // eoi
    k(decoder)
  } else {
    // bytes still missing from the sequence being assembled
    let need = decoder.t_need - decoder.t_len
    if rem < need {
      blit(decoder, rem)
      decoder.refill(@tuple.curry(t_fill)(k))
    } else {
      blit(decoder, need)
      k(decoder)
    }
  }
}

// UTF8

///|
/// Decode the next UTF-8 sequence from the input buffer.
///
/// Returns `Decode::End` when no input is left. Otherwise the lead byte is
/// looked up in `utf_8_len` to find the sequence length:
/// - if the buffer holds fewer bytes than that, the sequence is assembled in
///   the scratch buffer via `t_fill` and finished by `t_decode_utf_8`;
/// - if the length is 0, one byte is consumed and reported as malformed;
/// - otherwise the whole sequence is consumed and decoded by `r_utf_8`.
fn decode_utf_8(self : Decoder) -> Decode {
  let rem = self.i_rem()
  if rem <= 0 {
    Decode::End
  } else {
    let idx = self.i[self.i_pos].to_int()
    let need = utf_8_len[idx]
    if rem < need {
      // sequence is split across chunks: gather it in `t`
      self.t_need(need)
      t_fill(t_decode_utf_8, self)
    } else {
      let j = self.i_pos
      if need == 0 {
        // not a valid lead byte: skip just this byte
        self.i_pos += 1
        self.ret(decode_utf_8, malformed(self.i, j, 1))
      } else {
        self.i_pos += need
        self.ret(decode_utf_8, r_utf_8(self.i, j, need))
      }
    }
  }
}

///|
/// Finish a UTF-8 sequence that was assembled in the scratch buffer `t`.
///
/// If `t` holds fewer bytes than required (the input ended first), the bytes
/// collected so far are reported as malformed; otherwise they are decoded by
/// `r_utf_8`. Either way decoding resumes with `decode_utf_8`.
fn t_decode_utf_8(self : Decoder) -> Decode {
  if self.t_len < self.t_need {
    self.ret(decode_utf_8, malformed(self.t, 0, self.t_len))
  } else {
    self.ret(decode_utf_8, r_utf_8(self.t, 0, self.t_len))
  }
}

///|
/// Decode a single UTF-8 sequence of `length` bytes starting at
/// `bytes[offset]`.
///
/// Returns `Uchar` with the decoded character when the sequence is
/// well-formed, otherwise the result of `malformed(bytes, offset, length)`.
/// The second byte is range-checked according to the lead byte (`0xE0`,
/// `0xED`, `0xF0` and `0xF4` have narrower ranges); every later byte must be a
/// `10xxxxxx` continuation byte. Panics when `length` is not 1, 2, 3 or 4.
fn r_utf_8(bytes : FixedArray[Byte], offset : Int, length : Int) -> Decode {
  // k-th byte of the sequence, widened to Int
  fn byte_at(k : Int) -> Int {
    bytes[offset + k].to_int()
  }

  // true when `b` has the `10xxxxxx` shape of a continuation byte
  fn is_continuation(b : Int) -> Bool {
    (b >> 6) == 0b10
  }

  // compute validity and the candidate value together, decide once at the end
  let (well_formed, scalar) = match length {
    1 => (true, byte_at(0))
    2 => {
      let lead = byte_at(0)
      let second = byte_at(1)
      (is_continuation(second), ((lead & 0x1F) << 6) | (second & 0x3F))
    }
    3 => {
      let lead = byte_at(0)
      let second = byte_at(1)
      let third = byte_at(2)
      let second_ok = match lead {
        0xE0 => 0xA0 <= second && second <= 0xBF
        0xED => 0x80 <= second && second <= 0x9F
        _ => is_continuation(second)
      }
      let value = ((lead & 0x0F) << 12) |
        ((second & 0x3F) << 6) |
        (third & 0x3F)
      (is_continuation(third) && second_ok, value)
    }
    4 => {
      let lead = byte_at(0)
      let second = byte_at(1)
      let third = byte_at(2)
      let fourth = byte_at(3)
      let second_ok = match lead {
        0xF0 => 0x90 <= second && second <= 0xBF
        0xF4 => 0x80 <= second && second <= 0x8F
        _ => is_continuation(second)
      }
      let value = ((lead & 0x07) << 18) |
        ((second & 0x3F) << 12) |
        ((third & 0x3F) << 6) |
        (fourth & 0x3F)
      (is_continuation(fourth) && is_continuation(third) && second_ok, value)
    }
    _ => panic()
  }
  if well_formed {
    Uchar(Char::from_int(scalar))
  } else {
    malformed(bytes, offset, length)
  }
}

// UTF16LE

///|
/// Result of reading one 16-bit unit with `r_utf_16`.
priv enum UTF16Decode {
  // the unit is a high surrogate (0xD800..=0xDBFF); a low surrogate must follow
  Hi(Int)
  // the unit is a low surrogate with no preceding high surrogate; carries its bytes
  UTF16Malformed(Bytes)
  // the unit is a complete character outside the surrogate range
  UTF16Uchar(Char)
}

///|
/// Decode the next UTF-16 little-endian unit from the input buffer.
///
/// Returns `Decode::End` when no input is left. With only one byte available
/// the unit is assembled in the scratch buffer via `t_fill`. Otherwise two
/// bytes are consumed and the unit is passed to `decode_utf_16le_lo`, which
/// handles a possible surrogate pair.
fn decode_utf_16le(self : Decoder) -> Decode {
  let rem = self.i_rem()
  if rem <= 0 {
    Decode::End
  } else if rem < 2 {
    self.t_need(2)
    t_fill(t_decode_utf_16le, self)
  } else {
    let j = self.i_pos
    self.i_pos += 2
    // little-endian: the high-order byte is the second one (`j + 1`)
    self.decode_utf_16le_lo(r_utf_16(self.i, j + 1, j))
  }
}

///|
/// Finish a UTF-16LE unit that was assembled in the scratch buffer `t`.
///
/// If `t` holds fewer bytes than required (the input ended first), the bytes
/// collected so far are reported as malformed; otherwise the unit is read
/// from `t` (high-order byte at index 1) and passed to `decode_utf_16le_lo`.
fn t_decode_utf_16le(self : Decoder) -> Decode {
  if self.t_len < self.t_need {
    self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len))
  } else {
    self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0))
  }
}

///|
/// Turn the result of reading a UTF-16LE unit into a `Decode` value, reading
/// the low surrogate when the unit was a high surrogate.
///
/// Complete characters and malformed units are returned directly. For a high
/// surrogate, the following unit is read: from the scratch buffer via
/// `t_fill` when fewer than two bytes are available, otherwise in place from
/// the input buffer.
fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
  match v {
    UTF16Uchar(u) => self.ret(decode_utf_16le, Uchar(u))
    UTF16Malformed(s) => self.ret(decode_utf_16le, Malformed(s))
    Hi(hi) => {
      let rem = self.i_rem()
      if rem < 2 {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self)
      } else {
        let j = self.i_pos
        let dcd = r_utf_16_lo(hi, self.i, j + 1, j)
        // consume the second unit only when it completed the pair; otherwise
        // it stays in the buffer and is decoded on its own by the next step
        match dcd {
          Uchar(_) => self.i_pos += 2
          _ => ()
        }
        self.ret(decode_utf_16le, dcd)
      }
    }
  }
}

///|
/// Finish a UTF-16LE surrogate pair whose second unit was assembled in the
/// scratch buffer `t`; `hi` is the high surrogate read earlier.
///
/// If `t` holds fewer bytes than required (the input ended first), the result
/// of `malformed_pair` is returned; otherwise the unit in `t` is combined
/// with `hi` by `r_utf_16_lo`. Decoding resumes with `decode_utf_16le`.
fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
  if decoder.t_len < decoder.t_need {
    decoder.ret(
      decode_utf_16le,
      // NOTE(review): `malformed_pair` is defined outside this view; the
      // `false` flag presumably selects little-endian - confirm.
      malformed_pair(false, hi, decoder.t, 0, decoder.t_len),
    )
  } else {
    decoder.ret(decode_utf_16le, r_utf_16_lo(hi, decoder.t, 1, 0))
  }
}

///|
/// Combine a high surrogate with the 16-bit unit that follows it.
///
/// # Parameters
///
/// - `hi`: the high surrogate (`0xD800..=0xDBFF`) read previously.
/// - `bytes`: buffer holding the following unit.
/// - `offset0`: index of that unit's high-order byte.
/// - `offset1`: index of that unit's low-order byte.
///
/// # Returns
///
/// `Uchar` with the supplementary-plane character when the unit is a low
/// surrogate (`0xDC00..=0xDFFF`); otherwise `Malformed` carrying the two bytes
/// at `offset0` and `offset1`, in that order.
fn r_utf_16_lo(
  hi : Int,
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> Decode {
  let b0 = bytes[offset0].to_int()
  let b1 = bytes[offset1].to_int()
  let lo = (b0 << 8) | b1
  if lo < 0xDC00 || lo > 0xDFFF {
    // NOTE(jinser): only hi malformed, skip lo if lo is illegal
    //
    // For example, b"\xD8\x00\x00\x48" (BE)
    // Since \xD8\x00 is *legal* hi, here will try to parse lo next,
    // however the whole \xD8\x00\x00\x48 is *illegal* so the result will be a `Malformed[b"\xD8\x00\x00\x48"]`
    //
    // But \x00\x48 itself is a *legal* UTF16 code point with a value of `H`,
    // the ideal result should be: `[Malformed(b"\xD8\x00"), Uchar('H')]`
    //
    // > printf '\xD8\x00\x00\x48' | uconv --from-code UTF16BE --to-code UTF8 --from-callback substitute
    // �H
    Malformed([bytes[offset0], bytes[offset1]])
  } else {
    // Merge the two 10-bit halves first, then add the 0x10000 bias.
    // OR-ing the bias in instead loses the carry whenever the high half
    // already has bit 16 set, e.g. D840 DC00 would give U+10000 rather than
    // U+20000.
    Uchar(Char::from_int((((hi & 0x3FF) << 10) | (lo & 0x3FF)) + 0x10000))
  }
}

///|
/// Read one 16-bit unit from `bytes` and classify it.
///
/// `offset0` is the index of the high-order byte and `offset1` the index of
/// the low-order byte. The result is `Hi` for a high surrogate,
/// `UTF16Malformed` (carrying the unit's two bytes in buffer order) for a lone
/// low surrogate, and `UTF16Uchar` for any other value.
fn r_utf_16(
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> UTF16Decode {
  let unit = (bytes[offset0].to_int() << 8) | bytes[offset1].to_int()
  if 0xD800 <= unit && unit <= 0xDBFF {
    // high surrogate: the caller must fetch the low half
    Hi(unit)
  } else if 0xDC00 <= unit && unit <= 0xDFFF {
    // low surrogate with no high surrogate before it
    let start = @math.minimum(offset0, offset1)
    UTF16Malformed(slice(bytes, start, 2))
  } else {
    UTF16Uchar(Char::from_int(unit))
  }
}

// UTF16BE

///|
/// Decode the next UTF-16 big-endian unit from the input buffer.
///
/// Returns `Decode::End` when no input is left. With only one byte available
/// the unit is assembled in the scratch buffer via `t_fill`. Otherwise two
/// bytes are consumed and the unit is passed to `decode_utf_16be_lo`, which
/// handles a possible surrogate pair.
fn decode_utf_16be(self : Decoder) -> Decode {
  let rem = self.i_rem()
  if rem <= 0 {
    Decode::End
  } else if rem < 2 {
    self.t_need(2)
    t_fill(t_decode_utf_16be, self)
  } else {
    let j = self.i_pos
    self.i_pos += 2
    // big-endian: the high-order byte is the first one (`j`)
    self.decode_utf_16be_lo(r_utf_16(self.i, j, j + 1))
  }
}

///|
/// Finish a UTF-16BE unit that was assembled in the scratch buffer `t`.
///
/// If `t` holds fewer bytes than required (the input ended first), the bytes
/// collected so far are reported as malformed; otherwise the unit is read
/// from `t` (high-order byte at index 0) and passed to `decode_utf_16be_lo`.
fn t_decode_utf_16be(self : Decoder) -> Decode {
  if self.t_len < self.t_need {
    self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len))
  } else {
    self.decode_utf_16be_lo(r_utf_16(self.t, 0, 1))
  }
}

///|
/// Turn the result of reading a UTF-16BE unit into a `Decode` value, reading
/// the low surrogate when the unit was a high surrogate.
///
/// Complete characters and malformed units are returned directly. For a high
/// surrogate, the following unit is read: from the scratch buffer via
/// `t_fill` when fewer than two bytes are available, otherwise in place from
/// the input buffer.
fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
  match decode {
    UTF16Uchar(x) => self.ret(decode_utf_16be, Uchar(x))
    UTF16Malformed(x) => self.ret(decode_utf_16be, Malformed(x))
    Hi(hi) => {
      let rem = self.i_rem()
      if rem < 2 {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16be_lo)(hi), self)
      } else {
        let j = self.i_pos
        let dcd = r_utf_16_lo(hi, self.i, j, j + 1)
        // consume the second unit only when it completed the pair; otherwise
        // it stays in the buffer and is decoded on its own by the next step
        match dcd {
          Uchar(_) => self.i_pos += 2
          _ => ()
        }
        self.ret(decode_utf_16be, dcd)
      }
    }
  }
}

///|
/// Finish a UTF-16BE surrogate pair whose second unit was assembled in the
/// scratch buffer `t`; `hi` is the high surrogate read earlier.
///
/// If `t` holds fewer bytes than required (the input ended first), the result
/// of `malformed_pair` is returned; otherwise the unit in `t` is combined
/// with `hi` by `r_utf_16_lo`. Decoding resumes with `decode_utf_16be`.
fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
  if self.t_len < self.t_need {
    // NOTE(review): `malformed_pair` is defined outside this view; the `true`
    // flag presumably selects big-endian - confirm.
    self.ret(decode_utf_16be, malformed_pair(true, hi, self.t, 0, self.t_len))
  } else {
    self.ret(decode_utf_16be, r_utf_16_lo(hi, self.t, 0, 1))
  }
}

1	// Copyright 2024 International Digital Economy Academy
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	///\| The Unicode Replacement Character, which is used to replace invalid or unrecognized sequences during lossy decoding.
16	/// https://unicode.org/charts/nameslist/n_FFF0.html
17	pub const U_REP = '\u{FFFD}'
18
19	///\|
20	let utf_8_len = [
21	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
22	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
23	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
24	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
25	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
26	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28	0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
29	2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
30	4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31	]
32
33	///\|
34	fn estimate_utf_16_len(utf_8_bs : FixedArray[Byte]) -> Int {
35	let mut ascii_count = 0	22✔
36	let utf_8_bs_len = utf_8_bs.length()
37	for b in utf_8_bs {
38	// ASCII
39	if b <= 0x7F {
40	ascii_count += 1	58✔
41	}
42	}
43	utf_8_bs_len + ascii_count
44	}
45
46	///\|
47	/// Create and return a `Decoder` for the specified character encoding.
48	///
49	/// The `Decoder` consumes byte sequences and decodes them into the original string format.
50	///
51	/// # Parameters
52	///
53	/// - `encoding`: The character encoding format to be used for decoding the input byte sequences.
54	///
55	/// # Returns
56	///
57	/// A `Decoder` instance that can be used to decode byte sequences into strings.
58	///
59	/// # Examples
60	///
61	/// ```moonbit
62	/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
63	/// let decoder = decoder(UTF8)
64	/// inspect!(decoder.consume!(inputs[0]), content="abc")
65	/// inspect!(decoder.consume!(inputs[1]), content="")
66	/// inspect!(decoder.consume!(inputs[2]), content="🐰")
67	/// assert_true!(decoder.finish!().is_empty())
68	pub fn decoder(encoding : Encoding) -> Decoder {
69	let i = FixedArray::default()	32✔
70	let i_pos = 0
71	let t = FixedArray::make(4, Byte::default())
72	let t_len = 0
73	let t_need = 0
74	let k = match encoding {
75	UTF8 => decode_utf_8	10✔
76	UTF16 => decode_utf_16le	2✔
77	UTF16LE => decode_utf_16le	11✔
78	UTF16BE => decode_utf_16be	9✔
79	}
80	{ i, i_pos, t, t_len, t_need, k, encoding }
81	}
82
83	///\|
84	/// Decode the given byte sequence using the specified `Decoder` and return the resulting string.
85	///
86	/// This function can work in streaming mode where bytes are consumed incrementally.
87	/// When `stream` is `false`, it indicates the end of the input and triggers the final decoding step.
88	///
89	/// # Parameters
90	///
91	/// - `self`: The `Decoder` instance used to decode the byte sequence.
92	/// - `input`: The byte sequence to be decoded.
93	/// - `stream~`: A boolean indicating whether more bytes will be supplied for decoding. It defaults to `false`.
94	///
95	/// # Returns
96	///
97	/// A `String` representing the decoded content from the input byte sequence.
98	///
99	/// # Errors
100	///
101	/// `MalformedError`: when the byte sequence is not properly formatted according to the specified encoding.
102	/// `TruncatedError`: when the byte sequence ends prematurely, implying that more data is expected for complete decoding.
103	///
104	/// # Examples
105	///
106	/// ```moonbit
107	/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
108	/// let decoder = @encoding.decoder(UTF8)
109	/// inspect!(decoder.decode!(inputs[0], stream=true), content="abc")
110	/// inspect!(decoder.decode!(inputs[1], stream=true), content="")
111	/// inspect!(decoder.decode!(inputs[2], stream=false), content="🐰")
112	/// ```
113	pub fn decode!(self : Decoder, input : Bytes, stream~ : Bool = false) -> String {
114	if input.length() > 0 {	76✔
115	self.i_cont(input)	70✔
116	}
117	if self.i_rem() == 0 {
118	return String::default()	6✔
119	}
120	let size_hint = match self.encoding {
121	UTF16 \| UTF16LE \| UTF16BE => self.i_rem()	50✔
122	UTF8 => estimate_utf_16_len(self.i)	20✔
123	}
124	let builder = StringBuilder::new(size_hint~)
125
126	// drive decoder to decode
127	loop self.decode_() {
128	Uchar(u) => {	278✔
129	builder.write_char(u)
130	continue self.decode_()
131	}
132	Malformed(bs) =>
133	if stream && self.t_need > 0 {	4✔
NEW UNCOV 134	builder.to_string()	×
135	} else {
136	raise MalformedError(bs)	4✔
137	}
138	End => builder.to_string()	42✔
139	Refill(t) =>
140	if stream {	24✔
141	builder.to_string()	24✔
142	} else {
UNCOV 143	raise TruncatedError(t)	×
144	}
145	}
146	}
147
148	///\|
149	/// Consume the given byte sequence using the specified `Decoder` and return the resulting string incrementally.
150	///
151	/// This function calls `decode!` with the `stream` parameter set to `true`, indicating that more bytes will follow for decoding.
152	///
153	/// # Parameters
154	///
155	/// - `self`: The `Decoder` instance used to consume the byte sequence.
156	/// - `input`: The byte sequence to be consumed and decoded incrementally.
157	///
158	/// # Returns
159	///
160	/// A `String` representing the partially decoded content from the input byte sequence, as more bytes are expected.
161	///
162	/// # Errors
163	///
164	/// `MalformedError`: when the byte sequence is not properly formatted according to the specified encoding.
165	/// `TruncatedError`: when the byte sequence ends prematurely, implying that more data is expected for complete decoding.
166	pub fn consume!(self : Decoder, input : Bytes) -> String {
167	self.decode!(input, stream=true)	50✔
168	}
169
170	///\|
171	/// Finalize the decoding process and return the remaining decoded string.
172	///
173	/// This function calls `decode!` with the `stream` parameter set to `false`, indicating that no more bytes will be supplied
174	/// and triggering the final decoding step to produce the remaining output.
175	///
176	/// # Parameters
177	///
178	/// - `self`: The `Decoder` instance used to finalize the decoding process.
179	///
180	/// # Returns
181	///
182	/// A `String` representing the final part of the decoded content, after all byte sequences have been processed.
183	///
184	/// # Errors
185	///
186	/// `MalformedError`: This error is raised if the remaining byte sequence is not properly formatted according to the specified encoding.
187	/// `TruncatedError`: This error is raised if the remaining byte sequence ends prematurely, implying that more data was expected for complete decoding.
188	pub fn finish!(self : Decoder) -> String {
189	self.decode!(b"", stream=false)	6✔
190	}
191
192	///\|
193	/// Decode the given byte sequence using the specified `Decoder` and return the resulting string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
194	///
195	/// This function can work in streaming mode where bytes are consumed incrementally.
196	/// When `stream` is `false`, it indicates the end of the input and triggers the final decoding step.
197	///
198	/// # Parameters
199	///
200	/// - `self`: The `Decoder` instance used to decode the byte sequence.
201	/// - `input`: The byte sequence to be decoded.
202	/// - `stream~`: A boolean indicating whether more bytes will be supplied for decoding. It defaults to `false`.
203	///
204	/// # Returns
205	///
206	/// A `String` representing the decoded content from the input byte sequence, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`).
207	pub fn decode_lossy(
208	self : Decoder,
209	input : Bytes,
210	stream~ : Bool = false
211	) -> String {
212	if input.length() > 0 {	6✔
213	self.i_cont(input)	6✔
214	}
215	if self.i_rem() == 0 {
UNCOV 216	return String::default()	×
217	}
218	let size_hint = match self.encoding {
219	UTF16 \| UTF16LE \| UTF16BE => self.i_rem()	4✔
220	UTF8 => estimate_utf_16_len(self.i)	2✔
221	}
222	let builder = StringBuilder::new(size_hint~)
223
224	// drive decoder to decode
225	loop self.decode_() {
226	Uchar(u) => {	6✔
227	builder.write_char(u)
228	continue self.decode_()
229	}
230	Malformed(_) =>
231	if stream && self.t_need > 0 {	9✔
NEW UNCOV 232	builder.to_string()	×
233	} else {
234	builder.write_char(U_REP)	9✔
235	continue self.decode_()
236	}
237	End => builder.to_string()	6✔
238	Refill(_) =>
239	if stream {	3✔
NEW UNCOV 240	builder.to_string()	×
241	} else {
242	continue self.decode_()	3✔
243	}
244	}
245	}
246
247	///\|
248	/// Consume the given byte sequence using the specified `Decoder` and return the resulting string incrementally, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
249	///
250	/// This function calls `decode_lossy` with the `stream` parameter set to `true`, indicating that more bytes will follow for decoding.
251	///
252	/// # Parameters
253	///
254	/// - `self`: The `Decoder` instance used to consume and decode the byte sequence.
255	/// - `input`: The byte sequence to be consumed and decoded incrementally.
256	///
257	/// # Returns
258	///
259	/// A `String` representing the partially decoded content from the input byte sequence, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`), as more bytes are expected.
260	pub fn lossy_consume(self : Decoder, input : Bytes) -> String {
UNCOV 261	self.decode_lossy(input, stream=true)	×
262	}
263
264	///\|
265	/// Finalize the lossy decoding process and return the remaining decoded string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
266	///
267	/// This function calls `decode_lossy` with the `stream` parameter set to `false`, indicating that no more bytes will be supplied
268	/// and triggering the final decoding step to produce the remaining output.
269	///
270	/// # Parameters
271	///
272	/// - `self`: The `Decoder` instance used to finalize the lossy decoding process.
273	///
274	/// # Returns
275	///
276	/// A `String` representing the final part of the decoded content, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`), after all byte sequences have been processed.
277	pub fn lossy_finish(self : Decoder) -> String {
UNCOV 278	self.decode_lossy(b"", stream=false)	×
279	}
280
281	///\|
282	fn i_cont(self : Decoder, input : Bytes) -> Unit {
283	// concat `input` to `i`, drop decoded `i`
284	let i_rem = @math.maximum(self.i_rem(), 0)	76✔
285	let new_len = i_rem + input.length()
286	// init a new `i`
287	let new_i = FixedArray::make(new_len, Byte::default())
288	if i_rem > 0 {
289	// copy the remainder of the old `i` into the new `i`
UNCOV 290	self.i.blit_to(new_i, len=i_rem, src_offset=self.i_pos)	×
291	}
292	// copy all `input` into new `i`, starting at the remainder of the old `i`
293	new_i.blit_from_bytes(i_rem, input, 0, input.length())
294	self.i = new_i
295	// reset position to starting position
296	self.i_pos = 0
297	}
298
299	// Implementations
300
301	///\|
302	fn decode_(self : Decoder) -> Decode {
303	(self.k)(self)	372✔
304	}
305
306	///\|
307	fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
308	self.k = k	324✔
309	v
310	}
311
312	///\|
313	fn i_rem(self : Decoder) -> Int {
314	self.i.length() - self.i_pos	627✔
315	}
316
317	///\|
318	fn t_need(self : Decoder, need : Int) -> Unit {
319	self.t_len = 0	27✔
320	self.t_need = need
321	}
322
323	///\|
324	fn eoi(self : Decoder) -> Unit {
325	self.i = FixedArray::default()	27✔
326	}
327
///|
/// Signals that more input is required.
///
/// The input buffer is reset via `eoi`, `k` is stored as the continuation
/// to resume with, and `Decode::Refill` is returned carrying a `Bytes`
/// copy of the temporary buffer `t`.
fn refill(self : Decoder, k : Cont) -> Decode {
  self.eoi()
  let pending = Bytes::from_fixedarray(self.t)
  let request = Decode::Refill(pending)
  self.ret(k, request)
}
333
///|
/// Moves bytes from the input buffer `i` into the temporary buffer `t`
/// until `t_need` bytes have been collected, then continues with `k`.
///
/// - Negative `i_rem()` (eoi): `k` is called immediately.
/// - Not enough input to complete `t`: everything available is copied and
///   a refill is requested, resuming in `t_fill` with the same `k`.
/// - Otherwise: exactly the missing bytes are copied and `k` is called.
fn t_fill(k : Cont, decoder : Decoder) -> Decode {
  let available = decoder.i_rem()
  if available < 0 {
    // eoi
    return k(decoder)
  }
  let missing = decoder.t_need - decoder.t_len
  let starved = available < missing
  // Copy as much as we can, but never more than what `t` still lacks.
  let count = if starved { available } else { missing }
  decoder.i.blit_to(
    decoder.t,
    len=count,
    dst_offset=decoder.t_len,
    src_offset=decoder.i_pos,
  )
  decoder.i_pos += count
  decoder.t_len += count
  if starved {
    decoder.refill(@tuple.curry(t_fill)(k))
  } else {
    k(decoder)
  }
}
361
362	// UTF8
363
///|
/// Decodes the next UTF-8 sequence from the input buffer.
///
/// Returns `Decode::End` when no input remains. The expected sequence
/// length is looked up in `utf_8_len` by the lead byte; if the buffer holds
/// fewer bytes than that, collection continues through the temporary buffer
/// (`t_fill`). A table length of 0 marks an invalid lead byte, which is
/// consumed alone and reported via `malformed`.
fn decode_utf_8(self : Decoder) -> Decode {
  let rem = self.i_rem()
  if rem <= 0 {
    return Decode::End
  }
  let start = self.i_pos
  let lead = self.i[start].to_int()
  let need = utf_8_len[lead]
  if rem < need {
    // The sequence straddles the end of the buffer: gather it in `t`.
    self.t_need(need)
    return t_fill(t_decode_utf_8, self)
  }
  if need == 0 {
    self.i_pos += 1
    self.ret(decode_utf_8, malformed(self.i, start, 1))
  } else {
    self.i_pos += need
    self.ret(decode_utf_8, r_utf_8(self.i, start, need))
  }
}
387
///|
/// Continuation used after `t_fill` for UTF-8: decodes the sequence held in
/// the temporary buffer `t`.
///
/// If fewer than `t_need` bytes were collected, the collected bytes are
/// reported via `malformed`; otherwise they are decoded with `r_utf_8`.
/// Either way, decoding resumes with `decode_utf_8`.
fn t_decode_utf_8(self : Decoder) -> Decode {
  let collected = self.t_len
  let outcome = if collected < self.t_need {
    malformed(self.t, 0, collected)
  } else {
    r_utf_8(self.t, 0, collected)
  }
  self.ret(decode_utf_8, outcome)
}
396
///|
/// Decodes one UTF-8 sequence of `length` bytes (1 to 4) starting at
/// `bytes[offset]`.
///
/// Returns `Uchar` with the assembled code point when the trailing bytes
/// pass the checks below, and `malformed(bytes, offset, length)` otherwise.
/// Any other `length` panics.
fn r_utf_8(bytes : FixedArray[Byte], offset : Int, length : Int) -> Decode {
  // Wrap an assembled code point.
  fn scalar(c : Int) -> Decode {
    Uchar(Char::from_int(c))
  }

  // True when `b` has the continuation-byte shape 0b10xxxxxx.
  fn is_cont(b : Int) -> Bool {
    (b >> 6) == 0b10
  }

  match length {
    1 => scalar(bytes[offset].to_int())
    2 => {
      let lead = bytes[offset].to_int()
      let tail = bytes[offset + 1].to_int()
      if is_cont(tail) {
        scalar(((lead & 0x1F) << 6) | (tail & 0x3F))
      } else {
        malformed(bytes, offset, length)
      }
    }
    3 => {
      let b0 = bytes[offset].to_int()
      let b1 = bytes[offset + 1].to_int()
      let b2 = bytes[offset + 2].to_int()
      // The allowed range of the second byte depends on the lead byte.
      let second_ok = match b0 {
        0xE0 => b1 >= 0xA0 && b1 <= 0xBF
        0xED => b1 >= 0x80 && b1 <= 0x9F
        _ => is_cont(b1)
      }
      if is_cont(b2) && second_ok {
        scalar(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F))
      } else {
        malformed(bytes, offset, length)
      }
    }
    4 => {
      let b0 = bytes[offset].to_int()
      let b1 = bytes[offset + 1].to_int()
      let b2 = bytes[offset + 2].to_int()
      let b3 = bytes[offset + 3].to_int()
      // The allowed range of the second byte depends on the lead byte.
      let second_ok = match b0 {
        0xF0 => b1 >= 0x90 && b1 <= 0xBF
        0xF4 => b1 >= 0x80 && b1 <= 0x8F
        _ => is_cont(b1)
      }
      if is_cont(b3) && is_cont(b2) && second_ok {
        scalar(
          ((b0 & 0x07) << 18) |
          ((b1 & 0x3F) << 12) |
          ((b2 & 0x3F) << 6) |
          (b3 & 0x3F),
        )
      } else {
        malformed(bytes, offset, length)
      }
    }
    _ => panic()
  }
}
481
482	// UTF16LE
483
///|
/// Result of reading a single 16-bit UTF-16 code unit (see `r_utf_16`).
priv enum UTF16Decode {
  /// A high surrogate (0xD800..=0xDBFF); a low surrogate must follow.
  Hi(Int)
  /// A unit that cannot start a character, with the offending bytes.
  UTF16Malformed(Bytes)
  /// A unit outside the surrogate range, already a complete character.
  UTF16Uchar(Char)
}
490
///|
/// Decodes the next UTF-16LE code unit from the input buffer.
///
/// Returns `Decode::End` when no input remains. With a single byte left,
/// the unit is gathered through the temporary buffer (`t_fill`). Otherwise
/// two bytes are consumed (second byte is the high-order one) and the unit
/// is passed to `decode_utf_16le_lo`.
fn decode_utf_16le(self : Decoder) -> Decode {
  let rem = self.i_rem()
  if rem <= 0 {
    return Decode::End
  }
  if rem < 2 {
    self.t_need(2)
    return t_fill(t_decode_utf_16le, self)
  }
  let start = self.i_pos
  self.i_pos += 2
  let unit = r_utf_16(self.i, start + 1, start)
  self.decode_utf_16le_lo(unit)
}
505
///|
/// Continuation used after `t_fill` for UTF-16LE: interprets the code unit
/// held in the temporary buffer `t`.
///
/// If fewer than `t_need` bytes were collected, they are reported via
/// `malformed` and decoding resumes with `decode_utf_16le`.
fn t_decode_utf_16le(self : Decoder) -> Decode {
  if self.t_len < self.t_need {
    return self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len))
  }
  let unit = r_utf_16(self.t, 1, 0)
  self.decode_utf_16le_lo(unit)
}
514
///|
/// Finishes a UTF-16LE decoding step given the classification `v` of the
/// unit just read.
///
/// Complete characters and malformed units are returned directly. For a
/// high surrogate, the following unit is read as the low half: through the
/// temporary buffer when fewer than two bytes remain, otherwise straight
/// from the input buffer. In the latter case the two bytes are consumed
/// only when the pair decodes to a `Uchar`.
fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
  match v {
    Hi(hi) => {
      if self.i_rem() < 2 {
        self.t_need(2)
        return t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self)
      }
      let start = self.i_pos
      let outcome = r_utf_16_lo(hi, self.i, start + 1, start)
      // Leave the unit in place unless it really was the low surrogate.
      match outcome {
        Uchar(_) => self.i_pos += 2
        _ => ()
      }
      self.ret(decode_utf_16le, outcome)
    }
    UTF16Malformed(s) => self.ret(decode_utf_16le, Malformed(s))
    UTF16Uchar(u) => self.ret(decode_utf_16le, Uchar(u))
  }
}
537
///|
/// Continuation used after `t_fill` while waiting for the low surrogate of
/// a UTF-16LE pair whose high surrogate is `hi`.
///
/// If fewer than `t_need` bytes were collected, the result comes from
/// `malformed_pair`; otherwise the unit in the temporary buffer `t` is
/// combined with `hi` by `r_utf_16_lo`. Decoding resumes with
/// `decode_utf_16le`.
fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
  let outcome = if decoder.t_len < decoder.t_need {
    malformed_pair(false, hi, decoder.t, 0, decoder.t_len)
  } else {
    r_utf_16_lo(hi, decoder.t, 1, 0)
  }
  decoder.ret(decode_utf_16le, outcome)
}
549
///|
/// Combines the high surrogate `hi` with the 16-bit unit read from `bytes`
/// into a supplementary-plane character.
///
/// # Parameters
///
/// - `hi`: the high surrogate read previously.
/// - `bytes`: buffer holding the candidate low surrogate.
/// - `offset0`: index of the high-order byte of the unit.
/// - `offset1`: index of the low-order byte of the unit.
///
/// # Returns
///
/// `Uchar` with the decoded code point when the unit is a low surrogate
/// (0xDC00..=0xDFFF); otherwise `Malformed` carrying the two bytes at
/// `offset0` and `offset1`.
fn r_utf_16_lo(
  hi : Int,
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> Decode {
  let b0 = bytes[offset0].to_int()
  let b1 = bytes[offset1].to_int()
  let lo = (b0 << 8) | b1
  if lo < 0xDC00 || lo > 0xDFFF {
    // NOTE(jinser): only hi malformed, skip lo if lo is illegal
    //
    // For example, b"\xD8\x00\x00\x48" (BE)
    // Since \xD8\x00 is legal hi, here will try to parse lo next,
    // however the whole \xD8\x00\x00\x48 is illegal so the result will be a `Malformed[b"\xD8\x00\x00\x48"]`
    //
    // But \x00\x48 itself is a legal UTF16 code point with a value of `H`,
    // the ideal result should be: `[Malformed(b"\xD8\x00"), Uchar('H')]`
    //
    // > printf '\xD8\x00\x00\x48' | uconv --from-code UTF16BE --to-code UTF8 --from-callback substitute
    // �H
    Malformed([bytes[offset0], bytes[offset1]])
  } else {
    // The 0x10000 offset must be ADDED to the combined 20-bit value. OR-ing
    // it in beforehand loses the carry whenever bit 16 of the shifted high
    // half is already set (e.g. D840 DC00 must give U+20000, not U+10000).
    let high_bits = (hi & 0x3FF) << 10
    let low_bits = lo & 0x3FF
    Uchar(Char::from_int((high_bits | low_bits) + 0x10000))
  }
}
577
///|
/// Reads one 16-bit UTF-16 code unit and classifies it.
///
/// `offset0` is the index of the high-order byte and `offset1` the index of
/// the low-order byte, so the same routine serves both byte orders.
///
/// - 0xD800..=0xDBFF: `Hi` (high surrogate, a low surrogate must follow).
/// - 0xDC00..=0xDFFF: `UTF16Malformed` with the two bytes of the unit.
/// - anything else: `UTF16Uchar` with the unit as the character.
fn r_utf_16(
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> UTF16Decode {
  let high = bytes[offset0].to_int()
  let low = bytes[offset1].to_int()
  let unit = (high << 8) | low
  if unit >= 0xD800 && unit <= 0xDBFF {
    Hi(unit)
  } else if unit >= 0xDC00 && unit <= 0xDFFF {
    // A lone low surrogate: report its two bytes, starting at the lower index.
    UTF16Malformed(slice(bytes, @math.minimum(offset0, offset1), 2))
  } else {
    UTF16Uchar(Char::from_int(unit))
  }
}
595
596	// UTF16BE
597
///|
/// Decodes the next UTF-16BE code unit from the input buffer.
///
/// Returns `Decode::End` when no input remains. With a single byte left,
/// the unit is gathered through the temporary buffer (`t_fill`). Otherwise
/// two bytes are consumed (first byte is the high-order one) and the unit
/// is passed to `decode_utf_16be_lo`.
fn decode_utf_16be(self : Decoder) -> Decode {
  let rem = self.i_rem()
  if rem <= 0 {
    return Decode::End
  }
  if rem < 2 {
    self.t_need(2)
    return t_fill(t_decode_utf_16be, self)
  }
  let start = self.i_pos
  self.i_pos += 2
  let unit = r_utf_16(self.i, start, start + 1)
  self.decode_utf_16be_lo(unit)
}
612
///|
/// Continuation used after `t_fill` for UTF-16BE: interprets the code unit
/// held in the temporary buffer `t`.
///
/// If fewer than `t_need` bytes were collected, they are reported via
/// `malformed` and decoding resumes with `decode_utf_16be`.
fn t_decode_utf_16be(self : Decoder) -> Decode {
  if self.t_len < self.t_need {
    return self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len))
  }
  let unit = r_utf_16(self.t, 0, 1)
  self.decode_utf_16be_lo(unit)
}
621
///|
/// Finishes a UTF-16BE decoding step given the classification `decode` of
/// the unit just read.
///
/// Complete characters and malformed units are returned directly. For a
/// high surrogate, the following unit is read as the low half: through the
/// temporary buffer when fewer than two bytes remain, otherwise straight
/// from the input buffer. In the latter case the two bytes are consumed
/// only when the pair decodes to a `Uchar`.
fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
  match decode {
    Hi(hi) => {
      if self.i_rem() < 2 {
        self.t_need(2)
        return t_fill(@tuple.curry(t_decode_utf_16be_lo)(hi), self)
      }
      let start = self.i_pos
      let outcome = r_utf_16_lo(hi, self.i, start, start + 1)
      // Leave the unit in place unless it really was the low surrogate.
      match outcome {
        Uchar(_) => self.i_pos += 2
        _ => ()
      }
      self.ret(decode_utf_16be, outcome)
    }
    UTF16Malformed(x) => self.ret(decode_utf_16be, Malformed(x))
    UTF16Uchar(x) => self.ret(decode_utf_16be, Uchar(x))
  }
}
644
///|
/// Continuation used after `t_fill` while waiting for the low surrogate of
/// a UTF-16BE pair whose high surrogate is `hi`.
///
/// If fewer than `t_need` bytes were collected, the result comes from
/// `malformed_pair`; otherwise the unit in the temporary buffer `t` is
/// combined with `hi` by `r_utf_16_lo`. Decoding resumes with
/// `decode_utf_16be`.
fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
  let outcome = if self.t_len < self.t_need {
    malformed_pair(true, hi, self.t, 0, self.t_len)
  } else {
    r_utf_16_lo(hi, self.t, 0, 1)
  }
  self.ret(decode_utf_16be, outcome)
}

moonbitlang / x / 383

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous