moonbitlang / x / 387

Committed 10 Mar 2025 10:05AM UTC coverage: 88.472% (-1.1%) from 89.592%

Build # 387

Build Type

Pull #112

github

Committed by

web-flow

Commit Message

Merge f19595cde into 2fdae536b

Pull Request Pull Request #112: perf(encoding): optimize char accumulation

Run Details

2 of 19 new or added lines in 2 files covered. (10.53%)

9 existing lines in 1 file now uncovered.

1274 of 1440 relevant lines covered (88.47%)

429.39 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

78.03

/encoding/decoding.mbt

// Copyright 2024 International Digital Economy Academy
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

///|
/// The Unicode Replacement Character (`U+FFFD`).
///
/// The lossy decoding functions emit this character in place of each
/// malformed sequence they encounter.
///
/// See https://unicode.org/charts/nameslist/n_FFF0.html
pub const U_REP = '\u{FFFD}'

///|
/// Total length in bytes of a UTF-8 sequence, indexed by the value of its
/// first byte (256 entries):
///
/// - `0x00..=0x7F` => 1
/// - `0x80..=0xC1` => 0 (cannot start a sequence)
/// - `0xC2..=0xDF` => 2
/// - `0xE0..=0xEF` => 3
/// - `0xF0..=0xF4` => 4
/// - `0xF5..=0xFF` => 0 (cannot start a sequence)
///
/// `decode_utf_8` reports a single malformed byte when the entry is 0.
let utf_8_len = [
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
  4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]

///|
/// Build a fresh `Decoder` for the given character encoding.
///
/// The returned decoder starts with a default-initialised input buffer, a
/// four-byte scratch buffer for sequences that are split across inputs, and
/// the decoding routine matching `encoding` (`UTF16` shares the routine used
/// for `UTF16LE`).
///
/// # Parameters
///
/// - `encoding`: The character encoding of the byte sequences that will be fed to the decoder.
///
/// # Returns
///
/// A `Decoder` ready to turn byte sequences into strings.
///
/// # Examples
///
/// ```moonbit
/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
/// let decoder = decoder(UTF8)
/// inspect!(decoder.consume!(inputs[0]), content="abc")
/// inspect!(decoder.consume!(inputs[1]), content="")
/// inspect!(decoder.consume!(inputs[2]), content="🐰")
/// assert_true!(decoder.finish!().is_empty())
/// ```
pub fn decoder(encoding : Encoding) -> Decoder {
  // Pick the entry-point continuation for the requested encoding.
  let k = match encoding {
    UTF8 => decode_utf_8
    UTF16 | UTF16LE => decode_utf_16le
    UTF16BE => decode_utf_16be
  }
  {
    i: FixedArray::default(),
    i_pos: 0,
    t: FixedArray::make(4, Byte::default()),
    t_len: 0,
    t_need: 0,
    k,
  }
}

///|
/// Decode `input` with this `Decoder` and return the decoded text.
///
/// The decoder can be driven incrementally: pass `stream=true` while more
/// bytes will follow, and `stream=false` (the default) for the last piece of
/// input, which triggers the final decoding step.
///
/// # Parameters
///
/// - `self`: The `Decoder` that performs the decoding and keeps any pending bytes.
/// - `input`: The bytes to decode; they are appended to whatever is still undecoded.
/// - `stream~`: Whether more bytes will be supplied later. Defaults to `false`.
///
/// # Returns
///
/// The `String` decoded by this call.
///
/// # Errors
///
/// `MalformedError`: when the byte sequence is not well-formed for the decoder's encoding.
/// `TruncatedError`: when `stream` is `false` and the input ends in the middle of a sequence.
///
/// # Examples
///
/// ```moonbit
/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
/// let decoder = @encoding.decoder(UTF8)
/// inspect!(decoder.decode!(inputs[0], stream=true), content="abc")
/// inspect!(decoder.decode!(inputs[1], stream=true), content="")
/// inspect!(decoder.decode!(inputs[2], stream=false), content="🐰")
/// ```
pub fn decode!(self : Decoder, input : Bytes, stream~ : Bool = false) -> String {
  // Queue the new bytes behind whatever is still waiting to be decoded.
  if input.length() != 0 {
    self.i_cont(input)
  }
  // Nothing buffered: there is nothing to decode.
  if self.i_rem() == 0 {
    return ""
  }

  // TODO: Estimate size_hint based on input and encoding more accurately
  let out = StringBuilder::new(size_hint=input.length())

  // Step the decoder until it reports the end of input, asks for more bytes,
  // or hits a malformed sequence.
  loop self.decode_() {
    End => out.to_string()
    Uchar(ch) => {
      out.write_char(ch)
      continue self.decode_()
    }
    Refill(pending) =>
      if not(stream) {
        raise TruncatedError(pending)
      } else {
        out.to_string()
      }
    Malformed(bad) =>
      if not(stream) || self.t_need <= 0 {
        raise MalformedError(bad)
      } else {
        out.to_string()
      }
  }
}

///|
/// Feed `input` to the `Decoder` and return whatever text could be decoded so far.
///
/// This is `decode!` with `stream=true`: more bytes are expected to follow, so
/// a sequence that is cut off at the end of `input` is kept by the decoder
/// and completed by a later call.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to consume the byte sequence.
/// - `input`: The byte sequence to be consumed and decoded incrementally.
///
/// # Returns
///
/// A `String` holding the text decoded by this call; it may be empty when
/// `input` only contains the beginning of a sequence.
///
/// # Errors
///
/// `MalformedError`: when the byte sequence is not properly formatted according to the specified encoding.
///
/// `TruncatedError` is not raised by this function: with `stream=true`,
/// `decode!` returns the text decoded so far when it runs out of input in the
/// middle of a sequence.
pub fn consume!(self : Decoder, input : Bytes) -> String {
  self.decode!(input, stream=true)
}

///|
/// Finalize the decoding process and return the remaining decoded string.
///
/// This is `decode!` called with an empty byte string and `stream=false`:
/// no more bytes will be supplied, so whatever the decoder still holds is
/// decoded in a final step.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to finalize the decoding process.
///
/// # Returns
///
/// A `String` representing the final part of the decoded content. It is
/// empty when the decoder has no bytes left to decode.
///
/// # Errors
///
/// `MalformedError`: raised if the remaining byte sequence is not properly formatted according to the specified encoding.
/// `TruncatedError`: raised if the remaining byte sequence ends prematurely, implying that more data was expected for complete decoding.
pub fn finish!(self : Decoder) -> String {
  self.decode!(b"", stream=false)
}

///|
/// Decode the given byte sequence using the specified `Decoder` and return the resulting string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
///
/// This function can work in streaming mode where bytes are consumed incrementally.
/// When `stream` is `false`, it indicates the end of the input and triggers the final decoding step.
///
/// Unlike `decode!`, this function never raises: malformed sequences become
/// `U_REP`, and a sequence cut off at the end of the final input is reported
/// by the decoder as malformed and replaced as well.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to decode the byte sequence.
/// - `input`: The byte sequence to be decoded.
/// - `stream~`: A boolean indicating whether more bytes will be supplied for decoding. It defaults to `false`.
///
/// # Returns
///
/// A `String` representing the decoded content from the input byte sequence, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`).
pub fn decode_lossy(
  self : Decoder,
  input : Bytes,
  stream~ : Bool = false
) -> String {
  if input.length() > 0 {
    self.i_cont(input)
  }
  if self.i_rem() == 0 {
    return String::default()
  }

  // Accumulate straight into a `StringBuilder`, the same way `decode!` does,
  // instead of collecting an intermediate array of chars and converting it
  // to a string at the end.
  let builder = StringBuilder::new(size_hint=input.length())

  // drive decoder to decode
  loop self.decode_() {
    Uchar(u) => {
      builder.write_char(u)
      continue self.decode_()
    }
    Malformed(_) =>
      // NOTE(review): in streaming mode with `t_need > 0` the malformed
      // sequence is dropped without a replacement character — confirm that
      // this is intended.
      if stream && self.t_need > 0 {
        builder.to_string()
      } else {
        builder.write_char(U_REP)
        continue self.decode_()
      }
    End => builder.to_string()
    Refill(_) =>
      if stream {
        // More input will come; hand back what has been decoded so far.
        builder.to_string()
      } else {
        // No more input: keep stepping so the decoder can report the
        // incomplete sequence.
        continue self.decode_()
      }
  }
}

///|
/// Feed `input` to the `Decoder` and return the text decoded so far, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
///
/// This is `decode_lossy` with `stream=true`: more bytes are expected to
/// follow, so a sequence that is cut off at the end of `input` is kept by the
/// decoder and completed by a later call.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to consume and decode the byte sequence.
/// - `input`: The byte sequence to be consumed and decoded incrementally.
///
/// # Returns
///
/// A `String` holding the text decoded by this call, with invalid sequences
/// replaced by `U+FFFD`.
pub fn lossy_consume(self : Decoder, input : Bytes) -> String {
  self.decode_lossy(input, stream=true)
}

///|
/// Finalize the lossy decoding process and return the remaining decoded string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
///
/// This is `decode_lossy` called with an empty byte string and
/// `stream=false`: no more bytes will be supplied, so whatever the decoder
/// still holds is decoded in a final step.
///
/// # Parameters
///
/// - `self`: The `Decoder` instance used to finalize the lossy decoding process.
///
/// # Returns
///
/// A `String` representing the final part of the decoded content, with
/// invalid sequences replaced by `U+FFFD`. It is empty when the decoder has
/// no bytes left to decode.
pub fn lossy_finish(self : Decoder) -> String {
  self.decode_lossy(b"", stream=false)
}

///|
/// Append `input` to the decoder's input buffer.
///
/// A new buffer is allocated that holds the not-yet-read tail of the old
/// buffer followed by all of `input`; bytes that were already read are
/// dropped and the read position restarts at 0.
fn i_cont(self : Decoder, input : Bytes) -> Unit {
  // `i_rem` can be negative once the buffer has been discarded; treat that
  // as "nothing carried over".
  let carried = @math.maximum(self.i_rem(), 0)
  let incoming = input.length()
  let merged = FixedArray::make(carried + incoming, Byte::default())
  // Unread tail of the old buffer goes first ...
  if carried > 0 {
    self.i.blit_to(merged, len=carried, src_offset=self.i_pos)
  }
  // ... followed by the new bytes.
  merged.blit_from_bytes(carried, input, 0, incoming)
  self.i_pos = 0
  self.i = merged
}

// Implementations

///|
/// Run one decoding step: invoke the continuation currently stored in
/// `self.k` on this decoder and return the `Decode` event it produces.
fn decode_(self : Decoder) -> Decode {
  (self.k)(self)
}

///|
/// Install `k` as the continuation for the next decoding step and return `v`
/// unchanged, so a step can both schedule what runs next and yield its event.
fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
  self.k = k
  v
}

///|
/// Number of bytes of the input buffer `i` that have not been read yet,
/// computed as `i.length() - i_pos`.
///
/// `eoi` replaces `i` without touching `i_pos`, and `t_fill` treats a
/// negative result as the end of input.
fn i_rem(self : Decoder) -> Int {
  self.i.length() - self.i_pos
}

///|
/// Start collecting a sequence in the scratch buffer `t`: reset the number of
/// bytes gathered so far (`t_len`) to 0 and record in the `t_need` field that
/// `need` bytes are required in total.
fn t_need(self : Decoder, need : Int) -> Unit {
  self.t_len = 0
  self.t_need = need
}

///|
/// Discard the current input buffer by replacing `i` with a default
/// `FixedArray`. The read position `i_pos` is left as it was.
fn eoi(self : Decoder) -> Unit {
  self.i = FixedArray::default()
}

///|
/// Ask the caller for more input: discard the exhausted input buffer, install
/// `k` as the continuation to resume with, and return a `Refill` event
/// carrying the contents of the scratch buffer `t` as `Bytes`.
///
/// NOTE(review): the whole of `t` is converted, not only its first `t_len`
/// bytes — confirm that consumers of the `Refill` payload expect this.
fn refill(self : Decoder, k : Cont) -> Decode {
  self.eoi()
  self.ret(k, Decode::Refill(Bytes::from_fixedarray(self.t)))
}

///|
/// Move bytes from the input buffer into the scratch buffer `t` until `t`
/// holds `t_need` bytes, then continue with `k`.
///
/// - If the input has ended (negative remainder), `k` runs right away with
///   whatever `t` holds.
/// - If the input cannot supply all the missing bytes, everything available
///   is moved and a refill is requested that resumes this same fill.
/// - Otherwise exactly the missing bytes are moved and `k` runs.
fn t_fill(k : Cont, decoder : Decoder) -> Decode {
  // Copy `count` bytes from the read position of `i` to the end of `t` and
  // advance both cursors.
  fn take(d : Decoder, count : Int) -> Unit {
    d.i.blit_to(d.t, len=count, dst_offset=d.t_len, src_offset=d.i_pos)
    d.i_pos += count
    d.t_len += count
  }

  let available = decoder.i_rem()
  if available < 0 {
    // end of input
    return k(decoder)
  }
  let missing = decoder.t_need - decoder.t_len
  if available >= missing {
    take(decoder, missing)
    k(decoder)
  } else {
    take(decoder, available)
    decoder.refill(@tuple.curry(t_fill)(k))
  }
}

// UTF8

///|
/// Decode the next UTF-8 sequence from the input buffer.
///
/// Returns `End` when no input is left. The first byte selects the sequence
/// length through `utf_8_len`; if the buffer holds fewer bytes than that, the
/// sequence is gathered through the scratch buffer instead. A first byte with
/// length 0 is consumed on its own and reported as malformed.
fn decode_utf_8(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  let start = self.i_pos
  let need = utf_8_len[self.i[start].to_int()]
  if available < need {
    // The sequence is split across inputs: collect it in `t`.
    self.t_need(need)
    return t_fill(t_decode_utf_8, self)
  }
  if need == 0 {
    // Not a valid first byte: skip just this byte.
    self.i_pos = start + 1
    self.ret(decode_utf_8, malformed(self.i, start, 1))
  } else {
    self.i_pos = start + need
    self.ret(decode_utf_8, r_utf_8(self.i, start, need))
  }
}

///|
/// Decode the UTF-8 sequence gathered in the scratch buffer `t`.
///
/// When fewer than `t_need` bytes were collected, the collected bytes are
/// reported as malformed; otherwise they are decoded with `r_utf_8`. Either
/// way decoding continues with `decode_utf_8`.
fn t_decode_utf_8(self : Decoder) -> Decode {
  let gathered = self.t_len
  let outcome = if gathered < self.t_need {
    malformed(self.t, 0, gathered)
  } else {
    r_utf_8(self.t, 0, gathered)
  }
  self.ret(decode_utf_8, outcome)
}

///|
/// Decode one UTF-8 sequence of `length` bytes starting at `bytes[offset]`.
///
/// Returns `Uchar` with the decoded character when the sequence is valid and
/// the result of `malformed(bytes, offset, length)` otherwise. The bytes after
/// the first must be continuation bytes (`0b10xxxxxx`); in addition the second
/// byte is restricted for some first bytes:
///
/// - `0xE0`: second byte in `0xA0..=0xBF`
/// - `0xED`: second byte in `0x80..=0x9F`
/// - `0xF0`: second byte in `0x90..=0xBF`
/// - `0xF4`: second byte in `0x80..=0x8F`
///
/// Panics when `length` is not 1, 2, 3 or 4.
fn r_utf_8(bytes : FixedArray[Byte], offset : Int, length : Int) -> Decode {
  // True when `b` lies in the inclusive range `lo..=hi`.
  fn within(b : Int, lo : Int, hi : Int) -> Bool {
    lo <= b && b <= hi
  }

  // True when `b` has the continuation-byte shape 0b10xxxxxx.
  fn is_cont(b : Int) -> Bool {
    (b >> 6) == 0b10
  }

  // Yield the character for `c` when the sequence is valid, otherwise report
  // the whole sequence as malformed.
  fn conclude(valid : Bool, c : Int) -> Decode {
    if valid {
      Uchar(Char::from_int(c))
    } else {
      malformed(bytes, offset, length)
    }
  }

  match length {
    1 => conclude(true, bytes[offset].to_int())
    2 => {
      let lead = bytes[offset].to_int()
      let b1 = bytes[offset + 1].to_int()
      conclude(is_cont(b1), ((lead & 0x1F) << 6) | (b1 & 0x3F))
    }
    3 => {
      let lead = bytes[offset].to_int()
      let b1 = bytes[offset + 1].to_int()
      let b2 = bytes[offset + 2].to_int()
      let c = ((lead & 0x0F) << 12) | (((b1 & 0x3F) << 6) | (b2 & 0x3F))
      let second_ok = match lead {
        0xE0 => within(b1, 0xA0, 0xBF)
        0xED => within(b1, 0x80, 0x9F)
        _ => is_cont(b1)
      }
      conclude(is_cont(b2) && second_ok, c)
    }
    4 => {
      let lead = bytes[offset].to_int()
      let b1 = bytes[offset + 1].to_int()
      let b2 = bytes[offset + 2].to_int()
      let b3 = bytes[offset + 3].to_int()
      let c = ((lead & 0x07) << 18) |
        ((b1 & 0x3F) << 12) |
        ((b2 & 0x3F) << 6) |
        (b3 & 0x3F)
      let second_ok = match lead {
        0xF0 => within(b1, 0x90, 0xBF)
        0xF4 => within(b1, 0x80, 0x8F)
        _ => is_cont(b1)
      }
      conclude(is_cont(b3) && is_cont(b2) && second_ok, c)
    }
    _ => panic()
  }
}

// UTF16LE

///|
/// Outcome of reading a single 16-bit UTF-16 code unit (produced by
/// `r_utf_16`).
priv enum UTF16Decode {
  // The unit is a high surrogate (0xD800..=0xDBFF); its value is carried so
  // that it can be combined with the low surrogate that must follow.
  Hi(Int)
  // The unit is a low surrogate (0xDC00..=0xDFFF) with no preceding high
  // surrogate; carries the bytes to report as malformed.
  UTF16Malformed(Bytes)
  // The unit on its own encodes a character.
  UTF16Uchar(Char)
}

///|
/// Decode the next UTF-16LE code unit from the input buffer.
///
/// Returns `End` when no input is left. A single leftover byte is gathered
/// through the scratch buffer; otherwise two bytes are consumed, read as a
/// little-endian unit (second byte is the high-order byte) and passed on to
/// `decode_utf_16le_lo`.
fn decode_utf_16le(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  if available == 1 {
    // Only half of a code unit is available: collect it in `t`.
    self.t_need(2)
    return t_fill(t_decode_utf_16le, self)
  }
  let start = self.i_pos
  self.i_pos = start + 2
  self.decode_utf_16le_lo(r_utf_16(self.i, start + 1, start))
}

///|
/// Decode the UTF-16LE code unit gathered in the scratch buffer `t`.
///
/// When fewer than `t_need` bytes were collected, the collected bytes are
/// reported as malformed and decoding continues with `decode_utf_16le`;
/// otherwise the unit is read from `t` and passed to `decode_utf_16le_lo`.
fn t_decode_utf_16le(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    return self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0))
  }
  self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len))
}

///|
/// Finish decoding after a UTF-16LE code unit has been read.
///
/// A complete character or a malformed unit is returned as is. For a high
/// surrogate the following unit is read as the low surrogate: through the
/// scratch buffer when fewer than two bytes remain, directly from the input
/// otherwise. In the direct case the two bytes are consumed only when the
/// pair decodes to a character, so a unit that is not a low surrogate is
/// read again by the next step.
fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
  let hi = match v {
    UTF16Uchar(u) => return self.ret(decode_utf_16le, Uchar(u))
    UTF16Malformed(s) => return self.ret(decode_utf_16le, Malformed(s))
    Hi(unit) => unit
  }
  if self.i_rem() < 2 {
    self.t_need(2)
    return t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self)
  }
  let start = self.i_pos
  let outcome = r_utf_16_lo(hi, self.i, start + 1, start)
  match outcome {
    Uchar(_) => self.i_pos = start + 2
    _ => ()
  }
  self.ret(decode_utf_16le, outcome)
}

///|
/// Combine the high surrogate `hi` with the UTF-16LE low unit gathered in the
/// scratch buffer `t`.
///
/// When fewer than `t_need` bytes were collected, the result of
/// `malformed_pair` for `hi` and the collected bytes is returned; otherwise
/// the unit in `t` is combined with `hi` by `r_utf_16_lo`. Decoding continues
/// with `decode_utf_16le`.
fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
  let outcome = if decoder.t_len < decoder.t_need {
    malformed_pair(false, hi, decoder.t, 0, decoder.t_len)
  } else {
    r_utf_16_lo(hi, decoder.t, 1, 0)
  }
  decoder.ret(decode_utf_16le, outcome)
}

///|
/// Read the code unit that follows the high surrogate `hi` and combine the
/// two into a character.
///
/// # Parameters
///
/// - `hi`: The high surrogate (0xD800..=0xDBFF) read before this unit.
/// - `bytes`: The buffer holding the unit.
/// - `offset0`: Index of the unit's high-order byte.
/// - `offset1`: Index of the unit's low-order byte.
///
/// # Returns
///
/// `Uchar` with the decoded supplementary character when the unit is a low
/// surrogate (0xDC00..=0xDFFF), `Malformed` otherwise.
fn r_utf_16_lo(
  hi : Int,
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> Decode {
  let b0 = bytes[offset0].to_int()
  let b1 = bytes[offset1].to_int()
  let lo = (b0 << 8) | b1
  if lo < 0xDC00 || lo > 0xDFFF {
    // NOTE(jinser): only hi malformed, skip lo if lo is illegal
    //
    // For example, b"\xD8\x00\x00\x48" (BE)
    // Since \xD8\x00 is *legal* hi, here will try to parse lo next,
    // however the whole \xD8\x00\x00\x48 is *illegal* so the result will be a `Malformed[b"\xD8\x00\x00\x48"]`
    //
    // But \x00\x48 itself is a *legal* UTF16 code point with a value of `H`,
    // the ideal result should be: `[Malformed(b"\xD8\x00"), Uchar('H')]`
    //
    // > printf '\xD8\x00\x00\x48' | uconv --from-code UTF16BE --to-code UTF8 --from-callback substitute
    // �H
    //
    // NOTE(review): the payload built here holds the two bytes that were just
    // read (the would-be low unit), not the bytes of `hi` — confirm that this
    // is what consumers of `Malformed` expect.
    Malformed([bytes[offset0], bytes[offset1]])
  } else {
    // Join the two 10-bit halves first and only then add the 0x10000 offset.
    // OR-ing the offset in (as was done before) loses it whenever bit 16 is
    // already set by the high half, e.g. D840 DC00 gave U+10000 instead of
    // U+20000.
    let scalar = (((hi & 0x3FF) << 10) | (lo & 0x3FF)) + 0x10000
    Uchar(Char::from_int(scalar))
  }
}

///|
/// Read one 16-bit code unit from `bytes` and classify it.
///
/// `offset0` is the index of the high-order byte and `offset1` the index of
/// the low-order byte, which lets the same routine serve both byte orders.
///
/// - A high surrogate (0xD800..=0xDBFF) yields `Hi` with the unit's value.
/// - A low surrogate (0xDC00..=0xDFFF) yields `UTF16Malformed` with the result
///   of `slice` over two bytes starting at the smaller of the two offsets.
/// - Any other value yields `UTF16Uchar` with the corresponding character.
fn r_utf_16(
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> UTF16Decode {
  let high_byte = bytes[offset0].to_int()
  let low_byte = bytes[offset1].to_int()
  let unit = (high_byte << 8) | low_byte
  if 0xD800 <= unit && unit <= 0xDBFF {
    Hi(unit)
  } else if 0xDC00 <= unit && unit <= 0xDFFF {
    UTF16Malformed(slice(bytes, @math.minimum(offset0, offset1), 2))
  } else {
    UTF16Uchar(Char::from_int(unit))
  }
}

// UTF16BE

///|
/// Decode the next UTF-16BE code unit from the input buffer.
///
/// Returns `End` when no input is left. A single leftover byte is gathered
/// through the scratch buffer; otherwise two bytes are consumed, read as a
/// big-endian unit (first byte is the high-order byte) and passed on to
/// `decode_utf_16be_lo`.
fn decode_utf_16be(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    return Decode::End
  }
  if available == 1 {
    // Only half of a code unit is available: collect it in `t`.
    self.t_need(2)
    return t_fill(t_decode_utf_16be, self)
  }
  let start = self.i_pos
  self.i_pos = start + 2
  self.decode_utf_16be_lo(r_utf_16(self.i, start, start + 1))
}

///|
/// Decode the UTF-16BE code unit gathered in the scratch buffer `t`.
///
/// When fewer than `t_need` bytes were collected, the collected bytes are
/// reported as malformed and decoding continues with `decode_utf_16be`;
/// otherwise the unit is read from `t` and passed to `decode_utf_16be_lo`.
fn t_decode_utf_16be(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    return self.decode_utf_16be_lo(r_utf_16(self.t, 0, 1))
  }
  self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len))
}

///|
/// Finish decoding after a UTF-16BE code unit has been read.
///
/// A complete character or a malformed unit is returned as is. For a high
/// surrogate the following unit is read as the low surrogate: through the
/// scratch buffer when fewer than two bytes remain, directly from the input
/// otherwise. In the direct case the two bytes are consumed only when the
/// pair decodes to a character, so a unit that is not a low surrogate is
/// read again by the next step.
fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
  let hi = match decode {
    UTF16Uchar(x) => return self.ret(decode_utf_16be, Uchar(x))
    UTF16Malformed(x) => return self.ret(decode_utf_16be, Malformed(x))
    Hi(unit) => unit
  }
  if self.i_rem() < 2 {
    self.t_need(2)
    return t_fill(@tuple.curry(t_decode_utf_16be_lo)(hi), self)
  }
  let start = self.i_pos
  let outcome = r_utf_16_lo(hi, self.i, start, start + 1)
  match outcome {
    Uchar(_) => self.i_pos = start + 2
    _ => ()
  }
  self.ret(decode_utf_16be, outcome)
}

///|
/// Combine the high surrogate `hi` with the UTF-16BE low unit gathered in the
/// scratch buffer `t`.
///
/// When fewer than `t_need` bytes were collected, the result of
/// `malformed_pair` for `hi` and the collected bytes is returned; otherwise
/// the unit in `t` is combined with `hi` by `r_utf_16_lo`. Decoding continues
/// with `decode_utf_16be`.
fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
  let outcome = if self.t_len < self.t_need {
    malformed_pair(true, hi, self.t, 0, self.t_len)
  } else {
    r_utf_16_lo(hi, self.t, 0, 1)
  }
  self.ret(decode_utf_16be, outcome)
}

1	// Copyright 2024 International Digital Economy Academy
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	///\| The Unicode Replacement Character, which is used to replace invalid or unrecognized sequences during lossy decoding.
16	/// https://unicode.org/charts/nameslist/n_FFF0.html
17	pub const U_REP = '\u{FFFD}'
18
19	///\|
20	let utf_8_len = [
21	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
22	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
23	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
24	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
25	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
26	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28	0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
29	2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
30	4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31	]
32
33	///\|
34	/// Create and return a `Decoder` for the specified character encoding.
35	///
36	/// The `Decoder` consumes byte sequences and decodes them into the original string format.
37	///
38	/// # Parameters
39	///
40	/// - `encoding`: The character encoding format to be used for decoding the input byte sequences.
41	///
42	/// # Returns
43	///
44	/// A `Decoder` instance that can be used to decode byte sequences into strings.
45	///
46	/// # Examples
47	///
48	/// ```moonbit
49	/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
50	/// let decoder = decoder(UTF8)
51	/// inspect!(decoder.consume!(inputs[0]), content="abc")
52	/// inspect!(decoder.consume!(inputs[1]), content="")
53	/// inspect!(decoder.consume!(inputs[2]), content="🐰")
54	/// assert_true!(decoder.finish!().is_empty())
55	pub fn decoder(encoding : Encoding) -> Decoder {
56	let i = FixedArray::default()	32✔
57	let i_pos = 0
58	let t = FixedArray::make(4, Byte::default())
59	let t_len = 0
60	let t_need = 0
61	let k = match encoding {
62	UTF8 => decode_utf_8	10✔
63	UTF16 => decode_utf_16le	2✔
64	UTF16LE => decode_utf_16le	11✔
65	UTF16BE => decode_utf_16be	9✔
66	}
67	{ i, i_pos, t, t_len, t_need, k }
68	}
69
70	///\|
71	/// Decode the given byte sequence using the specified `Decoder` and return the resulting string.
72	///
73	/// This function can work in streaming mode where bytes are consumed incrementally.
74	/// When `stream` is `false`, it indicates the end of the input and triggers the final decoding step.
75	///
76	/// # Parameters
77	///
78	/// - `self`: The `Decoder` instance used to decode the byte sequence.
79	/// - `input`: The byte sequence to be decoded.
80	/// - `stream~`: A boolean indicating whether more bytes will be supplied for decoding. It defaults to `false`.
81	///
82	/// # Returns
83	///
84	/// A `String` representing the decoded content from the input byte sequence.
85	///
86	/// # Errors
87	///
88	/// `MalformedError`: when the byte sequence is not properly formatted according to the specified encoding.
89	/// `TruncatedError`: when the byte sequence ends prematurely, implying that more data is expected for complete decoding.
90	///
91	/// # Examples
92	///
93	/// ```moonbit
94	/// let inputs = [b"abc", b"\xf0", b"\x9f\x90\xb0"] // UTF8(🐰) == <F09F 90B0>
95	/// let decoder = @encoding.decoder(UTF8)
96	/// inspect!(decoder.decode!(inputs[0], stream=true), content="abc")
97	/// inspect!(decoder.decode!(inputs[1], stream=true), content="")
98	/// inspect!(decoder.decode!(inputs[2], stream=false), content="🐰")
99	/// ```
100	pub fn decode!(self : Decoder, input : Bytes, stream~ : Bool = false) -> String {
101	if input.length() > 0 {	76✔
102	self.i_cont(input)	70✔
103	}
104	if self.i_rem() == 0 {
105	return String::default()	6✔
106	}
107
108	// TODO: Estimate size_hint based on input and encoding more accurately
109	let builder = StringBuilder::new(size_hint=input.length())
110
111	// drive decoder to decode
112	loop self.decode_() {
113	Uchar(u) => {	278✔
114	builder.write_char(u)
115	continue self.decode_()
116	}
117	Malformed(bs) =>
118	if stream && self.t_need > 0 {	4✔
NEW 119	builder.to_string()	×
120	} else {
121	raise MalformedError(bs)	4✔
122	}
123	End => builder.to_string()	42✔
124	Refill(t) =>
125	if stream {	24✔
126	builder.to_string()	24✔
127	} else {
128	raise TruncatedError(t)	×
129	}
130	}
131	}
132
133	///\|
134	/// Consume the given byte sequence using the specified `Decoder` and return the resulting string incrementally.
135	///
136	/// This function calls `decode!` with the `stream` parameter set to `true`, indicating that more bytes will follow for decoding.
137	///
138	/// # Parameters
139	///
140	/// - `self`: The `Decoder` instance used to consume the byte sequence.
141	/// - `input`: The byte sequence to be consumed and decoded incrementally.
142	///
143	/// # Returns
144	///
145	/// A `String` representing the partially decoded content from the input byte sequence, as more bytes are expected.
146	///
147	/// # Errors
148	///
149	/// `MalformedError`: when the byte sequence is not properly formatted according to the specified encoding.
150	/// `TruncatedError`: when the byte sequence ends prematurely, implying that more data is expected for complete decoding.
151	pub fn consume!(self : Decoder, input : Bytes) -> String {
152	self.decode!(input, stream=true)	50✔
153	}
154
155	///\|
156	/// Finalize the decoding process and return the remaining decoded string.
157	///
158	/// This function calls `decode!` with the `stream` parameter set to `false`, indicating that no more bytes will be supplied
159	/// and triggering the final decoding step to produce the remaining output.
160	///
161	/// # Parameters
162	///
163	/// - `self`: The `Decoder` instance used to finalize the decoding process.
164	///
165	/// # Returns
166	///
167	/// A `String` representing the final part of the decoded content, after all byte sequences have been processed.
168	///
169	/// # Errors
170	///
171	/// `MalformedError`: This error is raised if the remaining byte sequence is not properly formatted according to the specified encoding.
172	/// `TruncatedError`: This error is raised if the remaining byte sequence ends prematurely, implying that more data was expected for complete decoding.
173	pub fn finish!(self : Decoder) -> String {
174	self.decode!(b"", stream=false)	6✔
175	}
176
177	///\|
178	/// Decode the given byte sequence using the specified `Decoder` and return the resulting string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
179	///
180	/// This function can work in streaming mode where bytes are consumed incrementally.
181	/// When `stream` is `false`, it indicates the end of the input and triggers the final decoding step.
182	///
183	/// # Parameters
184	///
185	/// - `self`: The `Decoder` instance used to decode the byte sequence.
186	/// - `input`: The byte sequence to be decoded.
187	/// - `stream~`: A boolean indicating whether more bytes will be supplied for decoding. It defaults to `false`.
188	///
189	/// # Returns
190	///
191	/// A `String` representing the decoded content from the input byte sequence, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`).
192	pub fn decode_lossy(
193	self : Decoder,
194	input : Bytes,
195	stream~ : Bool = false
196	) -> String {
197	if input.length() > 0 {	6✔
198	self.i_cont(input)	6✔
199	}
200	if self.i_rem() == 0 {
201	return String::default()	×
202	}
203
204	// drive decoder to decode
205	let chars = []
206	loop self.decode_() {
207	Uchar(u) => {	6✔
208	chars.push(u)
209	continue self.decode_()
210	}
211	Malformed(_) =>
212	if stream && self.t_need > 0 {	9✔
213	String::from_array(chars)	×
214	} else {
215	chars.push(U_REP)	9✔
216	continue self.decode_()
217	}
218	End => String::from_array(chars)	6✔
219	Refill(_) =>
220	if stream {	3✔
221	String::from_array(chars)	×
222	} else {
223	continue self.decode_()	3✔
224	}
225	}
226	}
227
228	///\|
229	/// Consume the given byte sequence using the specified `Decoder` and return the resulting string incrementally, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
230	///
231	/// This function calls `decode_lossy` with the `stream` parameter set to `true`, indicating that more bytes will follow for decoding.
232	///
233	/// # Parameters
234	///
235	/// - `self`: The `Decoder` instance used to consume and decode the byte sequence.
236	/// - `input`: The byte sequence to be consumed and decoded incrementally.
237	///
238	/// # Returns
239	///
240	/// A `String` representing the partially decoded content from the input byte sequence, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`), as more bytes are expected.
241	pub fn lossy_consume(self : Decoder, input : Bytes) -> String {
242	self.decode_lossy(input, stream=true)	×
243	}
244
245	///\|
246	/// Finalize the lossy decoding process and return the remaining decoded string, replacing any invalid sequences with the Unicode Replacement Character (`U+FFFD`).
247	///
248	/// This function calls `decode_lossy` with the `stream` parameter set to `false`, indicating that no more bytes will be supplied
249	/// and triggering the final decoding step to produce the remaining output.
250	///
251	/// # Parameters
252	///
253	/// - `self`: The `Decoder` instance used to finalize the lossy decoding process.
254	///
255	/// # Returns
256	///
257	/// A `String` representing the final part of the decoded content, with any invalid sequences replaced by the Unicode Replacement Character (`U+FFFD`), after all byte sequences have been processed.
258	pub fn lossy_finish(self : Decoder) -> String {
259	self.decode_lossy(b"", stream=false)	×
260	}
261
262	///\|
263	fn i_cont(self : Decoder, input : Bytes) -> Unit {
264	// concat `input` to `i`, drop decoded `i`
265	let i_rem = @math.maximum(self.i_rem(), 0)	76✔
266	let new_len = i_rem + input.length()
267	// init a new `i`
268	let new_i = FixedArray::make(new_len, Byte::default())
269	if i_rem > 0 {
270	// copy the remainder of the old `i` into the new `i`
271	self.i.blit_to(new_i, len=i_rem, src_offset=self.i_pos)	×
272	}
273	// copy all `input` into new `i`, starting at the remainder of the old `i`
274	new_i.blit_from_bytes(i_rem, input, 0, input.length())
275	self.i = new_i
276	// reset position to starting position
277	self.i_pos = 0
278	}
279
280	// Implementations
281
282	///\|
283	fn decode_(self : Decoder) -> Decode {
284	(self.k)(self)	372✔
285	}
286
287	///\|
288	fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
289	self.k = k	324✔
290	v
291	}
292
293	///\|
294	fn i_rem(self : Decoder) -> Int {
295	self.i.length() - self.i_pos	573✔
296	}
297
298	///\|
299	fn t_need(self : Decoder, need : Int) -> Unit {
300	self.t_len = 0	27✔
301	self.t_need = need
302	}
303
304	///\|
305	fn eoi(self : Decoder) -> Unit {
306	self.i = FixedArray::default()	27✔
307	}
308
309	///\|
310	fn refill(self : Decoder, k : Cont) -> Decode {
311	self.eoi()	27✔
312	self.ret(k, Decode::Refill(Bytes::from_fixedarray(self.t)))
313	}
314
315	///\|
316	fn t_fill(k : Cont, decoder : Decoder) -> Decode {
317	fn blit(decoder : Decoder, l : Int) -> Unit {	54✔
318	decoder.i.blit_to(	51✔
319	decoder.t,
320	len=l,
321	dst_offset=decoder.t_len,
322	src_offset=decoder.i_pos,
323	)
324	decoder.i_pos += l
325	decoder.t_len += l
326	}
327
328	let rem = decoder.i_rem()
329	if rem < 0 { // eoi
330	k(decoder)	3✔
331	} else {
332	let need = decoder.t_need - decoder.t_len	51✔
333	if rem < need {
334	blit(decoder, rem)	27✔
335	decoder.refill(@tuple.curry(t_fill)(k))
336	} else {
337	blit(decoder, need)	24✔
338	k(decoder)
339	}
340	}
341	}
342
343	// UTF8
344
///|
/// Decodes the next UTF-8 sequence from the input buffer.
///
/// Returns `Decode::End` when no input remains. Otherwise the lead byte is
/// looked up in `utf_8_len` to get the sequence width:
/// - if the input holds fewer bytes than the width, the sequence is collected
///   through the temporary buffer (`t_fill` then `t_decode_utf_8`);
/// - a width of 0 marks an invalid lead byte, which is consumed alone and
///   reported through `malformed`;
/// - otherwise the whole sequence is consumed and validated by `r_utf_8`.
fn decode_utf_8(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available <= 0 {
    Decode::End
  } else {
    let start = self.i_pos
    let lead = self.i[start].to_int()
    let width = utf_8_len[lead]
    if available < width {
      self.t_need(width)
      t_fill(t_decode_utf_8, self)
    } else if width == 0 {
      self.i_pos += 1
      self.ret(decode_utf_8, malformed(self.i, start, 1))
    } else {
      self.i_pos += width
      self.ret(decode_utf_8, r_utf_8(self.i, start, width))
    }
  }
}
368
///|
/// Continuation run after `t_fill` for a UTF-8 sequence that straddled input
/// chunks. If the temporary buffer is still short of `t_need` bytes, the
/// collected bytes are reported as malformed; otherwise they are decoded with
/// `r_utf_8`. Either way decoding continues with `decode_utf_8`.
fn t_decode_utf_8(self : Decoder) -> Decode {
  let outcome = if self.t_len < self.t_need {
    malformed(self.t, 0, self.t_len)
  } else {
    r_utf_8(self.t, 0, self.t_len)
  }
  self.ret(decode_utf_8, outcome)
}
377
///|
/// Decodes a single UTF-8 sequence of `length` bytes (1 to 4) starting at
/// `bytes[offset]`.
///
/// Returns `Uchar` with the decoded character when the sequence is well
/// formed, or the result of `malformed(bytes, offset, length)` when a
/// trailing byte is not a continuation byte or the second byte is outside the
/// range allowed for the lead byte. Panics for any other `length`.
fn r_utf_8(bytes : FixedArray[Byte], offset : Int, length : Int) -> Decode {
  // Byte number `k` of the sequence, widened to Int.
  fn at(k : Int) -> Int {
    bytes[offset + k].to_int()
  }

  // True when `b` has the `10xxxxxx` continuation-byte shape.
  fn is_cont(b : Int) -> Bool {
    (b >> 6) == 0b10
  }

  // Wraps a decoded scalar value as a successful result.
  fn accept(c : Int) -> Decode {
    Uchar(Char::from_int(c))
  }

  match length {
    1 => accept(at(0))
    2 => {
      let lead = at(0)
      let b1 = at(1)
      if is_cont(b1) {
        accept(((lead & 0x1F) << 6) | (b1 & 0x3F))
      } else {
        malformed(bytes, offset, length)
      }
    }
    3 => {
      let lead = at(0)
      let b1 = at(1)
      let b2 = at(2)
      // Lead bytes E0 and ED narrow the second byte's range (rejecting
      // overlong forms and surrogate code points); other leads accept any
      // continuation byte.
      let second_ok = match lead {
        0xE0 => 0xA0 <= b1 && b1 <= 0xBF
        0xED => 0x80 <= b1 && b1 <= 0x9F
        _ => is_cont(b1)
      }
      if is_cont(b2) && second_ok {
        accept(((lead & 0x0F) << 12) | (((b1 & 0x3F) << 6) | (b2 & 0x3F)))
      } else {
        malformed(bytes, offset, length)
      }
    }
    4 => {
      let lead = at(0)
      let b1 = at(1)
      let b2 = at(2)
      let b3 = at(3)
      // Lead bytes F0 and F4 narrow the second byte's range (rejecting
      // overlong forms and values above U+10FFFF); other leads accept any
      // continuation byte.
      let second_ok = match lead {
        0xF0 => 0x90 <= b1 && b1 <= 0xBF
        0xF4 => 0x80 <= b1 && b1 <= 0x8F
        _ => is_cont(b1)
      }
      if is_cont(b3) && is_cont(b2) && second_ok {
        accept(
          ((lead & 0x07) << 18) |
          ((b1 & 0x3F) << 12) |
          ((b2 & 0x3F) << 6) |
          (b3 & 0x3F),
        )
      } else {
        malformed(bytes, offset, length)
      }
    }
    _ => panic()
  }
}
462
463	// UTF16LE
464
///|
/// Intermediate result of reading one 16-bit UTF-16 code unit (see
/// `r_utf_16`), before any low surrogate has been examined.
priv enum UTF16Decode {
  // A high surrogate (0xD800..0xDBFF); the payload is the raw 16-bit unit.
  Hi(Int)
  // A unit that cannot start a character (0xDC00..0xDFFF); the payload is
  // the two offending bytes.
  UTF16Malformed(Bytes)
  // A complete character encoded by a single non-surrogate unit.
  UTF16Uchar(Char)
}
471
///|
/// Decodes the next UTF-16LE code unit from the input buffer.
///
/// With at least two bytes available the unit is read in place (high byte at
/// `j + 1`, low byte at `j`) and passed to `decode_utf_16le_lo`. With a single
/// byte available the unit is collected through the temporary buffer. With no
/// input left the result is `Decode::End`.
fn decode_utf_16le(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available >= 2 {
    let start = self.i_pos
    self.i_pos += 2
    let unit = r_utf_16(self.i, start + 1, start)
    self.decode_utf_16le_lo(unit)
  } else if available > 0 {
    self.t_need(2)
    t_fill(t_decode_utf_16le, self)
  } else {
    Decode::End
  }
}
486
///|
/// Continuation run after `t_fill` for a UTF-16LE code unit that straddled
/// input chunks. A complete unit is read from the temporary buffer (high byte
/// at index 1, low byte at index 0); an incomplete one is reported as
/// malformed and decoding continues with `decode_utf_16le`.
fn t_decode_utf_16le(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    let unit = r_utf_16(self.t, 1, 0)
    self.decode_utf_16le_lo(unit)
  } else {
    self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len))
  }
}
495
///|
/// Finishes decoding a UTF-16LE code unit `v` produced by `r_utf_16`.
///
/// Plain characters and malformed units are returned directly. For a high
/// surrogate the following unit is read as the low surrogate: in place when
/// two more bytes are available, otherwise through the temporary buffer. The
/// input position only moves past the second unit when the pair decodes to a
/// character, so a rejected second unit is decoded again on the next step.
fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
  match v {
    Hi(hi) =>
      if self.i_rem() >= 2 {
        let start = self.i_pos
        let pair = r_utf_16_lo(hi, self.i, start + 1, start)
        match pair {
          Uchar(_) => self.i_pos += 2
          _ => ()
        }
        self.ret(decode_utf_16le, pair)
      } else {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self)
      }
    UTF16Malformed(raw) => self.ret(decode_utf_16le, Malformed(raw))
    UTF16Uchar(ch) => self.ret(decode_utf_16le, Uchar(ch))
  }
}
518
///|
/// Continuation run after `t_fill` when the low surrogate of a UTF-16LE pair
/// straddled input chunks. `hi` is the high surrogate already read. If the
/// temporary buffer is still incomplete the pair is reported via
/// `malformed_pair`; otherwise the low unit is read from the temporary buffer
/// and combined with `hi`. Decoding continues with `decode_utf_16le`.
fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
  let outcome = if decoder.t_len < decoder.t_need {
    malformed_pair(false, hi, decoder.t, 0, decoder.t_len)
  } else {
    r_utf_16_lo(hi, decoder.t, 1, 0)
  }
  decoder.ret(decode_utf_16le, outcome)
}
530
///|
/// Reads the 16-bit unit that follows the high surrogate `hi` and combines the
/// two into a supplementary-plane character.
///
/// `offset0` is the index of the unit's most significant byte and `offset1`
/// the index of its least significant byte, so the same routine serves both
/// byte orders.
///
/// Returns `Uchar` when the unit is a low surrogate (0xDC00..0xDFFF), and
/// `Malformed` carrying the two bytes that were read otherwise.
fn r_utf_16_lo(
  hi : Int,
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> Decode {
  let b0 = bytes[offset0].to_int()
  let b1 = bytes[offset1].to_int()
  let lo = (b0 << 8) | b1
  if lo < 0xDC00 || lo > 0xDFFF {
    // NOTE(jinser): only the high surrogate is malformed here; the unit that
    // follows must not be swallowed when it is not a low surrogate.
    //
    // For example, b"\xD8\x00\x00\x48" (BE):
    // \xD8\x00 is a legal high surrogate, so the next unit is tried as a low
    // surrogate. Reporting the whole \xD8\x00\x00\x48 as
    // `Malformed[b"\xD8\x00\x00\x48"]` would be wrong, because \x00\x48 on
    // its own is a legal UTF-16 code unit with the value `H`. The ideal
    // result is `[Malformed(b"\xD8\x00"), Uchar('H')]`.
    //
    // > printf '\xD8\x00\x00\x48' | uconv --from-code UTF16BE --to-code UTF8 --from-callback substitute
    // �H
    //
    // NOTE(review): the payload below is built from the bytes of the second
    // unit (most significant byte first), not from `hi` — confirm against
    // callers that this is the intended content.
    Malformed([bytes[offset0], bytes[offset1]])
  } else {
    // Assemble the 20-bit payload first, then add the 0x10000 offset.
    // OR-ing 0x10000 in (as before) silently dropped the offset whenever the
    // high surrogate's payload already had that bit set (hi >= 0xD840), e.g.
    // D840 DC00 decoded to U+10000 instead of U+20000.
    let payload = ((hi & 0x3FF) << 10) | (lo & 0x3FF)
    Uchar(Char::from_int(payload + 0x10000))
  }
}
558
///|
/// Reads one 16-bit UTF-16 code unit and classifies it.
///
/// `offset0` is the index of the most significant byte and `offset1` the
/// index of the least significant byte. The result is `Hi` for a high
/// surrogate (0xD800..0xDBFF), `UTF16Malformed` with the two bytes in buffer
/// order for a lone low surrogate (0xDC00..0xDFFF), and `UTF16Uchar` for any
/// other value.
fn r_utf_16(
  bytes : FixedArray[Byte],
  offset0 : Int,
  offset1 : Int
) -> UTF16Decode {
  let high_byte = bytes[offset0].to_int()
  let low_byte = bytes[offset1].to_int()
  let unit = (high_byte << 8) | low_byte
  if 0xD800 <= unit && unit <= 0xDBFF {
    Hi(unit)
  } else if 0xDC00 <= unit && unit <= 0xDFFF {
    UTF16Malformed(slice(bytes, @math.minimum(offset0, offset1), 2))
  } else {
    UTF16Uchar(Char::from_int(unit))
  }
}
576
577	// UTF16BE
578
///|
/// Decodes the next UTF-16BE code unit from the input buffer.
///
/// With at least two bytes available the unit is read in place (high byte at
/// `j`, low byte at `j + 1`) and passed to `decode_utf_16be_lo`. With a single
/// byte available the unit is collected through the temporary buffer. With no
/// input left the result is `Decode::End`.
fn decode_utf_16be(self : Decoder) -> Decode {
  let available = self.i_rem()
  if available >= 2 {
    let start = self.i_pos
    self.i_pos += 2
    let unit = r_utf_16(self.i, start, start + 1)
    self.decode_utf_16be_lo(unit)
  } else if available > 0 {
    self.t_need(2)
    t_fill(t_decode_utf_16be, self)
  } else {
    Decode::End
  }
}
593
///|
/// Continuation run after `t_fill` for a UTF-16BE code unit that straddled
/// input chunks. A complete unit is read from the temporary buffer (high byte
/// at index 0, low byte at index 1); an incomplete one is reported as
/// malformed and decoding continues with `decode_utf_16be`.
fn t_decode_utf_16be(self : Decoder) -> Decode {
  if self.t_len >= self.t_need {
    let unit = r_utf_16(self.t, 0, 1)
    self.decode_utf_16be_lo(unit)
  } else {
    self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len))
  }
}
602
///|
/// Finishes decoding a UTF-16BE code unit `decode` produced by `r_utf_16`.
///
/// Plain characters and malformed units are returned directly. For a high
/// surrogate the following unit is read as the low surrogate: in place when
/// two more bytes are available, otherwise through the temporary buffer. The
/// input position only moves past the second unit when the pair decodes to a
/// character, so a rejected second unit is decoded again on the next step.
fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
  match decode {
    Hi(hi) =>
      if self.i_rem() >= 2 {
        let start = self.i_pos
        let pair = r_utf_16_lo(hi, self.i, start, start + 1)
        match pair {
          Uchar(_) => self.i_pos += 2
          _ => ()
        }
        self.ret(decode_utf_16be, pair)
      } else {
        self.t_need(2)
        t_fill(@tuple.curry(t_decode_utf_16be_lo)(hi), self)
      }
    UTF16Malformed(raw) => self.ret(decode_utf_16be, Malformed(raw))
    UTF16Uchar(ch) => self.ret(decode_utf_16be, Uchar(ch))
  }
}
625
///|
/// Continuation run after `t_fill` when the low surrogate of a UTF-16BE pair
/// straddled input chunks. `hi` is the high surrogate already read. If the
/// temporary buffer is still incomplete the pair is reported via
/// `malformed_pair`; otherwise the low unit is read from the temporary buffer
/// and combined with `hi`. Decoding continues with `decode_utf_16be`.
fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
  let outcome = if self.t_len < self.t_need {
    malformed_pair(true, hi, self.t, 0, self.t_len)
  } else {
    r_utf_16_lo(hi, self.t, 0, 1)
  }
  self.ret(decode_utf_16be, outcome)
}

moonbitlang / x / 387

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous