23374838686

Committed 21 Mar 2026 07:29AM UTC coverage: 59.341% (-20.0%) from 79.315%

Build # 23374838686

Build Type

Pull #15075

github

Committed by

web-flow

Commit Message

Merge 90b4e834f into 6587e363a

Pull Request Pull Request #15075: Stack 8001 v16.4

Coverage Stats

38 of 70 new or added lines in 10 files covered. (54.29%)

34165 existing lines in 563 files now uncovered.

119621 of 201584 relevant lines covered (59.34%)

650666.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.15

/rust/htp/src/utf8_decoder.rs

// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software
// and associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
use crate::{
    bstr::Bstr,
    config::{DecoderConfig, HtpUnwanted},
    unicode_bestfit_map::UnicodeBestfitMap,
    util::{FlagOperations, HtpFlags},
};

/// Lookup table for the UTF-8 decoding DFA (Hoehrmann-style decoder, see the
/// URL in the file header).
///
/// Layout: 256 per-byte entries followed by 144 transition entries (9 states
/// x 16 classes). The code in this file reads only the transition part, at
/// index `256 + state * 16 + class`, to obtain the next state; state 0 is
/// "accept" and state 1 is "reject".
///
/// NOTE(review): the first 256 entries are presumably the strict byte -> class
/// map of the original decoder; nothing visible in this file indexes them.
static utf8d: [u8; 400] = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, 0xb, 0x6, 0x6,
    0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0, 0x1, 0x2, 0x3, 0x5, 0x8,
    0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1,
    1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
];
/// Overlong-tolerant variant of `utf8d`.
///
/// The code in this file reads only the first 256 entries, indexing them by
/// input byte to obtain the byte's class. Compared with `utf8d`, four class
/// entries differ: 0xC0 and 0xC1 are class 2 (instead of 8), 0xE0 is class 3
/// (instead of 0xa) and 0xF0 is class 6 (instead of 0xb), i.e. they share the
/// class of the neighbouring 2-, 3- and 4-byte lead bytes. The remaining
/// entries, including the trailing 144 transition entries, are identical to
/// `utf8d`.
static utf8d_allow_overlong: [u8; 400] = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, 0x6, 0x6, 0x6,
    0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0, 0x1, 0x2, 0x3, 0x5, 0x8,
    0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1,
    1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
];

/// State of a byte-at-a-time UTF-8 decoder that converts its input into a
/// single-byte stream and records what it observed in `flags`.
#[derive(Clone)]
pub(crate) struct Utf8Decoder {
    /// Supplies the replacement byte for undecodable input and the best-fit
    /// single byte for code points of 0x100 and above.
    bestfit_map: UnicodeBestfitMap,
    /// Current DFA state: 0 = character complete, 1 = invalid input, any other
    /// value = sequence still in progress.
    state: u32,
    /// Number of bytes consumed for the character currently being formed;
    /// reset to 0 whenever a character is completed or rejected.
    seq: u32,
    /// Code point accumulated so far for the current character.
    codepoint: u32,
    /// `HtpFlags` bits raised while decoding (overlong, half/full-width range,
    /// invalid, valid).
    pub(crate) flags: u64,
    /// True once at least one multi-byte sequence has been decoded successfully.
    pub(crate) seen_valid: bool,
    /// Decoded single-byte output.
    pub(crate) decoded_bytes: Vec<u8>,
}

impl Utf8Decoder {
    /// Creates a decoder in its initial state: accepting DFA state, no pending
    /// sequence, no flags raised and an empty output buffer.
    pub(crate) fn new(bestfit_map: UnicodeBestfitMap) -> Self {
        Self {
            state: 0,
            seq: 0,
            codepoint: 0,
            flags: 0,
            seen_valid: false,
            decoded_bytes: Vec::new(),
            bestfit_map,
        }
    }

    /// Feeds one input byte to the decoder and appends any resulting output to
    /// `decoded_bytes`.
    ///
    /// * A completed single-byte character is copied through unchanged.
    /// * A completed multi-byte character is checked for overlong encoding and
    ///   for the half-width/full-width range, then reduced to one byte.
    /// * An invalid byte raises `PATH_UTF8_INVALID` and emits the replacement
    ///   byte; if it interrupted a sequence it is decoded again as a fresh
    ///   first byte.
    /// * An unfinished sequence emits nothing unless `is_last_byte` is set.
    fn decode_byte(&mut self, encoded_byte: u8, is_last_byte: bool) {
        self.seq = self.seq.wrapping_add(1);
        self.decode_byte_allow_overlong(u32::from(encoded_byte));

        // Accept: a full character is available in `self.codepoint`.
        if self.state == 0 {
            let (len, cp) = (self.seq, self.codepoint);
            if len == 1 {
                // Plain single-byte character, copied as is.
                self.decoded_bytes.push(cp as u8);
            } else {
                self.seen_valid = true;
                // Smallest code point that legitimately needs `len` bytes;
                // anything below it was encoded with more bytes than necessary.
                let shortest_form_min = match len {
                    2 => 0x80,
                    3 => 0x800,
                    4 => 0x10000,
                    _ => 0,
                };
                if cp < shortest_form_min {
                    self.flags.set(HtpFlags::PATH_UTF8_OVERLONG);
                }
                // Half-width/full-width forms are flagged as a possible evasion.
                if (0xff00..=0xffef).contains(&cp) {
                    self.flags.set(HtpFlags::PATH_HALF_FULL_RANGE);
                }
                let single = self.bestfit_codepoint();
                self.decoded_bytes.push(single);
            }
            self.seq = 0;
            return;
        }

        // Reject: the byte cannot continue (or start) a character.
        if self.state == 1 {
            self.flags.set(HtpFlags::PATH_UTF8_INVALID);
            self.state = 0;
            self.codepoint = 0;
            self.decoded_bytes.push(self.bestfit_map.replacement_byte);
            // A byte that broke an ongoing sequence may itself begin the next
            // character, so it gets a second pass from the clean state. A byte
            // that was invalid as a first byte is simply consumed.
            let interrupted_sequence = self.seq != 1;
            self.seq = 0;
            if interrupted_sequence {
                self.decode_byte(encoded_byte, is_last_byte);
            }
            return;
        }

        // Character still incomplete. Only the end of input needs handling:
        // one replacement byte is emitted per consumed byte except the first.
        // NOTE(review): a lone trailing lead byte (seq == 1) therefore emits
        // nothing, and this path does not raise PATH_UTF8_INVALID - confirm
        // both are intended.
        if is_last_byte {
            let filler = self.bestfit_map.replacement_byte;
            let count = self.seq.saturating_sub(1) as usize;
            self.decoded_bytes
                .extend(std::iter::repeat(filler).take(count));
        }
    }

    /// Decodes `input` from scratch into `self.decoded_bytes`.
    ///
    /// All decoder state, flags and previous output are discarded first.
    /// Overlong characters are decoded, invalid bytes are replaced with the
    /// best-fit map's replacement byte, and multi-byte characters are reduced
    /// to a single byte. `PATH_UTF8_VALID` is raised afterwards only when at
    /// least one multi-byte character was decoded and `PATH_UTF8_INVALID` was
    /// not raised.
    fn decode_and_validate(&mut self, input: &[u8]) {
        // Start from a clean slate.
        self.flags = 0;
        self.seen_valid = false;
        self.state = 0;
        self.seq = 0;
        self.codepoint = 0;
        self.decoded_bytes.clear();
        self.decoded_bytes.reserve(input.len());

        // Every byte but the final one is fed with `is_last_byte == false`.
        if let Some((&final_byte, leading)) = input.split_last() {
            for &byte in leading {
                self.decode_byte(byte, false);
            }
            self.decode_byte(final_byte, true);
        }

        let saw_invalid = self.flags.is_set(HtpFlags::PATH_UTF8_INVALID);
        if self.seen_valid && !saw_invalid {
            self.flags.set(HtpFlags::PATH_UTF8_VALID);
        }
    }

    /// Advances the DFA by one byte, accepting overlong encodings.
    ///
    /// Updates `self.codepoint` with the bits contributed by `byte` and sets
    /// `self.state` to ACCEPT(0) when a character is complete, REJECT(1) when
    /// the byte is invalid, or another value while the character is unfinished.
    fn decode_byte_allow_overlong(&mut self, byte: u32) {
        let class = u32::from(utf8d_allow_overlong[byte as usize]);
        self.codepoint = if self.state == 0 {
            // First byte: mask off the length-marker bits.
            (0xff >> class) & byte
        } else {
            // Continuation byte: append its low six bits.
            (self.codepoint << 6) | (byte & 0x3f)
        };
        // Transition entries start at offset 256, 16 per state.
        let slot = 256u32
            .wrapping_add(self.state.wrapping_mul(16))
            .wrapping_add(class);
        self.state = u32::from(utf8d[slot as usize]);
    }

    /// Reduces the current code point to one byte: values below 0x100 are
    /// returned directly, everything else is looked up in the best-fit map.
    fn bestfit_codepoint(&self) -> u8 {
        match self.codepoint {
            narrow if narrow < 0x100 => narrow as u8,
            wide => self.bestfit_map.get(wide),
        }
    }
}

/// Validates `path` as UTF-8 and, if configured, rewrites it as a single-byte
/// stream.
///
/// The path is always decoded: overlong characters are accepted, invalid bytes
/// become the replacement byte of `cfg.bestfit_map`, and multi-byte characters
/// are reduced to one byte via best-fit mapping. `path` is overwritten with the
/// decoded bytes only when `cfg.utf8_convert_bestfit` is set. The flags raised
/// by the decoder are merged into `flags`. If `flags` then contains
/// `PATH_UTF8_INVALID` (including when it was already set on entry) and
/// `cfg.utf8_invalid_unwanted` is not `Ignore`, that value is written to
/// `status`.
pub(crate) fn decode_and_validate_inplace(
    cfg: &DecoderConfig, flags: &mut u64, status: &mut HtpUnwanted, path: &mut Bstr,
) {
    let mut utf8 = Utf8Decoder::new(cfg.bestfit_map);
    utf8.decode_and_validate(path.as_slice());

    if cfg.utf8_convert_bestfit {
        // Replace the caller's path with the decoded form.
        path.clear();
        path.add(utf8.decoded_bytes.as_slice());
    }

    let raised = utf8.flags;
    flags.set(raised);

    // Nothing more to do unless invalid UTF-8 is both present and unwanted.
    let policy = cfg.utf8_invalid_unwanted;
    if policy == HtpUnwanted::Ignore || !flags.is_set(HtpFlags::PATH_UTF8_INVALID) {
        return;
    }
    *status = policy;
}
#[cfg(test)]
mod tests {
    use crate::{
        bstr::Bstr, config::Config, config::HtpUnwanted, utf8_decoder::decode_and_validate_inplace,
    };
    use rstest::rstest;

    /// Runs the in-place decoder with best-fit conversion enabled and compares
    /// the rewritten path with the expected single-byte output.
    #[rstest]
    // Two interrupted f1 lead bytes, then a full-width '}' (ef bd 9d) and plain ASCII.
    #[case(b"\xf1.\xf1\xef\xbd\x9dabcd", "?.?}abcd")]
    // f0 90 8d ff = 1111 0000 | 1001 0000 | 1000 1101 | 1111 1111:
    // a 4-byte sequence broken by ff, and ff is itself invalid.
    #[case::invalid_incomplete_seq(b"\xf0\x90\x8d\xff", "??")]
    // e2 82 = 1110 0010 | 1000 0010: a 3-byte sequence cut off after two bytes.
    #[case::invalid_incomplete_seq(b"\xe2\x82", "?")]
    // c2 ff f0 = 1100 0010 | 1111 1111 | 1111 0000:
    // c2 broken by ff, ff invalid, and the lone trailing f0 produces no output.
    #[case::invalid_incomplete_seq(b"\xc2\xff\xf0", "??")]
    // f0 90 28 bc = 1111 0000 | 1001 0000 | 0010 1000 | 1011 1100:
    // sequence broken by '(' which is kept, then a stray continuation byte.
    #[case::invalid_incomplete_seq(b"\xf0\x90\x28\xbc", "?(?")]
    fn test_decode_and_validate_inplace(#[case] input: &[u8], #[case] expected: &str) {
        let mut config = Config::default();
        config.set_utf8_convert_bestfit(true);

        let mut path = Bstr::from(input);
        let mut status = HtpUnwanted::Ignore;
        let mut flags = 0;
        decode_and_validate_inplace(&config.decoder_cfg, &mut flags, &mut status, &mut path);

        assert_eq!(path, Bstr::from(expected));
    }
}

1	// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
2	//
3	// Permission is hereby granted, free of charge, to any person obtaining a copy of this software
4	// and associated documentation files (the "Software"), to deal in the Software without restriction,
5	// including without limitation the rights to use, copy, modify, merge, publish, distribute,
6	// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
7	// furnished to do so, subject to the following conditions:
8	//
9	// The above copyright notice and this permission notice shall be included in all copies or
10	// substantial portions of the Software.
11	//
12	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
13	// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
14	// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
15	// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17	//
18	// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
19	// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
20	use crate::{
21	bstr::Bstr,
22	config::{DecoderConfig, HtpUnwanted},
23	unicode_bestfit_map::UnicodeBestfitMap,
24	util::{FlagOperations, HtpFlags},
25	};
26
27	static utf8d: [u8; 400] = [
28	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
30	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
33	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
34	8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
35	0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, 0xb, 0x6, 0x6,
36	0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0, 0x1, 0x2, 0x3, 0x5, 0x8,
37	0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38	1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
39	1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
40	1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1,
41	1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42	];
43	static utf8d_allow_overlong: [u8; 400] = [
44	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
45	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
46	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
48	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
49	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
50	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
51	0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, 0x6, 0x6, 0x6,
52	0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0, 0x1, 0x2, 0x3, 0x5, 0x8,
53	0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54	1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
55	1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
56	1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1,
57	1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
58	];
59
60	#[derive(Clone)]
61	pub(crate) struct Utf8Decoder {
62	bestfit_map: UnicodeBestfitMap,
63	state: u32,
64	seq: u32,
65	codepoint: u32,
66	pub(crate) flags: u64,
67	pub(crate) seen_valid: bool,
68	pub(crate) decoded_bytes: Vec<u8>,
69	}
70
71	impl Utf8Decoder {
72	/// Make a new owned Utf8Decoder
73	pub(crate) fn new(bestfit_map: UnicodeBestfitMap) -> Self {	11,056✔
74	Self {	11,056✔
75	bestfit_map,	11,056✔
76	state: 0,	11,056✔
77	seq: 0,	11,056✔
78	codepoint: 0,	11,056✔
79	flags: 0,	11,056✔
80	seen_valid: false,	11,056✔
81	decoded_bytes: Vec::new(),	11,056✔
82	}	11,056✔
83	}	11,056✔
84
85	/// Decode utf8 byte using best-fit map.
86	fn decode_byte(&mut self, encoded_byte: u8, is_last_byte: bool) {	217,533✔
87	self.seq = self.seq.wrapping_add(1);	217,533✔
88	self.decode_byte_allow_overlong(encoded_byte as u32);	217,533✔
89	match self.state {	217,533✔
90	0 => {
91	if self.seq == 1 {	216,376✔
92	// ASCII character, which we just copy.	216,251✔
93	self.decoded_bytes.push(self.codepoint as u8);	216,251✔
94	} else {	216,251✔
95	// A valid UTF-8 character, which we need to convert.
96	self.seen_valid = true;	125✔
97	// Check for overlong characters and set the flag accordingly.	125✔
98	if (self.seq == 2 && self.codepoint < 0x80)	125✔
99	\|\| (self.seq == 3 && self.codepoint < 0x800)	113✔
100	\|\| (self.seq == 4 && self.codepoint < 0x10000)	112✔
101	{	13✔
102	self.flags.set(HtpFlags::PATH_UTF8_OVERLONG);	13✔
103	}	112✔
104	// Special flag for half-width/full-width evasion.
105	if self.codepoint >= 0xff00 && self.codepoint <= 0xffef {	125✔
106	self.flags.set(HtpFlags::PATH_HALF_FULL_RANGE)	×
107	}	125✔
108	// Use best-fit mapping to convert to a single byte.
109	self.decoded_bytes.push(self.bestfit_codepoint());	125✔
110	}
111	self.seq = 0;	216,376✔
112	}
113	1 => {
114	// Invalid UTF-8 character.
115	self.flags.set(HtpFlags::PATH_UTF8_INVALID);	749✔
116	// Output the replacement byte, replacing one or more invalid bytes.	749✔
117	// If the invalid byte was first in a sequence, consume it. Otherwise,	749✔
118	// assume it's the starting byte of the next character.	749✔
119	self.state = 0;	749✔
120	self.codepoint = 0;	749✔
121	self.decoded_bytes.push(self.bestfit_map.replacement_byte);	749✔
122	if self.seq != 1 {	749✔
123	self.seq = 0;	215✔
124	self.decode_byte(encoded_byte, is_last_byte);	215✔
125	} else {	534✔
126	self.seq = 0;	534✔
127	}	534✔
128	}
129	_ => {
130	// The character is not yet formed.
131	if is_last_byte {	408✔
132	// If the last input chunk ended with an incomplete byte sequence for a code point,
133	// this is an error and a replacement character is emitted hence starting from 1 not 0
134	for _ in 1..self.seq {	22✔
135	self.decoded_bytes.push(self.bestfit_map.replacement_byte);	1✔
136	}	1✔
137	}	386✔
138	}
139	}
140	}	217,533✔
141
142	/// Decode a UTF-8 encoded path. Replaces a possibly-invalid utf8 byte stream
143	/// with an ascii stream, storing the result in self.decoded_bytes. Overlong
144	/// characters will be decoded and invalid characters will be replaced with
145	/// the replacement byte specified in the bestfit_map. Best-fit mapping will be used
146	/// to convert UTF-8 into a single-byte stream.
147	fn decode_and_validate(&mut self, input: &[u8]) {	11,056✔
148	//Reset all internals	11,056✔
149	self.state = 0;	11,056✔
150	self.seq = 0;	11,056✔
151	self.codepoint = 0;	11,056✔
152	self.flags = 0;	11,056✔
153	self.decoded_bytes.clear();	11,056✔
154	self.decoded_bytes.reserve(input.len());	11,056✔
155	self.seen_valid = false;	11,056✔
156	for (byte, is_last) in input	217,318✔
157	.iter()	11,056✔
158	.enumerate()	11,056✔
159	.map(\|(i, b)\| (b, i + 1 == input.len()))	217,318✔
160	{	217,318✔
161	self.decode_byte(*byte, is_last);	217,318✔
162	}	217,318✔
163	// Did the input stream seem like a valid UTF-8 string?
164	if self.seen_valid && !self.flags.is_set(HtpFlags::PATH_UTF8_INVALID) {	11,056✔
165	self.flags.set(HtpFlags::PATH_UTF8_VALID)	6✔
166	}	11,050✔
167	}	11,056✔
168
169	/// Process one byte of UTF-8 data and set the code point if one is available. Allows
170	/// overlong characters in input.
171	///
172	/// Sets the state to ACCEPT(0) for a valid character, REJECT(1) for an invalid character,
173	/// or OTHER(u32) if the character has not yet been formed
174	fn decode_byte_allow_overlong(&mut self, byte: u32) {	217,533✔
175	let type_0: u32 = utf8d_allow_overlong[byte as usize] as u32;	217,533✔
176	self.codepoint = if self.state != 0 {	217,533✔
177	(byte & 0x3f) \| (self.codepoint << 6)	386✔
178	} else {
179	(0xff >> type_0) & byte	217,147✔
180	};
181	self.state = utf8d[(256u32)	217,533✔
182	.wrapping_add((self.state).wrapping_mul(16))	217,533✔
183	.wrapping_add(type_0) as usize] as u32;	217,533✔
184	}	217,533✔
185
186	/// Convert a Unicode codepoint into a single-byte, using best-fit
187	/// mapping (as specified in the provided configuration structure).
188	///
189	/// Returns converted single byte
190	fn bestfit_codepoint(&self) -> u8 {	125✔
191	// Is it a single-byte codepoint?	125✔
192	if self.codepoint < 0x100 {	125✔
193	return self.codepoint as u8;	19✔
194	}	106✔
195	self.bestfit_map.get(self.codepoint)	106✔
196	}	125✔
197	}
198
199	/// Decode a UTF-8 encoded path. Replaces a possibly-invalid utf8 byte stream with
200	/// an ascii stream. Overlong characters will be decoded and invalid characters will
201	/// be replaced with the replacement byte specified in the cfg. Best-fit mapping will
202	/// be used to convert UTF-8 into a single-byte stream. The resulting decoded path will
203	/// be stored in the input path if the transaction cfg indicates it
204	pub(crate) fn decode_and_validate_inplace(	11,056✔
205	cfg: &DecoderConfig, flags: &mut u64, status: &mut HtpUnwanted, path: &mut Bstr,	11,056✔
206	) {	11,056✔
207	let mut decoder = Utf8Decoder::new(cfg.bestfit_map);	11,056✔
208	decoder.decode_and_validate(path.as_slice());	11,056✔
209	if cfg.utf8_convert_bestfit {	11,056✔
UNCOV 210	path.clear();	×
UNCOV 211	path.add(decoder.decoded_bytes.as_slice());	×
212	}	11,056✔
213	flags.set(decoder.flags);	11,056✔
214		11,056✔
215	if flags.is_set(HtpFlags::PATH_UTF8_INVALID) && cfg.utf8_invalid_unwanted != HtpUnwanted::Ignore	11,056✔
216	{	×
217	*status = cfg.utf8_invalid_unwanted;	×
218	}	11,056✔
219	}	11,056✔
220	#[cfg(test)]
221	mod tests {
222	use crate::{
223	bstr::Bstr, config::Config, config::HtpUnwanted, utf8_decoder::decode_and_validate_inplace,
224	};
225	use rstest::rstest;
226
227	#[rstest]
228	#[case(b"\xf1.\xf1\xef\xbd\x9dabcd", "?.?}abcd")]
229	//1111 0000 1001 0000 1000 1101 1111 1111
230	#[case::invalid_incomplete_seq(b"\xf0\x90\x8d\xff", "??")]
231	//1110 0010 1000 0010
232	#[case::invalid_incomplete_seq(b"\xe2\x82", "?")]
233	//1100 0010 1111 1111 1111 0000
234	#[case::invalid_incomplete_seq(b"\xc2\xff\xf0", "??")]
235	//1111 0000 1001 0000 0010 1000 1011 1100
236	#[case::invalid_incomplete_seq(b"\xf0\x90\x28\xbc", "?(?")]
237	fn test_decode_and_validate_inplace(#[case] input: &[u8], #[case] expected: &str) {
238	let mut cfg = Config::default();
239	cfg.set_utf8_convert_bestfit(true);
240	let mut i = Bstr::from(input);
241	let mut flags = 0;
242	let mut response_status_expected_number = HtpUnwanted::Ignore;
243	decode_and_validate_inplace(
244	&cfg.decoder_cfg,
245	&mut flags,
246	&mut response_status_expected_number,
247	&mut i,
248	);
249	assert_eq!(i, Bstr::from(expected));
250	}
251	}

OISF / suricata / 23374838686

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous