• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OISF / suricata / 23374838686

21 Mar 2026 07:29AM UTC coverage: 59.341% (-20.0%) from 79.315%
23374838686

Pull #15075

github

web-flow
Merge 90b4e834f into 6587e363a
Pull Request #15075: Stack 8001 v16.4

38 of 70 new or added lines in 10 files covered. (54.29%)

34165 existing lines in 563 files now uncovered.

119621 of 201584 relevant lines covered (59.34%)

650666.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.15
/rust/htp/src/utf8_decoder.rs
1
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software
// and associated documentation files (the "Software"), to deal in the Software without restriction,
// including without limitation the rights to use, copy, modify, merge, publish, distribute,
// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
20
use crate::{
21
    bstr::Bstr,
22
    config::{DecoderConfig, HtpUnwanted},
23
    unicode_bestfit_map::UnicodeBestfitMap,
24
    util::{FlagOperations, HtpFlags},
25
};
26

27
/// Strict UTF-8 decoding table (see the DFA reference linked in the file header).
///
/// Layout:
/// * indices `0..256` hold one character class per input byte value;
/// * indices `256..400` hold the state-transition table, addressed as
///   `256 + state * 16 + class`. A resulting state of `0` means "accept" and
///   `1` means "reject"; any other value means the sequence is still incomplete.
///
/// In the code visible in this file only the transition half (index 256 and up)
/// is read; the per-byte classes are taken from `utf8d_allow_overlong` instead.
static utf8d: [u8; 400] = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, 0xb, 0x6, 0x6,
    0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0, 0x1, 0x2, 0x3, 0x5, 0x8,
    0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1,
    1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
];
43
/// Variant of the `utf8d` table whose byte classes tolerate overlong encodings.
///
/// The layout is the same as `utf8d`: indices `0..256` hold one character class
/// per input byte value, followed by a transition table starting at index 256.
/// Compared with `utf8d`, only four byte classes differ: `0xC0` and `0xC1` are
/// class 2 (instead of 8), `0xE0` is class 3 (instead of 0xa) and `0xF0` is
/// class 6 (instead of 0xb), so those lead bytes are treated like their
/// ordinary two-, three- and four-byte neighbours.
///
/// In the code visible in this file only the per-byte classes (indices below
/// 256) are read from this table.
static utf8d_allow_overlong: [u8; 400] = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, 0x6, 0x6, 0x6,
    0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0, 0x1, 0x2, 0x3, 0x5, 0x8,
    0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1,
    1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
];
59

60
#[derive(Clone)]
61
pub(crate) struct Utf8Decoder {
62
    bestfit_map: UnicodeBestfitMap,
63
    state: u32,
64
    seq: u32,
65
    codepoint: u32,
66
    pub(crate) flags: u64,
67
    pub(crate) seen_valid: bool,
68
    pub(crate) decoded_bytes: Vec<u8>,
69
}
70

71
impl Utf8Decoder {
72
    /// Make a new owned Utf8Decoder
73
    pub(crate) fn new(bestfit_map: UnicodeBestfitMap) -> Self {
11,056✔
74
        Self {
11,056✔
75
            bestfit_map,
11,056✔
76
            state: 0,
11,056✔
77
            seq: 0,
11,056✔
78
            codepoint: 0,
11,056✔
79
            flags: 0,
11,056✔
80
            seen_valid: false,
11,056✔
81
            decoded_bytes: Vec::new(),
11,056✔
82
        }
11,056✔
83
    }
11,056✔
84

85
    /// Decode utf8 byte using best-fit map.
86
    fn decode_byte(&mut self, encoded_byte: u8, is_last_byte: bool) {
217,533✔
87
        self.seq = self.seq.wrapping_add(1);
217,533✔
88
        self.decode_byte_allow_overlong(encoded_byte as u32);
217,533✔
89
        match self.state {
217,533✔
90
            0 => {
91
                if self.seq == 1 {
216,376✔
92
                    // ASCII character, which we just copy.
216,251✔
93
                    self.decoded_bytes.push(self.codepoint as u8);
216,251✔
94
                } else {
216,251✔
95
                    // A valid UTF-8 character, which we need to convert.
96
                    self.seen_valid = true;
125✔
97
                    // Check for overlong characters and set the flag accordingly.
125✔
98
                    if (self.seq == 2 && self.codepoint < 0x80)
125✔
99
                        || (self.seq == 3 && self.codepoint < 0x800)
113✔
100
                        || (self.seq == 4 && self.codepoint < 0x10000)
112✔
101
                    {
13✔
102
                        self.flags.set(HtpFlags::PATH_UTF8_OVERLONG);
13✔
103
                    }
112✔
104
                    // Special flag for half-width/full-width evasion.
105
                    if self.codepoint >= 0xff00 && self.codepoint <= 0xffef {
125✔
106
                        self.flags.set(HtpFlags::PATH_HALF_FULL_RANGE)
×
107
                    }
125✔
108
                    // Use best-fit mapping to convert to a single byte.
109
                    self.decoded_bytes.push(self.bestfit_codepoint());
125✔
110
                }
111
                self.seq = 0;
216,376✔
112
            }
113
            1 => {
114
                // Invalid UTF-8 character.
115
                self.flags.set(HtpFlags::PATH_UTF8_INVALID);
749✔
116
                // Output the replacement byte, replacing one or more invalid bytes.
749✔
117
                // If the invalid byte was first in a sequence, consume it. Otherwise,
749✔
118
                // assume it's the starting byte of the next character.
749✔
119
                self.state = 0;
749✔
120
                self.codepoint = 0;
749✔
121
                self.decoded_bytes.push(self.bestfit_map.replacement_byte);
749✔
122
                if self.seq != 1 {
749✔
123
                    self.seq = 0;
215✔
124
                    self.decode_byte(encoded_byte, is_last_byte);
215✔
125
                } else {
534✔
126
                    self.seq = 0;
534✔
127
                }
534✔
128
            }
129
            _ => {
130
                // The character is not yet formed.
131
                if is_last_byte {
408✔
132
                    // If the last input chunk ended with an incomplete byte sequence for a code point,
133
                    // this is an error and a replacement character is emitted hence starting from 1 not 0
134
                    for _ in 1..self.seq {
22✔
135
                        self.decoded_bytes.push(self.bestfit_map.replacement_byte);
1✔
136
                    }
1✔
137
                }
386✔
138
            }
139
        }
140
    }
217,533✔
141

142
    /// Decode a UTF-8 encoded path. Replaces a possibly-invalid utf8 byte stream
143
    /// with an ascii stream, storing the result in self.decoded_bytes. Overlong
144
    /// characters will be decoded and invalid characters will be replaced with
145
    /// the replacement byte specified in the bestfit_map. Best-fit mapping will be used
146
    /// to convert UTF-8 into a single-byte stream.
147
    fn decode_and_validate(&mut self, input: &[u8]) {
11,056✔
148
        //Reset all internals
11,056✔
149
        self.state = 0;
11,056✔
150
        self.seq = 0;
11,056✔
151
        self.codepoint = 0;
11,056✔
152
        self.flags = 0;
11,056✔
153
        self.decoded_bytes.clear();
11,056✔
154
        self.decoded_bytes.reserve(input.len());
11,056✔
155
        self.seen_valid = false;
11,056✔
156
        for (byte, is_last) in input
217,318✔
157
            .iter()
11,056✔
158
            .enumerate()
11,056✔
159
            .map(|(i, b)| (b, i + 1 == input.len()))
217,318✔
160
        {
217,318✔
161
            self.decode_byte(*byte, is_last);
217,318✔
162
        }
217,318✔
163
        // Did the input stream seem like a valid UTF-8 string?
164
        if self.seen_valid && !self.flags.is_set(HtpFlags::PATH_UTF8_INVALID) {
11,056✔
165
            self.flags.set(HtpFlags::PATH_UTF8_VALID)
6✔
166
        }
11,050✔
167
    }
11,056✔
168

169
    /// Process one byte of UTF-8 data and set the code point if one is available. Allows
170
    /// overlong characters in input.
171
    ///
172
    /// Sets the state to ACCEPT(0) for a valid character, REJECT(1) for an invalid character,
173
    ///         or OTHER(u32) if the character has not yet been formed
174
    fn decode_byte_allow_overlong(&mut self, byte: u32) {
217,533✔
175
        let type_0: u32 = utf8d_allow_overlong[byte as usize] as u32;
217,533✔
176
        self.codepoint = if self.state != 0 {
217,533✔
177
            (byte & 0x3f) | (self.codepoint << 6)
386✔
178
        } else {
179
            (0xff >> type_0) & byte
217,147✔
180
        };
181
        self.state = utf8d[(256u32)
217,533✔
182
            .wrapping_add((self.state).wrapping_mul(16))
217,533✔
183
            .wrapping_add(type_0) as usize] as u32;
217,533✔
184
    }
217,533✔
185

186
    /// Convert a Unicode codepoint into a single-byte, using best-fit
187
    /// mapping (as specified in the provided configuration structure).
188
    ///
189
    /// Returns converted single byte
190
    fn bestfit_codepoint(&self) -> u8 {
125✔
191
        // Is it a single-byte codepoint?
125✔
192
        if self.codepoint < 0x100 {
125✔
193
            return self.codepoint as u8;
19✔
194
        }
106✔
195
        self.bestfit_map.get(self.codepoint)
106✔
196
    }
125✔
197
}
198

199
/// Decode a UTF-8 encoded path. Replaces a possibly-invalid utf8 byte stream with
200
/// an ascii stream. Overlong characters will be decoded and invalid characters will
201
/// be replaced with the replacement byte specified in the cfg. Best-fit mapping will
202
/// be used to convert UTF-8 into a single-byte stream. The resulting decoded path will
203
/// be stored in the input path if the transaction cfg indicates it
204
pub(crate) fn decode_and_validate_inplace(
11,056✔
205
    cfg: &DecoderConfig, flags: &mut u64, status: &mut HtpUnwanted, path: &mut Bstr,
11,056✔
206
) {
11,056✔
207
    let mut decoder = Utf8Decoder::new(cfg.bestfit_map);
11,056✔
208
    decoder.decode_and_validate(path.as_slice());
11,056✔
209
    if cfg.utf8_convert_bestfit {
11,056✔
UNCOV
210
        path.clear();
×
UNCOV
211
        path.add(decoder.decoded_bytes.as_slice());
×
212
    }
11,056✔
213
    flags.set(decoder.flags);
11,056✔
214

11,056✔
215
    if flags.is_set(HtpFlags::PATH_UTF8_INVALID) && cfg.utf8_invalid_unwanted != HtpUnwanted::Ignore
11,056✔
216
    {
×
217
        *status = cfg.utf8_invalid_unwanted;
×
218
    }
11,056✔
219
}
11,056✔
220
#[cfg(test)]
mod tests {
    use crate::{
        bstr::Bstr, config::Config, config::HtpUnwanted, utf8_decoder::decode_and_validate_inplace,
    };
    use rstest::rstest;

    /// Runs `decode_and_validate_inplace` with best-fit conversion enabled and
    /// checks the rewritten path. The binary comments spell out the input bytes
    /// of the case that follows.
    #[rstest]
    #[case(b"\xf1.\xf1\xef\xbd\x9dabcd", "?.?}abcd")]
    //1111 0000 1001 0000 1000 1101 1111 1111
    #[case::invalid_incomplete_seq(b"\xf0\x90\x8d\xff", "??")]
    //1110 0010 1000 0010
    #[case::invalid_incomplete_seq(b"\xe2\x82", "?")]
    //1100 0010 1111 1111 1111 0000
    #[case::invalid_incomplete_seq(b"\xc2\xff\xf0", "??")]
    //1111 0000 1001 0000 0010 1000 1011 1100
    #[case::invalid_incomplete_seq(b"\xf0\x90\x28\xbc", "?(?")]
    fn test_decode_and_validate_inplace(#[case] input: &[u8], #[case] expected: &str) {
        let mut cfg = Config::default();
        // Without this the decoder only sets flags and leaves the path untouched.
        cfg.set_utf8_convert_bestfit(true);
        let mut i = Bstr::from(input);
        let mut flags = 0;
        let mut response_status_expected_number = HtpUnwanted::Ignore;
        decode_and_validate_inplace(
            &cfg.decoder_cfg,
            &mut flags,
            &mut response_status_expected_number,
            &mut i,
        );
        assert_eq!(i, Bstr::from(expected));
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc