• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

randombit / botan / 27253840577

08 Jun 2026 09:36PM UTC coverage: 89.367% (+0.01%) from 89.356%
27253840577

push

github

web-flow
Merge pull request #5657 from randombit/jack/ctrl-escaping

Escape control characters in codec error reporting and DN printing

110761 of 123940 relevant lines covered (89.37%)

11052282.15 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.4
/src/lib/utils/charset.cpp
1
/*
2
* Character Set Handling
3
* (C) 1999-2007,2021 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7

8
#include <botan/internal/charset.h>
9

10
#include <botan/exceptn.h>
11
#include <botan/internal/loadstor.h>
12

13
namespace Botan {
14

15
namespace {
16

17
void append_utf8_for(std::string& s, uint32_t c) {
27,994✔
18
   if(c >= 0xD800 && c < 0xE000) {
27,994✔
19
      throw Decoding_Error("Invalid Unicode character");
2✔
20
   }
21

22
   if(c <= 0x7F) {
27,992✔
23
      const uint8_t b0 = static_cast<uint8_t>(c);
27,478✔
24
      s.push_back(static_cast<char>(b0));
27,478✔
25
   } else if(c <= 0x7FF) {
514✔
26
      const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
200✔
27
      const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
200✔
28
      s.push_back(static_cast<char>(b0));
200✔
29
      s.push_back(static_cast<char>(b1));
200✔
30
   } else if(c <= 0xFFFF) {
314✔
31
      const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
292✔
32
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
292✔
33
      const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
292✔
34
      s.push_back(static_cast<char>(b0));
292✔
35
      s.push_back(static_cast<char>(b1));
292✔
36
      s.push_back(static_cast<char>(b2));
292✔
37
   } else if(c <= 0x10FFFF) {
22✔
38
      const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
1✔
39
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
1✔
40
      const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
1✔
41
      const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
1✔
42
      s.push_back(static_cast<char>(b0));
1✔
43
      s.push_back(static_cast<char>(b1));
1✔
44
      s.push_back(static_cast<char>(b2));
1✔
45
      s.push_back(static_cast<char>(b3));
1✔
46
   } else {
47
      throw Decoding_Error("Invalid Unicode character");
21✔
48
   }
49
}
27,971✔
50

51
}  // namespace
52

53
uint32_t next_utf8_codepoint(std::string_view utf8, size_t& pos) {
710,333✔
54
   auto read_continuation = [&]() -> uint32_t {
713,170✔
55
      if(pos >= utf8.size()) {
2,837✔
56
         throw Decoding_Error("Invalid UTF-8 sequence");
5✔
57
      }
58
      const uint8_t b = static_cast<uint8_t>(utf8[pos++]);
2,832✔
59
      if((b & 0xC0) != 0x80) {
2,832✔
60
         throw Decoding_Error("Invalid UTF-8 sequence");
9✔
61
      }
62
      return b & 0x3F;
2,823✔
63
   };
710,333✔
64

65
   const uint8_t lead = static_cast<uint8_t>(utf8[pos++]);
710,333✔
66
   uint32_t c = 0;
710,333✔
67

68
   if(lead <= 0x7F) {
710,333✔
69
      c = lead;
707,561✔
70
   } else if((lead & 0xE0) == 0xC0) {
2,772✔
71
      c = (lead & 0x1F) << 6;
2,692✔
72
      c |= read_continuation();
2,692✔
73
      if(c < 0x80) {
2,685✔
74
         throw Decoding_Error("Overlong UTF-8 sequence");
1✔
75
      }
76
   } else if((lead & 0xF0) == 0xE0) {
80✔
77
      c = (lead & 0x0F) << 12;
53✔
78
      c |= read_continuation() << 6;
53✔
79
      c |= read_continuation();
53✔
80
      if(c < 0x800) {
50✔
81
         throw Decoding_Error("Overlong UTF-8 sequence");
3✔
82
      }
83
   } else if((lead & 0xF8) == 0xF0) {
27✔
84
      c = (lead & 0x07) << 18;
14✔
85
      c |= read_continuation() << 12;
14✔
86
      c |= read_continuation() << 6;
13✔
87
      c |= read_continuation();
12✔
88
      if(c < 0x10000) {
10✔
89
         throw Decoding_Error("Overlong UTF-8 sequence");
3✔
90
      }
91
   } else {
92
      throw Decoding_Error("Invalid UTF-8 sequence");
13✔
93
   }
94

95
   if(c > 0x10FFFF) {
710,299✔
96
      throw Decoding_Error("UTF-8 sequence encodes value outside Unicode range");
3✔
97
   }
98
   if(c >= 0xD800 && c < 0xE000) {
710,296✔
99
      throw Decoding_Error("UTF-8 sequence encodes surrogate code point");
3✔
100
   }
101

102
   return c;
710,293✔
103
}
104

105
bool is_valid_utf8(std::string_view utf8) {
40,744✔
106
   try {
40,744✔
107
      size_t pos = 0;
40,744✔
108
      while(pos < utf8.size()) {
722,542✔
109
         const uint32_t c = next_utf8_codepoint(utf8, pos);
681,816✔
110
         BOTAN_UNUSED(c);
111
      }
112
   } catch(Decoding_Error&) {
18✔
113
      return false;
18✔
114
   }
18✔
115
   return true;
40,726✔
116
}
117

118
/*
* Convert a UCS-2 string, given as a sequence of 16-bit units read with
* load_be, into UTF-8.
*
* Throws Decoding_Error if the input length is not a multiple of two bytes,
* or if append_utf8_for rejects one of the units (eg a surrogate value).
*/
std::string ucs2_to_utf8(std::span<const uint8_t> ucs2) {
   constexpr size_t unit_bytes = sizeof(uint16_t);

   if((ucs2.size() % unit_bytes) != 0) {
      throw Decoding_Error("Invalid length for UCS-2 string");
   }

   std::string utf8;
   const size_t unit_count = ucs2.size() / unit_bytes;

   for(size_t idx = 0; idx < unit_count; ++idx) {
      const uint16_t unit = load_be<uint16_t>(ucs2.data(), idx);
      append_utf8_for(utf8, static_cast<uint32_t>(unit));
   }

   return utf8;
}
1✔
133

134
std::vector<uint8_t> utf8_to_ucs2(std::string_view utf8) {
44✔
135
   std::vector<uint8_t> out;
44✔
136
   out.reserve(utf8.size() * 2);
44✔
137

138
   size_t pos = 0;
44✔
139
   while(pos < utf8.size()) {
252✔
140
      const uint32_t c = next_utf8_codepoint(utf8, pos);
221✔
141
      if(c > 0xFFFF) {
209✔
142
         throw Decoding_Error("Cannot encode character in UCS-2");
1✔
143
      }
144
      const uint16_t val = static_cast<uint16_t>(c);
208✔
145
      out.push_back(get_byte<0>(val));
208✔
146
      out.push_back(get_byte<1>(val));
208✔
147
   }
148

149
   return out;
31✔
150
}
13✔
151

152
/*
* Convert a UCS-4 string, given as a sequence of 32-bit units read with
* load_be, into UTF-8.
*
* Throws Decoding_Error if the input length is not a multiple of four bytes,
* or if append_utf8_for rejects one of the values (a surrogate, or a value
* above 0x10FFFF).
*/
std::string ucs4_to_utf8(std::span<const uint8_t> ucs4) {
   constexpr size_t unit_bytes = sizeof(uint32_t);

   if((ucs4.size() % unit_bytes) != 0) {
      throw Decoding_Error("Invalid length for UCS-4 string");
   }

   std::string utf8;
   const size_t unit_count = ucs4.size() / unit_bytes;

   for(size_t idx = 0; idx < unit_count; ++idx) {
      const uint32_t cp = load_be<uint32_t>(ucs4.data(), idx);
      append_utf8_for(utf8, cp);
   }

   return utf8;
}
22✔
167

168
std::vector<uint8_t> utf8_to_ucs4(std::string_view utf8) {
21✔
169
   std::vector<uint8_t> out;
21✔
170
   out.reserve(utf8.size() * 4);
21✔
171

172
   size_t pos = 0;
21✔
173
   while(pos < utf8.size()) {
54✔
174
      const uint32_t val = next_utf8_codepoint(utf8, pos);
43✔
175
      out.push_back(get_byte<0>(val));
33✔
176
      out.push_back(get_byte<1>(val));
33✔
177
      out.push_back(get_byte<2>(val));
33✔
178
      out.push_back(get_byte<3>(val));
33✔
179
   }
180

181
   return out;
11✔
182
}
10✔
183

184
/*
185
* Convert from ISO 8859-1 to UTF-8
186
*/
187
std::string latin1_to_utf8(std::span<const uint8_t> chars) {
1,196✔
188
   std::string s;
1,196✔
189
   for(const uint8_t b : chars) {
27,610✔
190
      append_utf8_for(s, static_cast<uint32_t>(b));
26,414✔
191
   }
192
   return s;
1,196✔
193
}
×
194

195
/*
* Return true if c, viewed as an unsigned byte, is below 0x20 or is 0x7F.
* Bytes of 0x80 and above are not considered control characters here.
*/
bool is_ascii_control_char(char c) {
   const auto code = static_cast<uint8_t>(c);
   if(code == 0x7F) {
      return true;
   }
   return code <= 0x1F;
}
199

200
/*
* Return true if cp is below 0x20, or lies in the range 0x7F through 0x9F
* inclusive.
*/
bool is_unicode_control_char(uint32_t cp) {
   const bool below_space = (cp <= 0x1F);
   const bool del_through_9f = (cp >= 0x7F) && (cp <= 0x9F);
   return below_space || del_through_9f;
}
203

204
std::string escape_control_chars(std::string_view utf8) {
426✔
205
   std::string out;
426✔
206
   out.reserve(utf8.size());
426✔
207

208
   const auto append_hex_escape = [&](uint8_t b) {
452✔
209
      out += "\\x";
26✔
210
      out += nibble_to_hex(b >> 4);
52✔
211
      out += nibble_to_hex(b);
52✔
212
   };
452✔
213

214
   size_t pos = 0;
426✔
215
   while(pos < utf8.size()) {
18,696✔
216
      const size_t start = pos;
18,270✔
217

218
      uint32_t cp = 0;
18,270✔
219
      try {
18,270✔
220
         cp = next_utf8_codepoint(utf8, pos);
18,270✔
221
      } catch(const Decoding_Error&) {
×
222
         // Not valid UTF-8: escape the offending byte and resume
223
         append_hex_escape(static_cast<uint8_t>(utf8[start]));
×
224
         pos = start + 1;
×
225
         continue;
×
226
      }
×
227

228
      if(is_unicode_control_char(cp)) {
18,270✔
229
         for(size_t i = start; i < pos; ++i) {
49✔
230
            append_hex_escape(static_cast<uint8_t>(utf8[i]));
26✔
231
         }
232
      } else {
233
         out.append(utf8.substr(start, pos - start));
36,494✔
234
      }
235
   }
236

237
   return out;
426✔
238
}
×
239

240
std::string format_char_for_display(char c) {
84✔
241
   std::string out;
84✔
242
   out += '\'';
84✔
243

244
   if(c == '\t') {
84✔
245
      out += "\\t";
15✔
246
   } else if(c == '\n') {
69✔
247
      out += "\\n";
15✔
248
   } else if(c == '\r') {
54✔
249
      out += "\\r";
15✔
250
   } else if(is_ascii_control_char(c) || static_cast<uint8_t>(c) >= 0x80) {
39✔
251
      const auto b = static_cast<uint8_t>(c);
17✔
252
      out += "\\x";
17✔
253
      out += nibble_to_hex(b >> 4);
34✔
254
      out += nibble_to_hex(b);
34✔
255
   } else {
256
      out += c;
22✔
257
   }
258

259
   out += '\'';
84✔
260

261
   return out;
84✔
262
}
×
263

264
}  // namespace Botan
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc