• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

randombit / botan / 23234035284

18 Mar 2026 07:35AM UTC coverage: 89.676% (+0.002%) from 89.674%
23234035284

Pull #5459

github

web-flow
Merge ad389837e into a7cbacbd5
Pull Request #5459: Added utf8_to_ucs2 and utf8_to_ucs4

104505 of 116536 relevant lines covered (89.68%)

11690135.13 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.79
/src/lib/utils/charset.cpp
1
/*
2
* Character Set Handling
3
* (C) 1999-2007,2021 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7

8
#include <botan/internal/charset.h>
9

10
#include <botan/exceptn.h>
11
#include <botan/internal/loadstor.h>
12
#include <sstream>
13

14
namespace Botan {
15

16
namespace {
17

18
void append_utf8_for(std::string& s, uint32_t c) {
29,816✔
19
   if(c >= 0xD800 && c < 0xE000) {
29,816✔
20
      throw Decoding_Error("Invalid Unicode character");
×
21
   }
22

23
   if(c <= 0x7F) {
29,816✔
24
      const uint8_t b0 = static_cast<uint8_t>(c);
29,171✔
25
      s.push_back(static_cast<char>(b0));
29,171✔
26
   } else if(c <= 0x7FF) {
645✔
27
      const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
256✔
28
      const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
256✔
29
      s.push_back(static_cast<char>(b0));
256✔
30
      s.push_back(static_cast<char>(b1));
256✔
31
   } else if(c <= 0xFFFF) {
389✔
32
      const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
359✔
33
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
359✔
34
      const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
359✔
35
      s.push_back(static_cast<char>(b0));
359✔
36
      s.push_back(static_cast<char>(b1));
359✔
37
      s.push_back(static_cast<char>(b2));
359✔
38
   } else if(c <= 0x10FFFF) {
30✔
39
      const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
×
40
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
×
41
      const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
×
42
      const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
×
43
      s.push_back(static_cast<char>(b0));
×
44
      s.push_back(static_cast<char>(b1));
×
45
      s.push_back(static_cast<char>(b2));
×
46
      s.push_back(static_cast<char>(b3));
×
47
   } else {
48
      throw Decoding_Error("Invalid Unicode character");
30✔
49
   }
50
}
29,786✔
51

52
uint32_t next_utf8_codepoint(const std::string& utf8, size_t& pos) {
102✔
53
   auto read_continuation = [&]() -> uint32_t {
212✔
54
      if(pos >= utf8.size()) {
110✔
55
         throw Decoding_Error("Invalid UTF-8 sequence");
3✔
56
      }
57
      const uint8_t b = static_cast<uint8_t>(utf8[pos++]);
107✔
58
      if((b & 0xC0) != 0x80) {
107✔
59
         throw Decoding_Error("Invalid UTF-8 sequence");
2✔
60
      }
61
      return b & 0x3F;
105✔
62
   };
102✔
63

64
   const uint8_t lead = static_cast<uint8_t>(utf8[pos++]);
102✔
65
   uint32_t c = 0;
102✔
66

67
   if(lead <= 0x7F) {
102✔
68
      c = lead;
42✔
69
   } else if((lead & 0xE0) == 0xC0) {
60✔
70
      c = (lead & 0x1F) << 6;
11✔
71
      c |= read_continuation();
11✔
72
      if(c < 0x80) {
9✔
73
         throw Decoding_Error("Overlong UTF-8 sequence");
1✔
74
      }
75
   } else if((lead & 0xF0) == 0xE0) {
49✔
76
      c = (lead & 0x0F) << 12;
42✔
77
      c |= read_continuation() << 6;
42✔
78
      c |= read_continuation();
42✔
79
      if(c < 0x800) {
40✔
80
         throw Decoding_Error("Overlong UTF-8 sequence");
1✔
81
      }
82
   } else if((lead & 0xF8) == 0xF0) {
7✔
83
      c = (lead & 0x07) << 18;
5✔
84
      c |= read_continuation() << 12;
5✔
85
      c |= read_continuation() << 6;
5✔
86
      c |= read_continuation();
5✔
87
      if(c < 0x10000) {
4✔
88
         throw Decoding_Error("Overlong UTF-8 sequence");
1✔
89
      }
90
   } else {
91
      throw Decoding_Error("Invalid UTF-8 sequence");
2✔
92
   }
93

94
   if(c > 0x10FFFF) {
92✔
95
      throw Decoding_Error("UTF-8 sequence encodes value outside Unicode range");
1✔
96
   }
97
   if(c >= 0xD800 && c < 0xE000) {
91✔
98
      throw Decoding_Error("UTF-8 sequence encodes surrogate code point");
3✔
99
   }
100

101
   return c;
88✔
102
}
103

104
}  // namespace
105

106
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) {
1,138✔
107
   if(len % 2 != 0) {
1,138✔
108
      throw Decoding_Error("Invalid length for UCS-2 string");
10✔
109
   }
110

111
   const size_t chars = len / 2;
1,128✔
112

113
   std::string s;
1,128✔
114
   for(size_t i = 0; i != chars; ++i) {
2,919✔
115
      const uint32_t c = load_be<uint16_t>(ucs2, i);
1,791✔
116
      append_utf8_for(s, c);
1,791✔
117
   }
118

119
   return s;
1,128✔
120
}
×
121

122
std::vector<uint8_t> utf8_to_ucs2(const std::string& utf8) {
13✔
123
   std::vector<uint8_t> out;
13✔
124
   out.reserve(utf8.size() * 2);
13✔
125

126
   size_t pos = 0;
13✔
127
   while(pos < utf8.size()) {
75✔
128
      const uint32_t c = next_utf8_codepoint(utf8, pos);
71✔
129
      if(c > 0xFFFF) {
63✔
130
         throw Decoding_Error("Cannot encode character in UCS-2");
1✔
131
      }
132
      const uint16_t val = static_cast<uint16_t>(c);
62✔
133
      out.push_back(static_cast<uint8_t>((val >> 8) & 0xFF));
62✔
134
      out.push_back(static_cast<uint8_t>(val & 0xFF));
62✔
135
   }
136

137
   return out;
4✔
138
}
9✔
139

140
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) {
82✔
141
   if(len % 4 != 0) {
82✔
142
      throw Decoding_Error("Invalid length for UCS-4 string");
5✔
143
   }
144

145
   const size_t chars = len / 4;
77✔
146

147
   std::string s;
77✔
148
   for(size_t i = 0; i != chars; ++i) {
119✔
149
      const uint32_t c = load_be<uint32_t>(ucs4, i);
72✔
150
      append_utf8_for(s, c);
72✔
151
   }
152

153
   return s;
47✔
154
}
30✔
155

156
std::vector<uint8_t> utf8_to_ucs4(const std::string& utf8) {
8✔
157
   std::vector<uint8_t> out;
8✔
158
   out.reserve(utf8.size() * 4);
8✔
159

160
   size_t pos = 0;
8✔
161
   while(pos < utf8.size()) {
33✔
162
      const uint32_t val = next_utf8_codepoint(utf8, pos);
31✔
163
      out.push_back(static_cast<uint8_t>((val >> 24) & 0xFF));
25✔
164
      out.push_back(static_cast<uint8_t>((val >> 16) & 0xFF));
25✔
165
      out.push_back(static_cast<uint8_t>((val >> 8) & 0xFF));
25✔
166
      out.push_back(static_cast<uint8_t>(val & 0xFF));
25✔
167
   }
168

169
   return out;
2✔
170
}
6✔
171

172
/*
173
* Convert from ISO 8859-1 to UTF-8
174
*/
175
std::string latin1_to_utf8(const uint8_t chars[], size_t len) {
1,295✔
176
   std::string s;
1,295✔
177
   for(size_t i = 0; i != len; ++i) {
29,248✔
178
      const uint32_t c = static_cast<uint8_t>(chars[i]);
27,953✔
179
      append_utf8_for(s, c);
27,953✔
180
   }
181
   return s;
1,295✔
182
}
×
183

184
/*
* Format a single character for inclusion in a human readable message
*
* The result is always wrapped in single quotes. Tab, newline and
* carriage return are written as \t, \n and \r. Any other byte that is
* not printable ASCII (below 0x20, equal to 0x7F, or 0x80 and above) is
* written as \x followed by exactly two uppercase hex digits, so that
* raw control characters never end up in the returned string. All
* remaining characters are copied through unchanged.
*/
std::string format_char_for_display(char c) {
   static constexpr char HEX_DIGITS[] = "0123456789ABCDEF";

   const unsigned char byte = static_cast<unsigned char>(c);

   std::string out = "'";

   if(c == '\t') {
      out += "\\t";
   } else if(c == '\n') {
      out += "\\n";
   } else if(c == '\r') {
      out += "\\r";
   } else if(byte < 0x20 || byte >= 0x7F) {
      // Control characters, DEL and non-ASCII bytes are hex escaped
      out += "\\x";
      out.push_back(HEX_DIGITS[byte >> 4]);
      out.push_back(HEX_DIGITS[byte & 0x0F]);
   } else {
      out.push_back(c);
   }

   out += "'";

   return out;
}
77✔
206

207
}  // namespace Botan
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc