• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

randombit / botan / 23413161931

22 Mar 2026 09:41PM UTC coverage: 89.428% (+0.004%) from 89.424%
23413161931

push

github

web-flow
Merge pull request #5459 from dmazzella/charset

Added utf8_to_ucs2 and utf8_to_ucs4

104781 of 117168 relevant lines covered (89.43%)

11754657.04 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.79
/src/lib/utils/charset.cpp
1
/*
2
* Character Set Handling
3
* (C) 1999-2007,2021 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7

8
#include <botan/internal/charset.h>
9

10
#include <botan/exceptn.h>
11
#include <botan/internal/loadstor.h>
12
#include <sstream>
13

14
namespace Botan {
15

16
namespace {
17

18
void append_utf8_for(std::string& s, uint32_t c) {
29,816✔
19
   if(c >= 0xD800 && c < 0xE000) {
29,816✔
20
      throw Decoding_Error("Invalid Unicode character");
×
21
   }
22

23
   if(c <= 0x7F) {
29,816✔
24
      const uint8_t b0 = static_cast<uint8_t>(c);
29,171✔
25
      s.push_back(static_cast<char>(b0));
29,171✔
26
   } else if(c <= 0x7FF) {
645✔
27
      const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
256✔
28
      const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
256✔
29
      s.push_back(static_cast<char>(b0));
256✔
30
      s.push_back(static_cast<char>(b1));
256✔
31
   } else if(c <= 0xFFFF) {
389✔
32
      const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
359✔
33
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
359✔
34
      const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
359✔
35
      s.push_back(static_cast<char>(b0));
359✔
36
      s.push_back(static_cast<char>(b1));
359✔
37
      s.push_back(static_cast<char>(b2));
359✔
38
   } else if(c <= 0x10FFFF) {
30✔
39
      const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
×
40
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
×
41
      const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
×
42
      const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
×
43
      s.push_back(static_cast<char>(b0));
×
44
      s.push_back(static_cast<char>(b1));
×
45
      s.push_back(static_cast<char>(b2));
×
46
      s.push_back(static_cast<char>(b3));
×
47
   } else {
48
      throw Decoding_Error("Invalid Unicode character");
30✔
49
   }
50
}
29,786✔
51

52
uint32_t next_utf8_codepoint(const std::string& utf8, size_t& pos) {
124✔
53
   auto read_continuation = [&]() -> uint32_t {
278✔
54
      if(pos >= utf8.size()) {
154✔
55
         throw Decoding_Error("Invalid UTF-8 sequence");
3✔
56
      }
57
      const uint8_t b = static_cast<uint8_t>(utf8[pos++]);
151✔
58
      if((b & 0xC0) != 0x80) {
151✔
59
         throw Decoding_Error("Invalid UTF-8 sequence");
4✔
60
      }
61
      return b & 0x3F;
147✔
62
   };
124✔
63

64
   const uint8_t lead = static_cast<uint8_t>(utf8[pos++]);
124✔
65
   uint32_t c = 0;
124✔
66

67
   if(lead <= 0x7F) {
124✔
68
      c = lead;
42✔
69
   } else if((lead & 0xE0) == 0xC0) {
82✔
70
      c = (lead & 0x1F) << 6;
17✔
71
      c |= read_continuation();
17✔
72
      if(c < 0x80) {
13✔
73
         throw Decoding_Error("Overlong UTF-8 sequence");
1✔
74
      }
75
   } else if((lead & 0xF0) == 0xE0) {
65✔
76
      c = (lead & 0x0F) << 12;
52✔
77
      c |= read_continuation() << 6;
52✔
78
      c |= read_continuation();
52✔
79
      if(c < 0x800) {
50✔
80
         throw Decoding_Error("Overlong UTF-8 sequence");
3✔
81
      }
82
   } else if((lead & 0xF8) == 0xF0) {
13✔
83
      c = (lead & 0x07) << 18;
11✔
84
      c |= read_continuation() << 12;
11✔
85
      c |= read_continuation() << 6;
11✔
86
      c |= read_continuation();
11✔
87
      if(c < 0x10000) {
10✔
88
         throw Decoding_Error("Overlong UTF-8 sequence");
3✔
89
      }
90
   } else {
91
      throw Decoding_Error("Invalid UTF-8 sequence");
2✔
92
   }
93

94
   if(c > 0x10FFFF) {
108✔
95
      throw Decoding_Error("UTF-8 sequence encodes value outside Unicode range");
3✔
96
   }
97
   if(c >= 0xD800 && c < 0xE000) {
105✔
98
      throw Decoding_Error("UTF-8 sequence encodes surrogate code point");
3✔
99
   }
100

101
   return c;
102✔
102
}
103

104
}  // namespace
105

106
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) {
1,138✔
107
   if(len % 2 != 0) {
1,138✔
108
      throw Decoding_Error("Invalid length for UCS-2 string");
10✔
109
   }
110

111
   const size_t chars = len / 2;
1,128✔
112

113
   std::string s;
1,128✔
114
   for(size_t i = 0; i != chars; ++i) {
2,919✔
115
      const uint32_t c = load_be<uint16_t>(ucs2, i);
1,791✔
116
      append_utf8_for(s, c);
1,791✔
117
   }
118

119
   return s;
1,128✔
120
}
×
121

122
std::vector<uint8_t> utf8_to_ucs2(const std::string& utf8) {
24✔
123
   std::vector<uint8_t> out;
24✔
124
   out.reserve(utf8.size() * 2);
24✔
125

126
   size_t pos = 0;
24✔
127
   while(pos < utf8.size()) {
92✔
128
      const uint32_t c = next_utf8_codepoint(utf8, pos);
81✔
129
      if(c > 0xFFFF) {
69✔
130
         throw Decoding_Error("Cannot encode character in UCS-2");
1✔
131
      }
132
      const uint16_t val = static_cast<uint16_t>(c);
68✔
133
      out.push_back(get_byte<0>(val));
68✔
134
      out.push_back(get_byte<1>(val));
68✔
135
   }
136

137
   return out;
11✔
138
}
13✔
139

140
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) {
82✔
141
   if(len % 4 != 0) {
82✔
142
      throw Decoding_Error("Invalid length for UCS-4 string");
5✔
143
   }
144

145
   const size_t chars = len / 4;
77✔
146

147
   std::string s;
77✔
148
   for(size_t i = 0; i != chars; ++i) {
119✔
149
      const uint32_t c = load_be<uint32_t>(ucs4, i);
72✔
150
      append_utf8_for(s, c);
72✔
151
   }
152

153
   return s;
47✔
154
}
30✔
155

156
std::vector<uint8_t> utf8_to_ucs4(const std::string& utf8) {
21✔
157
   std::vector<uint8_t> out;
21✔
158
   out.reserve(utf8.size() * 4);
21✔
159

160
   size_t pos = 0;
21✔
161
   while(pos < utf8.size()) {
54✔
162
      const uint32_t val = next_utf8_codepoint(utf8, pos);
43✔
163
      out.push_back(get_byte<0>(val));
33✔
164
      out.push_back(get_byte<1>(val));
33✔
165
      out.push_back(get_byte<2>(val));
33✔
166
      out.push_back(get_byte<3>(val));
33✔
167
   }
168

169
   return out;
11✔
170
}
10✔
171

172
/*
173
* Convert from ISO 8859-1 to UTF-8
174
*/
175
std::string latin1_to_utf8(const uint8_t chars[], size_t len) {
1,295✔
176
   std::string s;
1,295✔
177
   for(size_t i = 0; i != len; ++i) {
29,248✔
178
      const uint32_t c = static_cast<uint8_t>(chars[i]);
27,953✔
179
      append_utf8_for(s, c);
27,953✔
180
   }
181
   return s;
1,295✔
182
}
×
183

184
/*
* Render a single character, wrapped in single quotes, for display.
*
* Tab, newline and carriage return are shown as the two-character
* escapes \t, \n and \r. Bytes with value 128 or above are shown as \x
* followed by two uppercase hex digits. Any other character is copied
* through unchanged.
*/
std::string format_char_for_display(char c) {
   const unsigned char byte = static_cast<unsigned char>(c);

   std::string shown = "'";

   switch(c) {
      case '\t':
         shown += "\\t";
         break;
      case '\n':
         shown += "\\n";
         break;
      case '\r':
         shown += "\\r";
         break;
      default:
         if(byte < 128) {
            shown.push_back(c);
         } else {
            // byte is in 0x80..0xFF, so it always has exactly two hex digits
            constexpr char hex_digits[] = "0123456789ABCDEF";
            shown += "\\x";
            shown.push_back(hex_digits[byte >> 4]);
            shown.push_back(hex_digits[byte & 0x0F]);
         }
         break;
   }

   shown += "'";

   return shown;
}
77✔
206

207
}  // namespace Botan
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc