• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

randombit / botan / 23197056192

17 Mar 2026 01:39PM UTC coverage: 89.661% (-0.02%) from 89.676%
23197056192

Pull #5459

github

web-flow
Merge 56d311d7a into 588516d8d
Pull Request #5459: Added utf8_to_ucs2 and utf8_to_ucs4

104481 of 116529 relevant lines covered (89.66%)

11676053.16 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.01
/src/lib/utils/charset.cpp
1
/*
2
* Character Set Handling
3
* (C) 1999-2007,2021 Jack Lloyd
4
*
5
* Botan is released under the Simplified BSD License (see license.txt)
6
*/
7

8
#include <botan/internal/charset.h>
9

10
#include <botan/exceptn.h>
11
#include <botan/internal/loadstor.h>
12
#include <sstream>
13

14
namespace Botan {
15

16
namespace {
17

18
void append_utf8_for(std::string& s, uint32_t c) {
29,816✔
19
   if(c >= 0xD800 && c < 0xE000) {
29,816✔
20
      throw Decoding_Error("Invalid Unicode character");
×
21
   }
22

23
   if(c <= 0x7F) {
29,816✔
24
      const uint8_t b0 = static_cast<uint8_t>(c);
29,171✔
25
      s.push_back(static_cast<char>(b0));
29,171✔
26
   } else if(c <= 0x7FF) {
645✔
27
      const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
256✔
28
      const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
256✔
29
      s.push_back(static_cast<char>(b0));
256✔
30
      s.push_back(static_cast<char>(b1));
256✔
31
   } else if(c <= 0xFFFF) {
389✔
32
      const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
359✔
33
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
359✔
34
      const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
359✔
35
      s.push_back(static_cast<char>(b0));
359✔
36
      s.push_back(static_cast<char>(b1));
359✔
37
      s.push_back(static_cast<char>(b2));
359✔
38
   } else if(c <= 0x10FFFF) {
30✔
39
      const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
×
40
      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
×
41
      const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
×
42
      const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
×
43
      s.push_back(static_cast<char>(b0));
×
44
      s.push_back(static_cast<char>(b1));
×
45
      s.push_back(static_cast<char>(b2));
×
46
      s.push_back(static_cast<char>(b3));
×
47
   } else {
48
      throw Decoding_Error("Invalid Unicode character");
30✔
49
   }
50
}
29,786✔
51

52
}  // namespace
53

54
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) {
1,138✔
55
   if(len % 2 != 0) {
1,138✔
56
      throw Decoding_Error("Invalid length for UCS-2 string");
10✔
57
   }
58

59
   const size_t chars = len / 2;
1,128✔
60

61
   std::string s;
1,128✔
62
   for(size_t i = 0; i != chars; ++i) {
2,919✔
63
      const uint32_t c = load_be<uint16_t>(ucs2, i);
1,791✔
64
      append_utf8_for(s, c);
1,791✔
65
   }
66

67
   return s;
1,128✔
68
}
×
69

70
std::vector<uint8_t> utf8_to_ucs2(const std::string& utf8) {
4✔
71
   std::vector<uint8_t> out;
4✔
72
   out.reserve(utf8.size() * 2);
4✔
73

74
   size_t pos = 0;
75
   while(pos < utf8.size()) {
66✔
76
      uint32_t c = static_cast<uint8_t>(utf8[pos++]);
62✔
77
      if(c >= 0x80) {
62✔
78
         if((c & 0xE0) == 0xC0) {
40✔
79
            if(pos >= utf8.size()) {
4✔
80
               throw Decoding_Error("Invalid UTF-8 sequence");
×
81
            }
82
            c = (c & 0x1F) << 6;
4✔
83
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F);
4✔
84
         } else if((c & 0xF0) == 0xE0) {
36✔
85
            if(pos + 1 >= utf8.size()) {
36✔
86
               throw Decoding_Error("Invalid UTF-8 sequence");
×
87
            }
88
            c = (c & 0x0F) << 12;
36✔
89
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F) << 6;
36✔
90
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F);
36✔
91
         } else if((c & 0xF8) == 0xF0) {
×
92
            if(pos + 2 >= utf8.size()) {
×
93
               throw Decoding_Error("Invalid UTF-8 sequence");
×
94
            }
95
            c = (c & 0x07) << 18;
×
96
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F) << 12;
×
97
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F) << 6;
×
98
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F);
×
99
         } else {
100
            throw Decoding_Error("Invalid UTF-8 sequence");
×
101
         }
102
      }
103
      if(c >= 0xD800 && c < 0xE000) {
62✔
104
         throw Decoding_Error("Surrogate pair code point cannot be encoded in UCS-2");
×
105
      }
106
      if(c > 0xFFFF) {
62✔
107
         throw Decoding_Error("Cannot encode character in UCS-2");
×
108
      }
109

110
      const uint16_t val = static_cast<uint16_t>(c);
62✔
111
      out.push_back(static_cast<uint8_t>((val >> 8) & 0xFF));
62✔
112
      out.push_back(static_cast<uint8_t>(val & 0xFF));
62✔
113
   }
114

115
   return out;
4✔
116
}
×
117

118
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) {
82✔
119
   if(len % 4 != 0) {
82✔
120
      throw Decoding_Error("Invalid length for UCS-4 string");
5✔
121
   }
122

123
   const size_t chars = len / 4;
77✔
124

125
   std::string s;
77✔
126
   for(size_t i = 0; i != chars; ++i) {
119✔
127
      const uint32_t c = load_be<uint32_t>(ucs4, i);
72✔
128
      append_utf8_for(s, c);
72✔
129
   }
130

131
   return s;
47✔
132
}
30✔
133

134
std::vector<uint8_t> utf8_to_ucs4(const std::string& utf8) {
1✔
135
   std::vector<uint8_t> out;
1✔
136
   out.reserve(utf8.size() * 4);
1✔
137

138
   size_t pos = 0;
139
   while(pos < utf8.size()) {
25✔
140
      uint32_t c = static_cast<uint8_t>(utf8[pos++]);
24✔
141
      if(c >= 0x80) {
24✔
142
         if((c & 0xE0) == 0xC0) {
4✔
143
            if(pos >= utf8.size()) {
4✔
144
               throw Decoding_Error("Invalid UTF-8 sequence");
×
145
            }
146
            c = (c & 0x1F) << 6;
4✔
147
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F);
4✔
148
         } else if((c & 0xF0) == 0xE0) {
×
149
            if(pos + 1 >= utf8.size()) {
×
150
               throw Decoding_Error("Invalid UTF-8 sequence");
×
151
            }
152
            c = (c & 0x0F) << 12;
×
153
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F) << 6;
×
154
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F);
×
155
         } else if((c & 0xF8) == 0xF0) {
×
156
            if(pos + 2 >= utf8.size()) {
×
157
               throw Decoding_Error("Invalid UTF-8 sequence");
×
158
            }
159
            c = (c & 0x07) << 18;
×
160
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F) << 12;
×
161
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F) << 6;
×
162
            c |= (static_cast<uint8_t>(utf8[pos++]) & 0x3F);
×
163
         } else {
164
            throw Decoding_Error("Invalid UTF-8 sequence");
×
165
         }
166
      }
167

168
      const uint32_t val = static_cast<uint32_t>(c);
24✔
169
      out.push_back(static_cast<uint8_t>((val >> 24) & 0xFF));
24✔
170
      out.push_back(static_cast<uint8_t>((val >> 16) & 0xFF));
24✔
171
      out.push_back(static_cast<uint8_t>((val >> 8) & 0xFF));
24✔
172
      out.push_back(static_cast<uint8_t>(val & 0xFF));
24✔
173
   }
174

175
   return out;
1✔
176
}
×
177

178
/*
179
* Convert from ISO 8859-1 to UTF-8
180
*/
181
std::string latin1_to_utf8(const uint8_t chars[], size_t len) {
1,295✔
182
   std::string s;
1,295✔
183
   for(size_t i = 0; i != len; ++i) {
29,248✔
184
      const uint32_t c = static_cast<uint8_t>(chars[i]);
27,953✔
185
      append_utf8_for(s, c);
27,953✔
186
   }
187
   return s;
1,295✔
188
}
×
189

190
/*
* Return a single-quoted, printable rendering of c
*
* Tab, newline and carriage return are written as the escapes \t, \n
* and \r. Bytes with the high bit set are written as \x followed by
* the value in uppercase hexadecimal. Every other character is
* copied through unchanged.
*/
std::string format_char_for_display(char c) {
   std::ostringstream out;
   out << "'";

   switch(c) {
      case '\t':
         out << "\\t";
         break;
      case '\n':
         out << "\\n";
         break;
      case '\r':
         out << "\\r";
         break;
      default: {
         // Go through unsigned char so the comparison does not depend on char signedness
         const auto byte = static_cast<unsigned char>(c);
         if(byte >= 128) {
            out << "\\x" << std::hex << std::uppercase << static_cast<int>(byte);
         } else {
            out << c;
         }
         break;
      }
   }

   out << "'";
   return out.str();
}
77✔
212

213
}  // namespace Botan
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc