• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

libbitcoin / libbitcoin-system / 22814433198

08 Mar 2026 05:06AM UTC coverage: 81.289% (+0.07%) from 81.216%
22814433198

Pull #1791

github

web-flow
Merge 236b2927b into 032c34e64
Pull Request #1791: [RFC] Replace ICU dependency with embedded Unicode tables

139 of 150 new or added lines in 1 file covered. (92.67%)

30 existing lines in 3 files now uncovered.

10987 of 13516 relevant lines covered (81.29%)

3503968.18 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.06
/src/unicode/normalization.cpp
1
/**
2
 * Copyright (c) 2011-2026 libbitcoin-system developers (see AUTHORS)
3
 *
4
 * This file is part of libbitcoin.
5
 *
6
 * This program is free software: you can redistribute it and/or modify
7
 * it under the terms of the GNU Affero General Public License as published by
8
 * the Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * This program is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU Affero General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Affero General Public License
17
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 */
19
#include <bitcoin/system/unicode/normalization.hpp>
20

21
#include <algorithm>
22
#include <bitcoin/system/data/data.hpp>
23
#include <bitcoin/system/math/math.hpp>
24
#include <bitcoin/system/unicode/ascii.hpp>
25
#include <bitcoin/system/unicode/code_points.hpp>
26
#include <bitcoin/system/unicode/conversion.hpp>
27
#include <bitcoin/system/unicode/unicode_tables.hpp>
28

29
namespace libbitcoin {
30
namespace system {
31

32
// Local helpers.
33
// ----------------------------------------------------------------------------
34

35
constexpr bool is_contained(char32_t value,
473,784,252✔
36
    const char32_interval& interval) NOEXCEPT
37
{
38
    return interval.first <= value && value <= interval.second;
473,784,252✔
39
}
40

41
// Hangul syllable constants (algorithmic decomposition/composition).
42
// unicode.org/reports/tr15/#Hangul
43
constexpr char32_t hangul_sbase  = 0xAC00u;
44
constexpr char32_t hangul_lbase  = 0x1100u;
45
constexpr char32_t hangul_vbase  = 0x1161u;
46
constexpr char32_t hangul_tbase  = 0x11A7u;
47
constexpr uint32_t hangul_lcount = 19u;
48
constexpr uint32_t hangul_vcount = 21u;
49
constexpr uint32_t hangul_tcount = 28u;
50
constexpr uint32_t hangul_ncount = hangul_vcount * hangul_tcount; // 588
51
constexpr uint32_t hangul_scount = hangul_lcount * hangul_ncount; // 11172
52

53
// Unicode normalization helpers.
54
// ----------------------------------------------------------------------------
55

56
// Get canonical combining class (0 = starter).
57
static uint8_t get_ccc(char32_t point) NOEXCEPT
45,287✔
58
{
59
    if (point > 0x10FFFFu)
45,287✔
60
        return 0;
61
    const auto data1 = unicode_data1[(point >> 7)];
45,287✔
62
    const auto data2 = unicode_data2[(data1 << 7) + (point & 0x7Fu)];
45,287✔
63
    return combining_index[data2];
45,287✔
64
}
65

66
// Decompose one code point into out (recursive, single-step per table entry).
67
static void decompose_one(std::u32string& out, char32_t cp,
41,306✔
68
    bool compat) NOEXCEPT
69
{
70
    // Hangul: algorithmic decomposition (same for NFD and NFKD).
71
    if (cp >= hangul_sbase && cp < hangul_sbase + hangul_scount)
41,306✔
72
    {
73
        const auto si = static_cast<uint32_t>(cp - hangul_sbase);
5✔
74
        out.push_back(hangul_lbase + si / hangul_ncount);
5✔
75
        out.push_back(hangul_vbase + (si % hangul_ncount) / hangul_tcount);
5✔
76
        const auto ti = si % hangul_tcount;
5✔
77
        if (ti != 0u)
5✔
78
            out.push_back(hangul_tbase + ti);
2✔
79
        return;
5✔
80
    }
81

82
    // Two-level trie lookup.
83
    if (cp > 0x10FFFFu)
41,301✔
84
    {
NEW
UNCOV
85
        out.push_back(cp);
×
NEW
86
        return;
×
87
    }
88
    const auto block = decomp_index1[cp >> decomp_shift];
41,301✔
89
    const auto idx   = decomp_index2[
41,301✔
90
        (static_cast<size_t>(block) << decomp_shift) + (cp & 0x7Fu)];
41,301✔
91

92
    if (idx == 0u)
41,301✔
93
    {
94
        out.push_back(cp);
39,076✔
95
        return;
39,076✔
96
    }
97

98
    const auto header    = decomp_pool[idx];
2,225✔
99
    const auto is_compat = (header & 0x80000000u) != 0u;
2,225✔
100
    const auto count     = (header >> 24) & 0x7Fu;
2,225✔
101

102
    // In NFD mode skip compatibility-only mappings.
103
    if (is_compat && !compat)
2,225✔
104
    {
105
        out.push_back(cp);
2✔
106
        return;
2✔
107
    }
108

109
    // Recurse into each component.
110
    for (uint32_t i = 0u; i < count; ++i)
6,762✔
111
        decompose_one(out, static_cast<char32_t>(decomp_pool[idx + 1u + i]),
4,539✔
112
            compat);
113
}
114

115
// Apply canonical ordering (UAX #15 section 3.11):
116
// Within each "combining sequence" (runs of CCC > 0), sort by CCC, stable.
117
static void canonical_order(std::u32string& seq) NOEXCEPT
6,647✔
118
{
119
    const auto n = seq.size();
6,647✔
120
    if (n < 2u)
6,647✔
121
        return;
122

123
    // Identify runs of combining characters and stable-sort each.
124
    size_t run_start = 0u;
125
    while (run_start < n)
15,058✔
126
    {
127
        // Skip starters.
128
        while (run_start < n && get_ccc(seq[run_start]) == 0u)
43,859✔
129
            ++run_start;
34,740✔
130

131
        // Find end of this combining run.
132
        size_t run_end = run_start;
9,119✔
133
        while (run_end < n && get_ccc(seq[run_end]) != 0u)
12,761✔
134
            ++run_end;
3,642✔
135

136
        if (run_end > run_start + 1u)
9,119✔
137
        {
138
            std::stable_sort(seq.begin() + run_start, seq.begin() + run_end,
27✔
139
                [](char32_t a, char32_t b) NOEXCEPT
40✔
140
                {
141
                    return get_ccc(a) < get_ccc(b);
40✔
142
                });
143
        }
144
        run_start = run_end;
145
    }
146
}
147

148
// Canonical composition lookup (returns 0 if no composition found).
149
static char32_t comp_lookup(char32_t a, char32_t b) NOEXCEPT
21✔
150
{
151
    // Hangul L + V → LV
152
    if (a >= hangul_lbase && a < hangul_lbase + hangul_lcount &&
21✔
153
        b >= hangul_vbase && b < hangul_vbase + hangul_vcount)
4✔
154
    {
155
        return hangul_sbase
4✔
156
            + (a - hangul_lbase) * hangul_ncount
157
            + (b - hangul_vbase) * hangul_tcount;
4✔
158
    }
159

160
    // Hangul LV + T → LVT
161
    if (a >= hangul_sbase && a < hangul_sbase + hangul_scount &&
17✔
162
        (a - hangul_sbase) % hangul_tcount == 0u &&
3✔
163
        b >  hangul_tbase  && b < hangul_tbase + hangul_tcount)
3✔
164
    {
165
        return a + (b - hangul_tbase);
2✔
166
    }
167

168
    // Binary search in comp_pairs (sorted by [a, b]).
169
    size_t lo = 0u, hi = comp_pairs_count;
170
    while (lo < hi)
156✔
171
    {
172
        const size_t mid  = (lo + hi) / 2u;
147✔
173
        const size_t base = mid * 3u;
147✔
174
        const auto   ca   = comp_pairs[base];
147✔
175
        const auto   cb   = comp_pairs[base + 1u];
147✔
176
        if (ca < a || (ca == a && cb < b))
147✔
177
            lo = mid + 1u;
62✔
178
        else if (ca > a || (ca == a && cb > b))
85✔
179
            hi = mid;
180
        else
181
            return static_cast<char32_t>(comp_pairs[base + 2u]);
6✔
182
    }
183
    return 0u;
184
}
185

186
// Apply canonical composition pass (UAX #15 canonical composition algorithm).
187
static void compose(std::u32string& seq) NOEXCEPT
7✔
188
{
189
    const size_t n = seq.size();
7✔
190
    if (n < 2u)
7✔
191
        return;
192

193
    // We scan for each starter and try to compose with following combiners.
194
    size_t i = 0u;
195
    while (i < seq.size())
23✔
196
    {
197
        if (get_ccc(seq[i]) != 0u)
16✔
198
        {
NEW
199
            ++i;
×
NEW
UNCOV
200
            continue;
×
201
        }
202

203
        // seq[i] is a starter; scan forward for composable characters.
204
        uint8_t last_ccc = 0u;
16✔
205
        size_t j = i + 1u;
16✔
206
        while (j < seq.size())
28✔
207
        {
208
            const uint8_t ccc = get_ccc(seq[j]);
21✔
209

210
            // A combining character is "blocked" if a character with equal or
211
            // higher CCC appeared between the starter and this character.
212
            const bool blocked = (last_ccc > 0u && last_ccc >= ccc);
21✔
213

214
            if (ccc == 0u)
21✔
215
            {
216
                // seq[j] is also a starter. Try composition (e.g. Hangul L+V,
217
                // LV+T); if they compose, continue scanning; otherwise stop.
218
                if (!blocked)
15✔
219
                {
220
                    const char32_t c = comp_lookup(seq[i], seq[j]);
15✔
221
                    if (c != 0u)
15✔
222
                    {
223
                        seq[i] = c;
6✔
224
                        seq.erase(seq.begin() + static_cast<std::ptrdiff_t>(j));
6✔
225
                        last_ccc = 0u;
6✔
226
                        continue;
6✔
227
                    }
228
                }
229
                break; // Next starter that won't compose — stop.
230
            }
231

232
            if (!blocked)
6✔
233
            {
234
                const char32_t c = comp_lookup(seq[i], seq[j]);
6✔
235
                if (c != 0u)
6✔
236
                {
237
                    seq[i] = c;
6✔
238
                    seq.erase(seq.begin() + static_cast<std::ptrdiff_t>(j));
6✔
239
                    last_ccc = 0u; // Restart scan from just after new starter.
6✔
240
                    continue;
6✔
241
                }
242
            }
243

NEW
244
            last_ccc = ccc;
×
NEW
245
            ++j;
×
246
        }
247
        ++i;
248
    }
249
}
250

251
// Full normalization (decompose → order → optionally compose).
252
static bool normal_form(std::string& value, bool compat,
6,647✔
253
    bool recompose) NOEXCEPT
254
{
255
    const auto u32 = to_utf32(value);
6,647✔
256
    std::u32string out;
6,647✔
257
    out.reserve(u32.size() * 2u); // Most decompositions are small.
6,647✔
258

259
    for (const char32_t cp : u32)
43,414✔
260
        decompose_one(out, cp, compat);
36,767✔
261

262
    canonical_order(out);
6,647✔
263

264
    if (recompose)
6,647✔
265
        compose(out);
7✔
266

267
    value = to_utf8(out);
6,647✔
268
    return true;
6,647✔
269
}
270

271
// Case folding helpers.
272
// ----------------------------------------------------------------------------
273

274
// Binary search in case_fold_pairs (sorted by from_cp).
275
// Returns folded code points written into out[], returns count (1-3).
276
static uint32_t fold_lower_one(char32_t cp, char32_t out[3]) NOEXCEPT
37,771✔
277
{
278
    size_t lo = 0u, hi = case_fold_pairs_count;
37,771✔
279
    while (lo < hi)
418,678✔
280
    {
281
        const size_t mid  = (lo + hi) / 2u;
380,917✔
282
        const size_t base = mid * 5u;
380,917✔
283
        const auto   from = case_fold_pairs[base];
380,917✔
284
        if (from < static_cast<uint32_t>(cp))
380,917✔
285
            lo = mid + 1u;
216,935✔
286
        else if (from > static_cast<uint32_t>(cp))
163,982✔
287
            hi = mid;
288
        else
289
        {
290
            const uint32_t n = case_fold_pairs[base + 1u];
10✔
291
            out[0] = static_cast<char32_t>(case_fold_pairs[base + 2u]);
10✔
292
            out[1] = static_cast<char32_t>(case_fold_pairs[base + 3u]);
10✔
293
            out[2] = static_cast<char32_t>(case_fold_pairs[base + 4u]);
10✔
294
            return n;
10✔
295
        }
296
    }
297
    out[0] = cp;
37,761✔
298
    return 1u; // Identity (already lowercase or caseless).
37,761✔
299
}
300

301
static uint32_t fold_upper_one(char32_t cp, char32_t out[3]) NOEXCEPT
1✔
302
{
303
    size_t lo = 0u, hi = upper_pairs_count;
1✔
304
    while (lo < hi)
12✔
305
    {
306
        const size_t mid  = (lo + hi) / 2u;
11✔
307
        const size_t base = mid * 5u;
11✔
308
        const auto   from = upper_pairs[base];
11✔
309
        if (from < static_cast<uint32_t>(cp))
11✔
310
            lo = mid + 1u;
4✔
311
        else if (from > static_cast<uint32_t>(cp))
7✔
312
            hi = mid;
313
        else
314
        {
NEW
315
            const uint32_t n = upper_pairs[base + 1u];
×
NEW
316
            out[0] = static_cast<char32_t>(upper_pairs[base + 2u]);
×
NEW
317
            out[1] = static_cast<char32_t>(upper_pairs[base + 3u]);
×
NEW
318
            out[2] = static_cast<char32_t>(upper_pairs[base + 4u]);
×
NEW
319
            return n;
×
320
        }
321
    }
322
    out[0] = cp;
1✔
323
    return 1u;
1✔
324
}
325

326
static bool apply_case(std::string& out, const std::string& in,
6,583✔
327
    bool lower) NOEXCEPT
328
{
329
    const auto u32 = to_utf32(in);
6,583✔
330
    std::u32string result;
6,583✔
331
    result.reserve(u32.size());
6,583✔
332

333
    char32_t buf[3]{};
6,583✔
334
    for (const char32_t cp : u32)
44,355✔
335
    {
336
        const uint32_t n = lower ? fold_lower_one(cp, buf)
37,772✔
337
                                 : fold_upper_one(cp, buf);
1✔
338
        for (uint32_t i = 0u; i < n; ++i)
75,545✔
339
            result.push_back(buf[i]);
37,773✔
340
    }
341

342
    out = to_utf8(result);
6,583✔
343
    return true;
6,583✔
344
}
345

346
// ICU dependency (ascii supported, otherwise false if HAVE_ICU not defined).
347
// ----------------------------------------------------------------------------
348

349
bool to_lower(std::string& value) NOEXCEPT
14,258✔
350
{
351
    if (is_ascii(value))
14,258✔
352
    {
353
        value = ascii_to_lower(value);
7,676✔
354
        return true;
7,676✔
355
    }
356

357
    return apply_case(value, value, true);
6,582✔
358
}
359

360
bool to_upper(std::string& value) NOEXCEPT
5✔
361
{
362
    if (is_ascii(value))
5✔
363
    {
364
        value = ascii_to_upper(value);
4✔
365
        return true;
4✔
366
    }
367

368
    return apply_case(value, value, false);
1✔
369
}
370

371
bool to_canonical_composition(std::string& value) NOEXCEPT
8✔
372
{
373
    return is_ascii(value) || normal_form(value, false, true);
8✔
374
}
375

376
bool to_canonical_decomposition(std::string& value) NOEXCEPT
12✔
377
{
378
    return is_ascii(value) || normal_form(value, false, false);
12✔
379
}
380

381
bool to_compatibility_composition(std::string& value) NOEXCEPT
3✔
382
{
383
    return is_ascii(value) || normal_form(value, true, true);
3✔
384
}
385

386
bool to_compatibility_decomposition(std::string& value) NOEXCEPT
14,355✔
387
{
388
    return is_ascii(value) || normal_form(value, true, false);
14,355✔
389
}
390

391
// No ICU dependency.
392
// ----------------------------------------------------------------------------
393

UNCOV
394
bool is_unicode(char32_t point) NOEXCEPT
×
395
{
396
    return point < 0x0010ffff;
×
397
}
398

399
bool is_separator(char32_t point) NOEXCEPT
1,114,130✔
400
{
401
    if (!is_unicode(point))
1,114,130✔
402
        return false;
403

404
    for (size_t index = 0; index < char32_separators_count; ++index)
20,054,034✔
405
        if (point == char32_separators[index])
18,939,939✔
406
            return true;
407

408
    return false;
409
}
410

411
bool is_whitespace(char32_t point) NOEXCEPT
1,114,157✔
412
{
413
    if (!is_unicode(point))
1,114,157✔
414
        return false;
415

416
    for (size_t index = 0; index < char32_whitespace_count; ++index)
28,967,273✔
417
        if (point == char32_whitespace[index])
27,853,171✔
418
            return true;
419

420
    return false;
421
}
422

423
bool is_combining(char32_t point) NOEXCEPT
3,465,057✔
424
{
425
    if (!is_unicode(point))
3,465,057✔
426
        return false;
427

428
    // github.com/python/cpython/blob/main/Modules/unicodedata.c
429
    const auto data1 = unicode_data1[(point >> 7)];
3,465,057✔
430
    const auto data2 = unicode_data2[(data1 << 7) + (point & sub1(1 << 7))];
3,465,057✔
431
    return !is_zero(combining_index[data2]);
3,465,057✔
432
}
433

434
bool is_diacritic(char32_t point) NOEXCEPT
2,228,240✔
435
{
436
    if (!is_unicode(point))
2,228,240✔
437
        return false;
438

439
    for (size_t index = 0; index < char32_diacritics_count; ++index)
440,993,141✔
440
        if (is_contained(point, char32_diacritics[index]))
438,766,584✔
441
            return true;
442

443
    return false;
444
}
445

446
bool is_chinese_japanese_or_korean(char32_t point) NOEXCEPT
1,287,868✔
447
{
448
    if (!is_unicode(point))
1,287,868✔
449
        return false;
450

451
    for (size_t index = 0; index < char32_chinese_japanese_korean_count; ++index)
36,207,775✔
452
        if (is_contained(point, char32_chinese_japanese_korean[index]))
35,017,668✔
453
            return true;
454

455
    return false;
456
}
457

458
bool has_whitespace(const std::string& value) NOEXCEPT
11✔
459
{
460
    if (value.empty())
11✔
461
        return false;
462

463
    const auto points = to_utf32(value);
10✔
464
    return std::any_of(points.begin(), points.end(), [](char32_t character)
10✔
465
    {
466
        return is_whitespace(character);
14✔
467
    });
468
}
469

470
std::string to_non_combining_form(const std::string& value) NOEXCEPT
31,875✔
471
{
472
    if (value.empty())
31,875✔
473
        return value;
8✔
474

475
    // utf32 ensures each word is a single unicode character.
476
    auto points = to_utf32(value);
31,867✔
477
    std::erase_if(points, is_combining);
31,867✔
478
    return to_utf8(points);
31,867✔
479
}
480

481
std::string to_non_diacritic_form(const std::string& value) NOEXCEPT
6✔
482
{
483
    if (value.empty())
6✔
484
        return value;
1✔
485

486
    // utf32 ensures each word is a single unicode character.
487
    auto points = to_utf32(value);
5✔
488
    std::erase_if(points, is_diacritic);
5✔
489
    return to_utf8(points);
5✔
490
}
491

492
// Compress ascii whitespace and remove ascii spaces between cjk characters.
493
std::string to_compressed_form(const std::string& value) NOEXCEPT
31,881✔
494
{
495
    // Compress ascii whitespace to a single 0x20 between each utf32 token.
496
    const auto normalized = system::join(system::split(value));
31,881✔
497

498
    // utf32 ensures each word is a single unicode character.
499
    const auto points = to_utf32(normalized);
31,881✔
500

501
    // A character cannot be between two others if there aren't at least three.
502
    if (points.size() < 3u)
31,881✔
503
        return normalized;
4,108✔
504

505
    // Copy the first character to output.
506
    std::u32string compressed{ points.front() };
27,773✔
507

508
    // Remove a single ascii whitespace between CJK characters.
509
    // Front and back cannot be between two characters, so skip them.
510
    for (size_t point = 1; point < sub1(points.size()); point++)
1,204,622✔
511
    {
512
        if (!(is_ascii_whitespace(points[point]) &&
1,176,849✔
513
            is_chinese_japanese_or_korean(points[sub1(point)]) &&
170,019✔
514
            is_chinese_japanese_or_korean(points[add1(point)])))
3,738✔
515
        {
516
            compressed += points[point];
1,173,112✔
517
        }
518
    }
519

520
    // Copy the last character to output.
521
    compressed += points.back();
27,773✔
522
    return to_utf8(compressed);
27,773✔
523
}
524

525
} // namespace system
526
} // namespace libbitcoin
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc