• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

libbitcoin / libbitcoin-system / 23079264858

14 Mar 2026 03:19AM UTC coverage: 81.31% (+0.01%) from 81.296%
23079264858

Pull #1793

github

web-flow
Merge af934943c into 4132085bc
Pull Request #1793: Make ICU normalizers unconditional (void), style.

139 of 145 new or added lines in 3 files covered. (95.86%)

1 existing line in 1 file now uncovered.

11002 of 13531 relevant lines covered (81.31%)

3502537.47 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.35
/src/unicode/normalization.cpp
1
/**
2
 * Copyright (c) 2011-2026 libbitcoin developers (see AUTHORS)
3
 *
4
 * This file is part of libbitcoin.
5
 *
6
 * This program is free software: you can redistribute it and/or modify
7
 * it under the terms of the GNU Affero General Public License as published by
8
 * the Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * This program is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU Affero General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Affero General Public License
17
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 */
19
#include <bitcoin/system/unicode/normalization.hpp>
20

21
#include <algorithm>
22
#include <bitcoin/system/data/data.hpp>
23
#include <bitcoin/system/math/math.hpp>
24
#include <bitcoin/system/unicode/ascii.hpp>
25
#include <bitcoin/system/unicode/code_points.hpp>
26
#include <bitcoin/system/unicode/conversion.hpp>
27
#include <bitcoin/system/unicode/unicode_tables.hpp>
28

29
namespace libbitcoin {
30
namespace system {
31

32
BC_PUSH_WARNING(NO_ARRAY_INDEXING)
33
BC_PUSH_WARNING(NO_DYNAMIC_ARRAY_INDEXING)
34

35
// Constants.
36
// ----------------------------------------------------------------------------
37

38
constexpr char32_t max_unicode_point = 0x10ffff;
39

40
// Hangul syllable constants (unicode.org/reports/tr15/#Hangul).
41
namespace hangul
42
{
43
    constexpr char32_t syllable_base = 0xac00;
44
    constexpr char32_t leading_base = 0x1100;
45
    constexpr char32_t vowel_base = 0x1161;
46
    constexpr char32_t trailing_base = 0x11a7;
47
    constexpr uint32_t leading_count = 19;
48
    constexpr uint32_t vowel_count = 21;
49
    constexpr uint32_t trailing_count = 28;
50
    constexpr uint32_t nucleus_count = vowel_count * trailing_count;
51
    constexpr uint32_t syllable_count = leading_count * nucleus_count;
52
}
53

54
// Unicode normalization helpers.
55
// ----------------------------------------------------------------------------
56

57
constexpr size_t components(uint32_t header) NOEXCEPT
6,764✔
58
{
59
    // Bits 24-30 is the number of decomposition components.
60
    constexpr size_t discard = 24;
6,764✔
61
    constexpr size_t visible = add1(30 - discard);
6,764✔
62
    return bit_and(shift_right(header, discard),
6,764✔
63
        unmask_right<uint32_t>(visible));
6,764✔
64
}
65

66
constexpr bool is_compatible(uint32_t header) NOEXCEPT
2,226✔
67
{
68
    // Most significant bit flags a compatibility-only (NFKD) mapping.
69
    return get_left(header);
2,226✔
70
}
71

72
static uint32_t decompose(char32_t point) NOEXCEPT
41,313✔
73
{
74
    constexpr size_t shift = 7;
41,313✔
75

76
    // Two-level trie: level-1 selects 128-entry block, level-2 resolves entry.
77
    const auto block = decomposition_index1[shift_right(point, shift)];
41,313✔
78
    return decomposition_index2[shift_left<uint32_t>(block, shift) +
41,313✔
79
        bit_and(point, unmask_right<char32_t>(shift))];
41,313✔
80
}
81

82
static void decompose_one(std::u32string& out, char32_t point,
41,318✔
83
    bool compatible) NOEXCEPT
84
{
85
    using namespace hangul;
41,318✔
86

87
    // Hangul: algorithmic decomposition (same for NFD and NFKD).
88
    if (point >= syllable_base &&
41,318✔
89
        point < syllable_base + syllable_count)
90
    {
91
        const auto syllable = point - syllable_base;
5✔
92
        out.push_back(leading_base + syllable / nucleus_count);
5✔
93

94
        const auto modulo = (syllable % nucleus_count);
5✔
95
        out.push_back(vowel_base + modulo / trailing_count);
5✔
96

97
        const auto trailing = syllable % trailing_count;
5✔
98
        if (is_nonzero(trailing))
5✔
99
            out.push_back(trailing_base + trailing);
2✔
100

101
        return;
5✔
102
    }
103

104
    if (point > max_unicode_point)
41,313✔
105
    {
106
        out.push_back(point);
×
107
        return;
×
108
    }
109

110
    const auto index = decompose(point);
41,313✔
111
    if (is_zero(index))
41,313✔
112
    {
113
        out.push_back(point);
39,087✔
114
        return;
39,087✔
115
    }
116

117
    const auto header = decomposition_pool[index];
2,226✔
118

119
    // Skip compatibility-only mappings in NFD mode.
120
    if (is_compatible(header) && !compatible)
2,226✔
121
    {
122
        out.push_back(point);
2✔
123
        return;
2✔
124
    }
125

126
    // Recurse into each component.
127
    for (size_t component{}; component < components(header); ++component)
6,764✔
128
        decompose_one(out, decomposition_pool[add1(index) + component],
4,540✔
129
            compatible);
130
}
131

132
static uint8_t get_canonical(char32_t point) NOEXCEPT
3,510,355✔
133
{
134
    if (point > max_unicode_point)
3,510,355✔
135
        return {};
136

137
    // Two-level trie: level-1 selects 128-entry block, level-2 resolves entry.
138
    const auto data1 = unicode_data1[shift_right(point, 7)];
3,510,355✔
139
    const auto data2 = unicode_data2[shift_left<uint32_t>(data1, 7) +
3,510,355✔
140
        bit_and(point, unmask_right<char32_t>(7))];
3,510,355✔
141

142
    return combining_index[data2];
3,510,355✔
143
}
144

145
// Apply canonical ordering (UAX #15 section 3.11).
146
static void canonical_order(std::u32string& sequence) NOEXCEPT
6,648✔
147
{
148
    const auto size = sequence.size();
6,648✔
149
    if (size < two)
6,648✔
150
        return;
151

152
    size_t start{};
153
    while (start < size)
15,060✔
154
    {
155
        // Skip starters.
156
        while (start < size && is_zero(get_canonical(sequence[start])))
43,871✔
157
            ++start;
34,751✔
158

159
        // Find end of combining run.
160
        auto end{ start };
9,120✔
161
        while (end < size && !is_zero(get_canonical(sequence[end])))
12,762✔
162
            ++end;
3,642✔
163

164
        // Stable-sort combining characters.
165
        if (end > add1(start))
9,120✔
166
        {
167
            const auto at = std::next(sequence.begin(), start);
27✔
168
            const auto to = std::next(sequence.begin(), end);
27✔
169
            std::stable_sort(at, to, [](auto left, auto right) NOEXCEPT
27✔
170
            {
171
                return get_canonical(left) < get_canonical(right);
40✔
172
            });
173
        }
174

175
        start = end;
176
    }
177
}
178

179
// Canonical composition lookup (0 if no composition found).
180
static char32_t composition_lookup(char32_t a, char32_t b) NOEXCEPT
21✔
181
{
182
    using namespace hangul;
21✔
183

184
    // Hangul L + V -> LV
185
    if ((a >= leading_base) &&
21✔
186
        (a < leading_base + leading_count) &&
187
        (b >= vowel_base) &&
4✔
188
        (b < vowel_base + vowel_count))
189
    {
190
        return syllable_base
4✔
191
            + ((a - leading_base) * nucleus_count)
192
            + ((b - vowel_base) * trailing_count);
4✔
193
    }
194

195
    // Hangul LV + T -> LVT
196
    if ((a >= syllable_base) &&
17✔
197
        (a < syllable_base + syllable_count) &&
3✔
198
        (is_zero((a - syllable_base) % trailing_count)) &&
3✔
199
        (b > trailing_base) &&
20✔
200
        (b < trailing_base + trailing_count))
201
    {
202
        return a + (b - trailing_base);
2✔
203
    }
204

205
    // Binary search in composition_pairs (sorted by [a, b]).
206
    size_t lo{}, hi{ composition_pairs_count };
207
    while (lo < hi)
156✔
208
    {
209
        const auto middle = to_half(lo + hi);
147✔
210
        const auto base = middle * 3u;
147✔
211
        const auto alpha = composition_pairs[base];
147✔
212
        const auto bravo = composition_pairs[add1(base)];
147✔
213

214
        if (alpha < a || (alpha == a && bravo < b))
147✔
215
        {
216
            lo = add1(middle);
62✔
217
        }
218
        else if (alpha > a || (alpha == a && bravo > b))
85✔
219
        {
220
            hi = middle;
221
        }
222
        else
223
        {
224
            return composition_pairs[base + 2u];
6✔
225
        }
226
    }
227

228
    return {};
229
}
230

231
// Apply canonical composition pass (UAX #15 canonical composition algorithm).
232
static void compose(std::u32string& sequence) NOEXCEPT
7✔
233
{
234
    const size_t size = sequence.size();
7✔
235
    if (size < two)
7✔
236
        return;
237

238
    for (size_t index{}; index < sequence.size();)
23✔
239
    {
240
        // Skip non-starters.
241
        if (!is_zero(get_canonical(sequence[index])))
16✔
242
        {
243
            ++index;
×
244
            continue;
×
245
        }
246

247
        uint8_t last{};
16✔
248

249
        // Scan for composable characters.
250
        for (auto inner{ add1(index) }; inner < sequence.size();)
28✔
251
        {
252
            const auto current = get_canonical(sequence[inner]);
21✔
253

254
            // Combining char "blocked" if char >= ccc between it and starter.
255
            const auto blocked = is_nonzero(last) && last >= current;
21✔
256

257
            if (is_zero(current))
21✔
258
            {
259
                if (!blocked)
15✔
260
                {
261
                    // sequence[inner] is also a starter, try composition.
262
                    const auto point = composition_lookup(sequence[index],
15✔
263
                        sequence[inner]);
264

265
                    if (is_nonzero(point))
15✔
266
                    {
267
                        sequence[index] = point;
6✔
268
                        sequence.erase(std::next(sequence.begin(), inner));
6✔
269
                        last = {};
6✔
270
                        continue;
6✔
271
                    }
272
                }
273

274
                // Stop on next starter that won't compose.
275
                break;
276
            }
277

278
            if (!blocked)
6✔
279
            {
280
                const auto point = composition_lookup(sequence[index],
6✔
281
                    sequence[inner]);
282

283
                if (is_nonzero(point))
6✔
284
                {
285
                    sequence[index] = point;
6✔
286
                    sequence.erase(std::next(sequence.begin(), inner));
6✔
287
                    last = {};
6✔
288
                    continue;
6✔
289
                }
290
            }
291

NEW
292
            last = current;
×
293
            ++inner;
×
294
        }
295

296
        ++index;
297
    }
298
}
299

300
static void normal_form(std::string& value, bool compatible,
6,648✔
301
    bool recompose) NOEXCEPT
302
{
303
    const auto points = to_utf32(value);
6,648✔
304
    std::u32string out{};
6,648✔
305

306
    // Most decompositions are small so reserve * two.
307
    out.reserve(points.size() * two);
6,648✔
308
    for (const auto point: points)
43,426✔
309
        decompose_one(out, point, compatible);
36,778✔
310

311
    canonical_order(out);
6,648✔
312
    if (recompose)
6,648✔
313
        compose(out);
7✔
314

315
    value = to_utf8(out);
13,296✔
316
}
6,648✔
317

318
// Case folding helpers.
319
// ----------------------------------------------------------------------------
320

321
// msvc is flagging array below indexation as pointer math.
322
BC_PUSH_WARNING(NO_POINTER_ARITHMETIC)
323

324
// Binary search in case_fold_pairs (sorted by from_cp).
325
// Returns folded code points written into out[], returns count (1-3).
326
static uint32_t fold_lower_one(char32_t point, char32_t out[3]) NOEXCEPT
37,771✔
327
{
328
    size_t lo{}, hi{ case_fold_pairs_count };
37,771✔
329
    while (lo < hi)
418,678✔
330
    {
331
        const auto middle = to_half(lo + hi);
380,917✔
332
        const auto base = middle * 5u;
380,917✔
333
        const auto from = case_fold_pairs[base];
380,917✔
334

335
        if (from < point)
380,917✔
336
        {
337
            lo = add1(middle);
216,935✔
338
        }
339
        else if (from > point)
163,982✔
340
        {
341
            hi = middle;
342
        }
343
        else
344
        {
345
            const auto count = case_fold_pairs[add1(base)];
10✔
346
            out[0] = case_fold_pairs[base + 2u];
10✔
347
            out[1] = case_fold_pairs[base + 3u];
10✔
348
            out[2] = case_fold_pairs[base + 4u];
10✔
349
            return count;
10✔
350
        }
351
    }
352

353
    out[0] = point;
37,761✔
354
    return 1u;
37,761✔
355
}
356

357
static uint32_t fold_upper_one(char32_t point, char32_t out[3]) NOEXCEPT
1✔
358
{
359
    size_t lo{}, hi{ upper_pairs_count };
1✔
360
    while (lo < hi)
12✔
361
    {
362
        const auto middle  = to_half(lo + hi);
11✔
363
        const auto base = middle * 5_size;
11✔
364
        const auto from = upper_pairs[base];
11✔
365

366
        if (from < point)
11✔
367
        {
368
            lo = add1(middle);
4✔
369
        }
370
        else if (from > point)
7✔
371
        {
372
            hi = middle;
373
        }
374
        else
375
        {
NEW
376
            const auto count = upper_pairs[add1(base)];
×
NEW
377
            out[0] = upper_pairs[base + 2u];
×
NEW
378
            out[1] = upper_pairs[base + 3u];
×
NEW
379
            out[2] = upper_pairs[base + 4u];
×
UNCOV
380
            return count;
×
381
        }
382
    }
383

384
    out[0] = point;
1✔
385
    return 1u;
1✔
386
}
387

388
BC_POP_WARNING()
389

390
static std::string apply_case(const std::string& in, bool lower) NOEXCEPT
6,583✔
391
{
392
    const auto points = to_utf32(in);
6,583✔
393
    std::u32string out{};
6,583✔
394
    out.reserve(points.size());
6,583✔
395

396
    char32_t buffer[3]{};
6,583✔
397
    for (const auto point: points)
44,355✔
398
    {
399
        const auto count = lower ?
37,772✔
400
            fold_lower_one(point, &buffer[0]) :
37,771✔
401
            fold_upper_one(point, &buffer[0]);
1✔
402

403
        for (size_t index{}; index < count; ++index)
75,545✔
404
            out.push_back(buffer[index]);
37,773✔
405
    }
406

407
    return to_utf8(out);
6,583✔
408
}
409

410
// Unicode case folding.
411
// ----------------------------------------------------------------------------
412

413
void to_lower(std::string& value) NOEXCEPT
14,258✔
414
{
415
    value = is_ascii(value) ? ascii_to_lower(value) : apply_case(value, true);
14,258✔
416
}
14,258✔
417

418
void to_upper(std::string& value) NOEXCEPT
5✔
419
{
420
    value = is_ascii(value) ? ascii_to_upper(value) : apply_case(value, false);
5✔
421
}
5✔
422

423
void to_canonical_composition(std::string& value) NOEXCEPT
8✔
424
{
425
    if (!is_ascii(value)) normal_form(value, false, true);
8✔
426
}
8✔
427

428
void to_canonical_decomposition(std::string& value) NOEXCEPT
12✔
429
{
430
    if (!is_ascii(value)) normal_form(value, false, false);
12✔
431
}
12✔
432

433
void to_compatibility_composition(std::string& value) NOEXCEPT
3✔
434
{
435
    if (!is_ascii(value)) normal_form(value, true, true);
3✔
436
}
3✔
437

438
void to_compatibility_decomposition(std::string& value) NOEXCEPT
14,356✔
439
{
440
    if (!is_ascii(value)) normal_form(value, true, false);
14,356✔
441
}
14,356✔
442

443
// Character tests (no conversions).
444
// ----------------------------------------------------------------------------
445

446
bool is_unicode(char32_t point) NOEXCEPT
×
447
{
NEW
448
    return point <= max_unicode_point;
×
449
}
450

451
bool is_separator(char32_t point) NOEXCEPT
1,114,130✔
452
{
453
    if (!is_unicode(point))
1,114,130✔
454
        return false;
455

456
    for (size_t index{}; index < char32_separators_count; ++index)
20,054,034✔
457
        if (point == char32_separators[index])
18,939,939✔
458
            return true;
459

460
    return false;
461
}
462

463
bool is_whitespace(char32_t point) NOEXCEPT
1,114,157✔
464
{
465
    if (!is_unicode(point))
1,114,157✔
466
        return false;
467

468
    for (size_t index{}; index < char32_whitespace_count; ++index)
28,967,273✔
469
        if (point == char32_whitespace[index])
27,853,171✔
470
            return true;
471

472
    return false;
473
}
474

475
bool is_combining(char32_t point) NOEXCEPT
3,465,057✔
476
{
477
    // github.com/python/cpython/blob/main/Modules/unicodedata.c
478
    return is_unicode(point) && !is_zero(get_canonical(point));
3,465,057✔
479
}
480

481
// local
482
constexpr bool is_contained(char32_t value,
473,784,252✔
483
    const char32_interval& interval) NOEXCEPT
484
{
485
    return interval.first <= value && value <= interval.second;
473,784,252✔
486
}
487

488
bool is_diacritic(char32_t point) NOEXCEPT
2,228,240✔
489
{
490
    if (!is_unicode(point))
2,228,240✔
491
        return false;
492

493
    for (size_t index{}; index < char32_diacritics_count; ++index)
440,993,141✔
494
        if (is_contained(point, char32_diacritics[index]))
438,766,584✔
495
            return true;
496

497
    return false;
498
}
499

500
bool is_chinese_japanese_or_korean(char32_t point) NOEXCEPT
1,287,868✔
501
{
502
    if (!is_unicode(point))
1,287,868✔
503
        return false;
504

505
    for (size_t index{}; index < char32_chinese_japanese_korean_count; ++index)
36,207,775✔
506
        if (is_contained(point, char32_chinese_japanese_korean[index]))
35,017,668✔
507
            return true;
508

509
    return false;
510
}
511

512
bool has_whitespace(const std::string& value) NOEXCEPT
11✔
513
{
514
    if (value.empty())
11✔
515
        return false;
516

517
    const auto points = to_utf32(value);
10✔
518
    return std::any_of(points.begin(), points.end(),
10✔
519
        [](char32_t character) NOEXCEPT
23✔
520
        {
521
            return is_whitespace(character);
14✔
522
        });
523
}
524

525
// Conversions.
526
// ----------------------------------------------------------------------------
527

528
std::string to_non_combining_form(const std::string& value) NOEXCEPT
31,875✔
529
{
530
    if (value.empty())
31,875✔
531
        return value;
8✔
532

533
    // utf32 ensures each word is a single unicode character.
534
    auto points = to_utf32(value);
31,867✔
535
    std::erase_if(points, is_combining);
31,867✔
536
    return to_utf8(points);
31,867✔
537
}
538

539
std::string to_non_diacritic_form(const std::string& value) NOEXCEPT
6✔
540
{
541
    if (value.empty())
6✔
542
        return value;
1✔
543

544
    // utf32 ensures each word is a single unicode character.
545
    auto points = to_utf32(value);
5✔
546
    std::erase_if(points, is_diacritic);
5✔
547
    return to_utf8(points);
5✔
548
}
549

550
// Compress ascii whitespace and remove ascii spaces between cjk characters.
551
std::string to_compressed_form(const std::string& value) NOEXCEPT
31,881✔
552
{
553
    // Compress ascii whitespace to a single 0x20 between each utf32 token.
554
    const auto normalized = system::join(system::split(value));
31,881✔
555

556
    // utf32 ensures each word is a single unicode character.
557
    const auto points = to_utf32(normalized);
31,881✔
558

559
    // A character cannot be between two others if there aren't at least three.
560
    if (points.size() < 3u)
31,881✔
561
        return normalized;
4,108✔
562

563
    // Copy the first character to output.
564
    std::u32string compressed{ points.front() };
27,773✔
565

566
    // Remove a single ascii whitespace between CJK characters.
567
    // Front and back cannot be between two characters, so skip them.
568
    for (auto point{ one }; point < sub1(points.size()); point++)
1,204,622✔
569
    {
570
        if (!(is_ascii_whitespace(points[point]) &&
1,176,849✔
571
            is_chinese_japanese_or_korean(points[sub1(point)]) &&
170,019✔
572
            is_chinese_japanese_or_korean(points[add1(point)])))
3,738✔
573
        {
574
            compressed += points[point];
1,173,112✔
575
        }
576
    }
577

578
    // Copy the last character to output.
579
    compressed += points.back();
27,773✔
580
    return to_utf8(compressed);
27,773✔
581
}
582

583
BC_POP_WARNING()
584
BC_POP_WARNING()
585

586
} // namespace system
587
} // namespace libbitcoin
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc