22814433198

Committed 08 Mar 2026 05:06AM UTC coverage: 81.289% (+0.07%) from 81.216%

Build # 22814433198

Build Type

Pull #1791

github

Committed by

web-flow

Commit Message

Merge 236b2927b into 032c34e64

Pull Request Pull Request #1791: [RFC] Replace ICU dependency with embedded Unicode tables

Coverage Stats

139 of 150 new or added lines in 1 file covered. (92.67%)

30 existing lines in 3 files now uncovered.

10987 of 13516 relevant lines covered (81.29%)

3503968.18 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.06

/src/unicode/normalization.cpp

/**
 * Copyright (c) 2011-2026 libbitcoin-system developers (see AUTHORS)
 *
 * This file is part of libbitcoin.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
#include <bitcoin/system/unicode/normalization.hpp>

#include <algorithm>
#include <bitcoin/system/data/data.hpp>
#include <bitcoin/system/math/math.hpp>
#include <bitcoin/system/unicode/ascii.hpp>
#include <bitcoin/system/unicode/code_points.hpp>
#include <bitcoin/system/unicode/conversion.hpp>
#include <bitcoin/system/unicode/unicode_tables.hpp>

namespace libbitcoin {
namespace system {

// Local helpers.
// ----------------------------------------------------------------------------

constexpr bool is_contained(char32_t value,
    const char32_interval& interval) NOEXCEPT
{
    return interval.first <= value && value <= interval.second;
}

// Hangul syllable constants (algorithmic decomposition/composition).
// unicode.org/reports/tr15/#Hangul
constexpr char32_t hangul_sbase  = 0xAC00u;
constexpr char32_t hangul_lbase  = 0x1100u;
constexpr char32_t hangul_vbase  = 0x1161u;
constexpr char32_t hangul_tbase  = 0x11A7u;
constexpr uint32_t hangul_lcount = 19u;
constexpr uint32_t hangul_vcount = 21u;
constexpr uint32_t hangul_tcount = 28u;
constexpr uint32_t hangul_ncount = hangul_vcount * hangul_tcount; // 588
constexpr uint32_t hangul_scount = hangul_lcount * hangul_ncount; // 11172

// Unicode normalization helpers.
// ----------------------------------------------------------------------------

// Get canonical combining class (0 = starter).
static uint8_t get_ccc(char32_t point) NOEXCEPT
{
    if (point > 0x10FFFFu)
        return 0;
    const auto data1 = unicode_data1[(point >> 7)];
    const auto data2 = unicode_data2[(data1 << 7) + (point & 0x7Fu)];
    return combining_index[data2];
}

// Decompose one code point into out (recursive, single-step per table entry).
static void decompose_one(std::u32string& out, char32_t cp,
    bool compat) NOEXCEPT
{
    // Hangul: algorithmic decomposition (same for NFD and NFKD).
    if (cp >= hangul_sbase && cp < hangul_sbase + hangul_scount)
    {
        const auto si = static_cast<uint32_t>(cp - hangul_sbase);
        out.push_back(hangul_lbase + si / hangul_ncount);
        out.push_back(hangul_vbase + (si % hangul_ncount) / hangul_tcount);
        const auto ti = si % hangul_tcount;
        if (ti != 0u)
            out.push_back(hangul_tbase + ti);
        return;
    }

    // Two-level trie lookup.
    if (cp > 0x10FFFFu)
    {
        out.push_back(cp);
        return;
    }
    const auto block = decomp_index1[cp >> decomp_shift];
    const auto idx   = decomp_index2[
        (static_cast<size_t>(block) << decomp_shift) + (cp & 0x7Fu)];

    if (idx == 0u)
    {
        out.push_back(cp);
        return;
    }

    const auto header    = decomp_pool[idx];
    const auto is_compat = (header & 0x80000000u) != 0u;
    const auto count     = (header >> 24) & 0x7Fu;

    // In NFD mode skip compatibility-only mappings.
    if (is_compat && !compat)
    {
        out.push_back(cp);
        return;
    }

    // Recurse into each component.
    for (uint32_t i = 0u; i < count; ++i)
        decompose_one(out, static_cast<char32_t>(decomp_pool[idx + 1u + i]),
            compat);
}

// Apply canonical ordering (UAX #15 section 3.11):
// Within each "combining sequence" (runs of CCC > 0), sort by CCC, stable.
static void canonical_order(std::u32string& seq) NOEXCEPT
{
    const auto n = seq.size();
    if (n < 2u)
        return;

    // Identify runs of combining characters and stable-sort each.
    size_t run_start = 0u;
    while (run_start < n)
    {
        // Skip starters.
        while (run_start < n && get_ccc(seq[run_start]) == 0u)
            ++run_start;

        // Find end of this combining run.
        size_t run_end = run_start;
        while (run_end < n && get_ccc(seq[run_end]) != 0u)
            ++run_end;

        if (run_end > run_start + 1u)
        {
            std::stable_sort(seq.begin() + run_start, seq.begin() + run_end,
                [](char32_t a, char32_t b) NOEXCEPT
                {
                    return get_ccc(a) < get_ccc(b);
                });
        }
        run_start = run_end;
    }
}

// Canonical composition lookup (returns 0 if no composition found).
static char32_t comp_lookup(char32_t a, char32_t b) NOEXCEPT
{
    // Hangul L + V → LV
    if (a >= hangul_lbase && a < hangul_lbase + hangul_lcount &&
        b >= hangul_vbase && b < hangul_vbase + hangul_vcount)
    {
        return hangul_sbase
            + (a - hangul_lbase) * hangul_ncount
            + (b - hangul_vbase) * hangul_tcount;
    }

    // Hangul LV + T → LVT
    if (a >= hangul_sbase && a < hangul_sbase + hangul_scount &&
        (a - hangul_sbase) % hangul_tcount == 0u &&
        b >  hangul_tbase  && b < hangul_tbase + hangul_tcount)
    {
        return a + (b - hangul_tbase);
    }

    // Binary search in comp_pairs (sorted by [a, b]).
    size_t lo = 0u, hi = comp_pairs_count;
    while (lo < hi)
    {
        const size_t mid  = (lo + hi) / 2u;
        const size_t base = mid * 3u;
        const auto   ca   = comp_pairs[base];
        const auto   cb   = comp_pairs[base + 1u];
        if (ca < a || (ca == a && cb < b))
            lo = mid + 1u;
        else if (ca > a || (ca == a && cb > b))
            hi = mid;
        else
            return static_cast<char32_t>(comp_pairs[base + 2u]);
    }
    return 0u;
}

// Apply canonical composition pass (UAX #15 canonical composition algorithm).
static void compose(std::u32string& seq) NOEXCEPT
{
    const size_t n = seq.size();
    if (n < 2u)
        return;

    // We scan for each starter and try to compose with following combiners.
    size_t i = 0u;
    while (i < seq.size())
    {
        if (get_ccc(seq[i]) != 0u)
        {
            ++i;
            continue;
        }

        // seq[i] is a starter; scan forward for composable characters.
        uint8_t last_ccc = 0u;
        size_t j = i + 1u;
        while (j < seq.size())
        {
            const uint8_t ccc = get_ccc(seq[j]);

            // A combining character is "blocked" if a character with equal or
            // higher CCC appeared between the starter and this character.
            const bool blocked = (last_ccc > 0u && last_ccc >= ccc);

            if (ccc == 0u)
            {
                // seq[j] is also a starter. Try composition (e.g. Hangul L+V,
                // LV+T); if they compose, continue scanning; otherwise stop.
                if (!blocked)
                {
                    const char32_t c = comp_lookup(seq[i], seq[j]);
                    if (c != 0u)
                    {
                        seq[i] = c;
                        seq.erase(seq.begin() + static_cast<std::ptrdiff_t>(j));
                        last_ccc = 0u;
                        continue;
                    }
                }
                break; // Next starter that won't compose — stop.
            }

            if (!blocked)
            {
                const char32_t c = comp_lookup(seq[i], seq[j]);
                if (c != 0u)
                {
                    seq[i] = c;
                    seq.erase(seq.begin() + static_cast<std::ptrdiff_t>(j));
                    last_ccc = 0u; // Restart scan from just after new starter.
                    continue;
                }
            }

            last_ccc = ccc;
            ++j;
        }
        ++i;
    }
}

// Full normalization (decompose → order → optionally compose).
static bool normal_form(std::string& value, bool compat,
    bool recompose) NOEXCEPT
{
    const auto u32 = to_utf32(value);
    std::u32string out;
    out.reserve(u32.size() * 2u); // Most decompositions are small.

    for (const char32_t cp : u32)
        decompose_one(out, cp, compat);

    canonical_order(out);

    if (recompose)
        compose(out);

    value = to_utf8(out);
    return true;
}

// Case folding helpers.
// ----------------------------------------------------------------------------

// Binary search in case_fold_pairs (sorted by from_cp).
// Returns folded code points written into out[], returns count (1-3).
static uint32_t fold_lower_one(char32_t cp, char32_t out[3]) NOEXCEPT
{
    size_t lo = 0u, hi = case_fold_pairs_count;
    while (lo < hi)
    {
        const size_t mid  = (lo + hi) / 2u;
        const size_t base = mid * 5u;
        const auto   from = case_fold_pairs[base];
        if (from < static_cast<uint32_t>(cp))
            lo = mid + 1u;
        else if (from > static_cast<uint32_t>(cp))
            hi = mid;
        else
        {
            const uint32_t n = case_fold_pairs[base + 1u];
            out[0] = static_cast<char32_t>(case_fold_pairs[base + 2u]);
            out[1] = static_cast<char32_t>(case_fold_pairs[base + 3u]);
            out[2] = static_cast<char32_t>(case_fold_pairs[base + 4u]);
            return n;
        }
    }
    out[0] = cp;
    return 1u; // Identity (already lowercase or caseless).
}

static uint32_t fold_upper_one(char32_t cp, char32_t out[3]) NOEXCEPT
{
    size_t lo = 0u, hi = upper_pairs_count;
    while (lo < hi)
    {
        const size_t mid  = (lo + hi) / 2u;
        const size_t base = mid * 5u;
        const auto   from = upper_pairs[base];
        if (from < static_cast<uint32_t>(cp))
            lo = mid + 1u;
        else if (from > static_cast<uint32_t>(cp))
            hi = mid;
        else
        {
            const uint32_t n = upper_pairs[base + 1u];
            out[0] = static_cast<char32_t>(upper_pairs[base + 2u]);
            out[1] = static_cast<char32_t>(upper_pairs[base + 3u]);
            out[2] = static_cast<char32_t>(upper_pairs[base + 4u]);
            return n;
        }
    }
    out[0] = cp;
    return 1u;
}

static bool apply_case(std::string& out, const std::string& in,
    bool lower) NOEXCEPT
{
    const auto u32 = to_utf32(in);
    std::u32string result;
    result.reserve(u32.size());

    char32_t buf[3]{};
    for (const char32_t cp : u32)
    {
        const uint32_t n = lower ? fold_lower_one(cp, buf)
                                 : fold_upper_one(cp, buf);
        for (uint32_t i = 0u; i < n; ++i)
            result.push_back(buf[i]);
    }

    out = to_utf8(result);
    return true;
}

// ICU dependency (ascii supported, otherwise false if HAVE_ICU not defined).
// ----------------------------------------------------------------------------

bool to_lower(std::string& value) NOEXCEPT
{
    if (is_ascii(value))
    {
        value = ascii_to_lower(value);
        return true;
    }

    return apply_case(value, value, true);
}

bool to_upper(std::string& value) NOEXCEPT
{
    if (is_ascii(value))
    {
        value = ascii_to_upper(value);
        return true;
    }

    return apply_case(value, value, false);
}

bool to_canonical_composition(std::string& value) NOEXCEPT
{
    return is_ascii(value) || normal_form(value, false, true);
}

bool to_canonical_decomposition(std::string& value) NOEXCEPT
{
    return is_ascii(value) || normal_form(value, false, false);
}

bool to_compatibility_composition(std::string& value) NOEXCEPT
{
    return is_ascii(value) || normal_form(value, true, true);
}

bool to_compatibility_decomposition(std::string& value) NOEXCEPT
{
    return is_ascii(value) || normal_form(value, true, false);
}

// No ICU dependency.
// ----------------------------------------------------------------------------

bool is_unicode(char32_t point) NOEXCEPT
{
    return point < 0x0010ffff;
}

bool is_separator(char32_t point) NOEXCEPT
{
    if (!is_unicode(point))
        return false;

    for (size_t index = 0; index < char32_separators_count; ++index)
        if (point == char32_separators[index])
            return true;

    return false;
}

bool is_whitespace(char32_t point) NOEXCEPT
{
    if (!is_unicode(point))
        return false;

    for (size_t index = 0; index < char32_whitespace_count; ++index)
        if (point == char32_whitespace[index])
            return true;

    return false;
}

bool is_combining(char32_t point) NOEXCEPT
{
    if (!is_unicode(point))
        return false;

    // github.com/python/cpython/blob/main/Modules/unicodedata.c
    const auto data1 = unicode_data1[(point >> 7)];
    const auto data2 = unicode_data2[(data1 << 7) + (point & sub1(1 << 7))];
    return !is_zero(combining_index[data2]);
}

bool is_diacritic(char32_t point) NOEXCEPT
{
    if (!is_unicode(point))
        return false;

    for (size_t index = 0; index < char32_diacritics_count; ++index)
        if (is_contained(point, char32_diacritics[index]))
            return true;

    return false;
}

bool is_chinese_japanese_or_korean(char32_t point) NOEXCEPT
{
    if (!is_unicode(point))
        return false;

    for (size_t index = 0; index < char32_chinese_japanese_korean_count; ++index)
        if (is_contained(point, char32_chinese_japanese_korean[index]))
            return true;

    return false;
}

bool has_whitespace(const std::string& value) NOEXCEPT
{
    if (value.empty())
        return false;

    const auto points = to_utf32(value);
    return std::any_of(points.begin(), points.end(), [](char32_t character)
    {
        return is_whitespace(character);
    });
}

std::string to_non_combining_form(const std::string& value) NOEXCEPT
{
    if (value.empty())
        return value;

    // utf32 ensures each word is a single unicode character.
    auto points = to_utf32(value);
    std::erase_if(points, is_combining);
    return to_utf8(points);
}

std::string to_non_diacritic_form(const std::string& value) NOEXCEPT
{
    if (value.empty())
        return value;

    // utf32 ensures each word is a single unicode character.
    auto points = to_utf32(value);
    std::erase_if(points, is_diacritic);
    return to_utf8(points);
}

// Compress ascii whitespace and remove ascii spaces between cjk characters.
std::string to_compressed_form(const std::string& value) NOEXCEPT
{
    // Compress ascii whitespace to a single 0x20 between each utf32 token.
    const auto normalized = system::join(system::split(value));

    // utf32 ensures each word is a single unicode character.
    const auto points = to_utf32(normalized);

    // A character cannot be between two others if there aren't at least three.
    if (points.size() < 3u)
        return normalized;

    // Copy the first character to output.
    std::u32string compressed{ points.front() };

    // Remove a single ascii whitespace between CJK characters.
    // Front and back cannot be between two characters, so skip them.
    for (size_t point = 1; point < sub1(points.size()); point++)
    {
        if (!(is_ascii_whitespace(points[point]) &&
            is_chinese_japanese_or_korean(points[sub1(point)]) &&
            is_chinese_japanese_or_korean(points[add1(point)])))
        {
            compressed += points[point];
        }
    }

    // Copy the last character to output.
    compressed += points.back();
    return to_utf8(compressed);
}

} // namespace system
} // namespace libbitcoin

1	/**
2	* Copyright (c) 2011-2026 libbitcoin-system developers (see AUTHORS)
3	*
4	* This file is part of libbitcoin.
5	*
6	* This program is free software: you can redistribute it and/or modify
7	* it under the terms of the GNU Affero General Public License as published by
8	* the Free Software Foundation, either version 3 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU Affero General Public License for more details.
15	*
16	* You should have received a copy of the GNU Affero General Public License
17	* along with this program. If not, see <http://www.gnu.org/licenses/>.
18	*/
19	#include <bitcoin/system/unicode/normalization.hpp>
20
21	#include <algorithm>
22	#include <bitcoin/system/data/data.hpp>
23	#include <bitcoin/system/math/math.hpp>
24	#include <bitcoin/system/unicode/ascii.hpp>
25	#include <bitcoin/system/unicode/code_points.hpp>
26	#include <bitcoin/system/unicode/conversion.hpp>
27	#include <bitcoin/system/unicode/unicode_tables.hpp>
28
29	namespace libbitcoin {
30	namespace system {
31
32	// Local helpers.
33	// ----------------------------------------------------------------------------
34
35	constexpr bool is_contained(char32_t value,	473,784,252✔
36	const char32_interval& interval) NOEXCEPT
37	{
38	return interval.first <= value && value <= interval.second;	473,784,252✔
39	}
40
41	// Hangul syllable constants (algorithmic decomposition/composition).
42	// unicode.org/reports/tr15/#Hangul
43	constexpr char32_t hangul_sbase = 0xAC00u;
44	constexpr char32_t hangul_lbase = 0x1100u;
45	constexpr char32_t hangul_vbase = 0x1161u;
46	constexpr char32_t hangul_tbase = 0x11A7u;
47	constexpr uint32_t hangul_lcount = 19u;
48	constexpr uint32_t hangul_vcount = 21u;
49	constexpr uint32_t hangul_tcount = 28u;
50	constexpr uint32_t hangul_ncount = hangul_vcount * hangul_tcount; // 588
51	constexpr uint32_t hangul_scount = hangul_lcount * hangul_ncount; // 11172
52
53	// Unicode normalization helpers.
54	// ----------------------------------------------------------------------------
55
56	// Get canonical combining class (0 = starter).
57	static uint8_t get_ccc(char32_t point) NOEXCEPT	45,287✔
58	{
59	if (point > 0x10FFFFu)	45,287✔
60	return 0;
61	const auto data1 = unicode_data1[(point >> 7)];	45,287✔
62	const auto data2 = unicode_data2[(data1 << 7) + (point & 0x7Fu)];	45,287✔
63	return combining_index[data2];	45,287✔
64	}
65
66	// Decompose one code point into out (recursive, single-step per table entry).
67	static void decompose_one(std::u32string& out, char32_t cp,	41,306✔
68	bool compat) NOEXCEPT
69	{
70	// Hangul: algorithmic decomposition (same for NFD and NFKD).
71	if (cp >= hangul_sbase && cp < hangul_sbase + hangul_scount)	41,306✔
72	{
73	const auto si = static_cast<uint32_t>(cp - hangul_sbase);	5✔
74	out.push_back(hangul_lbase + si / hangul_ncount);	5✔
75	out.push_back(hangul_vbase + (si % hangul_ncount) / hangul_tcount);	5✔
76	const auto ti = si % hangul_tcount;	5✔
77	if (ti != 0u)	5✔
78	out.push_back(hangul_tbase + ti);	2✔
79	return;	5✔
80	}
81
82	// Two-level trie lookup.
83	if (cp > 0x10FFFFu)	41,301✔
84	{
NEW UNCOV 85	out.push_back(cp);	×
NEW 86	return;	×
87	}
88	const auto block = decomp_index1[cp >> decomp_shift];	41,301✔
89	const auto idx = decomp_index2[	41,301✔
90	(static_cast<size_t>(block) << decomp_shift) + (cp & 0x7Fu)];	41,301✔
91
92	if (idx == 0u)	41,301✔
93	{
94	out.push_back(cp);	39,076✔
95	return;	39,076✔
96	}
97
98	const auto header = decomp_pool[idx];	2,225✔
99	const auto is_compat = (header & 0x80000000u) != 0u;	2,225✔
100	const auto count = (header >> 24) & 0x7Fu;	2,225✔
101
102	// In NFD mode skip compatibility-only mappings.
103	if (is_compat && !compat)	2,225✔
104	{
105	out.push_back(cp);	2✔
106	return;	2✔
107	}
108
109	// Recurse into each component.
110	for (uint32_t i = 0u; i < count; ++i)	6,762✔
111	decompose_one(out, static_cast<char32_t>(decomp_pool[idx + 1u + i]),	4,539✔
112	compat);
113	}
114
115	// Apply canonical ordering (UAX #15 section 3.11):
116	// Within each "combining sequence" (runs of CCC > 0), sort by CCC, stable.
117	static void canonical_order(std::u32string& seq) NOEXCEPT	6,647✔
118	{
119	const auto n = seq.size();	6,647✔
120	if (n < 2u)	6,647✔
121	return;
122
123	// Identify runs of combining characters and stable-sort each.
124	size_t run_start = 0u;
125	while (run_start < n)	15,058✔
126	{
127	// Skip starters.
128	while (run_start < n && get_ccc(seq[run_start]) == 0u)	43,859✔
129	++run_start;	34,740✔
130
131	// Find end of this combining run.
132	size_t run_end = run_start;	9,119✔
133	while (run_end < n && get_ccc(seq[run_end]) != 0u)	12,761✔
134	++run_end;	3,642✔
135
136	if (run_end > run_start + 1u)	9,119✔
137	{
138	std::stable_sort(seq.begin() + run_start, seq.begin() + run_end,	27✔
139	[](char32_t a, char32_t b) NOEXCEPT	40✔
140	{
141	return get_ccc(a) < get_ccc(b);	40✔
142	});
143	}
144	run_start = run_end;
145	}
146	}
147
148	// Canonical composition lookup (returns 0 if no composition found).
149	static char32_t comp_lookup(char32_t a, char32_t b) NOEXCEPT	21✔
150	{
151	// Hangul L + V → LV
152	if (a >= hangul_lbase && a < hangul_lbase + hangul_lcount &&	21✔
153	b >= hangul_vbase && b < hangul_vbase + hangul_vcount)	4✔
154	{
155	return hangul_sbase	4✔
156	+ (a - hangul_lbase) * hangul_ncount
157	+ (b - hangul_vbase) * hangul_tcount;	4✔
158	}
159
160	// Hangul LV + T → LVT
161	if (a >= hangul_sbase && a < hangul_sbase + hangul_scount &&	17✔
162	(a - hangul_sbase) % hangul_tcount == 0u &&	3✔
163	b > hangul_tbase && b < hangul_tbase + hangul_tcount)	3✔
164	{
165	return a + (b - hangul_tbase);	2✔
166	}
167
168	// Binary search in comp_pairs (sorted by [a, b]).
169	size_t lo = 0u, hi = comp_pairs_count;
170	while (lo < hi)	156✔
171	{
172	const size_t mid = (lo + hi) / 2u;	147✔
173	const size_t base = mid * 3u;	147✔
174	const auto ca = comp_pairs[base];	147✔
175	const auto cb = comp_pairs[base + 1u];	147✔
176	if (ca < a \|\| (ca == a && cb < b))	147✔
177	lo = mid + 1u;	62✔
178	else if (ca > a \|\| (ca == a && cb > b))	85✔
179	hi = mid;
180	else
181	return static_cast<char32_t>(comp_pairs[base + 2u]);	6✔
182	}
183	return 0u;
184	}
185
186	// Apply canonical composition pass (UAX #15 canonical composition algorithm).
187	static void compose(std::u32string& seq) NOEXCEPT	7✔
188	{
189	const size_t n = seq.size();	7✔
190	if (n < 2u)	7✔
191	return;
192
193	// We scan for each starter and try to compose with following combiners.
194	size_t i = 0u;
195	while (i < seq.size())	23✔
196	{
197	if (get_ccc(seq[i]) != 0u)	16✔
198	{
NEW 199	++i;	×
NEW UNCOV 200	continue;	×
201	}
202
203	// seq[i] is a starter; scan forward for composable characters.
204	uint8_t last_ccc = 0u;	16✔
205	size_t j = i + 1u;	16✔
206	while (j < seq.size())	28✔
207	{
208	const uint8_t ccc = get_ccc(seq[j]);	21✔
209
210	// A combining character is "blocked" if a character with equal or
211	// higher CCC appeared between the starter and this character.
212	const bool blocked = (last_ccc > 0u && last_ccc >= ccc);	21✔
213
214	if (ccc == 0u)	21✔
215	{
216	// seq[j] is also a starter. Try composition (e.g. Hangul L+V,
217	// LV+T); if they compose, continue scanning; otherwise stop.
218	if (!blocked)	15✔
219	{
220	const char32_t c = comp_lookup(seq[i], seq[j]);	15✔
221	if (c != 0u)	15✔
222	{
223	seq[i] = c;	6✔
224	seq.erase(seq.begin() + static_cast<std::ptrdiff_t>(j));	6✔
225	last_ccc = 0u;	6✔
226	continue;	6✔
227	}
228	}
229	break; // Next starter that won't compose — stop.
230	}
231
232	if (!blocked)	6✔
233	{
234	const char32_t c = comp_lookup(seq[i], seq[j]);	6✔
235	if (c != 0u)	6✔
236	{
237	seq[i] = c;	6✔
238	seq.erase(seq.begin() + static_cast<std::ptrdiff_t>(j));	6✔
239	last_ccc = 0u; // Restart scan from just after new starter.	6✔
240	continue;	6✔
241	}
242	}
243
NEW 244	last_ccc = ccc;	×
NEW 245	++j;	×
246	}
247	++i;
248	}
249	}
250
251	// Full normalization (decompose → order → optionally compose).
252	static bool normal_form(std::string& value, bool compat,	6,647✔
253	bool recompose) NOEXCEPT
254	{
255	const auto u32 = to_utf32(value);	6,647✔
256	std::u32string out;	6,647✔
257	out.reserve(u32.size() * 2u); // Most decompositions are small.	6,647✔
258
259	for (const char32_t cp : u32)	43,414✔
260	decompose_one(out, cp, compat);	36,767✔
261
262	canonical_order(out);	6,647✔
263
264	if (recompose)	6,647✔
265	compose(out);	7✔
266
267	value = to_utf8(out);	6,647✔
268	return true;	6,647✔
269	}
270
271	// Case folding helpers.
272	// ----------------------------------------------------------------------------
273
274	// Binary search in case_fold_pairs (sorted by from_cp).
275	// Returns folded code points written into out[], returns count (1-3).
276	static uint32_t fold_lower_one(char32_t cp, char32_t out[3]) NOEXCEPT	37,771✔
277	{
278	size_t lo = 0u, hi = case_fold_pairs_count;	37,771✔
279	while (lo < hi)	418,678✔
280	{
281	const size_t mid = (lo + hi) / 2u;	380,917✔
282	const size_t base = mid * 5u;	380,917✔
283	const auto from = case_fold_pairs[base];	380,917✔
284	if (from < static_cast<uint32_t>(cp))	380,917✔
285	lo = mid + 1u;	216,935✔
286	else if (from > static_cast<uint32_t>(cp))	163,982✔
287	hi = mid;
288	else
289	{
290	const uint32_t n = case_fold_pairs[base + 1u];	10✔
291	out[0] = static_cast<char32_t>(case_fold_pairs[base + 2u]);	10✔
292	out[1] = static_cast<char32_t>(case_fold_pairs[base + 3u]);	10✔
293	out[2] = static_cast<char32_t>(case_fold_pairs[base + 4u]);	10✔
294	return n;	10✔
295	}
296	}
297	out[0] = cp;	37,761✔
298	return 1u; // Identity (already lowercase or caseless).	37,761✔
299	}
300
301	static uint32_t fold_upper_one(char32_t cp, char32_t out[3]) NOEXCEPT	1✔
302	{
303	size_t lo = 0u, hi = upper_pairs_count;	1✔
304	while (lo < hi)	12✔
305	{
306	const size_t mid = (lo + hi) / 2u;	11✔
307	const size_t base = mid * 5u;	11✔
308	const auto from = upper_pairs[base];	11✔
309	if (from < static_cast<uint32_t>(cp))	11✔
310	lo = mid + 1u;	4✔
311	else if (from > static_cast<uint32_t>(cp))	7✔
312	hi = mid;
313	else
314	{
NEW 315	const uint32_t n = upper_pairs[base + 1u];	×
NEW 316	out[0] = static_cast<char32_t>(upper_pairs[base + 2u]);	×
NEW 317	out[1] = static_cast<char32_t>(upper_pairs[base + 3u]);	×
NEW 318	out[2] = static_cast<char32_t>(upper_pairs[base + 4u]);	×
NEW 319	return n;	×
320	}
321	}
322	out[0] = cp;	1✔
323	return 1u;	1✔
324	}
325
326	static bool apply_case(std::string& out, const std::string& in,	6,583✔
327	bool lower) NOEXCEPT
328	{
329	const auto u32 = to_utf32(in);	6,583✔
330	std::u32string result;	6,583✔
331	result.reserve(u32.size());	6,583✔
332
333	char32_t buf[3]{};	6,583✔
334	for (const char32_t cp : u32)	44,355✔
335	{
336	const uint32_t n = lower ? fold_lower_one(cp, buf)	37,772✔
337	: fold_upper_one(cp, buf);	1✔
338	for (uint32_t i = 0u; i < n; ++i)	75,545✔
339	result.push_back(buf[i]);	37,773✔
340	}
341
342	out = to_utf8(result);	6,583✔
343	return true;	6,583✔
344	}
345
346	// ICU dependency (ascii supported, otherwise false if HAVE_ICU not defined).
347	// ----------------------------------------------------------------------------
348
349	bool to_lower(std::string& value) NOEXCEPT	14,258✔
350	{
351	if (is_ascii(value))	14,258✔
352	{
353	value = ascii_to_lower(value);	7,676✔
354	return true;	7,676✔
355	}
356
357	return apply_case(value, value, true);	6,582✔
358	}
359
360	bool to_upper(std::string& value) NOEXCEPT	5✔
361	{
362	if (is_ascii(value))	5✔
363	{
364	value = ascii_to_upper(value);	4✔
365	return true;	4✔
366	}
367
368	return apply_case(value, value, false);	1✔
369	}
370
371	bool to_canonical_composition(std::string& value) NOEXCEPT	8✔
372	{
373	return is_ascii(value) \|\| normal_form(value, false, true);	8✔
374	}
375
376	bool to_canonical_decomposition(std::string& value) NOEXCEPT	12✔
377	{
378	return is_ascii(value) \|\| normal_form(value, false, false);	12✔
379	}
380
381	bool to_compatibility_composition(std::string& value) NOEXCEPT	3✔
382	{
383	return is_ascii(value) \|\| normal_form(value, true, true);	3✔
384	}
385
386	bool to_compatibility_decomposition(std::string& value) NOEXCEPT	14,355✔
387	{
388	return is_ascii(value) \|\| normal_form(value, true, false);	14,355✔
389	}
390
391	// No ICU dependency.
392	// ----------------------------------------------------------------------------
393
UNCOV 394	bool is_unicode(char32_t point) NOEXCEPT	×
395	{
396	return point < 0x0010ffff;	×
397	}
398
399	bool is_separator(char32_t point) NOEXCEPT	1,114,130✔
400	{
401	if (!is_unicode(point))	1,114,130✔
402	return false;
403
404	for (size_t index = 0; index < char32_separators_count; ++index)	20,054,034✔
405	if (point == char32_separators[index])	18,939,939✔
406	return true;
407
408	return false;
409	}
410
411	bool is_whitespace(char32_t point) NOEXCEPT	1,114,157✔
412	{
413	if (!is_unicode(point))	1,114,157✔
414	return false;
415
416	for (size_t index = 0; index < char32_whitespace_count; ++index)	28,967,273✔
417	if (point == char32_whitespace[index])	27,853,171✔
418	return true;
419
420	return false;
421	}
422
423	bool is_combining(char32_t point) NOEXCEPT	3,465,057✔
424	{
425	if (!is_unicode(point))	3,465,057✔
426	return false;
427
428	// github.com/python/cpython/blob/main/Modules/unicodedata.c
429	const auto data1 = unicode_data1[(point >> 7)];	3,465,057✔
430	const auto data2 = unicode_data2[(data1 << 7) + (point & sub1(1 << 7))];	3,465,057✔
431	return !is_zero(combining_index[data2]);	3,465,057✔
432	}
433
434	bool is_diacritic(char32_t point) NOEXCEPT	2,228,240✔
435	{
436	if (!is_unicode(point))	2,228,240✔
437	return false;
438
439	for (size_t index = 0; index < char32_diacritics_count; ++index)	440,993,141✔
440	if (is_contained(point, char32_diacritics[index]))	438,766,584✔
441	return true;
442
443	return false;
444	}
445
446	bool is_chinese_japanese_or_korean(char32_t point) NOEXCEPT	1,287,868✔
447	{
448	if (!is_unicode(point))	1,287,868✔
449	return false;
450
451	for (size_t index = 0; index < char32_chinese_japanese_korean_count; ++index)	36,207,775✔
452	if (is_contained(point, char32_chinese_japanese_korean[index]))	35,017,668✔
453	return true;
454
455	return false;
456	}
457
458	bool has_whitespace(const std::string& value) NOEXCEPT	11✔
459	{
460	if (value.empty())	11✔
461	return false;
462
463	const auto points = to_utf32(value);	10✔
464	return std::any_of(points.begin(), points.end(), [](char32_t character)	10✔
465	{
466	return is_whitespace(character);	14✔
467	});
468	}
469
470	std::string to_non_combining_form(const std::string& value) NOEXCEPT	31,875✔
471	{
472	if (value.empty())	31,875✔
473	return value;	8✔
474
475	// utf32 ensures each word is a single unicode character.
476	auto points = to_utf32(value);	31,867✔
477	std::erase_if(points, is_combining);	31,867✔
478	return to_utf8(points);	31,867✔
479	}
480
481	std::string to_non_diacritic_form(const std::string& value) NOEXCEPT	6✔
482	{
483	if (value.empty())	6✔
484	return value;	1✔
485
486	// utf32 ensures each word is a single unicode character.
487	auto points = to_utf32(value);	5✔
488	std::erase_if(points, is_diacritic);	5✔
489	return to_utf8(points);	5✔
490	}
491
492	// Compress ascii whitespace and remove ascii spaces between cjk characters.
493	std::string to_compressed_form(const std::string& value) NOEXCEPT	31,881✔
494	{
495	// Compress ascii whitespace to a single 0x20 between each utf32 token.
496	const auto normalized = system::join(system::split(value));	31,881✔
497
498	// utf32 ensures each word is a single unicode character.
499	const auto points = to_utf32(normalized);	31,881✔
500
501	// A character cannot be between two others if there aren't at least three.
502	if (points.size() < 3u)	31,881✔
503	return normalized;	4,108✔
504
505	// Copy the first character to output.
506	std::u32string compressed{ points.front() };	27,773✔
507
508	// Remove a single ascii whitespace between CJK characters.
509	// Front and back cannot be between two characters, so skip them.
510	for (size_t point = 1; point < sub1(points.size()); point++)	1,204,622✔
511	{
512	if (!(is_ascii_whitespace(points[point]) &&	1,176,849✔
513	is_chinese_japanese_or_korean(points[sub1(point)]) &&	170,019✔
514	is_chinese_japanese_or_korean(points[add1(point)])))	3,738✔
515	{
516	compressed += points[point];	1,173,112✔
517	}
518	}
519
520	// Copy the last character to output.
521	compressed += points.back();	27,773✔
522	return to_utf8(compressed);	27,773✔
523	}
524
525	} // namespace system
526	} // namespace libbitcoin

libbitcoin / libbitcoin-system / 22814433198

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous