23079264858

Committed 14 Mar 2026 03:19AM UTC coverage: 81.31% (+0.01%) from 81.296%

Build # 23079264858

Build Type

Pull #1793

github

Committed by

web-flow

Commit Message

Merge af934943c into 4132085bc

Pull Request Pull Request #1793: Make ICU normalizers unconditional (void), style.

Run Details

139 of 145 new or added lines in 3 files covered. (95.86%)

1 existing line in 1 file now uncovered.

11002 of 13531 relevant lines covered (81.31%)

3502537.47 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.35

/src/unicode/normalization.cpp

/**
 * Copyright (c) 2011-2026 libbitcoin developers (see AUTHORS)
 *
 * This file is part of libbitcoin.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
#include <bitcoin/system/unicode/normalization.hpp>

#include <algorithm>
#include <bitcoin/system/data/data.hpp>
#include <bitcoin/system/math/math.hpp>
#include <bitcoin/system/unicode/ascii.hpp>
#include <bitcoin/system/unicode/code_points.hpp>
#include <bitcoin/system/unicode/conversion.hpp>
#include <bitcoin/system/unicode/unicode_tables.hpp>

namespace libbitcoin {
namespace system {

BC_PUSH_WARNING(NO_ARRAY_INDEXING)
BC_PUSH_WARNING(NO_DYNAMIC_ARRAY_INDEXING)

// Constants.
// ----------------------------------------------------------------------------

constexpr char32_t max_unicode_point = 0x10ffff;

// Hangul syllable constants (unicode.org/reports/tr15/#Hangul).
namespace hangul
{
    constexpr char32_t syllable_base = 0xac00;
    constexpr char32_t leading_base = 0x1100;
    constexpr char32_t vowel_base = 0x1161;
    constexpr char32_t trailing_base = 0x11a7;
    constexpr uint32_t leading_count = 19;
    constexpr uint32_t vowel_count = 21;
    constexpr uint32_t trailing_count = 28;
    constexpr uint32_t nucleus_count = vowel_count * trailing_count;
    constexpr uint32_t syllable_count = leading_count * nucleus_count;
}

// Unicode normalization helpers.
// ----------------------------------------------------------------------------

constexpr size_t components(uint32_t header) NOEXCEPT
{
    // Bits 24-30 is the number of decomposition components.
    constexpr size_t discard = 24;
    constexpr size_t visible = add1(30 - discard);
    return bit_and(shift_right(header, discard),
        unmask_right<uint32_t>(visible));
}

constexpr bool is_compatible(uint32_t header) NOEXCEPT
{
    // Most significant bit flags a compatibility-only (NFKD) mapping.
    return get_left(header);
}

static uint32_t decompose(char32_t point) NOEXCEPT
{
    constexpr size_t shift = 7;

    // Two-level trie: level-1 selects 128-entry block, level-2 resolves entry.
    const auto block = decomposition_index1[shift_right(point, shift)];
    return decomposition_index2[shift_left<uint32_t>(block, shift) +
        bit_and(point, unmask_right<char32_t>(shift))];
}

static void decompose_one(std::u32string& out, char32_t point,
    bool compatible) NOEXCEPT
{
    using namespace hangul;

    // Hangul: algorithmic decomposition (same for NFD and NFKD).
    if (point >= syllable_base &&
        point < syllable_base + syllable_count)
    {
        const auto syllable = point - syllable_base;
        out.push_back(leading_base + syllable / nucleus_count);

        const auto modulo = (syllable % nucleus_count);
        out.push_back(vowel_base + modulo / trailing_count);

        const auto trailing = syllable % trailing_count;
        if (is_nonzero(trailing))
            out.push_back(trailing_base + trailing);

        return;
    }

    if (point > max_unicode_point)
    {
        out.push_back(point);
        return;
    }

    const auto index = decompose(point);
    if (is_zero(index))
    {
        out.push_back(point);
        return;
    }

    const auto header = decomposition_pool[index];

    // Skip compatibility-only mappings in NFD mode.
    if (is_compatible(header) && !compatible)
    {
        out.push_back(point);
        return;
    }

    // Recurse into each component.
    for (size_t component{}; component < components(header); ++component)
        decompose_one(out, decomposition_pool[add1(index) + component],
            compatible);
}

static uint8_t get_canonical(char32_t point) NOEXCEPT
{
    if (point > max_unicode_point)
        return {};

    // Two-level trie: level-1 selects 128-entry block, level-2 resolves entry.
    const auto data1 = unicode_data1[shift_right(point, 7)];
    const auto data2 = unicode_data2[shift_left<uint32_t>(data1, 7) +
        bit_and(point, unmask_right<char32_t>(7))];

    return combining_index[data2];
}

// Apply canonical ordering (UAX #15 section 3.11).
static void canonical_order(std::u32string& sequence) NOEXCEPT
{
    const auto size = sequence.size();
    if (size < two)
        return;

    size_t start{};
    while (start < size)
    {
        // Skip starters.
        while (start < size && is_zero(get_canonical(sequence[start])))
            ++start;

        // Find end of combining run.
        auto end{ start };
        while (end < size && !is_zero(get_canonical(sequence[end])))
            ++end;

        // Stable-sort combining characters.
        if (end > add1(start))
        {
            const auto at = std::next(sequence.begin(), start);
            const auto to = std::next(sequence.begin(), end);
            std::stable_sort(at, to, [](auto left, auto right) NOEXCEPT
            {
                return get_canonical(left) < get_canonical(right);
            });
        }

        start = end;
    }
}

// Canonical composition lookup (0 if no composition found).
static char32_t composition_lookup(char32_t a, char32_t b) NOEXCEPT
{
    using namespace hangul;

    // Hangul L + V -> LV
    if ((a >= leading_base) &&
        (a < leading_base + leading_count) &&
        (b >= vowel_base) &&
        (b < vowel_base + vowel_count))
    {
        return syllable_base
            + ((a - leading_base) * nucleus_count)
            + ((b - vowel_base) * trailing_count);
    }

    // Hangul LV + T -> LVT
    if ((a >= syllable_base) &&
        (a < syllable_base + syllable_count) &&
        (is_zero((a - syllable_base) % trailing_count)) &&
        (b > trailing_base) &&
        (b < trailing_base + trailing_count))
    {
        return a + (b - trailing_base);
    }

    // Binary search in composition_pairs (sorted by [a, b]).
    size_t lo{}, hi{ composition_pairs_count };
    while (lo < hi)
    {
        const auto middle = to_half(lo + hi);
        const auto base = middle * 3u;
        const auto alpha = composition_pairs[base];
        const auto bravo = composition_pairs[add1(base)];

        if (alpha < a || (alpha == a && bravo < b))
        {
            lo = add1(middle);
        }
        else if (alpha > a || (alpha == a && bravo > b))
        {
            hi = middle;
        }
        else
        {
            return composition_pairs[base + 2u];
        }
    }

    return {};
}

// Apply canonical composition pass (UAX #15 canonical composition algorithm).
static void compose(std::u32string& sequence) NOEXCEPT
{
    const size_t size = sequence.size();
    if (size < two)
        return;

    for (size_t index{}; index < sequence.size();)
    {
        // Skip non-starters.
        if (!is_zero(get_canonical(sequence[index])))
        {
            ++index;
            continue;
        }

        uint8_t last{};

        // Scan for composable characters.
        for (auto inner{ add1(index) }; inner < sequence.size();)
        {
            const auto current = get_canonical(sequence[inner]);

            // Combining char "blocked" if char >= ccc between it and starter.
            const auto blocked = is_nonzero(last) && last >= current;

            if (is_zero(current))
            {
                if (!blocked)
                {
                    // sequence[inner] is also a starter, try composition.
                    const auto point = composition_lookup(sequence[index],
                        sequence[inner]);

                    if (is_nonzero(point))
                    {
                        sequence[index] = point;
                        sequence.erase(std::next(sequence.begin(), inner));
                        last = {};
                        continue;
                    }
                }

                // Stop on next starter that won't compose.
                break;
            }

            if (!blocked)
            {
                const auto point = composition_lookup(sequence[index],
                    sequence[inner]);

                if (is_nonzero(point))
                {
                    sequence[index] = point;
                    sequence.erase(std::next(sequence.begin(), inner));
                    last = {};
                    continue;
                }
            }

            last = current;
            ++inner;
        }

        ++index;
    }
}

static void normal_form(std::string& value, bool compatible,
    bool recompose) NOEXCEPT
{
    const auto points = to_utf32(value);
    std::u32string out{};

    // Most decompositions are small so reserve * two.
    out.reserve(points.size() * two);
    for (const auto point: points)
        decompose_one(out, point, compatible);

    canonical_order(out);
    if (recompose)
        compose(out);

    value = to_utf8(out);
}

// Case folding helpers.
// ----------------------------------------------------------------------------

// msvc is flagging array below indexation as pointer math.
BC_PUSH_WARNING(NO_POINTER_ARITHMETIC)

// Binary search in case_fold_pairs (sorted by from_cp).
// Returns folded code points written into out[], returns count (1-3).
static uint32_t fold_lower_one(char32_t point, char32_t out[3]) NOEXCEPT
{
    size_t lo{}, hi{ case_fold_pairs_count };
    while (lo < hi)
    {
        const auto middle = to_half(lo + hi);
        const auto base = middle * 5u;
        const auto from = case_fold_pairs[base];

        if (from < point)
        {
            lo = add1(middle);
        }
        else if (from > point)
        {
            hi = middle;
        }
        else
        {
            const auto count = case_fold_pairs[add1(base)];
            out[0] = case_fold_pairs[base + 2u];
            out[1] = case_fold_pairs[base + 3u];
            out[2] = case_fold_pairs[base + 4u];
            return count;
        }
    }

    out[0] = point;
    return 1u;
}

static uint32_t fold_upper_one(char32_t point, char32_t out[3]) NOEXCEPT
{
    size_t lo{}, hi{ upper_pairs_count };
    while (lo < hi)
    {
        const auto middle  = to_half(lo + hi);
        const auto base = middle * 5_size;
        const auto from = upper_pairs[base];

        if (from < point)
        {
            lo = add1(middle);
        }
        else if (from > point)
        {
            hi = middle;
        }
        else
        {
            const auto count = upper_pairs[add1(base)];
            out[0] = upper_pairs[base + 2u];
            out[1] = upper_pairs[base + 3u];
            out[2] = upper_pairs[base + 4u];
            return count;
        }
    }

    out[0] = point;
    return 1u;
}

BC_POP_WARNING()

static std::string apply_case(const std::string& in, bool lower) NOEXCEPT
{
    const auto points = to_utf32(in);
    std::u32string out{};
    out.reserve(points.size());

    char32_t buffer[3]{};
    for (const auto point: points)
    {
        const auto count = lower ?
            fold_lower_one(point, &buffer[0]) :
            fold_upper_one(point, &buffer[0]);

        for (size_t index{}; index < count; ++index)
            out.push_back(buffer[index]);
    }

    return to_utf8(out);
}

// Unicode case folding.
// ----------------------------------------------------------------------------

void to_lower(std::string& value) NOEXCEPT
{
    value = is_ascii(value) ? ascii_to_lower(value) : apply_case(value, true);
}

void to_upper(std::string& value) NOEXCEPT
{
    value = is_ascii(value) ? ascii_to_upper(value) : apply_case(value, false);
}

void to_canonical_composition(std::string& value) NOEXCEPT
{
    if (!is_ascii(value)) normal_form(value, false, true);
}

void to_canonical_decomposition(std::string& value) NOEXCEPT
{
    if (!is_ascii(value)) normal_form(value, false, false);
}

void to_compatibility_composition(std::string& value) NOEXCEPT
{
    if (!is_ascii(value)) normal_form(value, true, true);
}

void to_compatibility_decomposition(std::string& value) NOEXCEPT
{
    if (!is_ascii(value)) normal_form(value, true, false);
}

// Character tests (no conversions).
// ----------------------------------------------------------------------------

bool is_unicode(char32_t point) NOEXCEPT
{
    return point <= max_unicode_point;
}

bool is_separator(char32_t point) NOEXCEPT
{
    if (!is_unicode(point))
        return false;

    for (size_t index{}; index < char32_separators_count; ++index)
        if (point == char32_separators[index])
            return true;

    return false;
}

bool is_whitespace(char32_t point) NOEXCEPT
{
    if (!is_unicode(point))
        return false;

    for (size_t index{}; index < char32_whitespace_count; ++index)
        if (point == char32_whitespace[index])
            return true;

    return false;
}

bool is_combining(char32_t point) NOEXCEPT
{
    // github.com/python/cpython/blob/main/Modules/unicodedata.c
    return is_unicode(point) && !is_zero(get_canonical(point));
}

// local
constexpr bool is_contained(char32_t value,
    const char32_interval& interval) NOEXCEPT
{
    return interval.first <= value && value <= interval.second;
}

bool is_diacritic(char32_t point) NOEXCEPT
{
    if (!is_unicode(point))
        return false;

    for (size_t index{}; index < char32_diacritics_count; ++index)
        if (is_contained(point, char32_diacritics[index]))
            return true;

    return false;
}

bool is_chinese_japanese_or_korean(char32_t point) NOEXCEPT
{
    if (!is_unicode(point))
        return false;

    for (size_t index{}; index < char32_chinese_japanese_korean_count; ++index)
        if (is_contained(point, char32_chinese_japanese_korean[index]))
            return true;

    return false;
}

bool has_whitespace(const std::string& value) NOEXCEPT
{
    if (value.empty())
        return false;

    const auto points = to_utf32(value);
    return std::any_of(points.begin(), points.end(),
        [](char32_t character) NOEXCEPT
        {
            return is_whitespace(character);
        });
}

// Conversions.
// ----------------------------------------------------------------------------

std::string to_non_combining_form(const std::string& value) NOEXCEPT
{
    if (value.empty())
        return value;

    // utf32 ensures each word is a single unicode character.
    auto points = to_utf32(value);
    std::erase_if(points, is_combining);
    return to_utf8(points);
}

std::string to_non_diacritic_form(const std::string& value) NOEXCEPT
{
    if (value.empty())
        return value;

    // utf32 ensures each word is a single unicode character.
    auto points = to_utf32(value);
    std::erase_if(points, is_diacritic);
    return to_utf8(points);
}

// Compress ascii whitespace and remove ascii spaces between cjk characters.
std::string to_compressed_form(const std::string& value) NOEXCEPT
{
    // Compress ascii whitespace to a single 0x20 between each utf32 token.
    const auto normalized = system::join(system::split(value));

    // utf32 ensures each word is a single unicode character.
    const auto points = to_utf32(normalized);

    // A character cannot be between two others if there aren't at least three.
    if (points.size() < 3u)
        return normalized;

    // Copy the first character to output.
    std::u32string compressed{ points.front() };

    // Remove a single ascii whitespace between CJK characters.
    // Front and back cannot be between two characters, so skip them.
    for (auto point{ one }; point < sub1(points.size()); point++)
    {
        if (!(is_ascii_whitespace(points[point]) &&
            is_chinese_japanese_or_korean(points[sub1(point)]) &&
            is_chinese_japanese_or_korean(points[add1(point)])))
        {
            compressed += points[point];
        }
    }

    // Copy the last character to output.
    compressed += points.back();
    return to_utf8(compressed);
}

BC_POP_WARNING()
BC_POP_WARNING()

} // namespace system
} // namespace libbitcoin

1	/**
2	* Copyright (c) 2011-2026 libbitcoin developers (see AUTHORS)
3	*
4	* This file is part of libbitcoin.
5	*
6	* This program is free software: you can redistribute it and/or modify
7	* it under the terms of the GNU Affero General Public License as published by
8	* the Free Software Foundation, either version 3 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU Affero General Public License for more details.
15	*
16	* You should have received a copy of the GNU Affero General Public License
17	* along with this program. If not, see <http://www.gnu.org/licenses/>.
18	*/
19	#include <bitcoin/system/unicode/normalization.hpp>
20
21	#include <algorithm>
22	#include <bitcoin/system/data/data.hpp>
23	#include <bitcoin/system/math/math.hpp>
24	#include <bitcoin/system/unicode/ascii.hpp>
25	#include <bitcoin/system/unicode/code_points.hpp>
26	#include <bitcoin/system/unicode/conversion.hpp>
27	#include <bitcoin/system/unicode/unicode_tables.hpp>
28
29	namespace libbitcoin {
30	namespace system {
31
32	BC_PUSH_WARNING(NO_ARRAY_INDEXING)
33	BC_PUSH_WARNING(NO_DYNAMIC_ARRAY_INDEXING)
34
35	// Constants.
36	// ----------------------------------------------------------------------------
37
38	constexpr char32_t max_unicode_point = 0x10ffff;
39
40	// Hangul syllable constants (unicode.org/reports/tr15/#Hangul).
41	namespace hangul
42	{
43	constexpr char32_t syllable_base = 0xac00;
44	constexpr char32_t leading_base = 0x1100;
45	constexpr char32_t vowel_base = 0x1161;
46	constexpr char32_t trailing_base = 0x11a7;
47	constexpr uint32_t leading_count = 19;
48	constexpr uint32_t vowel_count = 21;
49	constexpr uint32_t trailing_count = 28;
50	constexpr uint32_t nucleus_count = vowel_count * trailing_count;
51	constexpr uint32_t syllable_count = leading_count * nucleus_count;
52	}
53
54	// Unicode normalization helpers.
55	// ----------------------------------------------------------------------------
56
57	constexpr size_t components(uint32_t header) NOEXCEPT	6,764✔
58	{
59	// Bits 24-30 is the number of decomposition components.
60	constexpr size_t discard = 24;	6,764✔
61	constexpr size_t visible = add1(30 - discard);	6,764✔
62	return bit_and(shift_right(header, discard),	6,764✔
63	unmask_right<uint32_t>(visible));	6,764✔
64	}
65
66	constexpr bool is_compatible(uint32_t header) NOEXCEPT	2,226✔
67	{
68	// Most significant bit flags a compatibility-only (NFKD) mapping.
69	return get_left(header);	2,226✔
70	}
71
72	static uint32_t decompose(char32_t point) NOEXCEPT	41,313✔
73	{
74	constexpr size_t shift = 7;	41,313✔
75
76	// Two-level trie: level-1 selects 128-entry block, level-2 resolves entry.
77	const auto block = decomposition_index1[shift_right(point, shift)];	41,313✔
78	return decomposition_index2[shift_left<uint32_t>(block, shift) +	41,313✔
79	bit_and(point, unmask_right<char32_t>(shift))];	41,313✔
80	}
81
82	static void decompose_one(std::u32string& out, char32_t point,	41,318✔
83	bool compatible) NOEXCEPT
84	{
85	using namespace hangul;	41,318✔
86
87	// Hangul: algorithmic decomposition (same for NFD and NFKD).
88	if (point >= syllable_base &&	41,318✔
89	point < syllable_base + syllable_count)
90	{
91	const auto syllable = point - syllable_base;	5✔
92	out.push_back(leading_base + syllable / nucleus_count);	5✔
93
94	const auto modulo = (syllable % nucleus_count);	5✔
95	out.push_back(vowel_base + modulo / trailing_count);	5✔
96
97	const auto trailing = syllable % trailing_count;	5✔
98	if (is_nonzero(trailing))	5✔
99	out.push_back(trailing_base + trailing);	2✔
100
101	return;	5✔
102	}
103
104	if (point > max_unicode_point)	41,313✔
105	{
106	out.push_back(point);	×
107	return;	×
108	}
109
110	const auto index = decompose(point);	41,313✔
111	if (is_zero(index))	41,313✔
112	{
113	out.push_back(point);	39,087✔
114	return;	39,087✔
115	}
116
117	const auto header = decomposition_pool[index];	2,226✔
118
119	// Skip compatibility-only mappings in NFD mode.
120	if (is_compatible(header) && !compatible)	2,226✔
121	{
122	out.push_back(point);	2✔
123	return;	2✔
124	}
125
126	// Recurse into each component.
127	for (size_t component{}; component < components(header); ++component)	6,764✔
128	decompose_one(out, decomposition_pool[add1(index) + component],	4,540✔
129	compatible);
130	}
131
132	static uint8_t get_canonical(char32_t point) NOEXCEPT	3,510,355✔
133	{
134	if (point > max_unicode_point)	3,510,355✔
135	return {};
136
137	// Two-level trie: level-1 selects 128-entry block, level-2 resolves entry.
138	const auto data1 = unicode_data1[shift_right(point, 7)];	3,510,355✔
139	const auto data2 = unicode_data2[shift_left<uint32_t>(data1, 7) +	3,510,355✔
140	bit_and(point, unmask_right<char32_t>(7))];	3,510,355✔
141
142	return combining_index[data2];	3,510,355✔
143	}
144
145	// Apply canonical ordering (UAX #15 section 3.11).
146	static void canonical_order(std::u32string& sequence) NOEXCEPT	6,648✔
147	{
148	const auto size = sequence.size();	6,648✔
149	if (size < two)	6,648✔
150	return;
151
152	size_t start{};
153	while (start < size)	15,060✔
154	{
155	// Skip starters.
156	while (start < size && is_zero(get_canonical(sequence[start])))	43,871✔
157	++start;	34,751✔
158
159	// Find end of combining run.
160	auto end{ start };	9,120✔
161	while (end < size && !is_zero(get_canonical(sequence[end])))	12,762✔
162	++end;	3,642✔
163
164	// Stable-sort combining characters.
165	if (end > add1(start))	9,120✔
166	{
167	const auto at = std::next(sequence.begin(), start);	27✔
168	const auto to = std::next(sequence.begin(), end);	27✔
169	std::stable_sort(at, to, [](auto left, auto right) NOEXCEPT	27✔
170	{
171	return get_canonical(left) < get_canonical(right);	40✔
172	});
173	}
174
175	start = end;
176	}
177	}
178
179	// Canonical composition lookup (0 if no composition found).
180	static char32_t composition_lookup(char32_t a, char32_t b) NOEXCEPT	21✔
181	{
182	using namespace hangul;	21✔
183
184	// Hangul L + V -> LV
185	if ((a >= leading_base) &&	21✔
186	(a < leading_base + leading_count) &&
187	(b >= vowel_base) &&	4✔
188	(b < vowel_base + vowel_count))
189	{
190	return syllable_base	4✔
191	+ ((a - leading_base) * nucleus_count)
192	+ ((b - vowel_base) * trailing_count);	4✔
193	}
194
195	// Hangul LV + T -> LVT
196	if ((a >= syllable_base) &&	17✔
197	(a < syllable_base + syllable_count) &&	3✔
198	(is_zero((a - syllable_base) % trailing_count)) &&	3✔
199	(b > trailing_base) &&	20✔
200	(b < trailing_base + trailing_count))
201	{
202	return a + (b - trailing_base);	2✔
203	}
204
205	// Binary search in composition_pairs (sorted by [a, b]).
206	size_t lo{}, hi{ composition_pairs_count };
207	while (lo < hi)	156✔
208	{
209	const auto middle = to_half(lo + hi);	147✔
210	const auto base = middle * 3u;	147✔
211	const auto alpha = composition_pairs[base];	147✔
212	const auto bravo = composition_pairs[add1(base)];	147✔
213
214	if (alpha < a \|\| (alpha == a && bravo < b))	147✔
215	{
216	lo = add1(middle);	62✔
217	}
218	else if (alpha > a \|\| (alpha == a && bravo > b))	85✔
219	{
220	hi = middle;
221	}
222	else
223	{
224	return composition_pairs[base + 2u];	6✔
225	}
226	}
227
228	return {};
229	}
230
231	// Apply canonical composition pass (UAX #15 canonical composition algorithm).
232	static void compose(std::u32string& sequence) NOEXCEPT	7✔
233	{
234	const size_t size = sequence.size();	7✔
235	if (size < two)	7✔
236	return;
237
238	for (size_t index{}; index < sequence.size();)	23✔
239	{
240	// Skip non-starters.
241	if (!is_zero(get_canonical(sequence[index])))	16✔
242	{
243	++index;	×
244	continue;	×
245	}
246
247	uint8_t last{};	16✔
248
249	// Scan for composable characters.
250	for (auto inner{ add1(index) }; inner < sequence.size();)	28✔
251	{
252	const auto current = get_canonical(sequence[inner]);	21✔
253
254	// Combining char "blocked" if char >= ccc between it and starter.
255	const auto blocked = is_nonzero(last) && last >= current;	21✔
256
257	if (is_zero(current))	21✔
258	{
259	if (!blocked)	15✔
260	{
261	// sequence[inner] is also a starter, try composition.
262	const auto point = composition_lookup(sequence[index],	15✔
263	sequence[inner]);
264
265	if (is_nonzero(point))	15✔
266	{
267	sequence[index] = point;	6✔
268	sequence.erase(std::next(sequence.begin(), inner));	6✔
269	last = {};	6✔
270	continue;	6✔
271	}
272	}
273
274	// Stop on next starter that won't compose.
275	break;
276	}
277
278	if (!blocked)	6✔
279	{
280	const auto point = composition_lookup(sequence[index],	6✔
281	sequence[inner]);
282
283	if (is_nonzero(point))	6✔
284	{
285	sequence[index] = point;	6✔
286	sequence.erase(std::next(sequence.begin(), inner));	6✔
287	last = {};	6✔
288	continue;	6✔
289	}
290	}
291
NEW 292	last = current;	×
293	++inner;	×
294	}
295
296	++index;
297	}
298	}
299
300	static void normal_form(std::string& value, bool compatible,	6,648✔
301	bool recompose) NOEXCEPT
302	{
303	const auto points = to_utf32(value);	6,648✔
304	std::u32string out{};	6,648✔
305
306	// Most decompositions are small so reserve * two.
307	out.reserve(points.size() * two);	6,648✔
308	for (const auto point: points)	43,426✔
309	decompose_one(out, point, compatible);	36,778✔
310
311	canonical_order(out);	6,648✔
312	if (recompose)	6,648✔
313	compose(out);	7✔
314
315	value = to_utf8(out);	13,296✔
316	}	6,648✔
317
318	// Case folding helpers.
319	// ----------------------------------------------------------------------------
320
321	// msvc is flagging array below indexation as pointer math.
322	BC_PUSH_WARNING(NO_POINTER_ARITHMETIC)
323
324	// Binary search in case_fold_pairs (sorted by from_cp).
325	// Returns folded code points written into out[], returns count (1-3).
326	static uint32_t fold_lower_one(char32_t point, char32_t out[3]) NOEXCEPT	37,771✔
327	{
328	size_t lo{}, hi{ case_fold_pairs_count };	37,771✔
329	while (lo < hi)	418,678✔
330	{
331	const auto middle = to_half(lo + hi);	380,917✔
332	const auto base = middle * 5u;	380,917✔
333	const auto from = case_fold_pairs[base];	380,917✔
334
335	if (from < point)	380,917✔
336	{
337	lo = add1(middle);	216,935✔
338	}
339	else if (from > point)	163,982✔
340	{
341	hi = middle;
342	}
343	else
344	{
345	const auto count = case_fold_pairs[add1(base)];	10✔
346	out[0] = case_fold_pairs[base + 2u];	10✔
347	out[1] = case_fold_pairs[base + 3u];	10✔
348	out[2] = case_fold_pairs[base + 4u];	10✔
349	return count;	10✔
350	}
351	}
352
353	out[0] = point;	37,761✔
354	return 1u;	37,761✔
355	}
356
357	static uint32_t fold_upper_one(char32_t point, char32_t out[3]) NOEXCEPT	1✔
358	{
359	size_t lo{}, hi{ upper_pairs_count };	1✔
360	while (lo < hi)	12✔
361	{
362	const auto middle = to_half(lo + hi);	11✔
363	const auto base = middle * 5_size;	11✔
364	const auto from = upper_pairs[base];	11✔
365
366	if (from < point)	11✔
367	{
368	lo = add1(middle);	4✔
369	}
370	else if (from > point)	7✔
371	{
372	hi = middle;
373	}
374	else
375	{
NEW 376	const auto count = upper_pairs[add1(base)];	×
NEW 377	out[0] = upper_pairs[base + 2u];	×
NEW 378	out[1] = upper_pairs[base + 3u];	×
NEW 379	out[2] = upper_pairs[base + 4u];	×
UNCOV 380	return count;	×
381	}
382	}
383
384	out[0] = point;	1✔
385	return 1u;	1✔
386	}
387
388	BC_POP_WARNING()
389
390	static std::string apply_case(const std::string& in, bool lower) NOEXCEPT	6,583✔
391	{
392	const auto points = to_utf32(in);	6,583✔
393	std::u32string out{};	6,583✔
394	out.reserve(points.size());	6,583✔
395
396	char32_t buffer[3]{};	6,583✔
397	for (const auto point: points)	44,355✔
398	{
399	const auto count = lower ?	37,772✔
400	fold_lower_one(point, &buffer[0]) :	37,771✔
401	fold_upper_one(point, &buffer[0]);	1✔
402
403	for (size_t index{}; index < count; ++index)	75,545✔
404	out.push_back(buffer[index]);	37,773✔
405	}
406
407	return to_utf8(out);	6,583✔
408	}
409
410	// Unicode case folding.
411	// ----------------------------------------------------------------------------
412
413	void to_lower(std::string& value) NOEXCEPT	14,258✔
414	{
415	value = is_ascii(value) ? ascii_to_lower(value) : apply_case(value, true);	14,258✔
416	}	14,258✔
417
418	void to_upper(std::string& value) NOEXCEPT	5✔
419	{
420	value = is_ascii(value) ? ascii_to_upper(value) : apply_case(value, false);	5✔
421	}	5✔
422
423	void to_canonical_composition(std::string& value) NOEXCEPT	8✔
424	{
425	if (!is_ascii(value)) normal_form(value, false, true);	8✔
426	}	8✔
427
428	void to_canonical_decomposition(std::string& value) NOEXCEPT	12✔
429	{
430	if (!is_ascii(value)) normal_form(value, false, false);	12✔
431	}	12✔
432
433	void to_compatibility_composition(std::string& value) NOEXCEPT	3✔
434	{
435	if (!is_ascii(value)) normal_form(value, true, true);	3✔
436	}	3✔
437
438	void to_compatibility_decomposition(std::string& value) NOEXCEPT	14,356✔
439	{
440	if (!is_ascii(value)) normal_form(value, true, false);	14,356✔
441	}	14,356✔
442
443	// Character tests (no conversions).
444	// ----------------------------------------------------------------------------
445
446	bool is_unicode(char32_t point) NOEXCEPT	×
447	{
NEW 448	return point <= max_unicode_point;	×
449	}
450
451	bool is_separator(char32_t point) NOEXCEPT	1,114,130✔
452	{
453	if (!is_unicode(point))	1,114,130✔
454	return false;
455
456	for (size_t index{}; index < char32_separators_count; ++index)	20,054,034✔
457	if (point == char32_separators[index])	18,939,939✔
458	return true;
459
460	return false;
461	}
462
463	bool is_whitespace(char32_t point) NOEXCEPT	1,114,157✔
464	{
465	if (!is_unicode(point))	1,114,157✔
466	return false;
467
468	for (size_t index{}; index < char32_whitespace_count; ++index)	28,967,273✔
469	if (point == char32_whitespace[index])	27,853,171✔
470	return true;
471
472	return false;
473	}
474
475	bool is_combining(char32_t point) NOEXCEPT	3,465,057✔
476	{
477	// github.com/python/cpython/blob/main/Modules/unicodedata.c
478	return is_unicode(point) && !is_zero(get_canonical(point));	3,465,057✔
479	}
480
481	// local
482	constexpr bool is_contained(char32_t value,	473,784,252✔
483	const char32_interval& interval) NOEXCEPT
484	{
485	return interval.first <= value && value <= interval.second;	473,784,252✔
486	}
487
488	bool is_diacritic(char32_t point) NOEXCEPT	2,228,240✔
489	{
490	if (!is_unicode(point))	2,228,240✔
491	return false;
492
493	for (size_t index{}; index < char32_diacritics_count; ++index)	440,993,141✔
494	if (is_contained(point, char32_diacritics[index]))	438,766,584✔
495	return true;
496
497	return false;
498	}
499
500	bool is_chinese_japanese_or_korean(char32_t point) NOEXCEPT	1,287,868✔
501	{
502	if (!is_unicode(point))	1,287,868✔
503	return false;
504
505	for (size_t index{}; index < char32_chinese_japanese_korean_count; ++index)	36,207,775✔
506	if (is_contained(point, char32_chinese_japanese_korean[index]))	35,017,668✔
507	return true;
508
509	return false;
510	}
511
512	bool has_whitespace(const std::string& value) NOEXCEPT	11✔
513	{
514	if (value.empty())	11✔
515	return false;
516
517	const auto points = to_utf32(value);	10✔
518	return std::any_of(points.begin(), points.end(),	10✔
519	[](char32_t character) NOEXCEPT	23✔
520	{
521	return is_whitespace(character);	14✔
522	});
523	}
524
525	// Conversions.
526	// ----------------------------------------------------------------------------
527
528	std::string to_non_combining_form(const std::string& value) NOEXCEPT	31,875✔
529	{
530	if (value.empty())	31,875✔
531	return value;	8✔
532
533	// utf32 ensures each word is a single unicode character.
534	auto points = to_utf32(value);	31,867✔
535	std::erase_if(points, is_combining);	31,867✔
536	return to_utf8(points);	31,867✔
537	}
538
539	std::string to_non_diacritic_form(const std::string& value) NOEXCEPT	6✔
540	{
541	if (value.empty())	6✔
542	return value;	1✔
543
544	// utf32 ensures each word is a single unicode character.
545	auto points = to_utf32(value);	5✔
546	std::erase_if(points, is_diacritic);	5✔
547	return to_utf8(points);	5✔
548	}
549
550	// Compress ascii whitespace and remove ascii spaces between cjk characters.
551	std::string to_compressed_form(const std::string& value) NOEXCEPT	31,881✔
552	{
553	// Compress ascii whitespace to a single 0x20 between each utf32 token.
554	const auto normalized = system::join(system::split(value));	31,881✔
555
556	// utf32 ensures each word is a single unicode character.
557	const auto points = to_utf32(normalized);	31,881✔
558
559	// A character cannot be between two others if there aren't at least three.
560	if (points.size() < 3u)	31,881✔
561	return normalized;	4,108✔
562
563	// Copy the first character to output.
564	std::u32string compressed{ points.front() };	27,773✔
565
566	// Remove a single ascii whitespace between CJK characters.
567	// Front and back cannot be between two characters, so skip them.
568	for (auto point{ one }; point < sub1(points.size()); point++)	1,204,622✔
569	{
570	if (!(is_ascii_whitespace(points[point]) &&	1,176,849✔
571	is_chinese_japanese_or_korean(points[sub1(point)]) &&	170,019✔
572	is_chinese_japanese_or_korean(points[add1(point)])))	3,738✔
573	{
574	compressed += points[point];	1,173,112✔
575	}
576	}
577
578	// Copy the last character to output.
579	compressed += points.back();	27,773✔
580	return to_utf8(compressed);	27,773✔
581	}
582
583	BC_POP_WARNING()
584	BC_POP_WARNING()
585
586	} // namespace system
587	} // namespace libbitcoin

libbitcoin / libbitcoin-system / 23079264858

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous