• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / 1738

06 Oct 2023 04:08PM UTC coverage: 91.61% (+0.04%) from 91.567%
1738

push

Evergreen

web-flow
Merge pull request #7040 from realm/release/13.23.0

core v13.23.0

94320 of 173524 branches covered (0.0%)

230622 of 251742 relevant lines covered (91.61%)

6562113.33 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.15
/src/realm/unicode.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2016 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <algorithm>
#include <stdexcept>
#include <vector>

#ifdef _WIN32
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#else
#include <ctype.h>
#endif

#include <realm/util/safe_int_ops.hpp>
#include <realm/unicode.hpp>

#include <clocale>

#ifdef _MSC_VER
#include <codecvt>
#else
#include <locale>
#endif
41

42
using namespace realm;
43

44
namespace {
45

46
std::wstring utf8_to_wstring(StringData str)
47
{
×
48
#if defined(_MSC_VER)
49
    // __STDC_UTF_16__ seems not to work
50
    static_assert(sizeof(wchar_t) == 2, "Expected Windows to use utf16");
51

52
    // First get the number of chars needed for output buffer
53
    int wchars_num = MultiByteToWideChar(CP_UTF8, 0, str.data(), -1, nullptr, 0);
54
    auto wstr = std::make_unique<wchar_t[]>(wchars_num);
55
    // Then convert
56
    MultiByteToWideChar(CP_UTF8, 0, str.data(), -1, wstr.get(), wchars_num);
57
    std::wstring w_result{wstr.get()};
58

59
    return w_result;
60
#else
61
    // gcc 4.7 and 4.8 do not yet support codecvt_utf8_utf16 and wstring_convert, and note that we can NOT just use
62
    // setlocale() + mbstowcs() because setlocale is extremely slow and may change locale of the entire user process
63
    static_cast<void>(str);
×
64
    REALM_ASSERT(false);
×
65
    return L"";
×
66
#endif
×
67
}
×
68

69
} // unnamed namespace
70

71

72
namespace realm {
73

74
// Highest character currently supported for *sorting* strings in Realm, when using STRING_COMPARE_CORE or
// STRING_COMPARE_CORE_SIMILAR.
75
constexpr size_t last_latin_extended_2_unicode = 591;
76

77
// Selects the string comparison method that utf8_compare() dispatches on.
//
// method   - the comparison strategy to activate.
// callback - comparison function; it is only stored when method is STRING_COMPARE_CALLBACK.
//
// Returns true when the method was activated. Returns false, leaving the current method unchanged, when
// STRING_COMPARE_CPP11 is requested but cannot be used: on Android builds, when the user's locale is "C",
// or when the user's locale cannot be constructed at all.
bool set_string_compare_method(string_compare_method_t method, StringCompareCallback callback)
{
    if (method == STRING_COMPARE_CPP11) {
#if !REALM_ANDROID
        try {
            // We cannot use C locale because it puts 'Z' before 'a'
            if (std::locale("").name() == "C")
                return false;
        }
        catch (const std::runtime_error&) {
            // std::locale("") throws when the environment names a locale that is not available; report
            // that as "method not usable" instead of letting the exception escape.
            return false;
        }
#else
        // Locale-based comparison is not offered on Android builds.
        return false;
#endif
    }
    else if (method == STRING_COMPARE_CALLBACK) {
        string_compare_callback = std::move(callback);
    }

    // other success actions
    string_compare_method = method;
    return true;
}
66✔
98

99
// clang-format off
// Maps the leading byte of a UTF-8 sequence to the total number of bytes in that sequence.
// Bytes that do not start a multi-byte sequence (0x00..0xBF, 0xFE and 0xFF) map to 1.
size_t sequence_length(char lead)
{
    // 'static' so the table is not pushed to the stack on every call
    static const unsigned char length_by_lead_byte[256] = {
        /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
    };

    const unsigned char index = static_cast<unsigned char>(lead);
    return length_by_lead_byte[index];
}
// clang-format on
114

115
// Compares the UTF-8 sequence at the start of [begin, end) with the bytes starting at begin2.
// On a match, 'begin' is moved past the sequence and true is returned. On a mismatch, 'begin' is left
// untouched and false is returned.
inline bool equal_sequence(const char*& begin, const char* end, const char* begin2)
{
    if (begin[0] != begin2[0])
        return false;

    size_t matched = 1;
    const bool high_bit_set = (static_cast<unsigned char>(begin[0]) & 0x80) != 0;
    if (high_bit_set) {
        // Every following byte of the form '10xxxxxx' is treated as belonging to this character.
        for (; begin + matched != end; ++matched) {
            const unsigned char follow = static_cast<unsigned char>(begin[matched]);
            if ((follow & 0xC0) != 0x80)
                break;
            if (begin[matched] != begin2[matched])
                return false;
        }
    }

    begin += matched;
    return true;
}
21,894✔
139

140
// Translate from utf8 char to unicode. No check for invalid utf8; may read out of bounds! Caller must check.
141
uint32_t utf8value(const char* character)
142
{
379,138,899✔
143
    const unsigned char* c = reinterpret_cast<const unsigned char*>(character);
379,138,899✔
144
    size_t len = sequence_length(c[0]);
379,138,899✔
145
    uint32_t res = c[0];
379,138,899✔
146

186,443,625✔
147
    if (len == 1)
379,138,899✔
148
        return res;
377,023,929✔
149

647,286✔
150
    res &= (0x3f >> (len - 1));
2,114,970✔
151

647,286✔
152
    for (size_t i = 1; i < len; i++)
7,811,514✔
153
        res = ((res << 6) | (c[i] & 0x3f));
5,696,544✔
154

647,286✔
155
    return res;
2,114,970✔
156
}
2,114,970✔
157

158
// Returns bool(string1 < string2) for utf-8
159
bool utf8_compare(StringData string1, StringData string2)
160
{
1,482,846✔
161
    const char* s1 = string1.data();
1,482,846✔
162
    const char* s2 = string2.data();
1,482,846✔
163

747,342✔
164
    // This collation_order array has 592 entries; one entry per unicode character in the range 0...591
747,342✔
165
    // (upto and including 'Latin Extended 2'). The value tells what 'sorting order rank' the character
747,342✔
166
    // has, such that unichar1 < unichar2 implies collation_order[unichar1] < collation_order[unichar2]. The
747,342✔
167
    // array is generated from the table found at ftp://ftp.unicode.org/Public/UCA/latest/allkeys.txt. At the
747,342✔
168
    // bottom of unicode.cpp you can find source code that reads such a file and translates it into C++ that
747,342✔
169
    // you can copy/paste in case the official table should get updated.
747,342✔
170
    //
747,342✔
171
    // NOTE: Some numbers in the array are vere large. This is because the value is the *global* rank of the
747,342✔
172
    // almost full unicode set. An optimization could be to 'normalize' all values so they ranged from
747,342✔
173
    // 0...591 so they would fit in a uint16_t array instead of uint32_t.
747,342✔
174
    //
747,342✔
175
    // It groups all characters that look visually identical, that is, it puts `a, ‡, Â` together and before
747,342✔
176
    // `¯, o, ˆ`. Note that this sorting method is wrong in some countries, such as Denmark where `Â` must
747,342✔
177
    // come last. NOTE: This is a limitation of STRING_COMPARE_CORE until we get better such 'locale' support.
747,342✔
178

747,342✔
179
    // clang-format off
747,342✔
180
    static const uint32_t collation_order_core_similar[last_latin_extended_2_unicode + 1] = {
1,482,846✔
181
        0, 1, 2, 3, 4, 5, 6, 7, 8, 456, 457, 458, 459, 460, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 461, 462, 463, 464, 8130, 465, 466, 467,
1,482,846✔
182
        468, 469, 470, 471, 472, 473, 474, 475, 8178, 8248, 8433, 8569, 8690, 8805, 8912, 9002, 9093, 9182, 476, 477, 478, 479, 480, 481, 482, 9290, 9446, 9511, 9595, 9690, 9818, 9882, 9965, 10051, 10156, 10211, 10342, 10408, 10492, 10588,
1,482,846✔
183
        10752, 10828, 10876, 10982, 11080, 11164, 11304, 11374, 11436, 11493, 11561, 483, 484, 485, 486, 487, 488, 9272, 9428, 9492, 9575, 9671, 9800, 9864, 9947, 10030, 10138, 10193, 10339, 10389, 10474, 10570, 10734, 10811, 10857, 10964, 11062, 11146, 11285, 11356,
1,482,846✔
184
        11417, 11476, 11543, 489, 490, 491, 492, 27, 28, 29, 30, 31, 32, 493, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
1,482,846✔
185
        494, 495, 8128, 8133, 8127, 8135, 496, 497, 498, 499, 9308, 500, 501, 59, 502, 503, 504, 505, 8533, 8669, 506, 12018, 507, 508, 509, 8351, 10606, 510, 8392, 8377, 8679, 511, 9317, 9315, 9329, 9353, 9348, 9341, 9383, 9545,
1,482,846✔
186
        9716, 9714, 9720, 9732, 10078, 10076, 10082, 10086, 9635, 10522, 10615, 10613, 10619, 10640, 10633, 512, 10652, 11190, 11188, 11194, 11202, 11515, 11624, 11038, 9316, 9314, 9328, 9352, 9345, 9340, 9381, 9543, 9715, 9713, 9719, 9731, 10077, 10075, 10081, 10085,
1,482,846✔
187
        9633, 10521, 10614, 10612, 10618, 10639, 10630, 513, 10651, 11189, 11187, 11193, 11199, 11514, 11623, 11521, 9361, 9360, 9319, 9318, 9359, 9358, 9536, 9535, 9538, 9537, 9542, 9541, 9540, 9539, 9620, 9619, 9626, 9625, 9744, 9743, 9718, 9717, 9736, 9735,
1,482,846✔
188
        9742, 9741, 9730, 9729, 9909, 9908, 9907, 9906, 9913, 9912, 9915, 9914, 9989, 9988, 10000, 9998, 10090, 10089, 10095, 10094, 10080, 10079, 10093, 10092, 10091, 10120, 10113, 10112, 10180, 10179, 10240, 10239, 10856, 10322, 10321, 10326, 10325, 10324, 10323, 10340,
1,482,846✔
189
        10337, 10328, 10327, 10516, 10515, 10526, 10525, 10520, 10519, 11663, 10567, 10566, 10660, 10659, 10617, 10616, 10638, 10637, 10689, 10688, 10901, 10900, 10907, 10906, 10903, 10902, 11006, 11005, 11010, 11009, 11018, 11017, 11012, 11011, 11109, 11108, 11104, 11103, 11132, 11131,
1,482,846✔
190
        11215, 11214, 11221, 11220, 11192, 11191, 11198, 11197, 11213, 11212, 11219, 11218, 11401, 11400, 11519, 11518, 11522, 11583, 11582, 11589, 11588, 11587, 11586, 11027, 9477, 9486, 9488, 9487, 11657, 11656, 10708, 9568, 9567, 9662, 9664, 9667, 9666, 11594, 9774, 9779,
1,482,846✔
191
        9784, 9860, 9859, 9937, 9943, 10014, 10135, 10129, 10266, 10265, 10363, 10387, 11275, 10554, 10556, 10723, 10673, 10672, 9946, 9945, 10802, 10801, 10929, 11653, 11652, 11054, 11058, 11136, 11139, 11138, 11141, 11232, 11231, 11282, 11347, 11537, 11536, 11597, 11596, 11613,
1,482,846✔
192
        11619, 11618, 11621, 11645, 11655, 11654, 11125, 11629, 11683, 11684, 11685, 11686, 9654, 9653, 9652, 10345, 10344, 10343, 10541, 10540, 10539, 9339, 9338, 10084, 10083, 10629, 10628, 11196, 11195, 11211, 11210, 11205, 11204, 11209, 11208, 11207, 11206, 9773, 9351, 9350,
1,482,846✔
193
        9357, 9356, 9388, 9387, 9934, 9933, 9911, 9910, 10238, 10237, 10656, 10655, 10658, 10657, 11616, 11615, 10181, 9651, 9650, 9648, 9905, 9904, 10015, 11630, 10518, 10517, 9344, 9343, 9386, 9385, 10654, 10653, 9365, 9364, 9367, 9366, 9752, 9751, 9754, 9753,
1,482,846✔
194
        10099, 10098, 10101, 10100, 10669, 10668, 10671, 10670, 10911, 10910, 10913, 10912, 11228, 11227, 11230, 11229, 11026, 11025, 11113, 11112, 11542, 11541, 9991, 9990, 10557, 9668, 10731, 10730, 11601, 11600, 9355, 9354, 9738, 9737, 10636, 10635, 10646, 10645, 10648, 10647,
1,482,846✔
195
        10650, 10649, 11528, 11527, 10382, 10563, 11142, 10182, 9641, 10848, 9409, 9563, 9562, 10364, 11134, 11048, 11606, 11660, 11659, 9478, 11262, 11354, 9769, 9768, 10186, 10185, 10855, 10854, 10936, 10935, 11535, 11534
1,482,846✔
196
    };
1,482,846✔
197

747,342✔
198
    static const uint32_t collation_order_core[last_latin_extended_2_unicode + 1] = {
1,482,846✔
199
        0, 2, 3, 4, 5, 6, 7, 8, 9, 33, 34, 35, 36, 37, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 31, 38, 39, 40, 41, 42, 43, 29, 44, 45, 46, 76, 47, 30, 48, 49, 128, 132, 134, 137, 139, 140, 143, 144, 145, 146, 50, 51, 77, 78, 79, 52, 53, 148, 182, 191, 208, 229, 263, 267, 285, 295, 325, 333, 341, 360, 363, 385, 429, 433, 439, 454, 473, 491, 527, 531, 537, 539, 557, 54, 55, 56, 57, 58, 59, 147, 181, 190, 207,
1,482,846✔
200
        228, 262, 266, 284, 294, 324, 332, 340, 359, 362, 384, 428, 432, 438, 453, 472, 490, 526, 530, 536, 538, 556, 60, 61, 62, 63, 28, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 32, 64, 72, 73, 74, 75, 65, 88, 66, 89, 149, 81, 90, 1, 91, 67, 92, 80, 136, 138, 68, 93, 94, 95, 69, 133, 386, 82, 129, 130, 131, 70, 153, 151, 157, 165, 575, 588, 570, 201, 233,
1,482,846✔
201
        231, 237, 239, 300, 298, 303, 305, 217, 371, 390, 388, 394, 402, 584, 83, 582, 495, 493, 497, 555, 541, 487, 470, 152, 150, 156, 164, 574, 587, 569, 200, 232, 230, 236, 238, 299, 297, 302, 304, 216, 370, 389, 387, 393, 401, 583, 84, 581, 494, 492, 496, 554, 540, 486, 544, 163, 162, 161, 160, 167, 166, 193, 192, 197, 196, 195, 194, 199, 198, 210, 209, 212, 211, 245, 244, 243, 242, 235, 234, 247, 246, 241, 240, 273, 272, 277, 276, 271, 270, 279, 278, 287, 286, 291, 290, 313, 312, 311, 310, 309,
1,482,846✔
202
        308, 315, 314, 301, 296, 323, 322, 328, 327, 337, 336, 434, 343, 342, 349, 348, 347, 346, 345, 344, 353, 352, 365, 364, 373, 372, 369, 368, 375, 383, 382, 400, 399, 398, 397, 586, 585, 425, 424, 442, 441, 446, 445, 444, 443, 456, 455, 458, 457, 462, 461, 460, 459, 477, 476, 475, 474, 489, 488, 505, 504, 503, 502, 501, 500, 507, 506, 549, 548, 509, 508, 533, 532, 543, 542, 545, 559, 558, 561, 560, 563, 562, 471, 183, 185, 187, 186, 189, 188, 206, 205, 204, 226, 215, 214, 213, 218, 257, 258, 259,
1,482,846✔
203
        265, 264, 282, 283, 292, 321, 316, 339, 338, 350, 354, 361, 374, 376, 405, 421, 420, 423, 422, 431, 430, 440, 468, 467, 466, 469, 480, 479, 478, 481, 524, 523, 525, 528, 553, 552, 565, 564, 571, 579, 578, 580, 135, 142, 141, 589, 534, 85, 86, 87, 71, 225, 224, 223, 357, 356, 355, 380, 379, 378, 159, 158, 307, 306, 396, 395, 499, 498, 518, 517, 512, 511, 516, 515, 514, 513, 256, 174, 173, 170, 169, 573, 572, 281, 280, 275, 274, 335, 334, 404, 403, 415, 414, 577, 576, 329, 222, 221, 220, 269,
1,482,846✔
204
        268, 293, 535, 367, 366, 172, 171, 180, 179, 411, 410, 176, 175, 178, 177, 253, 252, 255, 254, 318, 317, 320, 319, 417, 416, 419, 418, 450, 449, 452, 451, 520, 519, 522, 521, 464, 463, 483, 482, 261, 260, 289, 288, 377, 227, 427, 426, 567, 566, 155, 154, 249, 248, 409, 408, 413, 412, 392, 391, 407, 406, 547, 546, 358, 381, 485, 326, 219, 437, 168, 203, 202, 351, 484, 465, 568, 591, 590, 184, 510, 529, 251, 250, 331, 330, 436, 435, 448, 447, 551, 550
1,482,846✔
205
    };
1,482,846✔
206
    // clang-format on
747,342✔
207

747,342✔
208
    bool use_internal_sort_order =
1,482,846✔
209
        (string_compare_method == STRING_COMPARE_CORE) || (string_compare_method == STRING_COMPARE_CORE_SIMILAR);
1,482,846✔
210

747,342✔
211
    if (use_internal_sort_order) {
1,482,846✔
212
        // Core-only method. Compares in us_EN locale (sorting may be slightly inaccurate in some countries). Will
747,291✔
213
        // return arbitrary return value for invalid utf8 (silent error treatment). If one or both strings have
747,291✔
214
        // unicodes beyond 'Latin Extended 2' (0...591), then the strings are compared by unicode value.
747,291✔
215
        uint32_t char1;
1,482,783✔
216
        uint32_t char2;
1,482,783✔
217
        do {
191,105,667✔
218
            size_t remaining1 = string1.size() - (s1 - string1.data());
191,105,667✔
219
            size_t remaining2 = string2.size() - (s2 - string2.data());
191,105,667✔
220

94,601,163✔
221
            if ((remaining1 == 0) != (remaining2 == 0)) {
191,105,667✔
222
                // exactly one of the strings have ended (not both or none; xor)
151,584✔
223
                return (remaining1 == 0);
304,737✔
224
            }
304,737✔
225
            else if (remaining2 == 0 && remaining1 == 0) {
190,800,930✔
226
                // strings are identical
27✔
227
                return false;
54✔
228
            }
54✔
229

94,449,552✔
230
            // invalid utf8
94,449,552✔
231
            if (remaining1 < sequence_length(s1[0]) || remaining2 < sequence_length(s2[0]))
190,800,876✔
232
                return false;
882✔
233

94,449,030✔
234
            char1 = utf8value(s1);
190,799,994✔
235
            char2 = utf8value(s2);
190,799,994✔
236

94,449,030✔
237
            if (char1 == char2) {
190,799,994✔
238
                // Go to next characters for both strings
94,012,095✔
239
                s1 += sequence_length(s1[0]);
189,781,320✔
240
                s2 += sequence_length(s2[0]);
189,781,320✔
241
            }
189,781,320✔
242
            else {
1,018,674✔
243
                // Test if above Latin Extended B
436,935✔
244
                if (char1 > last_latin_extended_2_unicode || char2 > last_latin_extended_2_unicode)
1,162,215✔
245
                    return char1 < char2;
51,690✔
246

411,066✔
247
                const uint32_t* internal_collation_order = collation_order_core;
966,984✔
248
                if (string_compare_method == STRING_COMPARE_CORE_SIMILAR) {
966,984✔
249
                    internal_collation_order = collation_order_core_similar;
17,562✔
250
                }
17,562✔
251
                uint32_t value1 = internal_collation_order[char1];
966,984✔
252
                uint32_t value2 = internal_collation_order[char2];
966,984✔
253

411,066✔
254
                return value1 < value2;
966,984✔
255
            }
966,984✔
256

94,449,030✔
257
        } while (true);
190,218,255✔
258
    }
1,482,783✔
259
    else if (string_compare_method == STRING_COMPARE_CPP11) {
63✔
260
        // C++11. Precise sorting in user's current locale. Arbitrary return value (silent error) for invalid utf8
261
        std::wstring wstring1 = utf8_to_wstring(string1);
×
262
        std::wstring wstring2 = utf8_to_wstring(string2);
×
263
        std::locale l = std::locale("");
×
264
        bool ret = l(wstring1, wstring2);
×
265
        return ret;
×
266
    }
×
267
    else if (string_compare_method == STRING_COMPARE_CALLBACK) {
69✔
268
        // Callback method
21✔
269
        bool ret = string_compare_callback(s1, s2);
39✔
270
        return ret;
39✔
271
    }
39✔
272

158,253✔
273
    REALM_ASSERT(false);
158,460✔
274
    return false;
158,460✔
275
}
158,460✔
276

277
// Converts UTF-8 source into upper or lower case. This function
278
// preserves the byte length of each UTF-8 character in following way:
279
// If an output character differs in size, it is simply substituded by
280
// the original character. This may of course give wrong search
281
// results in very special cases. Todo.
282
util::Optional<std::string> case_map(StringData source, bool upper)
283
{
218,478✔
284
    std::string result;
218,478✔
285
    result.resize(source.size());
218,478✔
286

109,239✔
287
#if defined(_WIN32)
288
    constexpr int tmp_buffer_size = 32;
289
    const char* begin = source.data();
290
    const char* end = begin + source.size();
291
    auto output = result.begin();
292
    while (begin != end) {
293
        auto n = end - begin;
294
        if (n > tmp_buffer_size) {
295
            // Break the input string into chunks - but don't break in the middle of a multibyte character
296
            const char* p = begin;
297
            const char* buffer_end = begin + tmp_buffer_size;
298
            while (p < buffer_end) {
299
                size_t len = sequence_length(*p);
300
                p += len;
301
                if (p > buffer_end) {
302
                    p -= len;
303
                    break;
304
                }
305
            }
306
            n = p - begin;
307
        }
308

309
        wchar_t tmp[tmp_buffer_size];
310

311
        int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, int(n), tmp, tmp_buffer_size);
312
        if (n2 == 0)
313
            return util::none;
314

315
        if (n2 < tmp_buffer_size)
316
            tmp[n2] = 0;
317

318
        // Note: If tmp[0] == 0, it is because the string contains a
319
        // null-chacarcter, which is perfectly fine.
320

321
        wchar_t mapped_tmp[tmp_buffer_size];
322
        LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp,
323
                      tmp_buffer_size, nullptr, nullptr, 0);
324

325
        // FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
326
        // to catch invalid UTF-8. Even though the documentation says
327
        // unambigously that it is supposed to work, it doesn't. When
328
        // the flag is specified, the function fails with error
329
        // ERROR_INVALID_FLAGS.
330
        DWORD flags = 0;
331
        auto m = static_cast<int>(end - begin);
332
        int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
333
        if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
334
            return util::none;
335

336
        if (n3 != n) {
337
            realm::safe_copy_n(begin, n, output); // Cannot handle different size, copy source
338
        }
339

340
        begin += n;
341
        output += n;
342
    }
343

344
    return result;
345
#else
346
    size_t sz = source.size();
218,478✔
347
    typedef std::char_traits<char> traits;
218,478✔
348
    for (size_t i = 0; i < sz; ++i) {
18,713,172✔
349
        char c = source[i];
18,494,700✔
350
        auto int_val = traits::to_int_type(c);
18,494,700✔
351

9,247,338✔
352
        auto copy_bytes = [&](size_t n) {
9,247,434✔
353
            if (i + n > sz) {
192✔
354
                return false;
6✔
355
            }
6✔
356
            for (size_t j = 1; j < n; j++) {
600✔
357
                result[i++] = c;
414✔
358
                c = source[i];
414✔
359
                if ((c & 0xC0) != 0x80) {
414✔
360
                    return false;
×
361
                }
×
362
            }
414✔
363
            return true;
186✔
364
        };
186✔
365

9,247,338✔
366
        if (int_val < 0x80) {
18,494,700✔
367
            // Handle ASCII
9,246,924✔
368
            if (upper && (c >= 'a' && c <= 'z')) {
18,493,872✔
369
                c -= 0x20;
16,180,872✔
370
            }
16,180,872✔
371
            else if (!upper && (c >= 'A' && c <= 'Z')) {
2,313,000✔
372
                c += 0x20;
807,276✔
373
            }
807,276✔
374
        }
18,493,872✔
375
        else {
828✔
376
            if ((int_val & 0xE0) == 0xc0) {
828✔
377
                // 2 byte utf-8
318✔
378
                if (i + 2 > sz) {
636✔
379
                    return {};
×
380
                }
×
381
                c = source[i + 1];
636✔
382
                if ((c & 0xC0) != 0x80) {
636✔
383
                    return {};
×
384
                }
×
385
                auto u = ((int_val << 6) + (traits::to_int_type(c) & 0x3F)) & 0x7FF;
636✔
386
                // Handle some Latin-1 supplement characters
318✔
387
                if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {
636✔
388
                    u -= 0x20;
270✔
389
                }
270✔
390
                else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {
366✔
391
                    u += 0x20;
180✔
392
                }
180✔
393

318✔
394
                result[i++] = static_cast<char>((u >> 6) | 0xC0);
636✔
395
                c = static_cast<char>((u & 0x3f) | 0x80);
636✔
396
            }
636✔
397
            else if ((int_val & 0xF0) == 0xE0) {
192✔
398
                // 3 byte utf-8
72✔
399
                if (!copy_bytes(3)) {
144✔
400
                    return {};
×
401
                }
×
402
            }
48✔
403
            else if ((int_val & 0xF8) == 0xF0) {
48✔
404
                // 4 byte utf-8
24✔
405
                if (!copy_bytes(4)) {
48✔
406
                    return {};
6✔
407
                }
6✔
408
            }
×
409
            else {
×
410
                return {};
×
411
            }
×
412
        }
18,494,694✔
413
        result[i] = c;
18,494,694✔
414
    }
18,494,694✔
415
    return result;
218,475✔
416
#endif
218,478✔
417
}
218,478✔
418

419
std::string case_map(StringData source, bool upper, IgnoreErrorsTag)
420
{
111,288✔
421
    return case_map(source, upper).value_or("");
111,288✔
422
}
111,288✔
423

424
// If needle == haystack, return true. NOTE: This function first
425
// performs a case insensitive *byte* compare instead of one whole
426
// UTF-8 character at a time. This is very fast, but not enough to
427
// guarantee that the strings are identical, so we need to finish off
428
// with a slower but rigorous comparison. The signature is similar in
429
// spirit to std::equal().
430
bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower)
431
{
105,408✔
432
    for (size_t i = 0; i != haystack.size(); ++i) {
153,522✔
433
        char c = haystack[i];
66,918✔
434
        if (needle_lower[i] != c && needle_upper[i] != c)
66,918✔
435
            return false;
18,804✔
436
    }
66,918✔
437

52,704✔
438
    const char* begin = haystack.data();
96,006✔
439
    const char* end = begin + haystack.size();
86,604✔
440
    const char* i = begin;
86,604✔
441
    while (i != end) {
108,498✔
442
        if (!equal_sequence(i, end, needle_lower + (i - begin)) &&
21,894✔
443
            !equal_sequence(i, end, needle_upper + (i - begin)))
11,391✔
444
            return false;
×
445
    }
21,894✔
446
    return true;
86,604✔
447
}
86,604✔
448

449

450
// Test if needle is a substring of haystack. The signature is similar
451
// in spirit to std::search().
452
size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size)
453
{
7,626✔
454
    // FIXME: This solution is very inefficient. Consider deploying the Boyer-Moore algorithm.
3,813✔
455
    size_t i = 0;
7,626✔
456
    while (needle_size <= haystack.size() - i) {
22,830✔
457
        if (equal_case_fold(haystack.substr(i, needle_size), needle_upper, needle_lower)) {
17,382✔
458
            return i;
2,178✔
459
        }
2,178✔
460
        ++i;
15,204✔
461
    }
15,204✔
462
    return haystack.size(); // Not found
6,537✔
463
}
7,626✔
464

465
/// This method takes an array that maps chars (both upper- and lowercase) to distance that can be moved
466
/// (and zero for chars not in needle), allowing the method to apply Boyer-Moore for quick substring search
467
/// The map is calculated in the StringNode<ContainsIns> class (so it can be reused across searches)
468
bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size,
469
                  const std::array<uint8_t, 256>& charmap)
470
{
7,422✔
471
    if (needle_size == 0)
7,422✔
472
        return haystack.size() != 0;
×
473

3,711✔
474
    // Prepare vars to avoid lookups in loop
3,711✔
475
    size_t last_char_pos = needle_size - 1;
7,422✔
476
    unsigned char lastCharU = needle_upper[last_char_pos];
7,422✔
477
    unsigned char lastCharL = needle_lower[last_char_pos];
7,422✔
478

3,711✔
479
    // Do Boyer-Moore search
3,711✔
480
    size_t p = last_char_pos;
7,422✔
481
    while (p < haystack.size()) {
14,340✔
482
        unsigned char c = haystack.data()[p]; // Get candidate for last char
7,740✔
483

3,870✔
484
        if (c == lastCharU || c == lastCharL) {
7,740✔
485
            StringData candidate = haystack.substr(p - needle_size + 1, needle_size);
870✔
486
            if (equal_case_fold(candidate, needle_upper, needle_lower))
870✔
487
                return true; // text found!
822✔
488
        }
6,918✔
489

3,459✔
490
        // If we don't have a match, see how far we can move char_pos
3,459✔
491
        if (charmap[c] == 0)
6,918✔
492
            p += needle_size; // char was not present in search string
6,696✔
493
        else
222✔
494
            p += charmap[c];
222✔
495
    }
6,918✔
496

3,711✔
497
    return false;
7,011✔
498
}
7,422✔
499

500
// Case insensitive 'like' match of text against a pattern supplied in upper and lower case form.
// If text or the lower case pattern is null, the result is true only when both are null.
bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept
{
    const bool text_is_null = text.is_null();
    const bool pattern_is_null = lower.is_null();
    if (text_is_null || pattern_is_null)
        return text_is_null && pattern_is_null;

    return StringData::matchlike_ins(text, lower, upper);
}
10,578✔
508

509
// Case insensitive 'like' match of text against pattern. The upper and lower case forms of the pattern
// are computed here with case_map(), ignoring conversion errors.
// If text or pattern is null, the result is true only when both are null.
bool string_like_ins(StringData text, StringData pattern) noexcept
{
    const bool text_is_null = text.is_null();
    const bool pattern_is_null = pattern.is_null();
    if (text_is_null || pattern_is_null)
        return text_is_null && pattern_is_null;

    const std::string upper = case_map(pattern, true, IgnoreErrors);
    const std::string lower = case_map(pattern, false, IgnoreErrors);
    return StringData::matchlike_ins(text, lower.c_str(), upper.c_str());
}
192✔
520

521
} // namespace realm
522

523

524
/*
525
// This is source code for generating the tables in utf8_compare() from an allkeys.txt file:
526

527
// Unicodes up to and including 'Latin Extended 2' (0...591)
528

529
std::vector<int64_t> order;
530
order.resize(last_latin_extended_2_unicode + 1);
531
std::string line;
532
std::ifstream myfile("d:/allkeys.txt");
533

534
// Read header text
535
for (size_t t = 0; t < 19; t++)
536
    getline(myfile, line);
537

538
// Read payload
539
for (size_t entry = 0; getline(myfile, line); entry++)
540
{
541
    string str = line.substr(0, 4);
542
    int64_t unicode = std::stoul(str, nullptr, 16);
543
    if (unicode < order.size())
544
    order[unicode] = entry;
545
}
546

547
// Emit something that you can copy/paste into the Core source code in unicode.cpp
548
cout << "static const uint32_t collation_order[] = {";
549
for (size_t t = 0; t < order.size(); t++) {
550
    if (t > 0 && t % 40 == 0)
551
        cout << "\n";
552
    cout << order[t] << (t + 1 < order.size() ? ", " : "");
553
}
554

555
cout << "};";
556
myfile.close();
557
*/
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc