• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / kraen.hansen_75

23 Aug 2024 10:17AM UTC coverage: 91.772% (+0.01%) from 91.761%
kraen.hansen_75

Pull #7995

Evergreen

kraenhansen
Updated bindgen spec and helpers
Pull Request #7995: Updated bindgen spec and helpers

73344 of 130388 branches covered (56.25%)

139610 of 152127 relevant lines covered (91.77%)

6529327.69 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.83
/src/realm/unicode.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2016 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <realm/unicode.hpp>
20

21
#include <algorithm>
22
#include <clocale>
23
#include <vector>
24

25
#ifdef _WIN32
26
#ifndef NOMINMAX
27
#define NOMINMAX
28
#endif
29
#include <windows.h>
30
#else
31
#include <ctype.h>
32
#endif
33

34
namespace realm {
35

36
// clang-format off
37
// Maps the leading byte of a UTF-8 sequence to the total number of bytes in
// that sequence. Bytes that cannot start a multi-byte sequence (ASCII,
// continuation bytes 0x80-0xBF, and 0xFE/0xFF) map to 1. The legacy 5- and
// 6-byte lead bytes (0xF8-0xFD) are reported as 5 and 6.
size_t sequence_length(char lead)
{
    // Must stay 'static': otherwise the whole table would be rebuilt on the
    // stack on every call.
    static const unsigned char length_by_lead[256] = {
        // 0x00 - 0x7F: ASCII
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        // 0x80 - 0xBF: continuation bytes, treated as length 1
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        // 0xC0 - 0xDF: two-byte lead bytes
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        // 0xE0 - 0xEF: three-byte lead bytes
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // 0xF0 - 0xF7: four bytes, 0xF8 - 0xFB: five, 0xFC - 0xFD: six, 0xFE - 0xFF: one
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
    };

    const unsigned char index = static_cast<unsigned char>(lead);
    return length_by_lead[index];
}
38,308✔
50
// clang-format on
51

52
// Compares the UTF-8 sequence at the start of [begin, end) byte-for-byte with
// the sequence starting at begin2. On a match, 'begin' is moved past the
// compared sequence and true is returned; on a mismatch 'begin' is left
// untouched and false is returned.
//
// The range [begin, end) must not be empty, and begin2 must be readable for at
// least as many bytes as are compared.
bool equal_sequence(const char*& begin, const char* end, const char* begin2)
{
    using traits = std::char_traits<char>;

    const char* lhs = begin;
    const char* rhs = begin2;

    if (*lhs != *rhs)
        return false;

    // A set high bit on the first byte marks a multi-byte character.
    const bool multibyte = (static_cast<int>(traits::to_int_type(*lhs)) & 0x80) != 0;
    ++lhs;
    ++rhs;

    if (multibyte) {
        // Every following byte of the form '10xxxxxx' is treated as part of
        // this character.
        for (; lhs != end; ++lhs, ++rhs) {
            const int byte = static_cast<int>(traits::to_int_type(*lhs));
            if ((byte & (0x80 + 0x40)) != 0x80)
                break;
            if (*lhs != *rhs)
                return false;
        }
    }

    begin = lhs;
    return true;
}
18,124✔
76

77
// Translate from utf8 char to unicode. No check for invalid utf8; may read out of bounds! Caller must check.
78
uint32_t utf8value(const char* character)
79
{
×
80
    const unsigned char* c = reinterpret_cast<const unsigned char*>(character);
×
81
    size_t len = sequence_length(c[0]);
×
82
    uint32_t res = c[0];
×
83

84
    if (len == 1)
×
85
        return res;
×
86

87
    res &= (0x3f >> (len - 1));
×
88

89
    for (size_t i = 1; i < len; i++)
×
90
        res = ((res << 6) | (c[i] & 0x3f));
×
91

92
    return res;
×
93
}
×
94

95
// Converts UTF-8 source into upper or lower case. This function
96
// preserves the byte length of each UTF-8 character in following way:
97
// If an output character differs in size, it is simply substituded by
98
// the original character. This may of course give wrong search
99
// results in very special cases. Todo.
100
util::Optional<std::string> case_map(StringData source, bool upper)
101
{
154,456✔
102
    std::string result;
154,456✔
103
    result.resize(source.size());
154,456✔
104

105
#if defined(_WIN32)
106
    constexpr int tmp_buffer_size = 32;
107
    const char* begin = source.data();
108
    const char* end = begin + source.size();
109
    auto output = result.begin();
110
    while (begin != end) {
111
        auto n = end - begin;
112
        if (n > tmp_buffer_size) {
113
            // Break the input string into chunks - but don't break in the middle of a multibyte character
114
            const char* p = begin;
115
            const char* buffer_end = begin + tmp_buffer_size;
116
            while (p < buffer_end) {
117
                size_t len = sequence_length(*p);
118
                p += len;
119
                if (p > buffer_end) {
120
                    p -= len;
121
                    break;
122
                }
123
            }
124
            n = p - begin;
125
        }
126

127
        wchar_t tmp[tmp_buffer_size];
128

129
        int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, int(n), tmp, tmp_buffer_size);
130
        if (n2 == 0)
131
            return util::none;
132

133
        if (n2 < tmp_buffer_size)
134
            tmp[n2] = 0;
135

136
        // Note: If tmp[0] == 0, it is because the string contains a
137
        // null-chacarcter, which is perfectly fine.
138

139
        wchar_t mapped_tmp[tmp_buffer_size];
140
        LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp,
141
                      tmp_buffer_size, nullptr, nullptr, 0);
142

143
        // FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
144
        // to catch invalid UTF-8. Even though the documentation says
145
        // unambigously that it is supposed to work, it doesn't. When
146
        // the flag is specified, the function fails with error
147
        // ERROR_INVALID_FLAGS.
148
        DWORD flags = 0;
149
        auto m = static_cast<int>(end - begin);
150
        int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
151
        if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
152
            return util::none;
153

154
        if (n3 != n) {
155
            realm::safe_copy_n(begin, n, output); // Cannot handle different size, copy source
156
        }
157

158
        begin += n;
159
        output += n;
160
    }
161

162
    return result;
163
#else
164
    size_t sz = source.size();
154,456✔
165
    typedef std::char_traits<char> traits;
154,456✔
166
    for (size_t i = 0; i < sz; ++i) {
12,547,544✔
167
        char c = source[i];
12,393,092✔
168
        auto int_val = traits::to_int_type(c);
12,393,092✔
169

170
        auto copy_bytes = [&](size_t n) {
12,393,092✔
171
            if (i + n > sz) {
128✔
172
                return false;
4✔
173
            }
4✔
174
            for (size_t j = 1; j < n; j++) {
400✔
175
                result[i++] = c;
276✔
176
                c = source[i];
276✔
177
                if ((c & 0xC0) != 0x80) {
276✔
178
                    return false;
×
179
                }
×
180
            }
276✔
181
            return true;
124✔
182
        };
124✔
183

184
        if (int_val < 0x80) {
12,393,092✔
185
            // Handle ASCII
186
            if (upper && (c >= 'a' && c <= 'z')) {
12,392,540✔
187
                c -= 0x20;
10,808,664✔
188
            }
10,808,664✔
189
            else if (!upper && (c >= 'A' && c <= 'Z')) {
1,583,876✔
190
                c += 0x20;
540,648✔
191
            }
540,648✔
192
        }
12,392,540✔
193
        else {
552✔
194
            if ((int_val & 0xE0) == 0xc0) {
552✔
195
                // 2 byte utf-8
196
                if (i + 2 > sz) {
424✔
197
                    return {};
×
198
                }
×
199
                c = source[i + 1];
424✔
200
                if ((c & 0xC0) != 0x80) {
424✔
201
                    return {};
×
202
                }
×
203
                auto u = ((int_val << 6) + (traits::to_int_type(c) & 0x3F)) & 0x7FF;
424✔
204
                // Handle some Latin-1 supplement characters
205
                if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {
424✔
206
                    u -= 0x20;
180✔
207
                }
180✔
208
                else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {
244✔
209
                    u += 0x20;
120✔
210
                }
120✔
211

212
                result[i++] = static_cast<char>((u >> 6) | 0xC0);
424✔
213
                c = static_cast<char>((u & 0x3f) | 0x80);
424✔
214
            }
424✔
215
            else if ((int_val & 0xF0) == 0xE0) {
128✔
216
                // 3 byte utf-8
217
                if (!copy_bytes(3)) {
96✔
218
                    return {};
×
219
                }
×
220
            }
96✔
221
            else if ((int_val & 0xF8) == 0xF0) {
32✔
222
                // 4 byte utf-8
223
                if (!copy_bytes(4)) {
32✔
224
                    return {};
4✔
225
                }
4✔
226
            }
32✔
227
            else {
×
228
                return {};
×
229
            }
×
230
        }
552✔
231
        result[i] = c;
12,393,088✔
232
    }
12,393,088✔
233
    return result;
154,452✔
234
#endif
154,456✔
235
}
154,456✔
236

237
std::string case_map(StringData source, bool upper, IgnoreErrorsTag)
238
{
82,944✔
239
    return case_map(source, upper).value_or("");
82,944✔
240
}
82,944✔
241

242
// If needle == haystack, return true. NOTE: This function first
243
// performs a case insensitive *byte* compare instead of one whole
244
// UTF-8 character at a time. This is very fast, but not enough to
245
// guarantee that the strings are identical, so we need to finish off
246
// with a slower but rigorous comparison. The signature is similar in
247
// spirit to std::equal().
248
bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower)
249
{
73,408✔
250
    for (size_t i = 0; i != haystack.size(); ++i) {
124,756✔
251
        char c = haystack[i];
66,268✔
252
        if (needle_lower[i] != c && needle_upper[i] != c)
66,268✔
253
            return false;
14,920✔
254
    }
66,268✔
255

256
    const char* begin = haystack.data();
58,488✔
257
    const char* end = begin + haystack.size();
58,488✔
258
    const char* i = begin;
58,488✔
259
    while (i != end) {
76,612✔
260
        if (!equal_sequence(i, end, needle_lower + (i - begin)) &&
18,124✔
261
            !equal_sequence(i, end, needle_upper + (i - begin)))
18,124✔
262
            return false;
×
263
    }
18,124✔
264
    return true;
58,488✔
265
}
58,488✔
266

267

268
// Test if needle is a substring of haystack. The signature is similar
269
// in spirit to std::search().
270
size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size)
271
{
6,212✔
272
    // FIXME: This solution is very inefficient. Consider deploying the Boyer-Moore algorithm.
273
    size_t i = 0;
6,212✔
274
    while (needle_size <= haystack.size() - i) {
17,052✔
275
        if (equal_case_fold(haystack.substr(i, needle_size), needle_upper, needle_lower)) {
12,596✔
276
            return i;
1,756✔
277
        }
1,756✔
278
        ++i;
10,840✔
279
    }
10,840✔
280
    return haystack.size(); // Not found
4,456✔
281
}
6,212✔
282

283
/// This method takes an array that maps chars (both upper- and lowercase) to distance that can be moved
284
/// (and zero for chars not in needle), allowing the method to apply Boyer-Moore for quick substring search
285
/// The map is calculated in the StringNode<ContainsIns> class (so it can be reused across searches)
286
bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size,
287
                  const std::array<uint8_t, 256>& charmap)
288
{
4,948✔
289
    if (needle_size == 0)
4,948✔
290
        return haystack.size() != 0;
×
291

292
    // Prepare vars to avoid lookups in loop
293
    size_t last_char_pos = needle_size - 1;
4,948✔
294
    unsigned char lastCharU = needle_upper[last_char_pos];
4,948✔
295
    unsigned char lastCharL = needle_lower[last_char_pos];
4,948✔
296

297
    // Do Boyer-Moore search
298
    size_t p = last_char_pos;
4,948✔
299
    while (p < haystack.size()) {
9,560✔
300
        unsigned char c = haystack.data()[p]; // Get candidate for last char
5,160✔
301

302
        if (c == lastCharU || c == lastCharL) {
5,160✔
303
            StringData candidate = haystack.substr(p - needle_size + 1, needle_size);
580✔
304
            if (equal_case_fold(candidate, needle_upper, needle_lower))
580✔
305
                return true; // text found!
548✔
306
        }
580✔
307

308
        // If we don't have a match, see how far we can move char_pos
309
        if (charmap[c] == 0)
4,612✔
310
            p += needle_size; // char was not present in search string
4,464✔
311
        else
148✔
312
            p += charmap[c];
148✔
313
    }
4,612✔
314

315
    return false;
4,400✔
316
}
4,948✔
317

318
// Case insensitive 'like' match of 'text' against a pattern supplied as
// pre-mapped 'upper' and 'lower' variants. If either 'text' or 'lower' is
// null, the result is true only when both are null.
bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept
{
    const bool text_is_null = text.is_null();
    const bool pattern_is_null = lower.is_null();
    if (text_is_null || pattern_is_null)
        return text_is_null && pattern_is_null;

    return StringData::matchlike_ins(text, lower, upper);
}
8,196✔
326

327
// Case insensitive 'like' match of 'text' against 'pattern'. If either
// argument is null, the result is true only when both are null.
//
// The pattern is case-mapped with errors ignored, so a pattern that cannot be
// mapped is treated as an empty pattern.
bool string_like_ins(StringData text, StringData pattern) noexcept
{
    if (text.is_null() || pattern.is_null()) {
        return (text.is_null() && pattern.is_null());
    }

    std::string upper = case_map(pattern, true, IgnoreErrors);
    std::string lower = case_map(pattern, false, IgnoreErrors);

    // Pass explicit lengths: going through c_str() would measure the pattern
    // up to its first NUL byte and silently drop everything after an embedded
    // null character.
    return StringData::matchlike_ins(text, StringData(lower.data(), lower.size()),
                                     StringData(upper.data(), upper.size()));
}
148✔
338

339
} // namespace realm
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc