• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / james.stone_381

25 Sep 2023 06:35PM CUT coverage: 90.919% (+0.03%) from 90.892%
james.stone_381

Pull #6670

Evergreen

ironage
optimize: only compare strings once
Pull Request #6670: Sorting stage 3

97114 of 177952 branches covered (0.0%)

879 of 887 new or added lines in 12 files covered. (99.1%)

105 existing lines in 17 files now uncovered.

236103 of 259684 relevant lines covered (90.92%)

7062315.99 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.64
/src/realm/unicode.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2016 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <realm/unicode.hpp>
20

21
#include <algorithm>
22
#include <clocale>
23
#include <vector>
24

25
#ifdef _WIN32
26
#ifndef NOMINMAX
27
#define NOMINMAX
28
#endif
29
#include <windows.h>
30
#else
31
#include <ctype.h>
32
#endif
33

34
namespace realm {
35

36
// clang-format off
37
// Returns the number of bytes in a UTF-8 sequence whose leading byte is as specified.
38
size_t sequence_length(char lead)
39
{
2,510,778✔
40
    // keep 'static' else entire array will be pushed to stack at each call
1,238,475✔
41
    const static unsigned char lengths[256] = {
2,510,778✔
42
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2,510,778✔
43
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2,510,778✔
44
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2,510,778✔
45
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
2,510,778✔
46
    };
2,510,778✔
47

1,238,475✔
48
    return lengths[static_cast<unsigned char>(lead)];
2,510,778✔
49
}
2,510,778✔
50
// clang-format on
51

52
// Check if the next UTF-8 sequence in [begin, end) is identical to
53
// the one beginning at begin2. If it is, 'begin' is advanced
54
// accordingly.
55
bool equal_sequence(const char*& begin, const char* end, const char* begin2)
56
{
22,962✔
57
    if (begin[0] != begin2[0])
22,962✔
58
        return false;
924✔
59

11,019✔
60
    size_t i = 1;
22,038✔
61
    if (static_cast<int>(std::char_traits<char>::to_int_type(begin[0])) & 0x80) {
22,038✔
62
        // All following bytes matching '10xxxxxx' will be considered
21✔
63
        // as part of this character.
21✔
64
        while (begin + i != end) {
108✔
65
            if ((static_cast<int>(std::char_traits<char>::to_int_type(begin[i])) & (0x80 + 0x40)) != 0x80)
90✔
66
                break;
24✔
67
            if (begin[i] != begin2[i])
66✔
68
                return false;
×
69
            ++i;
66✔
70
        }
66✔
71
    }
42✔
72

11,019✔
73
    begin += i;
22,038✔
74
    return true;
22,038✔
75
}
22,038✔
76

77
// Translate from utf8 char to unicode. No check for invalid utf8; may read out of bounds! Caller must check.
78
uint32_t utf8value(const char* character)
UNCOV
79
{
×
UNCOV
80
    const unsigned char* c = reinterpret_cast<const unsigned char*>(character);
×
UNCOV
81
    size_t len = sequence_length(c[0]);
×
UNCOV
82
    uint32_t res = c[0];
×
83

UNCOV
84
    if (len == 1)
×
UNCOV
85
        return res;
×
86

UNCOV
87
    res &= (0x3f >> (len - 1));
×
88

UNCOV
89
    for (size_t i = 1; i < len; i++)
×
UNCOV
90
        res = ((res << 6) | (c[i] & 0x3f));
×
91

UNCOV
92
    return res;
×
UNCOV
93
}
×
94

95
// Converts UTF-8 source into upper or lower case. This function
96
// preserves the byte length of each UTF-8 character in following way:
97
// If an output character differs in size, it is simply substituded by
98
// the original character. This may of course give wrong search
99
// results in very special cases. Todo.
100
util::Optional<std::string> case_map(StringData source, bool upper)
101
{
218,580✔
102
    std::string result;
218,580✔
103
    result.resize(source.size());
218,580✔
104

109,290✔
105
#if defined(_WIN32)
106
    constexpr int tmp_buffer_size = 32;
107
    const char* begin = source.data();
108
    const char* end = begin + source.size();
109
    auto output = result.begin();
110
    while (begin != end) {
111
        auto n = end - begin;
112
        if (n > tmp_buffer_size) {
113
            // Break the input string into chunks - but don't break in the middle of a multibyte character
114
            const char* p = begin;
115
            const char* buffer_end = begin + tmp_buffer_size;
116
            while (p < buffer_end) {
117
                size_t len = sequence_length(*p);
118
                p += len;
119
                if (p > buffer_end) {
120
                    p -= len;
121
                    break;
122
                }
123
            }
124
            n = p - begin;
125
        }
126

127
        wchar_t tmp[tmp_buffer_size];
128

129
        int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, int(n), tmp, tmp_buffer_size);
130
        if (n2 == 0)
131
            return util::none;
132

133
        if (n2 < tmp_buffer_size)
134
            tmp[n2] = 0;
135

136
        // Note: If tmp[0] == 0, it is because the string contains a
137
        // null-chacarcter, which is perfectly fine.
138

139
        wchar_t mapped_tmp[tmp_buffer_size];
140
        LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp,
141
                      tmp_buffer_size, nullptr, nullptr, 0);
142

143
        // FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
144
        // to catch invalid UTF-8. Even though the documentation says
145
        // unambigously that it is supposed to work, it doesn't. When
146
        // the flag is specified, the function fails with error
147
        // ERROR_INVALID_FLAGS.
148
        DWORD flags = 0;
149
        auto m = static_cast<int>(end - begin);
150
        int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
151
        if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
152
            return util::none;
153

154
        if (n3 != n) {
155
            realm::safe_copy_n(begin, n, output); // Cannot handle different size, copy source
156
        }
157

158
        begin += n;
159
        output += n;
160
    }
161

162
    return result;
163
#else
164
    size_t sz = source.size();
218,580✔
165
    typedef std::char_traits<char> traits;
218,580✔
166
    for (size_t i = 0; i < sz; ++i) {
18,714,573✔
167
        char c = source[i];
18,495,999✔
168
        auto int_val = traits::to_int_type(c);
18,495,999✔
169

9,247,998✔
170
        auto copy_bytes = [&](size_t n) {
9,248,094✔
171
            if (i + n > sz) {
192✔
172
                return false;
6✔
173
            }
6✔
174
            for (size_t j = 1; j < n; j++) {
600✔
175
                result[i++] = c;
414✔
176
                c = source[i];
414✔
177
                if ((c & 0xC0) != 0x80) {
414✔
178
                    return false;
×
179
                }
×
180
            }
414✔
181
            return true;
186✔
182
        };
186✔
183

9,247,998✔
184
        if (int_val < 0x80) {
18,495,999✔
185
            // Handle ASCII
9,247,584✔
186
            if (upper && (c >= 'a' && c <= 'z')) {
18,495,171✔
187
                c -= 0x20;
16,181,580✔
188
            }
16,181,580✔
189
            else if (!upper && (c >= 'A' && c <= 'Z')) {
2,313,591✔
190
                c += 0x20;
807,156✔
191
            }
807,156✔
192
        }
18,495,171✔
193
        else {
828✔
194
            if ((int_val & 0xE0) == 0xc0) {
828✔
195
                // 2 byte utf-8
318✔
196
                if (i + 2 > sz) {
636✔
197
                    return {};
×
198
                }
×
199
                c = source[i + 1];
636✔
200
                if ((c & 0xC0) != 0x80) {
636✔
201
                    return {};
×
202
                }
×
203
                auto u = ((int_val << 6) + (traits::to_int_type(c) & 0x3F)) & 0x7FF;
636✔
204
                // Handle some Latin-1 supplement characters
318✔
205
                if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {
636✔
206
                    u -= 0x20;
270✔
207
                }
270✔
208
                else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {
366✔
209
                    u += 0x20;
180✔
210
                }
180✔
211

318✔
212
                result[i++] = static_cast<char>((u >> 6) | 0xC0);
636✔
213
                c = static_cast<char>((u & 0x3f) | 0x80);
636✔
214
            }
636✔
215
            else if ((int_val & 0xF0) == 0xE0) {
192✔
216
                // 3 byte utf-8
72✔
217
                if (!copy_bytes(3)) {
144✔
218
                    return {};
×
219
                }
×
220
            }
48✔
221
            else if ((int_val & 0xF8) == 0xF0) {
48✔
222
                // 4 byte utf-8
24✔
223
                if (!copy_bytes(4)) {
48✔
224
                    return {};
6✔
225
                }
6✔
UNCOV
226
            }
×
UNCOV
227
            else {
×
UNCOV
228
                return {};
×
UNCOV
229
            }
×
230
        }
18,495,993✔
231
        result[i] = c;
18,495,993✔
232
    }
18,495,993✔
233
    return result;
218,577✔
234
#endif
218,580✔
235
}
218,580✔
236

237
std::string case_map(StringData source, bool upper, IgnoreErrorsTag)
238
{
111,309✔
239
    return case_map(source, upper).value_or("");
111,309✔
240
}
111,309✔
241

242
// If needle == haystack, return true. NOTE: This function first
243
// performs a case insensitive *byte* compare instead of one whole
244
// UTF-8 character at a time. This is very fast, but not enough to
245
// guarantee that the strings are identical, so we need to finish off
246
// with a slower but rigorous comparison. The signature is similar in
247
// spirit to std::equal().
248
bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower)
249
{
105,432✔
250
    for (size_t i = 0; i != haystack.size(); ++i) {
153,978✔
251
        char c = haystack[i];
67,362✔
252
        if (needle_lower[i] != c && needle_upper[i] != c)
67,362✔
253
            return false;
18,816✔
254
    }
67,362✔
255

52,716✔
256
    const char* begin = haystack.data();
96,024✔
257
    const char* end = begin + haystack.size();
86,616✔
258
    const char* i = begin;
86,616✔
259
    while (i != end) {
108,654✔
260
        if (!equal_sequence(i, end, needle_lower + (i - begin)) &&
22,038✔
261
            !equal_sequence(i, end, needle_upper + (i - begin)))
11,481✔
262
            return false;
×
263
    }
22,038✔
264
    return true;
86,616✔
265
}
86,616✔
266

267

268
// Test if needle is a substring of haystack. The signature is similar
269
// in spirit to std::search().
270
size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size)
271
{
7,602✔
272
    // FIXME: This solution is very inefficient. Consider deploying the Boyer-Moore algorithm.
3,801✔
273
    size_t i = 0;
7,602✔
274
    while (needle_size <= haystack.size() - i) {
22,782✔
275
        if (equal_case_fold(haystack.substr(i, needle_size), needle_upper, needle_lower)) {
17,334✔
276
            return i;
2,154✔
277
        }
2,154✔
278
        ++i;
15,180✔
279
    }
15,180✔
280
    return haystack.size(); // Not found
6,525✔
281
}
7,602✔
282

283
/// This method takes an array that maps chars (both upper- and lowercase) to distance that can be moved
284
/// (and zero for chars not in needle), allowing the method to apply Boyer-Moore for quick substring search
285
/// The map is calculated in the StringNode<ContainsIns> class (so it can be reused across searches)
286
bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size,
287
                  const std::array<uint8_t, 256>& charmap)
288
{
7,422✔
289
    if (needle_size == 0)
7,422✔
290
        return haystack.size() != 0;
×
291

3,711✔
292
    // Prepare vars to avoid lookups in loop
3,711✔
293
    size_t last_char_pos = needle_size - 1;
7,422✔
294
    unsigned char lastCharU = needle_upper[last_char_pos];
7,422✔
295
    unsigned char lastCharL = needle_lower[last_char_pos];
7,422✔
296

3,711✔
297
    // Do Boyer-Moore search
3,711✔
298
    size_t p = last_char_pos;
7,422✔
299
    while (p < haystack.size()) {
14,340✔
300
        unsigned char c = haystack.data()[p]; // Get candidate for last char
7,740✔
301

3,870✔
302
        if (c == lastCharU || c == lastCharL) {
7,740✔
303
            StringData candidate = haystack.substr(p - needle_size + 1, needle_size);
870✔
304
            if (equal_case_fold(candidate, needle_upper, needle_lower))
870✔
305
                return true; // text found!
822✔
306
        }
6,918✔
307

3,459✔
308
        // If we don't have a match, see how far we can move char_pos
3,459✔
309
        if (charmap[c] == 0)
6,918✔
310
            p += needle_size; // char was not present in search string
6,696✔
311
        else
222✔
312
            p += charmap[c];
222✔
313
    }
6,918✔
314

3,711✔
315
    return false;
7,011✔
316
}
7,422✔
317

318
bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept
319
{
10,578✔
320
    if (text.is_null() || lower.is_null()) {
10,578✔
321
        return (text.is_null() && lower.is_null());
×
322
    }
×
323

5,289✔
324
    return StringData::matchlike_ins(text, lower, upper);
10,578✔
325
}
10,578✔
326

327
bool string_like_ins(StringData text, StringData pattern) noexcept
328
{
222✔
329
    if (text.is_null() || pattern.is_null()) {
222✔
330
        return (text.is_null() && pattern.is_null());
30✔
331
    }
30✔
332

96✔
333
    std::string upper = case_map(pattern, true, IgnoreErrors);
192✔
334
    std::string lower = case_map(pattern, false, IgnoreErrors);
192✔
335

96✔
336
    return StringData::matchlike_ins(text, lower.c_str(), upper.c_str());
192✔
337
}
192✔
338

339
} // namespace realm
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc