• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / thomas.goyne_478

02 Aug 2024 05:19PM UTC coverage: 91.089% (-0.01%) from 91.1%
thomas.goyne_478

Pull #7944

Evergreen

tgoyne
Only track pending client resets done by the same core version

If the previous attempt at performing a client reset was done with a different
core version then we should retry the client reset, as the new version may have
fixed a bug that made the previous attempt fail (or may be a downgrade to a
version from before the bug was introduced). This also simplifies the tracking,
as it means that we don't need to be able to read trackers created by different
versions.

This also means that we can freely change the schema of the table, which this
takes advantage of to drop the unused primary key and make the error required,
as we never actually stored null and the code reading it would have crashed if
it encountered a null error.
Pull Request #7944: Only track pending client resets done by the same core version

102704 of 181534 branches covered (56.58%)

138 of 153 new or added lines in 10 files covered. (90.2%)

85 existing lines in 16 files now uncovered.

216717 of 237917 relevant lines covered (91.09%)

5947762.1 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.83
/src/realm/unicode.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2016 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <realm/unicode.hpp>
20

21
#include <algorithm>
22
#include <clocale>
23
#include <vector>
24

25
#ifdef _WIN32
26
#ifndef NOMINMAX
27
#define NOMINMAX
28
#endif
29
#include <windows.h>
30
#else
31
#include <ctype.h>
32
#endif
33

34
namespace realm {
35

36
// clang-format off
37
// Returns the number of bytes in a UTF-8 sequence whose leading byte is as specified.
//
// The answer is taken from the lead byte alone; no following byte is inspected:
//   0x00 - 0xBF -> 1   (ASCII, and continuation bytes which cannot start a sequence)
//   0xC0 - 0xDF -> 2
//   0xE0 - 0xEF -> 3
//   0xF0 - 0xF7 -> 4
//   0xF8 - 0xFB -> 5   (legacy 5-byte lead form)
//   0xFC - 0xFD -> 6   (legacy 6-byte lead form)
//   0xFE - 0xFF -> 1
size_t sequence_length(char lead)
{
    // keep 'static' else entire array will be pushed to stack at each call
    static constexpr unsigned char lengths[256] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 - 0x0F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 - 0x1F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 - 0x2F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 - 0x3F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 - 0x4F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 - 0x5F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 - 0x6F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 - 0x7F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 - 0x8F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 - 0x9F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xA0 - 0xAF
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xB0 - 0xBF
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 - 0xCF
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0 - 0xDF
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0 - 0xEF
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1  // 0xF0 - 0xFF
    };

    // Index through unsigned char so that bytes >= 0x80 do not become negative indices
    // on platforms where plain char is signed.
    return lengths[static_cast<unsigned char>(lead)];
}
120,258✔
50
// clang-format on
51

52
// Check if the next UTF-8 sequence in [begin, end) is identical to
// the one beginning at begin2. If it is, 'begin' is advanced past that
// sequence; on a mismatch 'begin' is left untouched.
//
// A sequence is the first byte plus, when that byte has its high bit set,
// every directly following byte of the form '10xxxxxx' that lies before
// 'end'. begin2 is read at the same offsets as begin and is not bounds
// checked, so the caller must make those bytes readable.
//
// An empty range contains no sequence and yields false.
bool equal_sequence(const char*& begin, const char* end, const char* begin2)
{
    // Nothing to compare, and begin[0] would be outside the range.
    if (begin == end)
        return false;

    if (begin[0] != begin2[0])
        return false;

    size_t i = 1;
    if (static_cast<int>(std::char_traits<char>::to_int_type(begin[0])) & 0x80) {
        // All following bytes matching '10xxxxxx' will be considered
        // as part of this character.
        while (begin + i != end) {
            if ((static_cast<int>(std::char_traits<char>::to_int_type(begin[i])) & (0x80 + 0x40)) != 0x80)
                break;
            if (begin[i] != begin2[i])
                return false;
            ++i;
        }
    }

    begin += i;
    return true;
}
27,186✔
76

77
// Translate from utf8 char to unicode. No check for invalid utf8; may read out of bounds! Caller must check.
78
uint32_t utf8value(const char* character)
79
{
×
80
    const unsigned char* c = reinterpret_cast<const unsigned char*>(character);
×
81
    size_t len = sequence_length(c[0]);
×
82
    uint32_t res = c[0];
×
83

84
    if (len == 1)
×
85
        return res;
×
86

87
    res &= (0x3f >> (len - 1));
×
88

89
    for (size_t i = 1; i < len; i++)
×
90
        res = ((res << 6) | (c[i] & 0x3f));
×
91

92
    return res;
×
93
}
×
94

95
// Converts UTF-8 source into upper or lower case. This function
96
// preserves the byte length of each UTF-8 character in following way:
97
// If an output character differs in size, it is simply substituded by
98
// the original character. This may of course give wrong search
99
// results in very special cases. Todo.
100
util::Optional<std::string> case_map(StringData source, bool upper)
101
{
231,684✔
102
    std::string result;
231,684✔
103
    result.resize(source.size());
231,684✔
104

105
#if defined(_WIN32)
106
    constexpr int tmp_buffer_size = 32;
107
    const char* begin = source.data();
108
    const char* end = begin + source.size();
109
    auto output = result.begin();
110
    while (begin != end) {
111
        auto n = end - begin;
112
        if (n > tmp_buffer_size) {
113
            // Break the input string into chunks - but don't break in the middle of a multibyte character
114
            const char* p = begin;
115
            const char* buffer_end = begin + tmp_buffer_size;
116
            while (p < buffer_end) {
117
                size_t len = sequence_length(*p);
118
                p += len;
119
                if (p > buffer_end) {
120
                    p -= len;
121
                    break;
122
                }
123
            }
124
            n = p - begin;
125
        }
126

127
        wchar_t tmp[tmp_buffer_size];
128

129
        int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, int(n), tmp, tmp_buffer_size);
130
        if (n2 == 0)
131
            return util::none;
132

133
        if (n2 < tmp_buffer_size)
134
            tmp[n2] = 0;
135

136
        // Note: If tmp[0] == 0, it is because the string contains a
137
        // null-chacarcter, which is perfectly fine.
138

139
        wchar_t mapped_tmp[tmp_buffer_size];
140
        LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp,
141
                      tmp_buffer_size, nullptr, nullptr, 0);
142

143
        // FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
144
        // to catch invalid UTF-8. Even though the documentation says
145
        // unambigously that it is supposed to work, it doesn't. When
146
        // the flag is specified, the function fails with error
147
        // ERROR_INVALID_FLAGS.
148
        DWORD flags = 0;
149
        auto m = static_cast<int>(end - begin);
150
        int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
151
        if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
152
            return util::none;
153

154
        if (n3 != n) {
155
            realm::safe_copy_n(begin, n, output); // Cannot handle different size, copy source
156
        }
157

158
        begin += n;
159
        output += n;
160
    }
161

162
    return result;
163
#else
164
    size_t sz = source.size();
231,684✔
165
    typedef std::char_traits<char> traits;
231,684✔
166
    for (size_t i = 0; i < sz; ++i) {
18,821,352✔
167
        char c = source[i];
18,589,674✔
168
        auto int_val = traits::to_int_type(c);
18,589,674✔
169

170
        auto copy_bytes = [&](size_t n) {
18,589,674✔
171
            if (i + n > sz) {
192✔
172
                return false;
6✔
173
            }
6✔
174
            for (size_t j = 1; j < n; j++) {
600✔
175
                result[i++] = c;
414✔
176
                c = source[i];
414✔
177
                if ((c & 0xC0) != 0x80) {
414✔
178
                    return false;
×
179
                }
×
180
            }
414✔
181
            return true;
186✔
182
        };
186✔
183

184
        if (int_val < 0x80) {
18,589,674✔
185
            // Handle ASCII
186
            if (upper && (c >= 'a' && c <= 'z')) {
18,588,846✔
187
                c -= 0x20;
16,213,008✔
188
            }
16,213,008✔
189
            else if (!upper && (c >= 'A' && c <= 'Z')) {
2,375,838✔
190
                c += 0x20;
810,972✔
191
            }
810,972✔
192
        }
18,588,846✔
193
        else {
828✔
194
            if ((int_val & 0xE0) == 0xc0) {
828✔
195
                // 2 byte utf-8
196
                if (i + 2 > sz) {
636✔
197
                    return {};
×
198
                }
×
199
                c = source[i + 1];
636✔
200
                if ((c & 0xC0) != 0x80) {
636✔
201
                    return {};
×
202
                }
×
203
                auto u = ((int_val << 6) + (traits::to_int_type(c) & 0x3F)) & 0x7FF;
636✔
204
                // Handle some Latin-1 supplement characters
205
                if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {
636✔
206
                    u -= 0x20;
270✔
207
                }
270✔
208
                else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {
366✔
209
                    u += 0x20;
180✔
210
                }
180✔
211

212
                result[i++] = static_cast<char>((u >> 6) | 0xC0);
636✔
213
                c = static_cast<char>((u & 0x3f) | 0x80);
636✔
214
            }
636✔
215
            else if ((int_val & 0xF0) == 0xE0) {
192✔
216
                // 3 byte utf-8
217
                if (!copy_bytes(3)) {
144✔
218
                    return {};
×
219
                }
×
220
            }
144✔
221
            else if ((int_val & 0xF8) == 0xF0) {
48✔
222
                // 4 byte utf-8
223
                if (!copy_bytes(4)) {
48✔
224
                    return {};
6✔
225
                }
6✔
226
            }
48✔
UNCOV
227
            else {
×
UNCOV
228
                return {};
×
UNCOV
229
            }
×
230
        }
828✔
231
        result[i] = c;
18,589,668✔
232
    }
18,589,668✔
233
    return result;
231,678✔
234
#endif
231,684✔
235
}
231,684✔
236

237
std::string case_map(StringData source, bool upper, IgnoreErrorsTag)
238
{
124,416✔
239
    return case_map(source, upper).value_or("");
124,416✔
240
}
124,416✔
241

242
// If needle == haystack, return true. NOTE: This function first
243
// performs a case insensitive *byte* compare instead of one whole
244
// UTF-8 character at a time. This is very fast, but not enough to
245
// guarantee that the strings are identical, so we need to finish off
246
// with a slower but rigorous comparison. The signature is similar in
247
// spirit to std::equal().
248
bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower)
249
{
110,112✔
250
    for (size_t i = 0; i != haystack.size(); ++i) {
187,134✔
251
        char c = haystack[i];
99,402✔
252
        if (needle_lower[i] != c && needle_upper[i] != c)
99,402✔
253
            return false;
22,380✔
254
    }
99,402✔
255

256
    const char* begin = haystack.data();
87,732✔
257
    const char* end = begin + haystack.size();
87,732✔
258
    const char* i = begin;
87,732✔
259
    while (i != end) {
114,918✔
260
        if (!equal_sequence(i, end, needle_lower + (i - begin)) &&
27,186✔
261
            !equal_sequence(i, end, needle_upper + (i - begin)))
27,186✔
262
            return false;
×
263
    }
27,186✔
264
    return true;
87,732✔
265
}
87,732✔
266

267

268
// Test if needle is a substring of haystack. The signature is similar
269
// in spirit to std::search().
270
size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size)
271
{
9,318✔
272
    // FIXME: This solution is very inefficient. Consider deploying the Boyer-Moore algorithm.
273
    size_t i = 0;
9,318✔
274
    while (needle_size <= haystack.size() - i) {
25,578✔
275
        if (equal_case_fold(haystack.substr(i, needle_size), needle_upper, needle_lower)) {
18,894✔
276
            return i;
2,634✔
277
        }
2,634✔
278
        ++i;
16,260✔
279
    }
16,260✔
280
    return haystack.size(); // Not found
6,684✔
281
}
9,318✔
282

283
/// This method takes an array that maps chars (both upper- and lowercase) to distance that can be moved
284
/// (and zero for chars not in needle), allowing the method to apply Boyer-Moore for quick substring search
285
/// The map is calculated in the StringNode<ContainsIns> class (so it can be reused across searches)
286
bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size,
287
                  const std::array<uint8_t, 256>& charmap)
288
{
7,422✔
289
    if (needle_size == 0)
7,422✔
290
        return haystack.size() != 0;
×
291

292
    // Prepare vars to avoid lookups in loop
293
    size_t last_char_pos = needle_size - 1;
7,422✔
294
    unsigned char lastCharU = needle_upper[last_char_pos];
7,422✔
295
    unsigned char lastCharL = needle_lower[last_char_pos];
7,422✔
296

297
    // Do Boyer-Moore search
298
    size_t p = last_char_pos;
7,422✔
299
    while (p < haystack.size()) {
14,340✔
300
        unsigned char c = haystack.data()[p]; // Get candidate for last char
7,740✔
301

302
        if (c == lastCharU || c == lastCharL) {
7,740✔
303
            StringData candidate = haystack.substr(p - needle_size + 1, needle_size);
870✔
304
            if (equal_case_fold(candidate, needle_upper, needle_lower))
870✔
305
                return true; // text found!
822✔
306
        }
870✔
307

308
        // If we don't have a match, see how far we can move char_pos
309
        if (charmap[c] == 0)
6,918✔
310
            p += needle_size; // char was not present in search string
6,696✔
311
        else
222✔
312
            p += charmap[c];
222✔
313
    }
6,918✔
314

315
    return false;
6,600✔
316
}
7,422✔
317

318
// Case-insensitive "like" match of text against a pattern supplied in both of
// its case variants. If text or lower is null, the result is true only when
// both are null; otherwise the decision is delegated to
// StringData::matchlike_ins(). Note that 'upper' is not null-checked here.
bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept
{
    if (text.is_null() || lower.is_null()) {
        return (text.is_null() && lower.is_null());
    }

    // matchlike_ins() receives the variants in (lower, upper) order.
    return StringData::matchlike_ins(text, lower, upper);
}
12,294✔
326

327
// Case-insensitive "like" match of text against pattern. If either argument is
// null, the result is true only when both are null. Otherwise the upper and
// lower case variants of pattern are built with the error-ignoring case_map()
// (which yields an empty string when mapping fails) and the decision is
// delegated to StringData::matchlike_ins().
bool string_like_ins(StringData text, StringData pattern) noexcept
{
    if (text.is_null() || pattern.is_null()) {
        return (text.is_null() && pattern.is_null());
    }

    std::string upper = case_map(pattern, true, IgnoreErrors);
    std::string lower = case_map(pattern, false, IgnoreErrors);

    // NOTE(review): the variants are handed over as C strings, so a pattern with
    // an embedded NUL is presumably cut short at that byte - confirm against
    // StringData's const char* constructor.
    return StringData::matchlike_ins(text, lower.c_str(), upper.c_str());
}
222✔
338

339
} // namespace realm
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc