• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / james.stone_543

10 May 2024 08:59PM CUT coverage: 90.808% (-0.03%) from 90.837%
james.stone_543

Pull #7689

Evergreen

ironage
fix a test on windows
Pull Request #7689: RNET-1141 multiprocess encryption for writers with different page sizes

102068 of 181122 branches covered (56.35%)

202 of 223 new or added lines in 3 files covered. (90.58%)

119 existing lines in 13 files now uncovered.

214742 of 236479 relevant lines covered (90.81%)

5817458.76 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.83
/src/realm/unicode.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2016 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <realm/unicode.hpp>
20

21
#include <algorithm>
22
#include <clocale>
23
#include <vector>
24

25
#ifdef _WIN32
26
#ifndef NOMINMAX
27
#define NOMINMAX
28
#endif
29
#include <windows.h>
30
#else
31
#include <ctype.h>
32
#endif
33

34
namespace realm {
35

36
// clang-format off
37
// Maps the leading byte of a UTF-8 sequence to the total number of bytes that
// sequence occupies. Bytes that cannot start a multi-byte sequence (ASCII,
// the continuation range 0x80..0xBF, and 0xFE/0xFF) yield 1. The legacy 5- and
// 6-byte lead bytes (0xF8..0xFD) are reported with their nominal length.
size_t sequence_length(char lead)
{
    // Work on the unsigned value so the range checks do not depend on whether
    // plain 'char' is signed on this platform.
    const unsigned int byte = static_cast<unsigned char>(lead);

    if (byte < 0xC0)
        return 1; // 0x00..0xBF: ASCII or a stray continuation byte
    if (byte < 0xE0)
        return 2; // 110xxxxx
    if (byte < 0xF0)
        return 3; // 1110xxxx
    if (byte < 0xF8)
        return 4; // 11110xxx
    if (byte < 0xFC)
        return 5; // 111110xx
    if (byte < 0xFE)
        return 6; // 1111110x
    return 1;     // 0xFE and 0xFF
}
129,387✔
50
// clang-format on
51

52
// Compares the UTF-8 sequence that starts at 'begin' (and is limited by 'end')
// with the bytes starting at 'begin2'. On a match 'begin' is moved past the
// sequence and true is returned; on a mismatch 'begin' is left untouched and
// false is returned. The caller must ensure that 'begin' != 'end' and that
// 'begin2' is readable for as many bytes as the sequence spans.
bool equal_sequence(const char*& begin, const char* end, const char* begin2)
{
    // A continuation byte has the bit pattern '10xxxxxx'.
    const auto is_continuation = [](char ch) {
        return (static_cast<unsigned char>(ch) & 0xC0) == 0x80;
    };

    const char* lhs = begin;
    const char* rhs = begin2;
    if (*lhs != *rhs)
        return false;

    const bool multibyte_lead = (static_cast<unsigned char>(*lhs) & 0x80) != 0;
    ++lhs;
    ++rhs;

    if (multibyte_lead) {
        // Every directly following continuation byte is treated as part of
        // this character, regardless of what the lead byte announced.
        for (; lhs != end && is_continuation(*lhs); ++lhs, ++rhs) {
            if (*lhs != *rhs)
                return false;
        }
    }

    begin = lhs;
    return true;
}
27,186✔
76

77
// Translate from utf8 char to unicode. No check for invalid utf8; may read out of bounds! Caller must check.
78
uint32_t utf8value(const char* character)
79
{
×
80
    const unsigned char* c = reinterpret_cast<const unsigned char*>(character);
×
81
    size_t len = sequence_length(c[0]);
×
82
    uint32_t res = c[0];
×
83

84
    if (len == 1)
×
85
        return res;
×
86

87
    res &= (0x3f >> (len - 1));
×
88

89
    for (size_t i = 1; i < len; i++)
×
90
        res = ((res << 6) | (c[i] & 0x3f));
×
91

92
    return res;
×
93
}
×
94

95
// Converts UTF-8 source into upper or lower case. This function
96
// preserves the byte length of each UTF-8 character in following way:
97
// If an output character differs in size, it is simply substituded by
98
// the original character. This may of course give wrong search
99
// results in very special cases. Todo.
100
util::Optional<std::string> case_map(StringData source, bool upper)
101
{
231,684✔
102
    std::string result;
231,684✔
103
    result.resize(source.size());
231,684✔
104

105
#if defined(_WIN32)
106
    constexpr int tmp_buffer_size = 32;
107
    const char* begin = source.data();
108
    const char* end = begin + source.size();
109
    auto output = result.begin();
110
    while (begin != end) {
111
        auto n = end - begin;
112
        if (n > tmp_buffer_size) {
113
            // Break the input string into chunks - but don't break in the middle of a multibyte character
114
            const char* p = begin;
115
            const char* buffer_end = begin + tmp_buffer_size;
116
            while (p < buffer_end) {
117
                size_t len = sequence_length(*p);
118
                p += len;
119
                if (p > buffer_end) {
120
                    p -= len;
121
                    break;
122
                }
123
            }
124
            n = p - begin;
125
        }
126

127
        wchar_t tmp[tmp_buffer_size];
128

129
        int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, int(n), tmp, tmp_buffer_size);
130
        if (n2 == 0)
131
            return util::none;
132

133
        if (n2 < tmp_buffer_size)
134
            tmp[n2] = 0;
135

136
        // Note: If tmp[0] == 0, it is because the string contains a
137
        // null-chacarcter, which is perfectly fine.
138

139
        wchar_t mapped_tmp[tmp_buffer_size];
140
        LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp,
141
                      tmp_buffer_size, nullptr, nullptr, 0);
142

143
        // FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
144
        // to catch invalid UTF-8. Even though the documentation says
145
        // unambigously that it is supposed to work, it doesn't. When
146
        // the flag is specified, the function fails with error
147
        // ERROR_INVALID_FLAGS.
148
        DWORD flags = 0;
149
        auto m = static_cast<int>(end - begin);
150
        int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
151
        if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
152
            return util::none;
153

154
        if (n3 != n) {
155
            realm::safe_copy_n(begin, n, output); // Cannot handle different size, copy source
156
        }
157

158
        begin += n;
159
        output += n;
160
    }
161

162
    return result;
163
#else
164
    size_t sz = source.size();
231,684✔
165
    typedef std::char_traits<char> traits;
231,684✔
166
    for (size_t i = 0; i < sz; ++i) {
18,821,352✔
167
        char c = source[i];
18,589,674✔
168
        auto int_val = traits::to_int_type(c);
18,589,674✔
169

170
        auto copy_bytes = [&](size_t n) {
18,589,674✔
171
            if (i + n > sz) {
192✔
172
                return false;
6✔
173
            }
6✔
174
            for (size_t j = 1; j < n; j++) {
600✔
175
                result[i++] = c;
414✔
176
                c = source[i];
414✔
177
                if ((c & 0xC0) != 0x80) {
414✔
178
                    return false;
×
179
                }
×
180
            }
414✔
181
            return true;
186✔
182
        };
186✔
183

184
        if (int_val < 0x80) {
18,589,674✔
185
            // Handle ASCII
186
            if (upper && (c >= 'a' && c <= 'z')) {
18,588,846✔
187
                c -= 0x20;
16,213,008✔
188
            }
16,213,008✔
189
            else if (!upper && (c >= 'A' && c <= 'Z')) {
2,375,838✔
190
                c += 0x20;
810,972✔
191
            }
810,972✔
192
        }
18,588,846✔
193
        else {
828✔
194
            if ((int_val & 0xE0) == 0xc0) {
828✔
195
                // 2 byte utf-8
196
                if (i + 2 > sz) {
636✔
197
                    return {};
×
198
                }
×
199
                c = source[i + 1];
636✔
200
                if ((c & 0xC0) != 0x80) {
636✔
201
                    return {};
×
202
                }
×
203
                auto u = ((int_val << 6) + (traits::to_int_type(c) & 0x3F)) & 0x7FF;
636✔
204
                // Handle some Latin-1 supplement characters
205
                if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {
636✔
206
                    u -= 0x20;
270✔
207
                }
270✔
208
                else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {
366✔
209
                    u += 0x20;
180✔
210
                }
180✔
211

212
                result[i++] = static_cast<char>((u >> 6) | 0xC0);
636✔
213
                c = static_cast<char>((u & 0x3f) | 0x80);
636✔
214
            }
636✔
215
            else if ((int_val & 0xF0) == 0xE0) {
192✔
216
                // 3 byte utf-8
217
                if (!copy_bytes(3)) {
144✔
218
                    return {};
×
219
                }
×
220
            }
144✔
221
            else if ((int_val & 0xF8) == 0xF0) {
48✔
222
                // 4 byte utf-8
223
                if (!copy_bytes(4)) {
48✔
224
                    return {};
6✔
225
                }
6✔
226
            }
48✔
UNCOV
227
            else {
×
UNCOV
228
                return {};
×
UNCOV
229
            }
×
230
        }
828✔
231
        result[i] = c;
18,589,668✔
232
    }
18,589,668✔
233
    return result;
231,678✔
234
#endif
231,684✔
235
}
231,684✔
236

237
std::string case_map(StringData source, bool upper, IgnoreErrorsTag)
238
{
124,416✔
239
    return case_map(source, upper).value_or("");
124,416✔
240
}
124,416✔
241

242
// If needle == haystack, return true. NOTE: This function first
243
// performs a case insensitive *byte* compare instead of one whole
244
// UTF-8 character at a time. This is very fast, but not enough to
245
// guarantee that the strings are identical, so we need to finish off
246
// with a slower but rigorous comparison. The signature is similar in
247
// spirit to std::equal().
248
bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower)
249
{
110,112✔
250
    for (size_t i = 0; i != haystack.size(); ++i) {
187,134✔
251
        char c = haystack[i];
99,402✔
252
        if (needle_lower[i] != c && needle_upper[i] != c)
99,402✔
253
            return false;
22,380✔
254
    }
99,402✔
255

256
    const char* begin = haystack.data();
87,732✔
257
    const char* end = begin + haystack.size();
87,732✔
258
    const char* i = begin;
87,732✔
259
    while (i != end) {
114,918✔
260
        if (!equal_sequence(i, end, needle_lower + (i - begin)) &&
27,186✔
261
            !equal_sequence(i, end, needle_upper + (i - begin)))
27,186✔
262
            return false;
×
263
    }
27,186✔
264
    return true;
87,732✔
265
}
87,732✔
266

267

268
// Test if needle is a substring of haystack. The signature is similar
269
// in spirit to std::search().
270
size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size)
271
{
9,318✔
272
    // FIXME: This solution is very inefficient. Consider deploying the Boyer-Moore algorithm.
273
    size_t i = 0;
9,318✔
274
    while (needle_size <= haystack.size() - i) {
25,578✔
275
        if (equal_case_fold(haystack.substr(i, needle_size), needle_upper, needle_lower)) {
18,894✔
276
            return i;
2,634✔
277
        }
2,634✔
278
        ++i;
16,260✔
279
    }
16,260✔
280
    return haystack.size(); // Not found
6,684✔
281
}
9,318✔
282

283
/// This method takes an array that maps chars (both upper- and lowercase) to distance that can be moved
284
/// (and zero for chars not in needle), allowing the method to apply Boyer-Moore for quick substring search
285
/// The map is calculated in the StringNode<ContainsIns> class (so it can be reused across searches)
286
bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size,
287
                  const std::array<uint8_t, 256>& charmap)
288
{
7,422✔
289
    if (needle_size == 0)
7,422✔
290
        return haystack.size() != 0;
×
291

292
    // Prepare vars to avoid lookups in loop
293
    size_t last_char_pos = needle_size - 1;
7,422✔
294
    unsigned char lastCharU = needle_upper[last_char_pos];
7,422✔
295
    unsigned char lastCharL = needle_lower[last_char_pos];
7,422✔
296

297
    // Do Boyer-Moore search
298
    size_t p = last_char_pos;
7,422✔
299
    while (p < haystack.size()) {
14,340✔
300
        unsigned char c = haystack.data()[p]; // Get candidate for last char
7,740✔
301

302
        if (c == lastCharU || c == lastCharL) {
7,740✔
303
            StringData candidate = haystack.substr(p - needle_size + 1, needle_size);
870✔
304
            if (equal_case_fold(candidate, needle_upper, needle_lower))
870✔
305
                return true; // text found!
822✔
306
        }
870✔
307

308
        // If we don't have a match, see how far we can move char_pos
309
        if (charmap[c] == 0)
6,918✔
310
            p += needle_size; // char was not present in search string
6,696✔
311
        else
222✔
312
            p += charmap[c];
222✔
313
    }
6,918✔
314

315
    return false;
6,600✔
316
}
7,422✔
317

318
// Case-insensitive "like" match of 'text' against a pattern that the caller
// supplies already mapped to upper and to lower case. If either 'text' or
// 'lower' is null, the result is true only when both are null.
bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept
{
    const bool text_is_null = text.is_null();
    const bool pattern_is_null = lower.is_null();
    if (text_is_null || pattern_is_null)
        return text_is_null && pattern_is_null;

    return StringData::matchlike_ins(text, lower, upper);
}
12,294✔
326

327
// Case-insensitive "like" match of 'text' against 'pattern'. The pattern is
// mapped to upper and lower case here; a pattern that case_map() rejects is
// replaced by an empty pattern (IgnoreErrors). If either argument is null,
// the result is true only when both are null.
bool string_like_ins(StringData text, StringData pattern) noexcept
{
    if (text.is_null() || pattern.is_null()) {
        return (text.is_null() && pattern.is_null());
    }

    const std::string upper = case_map(pattern, true, IgnoreErrors);
    const std::string lower = case_map(pattern, false, IgnoreErrors);

    // Hand over pointer *and* length. A bare 'const char*' (as obtained from
    // c_str()) carries no length, so a pattern containing an embedded NUL
    // byte would be cut short at that byte.
    return StringData::matchlike_ins(text, StringData(lower.data(), lower.size()),
                                     StringData(upper.data(), upper.size()));
}
222✔
338

339
} // namespace realm
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc