• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / jorgen.edelbo_337

03 Jul 2024 01:04PM UTC coverage: 90.864% (-0.1%) from 90.984%
jorgen.edelbo_337

Pull #7826

Evergreen

nicola-cab
Merge branch 'master' of github.com:realm/realm-core into next-major
Pull Request #7826: Merge Next major

102968 of 181176 branches covered (56.83%)

3131 of 3738 new or added lines in 54 files covered. (83.76%)

106 existing lines in 23 files now uncovered.

217725 of 239616 relevant lines covered (90.86%)

6844960.2 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.56
/src/realm/array_with_find.hpp
1
/*************************************************************************
2
 *
3
 * Copyright 2021 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
/*
20
Searching: The main finding function is:
21
    template <class cond, size_t bitwidth>
22
    void find(int64_t value, size_t start, size_t end, size_t baseindex, QueryState *state) const
23

24
    cond:       One of Equal, NotEqual, Greater, etc. classes
25

26
    find() will call QueryStateBase::match() for each search result. If match()
27
    returns false, find() will exit, otherwise it will keep searching remaining
28
    items in array.
29
*/
30

31
#ifndef REALM_ARRAY_WITH_FIND_HPP
32
#define REALM_ARRAY_WITH_FIND_HPP
33

34
#include <realm/array.hpp>
35
#include <realm/query_conditions.hpp>
36

37
/*
38
    MMX: mmintrin.h
39
    SSE: xmmintrin.h
40
    SSE2: emmintrin.h
41
    SSE3: pmmintrin.h
42
    SSSE3: tmmintrin.h
43
    SSE4A: ammintrin.h
44
    SSE4.1: smmintrin.h
45
    SSE4.2: nmmintrin.h
46
*/
47
#ifdef REALM_COMPILER_SSE
48
#include <emmintrin.h>             // SSE2
49
#include <realm/realm_nmmintrin.h> // SSE42
50
#endif
51

52
namespace realm {
53

54
namespace {
55
template <class T>
56
inline constexpr T no0(T v)
57
{
706,166,448✔
58
    return v == 0 ? 1 : v;
706,166,448!
59
}
706,166,448✔
60

61
template <size_t width>
62
inline constexpr uint64_t lower_bits()
63
{
3,516,315✔
64
    if (width == 1)
3,516,315✔
65
        return 0xFFFFFFFFFFFFFFFFULL;
×
66
    else if (width == 2)
3,516,315✔
67
        return 0x5555555555555555ULL;
×
68
    else if (width == 4)
3,516,315✔
69
        return 0x1111111111111111ULL;
1,431✔
70
    else if (width == 8)
3,514,884✔
71
        return 0x0101010101010101ULL;
2,596,113✔
72
    else if (width == 16)
918,771✔
73
        return 0x0001000100010001ULL;
918,744✔
74
    else if (width == 32)
27✔
75
        return 0x0000000100000001ULL;
×
76
    else if (width == 64)
27✔
77
        return 0x0000000000000001ULL;
×
78
    else {
27✔
79
        return uint64_t(-1);
27✔
80
    }
27✔
81
}
3,516,315✔
82
} // namespace
83

84
class ArrayWithFind {
85
public:
86
    ArrayWithFind(const Array& array) noexcept
87
        : m_array(array)
6,041,076✔
88
    {
12,466,455✔
89
    }
12,466,455✔
90

91
    // Main finding function - used for find_first, find_all, sum, max, min, etc.
92
    template <class cond>
93
    bool find(int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
94

95
    void find_all(IntegerColumn* result, int64_t value, size_t col_offset = 0, size_t begin = 0,
96
                  size_t end = size_t(-1)) const;
97

98
    // Non-SSE find for the four functions Equal/NotEqual/Less/Greater
99
    template <class cond, size_t bitwidth>
100
    bool compare(int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
101

102
    // Non-SSE find for Equal/NotEqual
103
    template <bool eq, size_t width>
104
    inline bool compare_equality(int64_t value, size_t start, size_t end, size_t baseindex,
105
                                 QueryStateBase* state) const;
106

107
    // Non-SSE find for Less/Greater
108
    template <bool gt, size_t bitwidth>
109
    bool compare_relation(int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
110

111
    template <class cond, size_t foreign_width, size_t width>
112
    bool compare_leafs_4(const Array* foreign, size_t start, size_t end, size_t baseindex,
113
                         QueryStateBase* state) const;
114

115
    template <class cond>
116
    bool compare_leafs(const Array* foreign, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
117

118
    template <class cond, size_t width>
119
    bool compare_leafs(const Array* foreign, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
120

121
// SSE find for the four functions Equal/NotEqual/Less/Greater
122
#ifdef REALM_COMPILER_SSE
123
    template <class cond, size_t width>
124
    bool find_sse(int64_t value, __m128i* data, size_t items, QueryStateBase* state, size_t baseindex) const;
125

126
    template <class cond, size_t width>
127
    REALM_FORCEINLINE bool find_sse_intern(__m128i* action_data, __m128i* data, size_t items, QueryStateBase* state,
128
                                           size_t baseindex) const;
129

130
#endif
131

132
    template <size_t width>
133
    inline bool test_zero(uint64_t value) const; // Tests value for 0-elements
134

135
    template <bool eq, size_t width>
136
    size_t find_zero(uint64_t v) const; // Finds position of 0/non-zero element
137

138
    template <size_t width, bool zero>
139
    uint64_t cascade(uint64_t a) const; // Sets lowermost bits of zero or non-zero elements
140

141
    template <bool gt, size_t width>
142
    int64_t
143
    find_gtlt_magic(int64_t v) const; // Compute magic constant needed for searching for value 'v' using bit hacks
144

145
    size_t first_set_bit(uint32_t v) const;
146
    size_t first_set_bit64(int64_t v) const;
147

148
    // Find value greater/less in 64-bit chunk - only works for positive values
149
    template <bool gt, size_t width>
150
    bool find_gtlt_fast(uint64_t chunk, uint64_t magic, QueryStateBase* state, size_t baseindex) const;
151

152
    // Find value greater/less in 64-bit chunk - no constraints
153
    template <bool gt, size_t width>
154
    bool find_gtlt(int64_t v, uint64_t chunk, QueryStateBase* state, size_t baseindex) const;
155
    // Optimized implementation for release mode
156
    template <class cond, size_t bitwidth>
157
    bool find_optimized(int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
158

159
private:
160
    const Array& m_array;
161

162
    bool find_all_will_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const;
163
};
164
//*************************************************************************************
165
// Finding code                                                                       *
166
//*************************************************************************************
167

168
template <size_t width, bool zero>
169
uint64_t ArrayWithFind::cascade(uint64_t a) const
170
{
171
    // Takes a chunk of values as argument and sets the least significant bit for each
172
    // element which is zero or non-zero, depending on the template parameter.
173
    // Example for zero=true:
174
    // width == 4 and a = 0x5fd07a107610f610
175
    // will return:       0x0001000100010001
176

177
    // static values needed for fast population count
178
    const uint64_t m1 = 0x5555555555555555ULL;
179

180
    if (width == 1) {
181
        return zero ? ~a : a;
182
    }
183
    else if (width == 2) {
184
        // Masks to avoid spillover between segments in cascades
185
        const uint64_t c1 = ~0ULL / 0x3 * 0x1;
186

187
        a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
188
        a &= m1;            // isolate single bit in each segment
189
        if (zero)
190
            a ^= m1; // reverse isolated bits if checking for zeroed segments
191

192
        return a;
193
    }
194
    else if (width == 4) {
195
        const uint64_t m = ~0ULL / 0xF * 0x1;
196

197
        // Masks to avoid spillover between segments in cascades
198
        const uint64_t c1 = ~0ULL / 0xF * 0x7;
199
        const uint64_t c2 = ~0ULL / 0xF * 0x3;
200

201
        a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
202
        a |= (a >> 2) & c2;
203
        a &= m; // isolate single bit in each segment
204
        if (zero)
205
            a ^= m; // reverse isolated bits if checking for zeroed segments
206

207
        return a;
208
    }
209
    else if (width == 8) {
210
        const uint64_t m = ~0ULL / 0xFF * 0x1;
211

212
        // Masks to avoid spillover between segments in cascades
213
        const uint64_t c1 = ~0ULL / 0xFF * 0x7F;
214
        const uint64_t c2 = ~0ULL / 0xFF * 0x3F;
215
        const uint64_t c3 = ~0ULL / 0xFF * 0x0F;
216

217
        a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
218
        a |= (a >> 2) & c2;
219
        a |= (a >> 4) & c3;
220
        a &= m; // isolate single bit in each segment
221
        if (zero)
222
            a ^= m; // reverse isolated bits if checking for zeroed segments
223

224
        return a;
225
    }
226
    else if (width == 16) {
227
        const uint64_t m = ~0ULL / 0xFFFF * 0x1;
228

229
        // Masks to avoid spillover between segments in cascades
230
        const uint64_t c1 = ~0ULL / 0xFFFF * 0x7FFF;
231
        const uint64_t c2 = ~0ULL / 0xFFFF * 0x3FFF;
232
        const uint64_t c3 = ~0ULL / 0xFFFF * 0x0FFF;
233
        const uint64_t c4 = ~0ULL / 0xFFFF * 0x00FF;
234

235
        a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
236
        a |= (a >> 2) & c2;
237
        a |= (a >> 4) & c3;
238
        a |= (a >> 8) & c4;
239
        a &= m; // isolate single bit in each segment
240
        if (zero)
241
            a ^= m; // reverse isolated bits if checking for zeroed segments
242

243
        return a;
244
    }
245

246
    else if (width == 32) {
247
        const uint64_t m = ~0ULL / 0xFFFFFFFF * 0x1;
248

249
        // Masks to avoid spillover between segments in cascades
250
        const uint64_t c1 = ~0ULL / 0xFFFFFFFF * 0x7FFFFFFF;
251
        const uint64_t c2 = ~0ULL / 0xFFFFFFFF * 0x3FFFFFFF;
252
        const uint64_t c3 = ~0ULL / 0xFFFFFFFF * 0x0FFFFFFF;
253
        const uint64_t c4 = ~0ULL / 0xFFFFFFFF * 0x00FFFFFF;
254
        const uint64_t c5 = ~0ULL / 0xFFFFFFFF * 0x0000FFFF;
255

256
        a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
257
        a |= (a >> 2) & c2;
258
        a |= (a >> 4) & c3;
259
        a |= (a >> 8) & c4;
260
        a |= (a >> 16) & c5;
261
        a &= m; // isolate single bit in each segment
262
        if (zero)
263
            a ^= m; // reverse isolated bits if checking for zeroed segments
264

265
        return a;
266
    }
267
    else if (width == 64) {
268
        return (a == 0) == zero;
269
    }
270
    else {
271
        REALM_ASSERT_DEBUG(false);
272
        return uint64_t(-1);
273
    }
274
}
275

276
// This is the main finding function for Array. Other finding functions are just
277
// wrappers around this one. Search for 'value' using condition cond (Equal,
278
// NotEqual, Less, etc) and call QueryStateBase::match() for each match. Break and
279
// return if QueryStateBase::match() returns false or 'end' is reached.
280
template <class cond, size_t bitwidth>
281
bool ArrayWithFind::find_optimized(int64_t value, size_t start, size_t end, size_t baseindex,
282
                                   QueryStateBase* state) const
283
{
12,470,454✔
284
    REALM_ASSERT_DEBUG(start <= m_array.m_size && (end <= m_array.m_size || end == size_t(-1)) && start <= end);
12,470,454✔
285

286
    size_t start2 = start;
12,470,454✔
287
    cond c;
12,470,454✔
288

289
    if (end == npos)
12,470,454✔
290
        end = m_array.m_size;
1,250,535✔
291

292
    if (!(m_array.m_size > start2 && start2 < end))
12,470,454✔
293
        return true;
106,422✔
294

295
    constexpr int64_t lbound = Array::lbound_for_width(bitwidth);
12,364,032✔
296
    constexpr int64_t ubound = Array::ubound_for_width(bitwidth);
12,364,032✔
297

298
    // Return immediately if no items in array can match (such as if cond == Greater && value == 100 &&
299
    // m_ubound == 15)
300
    if (!c.can_match(value, lbound, ubound))
12,364,032✔
301
        return true;
116,805✔
302

303
    // optimization if all items are guaranteed to match (such as cond == NotEqual && value == 100 && m_ubound == 15)
304
    if (c.will_match(value, lbound, ubound)) {
12,247,227✔
305
        return find_all_will_match(start2, end, baseindex, state);
284,697✔
306
    }
284,697✔
307

308
    // finder cannot handle this bitwidth
309
    REALM_ASSERT_3(m_array.m_width, !=, 0);
11,962,530!
310

311
#if defined(REALM_COMPILER_SSE)
5,820,993✔
312
    // Only use SSE if payload is at least one SSE chunk (128 bits) in size. Also note taht SSE doesn't support
313
    // Less-than comparison for 64-bit values.
314
    if ((!(std::is_same<cond, Less>::value && m_array.m_width == 64)) && end - start2 >= sizeof(__m128i) &&
5,820,993!
315
        m_array.m_width >= 8 &&
5,820,993!
316
        (sseavx<42>() || (sseavx<30>() && std::is_same<cond, Equal>::value && m_array.m_width < 64))) {
5,820,993!
317

318
        // find_sse() must start2 at 16-byte boundary, so search area before that using compare_equality()
319
        __m128i* const a =
4,013,448✔
320
            reinterpret_cast<__m128i*>(round_up(m_array.m_data + start2 * bitwidth / 8, sizeof(__m128i)));
4,013,448✔
321
        __m128i* const b =
4,013,448✔
322
            reinterpret_cast<__m128i*>(round_down(m_array.m_data + end * bitwidth / 8, sizeof(__m128i)));
4,013,448✔
323

324
        if (!compare<cond, bitwidth>(value, start2, (reinterpret_cast<char*>(a) - m_array.m_data) * 8 / no0(bitwidth),
4,013,448!
325
                                     baseindex, state))
4,013,448✔
326
            return false;
43,551✔
327

328
        // Search aligned area with SSE
329
        if (b > a) {
3,969,897!
330
            if (sseavx<42>()) {
3,960,162!
331
                if (!find_sse<cond, bitwidth>(
3,960,156!
332
                        value, a, b - a, state,
3,960,156✔
333
                        baseindex + ((reinterpret_cast<char*>(a) - m_array.m_data) * 8 / no0(bitwidth))))
3,960,156✔
334
                    return false;
3,344,526✔
335
            }
3,960,156✔
336
            else if (sseavx<30>()) {
6!
337

338
                if (!find_sse<Equal, bitwidth>(
×
339
                        value, a, b - a, state,
340
                        baseindex + ((reinterpret_cast<char*>(a) - m_array.m_data) * 8 / no0(bitwidth))))
341
                    return false;
342
            }
343
        }
3,960,162✔
344

345
        // Search remainder with compare_equality()
346
        if (!compare<cond, bitwidth>(value, (reinterpret_cast<char*>(b) - m_array.m_data) * 8 / no0(bitwidth), end,
625,371!
347
                                     baseindex, state))
625,371✔
348
            return false;
97,386✔
349

350
        return true;
527,985✔
351
    }
625,371✔
352
    else {
1,807,545✔
353
        return compare<cond, bitwidth>(value, start2, end, baseindex, state);
1,807,545✔
354
    }
1,807,545✔
355
#else
356
    return compare<cond, bitwidth>(value, start2, end, baseindex, state);
6,141,537✔
357
#endif
6,316,164✔
358
}
12,137,157✔
359

360
// Tests if any chunk in 'value' is 0
361
template <size_t width>
362
inline bool ArrayWithFind::test_zero(uint64_t value) const
363
{
112,883,985✔
364
    uint64_t hasZeroByte;
112,883,985✔
365
    constexpr uint64_t lower = lower_bits<width>();
112,883,985✔
366
    constexpr uint64_t upper = lower_bits<width>() * 1ULL << (width == 0 ? 0 : (width - 1ULL));
112,883,985✔
367
    hasZeroByte = (value - lower) & ~value & upper;
112,883,985✔
368
    return hasZeroByte != 0;
112,883,985✔
369
}
112,883,985✔
370

371
// Finds first zero (if eq == true) or non-zero (if eq == false) element in v and returns its position.
372
// IMPORTANT: This function assumes that at least 1 item matches (test this with test_zero() or other means first)!
373
template <bool eq, size_t width>
374
size_t ArrayWithFind::find_zero(uint64_t v) const
375
{
28,102,422✔
376
    size_t start = 0;
28,102,422✔
377
    uint64_t hasZeroByte;
28,102,422✔
378
    // Warning free way of computing (1ULL << width) - 1
379
    uint64_t mask = (width == 64 ? ~0ULL : ((1ULL << (width == 64 ? 0 : width)) - 1ULL));
28,102,422✔
380

381
    if (eq == (((v >> (width * start)) & mask) == 0)) {
28,102,422!
382
        return 0;
26,745,498✔
383
    }
26,745,498✔
384

385
    // Bisection optimization, speeds up small bitwidths with high match frequency. More partions than 2 do NOT pay
386
    // off because the work done by test_zero() is wasted for the cases where the value exists in first half, but
387
    // useful if it exists in last half. Sweet spot turns out to be the widths and partitions below.
388
    if (width <= 8) {
1,356,924✔
389
        hasZeroByte = test_zero<width>(v | 0xffffffff00000000ULL);
1,224,327✔
390
        if (eq ? !hasZeroByte : (v & 0x00000000ffffffffULL) == 0) {
1,224,327!
391
            // 00?? -> increasing
392
            start += 64 / no0(width) / 2;
109,383✔
393
            if (width <= 4) {
109,383✔
394
                hasZeroByte = test_zero<width>(v | 0xffff000000000000ULL);
69,840✔
395
                if (eq ? !hasZeroByte : (v & 0x0000ffffffffffffULL) == 0) {
69,840!
396
                    // 000?
397
                    start += 64 / no0(width) / 4;
19,884✔
398
                }
19,884✔
399
            }
69,840✔
400
        }
109,383✔
401
        else {
1,114,944✔
402
            if (width <= 4) {
1,114,944✔
403
                // ??00
404
                hasZeroByte = test_zero<width>(v | 0xffffffffffff0000ULL);
1,079,889✔
405
                if (eq ? !hasZeroByte : (v & 0x000000000000ffffULL) == 0) {
1,079,889!
406
                    // 0?00
407
                    start += 64 / no0(width) / 4;
127,551✔
408
                }
127,551✔
409
            }
1,079,889✔
410
        }
1,114,944✔
411
    }
1,224,327✔
412

413
    while (eq == (((v >> (width * start)) & mask) != 0)) {
3,747,735!
414
        // You must only call find_zero() if you are sure that at least 1 item matches
415
        REALM_ASSERT_3(start, <=, 8 * sizeof(v));
2,390,811✔
416
        start++;
2,390,811✔
417
    }
2,390,811✔
418

419
    return start;
1,356,924✔
420
}
28,102,422✔
421

422
// Generate a magic constant used for later bithacks
423
template <bool gt, size_t width>
424
int64_t ArrayWithFind::find_gtlt_magic(int64_t v) const
425
{
427,986✔
426
    uint64_t mask1 =
427,986✔
427
        (width == 64
427,986✔
428
             ? ~0ULL
427,986✔
429
             : ((1ULL << (width == 64 ? 0 : width)) - 1ULL)); // Warning free way of computing (1ULL << width) - 1
427,986✔
430
    uint64_t mask2 = mask1 >> 1;
427,986✔
431
    uint64_t magic = gt ? (~0ULL / no0(mask1) * (mask2 - v)) : (~0ULL / no0(mask1) * v);
427,986✔
432
    return magic;
427,986✔
433
}
427,986✔
434

435
template <bool gt, size_t width>
436
bool ArrayWithFind::find_gtlt_fast(uint64_t chunk, uint64_t magic, QueryStateBase* state, size_t baseindex) const
437
{
3,508,944✔
438
    // Tests if a a chunk of values contains values that are greater (if gt == true) or less (if gt == false) than v.
439
    // Fast, but limited to work when all values in the chunk are positive.
440

441
    uint64_t mask1 =
3,508,944✔
442
        (width == 64
3,508,944✔
443
             ? ~0ULL
3,508,944✔
444
             : ((1ULL << (width == 64 ? 0 : width)) - 1ULL)); // Warning free way of computing (1ULL << width) - 1
3,508,944✔
445
    uint64_t mask2 = mask1 >> 1;
3,508,944✔
446
    uint64_t m = gt ? (((chunk + magic) | chunk) & ~0ULL / no0(mask1) * (mask2 + 1))
3,512,220✔
447
                    : ((chunk - magic) & ~chunk & ~0ULL / no0(mask1) * (mask2 + 1));
2,147,488,717✔
448
    size_t p = 0;
3,508,944✔
449
    while (m) {
13,760,517!
450
        size_t t = first_set_bit64(m) / no0(width);
10,277,616✔
451
        p += t;
10,277,616✔
452
        if (!state->match(p + baseindex, Mixed{int64_t((chunk >> (p * width)) & mask1)}))
10,277,616!
453
            return false;
26,043✔
454

455
        if ((t + 1) * width == 64)
10,251,573!
456
            m = 0;
2,892✔
457
        else
10,248,681✔
458
            m >>= (t + 1) * width;
10,248,681✔
459
        p++;
10,251,573✔
460
    }
10,251,573✔
461

462
    return true;
3,482,901✔
463
}
3,508,944✔
464

465
// clang-format off
466
template <bool gt, size_t width>
467
bool ArrayWithFind::find_gtlt(int64_t v, uint64_t chunk, QueryStateBase* state, size_t baseindex) const
468
{
3,028,137✔
469
    // Find items in 'chunk' that are greater (if gt == true) or smaller (if gt == false) than 'v'. Fixme, __forceinline can make it crash in vS2010 - find out why
470
    if constexpr (width == 1) {
3,028,137✔
471
        for (size_t i = 0; i < 64; ++i) {
3,877,020✔
472
            int64_t v2 = static_cast<int64_t>(chunk & 0x1);
2,025,216✔
473
            if (gt ? v2 > v : v2 < v) {
2,025,216✔
474
                if (!state->match(i + baseindex, v2)) {
21,504✔
475
                    return false;
21,504✔
476
                }
21,504✔
477
            }
21,504✔
478
            chunk >>= 1;
2,003,712✔
479
        }
2,003,712✔
480
    }
21,114✔
481
    else if constexpr (width == 2) {
2,896,500✔
482
        for (size_t i = 0; i < 32; ++i) {
3,978,018✔
483
            int64_t v2 = static_cast<int64_t>(chunk & 0x3);
2,276,352✔
484
            if (gt ? v2 > v : v2 < v) {
2,276,352✔
485
                if (!state->match(i + baseindex, v2)) {
28,416✔
486
                    return false;
28,416✔
487
                }
28,416✔
488
            }
28,416✔
489
            chunk >>= 2;
2,247,936✔
490
        }
2,247,936✔
491
    }
42,450✔
492
    else if constexpr (width == 4) {
2,892,030✔
493
        for (size_t i = 0; i < 16; ++i) {
3,964,704✔
494
            int64_t v2 = static_cast<int64_t>(chunk & 0xf);
2,335,278✔
495
            if (gt ? v2 > v : v2 < v) {
2,335,278✔
496
                if (!state->match(i + baseindex, v2)) {
32,637✔
497
                    return false;
31,500✔
498
                }
31,500✔
499
            }
32,637✔
500
            chunk >>= 4;
2,303,778✔
501
        }
2,303,778✔
502
    }
80,295✔
503
    else if constexpr (width == 8) {
2,739,876✔
504
        for (size_t i = 0; i < 8; ++i) {
2,713,467✔
505
            int64_t v2 = static_cast<int64_t>(static_cast<int8_t>(chunk & 0xff));
786,888✔
506
            if (gt ? v2 > v : v2 < v) {
786,888✔
507
                if (!state->match(i + baseindex, v2)) {
107,961✔
508
                    return false;
8,628✔
509
                }
8,628✔
510
            }
107,961✔
511
            chunk >>= 8;
778,260✔
512
        }
778,260✔
513
    }
93,561✔
514
    else if constexpr (width == 16) {
2,638,152✔
515
        for (size_t i = 0; i < 4; ++i) {
13,091,517✔
516
            int64_t v2 = static_cast<int64_t>(static_cast<int16_t>(chunk & 0xffff));
10,484,976✔
517
            if (gt ? v2 > v : v2 < v) {
10,484,976✔
518
                if (!state->match(i + baseindex, v2)) {
4,466,040✔
519
                    return false;
31,611✔
520
                }
31,611✔
521
            }
4,466,040✔
522
            chunk >>= 16;
10,453,365✔
523
        }
10,453,365✔
524
    }
2,637,744✔
525
    else if constexpr (width == 32) {
2,606,397✔
526
        for (size_t i = 0; i < 2; ++i) {
2,606,397✔
527
            int64_t v2 = static_cast<int64_t>(static_cast<int32_t>(chunk & 0xffffffff));
2,606,397✔
528
            if (gt ? v2 > v : v2 < v) {
2,606,397✔
529
                if (!state->match(i + baseindex, v2)) {
2,606,397✔
530
                    return false;
2,606,397✔
531
                }
2,606,397✔
532
            }
2,606,397✔
533
            chunk >>= 32;
2,606,397✔
534
        }
2,606,397✔
535
    }
2,606,397✔
536
    else if constexpr (width == 64) {
2,606,397✔
537
        int64_t v2 = static_cast<int64_t>(chunk);
2,606,397✔
538
        if (gt ? v2 > v : v2 < v) {
2,606,397✔
539
            return state->match(baseindex, v2);
2,606,397✔
540
        }
2,606,397✔
541
    }
2,606,397✔
542

543
    static_cast<void>(state);
2,906,478✔
544
    return true;
2,947,770✔
545
}
3,028,137✔
546
// clang-format on
547

548
/// Find items in this Array that are equal (eq == true) or different (eq = false) from 'value'
549
template <bool eq, size_t width>
550
inline bool ArrayWithFind::compare_equality(int64_t value, size_t start, size_t end, size_t baseindex,
551
                                            QueryStateBase* state) const
552
{
12,009,813✔
553
    REALM_ASSERT_DEBUG(start <= m_array.m_size && (end <= m_array.m_size || end == size_t(-1)) && start <= end);
12,009,813!
554
    REALM_ASSERT_DEBUG(width == m_array.m_width);
12,009,813✔
555

556
    auto v = 64 / no0(width);
12,009,813✔
557
    size_t ee = round_up(start, v);
12,009,813✔
558
    ee = ee > end ? end : ee;
12,009,813✔
559
    for (; start < ee; ++start) {
16,927,626✔
560
        auto v = Array::get<width>(m_array, start);
7,434,720✔
561
        if (eq ? (v == value) : (v != value)) {
7,434,720!
562
            if (!state->match(start + baseindex))
2,526,054!
563
                return false;
2,516,907✔
564
        }
2,526,054✔
565
    }
7,434,720✔
566

567
    if (start >= end)
9,492,906✔
568
        return true;
2,272,752✔
569

570
    if (width != 32 && width != 64) {
7,220,154✔
571
        const int64_t* p = reinterpret_cast<const int64_t*>(m_array.m_data + (start * width / 8));
2,212,938✔
572
        const int64_t* const e = reinterpret_cast<int64_t*>(m_array.m_data + (end * width / 8)) - 1;
2,212,938✔
573
        const uint64_t mask =
2,212,938✔
574
            (width == 64
2,212,938✔
575
                 ? ~0ULL
2,212,938✔
576
                 : ((1ULL << (width == 64 ? 0 : width)) - 1ULL)); // Warning free way of computing (1ULL << width) - 1
2,212,938✔
577
        const uint64_t valuemask =
2,212,938✔
578
            ~0ULL / no0(mask) * (value & mask); // the "== ? :" is to avoid division by 0 compiler error
2,212,938✔
579

580
        while (p < e) {
112,543,833!
581
            uint64_t chunk = *p;
110,733,819✔
582
            uint64_t v2 = chunk ^ valuemask;
110,733,819✔
583
            start = (p - reinterpret_cast<int64_t*>(m_array.m_data)) * 8 * 8 / no0(width);
110,733,819✔
584
            size_t a = 0;
110,733,819✔
585

586
            while (eq ? test_zero<width>(v2) : v2) {
137,952,675!
587
                size_t t = find_zero<eq, width>(v2);
28,102,425✔
588
                a += t;
28,102,425✔
589

590
                if (a >= 64 / no0(width))
28,102,425!
591
                    break;
480,645✔
592

593
                if (!state->match(a + start + baseindex)) {
27,621,780!
594
                    return false;
402,924✔
595
                }
402,924✔
596
                auto shift = (t + 1) * width;
27,218,856✔
597
                if (shift < 64)
27,218,856!
598
                    v2 >>= shift;
27,210,165✔
599
                else
8,691✔
600
                    v2 = 0;
8,691✔
601
                a += 1;
27,218,856✔
602
            }
27,218,856✔
603

604
            ++p;
110,330,895✔
605
        }
110,330,895✔
606

607
        // Loop ended because we are near end or end of array. No need to optimize search in remainder in this case
608
        // because end of array means that
609
        // lots of search work has taken place prior to ending here. So time spent searching remainder is relatively
610
        // tiny
611
        start = (p - reinterpret_cast<int64_t*>(m_array.m_data)) * 8 * 8 / no0(width);
1,810,014✔
612
    }
1,810,014✔
613

614
    while (start < end) {
1,603,777,851✔
615
        if (eq ? Array::get<width>(m_array, start) == value : Array::get<width>(m_array, start) != value) {
1,601,119,803✔
616
            if (!state->match(start + baseindex)) {
5,401,086✔
617
                return false;
4,159,182✔
618
            }
4,159,182✔
619
        }
5,401,086✔
620
        ++start;
1,596,960,621✔
621
    }
1,596,960,621✔
622

623
    return true;
2,658,048✔
624
}
6,817,230✔
625

626
// There exists a couple of find() functions that take more or less template arguments. Always call the one that
627
// takes as most as possible to get best performance.
628

629
template <class cond>
630
bool ArrayWithFind::find(int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const
631
{
632
    REALM_TEMPEX2(return find_optimized, cond, m_array.m_width, (value, start, end, baseindex, state));
633
}
634

635
#ifdef REALM_COMPILER_SSE
636
// 'items' is the number of 16-byte SSE chunks. Returns index of packed element relative to first integer of first
637
// chunk
638
template <class cond, size_t width>
639
bool ArrayWithFind::find_sse(int64_t value, __m128i* data, size_t items, QueryStateBase* state,
640
                             size_t baseindex) const
641
{
3,960,171✔
642
    __m128i search = {0};
3,960,171✔
643

644
    if (width == 8)
3,960,171✔
645
        search = _mm_set1_epi8(static_cast<char>(value));
120,183✔
646
    else if (width == 16)
3,839,988✔
647
        search = _mm_set1_epi16(static_cast<short int>(value));
534,846✔
648
    else if (width == 32)
3,305,142✔
649
        search = _mm_set1_epi32(static_cast<int>(value));
3,284,937✔
650
    else if (width == 64) {
20,205✔
651
        if (std::is_same<cond, Less>::value)
20,205✔
652
            REALM_ASSERT(false);
20,205✔
653
        else
20,205✔
654
            search = _mm_set_epi64x(value, value);
20,205✔
655
    }
20,205✔
656

657
    return find_sse_intern<cond, width>(data, &search, items, state, baseindex);
3,960,171✔
658
}
3,960,171✔
659

660
// Compares packed action_data with packed data (equal, less, etc) and performs aggregate action (max, min, sum,
661
// find_all, etc) on value inside action_data for first match, if any
662
template <class cond, size_t width>
663
REALM_FORCEINLINE bool ArrayWithFind::find_sse_intern(__m128i* action_data, __m128i* data, size_t items,
664
                                                      QueryStateBase* state, size_t baseindex) const
665
{
3,960,168✔
666
    size_t i = 0;
3,960,168✔
667
    __m128i compare_result = {0};
3,960,168✔
668
    unsigned int resmask;
3,960,168✔
669

670
    // Search loop. Unrolling it has been tested to NOT increase performance (apparently mem bound)
671
    for (i = 0; i < items; ++i) {
456,544,722!
672
        // equal / not-equal
673
        if (std::is_same<cond, Equal>::value || std::is_same<cond, NotEqual>::value) {
455,929,080✔
674
            if (width == 8)
452,709,645✔
675
                compare_result = _mm_cmpeq_epi8(action_data[i], *data);
623,763✔
676
            if (width == 16)
452,709,645✔
677
                compare_result = _mm_cmpeq_epi16(action_data[i], *data);
53,579,958✔
678
            if (width == 32)
452,709,645✔
679
                compare_result = _mm_cmpeq_epi32(action_data[i], *data);
397,463,703✔
680
            if (width == 64) {
452,709,645✔
681
                compare_result = _mm_cmpeq_epi64(action_data[i], *data); // SSE 4.2 only
1,042,227✔
682
            }
1,042,227✔
683
        }
452,709,645✔
684

685
        // greater
686
        else if (std::is_same<cond, Greater>::value) {
3,219,435✔
687
            if (width == 8)
1,887,444✔
688
                compare_result = _mm_cmpgt_epi8(action_data[i], *data);
546,921✔
689
            if (width == 16)
1,887,444✔
690
                compare_result = _mm_cmpgt_epi16(action_data[i], *data);
756,522✔
691
            if (width == 32)
1,887,444✔
692
                compare_result = _mm_cmpgt_epi32(action_data[i], *data);
291,774✔
693
            if (width == 64)
1,887,444✔
694
                compare_result = _mm_cmpgt_epi64(action_data[i], *data);
292,230✔
695
        }
1,887,444✔
696
        // less
697
        else if (std::is_same<cond, Less>::value) {
1,332,066✔
698
            if (width == 8)
1,332,066✔
699
                compare_result = _mm_cmplt_epi8(action_data[i], *data);
35,928✔
700
            else if (width == 16)
1,296,138✔
701
                compare_result = _mm_cmplt_epi16(action_data[i], *data);
1,004,364✔
702
            else if (width == 32)
291,774✔
703
                compare_result = _mm_cmplt_epi32(action_data[i], *data);
291,774✔
704
            else
705
                REALM_ASSERT(false);
706
        }
1,332,066✔
707

708
        resmask = _mm_movemask_epi8(compare_result);
455,929,080✔
709

710
        if (std::is_same<cond, NotEqual>::value)
455,929,080✔
711
            resmask = ~resmask & 0x0000ffff;
3,234,636✔
712

713
        size_t s = i * sizeof(__m128i) * 8 / no0(width);
455,929,080✔
714

715
        while (resmask != 0) {
485,381,814!
716
            size_t idx = first_set_bit(resmask) * 8 / no0(width);
32,797,260✔
717
            s += idx;
32,797,260✔
718
            if (!state->match(s + baseindex))
32,797,260!
719
                return false;
3,344,526✔
720
            resmask >>= (idx + 1) * no0(width) / 8;
29,452,734✔
721
            ++s;
29,452,734✔
722
        }
29,452,734✔
723
    }
455,929,080✔
724

725
    return true;
615,642✔
726
}
3,960,168✔
727
#endif // REALM_COMPILER_SSE
728

729
template <class cond>
730
bool ArrayWithFind::compare_leafs(const Array* foreign, size_t start, size_t end, size_t baseindex,
731
                                  QueryStateBase* state) const
732
{
733
    cond c;
734
    REALM_ASSERT_3(start, <=, end);
735
    if (start == end)
736
        return true;
737

738

739
    int64_t v;
740

741
    // We can compare first element without checking for out-of-range
742
    v = m_array.get(start);
743
    if (c(v, foreign->get(start))) {
744
        if (!state->match(start + baseindex, v))
745
            return false;
746
    }
747

748
    start++;
749

750
    if (start + 3 < end) {
751
        v = m_array.get(start);
752
        if (c(v, foreign->get(start)))
753
            if (!state->match(start + baseindex, v))
754
                return false;
755

756
        v = m_array.get(start + 1);
757
        if (c(v, foreign->get(start + 1)))
758
            if (!state->match(start + 1 + baseindex, v))
759
                return false;
760

761
        v = m_array.get(start + 2);
762
        if (c(v, foreign->get(start + 2)))
763
            if (!state->match(start + 2 + baseindex, v))
764
                return false;
765

766
        start += 3;
767
    }
768
    else if (start == end) {
769
        return true;
770
    }
771

772
    bool r;
773
    REALM_TEMPEX2(r = compare_leafs, cond, m_array.m_width, (foreign, start, end, baseindex, state))
774
    return r;
775
}
776

777

778
template <class cond, size_t width>
779
bool ArrayWithFind::compare_leafs(const Array* foreign, size_t start, size_t end, size_t baseindex,
780
                                  QueryStateBase* state) const
781
{
782
    size_t fw = foreign->m_width;
783
    bool r;
784
    REALM_TEMPEX3(r = compare_leafs_4, cond, width, fw, (foreign, start, end, baseindex, state))
785
    return r;
786
}
787

788

789
template <class cond, size_t width, size_t foreign_width>
790
bool ArrayWithFind::compare_leafs_4(const Array* foreign, size_t start, size_t end, size_t baseindex,
791
                                    QueryStateBase* state) const
792
{
793
    cond c;
794
    char* foreign_m_data = foreign->m_data;
795

796
    if (width == 0 && foreign_width == 0) {
797
        if (c(0, 0)) {
798
            while (start < end) {
799
                if (!state->match(start + baseindex, 0))
800
                    return false;
801
                start++;
802
            }
803
        }
804
        else {
805
            return true;
806
        }
807
    }
808

809

810
#if defined(REALM_COMPILER_SSE)
811
    if (sseavx<42>() && width == foreign_width && (width == 8 || width == 16 || width == 32)) {
812
        // We can only use SSE if both bitwidths are equal and above 8 bits and all values are signed
813
        // and the two arrays are aligned the same way
814
        if ((reinterpret_cast<size_t>(m_array.m_data) & 0xf) == (reinterpret_cast<size_t>(foreign_m_data) & 0xf)) {
815
            while (start < end &&
816
                   (((reinterpret_cast<size_t>(m_array.m_data) & 0xf) * 8 + start * width) % (128) != 0)) {
817
                int64_t v = m_array.get_universal<width>(m_array.m_data, start);
818
                int64_t fv = m_array.get_universal<foreign_width>(foreign_m_data, start);
819
                if (c(v, fv)) {
820
                    if (!state->match(start + baseindex, v))
821
                        return false;
822
                }
823
                start++;
824
            }
825
            if (start == end)
826
                return true;
827

828

829
            size_t sse_items = (end - start) * width / 128;
830
            size_t sse_end = start + sse_items * 128 / no0(width);
831

832
            while (start < sse_end) {
833
                __m128i* a = reinterpret_cast<__m128i*>(m_array.m_data + start * width / 8);
834
                __m128i* b = reinterpret_cast<__m128i*>(foreign_m_data + start * width / 8);
835

836
                bool continue_search = find_sse_intern<cond, width>(a, b, 1, state, baseindex + start);
837

838
                if (!continue_search)
839
                    return false;
840

841
                start += 128 / no0(width);
842
            }
843
        }
844
    }
845
#endif
846

847
    while (start < end) {
848
        int64_t v = m_array.get_universal<width>(m_array.m_data, start);
849
        int64_t fv = m_array.get_universal<foreign_width>(foreign_m_data, start);
850

851
        if (c(v, fv)) {
852
            if (!state->match(start + baseindex, v))
853
                return false;
854
        }
855

856
        start++;
857
    }
858

859
    return true;
860
}
861

862

863
template <class cond, size_t bitwidth>
864
bool ArrayWithFind::compare(int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const
865
{
12,588,162✔
866
    bool ret = false;
12,588,162✔
867

868
    if (std::is_same<cond, Equal>::value)
12,588,162✔
869
        ret = compare_equality<true, bitwidth>(value, start, end, baseindex, state);
11,630,580✔
870
    else if (std::is_same<cond, NotEqual>::value)
957,582✔
871
        ret = compare_equality<false, bitwidth>(value, start, end, baseindex, state);
379,218✔
872
    else if (std::is_same<cond, Greater>::value)
578,364✔
873
        ret = compare_relation<true, bitwidth>(value, start, end, baseindex, state);
355,623✔
874
    else if (std::is_same<cond, Less>::value)
222,741✔
875
        ret = compare_relation<false, bitwidth>(value, start, end, baseindex, state);
222,741✔
UNCOV
876
    else
×
UNCOV
877
        REALM_ASSERT_DEBUG(false);
×
878

879
    return ret;
12,588,162✔
880
}
12,588,162✔
881

882
template <bool gt, size_t bitwidth>
883
bool ArrayWithFind::compare_relation(int64_t value, size_t start, size_t end, size_t baseindex,
884
                                     QueryStateBase* state) const
885
{
578,376✔
886
    REALM_ASSERT(start <= m_array.m_size && (end <= m_array.m_size || end == size_t(-1)) && start <= end);
578,376!
887
    uint64_t mask = (bitwidth == 64 ? ~0ULL
578,376✔
888
                                    : ((1ULL << (bitwidth == 64 ? 0 : bitwidth)) -
578,376✔
889
                                       1ULL)); // Warning free way of computing (1ULL << width) - 1
543,411✔
890

891
    size_t ee = round_up(start, 64 / no0(bitwidth));
578,376✔
892
    ee = ee > end ? end : ee;
578,376✔
893
    for (; start < ee; start++) {
579,060✔
894
        if (gt ? (Array::get<bitwidth>(m_array, start) > value) : (Array::get<bitwidth>(m_array, start) < value)) {
2,484!
895
            if (!state->match(start + baseindex, Array::get<bitwidth>(m_array, start)))
1,800!
896
                return false;
1,800✔
897
        }
1,800✔
898
    }
2,484✔
899

900
    if (start >= end)
576,576✔
901
        return true; // none found, continue (return true) regardless what QueryStateBase->match() would have returned
43,530✔
902
                     // on match
903

904
    const int64_t* p = reinterpret_cast<const int64_t*>(m_array.m_data + (start * bitwidth / 8));
533,046✔
905
    const int64_t* const e = reinterpret_cast<int64_t*>(m_array.m_data + (end * bitwidth / 8)) - 1;
533,046✔
906

907
    // Matches are rare enough to setup fast linear search for remaining items. We use
908
    // bit hacks from http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
909

910
    if (bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8 || bitwidth == 16) {
533,046✔
911
        uint64_t magic = find_gtlt_magic<gt, bitwidth>(value);
427,989✔
912

913
        // Bit hacks only work if searched item has its most significant bit clear for 'greater than' or
914
        // 'item <= 1 << bitwidth' for 'less than'
915
        if (value != int64_t((magic & mask)) && value >= 0 && bitwidth >= 2 &&
427,989!
916
            value <= static_cast<int64_t>((mask >> 1) - (gt ? 1 : 0))) {
2,147,680,573!
917
            // 15 ms
918
            while (p < e) {
3,689,298!
919
                uint64_t upper = lower_bits<bitwidth>() << (no0(bitwidth) - 1);
3,518,055✔
920

921
                const int64_t v = *p;
3,518,055✔
922
                size_t idx;
3,518,055✔
923

924
                // Bit hacks only works if all items in chunk have their most significant bit clear. Test this:
925
                upper = upper & v;
3,518,055✔
926

927
                if (!upper) {
3,518,055!
928
                    idx = find_gtlt_fast<gt, bitwidth>(
3,508,485✔
929
                        v, magic, state,
3,508,485✔
930
                        (p - reinterpret_cast<int64_t*>(m_array.m_data)) * 8 * 8 / no0(bitwidth) + baseindex);
3,508,485✔
931
                }
3,508,485✔
932
                else
9,570✔
933
                    idx = find_gtlt<gt, bitwidth>(
9,570✔
934
                        value, v, state,
9,570✔
935
                        (p - reinterpret_cast<int64_t*>(m_array.m_data)) * 8 * 8 / no0(bitwidth) + baseindex);
9,570✔
936

937
                if (!idx)
3,518,055!
938
                    return false;
26,055✔
939
                ++p;
3,492,000✔
940
            }
3,492,000✔
941
        }
197,298✔
942
        else {
230,691✔
943
            // 24 ms
944
            while (p < e) {
3,130,236!
945
                int64_t v = *p;
3,021,192✔
946
                if (!find_gtlt<gt, bitwidth>(
3,021,192!
947
                        value, v, state,
3,021,192✔
948
                        (p - reinterpret_cast<int64_t*>(m_array.m_data)) * 8 * 8 / no0(bitwidth) + baseindex))
3,021,192✔
949
                    return false;
121,647✔
950
                ++p;
2,899,545✔
951
            }
2,899,545✔
952
        }
230,691✔
953
        start = (p - reinterpret_cast<int64_t*>(m_array.m_data)) * 8 * 8 / no0(bitwidth);
280,287✔
954
    }
280,287✔
955

956
    // matchcount logic in SIMD no longer pays off for 32/64 bit ints because we have just 4/2 elements
957

958
    // Test unaligned end and/or values of width > 16 manually
959
    while (start < end) {
6,397,233!
960
        if (gt ? Array::get<bitwidth>(m_array, start) > value : Array::get<bitwidth>(m_array, start) < value) {
6,119,889✔
961
            if (!state->match(start + baseindex))
734,235✔
962
                return false;
108,000✔
963
        }
734,235✔
964
        ++start;
6,011,889✔
965
    }
6,011,889✔
966
    return true;
277,344✔
967
}
385,344✔
968

969
//*************************************************************************************
970
// Finding code ends                                                                  *
971
//*************************************************************************************
972

973
} // namespace realm
974

975
#endif /* REALM_ARRAY_WITH_FIND_HPP */
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc