• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / jorgen.edelbo_337

03 Jul 2024 01:04PM UTC coverage: 90.864% (-0.1%) from 90.984%
jorgen.edelbo_337

Pull #7826

Evergreen

nicola-cab
Merge branch 'master' of github.com:realm/realm-core into next-major
Pull Request #7826: Merge Next major

102968 of 181176 branches covered (56.83%)

3131 of 3738 new or added lines in 54 files covered. (83.76%)

106 existing lines in 23 files now uncovered.

217725 of 239616 relevant lines covered (90.86%)

6844960.2 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

84.38
/src/realm/array_aggregate_optimizations.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2023 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <realm/array.hpp>
20
#include <realm/array_with_find.hpp>
21

22
using namespace realm;
23

24
int64_t Array::sum(size_t start, size_t end) const
25
{
24,696✔
26
    REALM_TEMPEX(return sum, m_width, (start, end));
24,696✔
NEW
27
}
×
28

29
template <size_t w>
30
int64_t Array::sum(size_t start, size_t end) const
31
{
24,696✔
32
    if (end == size_t(-1))
24,696✔
33
        end = m_size;
24,696✔
34

35
    REALM_ASSERT_EX(end <= m_size && start <= end, start, end, m_size);
24,696✔
36

37
    if (start == end)
24,696✔
38
        return 0;
24✔
39

40
    int64_t s = 0;
24,672✔
41

42
    // Sum manually until 128 bit aligned
43
    for (; (start < end) && (((size_t(m_data) & 0xf) * 8 + start * w) % 128 != 0); start++) {
50,391✔
44
        s += get<w>(*this, start);
25,719✔
45
    }
25,719✔
46

47
    if (w == 1 || w == 2 || w == 4) {
24,672✔
48
        // Sum of bitwidths less than a byte (which are always positive)
49
        // uses a divide and conquer algorithm that is a variation of popolation count:
50
        // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
51

52
        // static values needed for fast sums
53
        const uint64_t m2 = 0x3333333333333333ULL;
18✔
54
        const uint64_t m4 = 0x0f0f0f0f0f0f0f0fULL;
18✔
55
        const uint64_t h01 = 0x0101010101010101ULL;
18✔
56

57
        int64_t* data = reinterpret_cast<int64_t*>(m_data + start * w / 8);
18✔
58
        size_t chunks = (end - start) * w / 8 / sizeof(int64_t);
18✔
59

60
        for (size_t t = 0; t < chunks; t++) {
1,179,642!
61
            if (w == 1) {
1,179,624✔
62
#if 0
63
#if defined(USE_SSE42) && defined(_MSC_VER) && defined(REALM_PTR_64)
64
                s += __popcnt64(data[t]);
65
#elif !defined(_MSC_VER) && defined(USE_SSE42) && defined(REALM_PTR_64)
66
                s += __builtin_popcountll(data[t]);
67
#else
68
                uint64_t a = data[t];
69
                const uint64_t m1  = 0x5555555555555555ULL;
70
                a -= (a >> 1) & m1;
71
                a = (a & m2) + ((a >> 2) & m2);
72
                a = (a + (a >> 4)) & m4;
73
                a = (a * h01) >> 56;
74
                s += a;
75
#endif
76
#endif
77
                s += fast_popcount64(data[t]);
393,204✔
78
            }
393,204✔
79
            else if (w == 2) {
786,420✔
80
                uint64_t a = data[t];
786,420✔
81
                a = (a & m2) + ((a >> 2) & m2);
786,420✔
82
                a = (a + (a >> 4)) & m4;
786,420✔
83
                a = (a * h01) >> 56;
786,420✔
84

85
                s += a;
786,420✔
86
            }
786,420✔
NEW
87
            else if (w == 4) {
×
NEW
88
                uint64_t a = data[t];
×
NEW
89
                a = (a & m4) + ((a >> 4) & m4);
×
NEW
90
                a = (a * h01) >> 56;
×
NEW
91
                s += a;
×
NEW
92
            }
×
93
        }
1,179,624✔
94
        start += sizeof(int64_t) * 8 / no0(w) * chunks;
18✔
95
    }
18✔
96

97
#ifdef REALM_COMPILER_SSE
12,333✔
98
    if (sseavx<42>()) {
12,333!
99
        // 2000 items summed 500000 times, 8/16/32 bits, miliseconds:
100
        // Naive, templated get<>: 391 371 374
101
        // SSE:                     97 148 282
102

103
        if ((w == 8 || w == 16 || w == 32) && end - start > sizeof(__m128i) * 8 / no0(w)) {
12,333!
104
            __m128i* data = reinterpret_cast<__m128i*>(m_data + start * w / 8);
90✔
105
            __m128i sum_result = {0};
90✔
106
            __m128i sum2;
90✔
107

108
            size_t chunks = (end - start) * w / 8 / sizeof(__m128i);
90✔
109

110
            for (size_t t = 0; t < chunks; t++) {
8,798,460!
111
                if (w == 8) {
8,798,370✔
112
                    /*
113
                    // 469 ms AND disadvantage of handling max 64k elements before overflow
114
                    __m128i vl = _mm_cvtepi8_epi16(data[t]);
115
                    __m128i vh = data[t];
116
                    vh.m128i_i64[0] = vh.m128i_i64[1];
117
                    vh = _mm_cvtepi8_epi16(vh);
118
                    sum_result = _mm_add_epi16(sum_result, vl);
119
                    sum_result = _mm_add_epi16(sum_result, vh);
120
                    */
121

122
                    /*
123
                    // 424 ms
124
                    __m128i vl = _mm_unpacklo_epi8(data[t], _mm_set1_epi8(0));
125
                    __m128i vh = _mm_unpackhi_epi8(data[t], _mm_set1_epi8(0));
126
                    sum_result = _mm_add_epi32(sum_result, _mm_madd_epi16(vl, _mm_set1_epi16(1)));
127
                    sum_result = _mm_add_epi32(sum_result, _mm_madd_epi16(vh, _mm_set1_epi16(1)));
128
                    */
129

130
                    __m128i vl = _mm_cvtepi8_epi16(data[t]); // sign extend lower words 8->16
786,429✔
131
                    __m128i vh = data[t];
786,429✔
132
                    vh = _mm_srli_si128(vh, 8); // v >>= 64
786,429✔
133
                    vh = _mm_cvtepi8_epi16(vh); // sign extend lower words 8->16
786,429✔
134
                    __m128i sum1 = _mm_add_epi16(vl, vh);
786,429✔
135
                    __m128i sumH = _mm_cvtepi16_epi32(sum1);
786,429✔
136
                    __m128i sumL = _mm_srli_si128(sum1, 8); // v >>= 64
786,429✔
137
                    sumL = _mm_cvtepi16_epi32(sumL);
786,429✔
138
                    sum_result = _mm_add_epi32(sum_result, sumL);
786,429✔
139
                    sum_result = _mm_add_epi32(sum_result, sumH);
786,429✔
140
                }
786,429✔
141
                else if (w == 16) {
8,011,941✔
142
                    // todo, can overflow for array size > 2^32
143
                    __m128i vl = _mm_cvtepi16_epi32(data[t]); // sign extend lower words 16->32
1,572,960✔
144
                    __m128i vh = data[t];
1,572,960✔
145
                    vh = _mm_srli_si128(vh, 8);  // v >>= 64
1,572,960✔
146
                    vh = _mm_cvtepi16_epi32(vh); // sign extend lower words 16->32
1,572,960✔
147
                    sum_result = _mm_add_epi32(sum_result, vl);
1,572,960✔
148
                    sum_result = _mm_add_epi32(sum_result, vh);
1,572,960✔
149
                }
1,572,960✔
150
                else if (w == 32) {
6,438,981✔
151
                    __m128i v = data[t];
6,438,981✔
152
                    __m128i v0 = _mm_cvtepi32_epi64(v); // sign extend lower dwords 32->64
6,438,981✔
153
                    v = _mm_srli_si128(v, 8);           // v >>= 64
6,438,981✔
154
                    __m128i v1 = _mm_cvtepi32_epi64(v); // sign extend lower dwords 32->64
6,438,981✔
155
                    sum_result = _mm_add_epi64(sum_result, v0);
6,438,981✔
156
                    sum_result = _mm_add_epi64(sum_result, v1);
6,438,981✔
157

158
                    /*
159
                    __m128i m = _mm_set1_epi32(0xc000);             // test if overflow could happen (still need
160
                    underflow test).
161
                    __m128i mm = _mm_and_si128(data[t], m);
162
                    zz = _mm_or_si128(mm, zz);
163
                    sum_result = _mm_add_epi32(sum_result, data[t]);
164
                    */
165
                }
6,438,981✔
166
            }
8,798,370✔
167
            start += sizeof(__m128i) * 8 / no0(w) * chunks;
90✔
168

169
            // prevent taking address of 'state' to make the compiler keep it in SSE register in above loop
170
            // (vc2010/gcc4.6)
171
            sum2 = sum_result;
90✔
172

173
            // Avoid aliasing bug where sum2 might not yet be initialized when accessed by get_universal
174
            char sum3[sizeof sum2];
90✔
175
            memcpy(&sum3, &sum2, sizeof sum2);
90✔
176

177
            // Sum elements of sum
178
            for (size_t t = 0; t < sizeof(__m128i) * 8 / ((w == 8 || w == 16) ? 32 : 64); ++t) {
420!
179
                int64_t v = get_universal < (w == 8 || w == 16) ? 32 : 64 > (reinterpret_cast<char*>(&sum3), t);
330✔
180
                s += v;
330✔
181
            }
330✔
182
        }
90✔
183
    }
12,333✔
184
#endif
12,333✔
185

186
    // Sum remaining elements
187
    for (; start < end; ++start)
76,115,601!
188
        s += get<w>(*this, start);
76,090,929✔
189

190
    return s;
24,672✔
191
}
24,696✔
192

193
size_t Array::count(int64_t value) const noexcept
194
{
42✔
195
    // This is not used anywhere in the code, I believe we can delete this
196
    // since the query logic does not use this
197
    const uint64_t* next = reinterpret_cast<uint64_t*>(m_data);
42✔
198
    size_t value_count = 0;
42✔
199
    const size_t end = m_size;
42✔
200
    size_t i = 0;
42✔
201

202
    // static values needed for fast population count
203
    const uint64_t m1 = 0x5555555555555555ULL;
42✔
204
    const uint64_t m2 = 0x3333333333333333ULL;
42✔
205
    const uint64_t m4 = 0x0f0f0f0f0f0f0f0fULL;
42✔
206
    const uint64_t h01 = 0x0101010101010101ULL;
42✔
207

208
    if (m_width == 0) {
42✔
NEW
209
        if (value == 0)
×
NEW
210
            return m_size;
×
NEW
211
        return 0;
×
NEW
212
    }
×
213
    if (m_width == 1) {
42✔
214
        if (uint64_t(value) > 1)
6✔
NEW
215
            return 0;
×
216

217
        const size_t chunkvals = 64;
6✔
218
        for (; i + chunkvals <= end; i += chunkvals) {
1,572,864✔
219
            uint64_t a = next[i / chunkvals];
1,572,858✔
220
            if (value == 0)
1,572,858✔
NEW
221
                a = ~a; // reverse
×
222

223
            a -= (a >> 1) & m1;
1,572,858✔
224
            a = (a & m2) + ((a >> 2) & m2);
1,572,858✔
225
            a = (a + (a >> 4)) & m4;
1,572,858✔
226
            a = (a * h01) >> 56;
1,572,858✔
227

228
            // Could use intrinsic instead:
229
            // a = __builtin_popcountll(a); // gcc intrinsic
230

231
            value_count += to_size_t(a);
1,572,858✔
232
        }
1,572,858✔
233
    }
6✔
234
    else if (m_width == 2) {
36✔
235
        if (uint64_t(value) > 3)
6✔
NEW
236
            return 0;
×
237

238
        const uint64_t v = ~0ULL / 0x3 * value;
6✔
239

240
        // Masks to avoid spillover between segments in cascades
241
        const uint64_t c1 = ~0ULL / 0x3 * 0x1;
6✔
242

243
        const size_t chunkvals = 32;
6✔
244
        for (; i + chunkvals <= end; i += chunkvals) {
3,145,728✔
245
            uint64_t a = next[i / chunkvals];
3,145,722✔
246
            a ^= v;             // zero matching bit segments
3,145,722✔
247
            a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
3,145,722✔
248
            a &= m1;            // isolate single bit in each segment
3,145,722✔
249
            a ^= m1;            // reverse isolated bits
3,145,722✔
250
            // if (!a) continue;
251

252
            // Population count
253
            a = (a & m2) + ((a >> 2) & m2);
3,145,722✔
254
            a = (a + (a >> 4)) & m4;
3,145,722✔
255
            a = (a * h01) >> 56;
3,145,722✔
256

257
            value_count += to_size_t(a);
3,145,722✔
258
        }
3,145,722✔
259
    }
6✔
260
    else if (m_width == 4) {
30✔
NEW
261
        if (uint64_t(value) > 15)
×
NEW
262
            return 0;
×
263

NEW
264
        const uint64_t v = ~0ULL / 0xF * value;
×
NEW
265
        const uint64_t m = ~0ULL / 0xF * 0x1;
×
266

267
        // Masks to avoid spillover between segments in cascades
NEW
268
        const uint64_t c1 = ~0ULL / 0xF * 0x7;
×
NEW
269
        const uint64_t c2 = ~0ULL / 0xF * 0x3;
×
270

NEW
271
        const size_t chunkvals = 16;
×
NEW
272
        for (; i + chunkvals <= end; i += chunkvals) {
×
NEW
273
            uint64_t a = next[i / chunkvals];
×
NEW
274
            a ^= v;             // zero matching bit segments
×
NEW
275
            a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
×
NEW
276
            a |= (a >> 2) & c2;
×
NEW
277
            a &= m; // isolate single bit in each segment
×
NEW
278
            a ^= m; // reverse isolated bits
×
279

280
            // Population count
NEW
281
            a = (a + (a >> 4)) & m4;
×
NEW
282
            a = (a * h01) >> 56;
×
283

NEW
284
            value_count += to_size_t(a);
×
NEW
285
        }
×
NEW
286
    }
×
287
    else if (m_width == 8) {
30✔
288
        if (value > 0x7FLL || value < -0x80LL)
6✔
NEW
289
            return 0; // by casting?
×
290

291
        const uint64_t v = ~0ULL / 0xFF * value;
6✔
292
        const uint64_t m = ~0ULL / 0xFF * 0x1;
6✔
293

294
        // Masks to avoid spillover between segments in cascades
295
        const uint64_t c1 = ~0ULL / 0xFF * 0x7F;
6✔
296
        const uint64_t c2 = ~0ULL / 0xFF * 0x3F;
6✔
297
        const uint64_t c3 = ~0ULL / 0xFF * 0x0F;
6✔
298

299
        const size_t chunkvals = 8;
6✔
300
        for (; i + chunkvals <= end; i += chunkvals) {
12,582,912✔
301
            uint64_t a = next[i / chunkvals];
12,582,906✔
302
            a ^= v;             // zero matching bit segments
12,582,906✔
303
            a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
12,582,906✔
304
            a |= (a >> 2) & c2;
12,582,906✔
305
            a |= (a >> 4) & c3;
12,582,906✔
306
            a &= m; // isolate single bit in each segment
12,582,906✔
307
            a ^= m; // reverse isolated bits
12,582,906✔
308

309
            // Population count
310
            a = (a * h01) >> 56;
12,582,906✔
311

312
            value_count += to_size_t(a);
12,582,906✔
313
        }
12,582,906✔
314
    }
6✔
315
    else if (m_width == 16) {
24✔
316
        if (value > 0x7FFFLL || value < -0x8000LL)
6✔
NEW
317
            return 0; // by casting?
×
318

319
        const uint64_t v = ~0ULL / 0xFFFF * value;
6✔
320
        const uint64_t m = ~0ULL / 0xFFFF * 0x1;
6✔
321

322
        // Masks to avoid spillover between segments in cascades
323
        const uint64_t c1 = ~0ULL / 0xFFFF * 0x7FFF;
6✔
324
        const uint64_t c2 = ~0ULL / 0xFFFF * 0x3FFF;
6✔
325
        const uint64_t c3 = ~0ULL / 0xFFFF * 0x0FFF;
6✔
326
        const uint64_t c4 = ~0ULL / 0xFFFF * 0x00FF;
6✔
327

328
        const size_t chunkvals = 4;
6✔
329
        for (; i + chunkvals <= end; i += chunkvals) {
25,165,824✔
330
            uint64_t a = next[i / chunkvals];
25,165,818✔
331
            a ^= v;             // zero matching bit segments
25,165,818✔
332
            a |= (a >> 1) & c1; // cascade ones in non-zeroed segments
25,165,818✔
333
            a |= (a >> 2) & c2;
25,165,818✔
334
            a |= (a >> 4) & c3;
25,165,818✔
335
            a |= (a >> 8) & c4;
25,165,818✔
336
            a &= m; // isolate single bit in each segment
25,165,818✔
337
            a ^= m; // reverse isolated bits
25,165,818✔
338

339
            // Population count
340
            a = (a * h01) >> 56;
25,165,818✔
341

342
            value_count += to_size_t(a);
25,165,818✔
343
        }
25,165,818✔
344
    }
6✔
345
    else if (m_width == 32) {
18✔
346
        int32_t v = int32_t(value);
12✔
347
        const int32_t* d = reinterpret_cast<int32_t*>(m_data);
12✔
348
        for (; i < end; ++i) {
100,663,362✔
349
            if (d[i] == v)
100,663,350✔
350
                ++value_count;
100,663,350✔
351
        }
100,663,350✔
352
        return value_count;
12✔
353
    }
12✔
354
    else if (m_width == 64) {
6✔
355
        const int64_t* d = reinterpret_cast<int64_t*>(m_data);
6✔
356
        for (; i < end; ++i) {
66✔
357
            if (d[i] == value)
60✔
358
                ++value_count;
60✔
359
        }
60✔
360
        return value_count;
6✔
361
    }
6✔
362

363
    // Check remaining elements
364
    for (; i < end; ++i)
648✔
365
        if (value == get(i))
624✔
366
            ++value_count;
624✔
367

368
    return value_count;
24✔
369
}
42✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc