realm / realm-core / jorgen.edelbo_402

21 Aug 2024 11:10AM UTC coverage: 91.054% (-0.03%) from 91.085%

Pull Request #7803: Feature/string compression (Evergreen)
jedelbo: Small fix to Table::typed_write

    When writing the realm to a new file from a write transaction,
    the Table may be COW so that the top ref is changed. So don't
    use the ref that is present in the group when the operation starts.

103494 of 181580 branches covered (57.0%)
1929 of 1999 new or added lines in 46 files covered (96.5%)
695 existing lines in 51 files now uncovered
220142 of 241772 relevant lines covered (91.05%)
7344461.76 hits per line

Source file: /src/realm/string_interner.cpp (93.77% covered)
/*************************************************************************
 *
 * Copyright 2016 Realm Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **************************************************************************/

#include <realm/string_interner.hpp>
#include <realm/string_compressor.hpp>
#include <realm/string_data.hpp>
#include <string_view>

namespace realm {

// Fast mapping of strings (or rather hash of strings) to string IDs.
//
// We use a tree where:
// * All interior nodes are radix nodes with a fan-out of 256.
// * Leaf nodes with up to 16 entries are just lists, searched linearly
// * Leaf nodes with more than 16 entries and less than 1K are hash tables.
//   Hash tables use linear search starting from the entry found by hashing.
//
constexpr static size_t linear_search_limit = 16;
constexpr static size_t hash_node_min_size = 32;
constexpr static size_t hash_node_max_size = 1024;
constexpr static size_t radix_node_consumes_bits = 8;
constexpr static size_t radix_node_size = 1ULL << radix_node_consumes_bits;

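// Each leaf entry packs a string ID together with the remaining hash bits:
//     entry = (id << hash_size) | hash
// so an entry whose ID part is zero marks an unused slot. As an illustrative
// example (values not taken from the code): with hash_size == 32, id == 5 and
// hash == 0x12345678, the stored entry is (5ULL << 32) | 0x12345678. Each radix
// level strips radix_node_consumes_bits off the hash and off hash_size before
// descending into a subtree.
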
// helpers
struct HashMapIter {
    Array& m_array;
    uint32_t hash_filter;
    uint16_t index;
    uint16_t left_to_search;
    uint8_t hash_size;
    HashMapIter(Array& array, uint32_t hash, uint8_t hash_size)
        : m_array(array)
        , hash_filter(hash)
        , hash_size(hash_size)
    {
        set_index(0);
    }
    HashMapIter(Array& dummy)
        : m_array(dummy)
    {
        left_to_search = 0;
    }
    inline uint32_t get()
    {
        return (uint32_t)(m_array.get(index) >> hash_size);
    }
    inline bool empty()
    {
        auto element = m_array.get(index);
        return (element >> hash_size) == 0;
    }
    inline void set(uint64_t element)
    {
        m_array.set(index, element);
    }
    inline bool matches()
    {
        auto mask = 0xFFFFFFFFUL >> (32 - hash_size);
        auto element = m_array.get(index);
        return ((element & mask) == hash_filter) && (element >> hash_size);
    }
    inline bool is_valid()
    {
        return left_to_search != 0;
    }
    inline void set_index(size_t i, size_t search_limit = linear_search_limit)
    {
        index = (uint16_t)i;
        left_to_search = (uint16_t)std::min(m_array.size(), (size_t)search_limit);
    }
    void operator++()
    {
        if (is_valid()) {
            left_to_search--;
            index++;
            if (index == m_array.size()) {
                index = 0;
            }
        }
    }
};

// Attempt to build a hash leaf from a smaller hash leaf or a non-hash leaf.
static bool rehash(Array& from, Array& to, uint8_t hash_size)
{
    REALM_ASSERT_DEBUG(from.size() * 2 <= to.size());

    for (size_t i = 0; i < from.size(); ++i) {
        auto entry = (size_t)from.get(i);
        if ((entry >> hash_size) == 0)
            continue;
        size_t starting_index = entry & (to.size() - 1);
        HashMapIter it(to, 0, hash_size);
        it.set_index(starting_index);
        while (it.is_valid() && !it.empty()) {
            ++it;
        }
        if (!it.is_valid()) {
            // abort rehashing, we need a larger to-space
            return false;
        }
        REALM_ASSERT(it.empty());
        it.set(entry);
    }
    return true;
}

// Add a binding from hash value to id.
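// The node grows through three shapes as bindings are added: a plain list of up
// to linear_search_limit entries, then a hash-table leaf that doubles in size up
// to hash_node_max_size, and finally a radix node whose 256 subtrees are indexed
// by the low radix_node_consumes_bits bits of the remaining hash.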
static void add_to_hash_map(Array& node, uint64_t hash, uint64_t id, uint8_t hash_size)
{
    REALM_ASSERT(node.is_attached());
    if (!node.has_refs()) {
        // it's a leaf.
        if (node.size() < linear_search_limit) {
            // it's a list with room to grow
            node.add(((uint64_t)id << hash_size) | hash);
            return;
        }
        if (node.size() == linear_search_limit) {
            // it's a full list, must be converted to a hash table
            Array new_node(node.get_alloc());
            new_node.create(NodeHeader::type_Normal, false, hash_node_min_size, 0);
            new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
            new_node.update_parent();
            // transform existing list into hash table
            rehash(node, new_node, hash_size);
            node.destroy();
            node.init_from_parent();
        }
        // it's a hash table. Grow if needed up to 'hash_node_max_size' entries
        while (node.size() < hash_node_max_size) {
            auto size = node.size();
            size_t start_index = hash & (size - 1);
            HashMapIter it(node, 0, hash_size);
            it.set_index(start_index);
            while (it.is_valid() && !it.empty()) {
                ++it;
            }
            if (it.is_valid()) {
                // found an empty spot within search range
                it.set(((uint64_t)id << hash_size) | hash);
                return;
            }
            if (node.size() >= hash_node_max_size)
                break;
            // No free spot found - rehash into bigger and bigger tables
            auto new_size = node.size();
            bool need_to_rehash = true;
            Array new_node(node.get_alloc());
            while (need_to_rehash && new_size < hash_node_max_size) {
                new_size *= 2;
                new_node.create(NodeHeader::type_Normal, false, new_size, 0);
                need_to_rehash = !rehash(node, new_node, hash_size);
                if (need_to_rehash) { // we failed, try again - or shift to radix
                    // I find it counter-intuitive. But it CAN happen.
                    new_node.destroy();
                }
            }
            if (need_to_rehash)
                break;
            new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
            new_node.update_parent();
            node.destroy();
            node.init_from_parent();
        }
        // we ran out of space. Rewrite as a radix node with subtrees
        Array new_node(node.get_alloc());
        new_node.create(NodeHeader::type_HasRefs, false, radix_node_size, 0);
        new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
        new_node.update_parent();
        for (size_t index = 0; index < node.size(); ++index) {
            auto element = node.get(index);
            auto hash = element & (0xFFFFFFFF >> (32 - hash_size));
            auto string_id = element >> hash_size;
            if (string_id == 0)
                continue;
            auto remaining_hash = hash >> radix_node_consumes_bits;
            add_to_hash_map(new_node, remaining_hash, string_id, hash_size - 8);
        }
        node.destroy();
        node.init_from_parent();
    }
    // We have a radix node and need to insert the new binding into the proper subtree
    size_t index = hash & (radix_node_size - 1);
    auto rot = node.get_as_ref_or_tagged(index);
    REALM_ASSERT(!rot.is_tagged());
    Array subtree(node.get_alloc());
    if (rot.get_as_ref() == 0) {
        // no subtree present, create an empty one
        subtree.set_parent(&node, index);
        subtree.create(NodeHeader::type_Normal);
        subtree.update_parent();
    }
    else {
        // subtree already present
        subtree.set_parent(&node, index);
        subtree.init_from_parent();
    }
    // recurse into subtree
    add_to_hash_map(subtree, hash >> radix_node_consumes_bits, id, hash_size - radix_node_consumes_bits);
}

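// Collect the IDs of all strings whose stored hash bits match 'hash'. Callers
// still have to compare the actual strings, since distinct strings can share a
// hash value.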
static std::vector<uint32_t> hash_to_id(Array& node, uint32_t hash, uint8_t hash_size)
{
    std::vector<uint32_t> result;
    REALM_ASSERT(node.is_attached());
    if (!node.has_refs()) {
        // it's a leaf - default is a list, search starts from index 0.
        HashMapIter it(node, hash, hash_size);
        if (node.size() >= hash_node_min_size) {
            // it is a hash table, so use hash to select index to start searching
            // table size must be power of two!
            size_t index = hash & (node.size() - 1);
            it.set_index(index);
        }
        // collect all matching values within allowed range
        while (it.is_valid()) {
            if (it.matches()) {
                result.push_back(it.get());
            }
            ++it;
        }
        return result;
    }
    else {
        // it's a radix node
        size_t index = hash & (node.size() - 1);
        auto rot = node.get_as_ref_or_tagged(index);
        REALM_ASSERT(rot.is_ref());
        if (rot.get_as_ref() == 0) {
            // no subtree, return empty vector
            return result;
        }
        // descend into subtree
        Array subtree(node.get_alloc());
        subtree.set_parent(&node, index);
        subtree.init_from_parent();
        return hash_to_id(subtree, hash >> radix_node_consumes_bits, hash_size - radix_node_consumes_bits);
    }
}


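// Slot layout of the interner's top array (see 'positions') and the in-memory
// image of one data leaf: a DataLeaf couples the leaf ref with views into the
// compressed strings it holds; m_is_loaded tells whether those views have been
// populated yet.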
enum positions { Pos_Version, Pos_ColKey, Pos_Size, Pos_Compressor, Pos_Data, Pos_Map, Top_Size };
struct StringInterner::DataLeaf {
    ref_type m_leaf_ref = 0;
    std::vector<CompressedStringView> m_compressed;
    std::atomic<bool> m_is_loaded = false;
    DataLeaf() {}
    DataLeaf(ref_type ref)
        : m_leaf_ref(ref)
    {
    }
    DataLeaf(const DataLeaf&& other)
        : m_leaf_ref(other.m_leaf_ref)
        , m_compressed(other.m_compressed)
        , m_is_loaded(other.m_is_loaded.load(std::memory_order_acquire))
    {
    }
};

StringInterner::StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable)
    : m_parent(parent)
    , m_top(alloc)
    , m_data(alloc)
    , m_hash_map(alloc)
    , m_current_string_leaf(alloc)
    , m_current_long_string_node(alloc)
{
    REALM_ASSERT_DEBUG(col_key != ColKey());
    size_t index = col_key.get_index().val;
    // ensure that m_top and m_data are well defined and reflect any existing data
    // We'll have to extend this to handle no defined backing
    m_top.set_parent(&parent, index);
    m_data.set_parent(&m_top, Pos_Data);
    m_hash_map.set_parent(&m_top, Pos_Map);
    m_col_key = col_key;
    update_from_parent(writable);
}

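// Refresh all accessors after the underlying data may have changed. Three outcomes
// are possible: attach to an existing top array, create a fresh one when 'writable'
// and a valid parent slot exist, or detach everything and enter "dead" mode when
// neither holds.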
void StringInterner::update_from_parent(bool writable)
{
    auto parent_idx = m_top.get_ndx_in_parent();
    bool valid_top_ref_spot = m_parent.is_attached() && parent_idx < m_parent.size();
    bool valid_top = valid_top_ref_spot && m_parent.get_as_ref(parent_idx);
    if (valid_top) {
        m_top.update_from_parent();
        m_data.update_from_parent();
        m_hash_map.update_from_parent();
    }
    else if (writable && valid_top_ref_spot) {
        m_top.create(NodeHeader::type_HasRefs, false, Top_Size, 0);
        m_top.set(Pos_Version, (1 << 1) + 1); // version number 1.
        m_top.set(Pos_Size, (0 << 1) + 1);    // total size 0
        m_top.set(Pos_ColKey, (m_col_key.value << 1) + 1);
        m_top.set(Pos_Compressor, 0);

        // create first level of data tree here (to simplify other stuff)
        m_data.create(NodeHeader::type_HasRefs, false, 0);
        m_data.update_parent();

        m_hash_map.create(NodeHeader::type_Normal);
        m_hash_map.update_parent();
        m_top.update_parent();
        valid_top = true;
    }
    if (!valid_top) {
        // We're lacking part of underlying data and not allowed to create it, so enter "dead" mode
        m_compressor.reset();
        m_compressed_leafs.clear();
        // m_compressed_string_map.clear();
        m_top.detach();
        m_data.detach();
        m_hash_map.detach();
        m_compressor.reset();
        return;
    }
    // validate we're accessing data for the correct column. A combination of column erase
    // and insert could lead to an interner being paired with wrong data in the file.
    // If so, we clear internal data forcing rebuild_internal() to rebuild from scratch.
    int64_t data_colkey = m_top.get_as_ref_or_tagged(Pos_ColKey).get_as_int();
    if (m_col_key.value != data_colkey) {
        // new column, new data
        m_compressor.reset();
        m_decompressed_strings.clear();
    }
    if (!m_compressor)
        m_compressor = std::make_unique<StringCompressor>(m_top.get_alloc(), m_top, Pos_Compressor, writable);
    else
        m_compressor->refresh(writable);
    if (m_data.size()) {
        auto ref_to_write_buffer = m_data.get_as_ref(m_data.size() - 1);
        const char* header = m_top.get_alloc().translate(ref_to_write_buffer);
        bool is_array_of_cprs = NodeHeader::get_hasrefs_from_header(header);
        if (is_array_of_cprs) {
            m_current_long_string_node.set_parent(&m_data, m_data.size() - 1);
            m_current_long_string_node.update_from_parent();
        }
        else {
            m_current_long_string_node.detach();
        }
    }
    else
        m_current_long_string_node.detach(); // just in case...

    // rebuild internal structures......
    rebuild_internal();
    m_current_string_leaf.detach();
}

void StringInterner::rebuild_internal()
{
    // release old decompressed strings
    for (size_t idx = 0; idx < m_in_memory_strings.size(); ++idx) {
        StringID id = m_in_memory_strings[idx];
        if (id > m_decompressed_strings.size()) {
            m_in_memory_strings[idx] = m_in_memory_strings.back();
            m_in_memory_strings.pop_back();
            continue;
        }
        if (auto& w = m_decompressed_strings[id - 1].m_weight) {
            auto val = w.load(std::memory_order_acquire);
            val = val >> 1;
            w.store(val, std::memory_order_release);
        }
        else {
            m_decompressed_strings[id - 1].m_decompressed.reset();
            m_in_memory_strings[idx] = m_in_memory_strings.back();
            m_in_memory_strings.pop_back();
            continue;
        }
    }

    size_t target_size = (size_t)m_top.get_as_ref_or_tagged(Pos_Size).get_as_int();
    m_decompressed_strings.resize(target_size);
    if (m_data.size() != m_compressed_leafs.size()) {
        m_compressed_leafs.resize(m_data.size());
    }
    // always force new setup of all leafs:
    // update m_compressed_leafs to reflect m_data
    for (size_t idx = 0; idx < m_compressed_leafs.size(); ++idx) {
        auto ref = m_data.get_as_ref(idx);
        auto& leaf_meta = m_compressed_leafs[idx];
        leaf_meta.m_is_loaded.store(false, std::memory_order_release);
        leaf_meta.m_compressed.clear();
        leaf_meta.m_leaf_ref = ref;
    }
}

StringInterner::~StringInterner() {}

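// Intern a string: return the ID of an existing entry with the same contents, or
// compress the string, append it to the current data leaf, register it in the
// hash map and return the newly allocated ID. ID 0 is reserved for the null string.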
StringID StringInterner::intern(StringData sd)
{
    REALM_ASSERT(m_top.is_attached());
    //  special case for null string
    if (sd.data() == nullptr)
        return 0;
    uint32_t h = (uint32_t)sd.hash();
    auto candidates = hash_to_id(m_hash_map, h, 32);
    for (auto& candidate : candidates) {
        auto candidate_cpr = get_compressed(candidate);
        if (m_compressor->compare(sd, candidate_cpr) == 0)
            return candidate;
    }
    // it's a new string
    bool learn = true;
    auto c_str = m_compressor->compress(sd, learn);
    m_decompressed_strings.emplace_back(64, std::make_unique<std::string>(sd));
    auto id = m_decompressed_strings.size();
    m_in_memory_strings.push_back(id);
    add_to_hash_map(m_hash_map, h, id, 32);
    size_t index = (size_t)m_top.get_as_ref_or_tagged(Pos_Size).get_as_int();
    REALM_ASSERT_DEBUG(index == id - 1);
    bool need_long_string_node = c_str.size() >= 65536;

    if (need_long_string_node && !m_current_long_string_node.is_attached()) {

        m_current_long_string_node.create(NodeHeader::type_HasRefs);

        if ((index & 0xFF) == 0) {
            // if we're starting on a new leaf, extend parent array for it
            m_data.add(0);
            m_compressed_leafs.push_back({});
            m_current_long_string_node.set_parent(&m_data, m_data.size() - 1);
            m_current_long_string_node.update_parent();
            REALM_ASSERT_DEBUG(!m_current_string_leaf.is_attached() || m_current_string_leaf.size() == 0);
            m_current_string_leaf.detach();
        }
        else {
            // we have been building an existing leaf and need to shift representation.
            // but first we need to update leaf accessor for existing leaf
            if (m_current_string_leaf.is_attached()) {
                m_current_string_leaf.update_from_parent();
            }
            else {
                m_current_string_leaf.init_from_ref(m_current_string_leaf.get_ref_from_parent());
            }
            REALM_ASSERT_DEBUG(m_current_string_leaf.size() > 0);
            m_current_long_string_node.set_parent(&m_data, m_data.size() - 1);
            m_current_long_string_node.update_parent();
            // convert the current leaf into a long string node. (array of strings in separate arrays)
            for (auto s : m_compressed_leafs.back().m_compressed) {
                ArrayUnsigned arr(m_top.get_alloc());
                arr.create(s.size, 65535);
                unsigned short* dest = reinterpret_cast<unsigned short*>(arr.m_data);
                std::copy_n(s.data, s.size, dest);
                m_current_long_string_node.add(arr.get_ref());
            }
            m_current_string_leaf.destroy();
            // force later reload of leaf
            m_compressed_leafs.back().m_is_loaded.store(false, std::memory_order_release);
        }
    }
    if (m_current_long_string_node.is_attached()) {
        ArrayUnsigned arr(m_top.get_alloc());
        arr.create(c_str.size(), 65535);
        unsigned short* begin = c_str.data();
        if (begin) {
            // if the compressed string is empty, 'begin' is zero and we don't copy
            size_t n = c_str.size();
            unsigned short* dest = reinterpret_cast<unsigned short*>(arr.m_data);
            std::copy_n(begin, n, dest);
        }
        m_current_long_string_node.add(arr.get_ref());
        m_current_long_string_node.update_parent();
        if (m_current_long_string_node.size() == 256) {
            // exit from "long string mode"
            m_current_long_string_node.detach();
        }
        CompressionSymbol* p_start = reinterpret_cast<CompressionSymbol*>(arr.m_data);
        m_compressed_leafs.back().m_compressed.push_back({p_start, arr.size()});
    }
    else {
        // Append to leaf with up to 256 entries.
        // First create a new leaf if needed (limit number of entries to 256 per leaf)
        bool need_leaf_update = !m_current_string_leaf.is_attached() || (index & 0xFF) == 0;
        if (need_leaf_update) {
            m_current_string_leaf.set_parent(&m_data, index >> 8);
            if ((index & 0xFF) == 0) {
                // create new leaf
                m_current_string_leaf.create(0, 65535);
                m_data.add(m_current_string_leaf.get_ref());
                m_compressed_leafs.push_back({});
            }
            else {
                // just setup leaf accessor
                if (m_current_string_leaf.is_attached()) {
                    m_current_string_leaf.update_from_parent();
                }
                else {
                    m_current_string_leaf.init_from_ref(m_current_string_leaf.get_ref_from_parent());
                }
            }
        }
        REALM_ASSERT(c_str.size() < 65535);
        // Add compressed string at end of leaf
        m_current_string_leaf.add(c_str.size());
        for (auto c : c_str) {
            m_current_string_leaf.add(c);
        }
        REALM_ASSERT_DEBUG(m_compressed_leafs.size());
        CompressionSymbol* p = reinterpret_cast<CompressionSymbol*>(m_current_string_leaf.m_data);
        auto p_limit = p + m_current_string_leaf.size();
        auto p_start = p_limit - c_str.size();
        m_compressed_leafs.back().m_compressed.push_back({p_start, c_str.size()});
        REALM_ASSERT(m_compressed_leafs.back().m_compressed.size() <= 256);
    }
    m_top.adjust(Pos_Size, 2); // type is has_Refs, so increment is by 2
    load_leaf_if_new_ref(m_compressed_leafs.back(), m_data.get_as_ref(m_data.size() - 1));
#ifdef REALM_DEBUG
    auto csv = get_compressed(id);
    CompressedStringView csv2(c_str);
    REALM_ASSERT(csv == csv2);
#endif
    return id;
}

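// Populate the in-memory views of a data leaf from its underlying array unless
// that has already been done. Returns true if this call performed the (re)load.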
bool StringInterner::load_leaf_if_needed(DataLeaf& leaf)
{
    if (!leaf.m_is_loaded.load(std::memory_order_relaxed)) {
        // start with an empty leaf:
        leaf.m_compressed.clear();
        leaf.m_compressed.reserve(256);

        // must interpret leaf first - the leaf is either a single array holding all strings,
        // or an array with each (compressed) string placed in its own array.
        const char* header = m_top.get_alloc().translate(leaf.m_leaf_ref);
        bool is_single_array = !NodeHeader::get_hasrefs_from_header(header);
        if (is_single_array) {
            size_t leaf_offset = 0;
            ArrayUnsigned leaf_array(m_top.get_alloc());
            leaf_array.init_from_ref(leaf.m_leaf_ref);
            REALM_ASSERT(NodeHeader::get_encoding(leaf_array.get_header()) == NodeHeader::Encoding::WTypBits);
            REALM_ASSERT(NodeHeader::get_width_from_header(leaf_array.get_header()) == 16);
            // This is dangerous if the leaf is for some reason not in the assumed format
            CompressionSymbol* c = reinterpret_cast<CompressionSymbol*>(leaf_array.m_data);
            auto leaf_size = leaf_array.size();
            while (leaf_offset < leaf_size) {
                size_t length = c[leaf_offset];
                REALM_ASSERT_DEBUG(length == leaf_array.get(leaf_offset));
                leaf_offset++;
                leaf.m_compressed.push_back({c + leaf_offset, length});
                REALM_ASSERT_DEBUG(leaf.m_compressed.size() <= 256);
                leaf_offset += length;
            }
        }
        else {
            // Not a single leaf - instead an array of strings
            Array arr(m_top.get_alloc());
            arr.init_from_ref(leaf.m_leaf_ref);
            for (size_t idx = 0; idx < arr.size(); ++idx) {
                ArrayUnsigned str_array(m_top.get_alloc());
                ref_type ref = arr.get_as_ref(idx);
                str_array.init_from_ref(ref);
                REALM_ASSERT(NodeHeader::get_encoding(str_array.get_header()) == NodeHeader::Encoding::WTypBits);
                REALM_ASSERT(NodeHeader::get_width_from_header(str_array.get_header()) == 16);
                CompressionSymbol* c = reinterpret_cast<CompressionSymbol*>(str_array.m_data);
                leaf.m_compressed.push_back({c, str_array.size()});
            }
        }
        leaf.m_is_loaded.store(true, std::memory_order_release);
        return true;
    }
    return false;
}

// Danger: Only to be used if you know that a change in content ==> different ref
bool StringInterner::load_leaf_if_new_ref(DataLeaf& leaf, ref_type new_ref)
{
    if (leaf.m_leaf_ref != new_ref) {
        leaf.m_leaf_ref = new_ref;
        leaf.m_is_loaded = false;
        leaf.m_compressed.resize(0);
    }
    return load_leaf_if_needed(leaf);
}

CompressedStringView& StringInterner::get_compressed(StringID id, bool lock_if_mutating)
{
    auto index = id - 1; // 0 represents null
    auto hi = index >> 8;
    auto lo = index & 0xFFUL;

    // This is an instance of the "double checked locking" idiom, chosen to minimize
    // locking in the common case of the leaf already being fully initialized.
    DataLeaf& leaf = m_compressed_leafs[hi];
    if (leaf.m_is_loaded.load(std::memory_order_acquire)) {
        return leaf.m_compressed[lo];
    }
    if (lock_if_mutating) {
        std::lock_guard lock(m_mutex);
        load_leaf_if_needed(leaf);
    }
    else {
        load_leaf_if_needed(leaf);
    }
    REALM_ASSERT_DEBUG(lo < leaf.m_compressed.size());
    return leaf.m_compressed[lo];
}

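// Read-only counterpart of intern(): return the ID of an already interned string,
// 0 for the null string, or no value if the string (or the backing data) is absent.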
std::optional<StringID> StringInterner::lookup(StringData sd)
{
    if (!m_top.is_attached()) {
        // "dead" mode
        return {};
    }
    if (sd.data() == nullptr)
        return 0;
    uint32_t h = (uint32_t)sd.hash();
    auto candidates = hash_to_id(m_hash_map, h, 32);
    for (auto& candidate : candidates) {
        auto candidate_cpr = get_compressed(candidate, true);
        if (m_compressor->compare(sd, candidate_cpr) == 0)
            return candidate;
    }
    return {};
}

int StringInterner::compare(StringID A, StringID B)
{
    // comparisons against null
    if (A == B && A == 0)
        return 0;
    if (A == 0)
        return -1;
    if (B == 0)
        return 1;
    // ok, no nulls.
    // StringID 0 is null, the first true index starts from 1.
    REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size());
    REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size());
    REALM_ASSERT(m_compressor);
    return m_compressor->compare(get_compressed(A, true), get_compressed(B, true));
}

int StringInterner::compare(StringData s, StringID A)
{
    // comparisons against null
    if (s.data() == nullptr && A == 0)
        return 0;
    if (s.data() == nullptr)
        return -1;
    if (A == 0)
        return 1;
    // ok, no nulls
    REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size());
    REALM_ASSERT(m_compressor);
    return m_compressor->compare(s, get_compressed(A, true));
}


StringData StringInterner::get(StringID id)
{
    REALM_ASSERT(m_compressor);
    if (id == 0)
        return StringData{nullptr};
    REALM_ASSERT_DEBUG(id <= m_decompressed_strings.size());

    // Avoid taking a lock in the (presumably) common case, where the leaf with the compressed
    // strings has already been loaded. Such leafs have "m_weight" > 0.
    CachedString& cs = m_decompressed_strings[id - 1];
    if (auto weight = cs.m_weight.load(std::memory_order_acquire)) {
        REALM_ASSERT_DEBUG(cs.m_decompressed);
        if (weight < 128) {
            // ignore if this fails - that happens if some other thread bumps the value
            // concurrently. And if so, we can live with losing our own "bump"
            cs.m_weight.compare_exchange_strong(weight, weight + 64, std::memory_order_acq_rel);
        }
        return {cs.m_decompressed->c_str(), cs.m_decompressed->size()};
    }
    std::lock_guard lock(m_mutex);
    cs.m_decompressed = std::make_unique<std::string>(m_compressor->decompress(get_compressed(id)));
    m_in_memory_strings.push_back(id);
    cs.m_weight.store(64, std::memory_order_release);
    return {cs.m_decompressed->c_str(), cs.m_decompressed->size()};
}

} // namespace realm