• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / finn.schiermer-andersen_89

04 Jun 2024 02:04PM UTC coverage: 90.651% (-0.03%) from 90.685%
finn.schiermer-andersen_89

Pull #7654

Evergreen

finnschiermer
optimized string cache gc
Pull Request #7654: Fsa/string interning

102644 of 180648 branches covered (56.82%)

1005 of 1125 new or added lines in 15 files covered. (89.33%)

154 existing lines in 21 files now uncovered.

217953 of 240431 relevant lines covered (90.65%)

7671710.15 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.82
/src/realm/string_interner.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2016 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <realm/string_interner.hpp>
20
#include <realm/string_data.hpp>
21

22
#include <realm/array_unsigned.hpp>
23
#include <string_view>
24

25
namespace realm {
26

27
// Fast mapping of strings (or rather hash of strings) to string IDs.
28
//
29
// We use a tree where:
30
// * All interior nodes are radix nodes with a fan-out of 256.
31
// * Leaf nodes with up to 16 entries are just lists, searched linearly
32
// * Leaf nodes with more than 16 entries and less than 1K are hash tables.
33
//   Hash tables use linear search starting from the entry found by hashing.
34
//
35
constexpr static size_t linear_search_limit = 16;
36
constexpr static size_t hash_node_min_size = 32;
37
constexpr static size_t hash_node_max_size = 1024;
38
constexpr static size_t radix_node_consumes_bits = 8;
39
constexpr static size_t radix_node_size = 1ULL << radix_node_consumes_bits;
40

41
// helpers
42
struct HashMapIter {
43
    Array& m_array;
44
    uint32_t hash_filter;
45
    uint16_t index;
46
    uint16_t left_to_search;
47
    uint8_t hash_size;
48
    HashMapIter(Array& array, uint32_t hash, uint8_t hash_size)
49
        : m_array(array)
1,992,603✔
50
        , hash_filter(hash)
1,992,603✔
51
        , hash_size(hash_size)
1,992,603✔
52
    {
4,028,829✔
53
        set_index(0);
4,028,829✔
54
    }
4,028,829✔
55
    HashMapIter(Array& dummy)
56
        : m_array(dummy)
NEW
57
    {
×
NEW
58
        left_to_search = 0;
×
NEW
59
    }
×
60
    inline uint32_t get()
61
    {
560,673✔
62
        return (uint32_t)(m_array.get(index) >> hash_size);
560,673✔
63
    }
560,673✔
64
    inline bool empty()
65
    {
5,987,991✔
66
        auto element = m_array.get(index);
5,987,991✔
67
        return (element >> hash_size) == 0;
5,987,991✔
68
    }
5,987,991✔
69
    inline void set(uint64_t element)
70
    {
2,337,288✔
71
        m_array.set(index, element);
2,337,288✔
72
    }
2,337,288✔
73
    inline bool matches()
74
    {
17,263,458✔
75
        auto mask = 0xFFFFFFFFUL >> (32 - hash_size);
17,263,458✔
76
        auto element = m_array.get(index);
17,263,458✔
77
        return ((element & mask) == hash_filter) && (element >> hash_size);
17,263,458✔
78
    }
17,263,458✔
79
    inline bool is_valid()
80
    {
45,104,808✔
81
        return left_to_search != 0;
45,104,808✔
82
    }
45,104,808✔
83
    inline void set_index(size_t i, size_t search_limit = linear_search_limit)
84
    {
7,214,025✔
85
        index = (uint16_t)i;
7,214,025✔
86
        left_to_search = (uint16_t)std::min(m_array.size(), (size_t)search_limit);
7,214,025✔
87
    }
7,214,025✔
88
    void operator++()
89
    {
19,365,705✔
90
        if (is_valid()) {
19,367,580✔
91
            left_to_search--;
19,367,568✔
92
            index++;
19,367,568✔
93
            if (index == m_array.size()) {
19,367,568✔
94
                index = 0;
785,757✔
95
            }
785,757✔
96
        }
19,367,568✔
97
    }
19,365,705✔
98
};
99

100
// Attempt to build a hash leaf from a smaller hash leaf or a non-hash leaf.
101
static bool rehash(Array& from, Array& to, uint8_t hash_size)
102
{
21,195✔
103
    REALM_ASSERT_DEBUG(from.size() * 2 == to.size());
21,195✔
104

105
    for (size_t i = 0; i < from.size(); ++i) {
2,186,067✔
106
        auto entry = (size_t)from.get(i);
2,164,872✔
107
        if ((entry >> hash_size) == 0)
2,164,872✔
108
            continue;
620,577✔
109
        size_t starting_index = entry & (to.size() - 1);
1,544,295✔
110
        HashMapIter it(to, 0, hash_size);
1,544,295✔
111
        it.set_index(starting_index);
1,544,295✔
112
        while (it.is_valid() && !it.empty()) {
1,976,661✔
113
            ++it;
432,360✔
114
        }
432,360✔
115
        if (!it.is_valid()) {
1,544,295✔
116
            // abort rehashing, we need a larger to-space
NEW
117
            return false;
×
NEW
118
        }
×
119
        REALM_ASSERT(it.empty());
1,544,295✔
120
        it.set(entry);
1,544,295✔
121
    }
1,544,295✔
122
    return true;
21,195✔
123
}
21,195✔
124

125
// Add a binding from hash value to id.
126
static void add_to_hash_map(Array& node, uint64_t hash, uint64_t id, uint8_t hash_size)
127
{
3,122,652✔
128
    REALM_ASSERT(node.is_attached());
3,122,652✔
129
    if (!node.has_refs()) {
3,122,652✔
130
        // it's a leaf.
131
        if (node.size() < linear_search_limit) {
1,712,691✔
132
            // it's a list with room to grow
133
            node.add(((uint64_t)id << hash_size) | hash);
918,018✔
134
            return;
918,018✔
135
        }
918,018✔
136
        if (node.size() == linear_search_limit) {
794,673✔
137
            // it's a full list, must be converted to a hash table
138
            Array new_node(node.get_alloc());
6,900✔
139
            new_node.create(NodeHeader::type_Normal, false, hash_node_min_size, 0);
6,900✔
140
            new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
6,900✔
141
            new_node.update_parent();
6,900✔
142
            // transform existing list into hash table
143
            rehash(node, new_node, hash_size);
6,900✔
144
            node.destroy();
6,900✔
145
            node.init_from_parent();
6,900✔
146
        }
6,900✔
147
        // it's a hash table. Grow if needed up till 'hash_node_max_size' entries
148
        while (node.size() < hash_node_max_size) {
808,938✔
149
            auto size = node.size();
807,321✔
150
            size_t start_index = hash & (size - 1);
807,321✔
151
            HashMapIter it(node, 0, hash_size);
807,321✔
152
            it.set_index(start_index);
807,321✔
153
            while (it.is_valid() && !it.empty()) {
2,481,450✔
154
                ++it;
1,674,129✔
155
            }
1,674,129✔
156
            if (it.is_valid()) {
807,321✔
157
                // found an empty spot within search range
158
                it.set(((uint64_t)id << hash_size) | hash);
793,056✔
159
                return;
793,056✔
160
            }
793,056✔
161
            if (node.size() >= hash_node_max_size)
14,265✔
NEW
162
                break;
×
163
            // No free spot found - rehash into bigger and bigger tables
164
            auto new_size = node.size();
14,265✔
165
            bool need_to_rehash = true;
14,265✔
166
            Array new_node(node.get_alloc());
14,265✔
167
            while (need_to_rehash && new_size < hash_node_max_size) {
28,560✔
168
                new_size *= 2;
14,295✔
169
                new_node.create(NodeHeader::type_Normal, false, new_size, 0);
14,295✔
170
                need_to_rehash = !rehash(node, new_node, hash_size);
14,295✔
171
                if (need_to_rehash) { // we failed, try again - or shift to radix
14,295✔
172
                    // I find it counter-intuitive. But it CAN happen.
NEW
173
                    new_node.destroy();
×
NEW
174
                }
×
175
            }
14,295✔
176
            if (need_to_rehash)
14,265✔
NEW
177
                break;
×
178
            new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
14,265✔
179
            new_node.update_parent();
14,265✔
180
            node.destroy();
14,265✔
181
            node.init_from_parent();
14,265✔
182
        }
14,265✔
183
        // we ran out of space. Rewrite as a radix node with subtrees
184
        Array new_node(node.get_alloc());
1,617✔
185
        new_node.create(NodeHeader::type_HasRefs, false, radix_node_size, 0);
1,617✔
186
        new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
1,617✔
187
        new_node.update_parent();
1,617✔
188
        for (size_t index = 0; index < node.size(); ++index) {
1,737,297✔
189
            auto element = node.get(index);
1,735,680✔
190
            auto hash = element & (0xFFFFFFFF >> (32 - hash_size));
1,735,680✔
191
            auto string_id = element >> hash_size;
1,735,680✔
192
            if (string_id == 0)
1,735,680✔
193
                continue;
1,171,104✔
194
            auto remaining_hash = hash >> radix_node_consumes_bits;
564,576✔
195
            add_to_hash_map(new_node, remaining_hash, string_id, hash_size - 8);
564,576✔
196
        }
564,576✔
197
        node.destroy();
1,617✔
198
        node.init_from_parent();
1,617✔
199
    }
1,617✔
200
    // We have a radix node and need to insert the new binding into the proper subtree
201
    size_t index = hash & (radix_node_size - 1);
1,411,578✔
202
    auto rot = node.get_as_ref_or_tagged(index);
1,411,578✔
203
    REALM_ASSERT(!rot.is_tagged());
1,411,578✔
204
    Array subtree(node.get_alloc());
1,411,578✔
205
    if (rot.get_as_ref() == 0) {
1,411,578✔
206
        // no subtree present, create an empty one
207
        subtree.set_parent(&node, index);
345,588✔
208
        subtree.create(NodeHeader::type_Normal);
345,588✔
209
        subtree.update_parent();
345,588✔
210
    }
345,588✔
211
    else {
1,065,990✔
212
        // subtree already present
213
        subtree.set_parent(&node, index);
1,065,990✔
214
        subtree.init_from_parent();
1,065,990✔
215
    }
1,065,990✔
216
    // recurse into subtree
217
    add_to_hash_map(subtree, hash >> radix_node_consumes_bits, id, hash_size - radix_node_consumes_bits);
1,411,578✔
218
}
1,411,578✔
219

220
static std::vector<uint32_t> hash_to_id(Array& node, uint32_t hash, uint8_t hash_size)
221
{
2,693,949✔
222
    std::vector<uint32_t> result;
2,693,949✔
223
    REALM_ASSERT(node.is_attached());
2,693,949✔
224
    if (!node.has_refs()) {
2,693,949✔
225
        // it's a leaf - default is a list, search starts from index 0.
226
        HashMapIter it(node, hash, hash_size);
1,677,573✔
227
        if (node.size() > hash_node_min_size) {
1,677,573✔
228
            // it is a hash table, so use hash to select index to start searching
229
            // table size must be power of two!
230
            size_t index = hash & (node.size() - 1);
833,835✔
231
            it.set_index(index);
833,835✔
232
        }
833,835✔
233
        // collect all matching values within allowed range
234
        while (it.is_valid()) {
18,939,066✔
235
            if (it.matches()) {
17,261,493✔
236
                result.push_back(it.get());
560,667✔
237
            }
560,667✔
238
            ++it;
17,261,493✔
239
        }
17,261,493✔
240
        return result;
1,677,573✔
241
    }
1,677,573✔
242
    else {
1,016,376✔
243
        // it's a radix node
244
        size_t index = hash & (node.size() - 1);
1,016,376✔
245
        auto rot = node.get_as_ref_or_tagged(index);
1,016,376✔
246
        REALM_ASSERT(rot.is_ref());
1,016,376✔
247
        if (rot.get_as_ref() == 0) {
1,016,376✔
248
            // no subtree, return empty vector
249
            return result;
31,998✔
250
        }
31,998✔
251
        // descend into subtree
252
        Array subtree(node.get_alloc());
984,378✔
253
        subtree.set_parent(&node, index);
984,378✔
254
        subtree.init_from_parent();
984,378✔
255
        return hash_to_id(subtree, hash >> radix_node_consumes_bits, hash_size - radix_node_consumes_bits);
984,378✔
256
    }
1,016,376✔
257
}
2,693,949✔
258

259

260
enum positions { Pos_Version, Pos_ColKey, Pos_Size, Pos_Compressor, Pos_Data, Pos_Map, Top_Size };
261
struct StringInterner::DataLeaf {
262
    std::vector<CompressedStringView> m_compressed;
263
    ref_type m_leaf_ref = 0;
264
    bool m_is_loaded = false;
265
    DataLeaf() {}
859,308✔
266
    DataLeaf(ref_type ref)
267
        : m_leaf_ref(ref)
NEW
268
    {
×
NEW
269
    }
×
270
};
271

272
StringInterner::StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable)
273
    : m_parent(parent)
4,130,670✔
274
{
7,452,471✔
275
    REALM_ASSERT_DEBUG(col_key != ColKey());
7,452,471✔
276
    size_t index = col_key.get_index().val;
7,452,471✔
277
    // ensure that m_top and m_data is well defined and reflect any existing data
278
    // We'll have to extend this to handle no defined backing
279
    m_top = std::make_unique<Array>(alloc);
7,452,471✔
280
    m_top->set_parent(&parent, index);
7,452,471✔
281
    m_data = std::make_unique<Array>(alloc);
7,452,471✔
282
    m_data->set_parent(m_top.get(), Pos_Data);
7,452,471✔
283
    m_hash_map = std::make_unique<Array>(alloc);
7,452,471✔
284
    m_hash_map->set_parent(m_top.get(), Pos_Map);
7,452,471✔
285
    m_current_string_leaf = std::make_unique<ArrayUnsigned>(alloc);
7,452,471✔
286
    m_col_key = col_key;
7,452,471✔
287
    update_from_parent(writable);
7,452,471✔
288
}
7,452,471✔
289

290
void StringInterner::update_from_parent(bool writable)
291
{
11,974,914✔
292
    auto parent_idx = m_top->get_ndx_in_parent();
11,974,914✔
293
    bool valid_top_ref_spot = m_parent.is_attached() && parent_idx < m_parent.size();
11,974,914✔
294
    bool valid_top = valid_top_ref_spot && m_parent.get_as_ref(parent_idx);
11,974,914✔
295
    if (valid_top) {
11,974,914✔
296
        m_top->update_from_parent();
11,235,318✔
297
        m_data->update_from_parent();
11,235,318✔
298
        m_hash_map->update_from_parent();
11,235,318✔
299
    }
11,235,318✔
300
    else if (writable && valid_top_ref_spot) {
739,596✔
301
        m_top->create(NodeHeader::type_HasRefs, false, Top_Size, 0);
728,634✔
302
        m_top->set(Pos_Version, (1 << 1) + 1); // version number 1.
728,634✔
303
        m_top->set(Pos_Size, (0 << 1) + 1);    // total size 0
728,634✔
304
        m_top->set(Pos_ColKey, (m_col_key.value << 1) + 1);
728,634✔
305
        m_top->set(Pos_Compressor, 0);
728,634✔
306
        // create first level of data tree here (to simplify other stuff)
307
        m_data = std::make_unique<Array>(m_parent.get_alloc());
728,634✔
308
        m_data->set_parent(m_top.get(), Pos_Data);
728,634✔
309
        m_data->create(NodeHeader::type_HasRefs, false, 0);
728,634✔
310
        m_data->update_parent();
728,634✔
311
        m_hash_map = std::make_unique<Array>(m_parent.get_alloc());
728,634✔
312
        m_hash_map->set_parent(m_top.get(), Pos_Map);
728,634✔
313
        m_hash_map->create(NodeHeader::type_Normal);
728,634✔
314
        m_hash_map->update_parent();
728,634✔
315
        m_top->update_parent();
728,634✔
316
        valid_top = true;
728,634✔
317
    }
728,634✔
318
    if (!valid_top) {
11,974,914✔
319
        // We're lacking part of underlying data and not allowed to create it, so enter "dead" mode
NEW
320
        m_compressor.reset();
×
NEW
321
        m_compressed_leafs.clear();
×
322
        // m_compressed_string_map.clear();
NEW
323
        m_top->detach(); // <-- indicates "dead" mode
×
NEW
324
        m_data->detach();
×
NEW
325
        m_hash_map->detach();
×
NEW
326
        m_compressor.reset();
×
NEW
327
        return;
×
NEW
328
    }
×
329
    // validate we're accessing data for the correct column. A combination of column erase
330
    // and insert could lead to an interner being paired with wrong data in the file.
331
    // If so, we clear internal data forcing rebuild_internal() to rebuild from scratch.
332
    int64_t data_colkey = m_top->get_as_ref_or_tagged(Pos_ColKey).get_as_int();
11,974,914✔
333
    if (m_col_key.value != data_colkey) {
11,974,914✔
334
        // new column, new data
335
        m_compressor.reset();
258✔
336
        m_decompressed_strings.clear();
258✔
337
    }
258✔
338
    if (!m_compressor)
11,974,914✔
339
        m_compressor = std::make_unique<StringCompressor>(m_top->get_alloc(), *m_top, Pos_Compressor, writable);
7,449,351✔
340
    else
4,525,563✔
341
        m_compressor->refresh(writable);
4,525,563✔
342
    if (m_data->size()) {
11,974,914✔
343
        auto ref_to_write_buffer = m_data->get_as_ref(m_data->size() - 1);
1,126,521✔
344
        const char* header = m_top->get_alloc().translate(ref_to_write_buffer);
1,126,521✔
345
        bool is_array_of_cprs = NodeHeader::get_hasrefs_from_header(header);
1,126,521✔
346
        if (is_array_of_cprs) {
1,126,521✔
347
            m_current_long_string_node = std::make_unique<Array>(m_top->get_alloc());
1,137✔
348
            m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1);
1,137✔
349
            m_current_long_string_node->update_from_parent();
1,137✔
350
        }
1,137✔
351
        else {
1,125,384✔
352
            m_current_long_string_node.reset();
1,125,384✔
353
        }
1,125,384✔
354
    }
1,126,521✔
355
    else
10,848,393✔
356
        m_current_long_string_node.reset(); // just in case...
10,848,393✔
357

358
    // rebuild internal structures......
359
    rebuild_internal();
11,974,914✔
360
    m_current_string_leaf->detach();
11,974,914✔
361
}
11,974,914✔
362

363
void StringInterner::rebuild_internal()
364
{
11,976,861✔
365
    std::lock_guard lock(m_mutex);
11,976,861✔
366
    // release old decompressed strings
367
    for (size_t idx = 0; idx < m_in_memory_strings.size(); ++idx) {
16,217,691✔
368
        StringID id = m_in_memory_strings[idx];
4,240,830✔
369
        if (id > m_decompressed_strings.size()) {
4,240,830✔
NEW
370
            m_in_memory_strings[idx] = m_in_memory_strings.back();
×
NEW
371
            m_in_memory_strings.pop_back();
×
NEW
372
            continue;
×
NEW
373
        }
×
374
        if (auto& w = m_decompressed_strings[id - 1].m_weight) {
4,240,830✔
375
            w >>= 1;
3,755,322✔
376
        }
3,755,322✔
377
        else {
485,508✔
378
            m_decompressed_strings[id - 1].m_decompressed.reset();
485,508✔
379
            m_in_memory_strings[idx] = m_in_memory_strings.back();
485,508✔
380
            m_in_memory_strings.pop_back();
485,508✔
381
            continue;
485,508✔
382
        }
485,508✔
383
    }
4,240,830✔
384

385
    size_t target_size = (size_t)m_top->get_as_ref_or_tagged(Pos_Size).get_as_int();
11,976,861✔
386
    m_decompressed_strings.resize(target_size);
11,976,861✔
387
    if (m_data->size() != m_compressed_leafs.size()) {
11,976,861✔
388
        m_compressed_leafs.resize(m_data->size());
806,490✔
389
    }
806,490✔
390
    // allways force new setup of all leafs:
391
    // update m_compressed_leafs to reflect m_data
392
    for (size_t idx = 0; idx < m_compressed_leafs.size(); ++idx) {
13,212,699✔
393
        auto ref = m_data->get_as_ref(idx);
1,235,838✔
394
        auto& leaf_meta = m_compressed_leafs[idx];
1,235,838✔
395
        // if (ref != leaf_meta.m_leaf_ref) {
396
        leaf_meta.m_is_loaded = false;
1,235,838✔
397
        leaf_meta.m_compressed.clear();
1,235,838✔
398
        leaf_meta.m_leaf_ref = ref;
1,235,838✔
399
        //}
400
    }
1,235,838✔
401
}
11,976,861✔
402

403
StringInterner::~StringInterner() {}
7,452,909✔
404

405
StringID StringInterner::intern(StringData sd)
406
{
1,852,974✔
407
    REALM_ASSERT(m_top->is_attached());
1,852,974✔
408
    std::lock_guard lock(m_mutex);
1,852,974✔
409
    // special case for null string
410
    if (sd.data() == nullptr)
1,852,974✔
411
        return 0;
180,321✔
412
    uint32_t h = (uint32_t)sd.hash();
1,672,653✔
413
    auto candidates = hash_to_id(*m_hash_map.get(), h, 32);
1,672,653✔
414
    for (auto& candidate : candidates) {
1,672,653✔
415
        auto candidate_cpr = get_compressed(candidate);
526,212✔
416
        if (m_compressor->compare(sd, candidate_cpr) == 0)
526,212✔
417
            return candidate;
526,209✔
418
    }
526,212✔
419
    // it's a new string
420
    bool learn = true;
1,146,444✔
421
    auto c_str = m_compressor->compress(sd, learn);
1,146,444✔
422
    m_decompressed_strings.push_back({64, std::make_unique<std::string>(sd)});
1,146,444✔
423
    auto id = m_decompressed_strings.size();
1,146,444✔
424
    m_in_memory_strings.push_back(id);
1,146,444✔
425
    add_to_hash_map(*m_hash_map.get(), h, id, 32);
1,146,444✔
426
    size_t index = (size_t)m_top->get_as_ref_or_tagged(Pos_Size).get_as_int();
1,146,444✔
427
    REALM_ASSERT_DEBUG(index == id - 1);
1,146,444✔
428
    bool need_long_string_node = c_str.size() >= 65536;
1,146,444✔
429

430
    // TODO: update_internal must set up m_current_long_string_node if it is in use
431

432
    if (need_long_string_node && !m_current_long_string_node) {
1,146,444✔
433
        if ((index & 0xFF) == 0) {
90✔
434
            // if we're starting on a new leaf, extend parent array for it
435
            m_data->add(0);
30✔
436
            m_compressed_leafs.push_back({});
30✔
437
            m_current_long_string_node = std::make_unique<Array>(m_top->get_alloc());
30✔
438
            m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1);
30✔
439
            m_current_long_string_node->create(NodeHeader::type_HasRefs);
30✔
440
            m_current_long_string_node->update_parent();
30✔
441
            REALM_ASSERT_DEBUG(!m_current_string_leaf->is_attached() || m_current_string_leaf->size() == 0);
30✔
442
            m_current_string_leaf->detach();
30✔
443
        }
30✔
444
        else {
60✔
445
            // we have been building an existing leaf and need to shift representation.
446
            // but first we need to update leaf accessor for existing leaf
447
            if (m_current_string_leaf->is_attached()) {
60✔
448
                m_current_string_leaf->update_from_parent();
60✔
449
            }
60✔
NEW
450
            else {
×
NEW
451
                m_current_string_leaf->init_from_ref(m_current_string_leaf->get_ref_from_parent());
×
NEW
452
            }
×
453
            REALM_ASSERT_DEBUG(m_current_string_leaf->size() > 0);
60✔
454
            m_current_long_string_node = std::make_unique<Array>(m_top->get_alloc());
60✔
455
            m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1);
60✔
456
            m_current_long_string_node->create(NodeHeader::type_HasRefs);
60✔
457
            m_current_long_string_node->update_parent();
60✔
458
            // convert the current leaf into a long string node. (array of strings in separate arrays)
459
            for (auto s : m_compressed_leafs.back().m_compressed) {
84✔
460
                ArrayUnsigned arr(m_top->get_alloc());
84✔
461
                arr.create(s.size, 65535);
84✔
462
                unsigned short* dest = reinterpret_cast<unsigned short*>(arr.m_data);
84✔
463
                std::copy_n(s.data, s.size, dest);
84✔
464
                m_current_long_string_node->add(arr.get_ref());
84✔
465
            }
84✔
466
            m_current_string_leaf->destroy();
60✔
467
            m_current_string_leaf->detach();
60✔
468
            // force later reload of leaf
469
            m_compressed_leafs.back().m_is_loaded = false;
60✔
470
            // m_compressed_leafs.back().m_leaf_ref = m_data->get_as_ref(m_data->size() - 1);
471
        }
60✔
472
    }
90✔
473
    if (m_current_long_string_node) {
1,146,444✔
474
        ArrayUnsigned arr(m_top->get_alloc());
2,526✔
475
        arr.create(c_str.size(), 65535);
2,526✔
476
        unsigned short* begin = c_str.data();
2,526✔
477
        if (begin) {
2,526✔
478
            // if the compressed string is empty, 'begin' is zero and we don't copy
479
            size_t n = c_str.size();
2,520✔
480
            unsigned short* dest = reinterpret_cast<unsigned short*>(arr.m_data);
2,520✔
481
            std::copy_n(begin, n, dest);
2,520✔
482
        }
2,520✔
483
        m_current_long_string_node->add(arr.get_ref());
2,526✔
484
        m_current_long_string_node->update_parent();
2,526✔
485
        if (m_current_long_string_node->size() == 256) {
2,526✔
486
            // exit from  "long string mode"
NEW
487
            m_current_long_string_node.reset();
×
NEW
488
        }
×
489
        CompressionSymbol* p_start = reinterpret_cast<CompressionSymbol*>(arr.m_data);
2,526✔
490
        m_compressed_leafs.back().m_compressed.push_back({p_start, arr.size()});
2,526✔
491
    }
2,526✔
492
    else {
1,143,918✔
493
        // Append to leaf with up to 256 entries.
494
        // First create a new leaf if needed (limit number of entries to 256 pr leaf)
495
        bool need_leaf_update = !m_current_string_leaf->is_attached() || (index & 0xFF) == 0;
1,143,918✔
496
        if (need_leaf_update) {
1,143,918✔
497
            m_current_string_leaf->set_parent(m_data.get(), index >> 8);
70,416✔
498
            if ((index & 0xFF) == 0) {
70,416✔
499
                // create new leaf
500
                m_current_string_leaf->create(0, 65535);
47,532✔
501
                m_data->add(m_current_string_leaf->get_ref());
47,532✔
502
                m_compressed_leafs.push_back({});
47,532✔
503
            }
47,532✔
504
            else {
22,884✔
505
                // just setup leaf accessor
506
                if (m_current_string_leaf->is_attached()) {
22,884✔
NEW
507
                    m_current_string_leaf->update_from_parent();
×
NEW
508
                }
×
509
                else {
22,884✔
510
                    m_current_string_leaf->init_from_ref(m_current_string_leaf->get_ref_from_parent());
22,884✔
511
                }
22,884✔
512
            }
22,884✔
513
        }
70,416✔
514
        REALM_ASSERT(c_str.size() < 65535);
1,143,918✔
515
        // Add compressed string at end of leaf
516
        m_current_string_leaf->add(c_str.size());
1,143,918✔
517
        for (auto c : c_str) {
69,728,502✔
518
            m_current_string_leaf->add(c);
69,728,502✔
519
        }
69,728,502✔
520
        REALM_ASSERT_DEBUG(m_compressed_leafs.size());
1,143,918✔
521
        CompressionSymbol* p = reinterpret_cast<CompressionSymbol*>(m_current_string_leaf->m_data);
1,143,918✔
522
        auto p_limit = p + m_current_string_leaf->size();
1,143,918✔
523
        auto p_start = p_limit - c_str.size();
1,143,918✔
524
        m_compressed_leafs.back().m_compressed.push_back({p_start, c_str.size()});
1,143,918✔
525
        REALM_ASSERT(m_compressed_leafs.back().m_compressed.size() <= 256);
1,143,918✔
526
    }
1,143,918✔
527
    m_top->adjust(Pos_Size, 2); // type is has_Refs, so increment is by 2
1,146,444✔
528
    load_leaf_if_new_ref(m_compressed_leafs.back(), m_data->get_as_ref(m_data->size() - 1));
1,146,444✔
529
#ifdef REALM_DEBUG
1,146,444✔
530
    auto csv = get_compressed(id);
1,146,444✔
531
    CompressedStringView csv2(c_str);
1,146,444✔
532
    REALM_ASSERT(csv == csv2);
1,146,444✔
533
#endif
1,146,444✔
534
    return id;
1,146,444✔
535
}
1,672,653✔
536

537
bool StringInterner::load_leaf_if_needed(DataLeaf& leaf)
538
{
3,010,488✔
539
    if (!leaf.m_is_loaded) {
3,010,488✔
540
        // start with an empty leaf:
541
        leaf.m_compressed.clear();
297,102✔
542
        leaf.m_compressed.reserve(256);
297,102✔
543

544
        // must interpret leaf first - the leaf is either a single array holding all strings,
545
        // or an array with each (compressed) string placed in its own array.
546
        const char* header = m_top->get_alloc().translate(leaf.m_leaf_ref);
297,102✔
547
        bool is_single_array = !NodeHeader::get_hasrefs_from_header(header);
297,102✔
548
        if (is_single_array) {
297,102✔
549
            size_t leaf_offset = 0;
296,094✔
550
            ArrayUnsigned leaf_array(m_top->get_alloc());
296,094✔
551
            leaf_array.init_from_ref(leaf.m_leaf_ref);
296,094✔
552
            REALM_ASSERT(NodeHeader::get_encoding(leaf_array.get_header()) == NodeHeader::Encoding::WTypBits);
296,094✔
553
            REALM_ASSERT(NodeHeader::get_width_from_header(leaf_array.get_header()) == 16);
296,094✔
554
            // This is dangerous if the leaf is for some reason not in the assumed format
555
            CompressionSymbol* c = reinterpret_cast<CompressionSymbol*>(leaf_array.m_data);
296,094✔
556
            auto leaf_size = leaf_array.size();
296,094✔
557
            while (leaf_offset < leaf_size) {
3,206,292✔
558
                size_t length = c[leaf_offset];
2,910,198✔
559
                REALM_ASSERT_DEBUG(length == leaf_array.get(leaf_offset));
2,910,198✔
560
                leaf_offset++;
2,910,198✔
561
                leaf.m_compressed.push_back({c + leaf_offset, length});
2,910,198✔
562
                REALM_ASSERT_DEBUG(leaf.m_compressed.size() <= 256);
2,910,198✔
563
                leaf_offset += length;
2,910,198✔
564
            }
2,910,198✔
565
        }
296,094✔
566
        else {
1,008✔
567
            // Not a single leaf - instead an array of strings
568
            Array arr(m_top->get_alloc());
1,008✔
569
            arr.init_from_ref(leaf.m_leaf_ref);
1,008✔
570
            for (size_t idx = 0; idx < arr.size(); ++idx) {
12,498✔
571
                ArrayUnsigned str_array(m_top->get_alloc());
11,490✔
572
                ref_type ref = arr.get_as_ref(idx);
11,490✔
573
                str_array.init_from_ref(ref);
11,490✔
574
                REALM_ASSERT(NodeHeader::get_encoding(str_array.get_header()) == NodeHeader::Encoding::WTypBits);
11,490✔
575
                REALM_ASSERT(NodeHeader::get_width_from_header(str_array.get_header()) == 16);
11,490✔
576
                CompressionSymbol* c = reinterpret_cast<CompressionSymbol*>(str_array.m_data);
11,490✔
577
                leaf.m_compressed.push_back({c, str_array.size()});
11,490✔
578
            }
11,490✔
579
        }
1,008✔
580
        leaf.m_is_loaded = true;
297,102✔
581
        return true;
297,102✔
582
    }
297,102✔
583
    return false;
2,713,386✔
584
}
3,010,488✔
585

586
// Danger: Only to be used if you know that a change in content ==> different ref
587
bool StringInterner::load_leaf_if_new_ref(DataLeaf& leaf, ref_type new_ref)
588
{
1,146,501✔
589
    if (leaf.m_leaf_ref != new_ref) {
1,146,501✔
590
        leaf.m_leaf_ref = new_ref;
95,373✔
591
        leaf.m_is_loaded = false;
95,373✔
592
        leaf.m_compressed.resize(0);
95,373✔
593
    }
95,373✔
594
    return load_leaf_if_needed(leaf);
1,146,501✔
595
}
1,146,501✔
596

597
CompressedStringView& StringInterner::get_compressed(StringID id)
598
{
1,863,990✔
599
    auto index = id - 1; // 0 represents null
1,863,990✔
600
    auto hi = index >> 8;
1,863,990✔
601
    auto lo = index & 0xFFUL;
1,863,990✔
602
    DataLeaf& leaf = m_compressed_leafs[hi];
1,863,990✔
603
    load_leaf_if_needed(leaf);
1,863,990✔
604
    REALM_ASSERT_DEBUG(lo < leaf.m_compressed.size());
1,863,990✔
605
    return leaf.m_compressed[lo];
1,863,990✔
606
}
1,863,990✔
607

608
std::optional<StringID> StringInterner::lookup(StringData sd)
609
{
63,126✔
610
    if (!m_top->is_attached()) {
63,126✔
611
        // "dead" mode
NEW
612
        return {};
×
NEW
613
    }
×
614
    std::lock_guard lock(m_mutex);
63,126✔
615
    if (sd.data() == nullptr)
63,126✔
616
        return 0;
26,112✔
617
    uint32_t h = (uint32_t)sd.hash();
37,014✔
618
    auto candidates = hash_to_id(*m_hash_map.get(), h, 32);
37,014✔
619
    for (auto& candidate : candidates) {
37,014✔
620
        auto candidate_cpr = get_compressed(candidate);
33,798✔
621
        if (m_compressor->compare(sd, candidate_cpr) == 0)
33,798✔
622
            return candidate;
33,798✔
623
    }
33,798✔
624
    return {};
3,216✔
625
}
37,014✔
626

627
int StringInterner::compare(StringID A, StringID B)
NEW
628
{
×
NEW
629
    std::lock_guard lock(m_mutex);
×
NEW
630
    REALM_ASSERT_DEBUG(A < m_decompressed_strings.size());
×
NEW
631
    REALM_ASSERT_DEBUG(B < m_decompressed_strings.size());
×
632
    // comparisons against null
NEW
633
    if (A == B && A == 0)
×
NEW
634
        return 0;
×
NEW
635
    if (A == 0)
×
NEW
636
        return -1;
×
NEW
637
    if (B == 0)
×
NEW
638
        return 1;
×
639
    // ok, no nulls.
NEW
640
    REALM_ASSERT(m_compressor);
×
NEW
641
    return m_compressor->compare(get_compressed(A), get_compressed(B));
×
NEW
642
}
×
643

644
int StringInterner::compare(StringData s, StringID A)
NEW
645
{
×
NEW
646
    std::lock_guard lock(m_mutex);
×
NEW
647
    REALM_ASSERT_DEBUG(A < m_decompressed_strings.size());
×
648
    // comparisons against null
NEW
649
    if (s.data() == nullptr && A == 0)
×
NEW
650
        return 0;
×
NEW
651
    if (s.data() == nullptr)
×
NEW
652
        return 1;
×
NEW
653
    if (A == 0)
×
NEW
654
        return -1;
×
655
    // ok, no nulls
NEW
656
    REALM_ASSERT(m_compressor);
×
NEW
657
    return m_compressor->compare(s, get_compressed(A));
×
NEW
658
}
×
659

660

661
StringData StringInterner::get(StringID id)
662
{
1,404,327✔
663
    REALM_ASSERT(m_compressor);
1,404,327✔
664
    std::lock_guard lock(m_mutex);
1,404,327✔
665
    if (id == 0)
1,404,327✔
666
        return StringData{nullptr};
96,954✔
667
    REALM_ASSERT_DEBUG(id <= m_decompressed_strings.size());
1,307,373✔
668
    CachedString& cs = m_decompressed_strings[id - 1];
1,307,373✔
669
    if (cs.m_decompressed) {
1,307,373✔
670
        std::string* ref_str = cs.m_decompressed.get();
1,149,780✔
671
        if (cs.m_weight < 128)
1,149,780✔
672
            cs.m_weight += 64;
149,256✔
673
        return {ref_str->c_str(), ref_str->size()};
1,149,780✔
674
    }
1,149,780✔
675
    cs.m_weight = 64;
157,593✔
676
    cs.m_decompressed = std::make_unique<std::string>(m_compressor->decompress(get_compressed(id)));
157,593✔
677
    m_in_memory_strings.push_back(id);
157,593✔
678
    return {cs.m_decompressed->c_str(), cs.m_decompressed->size()};
157,593✔
679
}
1,307,373✔
680

681
} // namespace realm
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc