realm / realm-core / jorgen.edelbo_338

03 Jul 2024 03:00PM UTC · coverage: 90.856% (-0.008%) from 90.864%
Pull Request #7803: Feature/string compression (Evergreen)
nicola-cab: Merge branch 'next-major' into feature/string-compression

103028 of 180606 branches covered (57.05%)
1144 of 1267 new or added lines in 33 files covered (90.29%)
155 existing lines in 24 files now uncovered
218583 of 240583 relevant lines covered (90.86%)
7959624.7 hits per line

Source file: /src/realm/string_interner.cpp · file coverage: 88.62%

/*************************************************************************
 *
 * Copyright 2016 Realm Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **************************************************************************/

#include <realm/string_interner.hpp>
#include <realm/string_compressor.hpp>
#include <realm/string_data.hpp>
#include <string_view>

namespace realm {

// Fast mapping of strings (or rather hash of strings) to string IDs.
//
// We use a tree where:
// * All interior nodes are radix nodes with a fan-out of 256.
// * Leaf nodes with up to 16 entries are just lists, searched linearly
// * Leaf nodes with more than 16 entries and less than 1K are hash tables.
//   Hash tables use linear search starting from the entry found by hashing.
//
constexpr static size_t linear_search_limit = 16;
constexpr static size_t hash_node_min_size = 32;
constexpr static size_t hash_node_max_size = 1024;
constexpr static size_t radix_node_consumes_bits = 8;
constexpr static size_t radix_node_size = 1ULL << radix_node_consumes_bits;
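
// Example: with the values above, a radix node indexes on the low 8 bits of the
// remaining hash (hash & 0xFF) and recurses with (hash >> 8) and (hash_size - 8),
// so a 32-bit hash can pass through at most four radix levels before reaching a leaf.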

// helpers
struct HashMapIter {
    Array& m_array;
    uint32_t hash_filter;
    uint16_t index;
    uint16_t left_to_search;
    uint8_t hash_size;
    HashMapIter(Array& array, uint32_t hash, uint8_t hash_size)
        : m_array(array)
        , hash_filter(hash)
        , hash_size(hash_size)
    {
        set_index(0);
    }
    HashMapIter(Array& dummy)
        : m_array(dummy)
    {
        left_to_search = 0;
    }
    inline uint32_t get()
    {
        return (uint32_t)(m_array.get(index) >> hash_size);
    }
    inline bool empty()
    {
        auto element = m_array.get(index);
        return (element >> hash_size) == 0;
    }
    inline void set(uint64_t element)
    {
        m_array.set(index, element);
    }
    inline bool matches()
    {
        auto mask = 0xFFFFFFFFUL >> (32 - hash_size);
        auto element = m_array.get(index);
        return ((element & mask) == hash_filter) && (element >> hash_size);
    }
    inline bool is_valid()
    {
        return left_to_search != 0;
    }
    inline void set_index(size_t i, size_t search_limit = linear_search_limit)
    {
        index = (uint16_t)i;
        left_to_search = (uint16_t)std::min(m_array.size(), (size_t)search_limit);
    }
    void operator++()
    {
        if (is_valid()) {
            left_to_search--;
            index++;
            if (index == m_array.size()) {
                index = 0;
            }
        }
    }
};
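
// Each slot in a leaf packs ((id << hash_size) | hash_bits); an id of 0 marks an
// empty slot, which is what empty() and matches() test by shifting right by hash_size.
// Probing wraps around the array but is bounded by left_to_search, so a scan never
// visits more than linear_search_limit slots.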

// Attempt to build a hash leaf from a smaller hash leaf or a non-hash leaf.
static bool rehash(Array& from, Array& to, uint8_t hash_size)
{
    REALM_ASSERT_DEBUG(from.size() * 2 <= to.size());

    for (size_t i = 0; i < from.size(); ++i) {
        auto entry = (size_t)from.get(i);
        if ((entry >> hash_size) == 0)
            continue;
        size_t starting_index = entry & (to.size() - 1);
        HashMapIter it(to, 0, hash_size);
        it.set_index(starting_index);
        while (it.is_valid() && !it.empty()) {
            ++it;
        }
        if (!it.is_valid()) {
            // abort rehashing, we need a larger to-space
            return false;
        }
        REALM_ASSERT(it.empty());
        it.set(entry);
    }
    return true;
}
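
// Note on rehash(): it returns false when an entry cannot be placed within the bounded
// probe range of the destination table; the caller then retries with a table of twice
// the size, or gives up and splits the leaf into a radix node.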

// Add a binding from hash value to id.
static void add_to_hash_map(Array& node, uint64_t hash, uint64_t id, uint8_t hash_size)
{
    REALM_ASSERT(node.is_attached());
    if (!node.has_refs()) {
        // it's a leaf.
        if (node.size() < linear_search_limit) {
            // it's a list with room to grow
            node.add(((uint64_t)id << hash_size) | hash);
            return;
        }
        if (node.size() == linear_search_limit) {
            // it's a full list, must be converted to a hash table
            Array new_node(node.get_alloc());
            new_node.create(NodeHeader::type_Normal, false, hash_node_min_size, 0);
            new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
            new_node.update_parent();
            // transform existing list into hash table
            rehash(node, new_node, hash_size);
            node.destroy();
            node.init_from_parent();
        }
        // it's a hash table. Grow if needed up to 'hash_node_max_size' entries
        while (node.size() < hash_node_max_size) {
            auto size = node.size();
            size_t start_index = hash & (size - 1);
            HashMapIter it(node, 0, hash_size);
            it.set_index(start_index);
            while (it.is_valid() && !it.empty()) {
                ++it;
            }
            if (it.is_valid()) {
                // found an empty spot within search range
                it.set(((uint64_t)id << hash_size) | hash);
                return;
            }
            if (node.size() >= hash_node_max_size)
                break;
            // No free spot found - rehash into bigger and bigger tables
            auto new_size = node.size();
            bool need_to_rehash = true;
            Array new_node(node.get_alloc());
            while (need_to_rehash && new_size < hash_node_max_size) {
                new_size *= 2;
                new_node.create(NodeHeader::type_Normal, false, new_size, 0);
                need_to_rehash = !rehash(node, new_node, hash_size);
                if (need_to_rehash) { // we failed, try again - or shift to radix
                    // Counter-intuitive, but it CAN happen.
                    new_node.destroy();
                }
            }
            if (need_to_rehash)
                break;
            new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
            new_node.update_parent();
            node.destroy();
            node.init_from_parent();
        }
        // we ran out of space. Rewrite as a radix node with subtrees
        Array new_node(node.get_alloc());
        new_node.create(NodeHeader::type_HasRefs, false, radix_node_size, 0);
        new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
        new_node.update_parent();
        for (size_t index = 0; index < node.size(); ++index) {
            auto element = node.get(index);
            auto hash = element & (0xFFFFFFFF >> (32 - hash_size));
            auto string_id = element >> hash_size;
            if (string_id == 0)
                continue;
            auto remaining_hash = hash >> radix_node_consumes_bits;
            add_to_hash_map(new_node, remaining_hash, string_id, hash_size - 8);
        }
        node.destroy();
        node.init_from_parent();
    }
    // We have a radix node and need to insert the new binding into the proper subtree
    size_t index = hash & (radix_node_size - 1);
    auto rot = node.get_as_ref_or_tagged(index);
    REALM_ASSERT(!rot.is_tagged());
    Array subtree(node.get_alloc());
    if (rot.get_as_ref() == 0) {
        // no subtree present, create an empty one
        subtree.set_parent(&node, index);
        subtree.create(NodeHeader::type_Normal);
        subtree.update_parent();
    }
    else {
        // subtree already present
        subtree.set_parent(&node, index);
        subtree.init_from_parent();
    }
    // recurse into subtree
    add_to_hash_map(subtree, hash >> radix_node_consumes_bits, id, hash_size - radix_node_consumes_bits);
}
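
// Growth sequence for a leaf: a plain list up to linear_search_limit (16) entries,
// then a hash table doubling from hash_node_min_size (32) up to hash_node_max_size
// (1024) slots, and finally a split into a radix node whose 256 subtrees are keyed
// by the next 8 bits of the hash.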

static std::vector<uint32_t> hash_to_id(Array& node, uint32_t hash, uint8_t hash_size)
{
    std::vector<uint32_t> result;
    REALM_ASSERT(node.is_attached());
    if (!node.has_refs()) {
        // it's a leaf - default is a list, search starts from index 0.
        HashMapIter it(node, hash, hash_size);
        if (node.size() > hash_node_min_size) {
            // it is a hash table, so use hash to select index to start searching
            // table size must be power of two!
            size_t index = hash & (node.size() - 1);
            it.set_index(index);
        }
        // collect all matching values within allowed range
        while (it.is_valid()) {
            if (it.matches()) {
                result.push_back(it.get());
            }
            ++it;
        }
        return result;
    }
    else {
        // it's a radix node
        size_t index = hash & (node.size() - 1);
        auto rot = node.get_as_ref_or_tagged(index);
        REALM_ASSERT(rot.is_ref());
        if (rot.get_as_ref() == 0) {
            // no subtree, return empty vector
            return result;
        }
        // descend into subtree
        Array subtree(node.get_alloc());
        subtree.set_parent(&node, index);
        subtree.init_from_parent();
        return hash_to_id(subtree, hash >> radix_node_consumes_bits, hash_size - radix_node_consumes_bits);
    }
}
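
// Note: only the remaining hash bits are stored per entry, so distinct strings can
// produce the same candidate list. Callers (intern/lookup below) must verify each
// candidate id against the actual string via StringCompressor::compare.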


enum positions { Pos_Version, Pos_ColKey, Pos_Size, Pos_Compressor, Pos_Data, Pos_Map, Top_Size };
struct StringInterner::DataLeaf {
    std::vector<CompressedStringView> m_compressed;
    ref_type m_leaf_ref = 0;
    bool m_is_loaded = false;
    DataLeaf() {}
    DataLeaf(ref_type ref)
        : m_leaf_ref(ref)
    {
    }
};
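
// A DataLeaf is a lazily loaded view of one leaf in the m_data tree: m_leaf_ref
// identifies the underlying leaf, and m_compressed is only populated (and m_is_loaded
// set) once load_leaf_if_needed() has parsed it. The enum above names the slots of
// m_top: tagged ints for version, column key and total string count, a slot used by
// the StringCompressor, and refs to the data tree and the hash map.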

StringInterner::StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable)
    : m_parent(parent)
    , m_top(alloc)
    , m_data(alloc)
    , m_hash_map(alloc)
    , m_current_string_leaf(alloc)
    , m_current_long_string_node(alloc)
{
    REALM_ASSERT_DEBUG(col_key != ColKey());
    size_t index = col_key.get_index().val;
    // ensure that m_top and m_data are well defined and reflect any existing data
    // We'll have to extend this to handle no defined backing
    m_top.set_parent(&parent, index);
    m_data.set_parent(&m_top, Pos_Data);
    m_hash_map.set_parent(&m_top, Pos_Map);
    m_col_key = col_key;
    update_from_parent(writable);
}

void StringInterner::update_from_parent(bool writable)
{
    auto parent_idx = m_top.get_ndx_in_parent();
    bool valid_top_ref_spot = m_parent.is_attached() && parent_idx < m_parent.size();
    bool valid_top = valid_top_ref_spot && m_parent.get_as_ref(parent_idx);
    if (valid_top) {
        m_top.update_from_parent();
        m_data.update_from_parent();
        m_hash_map.update_from_parent();
    }
    else if (writable && valid_top_ref_spot) {
        m_top.create(NodeHeader::type_HasRefs, false, Top_Size, 0);
        m_top.set(Pos_Version, (1 << 1) + 1); // version number 1.
        m_top.set(Pos_Size, (0 << 1) + 1);    // total size 0
        m_top.set(Pos_ColKey, (m_col_key.value << 1) + 1);
        m_top.set(Pos_Compressor, 0);

        // create first level of data tree here (to simplify other stuff)
        m_data.create(NodeHeader::type_HasRefs, false, 0);
        m_data.update_parent();

        m_hash_map.create(NodeHeader::type_Normal);
        m_hash_map.update_parent();
        m_top.update_parent();
        valid_top = true;
    }
    if (!valid_top) {
        // We're lacking part of underlying data and not allowed to create it, so enter "dead" mode
        m_compressor.reset();
        m_compressed_leafs.clear();
        // m_compressed_string_map.clear();
        m_top.detach();
        m_data.detach();
        m_hash_map.detach();
        m_compressor.reset();
        return;
    }
    // validate we're accessing data for the correct column. A combination of column erase
    // and insert could lead to an interner being paired with wrong data in the file.
    // If so, we clear internal data forcing rebuild_internal() to rebuild from scratch.
    int64_t data_colkey = m_top.get_as_ref_or_tagged(Pos_ColKey).get_as_int();
    if (m_col_key.value != data_colkey) {
        // new column, new data
        m_compressor.reset();
        m_decompressed_strings.clear();
    }
    if (!m_compressor)
        m_compressor = std::make_unique<StringCompressor>(m_top.get_alloc(), m_top, Pos_Compressor, writable);
    else
        m_compressor->refresh(writable);
    if (m_data.size()) {
        auto ref_to_write_buffer = m_data.get_as_ref(m_data.size() - 1);
        const char* header = m_top.get_alloc().translate(ref_to_write_buffer);
        bool is_array_of_cprs = NodeHeader::get_hasrefs_from_header(header);
        if (is_array_of_cprs) {
            m_current_long_string_node.set_parent(&m_data, m_data.size() - 1);
            m_current_long_string_node.update_from_parent();
        }
        else {
            m_current_long_string_node.detach();
        }
    }
    else
        m_current_long_string_node.detach(); // just in case...

    // rebuild internal structures......
    rebuild_internal();
    m_current_string_leaf.detach();
}

void StringInterner::rebuild_internal()
{
    std::lock_guard lock(m_mutex);
    // release old decompressed strings
    for (size_t idx = 0; idx < m_in_memory_strings.size(); ++idx) {
        StringID id = m_in_memory_strings[idx];
        if (id > m_decompressed_strings.size()) {
            m_in_memory_strings[idx] = m_in_memory_strings.back();
            m_in_memory_strings.pop_back();
            continue;
        }
        if (auto& w = m_decompressed_strings[id - 1].m_weight) {
            w >>= 1;
        }
        else {
            m_decompressed_strings[id - 1].m_decompressed.reset();
            m_in_memory_strings[idx] = m_in_memory_strings.back();
            m_in_memory_strings.pop_back();
            continue;
        }
    }

    size_t target_size = (size_t)m_top.get_as_ref_or_tagged(Pos_Size).get_as_int();
    m_decompressed_strings.resize(target_size);
    if (m_data.size() != m_compressed_leafs.size()) {
        m_compressed_leafs.resize(m_data.size());
    }
    // always force new setup of all leafs:
    // update m_compressed_leafs to reflect m_data
    for (size_t idx = 0; idx < m_compressed_leafs.size(); ++idx) {
        auto ref = m_data.get_as_ref(idx);
        auto& leaf_meta = m_compressed_leafs[idx];
        leaf_meta.m_is_loaded = false;
        leaf_meta.m_compressed.clear();
        leaf_meta.m_leaf_ref = ref;
    }
}
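
// Cache aging: each rebuild halves the weight of every cached decompressed string and
// releases those whose weight has already reached zero, while get() bumps the weight
// (capped below 128) on every hit - so strings read frequently across commits stay
// decompressed, and rarely used ones are gradually evicted.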

StringInterner::~StringInterner() {}

StringID StringInterner::intern(StringData sd)
{
    REALM_ASSERT(m_top.is_attached());
    std::lock_guard lock(m_mutex);
    // special case for null string
    if (sd.data() == nullptr)
        return 0;
    uint32_t h = (uint32_t)sd.hash();
    auto candidates = hash_to_id(m_hash_map, h, 32);
    for (auto& candidate : candidates) {
        auto candidate_cpr = get_compressed(candidate);
        if (m_compressor->compare(sd, candidate_cpr) == 0)
            return candidate;
    }
    // it's a new string
    bool learn = true;
    auto c_str = m_compressor->compress(sd, learn);
    m_decompressed_strings.push_back({64, std::make_unique<std::string>(sd)});
    auto id = m_decompressed_strings.size();
    m_in_memory_strings.push_back(id);
    add_to_hash_map(m_hash_map, h, id, 32);
    size_t index = (size_t)m_top.get_as_ref_or_tagged(Pos_Size).get_as_int();
    REALM_ASSERT_DEBUG(index == id - 1);
    bool need_long_string_node = c_str.size() >= 65536;

    // TODO: update_internal must set up m_current_long_string_node if it is in use
    if (need_long_string_node && !m_current_long_string_node.is_attached()) {

        m_current_long_string_node.create(NodeHeader::type_HasRefs);

        if ((index & 0xFF) == 0) {
            // if we're starting on a new leaf, extend parent array for it
            m_data.add(0);
            m_compressed_leafs.push_back({});
            m_current_long_string_node.set_parent(&m_data, m_data.size() - 1);
            m_current_long_string_node.update_parent();
            REALM_ASSERT_DEBUG(!m_current_string_leaf.is_attached() || m_current_string_leaf.size() == 0);
            m_current_string_leaf.detach();
        }
        else {
            // we have been building an existing leaf and need to shift representation.
            // but first we need to update leaf accessor for existing leaf
            if (m_current_string_leaf.is_attached()) {
                m_current_string_leaf.update_from_parent();
            }
            else {
                m_current_string_leaf.init_from_ref(m_current_string_leaf.get_ref_from_parent());
            }
            REALM_ASSERT_DEBUG(m_current_string_leaf.size() > 0);
            m_current_long_string_node.set_parent(&m_data, m_data.size() - 1);
            m_current_long_string_node.update_parent();
            // convert the current leaf into a long string node. (array of strings in separate arrays)
            for (auto s : m_compressed_leafs.back().m_compressed) {
                ArrayUnsigned arr(m_top.get_alloc());
                arr.create(s.size, 65535);
                unsigned short* dest = reinterpret_cast<unsigned short*>(arr.m_data);
                std::copy_n(s.data, s.size, dest);
                m_current_long_string_node.add(arr.get_ref());
            }
            m_current_string_leaf.destroy();
            // force later reload of leaf
            m_compressed_leafs.back().m_is_loaded = false;
        }
    }
    if (m_current_long_string_node.is_attached()) {
        ArrayUnsigned arr(m_top.get_alloc());
        arr.create(c_str.size(), 65535);
        unsigned short* begin = c_str.data();
        if (begin) {
            // if the compressed string is empty, 'begin' is zero and we don't copy
            size_t n = c_str.size();
            unsigned short* dest = reinterpret_cast<unsigned short*>(arr.m_data);
            std::copy_n(begin, n, dest);
        }
        m_current_long_string_node.add(arr.get_ref());
        m_current_long_string_node.update_parent();
        if (m_current_long_string_node.size() == 256) {
            // exit from "long string mode"
            m_current_long_string_node.detach();
        }
        CompressionSymbol* p_start = reinterpret_cast<CompressionSymbol*>(arr.m_data);
        m_compressed_leafs.back().m_compressed.push_back({p_start, arr.size()});
    }
    else {
        // Append to leaf with up to 256 entries.
        // First create a new leaf if needed (limit number of entries to 256 per leaf)
        bool need_leaf_update = !m_current_string_leaf.is_attached() || (index & 0xFF) == 0;
        if (need_leaf_update) {
            m_current_string_leaf.set_parent(&m_data, index >> 8);
            if ((index & 0xFF) == 0) {
                // create new leaf
                m_current_string_leaf.create(0, 65535);
                m_data.add(m_current_string_leaf.get_ref());
                m_compressed_leafs.push_back({});
            }
            else {
                // just setup leaf accessor
                if (m_current_string_leaf.is_attached()) {
                    m_current_string_leaf.update_from_parent();
                }
                else {
                    m_current_string_leaf.init_from_ref(m_current_string_leaf.get_ref_from_parent());
                }
            }
        }
        REALM_ASSERT(c_str.size() < 65535);
        // Add compressed string at end of leaf
        m_current_string_leaf.add(c_str.size());
        for (auto c : c_str) {
            m_current_string_leaf.add(c);
        }
        REALM_ASSERT_DEBUG(m_compressed_leafs.size());
        CompressionSymbol* p = reinterpret_cast<CompressionSymbol*>(m_current_string_leaf.m_data);
        auto p_limit = p + m_current_string_leaf.size();
        auto p_start = p_limit - c_str.size();
        m_compressed_leafs.back().m_compressed.push_back({p_start, c_str.size()});
        REALM_ASSERT(m_compressed_leafs.back().m_compressed.size() <= 256);
    }
    m_top.adjust(Pos_Size, 2); // type is has_Refs, so increment is by 2
    load_leaf_if_new_ref(m_compressed_leafs.back(), m_data.get_as_ref(m_data.size() - 1));
#ifdef REALM_DEBUG
    auto csv = get_compressed(id);
    CompressedStringView csv2(c_str);
    REALM_ASSERT(csv == csv2);
#endif
    return id;
}
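
// Compressed strings live in one of two leaf formats: normally a single 16-bit array
// holding up to 256 strings back to back, but once a single compressed string reaches
// 65536 symbols the current leaf is converted to a "long string node" that holds a
// separate ref (and array) per string. load_leaf_if_needed() below handles both.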

bool StringInterner::load_leaf_if_needed(DataLeaf& leaf)
{
    if (!leaf.m_is_loaded) {
        // start with an empty leaf:
        leaf.m_compressed.clear();
        leaf.m_compressed.reserve(256);

        // must interpret leaf first - the leaf is either a single array holding all strings,
        // or an array with each (compressed) string placed in its own array.
        const char* header = m_top.get_alloc().translate(leaf.m_leaf_ref);
        bool is_single_array = !NodeHeader::get_hasrefs_from_header(header);
        if (is_single_array) {
            size_t leaf_offset = 0;
            ArrayUnsigned leaf_array(m_top.get_alloc());
            leaf_array.init_from_ref(leaf.m_leaf_ref);
            REALM_ASSERT(NodeHeader::get_encoding(leaf_array.get_header()) == NodeHeader::Encoding::WTypBits);
            REALM_ASSERT(NodeHeader::get_width_from_header(leaf_array.get_header()) == 16);
            // This is dangerous if the leaf is for some reason not in the assumed format
            CompressionSymbol* c = reinterpret_cast<CompressionSymbol*>(leaf_array.m_data);
            auto leaf_size = leaf_array.size();
            while (leaf_offset < leaf_size) {
                size_t length = c[leaf_offset];
                REALM_ASSERT_DEBUG(length == leaf_array.get(leaf_offset));
                leaf_offset++;
                leaf.m_compressed.push_back({c + leaf_offset, length});
                REALM_ASSERT_DEBUG(leaf.m_compressed.size() <= 256);
                leaf_offset += length;
            }
        }
        else {
            // Not a single leaf - instead an array of strings
            Array arr(m_top.get_alloc());
            arr.init_from_ref(leaf.m_leaf_ref);
            for (size_t idx = 0; idx < arr.size(); ++idx) {
                ArrayUnsigned str_array(m_top.get_alloc());
                ref_type ref = arr.get_as_ref(idx);
                str_array.init_from_ref(ref);
                REALM_ASSERT(NodeHeader::get_encoding(str_array.get_header()) == NodeHeader::Encoding::WTypBits);
                REALM_ASSERT(NodeHeader::get_width_from_header(str_array.get_header()) == 16);
                CompressionSymbol* c = reinterpret_cast<CompressionSymbol*>(str_array.m_data);
                leaf.m_compressed.push_back({c, str_array.size()});
            }
        }
        leaf.m_is_loaded = true;
        return true;
    }
    return false;
}
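
// In the single-array format parsed above, each compressed string is stored as its
// length (one 16-bit symbol) followed by that many CompressionSymbols; the loop walks
// this sequence to rebuild the CompressedStringView entries without copying any data.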

// Danger: Only to be used if you know that a change in content ==> different ref
bool StringInterner::load_leaf_if_new_ref(DataLeaf& leaf, ref_type new_ref)
{
    if (leaf.m_leaf_ref != new_ref) {
        leaf.m_leaf_ref = new_ref;
        leaf.m_is_loaded = false;
        leaf.m_compressed.resize(0);
    }
    return load_leaf_if_needed(leaf);
}

CompressedStringView& StringInterner::get_compressed(StringID id)
{
    auto index = id - 1; // 0 represents null
    auto hi = index >> 8;
    auto lo = index & 0xFFUL;
    DataLeaf& leaf = m_compressed_leafs[hi];
    load_leaf_if_needed(leaf);
    REALM_ASSERT_DEBUG(lo < leaf.m_compressed.size());
    return leaf.m_compressed[lo];
}
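
// Ids are dense and assigned in interning order, so string (id - 1) is entry
// ((id - 1) & 0xFF) of data leaf ((id - 1) >> 8), i.e. 256 strings per leaf.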

std::optional<StringID> StringInterner::lookup(StringData sd)
{
    if (!m_top.is_attached()) {
        // "dead" mode
        return {};
    }
    std::lock_guard lock(m_mutex);
    if (sd.data() == nullptr)
        return 0;
    uint32_t h = (uint32_t)sd.hash();
    auto candidates = hash_to_id(m_hash_map, h, 32);
    for (auto& candidate : candidates) {
        auto candidate_cpr = get_compressed(candidate);
        if (m_compressor->compare(sd, candidate_cpr) == 0)
            return candidate;
    }
    return {};
}

int StringInterner::compare(StringID A, StringID B)
{
    std::lock_guard lock(m_mutex);
    REALM_ASSERT_DEBUG(A < m_decompressed_strings.size());
    REALM_ASSERT_DEBUG(B < m_decompressed_strings.size());
    // comparisons against null
    if (A == B && A == 0)
        return 0;
    if (A == 0)
        return -1;
    if (B == 0)
        return 1;
    // ok, no nulls.
    REALM_ASSERT(m_compressor);
    return m_compressor->compare(get_compressed(A), get_compressed(B));
}

int StringInterner::compare(StringData s, StringID A)
{
    std::lock_guard lock(m_mutex);
    REALM_ASSERT_DEBUG(A < m_decompressed_strings.size());
    // comparisons against null
    if (s.data() == nullptr && A == 0)
        return 0;
    if (s.data() == nullptr)
        return 1;
    if (A == 0)
        return -1;
    // ok, no nulls
    REALM_ASSERT(m_compressor);
    return m_compressor->compare(s, get_compressed(A));
}


StringData StringInterner::get(StringID id)
{
    REALM_ASSERT(m_compressor);
    std::lock_guard lock(m_mutex);
    if (id == 0)
        return StringData{nullptr};
    REALM_ASSERT_DEBUG(id <= m_decompressed_strings.size());
    CachedString& cs = m_decompressed_strings[id - 1];
    if (cs.m_decompressed) {
        if (cs.m_weight < 128)
            cs.m_weight += 64;
        return {cs.m_decompressed->c_str(), cs.m_decompressed->size()};
    }
    cs.m_weight = 64;
    cs.m_decompressed = std::make_unique<std::string>(m_compressor->decompress(get_compressed(id)));
    m_in_memory_strings.push_back(id);
    return {cs.m_decompressed->c_str(), cs.m_decompressed->size()};
}

} // namespace realm