• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / finn.schiermer-andersen_89

04 Jun 2024 02:04PM UTC coverage: 90.651% (-0.03%) from 90.685%
finn.schiermer-andersen_89

Pull #7654

Evergreen

finnschiermer
optimized string cache gc
Pull Request #7654: Fsa/string interning

102644 of 180648 branches covered (56.82%)

1005 of 1125 new or added lines in 15 files covered. (89.33%)

154 existing lines in 21 files now uncovered.

217953 of 240431 relevant lines covered (90.65%)

7671710.15 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.27
/src/realm/string_compressor.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2016 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <realm/string_compressor.hpp>
20
#include <realm/string_data.hpp>
21

22
#include <realm/array_unsigned.hpp>
23

24
#include <iostream>
25
namespace realm {
26

27
StringCompressor::StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable)
28
{
7,452,432✔
29
    m_compression_map.resize(16); // start with a very small compression map
7,452,432✔
30
    m_symbols.reserve(65536);
7,452,432✔
31
    m_data = std::make_unique<ArrayUnsigned>(alloc);
7,452,432✔
32
    m_data->set_parent(&parent, index);
7,452,432✔
33
    refresh(writable);
7,452,432✔
34
}
7,452,432✔
35

36
void StringCompressor::refresh(bool writable)
37
{
11,967,195✔
38
    // we assume that compressors are only created from a valid parent.
39
    // String interners in 'dead' mode should never instantiate a string compressor.
40
    if (m_data->get_ref_from_parent() == 0) {
11,967,195✔
41
        REALM_ASSERT(writable);
728,637✔
42
        m_data->create(0, 65535);
728,637✔
43
        m_data->update_parent();
728,637✔
44
    }
728,637✔
45
    else {
11,238,558✔
46
        if (m_data->is_attached())
11,238,558✔
47
            m_data->update_from_parent();
4,521,600✔
48
        else
6,716,958✔
49
            m_data->init_from_ref(m_data->get_ref_from_parent());
6,716,958✔
50
    }
11,238,558✔
51
    rebuild_internal();
11,967,195✔
52
}
11,967,195✔
53

54
static size_t symbol_pair_hash(CompressionSymbol a, CompressionSymbol b)
55
{
4,023,048,723✔
56
    // range of return value must match size of encoding table
57
    uint32_t tmp = a + 3;
4,023,048,723✔
58
    tmp *= b + 7;
4,023,048,723✔
59
    return (tmp ^ (tmp >> 16)) & 0xFFFF;
4,023,048,723✔
60
}
4,023,048,723✔
61

62
void StringCompressor::add_expansion(SymbolDef def)
63
{
83,529,141✔
64
    // compute expansion size:
65
    size_t exp_size = 0;
83,529,141✔
66
    if (def.expansion_a < 256)
83,529,141✔
67
        exp_size = 1;
27,544,833✔
68
    else
55,984,308✔
69
        exp_size = m_symbols[def.expansion_a - 256].expansion.size();
55,984,308✔
70
    if (def.expansion_b < 256)
83,529,141✔
71
        exp_size += 1;
28,553,514✔
72
    else
54,975,627✔
73
        exp_size += m_symbols[def.expansion_b - 256].expansion.size();
54,975,627✔
74
    // make sure there is room in active storage chunk:
75
    if (m_expansion_storage.size() == 0 || m_expansion_storage.back().size() + exp_size + 1 >= storage_chunk_size) {
83,529,141✔
76
        m_expansion_storage.push_back({});
596,517✔
77
        m_expansion_storage.back().reserve(storage_chunk_size);
596,517✔
78
    }
596,517✔
79
    // construct expansion at end of chunk:
80
    auto& chunk = m_expansion_storage.back();
83,529,141✔
81
    auto start_index = (uint32_t)chunk.size();
83,529,141✔
82
    if (def.expansion_a < 256)
83,529,141✔
83
        chunk.push_back((char)def.expansion_a);
27,539,040✔
84
    else
55,990,101✔
85
        chunk.append(m_symbols[def.expansion_a - 256].expansion);
55,990,101✔
86
    if (def.expansion_b < 256)
83,529,141✔
87
        chunk.push_back((char)def.expansion_b);
28,550,706✔
88
    else
54,978,435✔
89
        chunk.append(m_symbols[def.expansion_b - 256].expansion);
54,978,435✔
90
    std::string_view expansion(chunk.data() + start_index, exp_size);
83,529,141✔
91
    m_symbols.push_back({def, expansion, (uint32_t)m_expansion_storage.size() - 1, start_index});
83,529,141✔
92
}
83,529,141✔
93

94
void StringCompressor::expand_compression_map()
95
{
1,509,672✔
96
    size_t old_size = m_compression_map.size();
1,509,672✔
97
    REALM_ASSERT(old_size <= 16384);
1,509,672✔
98
    size_t new_size = 4 * old_size;
1,509,672✔
99
    std::vector<SymbolDef> map(new_size);
1,509,672✔
100
    for (size_t i = 0; i < m_compression_map.size(); ++i) {
2,253,221,700✔
101
        auto& entry = m_compression_map[i];
2,251,712,028✔
102
        if (entry.id == 0)
2,251,712,028✔
103
            continue;
2,205,805,977✔
104
        auto hash = symbol_pair_hash(entry.expansion_a, entry.expansion_b);
45,906,051✔
105
        auto new_hash = hash & (new_size - 1);
45,906,051✔
106
        REALM_ASSERT(map[new_hash].id == 0);
45,906,051✔
107
        map[new_hash] = entry;
45,906,051✔
108
    }
45,906,051✔
109
    m_compression_map.swap(map);
1,509,672✔
110
}
1,509,672✔
111

112
void StringCompressor::rebuild_internal()
113
{
11,943,576✔
114
    auto num_symbols = m_data->size();
11,943,576✔
115
    if (num_symbols == m_symbols.size())
11,943,576✔
116
        return;
11,501,133✔
117
    if (num_symbols < m_symbols.size()) {
442,443✔
118
        // fewer symbols (likely a rollback) -- remove last ones added
119
        while (num_symbols < m_symbols.size()) {
126✔
120
            auto& symbol = m_symbols.back();
120✔
121
            auto hash = symbol_pair_hash(symbol.def.expansion_a, symbol.def.expansion_b);
120✔
122
            hash &= m_compression_map.size() - 1;
120✔
123
            REALM_ASSERT(m_compression_map[hash].id == symbol.def.id);
120✔
124
            m_compression_map[hash] = {0, 0, 0};
120✔
125
            if (symbol.storage_index < m_expansion_storage.size() - 1) {
120✔
NEW
126
                m_expansion_storage.resize(symbol.storage_index + 1);
×
NEW
127
            }
×
128
            m_expansion_storage[symbol.storage_index].resize(symbol.storage_offset);
120✔
129
            m_symbols.pop_back();
120✔
130
        }
120✔
131
        return;
6✔
132
    }
6✔
133
    // we have new symbols to add
134
    for (size_t i = m_symbols.size(); i < num_symbols; ++i) {
72,457,419✔
135
        auto pair = m_data->get(i);
72,014,982✔
136
        SymbolDef def;
72,014,982✔
137
        def.id = (CompressionSymbol)(i + 256);
72,014,982✔
138
        def.expansion_a = 0xFFFF & (pair >> 16);
72,014,982✔
139
        def.expansion_b = 0xFFFF & pair;
72,014,982✔
140
        auto hash = symbol_pair_hash(def.expansion_a, def.expansion_b);
72,014,982✔
141
        while (m_compression_map[hash & (m_compression_map.size() - 1)].id) {
73,373,073✔
142
            expand_compression_map();
1,358,091✔
143
        }
1,358,091✔
144
        // REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
145
        m_compression_map[hash & (m_compression_map.size() - 1)] = def;
72,014,982✔
146
        add_expansion(def);
72,014,982✔
147
    }
72,014,982✔
148
}
442,437✔
149

150
StringCompressor::~StringCompressor() {}
7,451,301✔
151

152
CompressedString StringCompressor::compress(StringData sd, bool learn)
153
{
1,146,540✔
154
    CompressedString result(sd.size());
1,146,540✔
155
    // expand string into array of symbols
156
    const char* d = sd.data();
1,146,540✔
157
    const size_t limit = sd.size();
1,146,540✔
158
    if (limit == 0)
1,146,540✔
159
        return {};
17,463✔
160
    size_t i = 0;
1,129,077✔
161
    while (i < limit) {
1,774,671,090✔
162
        result[i++] = 0xFF & *d++;
1,773,542,013✔
163
    }
1,773,542,013✔
164
    // iteratively compress array of symbols. Each run compresses pairs into single symbols.
165
    // 6 runs give a max compression of 64x - on average it will be much less :-)
166
    constexpr int run_limit = 6;
1,129,077✔
167
    CompressionSymbol* to;
1,129,077✔
168
    for (int run = 0; run < run_limit; ++run) {
5,507,991✔
169
        CompressionSymbol* from = to = result.data();
5,342,517✔
170
        CompressionSymbol* limit = from + result.size() - 1;
5,342,517✔
171
        while (from < limit) {
3,909,831,822✔
172
            auto hash = symbol_pair_hash(from[0], from[1]);
3,904,489,305✔
173
            hash &= m_compression_map.size() - 1;
3,904,489,305✔
174
            auto& def = m_compression_map[hash];
3,904,489,305✔
175
            if (def.id) {
3,904,489,305✔
176
                // existing symbol
177
                if (def.expansion_a == from[0] && def.expansion_b == from[1]) {
3,890,666,766✔
178
                    // matching symbol
179
                    *to++ = def.id;
751,914,321✔
180
                    from += 2;
751,914,321✔
181
                }
751,914,321✔
182
                else if (m_compression_map.size() < 65536) {
3,138,752,445✔
183
                    // Conflict: some other symbol is defined here - but we can expand the compression map
184
                    // and hope to find room!
185
                    expand_compression_map();
151,581✔
186
                    // simply retry:
187
                    continue;
151,581✔
188
                }
151,581✔
189
                else {
3,138,600,864✔
190
                    // also conflict: some other symbol is defined here, we can't compress
191
                    *to++ = *from++;
3,138,600,864✔
192
                    // In a normal hash table we'd have buckets and add a translation
193
                    // to a bucket. This is slower generally, but yields better compression.
194
                }
3,138,600,864✔
195
            }
3,890,666,766✔
196
            else {
13,822,539✔
197
                // free entry we can use for new symbol (and we're learning)
198
                if (m_symbols.size() < (65536 - 256) && learn) {
13,822,539✔
199
                    // define a new symbol for this entry and use it.
200
                    REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
11,452,038✔
201
                    REALM_ASSERT_DEBUG(m_symbols.size() == m_data->size());
11,452,038✔
202
                    REALM_ASSERT_DEBUG(m_data->is_attached());
11,452,038✔
203
                    CompressionSymbol id = (CompressionSymbol)(256 + m_symbols.size());
11,452,038✔
204
                    SymbolDef def{id, from[0], from[1]};
11,452,038✔
205
                    m_compression_map[hash] = def;
11,452,038✔
206
                    add_expansion(def);
11,452,038✔
207
                    m_data->add(((uint64_t)from[0]) << 16 | from[1]);
11,452,038✔
208
                    // std::cerr << id << " = {" << from[0] << ", " << from[1] << "}" << std::endl;
209
                    *to++ = id;
11,452,038✔
210
                    from += 2;
11,452,038✔
211
                }
11,452,038✔
212
                else {
2,370,501✔
213
                    // no more symbol space, so can't compress
214
                    *to++ = *from++;
2,370,501✔
215
                }
2,370,501✔
216
            }
13,822,539✔
217
        }
3,904,489,305✔
218
        if (from == limit) {
5,342,517✔
219
            // copy over trailing symbol
220
            *to++ = *from++;
3,415,911✔
221
        }
3,415,911✔
222
        REALM_ASSERT_DEBUG(to > result.data());
5,342,517✔
223
        size_t sz = to - result.data();
5,342,517✔
224
        REALM_ASSERT_DEBUG(sz <= sd.size());
5,342,517✔
225
        result.resize(sz);
5,342,517✔
226
        if (from == to) // no compression took place in last iteration
5,342,517✔
227
            break;
963,603✔
228
    }
5,342,517✔
229
    return result;
1,129,077✔
230
}
1,146,540✔
231

232
std::string StringCompressor::decompress(CompressedStringView& c_str)
233
{
157,620✔
234
    CompressionSymbol* ptr = c_str.data;
157,620✔
235
    CompressionSymbol* limit = ptr + c_str.size;
157,620✔
236
    // compute size of decompressed string first to avoid allocations as string grows
237
    size_t result_size = 0;
157,620✔
238
    while (ptr < limit) {
45,049,995✔
239
        if (*ptr < 256)
44,892,375✔
240
            result_size += 1;
15,362,847✔
241
        else
29,529,528✔
242
            result_size += m_symbols[*ptr - 256].expansion.size();
29,529,528✔
243
        ++ptr;
44,892,375✔
244
    }
44,892,375✔
245
    std::string result2;
157,620✔
246
    result2.reserve(result_size);
157,620✔
247
    // generate result
248
    ptr = c_str.data;
157,620✔
249
    while (ptr < limit) {
45,049,995✔
250
        if (*ptr < 256)
44,892,375✔
251
            result2.push_back((char)*ptr);
15,362,847✔
252
        else
29,529,528✔
253
            result2.append(m_symbols[*ptr - 256].expansion);
29,529,528✔
254
        ptr++;
44,892,375✔
255
    }
44,892,375✔
256
#ifdef REALM_DEBUG
157,620✔
257
    std::string result;
157,620✔
258
    {
157,620✔
259
        auto decompress = [&](CompressionSymbol symbol, auto& decompress) -> void {
167,846,673✔
260
            if (symbol < 256) {
167,846,673✔
261
                result.push_back((char)symbol);
106,369,560✔
262
            }
106,369,560✔
263
            else {
61,477,113✔
264
                auto& s = m_symbols[symbol - 256];
61,477,113✔
265
                decompress(s.def.expansion_a, decompress);
61,477,113✔
266
                decompress(s.def.expansion_b, decompress);
61,477,113✔
267
            }
61,477,113✔
268
        };
167,846,673✔
269

270
        CompressionSymbol* ptr = c_str.data;
157,620✔
271
        CompressionSymbol* limit = ptr + c_str.size;
157,620✔
272
        while (ptr < limit) {
45,049,995✔
273
            decompress(*ptr, decompress);
44,892,375✔
274
            ++ptr;
44,892,375✔
275
        }
44,892,375✔
276
    }
157,620✔
277
    REALM_ASSERT_DEBUG(result == result2);
157,620✔
278
#endif
157,620✔
279
    return result2;
157,620✔
280
}
157,620✔
281

282
int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B)
NEW
283
{
×
NEW
284
    auto A_ptr = A.data;
×
NEW
285
    auto A_limit = A_ptr + A.size;
×
NEW
286
    auto B_ptr = B.data;
×
NEW
287
    auto B_limit = B_ptr + B.size;
×
NEW
288
    while (A_ptr < A_limit && B_ptr < B_limit) {
×
NEW
289
        auto code_A = *A_ptr++;
×
NEW
290
        auto code_B = *B_ptr++;
×
NEW
291
        if (code_A == code_B)
×
NEW
292
            continue;
×
293
        // symbols did not match:
294
        // 1. both symbols are single characters
NEW
295
        if (code_A < 256 && code_B < 256)
×
NEW
296
            return code_B - code_A;
×
NEW
297
        std::string a_str(code_A, 1);
×
NEW
298
        auto str_A = std::string_view(code_A < 256 ? a_str : m_symbols[code_A - 256].expansion);
×
NEW
299
        std::string b_str(code_B, 1);
×
NEW
300
        auto str_B = std::string_view(code_B < 256 ? b_str : m_symbols[code_B - 256].expansion);
×
301
        // to ensure comparison as StringData we need to convert the stringviews
NEW
302
        StringData sd_a(str_A.data(), str_A.size());
×
NEW
303
        StringData sd_b(str_B.data(), str_B.size());
×
NEW
304
        REALM_ASSERT_DEBUG(sd_a != sd_b);
×
NEW
305
        if (sd_a < sd_b)
×
NEW
306
            return 1;
×
NEW
307
        else
×
NEW
308
            return -1;
×
NEW
309
    }
×
310
    // The compressed strings are identical or one is the prefix of the other
NEW
311
    return B.size - A.size;
×
312
    // ^ a faster way of producing same positive / negative / zero as:
313
    // if (A.size() < B.size())
314
    //     return 1;
315
    // if (A.size() > B.size())
316
    //     return -1;
317
    // return 0;
NEW
318
}
×
319

320
int StringCompressor::compare(StringData sd, CompressedStringView& B)
321
{
560,010✔
322
    auto B_size = B.size;
560,010✔
323
    // make sure comparisons are unsigned, even though StringData does not specify signedness
324
    const unsigned char* A_ptr = reinterpret_cast<const unsigned char*>(sd.data());
560,010✔
325
    auto A_limit = A_ptr + sd.size();
560,010✔
326
    for (size_t i = 0; i < B_size; ++i) {
51,697,881✔
327
        if (A_ptr == A_limit) {
51,137,874✔
328
            // sd ended first, so B is bigger
NEW
329
            return -1;
×
NEW
330
        }
×
331
        auto code = B.data[i];
51,137,874✔
332
        if (code < 256) {
51,137,874✔
333
            if (code < *A_ptr)
305,250✔
NEW
334
                return 1;
×
335
            if (code > *A_ptr)
305,250✔
336
                return -1;
3✔
337
            ++A_ptr;
305,247✔
338
            continue;
305,247✔
339
        }
305,250✔
340
        auto& expansion = m_symbols[code - 256];
50,832,624✔
341
        for (size_t disp = 0; disp < expansion.expansion.size(); ++disp) {
2,727,538,938✔
342
            uint8_t c = expansion.expansion[disp];
2,676,706,314✔
343
            if (c < *A_ptr)
2,676,706,314✔
NEW
344
                return 1;
×
345
            if (c > *A_ptr)
2,676,706,314✔
NEW
346
                return -1;
×
347
            ++A_ptr;
2,676,706,314✔
348
        }
2,676,706,314✔
349
    }
50,832,624✔
350
    // if sd is longer than B, sd is the biggest string
351
    if (A_ptr < A_limit)
560,007✔
NEW
352
        return 1;
×
353
    return 0;
560,007✔
354
}
560,007✔
355

356

357
} // namespace realm
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc