• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / jorgen.edelbo_402

21 Aug 2024 11:10AM UTC coverage: 91.054% (-0.03%) from 91.085%
jorgen.edelbo_402

Pull #7803

Evergreen

jedelbo
Small fix to Table::typed_write

When writing the realm to a new file from a write transaction,
the Table may be COW so that the top ref is changed. So don't
use the ref that is present in the group when the operation starts.
Pull Request #7803: Feature/string compression

103494 of 181580 branches covered (57.0%)

1929 of 1999 new or added lines in 46 files covered. (96.5%)

695 existing lines in 51 files now uncovered.

220142 of 241772 relevant lines covered (91.05%)

7344461.76 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.36
/src/realm/string_compressor.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2016 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <realm/string_compressor.hpp>
20
#include <realm/string_interner.hpp>
21
#include <realm/string_data.hpp>
22

23
#include <iostream>
24
namespace realm {
25

26
StringCompressor::StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable)
27
    : m_data(alloc)
755,976✔
28
{
1,546,536✔
29
    m_compression_map.resize(16); // start with a very small compression map
1,546,536✔
30
    m_symbols.reserve(65536);
1,546,536✔
31
    m_data.set_parent(&parent, index);
1,546,536✔
32
    refresh(writable);
1,546,536✔
33
}
1,546,536✔
34

35
void StringCompressor::refresh(bool writable)
36
{
2,658,225✔
37
    // we assume that compressors are only created from a valid parent.
38
    // String interners in 'dead' mode should never instantiate a string compressor.
39
    if (m_data.get_ref_from_parent() == 0) {
2,658,225✔
40
        REALM_ASSERT(writable);
189,720✔
41
        m_data.create(0, 65535);
189,720✔
42
        m_data.update_parent();
189,720✔
43
    }
189,720✔
44
    else {
2,468,505✔
45
        if (m_data.is_attached())
2,468,505✔
46
            m_data.update_from_parent();
1,111,668✔
47
        else
1,356,837✔
48
            m_data.init_from_ref(m_data.get_ref_from_parent());
1,356,837✔
49
    }
2,468,505✔
50
    rebuild_internal();
2,658,225✔
51
}
2,658,225✔
52

53
static size_t symbol_pair_hash(CompressionSymbol a, CompressionSymbol b)
54
{
4,294,967,294✔
55
    // range of return value must match size of encoding table
56
    uint32_t tmp = a + 3;
4,294,967,294✔
57
    tmp *= b + 7;
4,294,967,294✔
58
    return (tmp ^ (tmp >> 16)) & 0xFFFF;
4,294,967,294✔
59
}
4,294,967,294✔
60

61
void StringCompressor::add_expansion(SymbolDef def)
62
{
53,165,154✔
63
    // compute expansion size:
64
    size_t exp_size = 0;
53,165,154✔
65
    if (def.expansion_a < 256)
53,165,154✔
66
        exp_size = 1;
18,157,605✔
67
    else
35,007,549✔
68
        exp_size = m_symbols[def.expansion_a - 256].expansion.size();
35,007,549✔
69
    if (def.expansion_b < 256)
53,165,154✔
70
        exp_size += 1;
18,949,953✔
71
    else
34,215,201✔
72
        exp_size += m_symbols[def.expansion_b - 256].expansion.size();
34,215,201✔
73
    // make sure there is room in active storage chunk:
74
    if (m_expansion_storage.size() == 0 || m_expansion_storage.back().size() + exp_size + 1 >= storage_chunk_size) {
53,165,154✔
75
        m_expansion_storage.push_back({});
504,297✔
76
        m_expansion_storage.back().reserve(storage_chunk_size);
504,297✔
77
    }
504,297✔
78
    // construct expansion at end of chunk:
79
    auto& chunk = m_expansion_storage.back();
53,165,154✔
80
    auto start_index = (uint32_t)chunk.size();
53,165,154✔
81
    if (def.expansion_a < 256)
53,165,154✔
82
        chunk.push_back((char)def.expansion_a);
18,157,383✔
83
    else
35,007,771✔
84
        chunk.append(m_symbols[def.expansion_a - 256].expansion);
35,007,771✔
85
    if (def.expansion_b < 256)
53,165,154✔
86
        chunk.push_back((char)def.expansion_b);
18,949,824✔
87
    else
34,215,330✔
88
        chunk.append(m_symbols[def.expansion_b - 256].expansion);
34,215,330✔
89
    std::string_view expansion(chunk.data() + start_index, exp_size);
53,165,154✔
90
    m_symbols.push_back({def, expansion, (uint32_t)m_expansion_storage.size() - 1, start_index});
53,165,154✔
91
}
53,165,154✔
92

93
void StringCompressor::expand_compression_map()
94
{
1,197,108✔
95
    size_t old_size = m_compression_map.size();
1,197,108✔
96
    REALM_ASSERT(old_size <= 16384);
1,197,108✔
97
    size_t new_size = 4 * old_size;
1,197,108✔
98
    std::vector<SymbolDef> map(new_size);
1,197,108✔
99
    for (size_t i = 0; i < m_compression_map.size(); ++i) {
1,555,463,922✔
100
        auto& entry = m_compression_map[i];
1,554,266,814✔
101
        if (entry.id == 0)
1,554,266,814✔
102
            continue;
1,528,848,516✔
103
        auto hash = symbol_pair_hash(entry.expansion_a, entry.expansion_b);
25,418,298✔
104
        auto new_hash = hash & (new_size - 1);
25,418,298✔
105
        REALM_ASSERT(map[new_hash].id == 0);
25,418,298✔
106
        map[new_hash] = entry;
25,418,298✔
107
    }
25,418,298✔
108
    m_compression_map.swap(map);
1,197,108✔
109
}
1,197,108✔
110

111
void StringCompressor::rebuild_internal()
112
{
2,658,117✔
113
    auto num_symbols = m_data.size();
2,658,117✔
114
    if (num_symbols == m_symbols.size())
2,658,117✔
115
        return;
2,273,304✔
116
    if (num_symbols < m_symbols.size()) {
384,813✔
117
        // fewer symbols (likely a rollback) -- remove last ones added
118
        while (num_symbols < m_symbols.size()) {
18✔
119
            auto& symbol = m_symbols.back();
12✔
120
            auto hash = symbol_pair_hash(symbol.def.expansion_a, symbol.def.expansion_b);
12✔
121
            hash &= m_compression_map.size() - 1;
12✔
122
            REALM_ASSERT(m_compression_map[hash].id == symbol.def.id);
12✔
123
            m_compression_map[hash] = {0, 0, 0};
12✔
124
            if (symbol.storage_index < m_expansion_storage.size() - 1) {
12✔
NEW
125
                m_expansion_storage.resize(symbol.storage_index + 1);
×
NEW
126
            }
×
127
            m_expansion_storage[symbol.storage_index].resize(symbol.storage_offset);
12✔
128
            m_symbols.pop_back();
12✔
129
        }
12✔
130
        return;
6✔
131
    }
6✔
132
    // we have new symbols to add
133
    for (size_t i = m_symbols.size(); i < num_symbols; ++i) {
41,742,819✔
134
        auto pair = m_data.get(i);
41,358,012✔
135
        SymbolDef def;
41,358,012✔
136
        def.id = (CompressionSymbol)(i + 256);
41,358,012✔
137
        def.expansion_a = 0xFFFF & (pair >> 16);
41,358,012✔
138
        def.expansion_b = 0xFFFF & pair;
41,358,012✔
139
        auto hash = symbol_pair_hash(def.expansion_a, def.expansion_b);
41,358,012✔
140
        while (m_compression_map[hash & (m_compression_map.size() - 1)].id) {
42,253,263✔
141
            expand_compression_map();
895,251✔
142
        }
895,251✔
143
        // REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
144
        m_compression_map[hash & (m_compression_map.size() - 1)] = def;
41,358,012✔
145
        add_expansion(def);
41,358,012✔
146
    }
41,358,012✔
147
}
384,807✔
148

149
StringCompressor::~StringCompressor() {}
1,546,533✔
150

151
CompressedString StringCompressor::compress(StringData sd, bool learn)
152
{
948,645✔
153
    CompressedString result(sd.size());
948,645✔
154
    // expand string into array of symbols
155
    const char* d = sd.data();
948,645✔
156
    const size_t limit = sd.size();
948,645✔
157
    if (limit == 0)
948,645✔
158
        return {};
19,209✔
159
    size_t i = 0;
929,436✔
160
    while (i < limit) {
1,999,703,994✔
161
        result[i++] = 0xFF & *d++;
1,998,774,558✔
162
    }
1,998,774,558✔
163
    // iteratively compress array of symbols. Each run compresses pairs into single symbols.
164
    // 6 runs give a max compression of 64x - on average it will be much less :-)
165
    constexpr int run_limit = 6;
929,436✔
166
    CompressionSymbol* to;
929,436✔
167
    for (int run = 0; run < run_limit; ++run) {
4,801,194✔
168
        CompressionSymbol* from = to = result.data();
4,633,161✔
169
        CompressionSymbol* limit = from + result.size() - 1;
4,633,161✔
170
        while (from < limit) {
4,294,967,294✔
171
            auto hash = symbol_pair_hash(from[0], from[1]);
4,293,242,002✔
172
            hash &= m_compression_map.size() - 1;
4,293,242,002✔
173
            auto& def = m_compression_map[hash];
4,293,242,002✔
174
            if (def.id) {
4,293,242,002✔
175
                // existing symbol
176
                if (def.expansion_a == from[0] && def.expansion_b == from[1]) {
4,282,428,562✔
177
                    // matching symbol
178
                    *to++ = def.id;
860,353,986✔
179
                    from += 2;
860,353,986✔
180
                }
860,353,986✔
181
                else if (m_compression_map.size() < 65536) {
3,642,953,592✔
182
                    // Conflict: some other symbol is defined here - but we can expand the compression map
183
                    // and hope to find room!
184
                    expand_compression_map();
301,857✔
185
                    // simply retry:
186
                    continue;
301,857✔
187
                }
301,857✔
188
                else {
3,642,651,735✔
189
                    // also conflict: some other symbol is defined here, we can't compress
190
                    *to++ = *from++;
3,642,651,735✔
191
                    // In a normal hash table we'd have buckets and add a translation
192
                    // to a bucket. This is slower generally, but yields better compression.
193
                }
3,642,651,735✔
194
            }
4,282,428,562✔
195
            else {
18,071,598✔
196
                // free entry we can use for new symbol (and we're learning)
197
                if (m_symbols.size() < (65536 - 256) && learn) {
18,071,598✔
198
                    // define a new symbol for this entry and use it.
199
                    REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
11,806,962✔
200
                    REALM_ASSERT_DEBUG(m_symbols.size() == m_data.size());
11,806,962✔
201
                    REALM_ASSERT_DEBUG(m_data.is_attached());
11,806,962✔
202
                    CompressionSymbol id = (CompressionSymbol)(256 + m_symbols.size());
11,806,962✔
203
                    SymbolDef def{id, from[0], from[1]};
11,806,962✔
204
                    m_compression_map[hash] = def;
11,806,962✔
205
                    add_expansion(def);
11,806,962✔
206
                    m_data.add(((uint64_t)from[0]) << 16 | from[1]);
11,806,962✔
207
                    // std::cerr << id << " = {" << from[0] << ", " << from[1] << "}" << std::endl;
208
                    *to++ = id;
11,806,962✔
209
                    from += 2;
11,806,962✔
210
                }
11,806,962✔
211
                else {
6,264,636✔
212
                    // no more symbol space, so can't compress
213
                    *to++ = *from++;
6,264,636✔
214
                }
6,264,636✔
215
            }
18,071,598✔
216
        }
4,293,242,002✔
217
        if (from == limit) {
4,633,161✔
218
            // copy over trailing symbol
219
            *to++ = *from++;
3,306,981✔
220
        }
3,306,981✔
221
        REALM_ASSERT_DEBUG(to > result.data());
4,633,161✔
222
        size_t sz = to - result.data();
4,633,161✔
223
        REALM_ASSERT_DEBUG(sz <= sd.size());
4,633,161✔
224
        result.resize(sz);
4,633,161✔
225
        if (from == to) // no compression took place in last iteration
4,633,161✔
226
            break;
761,403✔
227
    }
4,633,161✔
228
    return result;
929,436✔
229
}
948,645✔
230

231
std::string StringCompressor::decompress(CompressedStringView& c_str)
232
{
306,540✔
233
    CompressionSymbol* ptr = c_str.data;
306,540✔
234
    CompressionSymbol* limit = ptr + c_str.size;
306,540✔
235
    // compute size of decompressed string first to avoid allocations as string grows
236
    size_t result_size = 0;
306,540✔
237
    while (ptr < limit) {
45,289,779✔
238
        if (*ptr < 256)
44,983,239✔
239
            result_size += 1;
15,231,744✔
240
        else
29,751,495✔
241
            result_size += m_symbols[*ptr - 256].expansion.size();
29,751,495✔
242
        ++ptr;
44,983,239✔
243
    }
44,983,239✔
244
    std::string result2;
306,540✔
245
    result2.reserve(result_size);
306,540✔
246
    // generate result
247
    ptr = c_str.data;
306,540✔
248
    while (ptr < limit) {
45,289,776✔
249
        if (*ptr < 256)
44,983,236✔
250
            result2.push_back((char)*ptr);
15,231,744✔
251
        else
29,751,492✔
252
            result2.append(m_symbols[*ptr - 256].expansion);
29,751,492✔
253
        ptr++;
44,983,236✔
254
    }
44,983,236✔
255
#ifdef REALM_DEBUG
306,540✔
256
    std::string result;
306,540✔
257
    {
306,540✔
258
        auto decompress = [&](CompressionSymbol symbol, auto& decompress) -> void {
173,121,000✔
259
            if (symbol < 256) {
173,121,000✔
260
                result.push_back((char)symbol);
109,052,178✔
261
            }
109,052,178✔
262
            else {
64,068,822✔
263
                auto& s = m_symbols[symbol - 256];
64,068,822✔
264
                decompress(s.def.expansion_a, decompress);
64,068,822✔
265
                decompress(s.def.expansion_b, decompress);
64,068,822✔
266
            }
64,068,822✔
267
        };
173,121,000✔
268

269
        CompressionSymbol* ptr = c_str.data;
306,540✔
270
        CompressionSymbol* limit = ptr + c_str.size;
306,540✔
271
        while (ptr < limit) {
45,289,779✔
272
            decompress(*ptr, decompress);
44,983,239✔
273
            ++ptr;
44,983,239✔
274
        }
44,983,239✔
275
    }
306,540✔
276
    REALM_ASSERT_DEBUG(result == result2);
306,540✔
277
#endif
306,540✔
278
    return result2;
306,540✔
279
}
306,540✔
280

281
int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B)
282
{
968,448✔
283
    auto A_ptr = A.data;
968,448✔
284
    auto A_limit = A_ptr + A.size;
968,448✔
285
    auto B_ptr = B.data;
968,448✔
286
    auto B_limit = B_ptr + B.size;
968,448✔
287
    while (A_ptr < A_limit && B_ptr < B_limit) {
1,028,490✔
288
        auto code_A = *A_ptr++;
987,582✔
289
        auto code_B = *B_ptr++;
987,582✔
290
        if (code_A == code_B)
987,582✔
291
            continue;
60,042✔
292
        // symbols did not match:
293

294
        // 1. both symbols are single characters
295
        if (code_A < 256 && code_B < 256)
927,540✔
296
            return code_A - code_B;
2,946✔
297

298
        // 2. all the other possible cases
299
        std::string str_a{(char)code_A, 1};
924,594✔
300
        std::string str_b{(char)code_B, 1};
924,594✔
301
        StringData sd_a = code_A < 256 ? str_a : m_symbols[code_A - 256].expansion;
924,594✔
302
        StringData sd_b = code_B < 256 ? str_b : m_symbols[code_B - 256].expansion;
924,594✔
303

304
        REALM_ASSERT_DEBUG(sd_a != sd_b);
924,594✔
305
        if (sd_a < sd_b)
924,594✔
306
            return -1;
581,862✔
307
        else
342,732✔
308
            return 1;
342,732✔
309
    }
924,594✔
310
    // The compressed strings are identical or one is the prefix of the other
311
    return static_cast<int>(A.size - B.size);
40,908✔
312
    // ^ a faster way of producing same positive / negative / zero as:
313
    // if (A.size() < B.size())
314
    //     return -1;
315
    // if (A.size() > B.size())
316
    //     return 1;
317
    // return 0;
318
}
968,448✔
319

320
int StringCompressor::compare(StringData sd, CompressedStringView& B)
321
{
1,209,705✔
322
    auto B_size = B.size;
1,209,705✔
323
    // make sure comparisons are unsigned, even though StringData does not specify signedness
324
    const unsigned char* A_ptr = reinterpret_cast<const unsigned char*>(sd.data());
1,209,705✔
325
    auto A_limit = A_ptr + sd.size();
1,209,705✔
326
    for (size_t i = 0; i < B_size; ++i) {
63,322,722✔
327
        if (A_ptr == A_limit) {
62,113,071✔
328
            // sd ended first, so B is bigger
NEW
329
            return -1;
×
NEW
330
        }
×
331
        auto code = B.data[i];
62,113,071✔
332
        if (code < 256) {
62,113,071✔
333
            if (code < *A_ptr)
301,065✔
NEW
334
                return 1;
×
335
            if (code > *A_ptr)
301,065✔
NEW
336
                return -1;
×
337
            ++A_ptr;
301,065✔
338
            continue;
301,065✔
339
        }
301,065✔
340
        auto& expansion = m_symbols[code - 256];
61,812,006✔
341
        for (size_t disp = 0; disp < expansion.expansion.size(); ++disp) {
3,427,427,091✔
342
            uint8_t c = expansion.expansion[disp];
3,365,615,139✔
343
            if (c < *A_ptr)
3,365,615,139✔
344
                return 1;
27✔
345
            if (c > *A_ptr)
3,365,615,112✔
346
                return -1;
27✔
347
            ++A_ptr;
3,365,615,085✔
348
        }
3,365,615,085✔
349
    }
61,812,006✔
350
    // if sd is longer than B, sd is the biggest string
351
    if (A_ptr < A_limit)
1,209,651✔
NEW
352
        return 1;
×
353
    return 0;
1,209,651✔
354
}
1,209,651✔
355

356

357
} // namespace realm
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc