• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / jorgen.edelbo_338

03 Jul 2024 03:00PM UTC coverage: 90.856% (-0.008%) from 90.864%
jorgen.edelbo_338

Pull #7803

Evergreen

nicola-cab
Merge branch 'next-major' into feature/string-compression
Pull Request #7803: Feature/string compression

103028 of 180606 branches covered (57.05%)

1144 of 1267 new or added lines in 33 files covered. (90.29%)

155 existing lines in 24 files now uncovered.

218583 of 240583 relevant lines covered (90.86%)

7959624.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.64
/src/realm/string_compressor.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2016 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <realm/string_compressor.hpp>
20
#include <realm/string_interner.hpp>
21
#include <realm/string_data.hpp>
22

23
#include <iostream>
24
namespace realm {
25

26
StringCompressor::StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable)
27
    : m_data(alloc)
708,672✔
28
{
1,452,225✔
29
    m_compression_map.resize(16); // start with a very small compression map
1,452,225✔
30
    m_symbols.reserve(65536);
1,452,225✔
31
    m_data.set_parent(&parent, index);
1,452,225✔
32
    refresh(writable);
1,452,225✔
33
}
1,452,225✔
34

35
void StringCompressor::refresh(bool writable)
36
{
2,545,704✔
37
    // we assume that compressors are only created from a valid parent.
38
    // String interners in 'dead' mode should never instantiate a string compressor.
39
    if (m_data.get_ref_from_parent() == 0) {
2,545,704✔
40
        REALM_ASSERT(writable);
187,080✔
41
        m_data.create(0, 65535);
187,080✔
42
        m_data.update_parent();
187,080✔
43
    }
187,080✔
44
    else {
2,358,624✔
45
        if (m_data.is_attached())
2,358,624✔
46
            m_data.update_from_parent();
1,093,476✔
47
        else
1,265,148✔
48
            m_data.init_from_ref(m_data.get_ref_from_parent());
1,265,148✔
49
    }
2,358,624✔
50
    rebuild_internal();
2,545,704✔
51
}
2,545,704✔
52

53
static size_t symbol_pair_hash(CompressionSymbol a, CompressionSymbol b)
54
{
4,256,888,092✔
55
    // range of return value must match size of encoding table
56
    uint32_t tmp = a + 3;
4,256,888,092✔
57
    tmp *= b + 7;
4,256,888,092✔
58
    return (tmp ^ (tmp >> 16)) & 0xFFFF;
4,256,888,092✔
59
}
4,256,888,092✔
60

61
void StringCompressor::add_expansion(SymbolDef def)
62
{
49,546,407✔
63
    // compute expansion size:
64
    size_t exp_size = 0;
49,546,407✔
65
    if (def.expansion_a < 256)
49,546,407✔
66
        exp_size = 1;
16,463,241✔
67
    else
33,083,166✔
68
        exp_size = m_symbols[def.expansion_a - 256].expansion.size();
33,083,166✔
69
    if (def.expansion_b < 256)
49,546,407✔
70
        exp_size += 1;
17,146,998✔
71
    else
32,399,409✔
72
        exp_size += m_symbols[def.expansion_b - 256].expansion.size();
32,399,409✔
73
    // make sure there is room in active storage chunk:
74
    if (m_expansion_storage.size() == 0 || m_expansion_storage.back().size() + exp_size + 1 >= storage_chunk_size) {
49,546,407✔
75
        m_expansion_storage.push_back({});
472,881✔
76
        m_expansion_storage.back().reserve(storage_chunk_size);
472,881✔
77
    }
472,881✔
78
    // construct expansion at end of chunk:
79
    auto& chunk = m_expansion_storage.back();
49,546,407✔
80
    auto start_index = (uint32_t)chunk.size();
49,546,407✔
81
    if (def.expansion_a < 256)
49,546,407✔
82
        chunk.push_back((char)def.expansion_a);
16,463,187✔
83
    else
33,083,220✔
84
        chunk.append(m_symbols[def.expansion_a - 256].expansion);
33,083,220✔
85
    if (def.expansion_b < 256)
49,546,407✔
86
        chunk.push_back((char)def.expansion_b);
17,146,920✔
87
    else
32,399,487✔
88
        chunk.append(m_symbols[def.expansion_b - 256].expansion);
32,399,487✔
89
    std::string_view expansion(chunk.data() + start_index, exp_size);
49,546,407✔
90
    m_symbols.push_back({def, expansion, (uint32_t)m_expansion_storage.size() - 1, start_index});
49,546,407✔
91
}
49,546,407✔
92

93
void StringCompressor::expand_compression_map()
94
{
1,135,092✔
95
    size_t old_size = m_compression_map.size();
1,135,092✔
96
    REALM_ASSERT(old_size <= 16384);
1,135,092✔
97
    size_t new_size = 4 * old_size;
1,135,092✔
98
    std::vector<SymbolDef> map(new_size);
1,135,092✔
99
    for (size_t i = 0; i < m_compression_map.size(); ++i) {
1,445,845,848✔
100
        auto& entry = m_compression_map[i];
1,444,710,756✔
101
        if (entry.id == 0)
1,444,710,756✔
102
            continue;
1,422,054,477✔
103
        auto hash = symbol_pair_hash(entry.expansion_a, entry.expansion_b);
22,656,279✔
104
        auto new_hash = hash & (new_size - 1);
22,656,279✔
105
        REALM_ASSERT(map[new_hash].id == 0);
22,656,279✔
106
        map[new_hash] = entry;
22,656,279✔
107
    }
22,656,279✔
108
    m_compression_map.swap(map);
1,135,092✔
109
}
1,135,092✔
110

111
void StringCompressor::rebuild_internal()
112
{
2,545,677✔
113
    auto num_symbols = m_data.size();
2,545,677✔
114
    if (num_symbols == m_symbols.size())
2,545,677✔
115
        return;
2,187,084✔
116
    if (num_symbols < m_symbols.size()) {
358,593✔
117
        // fewer symbols (likely a rollback) -- remove last ones added
118
        while (num_symbols < m_symbols.size()) {
144✔
119
            auto& symbol = m_symbols.back();
132✔
120
            auto hash = symbol_pair_hash(symbol.def.expansion_a, symbol.def.expansion_b);
132✔
121
            hash &= m_compression_map.size() - 1;
132✔
122
            REALM_ASSERT(m_compression_map[hash].id == symbol.def.id);
132✔
123
            m_compression_map[hash] = {0, 0, 0};
132✔
124
            if (symbol.storage_index < m_expansion_storage.size() - 1) {
132✔
NEW
125
                m_expansion_storage.resize(symbol.storage_index + 1);
×
NEW
126
            }
×
127
            m_expansion_storage[symbol.storage_index].resize(symbol.storage_offset);
132✔
128
            m_symbols.pop_back();
132✔
129
        }
132✔
130
        return;
12✔
131
    }
12✔
132
    // we have new symbols to add
133
    for (size_t i = m_symbols.size(); i < num_symbols; ++i) {
38,729,157✔
134
        auto pair = m_data.get(i);
38,370,576✔
135
        SymbolDef def;
38,370,576✔
136
        def.id = (CompressionSymbol)(i + 256);
38,370,576✔
137
        def.expansion_a = 0xFFFF & (pair >> 16);
38,370,576✔
138
        def.expansion_b = 0xFFFF & pair;
38,370,576✔
139
        auto hash = symbol_pair_hash(def.expansion_a, def.expansion_b);
38,370,576✔
140
        while (m_compression_map[hash & (m_compression_map.size() - 1)].id) {
39,208,893✔
141
            expand_compression_map();
838,317✔
142
        }
838,317✔
143
        // REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
144
        m_compression_map[hash & (m_compression_map.size() - 1)] = def;
38,370,576✔
145
        add_expansion(def);
38,370,576✔
146
    }
38,370,576✔
147
}
358,581✔
148

149
StringCompressor::~StringCompressor() {}
1,452,246✔
150

151
CompressedString StringCompressor::compress(StringData sd, bool learn)
152
{
923,826✔
153
    CompressedString result(sd.size());
923,826✔
154
    // expand string into array of symbols
155
    const char* d = sd.data();
923,826✔
156
    const size_t limit = sd.size();
923,826✔
157
    if (limit == 0)
923,826✔
158
        return {};
17,877✔
159
    size_t i = 0;
905,949✔
160
    while (i < limit) {
1,906,168,086✔
161
        result[i++] = 0xFF & *d++;
1,905,262,137✔
162
    }
1,905,262,137✔
163
    // iteratively compress array of symbols. Each run compresses pairs into single symbols.
164
    // 6 runs give a max compression of 64x - on average it will be much less :-)
165
    constexpr int run_limit = 6;
905,949✔
166
    CompressionSymbol* to;
905,949✔
167
    for (int run = 0; run < run_limit; ++run) {
4,607,043✔
168
        CompressionSymbol* from = to = result.data();
4,441,881✔
169
        CompressionSymbol* limit = from + result.size() - 1;
4,441,881✔
170
        while (from < limit) {
4,230,416,482✔
171
            auto hash = symbol_pair_hash(from[0], from[1]);
4,228,190,479✔
172
            hash &= m_compression_map.size() - 1;
4,228,190,479✔
173
            auto& def = m_compression_map[hash];
4,228,190,479✔
174
            if (def.id) {
4,228,190,479✔
175
                // existing symbol
176
                if (def.expansion_a == from[0] && def.expansion_b == from[1]) {
4,218,773,131✔
177
                    // matching symbol
178
                    *to++ = def.id;
771,686,589✔
179
                    from += 2;
771,686,589✔
180
                }
771,686,589✔
181
                else if (m_compression_map.size() < 65536) {
3,509,018,436✔
182
                    // Conflict: some other symbol is defined here - but we can expand the compression map
183
                    // and hope to find room!
184
                    expand_compression_map();
296,775✔
185
                    // simply retry:
186
                    continue;
296,775✔
187
                }
296,775✔
188
                else {
3,508,721,661✔
189
                    // also conflict: some other symbol is defined here, we can't compress
190
                    *to++ = *from++;
3,508,721,661✔
191
                    // In a normal hash table we'd have buckets and add a translation
192
                    // to a bucket. This is slower generally, but yields better compression.
193
                }
3,508,721,661✔
194
            }
4,218,773,131✔
195
            else {
16,586,205✔
196
                // free entry we can use for new symbol (and we're learning)
197
                if (m_symbols.size() < (65536 - 256) && learn) {
16,586,205✔
198
                    // define a new symbol for this entry and use it.
199
                    REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
11,175,519✔
200
                    REALM_ASSERT_DEBUG(m_symbols.size() == m_data.size());
11,175,519✔
201
                    REALM_ASSERT_DEBUG(m_data.is_attached());
11,175,519✔
202
                    CompressionSymbol id = (CompressionSymbol)(256 + m_symbols.size());
11,175,519✔
203
                    SymbolDef def{id, from[0], from[1]};
11,175,519✔
204
                    m_compression_map[hash] = def;
11,175,519✔
205
                    add_expansion(def);
11,175,519✔
206
                    m_data.add(((uint64_t)from[0]) << 16 | from[1]);
11,175,519✔
207
                    // std::cerr << id << " = {" << from[0] << ", " << from[1] << "}" << std::endl;
208
                    *to++ = id;
11,175,519✔
209
                    from += 2;
11,175,519✔
210
                }
11,175,519✔
211
                else {
5,410,686✔
212
                    // no more symbol space, so can't compress
213
                    *to++ = *from++;
5,410,686✔
214
                }
5,410,686✔
215
            }
16,586,205✔
216
        }
4,228,190,479✔
217
        if (from == limit) {
4,441,881✔
218
            // copy over trailing symbol
219
            *to++ = *from++;
3,196,134✔
220
        }
3,196,134✔
221
        REALM_ASSERT_DEBUG(to > result.data());
4,441,881✔
222
        size_t sz = to - result.data();
4,441,881✔
223
        REALM_ASSERT_DEBUG(sz <= sd.size());
4,441,881✔
224
        result.resize(sz);
4,441,881✔
225
        if (from == to) // no compression took place in last iteration
4,441,881✔
226
            break;
740,787✔
227
    }
4,441,881✔
228
    return result;
905,949✔
229
}
923,826✔
230

231
std::string StringCompressor::decompress(CompressedStringView& c_str)
232
{
282,270✔
233
    CompressionSymbol* ptr = c_str.data;
282,270✔
234
    CompressionSymbol* limit = ptr + c_str.size;
282,270✔
235
    // compute size of decompressed string first to avoid allocations as string grows
236
    size_t result_size = 0;
282,270✔
237
    while (ptr < limit) {
45,268,146✔
238
        if (*ptr < 256)
44,985,876✔
239
            result_size += 1;
15,307,395✔
240
        else
29,678,481✔
241
            result_size += m_symbols[*ptr - 256].expansion.size();
29,678,481✔
242
        ++ptr;
44,985,876✔
243
    }
44,985,876✔
244
    std::string result2;
282,270✔
245
    result2.reserve(result_size);
282,270✔
246
    // generate result
247
    ptr = c_str.data;
282,270✔
248
    while (ptr < limit) {
45,268,143✔
249
        if (*ptr < 256)
44,985,873✔
250
            result2.push_back((char)*ptr);
15,307,395✔
251
        else
29,678,478✔
252
            result2.append(m_symbols[*ptr - 256].expansion);
29,678,478✔
253
        ptr++;
44,985,873✔
254
    }
44,985,873✔
255
#ifdef REALM_DEBUG
282,270✔
256
    std::string result;
282,270✔
257
    {
282,270✔
258
        auto decompress = [&](CompressionSymbol symbol, auto& decompress) -> void {
170,119,914✔
259
            if (symbol < 256) {
170,119,914✔
260
                result.push_back((char)symbol);
107,552,892✔
261
            }
107,552,892✔
262
            else {
62,567,022✔
263
                auto& s = m_symbols[symbol - 256];
62,567,022✔
264
                decompress(s.def.expansion_a, decompress);
62,567,022✔
265
                decompress(s.def.expansion_b, decompress);
62,567,022✔
266
            }
62,567,022✔
267
        };
170,119,914✔
268

269
        CompressionSymbol* ptr = c_str.data;
282,270✔
270
        CompressionSymbol* limit = ptr + c_str.size;
282,270✔
271
        while (ptr < limit) {
45,268,140✔
272
            decompress(*ptr, decompress);
44,985,870✔
273
            ++ptr;
44,985,870✔
274
        }
44,985,870✔
275
    }
282,270✔
276
    REALM_ASSERT_DEBUG(result == result2);
282,270✔
277
#endif
282,270✔
278
    return result2;
282,270✔
279
}
282,270✔
280

281
int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B)
NEW
282
{
×
NEW
283
    auto A_ptr = A.data;
×
NEW
284
    auto A_limit = A_ptr + A.size;
×
NEW
285
    auto B_ptr = B.data;
×
NEW
286
    auto B_limit = B_ptr + B.size;
×
NEW
287
    while (A_ptr < A_limit && B_ptr < B_limit) {
×
NEW
288
        auto code_A = *A_ptr++;
×
NEW
289
        auto code_B = *B_ptr++;
×
NEW
290
        if (code_A == code_B)
×
NEW
291
            continue;
×
292
        // symbols did not match:
293
        // 1. both symbols are single characters
NEW
294
        if (code_A < 256 && code_B < 256)
×
NEW
295
            return code_B - code_A;
×
NEW
296
        std::string a_str(code_A, 1);
×
NEW
297
        auto str_A = std::string_view(code_A < 256 ? a_str : m_symbols[code_A - 256].expansion);
×
NEW
298
        std::string b_str(code_B, 1);
×
NEW
299
        auto str_B = std::string_view(code_B < 256 ? b_str : m_symbols[code_B - 256].expansion);
×
300
        // to ensure comparison as StringData we need to convert the stringviews
NEW
301
        StringData sd_a(str_A.data(), str_A.size());
×
NEW
302
        StringData sd_b(str_B.data(), str_B.size());
×
NEW
303
        REALM_ASSERT_DEBUG(sd_a != sd_b);
×
NEW
304
        if (sd_a < sd_b)
×
NEW
305
            return 1;
×
NEW
306
        else
×
NEW
307
            return -1;
×
NEW
308
    }
×
309
    // The compressed strings are identical or one is the prefix of the other
NEW
310
    return B.size - A.size;
×
311
    // ^ a faster way of producing same positive / negative / zero as:
312
    // if (A.size() < B.size())
313
    //     return 1;
314
    // if (A.size() > B.size())
315
    //     return -1;
316
    // return 0;
NEW
317
}
×
318

319
int StringCompressor::compare(StringData sd, CompressedStringView& B)
320
{
1,021,116✔
321
    auto B_size = B.size;
1,021,116✔
322
    // make sure comparisons are unsigned, even though StringData does not specify signedness
323
    const unsigned char* A_ptr = reinterpret_cast<const unsigned char*>(sd.data());
1,021,116✔
324
    auto A_limit = A_ptr + sd.size();
1,021,116✔
325
    for (size_t i = 0; i < B_size; ++i) {
59,133,459✔
326
        if (A_ptr == A_limit) {
58,112,352✔
327
            // sd ended first, so B is bigger
NEW
328
            return -1;
×
NEW
329
        }
×
330
        auto code = B.data[i];
58,112,352✔
331
        if (code < 256) {
58,112,352✔
332
            if (code < *A_ptr)
209,373✔
NEW
333
                return 1;
×
334
            if (code > *A_ptr)
209,373✔
NEW
335
                return -1;
×
336
            ++A_ptr;
209,373✔
337
            continue;
209,373✔
338
        }
209,373✔
339
        auto& expansion = m_symbols[code - 256];
57,902,979✔
340
        for (size_t disp = 0; disp < expansion.expansion.size(); ++disp) {
3,188,290,695✔
341
            uint8_t c = expansion.expansion[disp];
3,130,387,725✔
342
            if (c < *A_ptr)
3,130,387,725✔
343
                return 1;
6✔
344
            if (c > *A_ptr)
3,130,387,719✔
345
                return -1;
3✔
346
            ++A_ptr;
3,130,387,716✔
347
        }
3,130,387,716✔
348
    }
57,902,979✔
349
    // if sd is longer than B, sd is the biggest string
350
    if (A_ptr < A_limit)
1,021,107✔
NEW
351
        return 1;
×
352
    return 0;
1,021,107✔
353
}
1,021,107✔
354

355

356
} // namespace realm
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc