• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

realm / realm-core / jorgen.edelbo_334

01 Jul 2024 07:22AM UTC coverage: 90.829% (-0.04%) from 90.865%
jorgen.edelbo_334

Pull #7803

Evergreen

jedelbo
Merge branch 'next-major' into feature/string-compression
Pull Request #7803: Feature/string compression

102912 of 180568 branches covered (56.99%)

1141 of 1267 new or added lines in 33 files covered. (90.06%)

172 existing lines in 24 files now uncovered.

218291 of 240332 relevant lines covered (90.83%)

7818396.4 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.64
/src/realm/string_compressor.cpp
1
/*************************************************************************
2
 *
3
 * Copyright 2016 Realm Inc.
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 **************************************************************************/
18

19
#include <realm/string_compressor.hpp>
20
#include <realm/string_interner.hpp>
21
#include <realm/string_data.hpp>
22

23
#include <iostream>
24
namespace realm {
25

26
StringCompressor::StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable)
27
    : m_data(alloc)
898,440✔
28
{
1,836,129✔
29
    m_compression_map.resize(16); // start with a very small compression map
1,836,129✔
30
    m_symbols.reserve(65536);
1,836,129✔
31
    m_data.set_parent(&parent, index);
1,836,129✔
32
    refresh(writable);
1,836,129✔
33
}
1,836,129✔
34

35
void StringCompressor::refresh(bool writable)
36
{
2,951,844✔
37
    // we assume that compressors are only created from a valid parent.
38
    // String interners in 'dead' mode should never instantiate a string compressor.
39
    if (m_data.get_ref_from_parent() == 0) {
2,951,844✔
40
        REALM_ASSERT(writable);
186,552✔
41
        m_data.create(0, 65535);
186,552✔
42
        m_data.update_parent();
186,552✔
43
    }
186,552✔
44
    else {
2,765,292✔
45
        if (m_data.is_attached())
2,765,292✔
46
            m_data.update_from_parent();
1,115,697✔
47
        else
1,649,595✔
48
            m_data.init_from_ref(m_data.get_ref_from_parent());
1,649,595✔
49
    }
2,765,292✔
50
    rebuild_internal();
2,951,844✔
51
}
2,951,844✔
52

53
static size_t symbol_pair_hash(CompressionSymbol a, CompressionSymbol b)
54
{
4,188,618,195✔
55
    // range of return value must match size of encoding table
56
    uint32_t tmp = a + 3;
4,188,618,195✔
57
    tmp *= b + 7;
4,188,618,195✔
58
    return (tmp ^ (tmp >> 16)) & 0xFFFF;
4,188,618,195✔
59
}
4,188,618,195✔
60

61
void StringCompressor::add_expansion(SymbolDef def)
62
{
80,382,150✔
63
    // compute expansion size:
64
    size_t exp_size = 0;
80,382,150✔
65
    if (def.expansion_a < 256)
80,382,150✔
66
        exp_size = 1;
27,796,188✔
67
    else
52,585,962✔
68
        exp_size = m_symbols[def.expansion_a - 256].expansion.size();
52,585,962✔
69
    if (def.expansion_b < 256)
80,382,150✔
70
        exp_size += 1;
28,560,342✔
71
    else
51,821,808✔
72
        exp_size += m_symbols[def.expansion_b - 256].expansion.size();
51,821,808✔
73
    // make sure there is room in active storage chunk:
74
    if (m_expansion_storage.size() == 0 || m_expansion_storage.back().size() + exp_size + 1 >= storage_chunk_size) {
80,382,150✔
75
        m_expansion_storage.push_back({});
692,379✔
76
        m_expansion_storage.back().reserve(storage_chunk_size);
692,379✔
77
    }
692,379✔
78
    // construct expansion at end of chunk:
79
    auto& chunk = m_expansion_storage.back();
80,382,150✔
80
    auto start_index = (uint32_t)chunk.size();
80,382,150✔
81
    if (def.expansion_a < 256)
80,382,150✔
82
        chunk.push_back((char)def.expansion_a);
27,790,797✔
83
    else
52,591,353✔
84
        chunk.append(m_symbols[def.expansion_a - 256].expansion);
52,591,353✔
85
    if (def.expansion_b < 256)
80,382,150✔
86
        chunk.push_back((char)def.expansion_b);
28,558,053✔
87
    else
51,824,097✔
88
        chunk.append(m_symbols[def.expansion_b - 256].expansion);
51,824,097✔
89
    std::string_view expansion(chunk.data() + start_index, exp_size);
80,382,150✔
90
    m_symbols.push_back({def, expansion, (uint32_t)m_expansion_storage.size() - 1, start_index});
80,382,150✔
91
}
80,382,150✔
92

93
void StringCompressor::expand_compression_map()
94
{
1,813,485✔
95
    size_t old_size = m_compression_map.size();
1,813,485✔
96
    REALM_ASSERT(old_size <= 16384);
1,813,485✔
97
    size_t new_size = 4 * old_size;
1,813,485✔
98
    std::vector<SymbolDef> map(new_size);
1,813,485✔
99
    for (size_t i = 0; i < m_compression_map.size(); ++i) {
2,583,945,123✔
100
        auto& entry = m_compression_map[i];
2,582,131,638✔
101
        if (entry.id == 0)
2,582,131,638✔
102
            continue;
2,533,084,467✔
103
        auto hash = symbol_pair_hash(entry.expansion_a, entry.expansion_b);
49,047,171✔
104
        auto new_hash = hash & (new_size - 1);
49,047,171✔
105
        REALM_ASSERT(map[new_hash].id == 0);
49,047,171✔
106
        map[new_hash] = entry;
49,047,171✔
107
    }
49,047,171✔
108
    m_compression_map.swap(map);
1,813,485✔
109
}
1,813,485✔
110

111
void StringCompressor::rebuild_internal()
112
{
2,951,769✔
113
    auto num_symbols = m_data.size();
2,951,769✔
114
    if (num_symbols == m_symbols.size())
2,951,769✔
115
        return;
2,410,692✔
116
    if (num_symbols < m_symbols.size()) {
541,077✔
117
        // fewer symbols (likely a rollback) -- remove last ones added
118
        while (num_symbols < m_symbols.size()) {
144✔
119
            auto& symbol = m_symbols.back();
132✔
120
            auto hash = symbol_pair_hash(symbol.def.expansion_a, symbol.def.expansion_b);
132✔
121
            hash &= m_compression_map.size() - 1;
132✔
122
            REALM_ASSERT(m_compression_map[hash].id == symbol.def.id);
132✔
123
            m_compression_map[hash] = {0, 0, 0};
132✔
124
            if (symbol.storage_index < m_expansion_storage.size() - 1) {
132✔
NEW
125
                m_expansion_storage.resize(symbol.storage_index + 1);
×
NEW
126
            }
×
127
            m_expansion_storage[symbol.storage_index].resize(symbol.storage_offset);
132✔
128
            m_symbols.pop_back();
132✔
129
        }
132✔
130
        return;
12✔
131
    }
12✔
132
    // we have new symbols to add
133
    for (size_t i = m_symbols.size(); i < num_symbols; ++i) {
69,696,192✔
134
        auto pair = m_data.get(i);
69,155,127✔
135
        SymbolDef def;
69,155,127✔
136
        def.id = (CompressionSymbol)(i + 256);
69,155,127✔
137
        def.expansion_a = 0xFFFF & (pair >> 16);
69,155,127✔
138
        def.expansion_b = 0xFFFF & pair;
69,155,127✔
139
        auto hash = symbol_pair_hash(def.expansion_a, def.expansion_b);
69,155,127✔
140
        while (m_compression_map[hash & (m_compression_map.size() - 1)].id) {
70,669,191✔
141
            expand_compression_map();
1,514,064✔
142
        }
1,514,064✔
143
        // REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
144
        m_compression_map[hash & (m_compression_map.size() - 1)] = def;
69,155,127✔
145
        add_expansion(def);
69,155,127✔
146
    }
69,155,127✔
147
}
541,065✔
148

149
StringCompressor::~StringCompressor() {}
1,836,144✔
150

151
CompressedString StringCompressor::compress(StringData sd, bool learn)
152
{
921,981✔
153
    CompressedString result(sd.size());
921,981✔
154
    // expand string into array of symbols
155
    const char* d = sd.data();
921,981✔
156
    const size_t limit = sd.size();
921,981✔
157
    if (limit == 0)
921,981✔
158
        return {};
17,796✔
159
    size_t i = 0;
904,185✔
160
    while (i < limit) {
1,894,216,875✔
161
        result[i++] = 0xFF & *d++;
1,893,312,690✔
162
    }
1,893,312,690✔
163
    // iteratively compress array of symbols. Each run compresses pairs into single symbols.
164
    // 6 runs give a max compression of 64x - on average it will be much less :-)
165
    constexpr int run_limit = 6;
904,185✔
166
    CompressionSymbol* to;
904,185✔
167
    for (int run = 0; run < run_limit; ++run) {
4,606,341✔
168
        CompressionSymbol* from = to = result.data();
4,442,190✔
169
        CompressionSymbol* limit = from + result.size() - 1;
4,442,190✔
170
        while (from < limit) {
4,073,628,825✔
171
            auto hash = symbol_pair_hash(from[0], from[1]);
4,069,186,635✔
172
            hash &= m_compression_map.size() - 1;
4,069,186,635✔
173
            auto& def = m_compression_map[hash];
4,069,186,635✔
174
            if (def.id) {
4,069,186,635✔
175
                // existing symbol
176
                if (def.expansion_a == from[0] && def.expansion_b == from[1]) {
4,060,094,253✔
177
                    // matching symbol
178
                    *to++ = def.id;
771,662,682✔
179
                    from += 2;
771,662,682✔
180
                }
771,662,682✔
181
                else if (m_compression_map.size() < 65536) {
3,288,431,571✔
182
                    // Conflict: some other symbol is defined here - but we can expand the compression map
183
                    // and hope to find room!
184
                    expand_compression_map();
299,424✔
185
                    // simply retry:
186
                    continue;
299,424✔
187
                }
299,424✔
188
                else {
3,288,132,147✔
189
                    // also conflict: some other symbol is defined here, we can't compress
190
                    *to++ = *from++;
3,288,132,147✔
191
                    // In a normal hash table we'd have buckets and add a translation
192
                    // to a bucket. This is slower generally, but yields better compression.
193
                }
3,288,132,147✔
194
            }
4,060,094,253✔
195
            else {
9,092,382✔
196
                // free entry we can use for new symbol (and we're learning)
197
                if (m_symbols.size() < (65536 - 256) && learn) {
13,703,964✔
198
                    // define a new symbol for this entry and use it.
199
                    REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
11,143,494✔
200
                    REALM_ASSERT_DEBUG(m_symbols.size() == m_data.size());
11,143,494✔
201
                    REALM_ASSERT_DEBUG(m_data.is_attached());
11,143,494✔
202
                    CompressionSymbol id = (CompressionSymbol)(256 + m_symbols.size());
11,143,494✔
203
                    SymbolDef def{id, from[0], from[1]};
11,143,494✔
204
                    m_compression_map[hash] = def;
11,143,494✔
205
                    add_expansion(def);
11,143,494✔
206
                    m_data.add(((uint64_t)from[0]) << 16 | from[1]);
11,143,494✔
207
                    // std::cerr << id << " = {" << from[0] << ", " << from[1] << "}" << std::endl;
208
                    *to++ = id;
11,143,494✔
209
                    from += 2;
11,143,494✔
210
                }
11,143,494✔
211
                else {
2,150,044,117✔
212
                    // no more symbol space, so can't compress
213
                    *to++ = *from++;
2,150,044,117✔
214
                }
2,150,044,117✔
215
            }
9,092,382✔
216
        }
4,069,186,635✔
217
        if (from == limit) {
4,442,190✔
218
            // copy over trailing symbol
219
            *to++ = *from++;
3,182,193✔
220
        }
3,182,193✔
221
        REALM_ASSERT_DEBUG(to > result.data());
4,442,190✔
222
        size_t sz = to - result.data();
4,442,190✔
223
        REALM_ASSERT_DEBUG(sz <= sd.size());
4,442,190✔
224
        result.resize(sz);
4,442,190✔
225
        if (from == to) // no compression took place in last iteration
4,442,190✔
226
            break;
740,034✔
227
    }
4,442,190✔
228
    return result;
904,185✔
229
}
921,981✔
230

231
std::string StringCompressor::decompress(CompressedStringView& c_str)
232
{
284,907✔
233
    CompressionSymbol* ptr = c_str.data;
284,907✔
234
    CompressionSymbol* limit = ptr + c_str.size;
284,907✔
235
    // compute size of decompressed string first to avoid allocations as string grows
236
    size_t result_size = 0;
284,907✔
237
    while (ptr < limit) {
45,217,149✔
238
        if (*ptr < 256)
44,932,242✔
239
            result_size += 1;
15,213,909✔
240
        else
29,718,333✔
241
            result_size += m_symbols[*ptr - 256].expansion.size();
29,718,333✔
242
        ++ptr;
44,932,242✔
243
    }
44,932,242✔
244
    std::string result2;
284,907✔
245
    result2.reserve(result_size);
284,907✔
246
    // generate result
247
    ptr = c_str.data;
284,907✔
248
    while (ptr < limit) {
45,217,146✔
249
        if (*ptr < 256)
44,932,239✔
250
            result2.push_back((char)*ptr);
15,213,909✔
251
        else
29,718,330✔
252
            result2.append(m_symbols[*ptr - 256].expansion);
29,718,330✔
253
        ptr++;
44,932,239✔
254
    }
44,932,239✔
255
#ifdef REALM_DEBUG
284,907✔
256
    std::string result;
284,907✔
257
    {
284,907✔
258
        auto decompress = [&](CompressionSymbol symbol, auto& decompress) -> void {
170,187,036✔
259
            if (symbol < 256) {
170,187,036✔
260
                result.push_back((char)symbol);
107,559,681✔
261
            }
107,559,681✔
262
            else {
62,627,355✔
263
                auto& s = m_symbols[symbol - 256];
62,627,355✔
264
                decompress(s.def.expansion_a, decompress);
62,627,355✔
265
                decompress(s.def.expansion_b, decompress);
62,627,355✔
266
            }
62,627,355✔
267
        };
170,187,036✔
268

269
        CompressionSymbol* ptr = c_str.data;
284,907✔
270
        CompressionSymbol* limit = ptr + c_str.size;
284,907✔
271
        while (ptr < limit) {
45,217,149✔
272
            decompress(*ptr, decompress);
44,932,242✔
273
            ++ptr;
44,932,242✔
274
        }
44,932,242✔
275
    }
284,907✔
276
    REALM_ASSERT_DEBUG(result == result2);
284,907✔
277
#endif
284,907✔
278
    return result2;
284,907✔
279
}
284,907✔
280

281
int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B)
NEW
282
{
×
NEW
283
    auto A_ptr = A.data;
×
NEW
284
    auto A_limit = A_ptr + A.size;
×
NEW
285
    auto B_ptr = B.data;
×
NEW
286
    auto B_limit = B_ptr + B.size;
×
NEW
287
    while (A_ptr < A_limit && B_ptr < B_limit) {
×
NEW
288
        auto code_A = *A_ptr++;
×
NEW
289
        auto code_B = *B_ptr++;
×
NEW
290
        if (code_A == code_B)
×
NEW
291
            continue;
×
292
        // symbols did not match:
293
        // 1. both symbols are single characters
NEW
294
        if (code_A < 256 && code_B < 256)
×
NEW
295
            return code_B - code_A;
×
NEW
296
        std::string a_str(code_A, 1);
×
NEW
297
        auto str_A = std::string_view(code_A < 256 ? a_str : m_symbols[code_A - 256].expansion);
×
NEW
298
        std::string b_str(code_B, 1);
×
NEW
299
        auto str_B = std::string_view(code_B < 256 ? b_str : m_symbols[code_B - 256].expansion);
×
300
        // to ensure comparison as StringData we need to convert the stringviews
NEW
301
        StringData sd_a(str_A.data(), str_A.size());
×
NEW
302
        StringData sd_b(str_B.data(), str_B.size());
×
NEW
303
        REALM_ASSERT_DEBUG(sd_a != sd_b);
×
NEW
304
        if (sd_a < sd_b)
×
NEW
305
            return 1;
×
NEW
306
        else
×
NEW
307
            return -1;
×
NEW
308
    }
×
309
    // The compressed strings are identical or one is the prefix of the other
NEW
310
    return B.size - A.size;
×
311
    // ^ a faster way of producing same positive / negative / zero as:
312
    // if (A.size() < B.size())
313
    //     return 1;
314
    // if (A.size() > B.size())
315
    //     return -1;
316
    // return 0;
NEW
317
}
×
318

319
int StringCompressor::compare(StringData sd, CompressedStringView& B)
320
{
1,030,128✔
321
    auto B_size = B.size;
1,030,128✔
322
    // make sure comparisons are unsigned, even though StringData does not specify signedness
323
    const unsigned char* A_ptr = reinterpret_cast<const unsigned char*>(sd.data());
1,030,128✔
324
    auto A_limit = A_ptr + sd.size();
1,030,128✔
325
    for (size_t i = 0; i < B_size; ++i) {
59,317,434✔
326
        if (A_ptr == A_limit) {
58,287,315✔
327
            // sd ended first, so B is bigger
NEW
328
            return -1;
×
NEW
329
        }
×
330
        auto code = B.data[i];
58,287,315✔
331
        if (code < 256) {
58,287,315✔
332
            if (code < *A_ptr)
243,969✔
NEW
333
                return 1;
×
334
            if (code > *A_ptr)
243,969✔
NEW
335
                return -1;
×
336
            ++A_ptr;
243,969✔
337
            continue;
243,969✔
338
        }
243,969✔
339
        auto& expansion = m_symbols[code - 256];
58,043,346✔
340
        for (size_t disp = 0; disp < expansion.expansion.size(); ++disp) {
3,190,848,285✔
341
            uint8_t c = expansion.expansion[disp];
3,132,804,948✔
342
            if (c < *A_ptr)
3,132,804,948✔
343
                return 1;
6✔
344
            if (c > *A_ptr)
3,132,804,942✔
345
                return -1;
3✔
346
            ++A_ptr;
3,132,804,939✔
347
        }
3,132,804,939✔
348
    }
58,043,346✔
349
    // if sd is longer than B, sd is the biggest string
350
    if (A_ptr < A_limit)
1,030,119✔
NEW
351
        return 1;
×
352
    return 0;
1,030,119✔
353
}
1,030,119✔
354

355

356
} // namespace realm
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc