tarantool / tarantool — build 9054 (push, travis-ci), pending completion

kostja — test: update results
Update test results (changes not committed in scope of gh-2663).

Build totals: 30513 of 35273 relevant lines covered (86.51%), 1132512.44 hits per line

Source File
/src/box/vy_range.c — 90.37% of relevant lines covered

/*
 * Copyright 2010-2017, Tarantool AUTHORS, please see AUTHORS file.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * 1. Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the
 *    following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "vy_range.h"

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RB_COMPACT 1
#include <small/rb.h>
#include <small/rlist.h>

#include "diag.h"
#include "iterator_type.h"
#include "key_def.h"
#include "trivia/util.h"
#include "tuple.h"
#include "tuple_compare.h"
#include "vy_run.h"
#include "vy_stat.h"
#include "vy_stmt.h"

int
vy_range_tree_cmp(struct vy_range *range_a, struct vy_range *range_b)
{
        if (range_a == range_b)
                return 0;

        /* Any key > -inf. */
        if (range_a->begin == NULL)
                return -1;
        if (range_b->begin == NULL)
                return 1;

        assert(range_a->cmp_def == range_b->cmp_def);
        return vy_key_compare(range_a->begin, range_b->begin,
                              range_a->cmp_def);
}

int
vy_range_tree_key_cmp(const struct tuple *stmt, struct vy_range *range)
{
        /* Any key > -inf. */
        if (range->begin == NULL)
                return 1;
        return vy_stmt_compare_with_key(stmt, range->begin, range->cmp_def);
}

struct vy_range *
vy_range_tree_find_by_key(vy_range_tree_t *tree,
                          enum iterator_type iterator_type,
                          const struct tuple *key)
{
        uint32_t key_field_count = tuple_field_count(key);
        if (key_field_count == 0) {
                switch (iterator_type) {
                case ITER_LT:
                case ITER_LE:
                        return vy_range_tree_last(tree);
                case ITER_GT:
                case ITER_GE:
                case ITER_EQ:
                        return vy_range_tree_first(tree);
                default:
                        unreachable();
                        return NULL;
                }
        }
        struct vy_range *range;
        if (iterator_type == ITER_GE || iterator_type == ITER_GT ||
            iterator_type == ITER_EQ) {
                /**
                 * Case 1. part_count == 1, looking for [10]. ranges:
                 * {1, 3, 5} {7, 8, 9} {10, 15 20} {22, 32, 42}
                 *                      ^looking for this
                 * Case 2. part_count == 1, looking for [10]. ranges:
                 * {1, 2, 4} {5, 6, 7, 8} {50, 100, 200}
                 *            ^looking for this
                 * Case 3. part_count == 2, looking for [10]. ranges:
                 * {[1, 2], [2, 3]} {[9, 1], [10, 1], [10 2], [11 3]} {[12,..}
                 *                   ^looking for this
                 * Case 4. part_count == 2, looking for [10]. ranges:
                 * {[1, 2], [10, 1]} {[10, 2] [10 3] [11 3]} {[12, 1]..}
                 *  ^looking for this
                 * Case 5. part_count does not matter, looking for [10].
                 * ranges:
                 * {100, 200}, {300, 400}
                 * ^looking for this
                 */
                /**
                 * vy_range_tree_psearch finds the least range with
                 * begin == key, or the previous one if an equal range
                 * was not found.
                 */
                range = vy_range_tree_psearch(tree, key);
                /* switch to previous for case (4) */
                if (range != NULL && range->begin != NULL &&
                    key_field_count < range->cmp_def->part_count &&
                    vy_stmt_compare_with_key(key, range->begin,
                                             range->cmp_def) == 0)
                        range = vy_range_tree_prev(tree, range);
                /* for case 5 or subcase of case 4 */
                if (range == NULL)
                        range = vy_range_tree_first(tree);
        } else {
                assert(iterator_type == ITER_LT || iterator_type == ITER_LE);
                /**
                 * Case 1. part_count == 1, looking for [10]. ranges:
                 * {1, 3, 5} {7, 8, 9} {10, 15 20} {22, 32, 42}
                 *                      ^looking for this
                 * Case 2. part_count == 1, looking for [10]. ranges:
                 * {1, 2, 4} {5, 6, 7, 8} {50, 100, 200}
                 *            ^looking for this
                 * Case 3. part_count == 2, looking for [10]. ranges:
                 * {[1, 2], [2, 3]} {[9, 1], [10, 1], [10 2], [11 3]} {[12,..}
                 *                   ^looking for this
                 * Case 4. part_count == 2, looking for [10]. ranges:
                 * {[1, 2], [10, 1]} {[10, 2] [10 3] [11 3]} {[12, 1]..}
                 *                    ^looking for this
                 * Case 5. part_count does not matter, looking for [10].
                 * ranges:
                 * {1, 2}, {3, 4, ..}
                 *          ^looking for this
                 */
                /**
                 * vy_range_tree_nsearch finds the greatest range with
                 * begin == key, or the next one if an equal range was
                 * not found.
                 */
                range = vy_range_tree_nsearch(tree, key);
                if (range != NULL) {
                        /* fix curr_range for cases 2 and 3 */
                        if (range->begin != NULL &&
                            vy_stmt_compare_with_key(key, range->begin,
                                                     range->cmp_def) != 0) {
                                struct vy_range *prev;
                                prev = vy_range_tree_prev(tree, range);
                                if (prev != NULL)
                                        range = prev;
                        }
                } else {
                        /* Case 5 */
                        range = vy_range_tree_last(tree);
                }
        }
        return range;
}

struct vy_range *
vy_range_new(int64_t id, struct tuple *begin, struct tuple *end,
             const struct key_def *cmp_def)
{
        struct vy_range *range = calloc(1, sizeof(*range));
        if (range == NULL) {
                diag_set(OutOfMemory, sizeof(*range),
                         "malloc", "struct vy_range");
                return NULL;
        }
        range->id = id;
        if (begin != NULL) {
                tuple_ref(begin);
                range->begin = begin;
        }
        if (end != NULL) {
                tuple_ref(end);
                range->end = end;
        }
        range->cmp_def = cmp_def;
        rlist_create(&range->slices);
        range->heap_node.pos = UINT32_MAX;
        return range;
}

void
vy_range_delete(struct vy_range *range)
{
        if (range->begin != NULL)
                tuple_unref(range->begin);
        if (range->end != NULL)
                tuple_unref(range->end);

        struct vy_slice *slice, *next_slice;
        rlist_foreach_entry_safe(slice, &range->slices, in_range, next_slice)
                vy_slice_delete(slice);

        TRASH(range);
        free(range);
}

int
vy_range_snprint(char *buf, int size, const struct vy_range *range)
{
        int total = 0;
        SNPRINT(total, snprintf, buf, size, "(");
        if (range->begin != NULL)
                SNPRINT(total, vy_key_snprint, buf, size,
                        tuple_data(range->begin));
        else
                SNPRINT(total, snprintf, buf, size, "-inf");
        SNPRINT(total, snprintf, buf, size, "..");
        if (range->end != NULL)
                SNPRINT(total, vy_key_snprint, buf, size,
                        tuple_data(range->end));
        else
                SNPRINT(total, snprintf, buf, size, "inf");
        SNPRINT(total, snprintf, buf, size, ")");
        return total;
}

void
vy_range_add_slice(struct vy_range *range, struct vy_slice *slice)
{
        rlist_add_entry(&range->slices, slice, in_range);
        range->slice_count++;
        vy_disk_stmt_counter_add(&range->count, &slice->count);
}

void
vy_range_add_slice_before(struct vy_range *range, struct vy_slice *slice,
                          struct vy_slice *next_slice)
{
        rlist_add_tail(&next_slice->in_range, &slice->in_range);
        range->slice_count++;
        vy_disk_stmt_counter_add(&range->count, &slice->count);
}

void
vy_range_remove_slice(struct vy_range *range, struct vy_slice *slice)
{
        assert(range->slice_count > 0);
        assert(!rlist_empty(&range->slices));
        rlist_del_entry(slice, in_range);
        range->slice_count--;
        vy_disk_stmt_counter_sub(&range->count, &slice->count);
}

/**
 * To reduce write amplification caused by compaction, we follow
 * the LSM tree design. Runs in each range are divided into groups
 * called levels:
 *
 *   level 1: runs 1 .. L_1
 *   level 2: runs L_1 + 1 .. L_2
 *   ...
 *   level N: runs L_{N-1} .. L_N
 *
 * where L_N is the total number of runs, N is the total number of
 * levels, and older runs have greater numbers. Runs at each
 * subsequent level are run_size_ratio times larger than on the
 * previous one. When the number of runs at a level exceeds
 * run_count_per_level, we compact all its runs along with all runs
 * from the upper levels and in-memory indexes. Including previous
 * levels into compaction is relatively cheap, because of the level
 * size ratio.
 *
 * Given a range, this function computes the maximal level that needs
 * to be compacted and sets @compact_priority to the number of runs in
 * this level and all preceding levels.
 */
void
vy_range_update_compact_priority(struct vy_range *range,
                                 const struct index_opts *opts)
{
        assert(opts->run_count_per_level > 0);
        assert(opts->run_size_ratio > 1);

        range->compact_priority = 0;

        /* Total number of checked runs. */
        uint32_t total_run_count = 0;
        /* The total size of runs checked so far. */
        uint64_t total_size = 0;
        /* Estimated size of a compacted run, if compaction is scheduled. */
        uint64_t est_new_run_size = 0;
        /* The number of runs at the current level. */
        uint32_t level_run_count = 0;
        /*
         * The target (perfect) size of a run at the current level.
         * For the first level, it's the size of the newest run.
         * For lower levels it's computed as first level run size
         * times run_size_ratio.
         */
        uint64_t target_run_size = 0;

        struct vy_slice *slice;
        rlist_foreach_entry(slice, &range->slices, in_range) {
                uint64_t size = slice->count.bytes_compressed;
                /*
                 * The size of the first level is defined by
                 * the size of the most recent run.
                 */
                if (target_run_size == 0)
                        target_run_size = size;
                total_size += size;
                level_run_count++;
                total_run_count++;
                while (size > target_run_size) {
                        /*
                         * The run size exceeds the threshold
                         * set for the current level. Move this
                         * run down to a lower level. Switch the
                         * current level and reset the level run
                         * count.
                         */
                        level_run_count = 1;
                        /*
                         * If we have already scheduled
                         * a compaction of an upper level, and
                         * estimated compacted run will end up at
                         * this level, include the new run into
                         * this level right away to avoid
                         * a cascading compaction.
                         */
                        if (est_new_run_size > target_run_size)
                                level_run_count++;
                        /*
                         * Calculate the target run size for this
                         * level.
                         */
                        target_run_size *= opts->run_size_ratio;
                        /*
                         * Keep pushing the run down until
                         * we find an appropriate level for it.
                         */
                }
                if (level_run_count > opts->run_count_per_level) {
                        /*
                         * The number of runs at the current level
                         * exceeds the configured maximum. Arrange
                         * for compaction. We compact all runs at
                         * this level and upper levels.
                         */
                        range->compact_priority = total_run_count;
                        est_new_run_size = total_size;
                }
        }
}
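
/*
 * Worked example for vy_range_update_compact_priority (editor's
 * illustration, not part of the original source): assume
 * run_count_per_level = 2, run_size_ratio = 3.5 and four slices of
 * 100, 100, 100 and 1000 units, newest first. The three 100-unit
 * runs all land on the first level, which then holds three runs and
 * exceeds the limit of two, so compact_priority becomes 3. Their
 * estimated compacted output (300 units) still belongs to a level
 * above the 1000-unit run, so the oldest run is not pulled into the
 * compaction. A runnable model of this computation is given at the
 * end of this page.
 */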

/**
 * Return true and set split_key accordingly if the range needs to be
 * split in two.
 *
 * - We should never split a range until it was merged at least once
 *   (actually, it should be a function of run_count_per_level/number
 *   of runs used for the merge: with low run_count_per_level it's more
 *   than once, with high run_count_per_level it's once).
 * - We should use the last run size as the size of the range.
 * - We should split around the last run middle key.
 * - We should only split if the last run size is greater than
 *   4/3 * range_size.
 */
bool
vy_range_needs_split(struct vy_range *range, const struct index_opts *opts,
                     const char **p_split_key)
{
        struct vy_slice *slice;

        /* The range hasn't been merged yet - too early to split it. */
        if (range->n_compactions < 1)
                return false;

        /* Find the oldest run. */
        assert(!rlist_empty(&range->slices));
        slice = rlist_last_entry(&range->slices, struct vy_slice, in_range);

        /* The range is too small to be split. */
        if (slice->count.bytes_compressed < opts->range_size * 4 / 3)
                return false;

        /* Find the median key in the oldest run (approximately). */
        struct vy_page_info *mid_page;
        mid_page = vy_run_page_info(slice->run, slice->first_page_no +
                                    (slice->last_page_no -
                                     slice->first_page_no) / 2);

        struct vy_page_info *first_page = vy_run_page_info(slice->run,
                                                slice->first_page_no);

        /* No point in splitting if a new range is going to be empty. */
        if (key_compare(first_page->min_key, mid_page->min_key,
                        range->cmp_def) == 0)
                return false;
        /*
         * In extreme cases the median key can be < the beginning
         * of the slice, e.g.
         *
         * RUN:
         * ... |---- page N ----|-- page N + 1 --|-- page N + 2 --
         *     | min_key = [10] | min_key = [50] | min_key = [100]
         *
         * SLICE:
         * begin = [30], end = [70]
         * first_page_no = N, last_page_no = N + 1
         *
         * which makes mid_page_no = N and mid_page->min_key = [10].
         *
         * In such cases there's no point in splitting the range.
         */
        if (slice->begin != NULL && key_compare(mid_page->min_key,
                        tuple_data(slice->begin), range->cmp_def) <= 0)
                return false;
        /*
         * The median key can't be >= the end of the slice as we
         * take the min key of a page for the median key.
         */
        assert(slice->end == NULL || key_compare(mid_page->min_key,
                        tuple_data(slice->end), range->cmp_def) < 0);

        *p_split_key = mid_page->min_key;
        return true;
}
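
/*
 * Editor's note (illustrative numbers, not from the source): with
 * range_size = 1 GiB, the check above requires the oldest slice to
 * hold more than 4/3 GiB (about 1.33 GiB) of compressed data before
 * a split is considered; the split key is then the minimal key of
 * the slice's middle page.
 */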

/**
 * Check if a range should be coalesced with one or more of its
 * neighbors. If it should, return true and set @p_first and @p_last
 * to the first and last ranges to coalesce, otherwise return false.
 *
 * We coalesce ranges together when they become too small, less than
 * half the target range size, to avoid split-coalesce oscillations.
 */
bool
vy_range_needs_coalesce(struct vy_range *range, vy_range_tree_t *tree,
                        const struct index_opts *opts,
                        struct vy_range **p_first, struct vy_range **p_last)
{
        struct vy_range *it;

        /* Size of the coalesced range. */
        uint64_t total_size = range->count.bytes_compressed;
        /* Coalesce ranges until total_size > max_size. */
        uint64_t max_size = opts->range_size / 2;

        /*
         * We can't coalesce a range that was scheduled for dump
         * or compaction, because it is about to be processed by
         * a worker thread.
         */
        assert(!vy_range_is_scheduled(range));

        *p_first = *p_last = range;
        for (it = vy_range_tree_next(tree, range);
             it != NULL && !vy_range_is_scheduled(it);
             it = vy_range_tree_next(tree, it)) {
                uint64_t size = it->count.bytes_compressed;
                if (total_size + size > max_size)
                        break;
                total_size += size;
                *p_last = it;
        }
        for (it = vy_range_tree_prev(tree, range);
             it != NULL && !vy_range_is_scheduled(it);
             it = vy_range_tree_prev(tree, it)) {
                uint64_t size = it->count.bytes_compressed;
                if (total_size + size > max_size)
                        break;
                total_size += size;
                *p_first = it;
        }
        return *p_first != *p_last;
}
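
/*
 * Editor's note (illustrative numbers, not from the source): with
 * range_size = 1 GiB the coalesce budget is 512 MiB. A 200 MiB range
 * absorbs a 200 MiB neighbor (total 400 MiB), but a second 200 MiB
 * neighbor would push the total to 600 MiB, so the scan stops there
 * and only the first two ranges are coalesced on that side.
 */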

void
vy_range_iterator_open(struct vy_range_iterator *itr, vy_range_tree_t *tree,
                       enum iterator_type iterator_type,
                       const struct tuple *key)
{
        itr->tree = tree;
        itr->iterator_type = iterator_type;
        itr->key = key;
        itr->curr_range = NULL;
}

void
vy_range_iterator_next(struct vy_range_iterator *itr, struct vy_range **result)
{
        struct vy_range *curr = itr->curr_range;
        struct vy_range *next;

        if (curr == NULL) {
                /* First iteration */
                next = vy_range_tree_find_by_key(itr->tree, itr->iterator_type,
                                                 itr->key);
                goto out;
        }
        switch (itr->iterator_type) {
        case ITER_LT:
        case ITER_LE:
                next = vy_range_tree_prev(itr->tree, curr);
                break;
        case ITER_GT:
        case ITER_GE:
                next = vy_range_tree_next(itr->tree, curr);
                break;
        case ITER_EQ:
                if (curr->end != NULL &&
                    vy_stmt_compare_with_key(itr->key, curr->end,
                                             curr->cmp_def) >= 0) {
                        /* A partial key can be found in more than one range. */
                        next = vy_range_tree_next(itr->tree, curr);
                } else {
                        next = NULL;
                }
                break;
        default:
                unreachable();
        }
out:
        *result = itr->curr_range = next;
}

void
vy_range_iterator_restore(struct vy_range_iterator *itr,
                          const struct tuple *last_stmt,
                          struct vy_range **result)
{
        struct vy_range *curr = vy_range_tree_find_by_key(itr->tree,
                                itr->iterator_type,
                                last_stmt != NULL ? last_stmt : itr->key);
        *result = itr->curr_range = curr;
}
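
The compaction heuristic in vy_range_update_compact_priority is easier to follow with concrete numbers. The standalone sketch below mirrors the level-assignment loop from the file above using hypothetical slice sizes and index options (run_count_per_level = 2, run_size_ratio = 3.5, slice sizes 100/100/100/1000, newest first); it is an editor's model of the algorithm described in the comments, not code from the Tarantool tree, and it reproduces the worked example noted after that function.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        /* Hypothetical compressed slice sizes, newest first. */
        const uint64_t run_sizes[] = {100, 100, 100, 1000};
        const int run_count = sizeof(run_sizes) / sizeof(run_sizes[0]);
        /* Hypothetical index options. */
        const uint32_t run_count_per_level = 2;
        const double run_size_ratio = 3.5;

        uint32_t compact_priority = 0;
        uint32_t total_run_count = 0;
        uint64_t total_size = 0;
        uint64_t est_new_run_size = 0;
        uint32_t level_run_count = 0;
        uint64_t target_run_size = 0;

        for (int i = 0; i < run_count; i++) {
                uint64_t size = run_sizes[i];
                /* The first level is sized after the newest run. */
                if (target_run_size == 0)
                        target_run_size = size;
                total_size += size;
                level_run_count++;
                total_run_count++;
                /* Push an oversized run down to a deeper level. */
                while (size > target_run_size) {
                        level_run_count = 1;
                        if (est_new_run_size > target_run_size)
                                level_run_count++;
                        target_run_size *= run_size_ratio;
                }
                /* Too many runs on one level: schedule compaction. */
                if (level_run_count > run_count_per_level) {
                        compact_priority = total_run_count;
                        est_new_run_size = total_size;
                }
        }
        /* Prints: compact_priority = 3, est_new_run_size = 300. */
        printf("compact_priority = %" PRIu32 ", est_new_run_size = %" PRIu64 "\n",
               compact_priority, est_new_run_size);
        return 0;
}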