
tarantool / tarantool / 12397458527

18 Dec 2024 04:44PM UTC · coverage: 87.367% (remained the same)

Build 12397458527 (push, via GitHub) · committed by Totktonada
config: fix on_event roles callback config argument

Previously, the whole config object was passed to the `on_event` callback
instead of the corresponding role's config. Let's fix it by implementing
the intended behavior.

NO_DOC=bugfix

Closes #10934

(cherry picked from commit d5d2291ae)
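To illustrate what this means for role authors, here is a minimal sketch of a custom role in Lua. The role name greeter, its greeting option, and the event handling are illustrative assumptions rather than code from this commit; the callback shape follows the documented roles interface (validate/apply/stop plus the optional `on_event`). With this fix, the first argument of `on_event` is the role's own section of roles_cfg instead of the whole config object.

local log = require('log')

-- Sketch of roles/greeter.lua (hypothetical role). After the fix, `cfg`
-- below is this role's own configuration, not the global config object.
return {
    validate = function(cfg)
        assert(cfg == nil or type(cfg) == 'table')
    end,
    apply = function(cfg)
        log.info('greeter: applied, greeting = %s',
                 (cfg and cfg.greeting) or 'hello')
    end,
    stop = function() end,
    on_event = function(cfg, event, value)
        -- event is the event name (e.g. 'config.apply'); value is its payload.
        log.info('greeter: got %s, greeting = %s', event,
                 (cfg and cfg.greeting) or 'hello')
    end,
}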

69673 of 123524 branches covered (56.4%)

102617 of 117455 relevant lines covered (87.37%)

2871961.47 hits per line

Source File

/src/box/vy_regulator.c (88.14% covered)
/*
 * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * 1. Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the
 *    following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "vy_regulator.h"

#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <tarantool_ev.h>

#include "fiber.h"
#include "histogram.h"
#include "say.h"
#include "trivia/util.h"

#include "vy_quota.h"
#include "vy_stat.h"

/**
 * Regulator timer period, in seconds.
 */
static const double VY_REGULATOR_TIMER_PERIOD = 1;

/**
 * Time window over which the write rate is averaged,
 * in seconds.
 */
static const double VY_WRITE_RATE_AVG_WIN = 5;

/**
 * Histogram percentile used for estimating dump bandwidth.
 * For details see the comment to vy_regulator::dump_bandwidth_hist.
 */
static const int VY_DUMP_BANDWIDTH_PCT = 10;

/*
 * Until we dump anything, assume bandwidth to be 10 MB/s,
 * which should be fine for initial guess.
 */
static const size_t VY_DUMP_BANDWIDTH_DEFAULT = 10 * 1024 * 1024;

/**
 * Do not take into account small dumps when estimating dump
 * bandwidth, because they have too high overhead associated
 * with file creation.
 */
static const size_t VY_DUMP_SIZE_ACCT_MIN = 1024 * 1024;

/**
 * Number of dumps to take into account for rate limit calculation.
 * Shouldn't be too small to avoid uneven RPS. Shouldn't be too big
 * either - otherwise the rate limit will adapt too slowly to workload
 * changes. 100 feels like a good choice.
 */
static const int VY_RECENT_DUMP_COUNT = 100;

static void
vy_regulator_trigger_dump(struct vy_regulator *regulator)
{
        if (regulator->dump_in_progress)
                return;

        if (regulator->trigger_dump_cb(regulator) != 0)
                return;

        regulator->dump_in_progress = true;

        /*
         * To avoid unpredictably long stalls, we must limit
         * the write rate when a dump is in progress so that
         * we don't hit the hard limit before the dump has
         * completed, i.e.
         *
         *    mem_left        mem_used
         *   ---------- >= --------------
         *   write_rate    dump_bandwidth
         */
        struct vy_quota *quota = regulator->quota;
        size_t mem_left = (quota->used < quota->limit ?
                           quota->limit - quota->used : 0);
        size_t mem_used = quota->used;
        size_t max_write_rate = (double)mem_left / (mem_used + 1) *
                                        regulator->dump_bandwidth;
        max_write_rate = MIN(max_write_rate, regulator->dump_bandwidth);
        vy_quota_set_rate_limit(quota, VY_QUOTA_RESOURCE_MEMORY,
                                max_write_rate);

        say_info("dumping %zu bytes, expected rate %.1f MB/s, "
                 "ETA %.1f s, write rate (avg/max) %.1f/%.1f MB/s",
                 quota->used, (double)regulator->dump_bandwidth / 1024 / 1024,
                 (double)quota->used / (regulator->dump_bandwidth + 1),
                 (double)regulator->write_rate / 1024 / 1024,
                 (double)regulator->write_rate_max / 1024 / 1024);

        regulator->write_rate_max = regulator->write_rate;
}
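
/*
 * A worked example of the throttling formula in
 * vy_regulator_trigger_dump() above, with hypothetical numbers: if
 * quota->limit = 1024 MB, quota->used = 768 MB and dump_bandwidth =
 * 100 MB/s, then mem_left = 256 MB, the dump ETA is ~7.7 s, and the
 * write rate is capped at roughly 256 / 768 * 100 = ~33 MB/s, so the
 * remaining quota cannot be exhausted before the dump completes.
 */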

static void
vy_regulator_update_write_rate(struct vy_regulator *regulator)
{
        size_t used_curr = regulator->quota->used;
        size_t used_last = regulator->quota_used_last;

        /*
         * Memory can be dumped between two subsequent timer
         * callback invocations, in which case memory usage
         * will decrease. Ignore such observations - it's not
         * a big deal, because dump is a rare event.
         */
        if (used_curr < used_last) {
                regulator->quota_used_last = used_curr;
                return;
        }

        size_t rate_avg = regulator->write_rate;
        size_t rate_curr = (used_curr - used_last) / VY_REGULATOR_TIMER_PERIOD;

        double weight = 1 - exp(-VY_REGULATOR_TIMER_PERIOD /
                                VY_WRITE_RATE_AVG_WIN);
        rate_avg = (1 - weight) * rate_avg + weight * rate_curr;

        regulator->write_rate = rate_avg;
        if (regulator->write_rate_max < rate_curr)
                regulator->write_rate_max = rate_curr;
        regulator->quota_used_last = used_curr;
}
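
/*
 * A worked example of the averaging in vy_regulator_update_write_rate()
 * above, with hypothetical numbers: with a 1 s timer period and a 5 s
 * averaging window, weight = 1 - exp(-1 / 5) ~= 0.18; if the previous
 * average was 50 MB/s and 100 MB were written since the last tick, the
 * new average is roughly 0.82 * 50 + 0.18 * 100 ~= 59 MB/s.
 */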

static void
vy_regulator_update_dump_watermark(struct vy_regulator *regulator)
{
        struct vy_quota *quota = regulator->quota;

        /*
         * Due to log structured nature of the lsregion allocator,
         * which is used for allocating statements, we cannot free
         * memory in chunks, only all at once. Therefore we should
         * configure the watermark so that by the time we hit the
         * limit, all memory has been dumped, i.e.
         *
         *   limit - watermark      watermark
         *   ----------------- = --------------
         *       write_rate      dump_bandwidth
         *
         * Be pessimistic when predicting the write rate - use the
         * max observed write rate multiplied by 1.5 - because it's
         * better to start memory dump early than delay it as long
         * as possible at the risk of experiencing unpredictably
         * long stalls.
         */
        size_t write_rate = regulator->write_rate_max * 3 / 2;
        regulator->dump_watermark =
                        (double)quota->limit * regulator->dump_bandwidth /
                        (regulator->dump_bandwidth + write_rate + 1);
        /*
         * It doesn't make sense to set the watermark below 50%
         * of the memory limit because the write rate cannot exceed
         * the dump bandwidth under any circumstances.
         */
        regulator->dump_watermark = MAX(regulator->dump_watermark,
                                        quota->limit / 2);
}
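
/*
 * A worked example of the watermark formula in
 * vy_regulator_update_dump_watermark() above, with hypothetical
 * numbers: with quota->limit = 1024 MB, dump_bandwidth = 100 MB/s and
 * a maximum observed write rate of 40 MB/s (60 MB/s after the 1.5x
 * pessimistic adjustment), the watermark is about
 * 1024 * 100 / 160 = 640 MB, comfortably above the 512 MB (50%) floor.
 */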

static void
vy_regulator_timer_cb(ev_loop *loop, ev_timer *timer, int events)
{
        (void)loop;
        (void)events;

        struct vy_regulator *regulator = timer->data;

        vy_regulator_update_write_rate(regulator);
        vy_regulator_update_dump_watermark(regulator);
        vy_regulator_check_dump_watermark(regulator);
}

void
vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota,
                    vy_trigger_dump_f trigger_dump_cb)
{
        enum { KB = 1024, MB = KB * KB };
        static int64_t dump_bandwidth_buckets[] = {
                100 * KB, 200 * KB, 300 * KB, 400 * KB, 500 * KB, 600 * KB,
                700 * KB, 800 * KB, 900 * KB,   1 * MB,   2 * MB,   3 * MB,
                  4 * MB,   5 * MB,   6 * MB,   7 * MB,   8 * MB,   9 * MB,
                 10 * MB,  15 * MB,  20 * MB,  25 * MB,  30 * MB,  35 * MB,
                 40 * MB,  45 * MB,  50 * MB,  55 * MB,  60 * MB,  65 * MB,
                 70 * MB,  75 * MB,  80 * MB,  85 * MB,  90 * MB,  95 * MB,
                100 * MB, 200 * MB, 300 * MB, 400 * MB, 500 * MB, 600 * MB,
                700 * MB, 800 * MB, 900 * MB,
        };
        memset(regulator, 0, sizeof(*regulator));
        regulator->dump_bandwidth_hist = histogram_new(dump_bandwidth_buckets,
                                        lengthof(dump_bandwidth_buckets));
        if (regulator->dump_bandwidth_hist == NULL)
                panic("failed to allocate dump bandwidth histogram");

        regulator->quota = quota;
        regulator->trigger_dump_cb = trigger_dump_cb;
        ev_timer_init(&regulator->timer, vy_regulator_timer_cb, 0,
                      VY_REGULATOR_TIMER_PERIOD);
        regulator->timer.data = regulator;
        regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
        regulator->dump_watermark = SIZE_MAX;
}

void
vy_regulator_start(struct vy_regulator *regulator)
{
        regulator->quota_used_last = regulator->quota->used;
        vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
                                regulator->dump_bandwidth);
        ev_timer_start(loop(), &regulator->timer);
}

void
vy_regulator_destroy(struct vy_regulator *regulator)
{
        ev_timer_stop(loop(), &regulator->timer);
        histogram_delete(regulator->dump_bandwidth_hist);
}

void
vy_regulator_quota_exceeded(struct vy_regulator *regulator)
{
        vy_regulator_trigger_dump(regulator);
}

void
vy_regulator_check_dump_watermark(struct vy_regulator *regulator)
{
        if (regulator->quota->used >= regulator->dump_watermark)
                vy_regulator_trigger_dump(regulator);
}

void
vy_regulator_dump_complete(struct vy_regulator *regulator,
                           size_t mem_dumped, double dump_duration)
{
        regulator->dump_in_progress = false;

        if (mem_dumped >= VY_DUMP_SIZE_ACCT_MIN && dump_duration > 0) {
                histogram_collect(regulator->dump_bandwidth_hist,
                                  mem_dumped / dump_duration);
                /*
                 * To avoid unpredictably long stalls caused by
                 * mispredicting dump time duration, we need to
                 * know the worst (smallest) dump bandwidth so
                 * use a lower-bound percentile estimate.
                 */
                regulator->dump_bandwidth = histogram_percentile_lower(
                        regulator->dump_bandwidth_hist, VY_DUMP_BANDWIDTH_PCT);
        }

        /*
         * Reset the rate limit.
         *
         * It doesn't make sense to allow memory to be consumed at
         * a higher rate than it can be dumped, so we set the rate
         * limit to the dump bandwidth rather than disabling it
         * completely.
         */
        vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
                                regulator->dump_bandwidth);

        if (dump_duration > 0) {
                say_info("dumped %zu bytes in %.1f s, rate %.1f MB/s",
                         mem_dumped, dump_duration,
                         mem_dumped / dump_duration / 1024 / 1024);
        }
}

void
vy_regulator_set_memory_limit(struct vy_regulator *regulator, size_t limit)
{
        vy_quota_set_limit(regulator->quota, limit);
        vy_regulator_update_dump_watermark(regulator);
}

void
vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max)
{
        histogram_reset(regulator->dump_bandwidth_hist);
        regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
        if (max > 0 && regulator->dump_bandwidth > max)
                regulator->dump_bandwidth = max;
        vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
                                regulator->dump_bandwidth);
}

void
vy_regulator_reset_stat(struct vy_regulator *regulator)
{
        memset(&regulator->sched_stat_last, 0,
               sizeof(regulator->sched_stat_last));
}

/*
 * The goal of rate limiting is to ensure LSM trees stay close to
 * their perfect shape, as defined by run_size_ratio. When dump rate
 * is too high, we have to throttle database writes to ensure
 * compaction can keep up with dumps. We can't deduce optimal dump
 * bandwidth from LSM configuration, such as run_size_ratio or
 * run_count_per_level, since different spaces or different indexes
 * within a space can have different configuration settings. The
 * workload can also vary significantly from space to space. So,
 * when setting the limit, we have to consider dump and compaction
 * activities of the database as a whole.
 *
 * To this end, we keep track of compaction bandwidth and write
 * amplification of the entire database, across all LSM trees.
 * The idea is simple: observe the current write amplification
 * and compaction bandwidth, and set maximal write rate to a value
 * somewhat below the implied limit, so as to make room for
 * compaction to do more work if necessary.
 *
 * We use the following metrics to calculate the limit:
 *  - dump_output - number of bytes dumped to disk over the last
 *    observation period. The period itself is measured in dumps,
 *    not seconds, and is defined by constant VY_RECENT_DUMP_COUNT.
 *  - compaction_output - number of bytes produced by compaction
 *    over the same period.
 *  - compaction_rate - total compaction output, in bytes, divided
 *    by total time spent on doing compaction by compaction threads,
 *    both measured over the same observation period. This gives an
 *    estimate of the speed at which compaction can write output.
 *    In the real world this speed is dependent on the disk write
 *    throughput, number of dump threads, and actual dump rate, but
 *    given the goal of rate limiting is providing compaction with
 *    extra bandwidth, this metric is considered an accurate enough
 *    approximation of the disk bandwidth available to compaction.
 *
 * We calculate the compaction rate with the following formula:
 *
 *                                            compaction_output
 *     compaction_rate = compaction_threads * -----------------
 *                                             compaction_time
 *
 * where compaction_threads represents the total number of available
 * compaction threads and compaction_time is the total time, in
 * seconds, spent by all threads doing compaction. You can look at
 * the formula this way: compaction_output / compaction_time gives
 * the average write speed of a single compaction thread, and by
 * multiplying it by the number of compaction threads we get the
 * compaction rate of the entire database.
 *
 * In an optimal system dump rate must be proportional to compaction
 * rate and inversely proportional to write amplification:
 *
 *     dump_rate = compaction_rate / (write_amplification - 1)
 *
 * The latter can be obtained by dividing total output of compaction
 * by total output of dumps over the observation period:
 *
 *                           dump_output + compaction_output
 *     write_amplification = ------------------------------- =
 *                                    dump_output
 *
 *                         = 1 + compaction_output / dump_output
 *
 * Putting this all together and taking into account data compaction
 * during memory dump, we get for the max transaction rate:
 *
 *                           dump_input
 *     tx_rate = dump_rate * ----------- =
 *                           dump_output
 *
 *                                    compaction_output
 *             = compaction_threads * ----------------- *
 *                                     compaction_time
 *
 *                              dump_output      dump_input
 *                         * ----------------- * ----------- =
 *                           compaction_output   dump_output
 *
 *             = compaction_threads * dump_input / compaction_time
 *
 * We set the rate limit to 0.75 of the approximated optimal to
 * leave the database engine enough room to use more disk
 * bandwidth for compaction if necessary. As soon as compaction gets
 * enough disk bandwidth to keep LSM trees in optimal shape
 * compaction speed becomes stable, as does write amplification.
 */
void
vy_regulator_update_rate_limit(struct vy_regulator *regulator,
                               const struct vy_scheduler_stat *stat,
                               int compaction_threads)
{
        struct vy_scheduler_stat *last = &regulator->sched_stat_last;
        struct vy_scheduler_stat *recent = &regulator->sched_stat_recent;

        int32_t dump_count = stat->dump_count - last->dump_count;
        int64_t dump_input = stat->dump_input - last->dump_input;
        double compaction_time = stat->compaction_time - last->compaction_time;
        *last = *stat;

        if (dump_input < (ssize_t)VY_DUMP_SIZE_ACCT_MIN || compaction_time == 0)
                return;

        recent->dump_count += dump_count;
        recent->dump_input += dump_input;
        recent->compaction_time += compaction_time;

        double rate = 0.75 * compaction_threads * recent->dump_input /
                                                  recent->compaction_time;
        /*
         * We can't simply use (size_t)MIN(rate, SIZE_MAX) to cast
         * the rate from double to size_t here, because on a 64-bit
         * system SIZE_MAX equals 2^64-1, which can't be represented
         * as double without loss of precision and hence is rounded
         * up to 2^64, which in turn can't be converted back to size_t.
         * So we first convert the rate to uint64_t using exp2(64) to
         * check if it fits and only then cast the uint64_t to size_t.
         */
        uint64_t rate64;
        if (rate < exp2(64))
                rate64 = rate;
        else
                rate64 = UINT64_MAX;
        vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_DISK,
                                (size_t)MIN(rate64, SIZE_MAX));

        /*
         * Periodically rotate statistics for quicker adaptation
         * to workload changes.
         */
        if (recent->dump_count > VY_RECENT_DUMP_COUNT) {
                recent->dump_count /= 2;
                recent->dump_input /= 2;
                recent->compaction_time /= 2;
        }
}
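
/*
 * A worked example of the rate limit set by
 * vy_regulator_update_rate_limit() above, with hypothetical numbers:
 * with 4 compaction threads, recent->dump_input = 10 GB and
 * recent->compaction_time = 80 s, the disk rate limit becomes
 * 0.75 * 4 * 10 GB / 80 s = 0.375 GB/s (384 MB/s).
 */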