
tarantool / tarantool / 12397458527

18 Dec 2024 04:44PM UTC · coverage: 87.367% (remained the same)

Build 12397458527 (push, via GitHub) · committed by Totktonada
config: fix on_event roles callback config argument

Previously, the whole config object was passed to the `on_event` callback
instead of the corresponding role's config. Let's fix it by implementing
the intended behavior.

NO_DOC=bugfix

Closes #10934

(cherry picked from commit d5d2291ae)
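To illustrate what this means for role authors, here is a minimal sketch of a custom role in Lua. The role name greeter, its greeting option, and the event handling are illustrative assumptions rather than code from this commit; the callback shape follows the documented roles interface (validate/apply/stop plus the optional `on_event`). With this fix, the first argument of `on_event` is the role's own section of roles_cfg instead of the whole config object.

local log = require('log')

-- Sketch of roles/greeter.lua (hypothetical role). After the fix, `cfg`
-- below is this role's own configuration, not the global config object.
return {
    validate = function(cfg)
        assert(cfg == nil or type(cfg) == 'table')
    end,
    apply = function(cfg)
        log.info('greeter: applied, greeting = %s',
                 (cfg and cfg.greeting) or 'hello')
    end,
    stop = function() end,
    on_event = function(cfg, event, value)
        -- event is the event name (e.g. 'config.apply'); value is its payload.
        log.info('greeter: got %s, greeting = %s', event,
                 (cfg and cfg.greeting) or 'hello')
    end,
}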

69673 of 123524 branches covered (56.4%)

102617 of 117455 relevant lines covered (87.37%)

2871961.47 hits per line

Source File

/src/box/vy_regulator.c (88.14% covered)
/*
 * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * 1. Redistributions of source code must retain the above
 *    copyright notice, this list of conditions and the
 *    following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "vy_regulator.h"

#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <tarantool_ev.h>

#include "fiber.h"
#include "histogram.h"
#include "say.h"
#include "trivia/util.h"

#include "vy_quota.h"
#include "vy_stat.h"

/**
 * Regulator timer period, in seconds.
 */
static const double VY_REGULATOR_TIMER_PERIOD = 1;

/**
 * Time window over which the write rate is averaged,
 * in seconds.
 */
static const double VY_WRITE_RATE_AVG_WIN = 5;

/**
 * Histogram percentile used for estimating dump bandwidth.
 * For details see the comment to vy_regulator::dump_bandwidth_hist.
 */
static const int VY_DUMP_BANDWIDTH_PCT = 10;

/*
 * Until we dump anything, assume bandwidth to be 10 MB/s,
 * which should be fine for initial guess.
 */
static const size_t VY_DUMP_BANDWIDTH_DEFAULT = 10 * 1024 * 1024;

/**
 * Do not take into account small dumps when estimating dump
 * bandwidth, because they have too high overhead associated
 * with file creation.
 */
static const size_t VY_DUMP_SIZE_ACCT_MIN = 1024 * 1024;

/**
 * Number of dumps to take into account for rate limit calculation.
 * Shouldn't be too small to avoid uneven RPS. Shouldn't be too big
 * either - otherwise the rate limit will adapt too slowly to workload
 * changes. 100 feels like a good choice.
 */
static const int VY_RECENT_DUMP_COUNT = 100;

static void
vy_regulator_trigger_dump(struct vy_regulator *regulator)
{
        if (regulator->dump_in_progress)
                return;

        if (regulator->trigger_dump_cb(regulator) != 0)
                return;

        regulator->dump_in_progress = true;

        /*
         * To avoid unpredictably long stalls, we must limit
         * the write rate when a dump is in progress so that
         * we don't hit the hard limit before the dump has
         * completed, i.e.
         *
         *    mem_left        mem_used
         *   ---------- >= --------------
         *   write_rate    dump_bandwidth
         */
        struct vy_quota *quota = regulator->quota;
        size_t mem_left = (quota->used < quota->limit ?
                           quota->limit - quota->used : 0);
        size_t mem_used = quota->used;
        size_t max_write_rate = (double)mem_left / (mem_used + 1) *
                                        regulator->dump_bandwidth;
        max_write_rate = MIN(max_write_rate, regulator->dump_bandwidth);
        vy_quota_set_rate_limit(quota, VY_QUOTA_RESOURCE_MEMORY,
                                max_write_rate);

        say_info("dumping %zu bytes, expected rate %.1f MB/s, "
                 "ETA %.1f s, write rate (avg/max) %.1f/%.1f MB/s",
                 quota->used, (double)regulator->dump_bandwidth / 1024 / 1024,
                 (double)quota->used / (regulator->dump_bandwidth + 1),
                 (double)regulator->write_rate / 1024 / 1024,
                 (double)regulator->write_rate_max / 1024 / 1024);

        regulator->write_rate_max = regulator->write_rate;
}
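
/*
 * A worked example of the throttling formula in
 * vy_regulator_trigger_dump() above, with hypothetical numbers: if
 * quota->limit = 1024 MB, quota->used = 768 MB and dump_bandwidth =
 * 100 MB/s, then mem_left = 256 MB, the dump ETA is ~7.7 s, and the
 * write rate is capped at roughly 256 / 768 * 100 = ~33 MB/s, so the
 * remaining quota cannot be exhausted before the dump completes.
 */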

static void
vy_regulator_update_write_rate(struct vy_regulator *regulator)
{
        size_t used_curr = regulator->quota->used;
        size_t used_last = regulator->quota_used_last;

        /*
         * Memory can be dumped between two subsequent timer
         * callback invocations, in which case memory usage
         * will decrease. Ignore such observations - it's not
         * a big deal, because dump is a rare event.
         */
        if (used_curr < used_last) {
                regulator->quota_used_last = used_curr;
                return;
        }

        size_t rate_avg = regulator->write_rate;
        size_t rate_curr = (used_curr - used_last) / VY_REGULATOR_TIMER_PERIOD;

        double weight = 1 - exp(-VY_REGULATOR_TIMER_PERIOD /
                                VY_WRITE_RATE_AVG_WIN);
        rate_avg = (1 - weight) * rate_avg + weight * rate_curr;

        regulator->write_rate = rate_avg;
        if (regulator->write_rate_max < rate_curr)
                regulator->write_rate_max = rate_curr;
        regulator->quota_used_last = used_curr;
}
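
/*
 * A worked example of the averaging in vy_regulator_update_write_rate()
 * above, with hypothetical numbers: with a 1 s timer period and a 5 s
 * averaging window, weight = 1 - exp(-1 / 5) ~= 0.18; if the previous
 * average was 50 MB/s and 100 MB were written since the last tick, the
 * new average is roughly 0.82 * 50 + 0.18 * 100 ~= 59 MB/s.
 */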

static void
vy_regulator_update_dump_watermark(struct vy_regulator *regulator)
{
        struct vy_quota *quota = regulator->quota;

        /*
         * Due to log structured nature of the lsregion allocator,
         * which is used for allocating statements, we cannot free
         * memory in chunks, only all at once. Therefore we should
         * configure the watermark so that by the time we hit the
         * limit, all memory has been dumped, i.e.
         *
         *   limit - watermark      watermark
         *   ----------------- = --------------
         *       write_rate      dump_bandwidth
         *
         * Be pessimistic when predicting the write rate - use the
         * max observed write rate multiplied by 1.5 - because it's
         * better to start memory dump early than delay it as long
         * as possible at the risk of experiencing unpredictably
         * long stalls.
         */
        size_t write_rate = regulator->write_rate_max * 3 / 2;
        regulator->dump_watermark =
                        (double)quota->limit * regulator->dump_bandwidth /
                        (regulator->dump_bandwidth + write_rate + 1);
        /*
         * It doesn't make sense to set the watermark below 50%
         * of the memory limit because the write rate cannot exceed
         * the dump bandwidth under any circumstances.
         */
        regulator->dump_watermark = MAX(regulator->dump_watermark,
                                        quota->limit / 2);
}
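
/*
 * A worked example of the watermark formula in
 * vy_regulator_update_dump_watermark() above, with hypothetical
 * numbers: with quota->limit = 1024 MB, dump_bandwidth = 100 MB/s and
 * a maximum observed write rate of 40 MB/s (60 MB/s after the 1.5x
 * pessimistic adjustment), the watermark is about
 * 1024 * 100 / 160 = 640 MB, comfortably above the 512 MB (50%) floor.
 */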

static void
vy_regulator_timer_cb(ev_loop *loop, ev_timer *timer, int events)
{
        (void)loop;
        (void)events;

        struct vy_regulator *regulator = timer->data;

        vy_regulator_update_write_rate(regulator);
        vy_regulator_update_dump_watermark(regulator);
        vy_regulator_check_dump_watermark(regulator);
}

void
vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota,
                    vy_trigger_dump_f trigger_dump_cb)
{
        enum { KB = 1024, MB = KB * KB };
        static int64_t dump_bandwidth_buckets[] = {
                100 * KB, 200 * KB, 300 * KB, 400 * KB, 500 * KB, 600 * KB,
                700 * KB, 800 * KB, 900 * KB,   1 * MB,   2 * MB,   3 * MB,
                  4 * MB,   5 * MB,   6 * MB,   7 * MB,   8 * MB,   9 * MB,
                 10 * MB,  15 * MB,  20 * MB,  25 * MB,  30 * MB,  35 * MB,
                 40 * MB,  45 * MB,  50 * MB,  55 * MB,  60 * MB,  65 * MB,
                 70 * MB,  75 * MB,  80 * MB,  85 * MB,  90 * MB,  95 * MB,
                100 * MB, 200 * MB, 300 * MB, 400 * MB, 500 * MB, 600 * MB,
                700 * MB, 800 * MB, 900 * MB,
        };
        memset(regulator, 0, sizeof(*regulator));
        regulator->dump_bandwidth_hist = histogram_new(dump_bandwidth_buckets,
                                        lengthof(dump_bandwidth_buckets));
        if (regulator->dump_bandwidth_hist == NULL)
                panic("failed to allocate dump bandwidth histogram");

        regulator->quota = quota;
        regulator->trigger_dump_cb = trigger_dump_cb;
        ev_timer_init(&regulator->timer, vy_regulator_timer_cb, 0,
                      VY_REGULATOR_TIMER_PERIOD);
        regulator->timer.data = regulator;
        regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
        regulator->dump_watermark = SIZE_MAX;
}

void
vy_regulator_start(struct vy_regulator *regulator)
{
        regulator->quota_used_last = regulator->quota->used;
        vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
                                regulator->dump_bandwidth);
        ev_timer_start(loop(), &regulator->timer);
}

void
vy_regulator_destroy(struct vy_regulator *regulator)
{
        ev_timer_stop(loop(), &regulator->timer);
        histogram_delete(regulator->dump_bandwidth_hist);
}

void
vy_regulator_quota_exceeded(struct vy_regulator *regulator)
{
        vy_regulator_trigger_dump(regulator);
}

void
vy_regulator_check_dump_watermark(struct vy_regulator *regulator)
{
        if (regulator->quota->used >= regulator->dump_watermark)
                vy_regulator_trigger_dump(regulator);
}

void
vy_regulator_dump_complete(struct vy_regulator *regulator,
                           size_t mem_dumped, double dump_duration)
{
        regulator->dump_in_progress = false;

        if (mem_dumped >= VY_DUMP_SIZE_ACCT_MIN && dump_duration > 0) {
                histogram_collect(regulator->dump_bandwidth_hist,
                                  mem_dumped / dump_duration);
                /*
                 * To avoid unpredictably long stalls caused by
                 * mispredicting dump time duration, we need to
                 * know the worst (smallest) dump bandwidth so
                 * use a lower-bound percentile estimate.
                 */
                regulator->dump_bandwidth = histogram_percentile_lower(
                        regulator->dump_bandwidth_hist, VY_DUMP_BANDWIDTH_PCT);
        }

        /*
         * Reset the rate limit.
         *
         * It doesn't make sense to allow memory to be consumed at
         * a higher rate than it can be dumped, so we set the rate
         * limit to the dump bandwidth rather than disabling it
         * completely.
         */
        vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
                                regulator->dump_bandwidth);

        if (dump_duration > 0) {
                say_info("dumped %zu bytes in %.1f s, rate %.1f MB/s",
                         mem_dumped, dump_duration,
                         mem_dumped / dump_duration / 1024 / 1024);
        }
}

void
vy_regulator_set_memory_limit(struct vy_regulator *regulator, size_t limit)
{
        vy_quota_set_limit(regulator->quota, limit);
        vy_regulator_update_dump_watermark(regulator);
}

void
vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max)
{
        histogram_reset(regulator->dump_bandwidth_hist);
        regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
        if (max > 0 && regulator->dump_bandwidth > max)
                regulator->dump_bandwidth = max;
        vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
                                regulator->dump_bandwidth);
}

void
vy_regulator_reset_stat(struct vy_regulator *regulator)
{
        memset(&regulator->sched_stat_last, 0,
               sizeof(regulator->sched_stat_last));
}

/*
 * The goal of rate limiting is to ensure LSM trees stay close to
 * their perfect shape, as defined by run_size_ratio. When dump rate
 * is too high, we have to throttle database writes to ensure
 * compaction can keep up with dumps. We can't deduce optimal dump
 * bandwidth from LSM configuration, such as run_size_ratio or
 * run_count_per_level, since different spaces or different indexes
 * within a space can have different configuration settings. The
 * workload can also vary significantly from space to space. So,
 * when setting the limit, we have to consider dump and compaction
 * activities of the database as a whole.
 *
 * To this end, we keep track of compaction bandwidth and write
 * amplification of the entire database, across all LSM trees.
 * The idea is simple: observe the current write amplification
 * and compaction bandwidth, and set maximal write rate to a value
 * somewhat below the implied limit, so as to make room for
 * compaction to do more work if necessary.
 *
 * We use the following metrics to calculate the limit:
 *  - dump_output - number of bytes dumped to disk over the last
 *    observation period. The period itself is measured in dumps,
 *    not seconds, and is defined by constant VY_RECENT_DUMP_COUNT.
 *  - compaction_output - number of bytes produced by compaction
 *    over the same period.
 *  - compaction_rate - total compaction output, in bytes, divided
 *    by total time spent on doing compaction by compaction threads,
 *    both measured over the same observation period. This gives an
 *    estimate of the speed at which compaction can write output.
 *    In the real world this speed is dependent on the disk write
 *    throughput, number of dump threads, and actual dump rate, but
 *    given the goal of rate limiting is providing compaction with
 *    extra bandwidth, this metric is considered an accurate enough
 *    approximation of the disk bandwidth available to compaction.
 *
 * We calculate the compaction rate with the following formula:
 *
 *                                            compaction_output
 *     compaction_rate = compaction_threads * -----------------
 *                                             compaction_time
 *
 * where compaction_threads represents the total number of available
 * compaction threads and compaction_time is the total time, in
 * seconds, spent by all threads doing compaction. You can look at
 * the formula this way: compaction_output / compaction_time gives
 * the average write speed of a single compaction thread, and by
 * multiplying it by the number of compaction threads we get the
 * compaction rate of the entire database.
 *
 * In an optimal system dump rate must be proportional to compaction
 * rate and inversely proportional to write amplification:
 *
 *     dump_rate = compaction_rate / (write_amplification - 1)
 *
 * The latter can be obtained by dividing total output of compaction
 * by total output of dumps over the observation period:
 *
 *                           dump_output + compaction_output
 *     write_amplification = ------------------------------- =
 *                                    dump_output
 *
 *                         = 1 + compaction_output / dump_output
 *
 * Putting this all together and taking into account data compaction
 * during memory dump, we get for the max transaction rate:
 *
 *                           dump_input
 *     tx_rate = dump_rate * ----------- =
 *                           dump_output
 *
 *                                    compaction_output
 *             = compaction_threads * ----------------- *
 *                                     compaction_time
 *
 *                              dump_output      dump_input
 *                         * ----------------- * ----------- =
 *                           compaction_output   dump_output
 *
 *             = compaction_threads * dump_input / compaction_time
 *
 * We set the rate limit to 0.75 of the approximated optimal to
 * leave the database engine enough room to use more disk
 * bandwidth for compaction if necessary. As soon as compaction gets
 * enough disk bandwidth to keep LSM trees in optimal shape
 * compaction speed becomes stable, as does write amplification.
 */
void
vy_regulator_update_rate_limit(struct vy_regulator *regulator,
                               const struct vy_scheduler_stat *stat,
                               int compaction_threads)
{
        struct vy_scheduler_stat *last = &regulator->sched_stat_last;
        struct vy_scheduler_stat *recent = &regulator->sched_stat_recent;

        int32_t dump_count = stat->dump_count - last->dump_count;
        int64_t dump_input = stat->dump_input - last->dump_input;
        double compaction_time = stat->compaction_time - last->compaction_time;
        *last = *stat;

        if (dump_input < (ssize_t)VY_DUMP_SIZE_ACCT_MIN || compaction_time == 0)
                return;

        recent->dump_count += dump_count;
        recent->dump_input += dump_input;
        recent->compaction_time += compaction_time;

        double rate = 0.75 * compaction_threads * recent->dump_input /
                                                  recent->compaction_time;
        /*
         * We can't simply use (size_t)MIN(rate, SIZE_MAX) to cast
         * the rate from double to size_t here, because on a 64-bit
         * system SIZE_MAX equals 2^64-1, which can't be represented
         * as double without loss of precision and hence is rounded
         * up to 2^64, which in turn can't be converted back to size_t.
         * So we first convert the rate to uint64_t using exp2(64) to
         * check if it fits and only then cast the uint64_t to size_t.
         */
        uint64_t rate64;
        if (rate < exp2(64))
                rate64 = rate;
        else
                rate64 = UINT64_MAX;
        vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_DISK,
                                (size_t)MIN(rate64, SIZE_MAX));

        /*
         * Periodically rotate statistics for quicker adaptation
         * to workload changes.
         */
        if (recent->dump_count > VY_RECENT_DUMP_COUNT) {
                recent->dump_count /= 2;
                recent->dump_input /= 2;
                recent->compaction_time /= 2;
        }
}
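
/*
 * A worked example of the rate limit set by
 * vy_regulator_update_rate_limit() above, with hypothetical numbers:
 * with 4 compaction threads, recent->dump_input = 10 GB and
 * recent->compaction_time = 80 s, the disk rate limit becomes
 * 0.75 * 4 * 10 GB / 80 s = 0.375 GB/s (384 MB/s).
 */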