• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

celerity / celerity-runtime / 12009901531

25 Nov 2024 12:20PM UTC coverage: 94.92% (+0.009%) from 94.911%
12009901531

push

github

fknorr
Add missing includes and consistently order them

We can't add the misc-include-cleaner lint because it causes too many
false positives with "interface headers" such as sycl.hpp.

3190 of 3626 branches covered (87.98%)

Branch coverage included in aggregate %.

7049 of 7161 relevant lines covered (98.44%)

1242183.17 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.97
/src/split.cc
1
#include "split.h"
2

3
#include "grid.h"
4
#include "ranges.h"
5

6
#include <algorithm>
7
#include <array>
8
#include <cassert>
9
#include <cmath>
10
#include <cstddef>
11
#include <tuple>
12
#include <vector>
13

14

15
namespace {
16

17
using namespace celerity;
18
using namespace celerity::detail;
19

20
[[maybe_unused]] void sanity_check_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split) {
8,960✔
21
        region<3> reconstructed_chunk;
8,960✔
22
        for(auto& chnk : split) {
21,452✔
23
                assert(region_intersection(reconstructed_chunk, box<3>(chnk)).empty());
12,492✔
24
                reconstructed_chunk = region_union(box<3>(chnk), reconstructed_chunk);
12,492✔
25
        }
26
        assert(region_difference(reconstructed_chunk, box<3>(full_chunk)).empty());
8,960✔
27
}
17,920✔
28

29
template <int Dims>
30
std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks(
8,960✔
31
    const chunk<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
32
        range<Dims> small_chunk_size{zeros};
8,960✔
33
        range<Dims> large_chunk_size{zeros};
8,960✔
34
        range<Dims> num_large_chunks{zeros};
8,960✔
35
        for(int d = 0; d < Dims; ++d) {
18,271✔
36
                const size_t ideal_chunk_size = full_chunk.range[d] / actual_num_chunks[d];
9,311✔
37
                small_chunk_size[d] = (ideal_chunk_size / granularity[d]) * granularity[d];
9,311✔
38
                large_chunk_size[d] = small_chunk_size[d] + granularity[d];
9,311✔
39
                num_large_chunks[d] = (full_chunk.range[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
9,311✔
40
        }
41
        return {small_chunk_size, large_chunk_size, num_large_chunks};
17,920✔
42
}
43

44
/**
45
 * Given a factorization of `num_chunks` (i.e., `f0 * f1 = num_chunks`), try to find the assignment of factors to
46
 * dimensions that produces more chunks under the given constraints. If they are tied, try to find the assignment
47
 * that results in a "nicer" split according to some heuristics (see below).
48
 *
49
 * The single argument `factor` specifies both factors, as `f0 = factor` and `f1 = num_chunks / factor`.
50
 *
51
 * @returns The number of chunks that can be created in dimension 0 and dimension 1, respectively. These are at most
52
 *          (f0, f1) or (f1, f0), however may be less if constrained by the split granularity.
53
 */
54
std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
356✔
55
        assert(num_chunks % factor == 0);
356✔
56
        const size_t max_chunks[2] = {full_chunk.range[0] / granularity[0], full_chunk.range[1] / granularity[1]};
356✔
57
        const size_t f0 = factor;
356✔
58
        const size_t f1 = num_chunks / factor;
356✔
59

60
        // Decide in which direction to split by first checking which
61
        // factor assignment produces more chunks under the given constraints.
62
        const std::array<size_t, 2> split_0_1 = {std::min(f0, max_chunks[0]), std::min(f1, max_chunks[1])};
356✔
63
        const std::array<size_t, 2> split_1_0 = {std::min(f1, max_chunks[0]), std::min(f0, max_chunks[1])};
356✔
64
        const auto count0 = split_0_1[0] * split_0_1[1];
356✔
65
        const auto count1 = split_1_0[0] * split_1_0[1];
356✔
66

67
        if(count0 > count1) { return split_0_1; }
356✔
68
        if(count0 < count1) { return split_1_0; }
355✔
69

70
        // If we're tied for the number of chunks we can create, try some heuristics to decide.
71

72
        // If domain is square(-ish), prefer splitting along slower dimension.
73
        // (These bounds have been chosen arbitrarily!)
74
        const double squareishness = std::sqrt(full_chunk.range.size()) / static_cast<double>(full_chunk.range[0]);
347✔
75
        if(squareishness > 0.95 && squareishness < 1.05) { return (f0 >= f1) ? split_0_1 : split_1_0; }
347✔
76

77
        // For non-square domains, prefer split that produces shorter edges (compare sum of circumferences)
78
        const auto circ0 = full_chunk.range[0] / split_0_1[0] + full_chunk.range[1] / split_0_1[1];
27✔
79
        const auto circ1 = full_chunk.range[0] / split_1_0[0] + full_chunk.range[1] / split_1_0[1];
27✔
80
        return circ0 < circ1 ? split_0_1 : split_1_0;
27✔
81

82
        // TODO: Yet another heuristic we may want to consider is how even chunk sizes are,
83
        // i.e., how balanced the workload is.
84
}
85

86
} // namespace
87

88
namespace celerity::detail {
89

90
std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
#ifndef NDEBUG
        assert(num_chunks > 0);
        for(int d = 0; d < 3; ++d) {
                assert(granularity[d] > 0);
                assert(full_chunk.range[d] % granularity[d] == 0);
        }
#endif

        // Due to split granularity requirements or if num_workers > global_size[0],
        // we may not be able to create the requested number of chunks.
        const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.range[0] / granularity[0])};
        const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_chunk, granularity, actual_num_chunks);

        // Start from copies of the full chunk and rewrite range/offset in dimension 0: the first
        // `num_large_chunks[0]` chunks get the large extent, all remaining ones the small extent.
        std::vector<chunk<3>> result(actual_num_chunks[0], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
        size_t current_offset = full_chunk.offset[0];
        for(size_t i = 0; i < actual_num_chunks[0]; ++i) {
                const size_t extent = (i < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0];
                result[i].range[0] = extent;
                result[i].offset[0] = current_offset;
                current_offset += extent;
        }

#ifndef NDEBUG
        sanity_check_split(full_chunk, result);
#endif

        return result;
}
×
120

121
// TODO: Make the split dimensions configurable for 3D chunks?
122
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
351✔
123
#ifndef NDEBUG
124
        assert(num_chunks > 0);
351✔
125
        for(int d = 0; d < 3; ++d) {
1,404✔
126
                assert(granularity[d] > 0);
1,053✔
127
                assert(full_chunk.range[d] % granularity[d] == 0);
1,053✔
128
        }
129
#endif
130

131
        // Factorize num_chunks
132
        // We start out with an initial guess of `factor = floor(sqrt(num_chunks))` (the other one is implicitly given by `num_chunks / factor`),
133
        // and work our way down, keeping track of the best factorization we've found so far, until we find a factorization that produces
134
        // the requested number of chunks, or until we reach (1, num_chunks), i.e., a 1D split.
135
        size_t factor = std::floor(std::sqrt(num_chunks));
351✔
136
        std::array<size_t, 2> best_chunk_counts = {0, 0};
351✔
137
        while(factor >= 1) {
357✔
138
                while(factor > 1 && num_chunks % factor != 0) {
356!
139
                        factor--;
×
140
                }
141
                // The returned counts are at most (factor, num_chunks / factor), however may be less if constrained by the split granularity.
142
                const auto chunk_counts = assign_split_factors_2d(full_chunk, granularity, factor, num_chunks);
356✔
143
                if(chunk_counts[0] * chunk_counts[1] > best_chunk_counts[0] * best_chunk_counts[1]) { best_chunk_counts = chunk_counts; }
356!
144
                if(chunk_counts[0] * chunk_counts[1] == num_chunks) { break; }
356✔
145
                factor--;
6✔
146
        }
147
        const auto actual_num_chunks = best_chunk_counts;
351✔
148
        const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_chunk, granularity, actual_num_chunks);
351✔
149

150
        std::vector<chunk<3>> result(actual_num_chunks[0] * actual_num_chunks[1], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
1,053✔
151
        id<3> offset = full_chunk.offset;
351✔
152

153
        for(size_t j = 0; j < actual_num_chunks[0]; ++j) {
905✔
154
                range<2> chunk_size = {(j < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0], 0};
554✔
155
                for(size_t i = 0; i < actual_num_chunks[1]; ++i) {
1,471✔
156
                        chunk_size[1] = (i < num_large_chunks[1]) ? large_chunk_size[1] : small_chunk_size[1];
917✔
157
                        auto& chnk = result[j * actual_num_chunks[1] + i];
917✔
158
                        chnk.offset = offset;
917✔
159
                        chnk.range[0] = chunk_size[0];
917✔
160
                        chnk.range[1] = chunk_size[1];
917✔
161
                        offset[1] += chunk_size[1];
917✔
162
                }
163
                offset[0] += chunk_size[0];
554✔
164
                offset[1] = full_chunk.offset[1];
554✔
165
        }
166

167
#ifndef NDEBUG
168
        sanity_check_split(full_chunk, result);
351✔
169
#endif
170

171
        return result;
702✔
172
}
×
173
} // namespace celerity::detail
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc