
celerity / celerity-runtime — build 11385101423 (push, via GitHub)

17 Oct 2024 12:38PM UTC — coverage: 95.216% (down 0.09% from 95.303%)
Committer: fknorr — "Add type trait to match ReductionsAndKernel parameter lists"

3050 of 3442 branches covered (88.61%); branch coverage is included in the aggregate percentage.
5 of 5 new or added lines in 1 file covered (100.0%).
24 existing lines in 8 files are now uncovered.
6821 of 6925 relevant lines covered (98.5%), with an average of 1449510.68 hits per line.

Source File

/include/handler.h — 97.13% covered
#pragma once

#include <memory>
#include <type_traits>
#include <typeinfo>
#include <utility>

#include <fmt/format.h>
#include <sycl/sycl.hpp>

#include "buffer.h"
#include "cgf_diagnostics.h"
#include "item.h"
#include "partition.h"
#include "range_mapper.h"
#include "ranges.h"
#include "task.h"
#include "types.h"
#include "version.h"
#include "workaround.h"

namespace celerity {
class handler;
}

namespace celerity::experimental {

/**
 * Constrains the granularity at which a task's global range can be split into chunks.
 *
 * In some situations an output buffer access is only guaranteed to write to non-overlapping subranges
 * if the task is split in a certain way. For example when computing the row-wise sum of a 2D matrix into
 * a 1D vector, a split constraint is required to ensure that each element of the vector is written by
 * exactly one chunk.
 *
 * Another use case is for performance optimization, for example when the creation of lots of small chunks
 * would result in hardware under-utilization and excessive data transfers.
 *
 * Since ND-range parallel_for kernels are already constrained to be split with group size granularity,
 * adding an additional constraint on top results in an effective constraint of LCM(group size, constraint).
 *
 * The constraint (or effective constraint) must evenly divide the global range.
 * This function has no effect when called for a task without a user-provided global range.
 */
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint);
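
// Illustrative usage sketch (not part of this header; `cgh` and `cols` come from a hypothetical surrounding
// command group): for the row-wise sum example above, with a global range of {rows, cols}, a constraint of
// {1, cols} ensures every chunk covers whole rows, so each output element is written by exactly one chunk.
//
//   celerity::experimental::constrain_split(cgh, celerity::range<2>{1, cols});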

class collective_group;

} // namespace celerity::experimental

namespace celerity {

namespace detail {
        class task_manager;

        handler make_command_group_handler(const task_id tid, const size_t num_collective_nodes);
        std::unique_ptr<task> into_task(handler&& cgh);
        hydration_id add_requirement(handler& cgh, const buffer_id bid, std::unique_ptr<range_mapper_base> rm);
        void add_requirement(handler& cgh, const host_object_id hoid, const experimental::side_effect_order order, const bool is_void);
        void add_reduction(handler& cgh, const reduction_info& rinfo);

        void set_task_name(handler& cgh, const std::string& debug_name);

        struct unnamed_kernel {};

        template <typename KernelName>
        constexpr bool is_unnamed_kernel = std::is_same_v<KernelName, unnamed_kernel>;

        template <typename KernelName>
        std::string kernel_debug_name() {
                return !is_unnamed_kernel<KernelName> ? utils::get_simplified_type_name<KernelName>() : std::string{};
        }

        struct simple_kernel_flavor {};
        struct nd_range_kernel_flavor {};

        template <typename Flavor, int Dims>
        struct kernel_flavor_traits;

        struct no_local_size {};

        template <int Dims>
        struct kernel_flavor_traits<simple_kernel_flavor, Dims> {
                inline static constexpr bool has_local_size = false;
                using local_size_type = no_local_size;
        };

        template <int Dims>
        struct kernel_flavor_traits<nd_range_kernel_flavor, Dims> {
                inline static constexpr bool has_local_size = true;
                using local_size_type = range<Dims>;
        };

        class collective_tag_factory;
} // namespace detail

namespace experimental {
        /**
         * Each collective host task is executed within a collective group. If multiple host tasks are scheduled within the same collective group, they are
         * guaranteed to execute in the same order on every node and within a single thread per node. Each group has its own MPI communicator spanning all
         * participating nodes, so MPI operations the user invokes from different collective groups do not race.
         */
        class collective_group {
          public:
                /// Creates a new collective group with a globally unique id. This must only be called from the main thread.
                collective_group() noexcept : m_cgid(s_next_cgid++) {}

          private:
                friend class detail::collective_tag_factory;
                detail::collective_group_id m_cgid;
                inline static detail::collective_group_id s_next_cgid = detail::root_collective_group_id + 1;
        };
} // namespace experimental

namespace detail {
        /**
         * The collective group used in collective host tasks when no group is specified explicitly.
         */
        inline const experimental::collective_group default_collective_group;

        /**
         * Tag type marking a `handler::host_task` as a collective task. Do not construct this type directly, but use `celerity::experimental::collective`
         * or `celerity::experimental::collective(group)`.
         */
        class collective_tag {
          private:
                friend class collective_tag_factory;
                friend class celerity::handler;
                collective_tag(collective_group_id cgid) : m_cgid(cgid) {}
                collective_group_id m_cgid;
        };

        /**
         * Tag type construction helper. Do not construct this type directly, use `celerity::experimental::collective` instead.
         */
        class collective_tag_factory {
          public:
                operator collective_tag() const { return default_collective_group.m_cgid; }
                collective_tag operator()(experimental::collective_group cg) const { return cg.m_cgid; }
        };
} // namespace detail

namespace experimental {
        /**
         * Pass to `handler::host_task` to select the collective host task overload.
         *
         * Either as a value to schedule with the `default_collective_group`:
         * ```c++
         * cgh.host_task(celerity::experimental::collective, []...);
         * ```
         *
         * Or by specifying a collective group explicitly:
         * ```c++
         * celerity::experimental::collective_group my_group;
         * ...
         * cgh.host_task(celerity::experimental::collective(my_group), []...);
         * ```
         */
        inline constexpr detail::collective_tag_factory collective;
} // namespace experimental

namespace detail {

        /// Tag type marking a `handler::host_task` as a master-node task. Do not construct this type directly, but use `celerity::on_master_node`.
        class on_master_node_tag {};

        /// Tag type marking a `handler::host_task` as a single-item host task equivalent to `range<0>{}`. Do not construct this type directly, but use
        /// `celerity::once`.
        class once_tag {};

        template <typename Kernel, int Dims, typename... Reducers>
        inline void invoke_kernel(const Kernel& kernel, const sycl::id<std::max(1, Dims)>& s_id, const range<Dims>& global_range, const id<Dims>& global_offset,
            const id<Dims>& chunk_offset, Reducers&... reducers) {
                kernel(make_item<Dims>(id_cast<Dims>(id<std::max(1, Dims)>(s_id)) + chunk_offset, global_offset, global_range), reducers...);
        }

        template <typename Kernel, int Dims, typename... Reducers>
        inline void invoke_kernel(const Kernel& kernel, const sycl::nd_item<std::max(1, Dims)>& s_item, const range<Dims>& global_range,
            const id<Dims>& global_offset, const id<Dims>& chunk_offset, const range<Dims>& group_range, const id<Dims>& group_offset, Reducers&... reducers) {
                kernel(make_nd_item<Dims>(s_item, global_range, global_offset, chunk_offset, group_range, group_offset), reducers...);
        }

        template <typename Kernel, int Dims>
        auto bind_simple_kernel(const Kernel& kernel, const range<Dims>& global_range, const id<Dims>& global_offset, const id<Dims>& chunk_offset) {
                return [=](auto s_item_or_id, auto&... reducers) {
                        static_assert(std::is_invocable_v<Kernel, celerity::item<Dims>, decltype(reducers)...>,
                            "Kernel function must be invocable with celerity::item<Dims> and as many reducer objects as reductions passed to parallel_for");
                        if constexpr(CELERITY_WORKAROUND(DPCPP) && std::is_same_v<sycl::id<Dims>, decltype(s_item_or_id)>) {
                                // CELERITY_WORKAROUND_LESS_OR_EQUAL: DPC++ passes a sycl::id instead of a sycl::item to kernels alongside reductions
                                invoke_kernel(kernel, s_item_or_id, global_range, global_offset, chunk_offset, reducers...);
                        } else {
                                invoke_kernel(kernel, s_item_or_id.get_id(), global_range, global_offset, chunk_offset, reducers...);
                        }
                };
        }

        template <typename Kernel, int Dims>
        auto bind_nd_range_kernel(const Kernel& kernel, const range<Dims>& global_range, const id<Dims>& global_offset, const id<Dims> chunk_offset,
            const range<Dims>& group_range, const id<Dims>& group_offset) {
                return [=](sycl::nd_item<std::max(1, Dims)> s_item, auto&... reducers) {
                        static_assert(std::is_invocable_v<Kernel, celerity::nd_item<Dims>, decltype(reducers)...>,
                            "Kernel function must be invocable with celerity::nd_item<Dims> and as many reducer objects as reductions passed to parallel_for");
                        invoke_kernel(kernel, s_item, global_range, global_offset, chunk_offset, group_range, group_offset, reducers...);
                };
        }

        template <typename KernelName, typename... Params>
        inline void invoke_sycl_parallel_for(sycl::handler& cgh, Params&&... args) {
                static_assert(CELERITY_FEATURE_UNNAMED_KERNELS || !is_unnamed_kernel<KernelName>,
                    "Your SYCL implementation does not support unnamed kernels, add a kernel name template parameter to this parallel_for invocation");
                if constexpr(detail::is_unnamed_kernel<KernelName>) {
#if CELERITY_FEATURE_UNNAMED_KERNELS // see static_assert above
                        cgh.parallel_for(std::forward<Params>(args)...);
#endif
                } else {
                        cgh.parallel_for<KernelName>(std::forward<Params>(args)...);
                }
        }

        template <typename DataT, int Dims, typename BinaryOperation, bool WithExplicitIdentity>
        class reduction_descriptor;

        template <typename DataT, int Dims, typename BinaryOperation, bool WithExplicitIdentity>
        auto make_sycl_reduction(const reduction_descriptor<DataT, Dims, BinaryOperation, WithExplicitIdentity>& d, void* ptr) {
                if constexpr(WithExplicitIdentity) {
                        return sycl::reduction(static_cast<DataT*>(ptr), d.m_identity, d.m_op, sycl::property_list{sycl::property::reduction::initialize_to_identity{}});
                } else {
                        return sycl::reduction(static_cast<DataT*>(ptr), d.m_op, sycl::property_list{sycl::property::reduction::initialize_to_identity{}});
                }
        }

        template <typename DataT, int Dims, typename BinaryOperation>
        class reduction_descriptor<DataT, Dims, BinaryOperation, false /* WithExplicitIdentity */> {
          public:
                reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT /* identity */, bool include_current_buffer_value)
                    : m_bid(bid), m_op(combiner), m_include_current_buffer_value(include_current_buffer_value) {}

          private:
                friend auto make_sycl_reduction<DataT, Dims, BinaryOperation, false>(const reduction_descriptor&, void*);

                buffer_id m_bid;
                BinaryOperation m_op;
                bool m_include_current_buffer_value;
        };

        template <typename DataT, int Dims, typename BinaryOperation>
        class reduction_descriptor<DataT, Dims, BinaryOperation, true /* WithExplicitIdentity */> {
          public:
                reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT identity, bool include_current_buffer_value)
                    : m_bid(bid), m_op(combiner), m_identity(identity), m_include_current_buffer_value(include_current_buffer_value) {}

          private:
                friend auto make_sycl_reduction<DataT, Dims, BinaryOperation, true>(const reduction_descriptor&, void*);

                buffer_id m_bid;
                BinaryOperation m_op;
                DataT m_identity{};
                bool m_include_current_buffer_value;
        };

        template <bool WithExplicitIdentity, typename DataT, int Dims, typename BinaryOperation>
        auto make_reduction(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation op, DataT identity, const sycl::property_list& prop_list) {
                if(vars.get_range().size() != 1) {
                        // Like SYCL 2020, Celerity only supports reductions to unit-sized buffers. This allows us to avoid tracking different parts of the buffer
                        // as distributed_state and pending_reduction_state.
                        throw std::runtime_error("Only unit-sized buffers can be reduction targets");
                }

                const auto bid = detail::get_buffer_id(vars);
                const auto include_current_buffer_value = !prop_list.has_property<celerity::property::reduction::initialize_to_identity>();

                const auto rid = detail::runtime::get_instance().create_reduction(detail::make_reducer(op, identity));
                add_reduction(cgh, reduction_info{rid, bid, include_current_buffer_value});

                return detail::reduction_descriptor<DataT, Dims, BinaryOperation, WithExplicitIdentity>{bid, op, identity, include_current_buffer_value};
        }

        template <typename... ReductionsAndKernel>
        struct is_reductions_and_kernel_object : std::false_type {};

        template <typename Kernel>
        struct is_reductions_and_kernel_object<Kernel> : std::true_type {};

        template <bool WithExplicitIdentity, typename DataT, int Dims, typename BinaryOperation, typename... ReductionsAndKernel>
        struct is_reductions_and_kernel_object<reduction_descriptor<DataT, Dims, BinaryOperation, WithExplicitIdentity>, ReductionsAndKernel...>
            : is_reductions_and_kernel_object<ReductionsAndKernel...> {};

        template <typename... KernelAndReductions>
        constexpr bool is_reductions_and_kernel_v = is_reductions_and_kernel_object<std::remove_cv_t<std::remove_reference_t<KernelAndReductions>>...>::value;

} // namespace detail

/// Pass to `handler::host_task` to select the master-node task overload.
inline constexpr detail::on_master_node_tag on_master_node;

/// Equivalent to `range<0>{}` when passed to `handler::host_task`.
inline constexpr detail::once_tag once;

class handler {
  public:
        template <typename KernelName = detail::unnamed_kernel, int Dims, typename... ReductionsAndKernel,
            std::enable_if_t<detail::is_reductions_and_kernel_v<ReductionsAndKernel...>, int> = 0>
        void parallel_for(range<Dims> global_range, ReductionsAndKernel&&... reductions_and_kernel) {
                parallel_for_reductions_and_kernel<detail::simple_kernel_flavor, KernelName, Dims, ReductionsAndKernel...>(global_range, id<Dims>(),
                    detail::no_local_size{}, std::make_index_sequence<sizeof...(reductions_and_kernel) - 1>{},
                    std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
        }
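
        // Illustrative usage sketch (not part of this header; `cgh` is the handler of a hypothetical surrounding
        // command group): a simple 1D device kernel, with an optional kernel name template parameter.
        //
        //   cgh.parallel_for<class my_kernel>(celerity::range<1>{1024}, [=](celerity::item<1> item) {
        //       // per-item work; reducer objects would follow `item` if reductions were passed above
        //   });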

        template <typename KernelName = detail::unnamed_kernel, int Dims, typename... ReductionsAndKernel,
            std::enable_if_t<detail::is_reductions_and_kernel_v<ReductionsAndKernel...>, int> = 0>
        void parallel_for(range<Dims> global_range, id<Dims> global_offset, ReductionsAndKernel&&... reductions_and_kernel) {
                parallel_for_reductions_and_kernel<detail::simple_kernel_flavor, KernelName, Dims, ReductionsAndKernel...>(global_range, global_offset,
                    detail::no_local_size{}, std::make_index_sequence<sizeof...(reductions_and_kernel) - 1>{},
                    std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
        }

        template <typename KernelName = detail::unnamed_kernel, typename... ReductionsAndKernel,
            std::enable_if_t<detail::is_reductions_and_kernel_v<ReductionsAndKernel...>, int> = 0>
        void parallel_for(const size_t global_range, ReductionsAndKernel&&... reductions_and_kernel) {
                parallel_for<KernelName>(range<1>(global_range), std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
        }

        template <typename KernelName = detail::unnamed_kernel, typename... ReductionsAndKernel,
            std::enable_if_t<detail::is_reductions_and_kernel_v<ReductionsAndKernel...>, int> = 0>
        void parallel_for(const size_t global_range, const size_t global_offset, ReductionsAndKernel&&... reductions_and_kernel) {
                parallel_for<KernelName>(range<1>(global_range), id<1>(global_offset), std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
        }

        template <typename KernelName = detail::unnamed_kernel, int Dims, typename... ReductionsAndKernel,
            std::enable_if_t<detail::is_reductions_and_kernel_v<ReductionsAndKernel...>, int> = 0>
        void parallel_for(celerity::nd_range<Dims> execution_range, ReductionsAndKernel&&... reductions_and_kernel) {
                parallel_for_reductions_and_kernel<detail::nd_range_kernel_flavor, KernelName, Dims, ReductionsAndKernel...>(execution_range.get_global_range(),
                    execution_range.get_offset(), execution_range.get_local_range(), std::make_index_sequence<sizeof...(reductions_and_kernel) - 1>{},
                    std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
        }
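
        // Illustrative usage sketch (not part of this header): the ND-range variant, whose kernel receives a
        // celerity::nd_item instead of a celerity::item; the local range also becomes the minimum split granularity.
        //
        //   cgh.parallel_for(celerity::nd_range<1>{{1024}, {64}}, [=](celerity::nd_item<1> item) { /* ... */ });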

        /**
         * Schedules `task` to execute on the master node only. Call via `cgh.host_task(celerity::on_master_node, []...)`. The task functor is assumed to be
         * invocable with the signature `void(const celerity::partition<0> &)` or `void()`.
         *
         * The task is executed in a background thread and multiple master node tasks may be executed concurrently if they are independent in the task graph, so
         * proper synchronization must be ensured.
         */
        template <typename Functor>
        void host_task(const detail::on_master_node_tag on_master_node, Functor&& task) {
                auto launcher = make_host_task_launcher<0, false>(detail::zeros, detail::non_collective_group_id, std::forward<Functor>(task));
                create_master_node_task(std::move(launcher));
        }
        /**
         * Schedules `task` to be executed collectively on all nodes participating in the specified collective group. Call via
         * `cgh.host_task(celerity::experimental::collective, []...)` or `cgh.host_task(celerity::experimental::collective(group), []...)`.
         * The task functor is assumed to be invocable with the signature `void(const celerity::experimental::collective_partition&)`
         * or `void(const celerity::partition<1>&)`.
         *
         * This provides a framework for using arbitrary collective MPI operations in a host task, such as performing collective I/O with parallel HDF5.
         * The local node id, the number of participating nodes, as well as the group MPI communicator can be obtained from the `collective_partition` passed into
         * the task functor.
         *
         * All collective tasks within a collective group are guaranteed to be executed in the same order on all nodes.
         */
        template <typename Functor>
        void host_task(const detail::collective_tag collective, Functor&& task) {
                // FIXME: We should not have to know how the global range is determined for collective tasks to create the launcher
                auto launcher = make_host_task_launcher<1, true>(range<3>{m_num_collective_nodes, 1, 1}, collective.m_cgid, std::forward<Functor>(task));
                create_collective_task(collective.m_cgid, std::move(launcher));
        }
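
        // Illustrative usage sketch (not part of this header): a collective host task on the default collective
        // group; the `collective_partition` documented above provides the node id, node count and group MPI
        // communicator, which can be handed to collective libraries such as parallel HDF5.
        //
        //   cgh.host_task(celerity::experimental::collective, [=](celerity::experimental::collective_partition part) {
        //       // perform MPI collective operations using the communicator obtained from `part`
        //   });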

        /**
         * Schedules a distributed execution of `task` by splitting the iteration space in a runtime-defined manner. The task functor is assumed to be invocable
         * with the signature `void(const celerity::partition<Dims>&)`.
         *
         * The task is executed in a background thread with multiple host tasks being run concurrently if they are independent in the task graph, so proper
         * synchronization must be ensured. The partition passed into the task functor describes the split each host receives. It may be used with accessors to
         * obtain the per-node portion of a buffer en-bloc, see `celerity::accessor::get_allocation_window` for details.
         *
         * There are no guarantees with respect to the split size and the order in which host tasks are executed between nodes other than
         * the restrictions imposed by dependencies in the task graph. Also, the task functor may be invoked multiple times on one node and not be scheduled on
         * another node. If you need guarantees about execution order, consider `host_task(experimental::collective)` instead.
         */
        template <int Dims, typename Functor>
        void host_task(range<Dims> global_range, id<Dims> global_offset, Functor&& task) {
                const detail::task_geometry geometry{
                    Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(global_range, range<Dims>(detail::ones))};
                auto launcher = make_host_task_launcher<Dims, false>(detail::range_cast<3>(global_range), 0, std::forward<Functor>(task));
                create_host_compute_task(geometry, std::move(launcher));
        }
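
        // Illustrative usage sketch (not part of this header): a distributed host task over a 1D iteration space;
        // `part` describes the subrange assigned to this node, as documented above.
        //
        //   cgh.host_task(celerity::range<1>{1024}, celerity::id<1>{0}, [=](const celerity::partition<1>& part) {
        //       // process the items described by `part` on this node
        //   });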

        /**
         * Like `host_task(range<Dims> global_range, id<Dims> global_offset, Functor task)`, but with a `global_offset` of zero.
         */
        template <int Dims, typename Functor>
        void host_task(range<Dims> global_range, Functor&& task) {
                host_task(global_range, {}, std::forward<Functor>(task));
        }

        /**
         * Schedules a host task with a single-element iteration space, causing it to be executed exactly once and on a single cluster node.
         * Equivalent to `host_task(range<0>{}, ...)`.
         */
        template <typename Functor>
        void host_task(const detail::once_tag once, Functor&& task) {
                host_task(range<0>{}, std::forward<Functor>(task));
        }

  private:
        friend handler detail::make_command_group_handler(const detail::task_id tid, const size_t num_collective_nodes);
        friend std::unique_ptr<detail::task> detail::into_task(handler&& cgh);
        friend detail::hydration_id detail::add_requirement(handler& cgh, const detail::buffer_id bid, std::unique_ptr<detail::range_mapper_base> rm);
        friend void detail::add_requirement(handler& cgh, const detail::host_object_id hoid, const experimental::side_effect_order order, const bool is_void);
        friend void detail::add_reduction(handler& cgh, const detail::reduction_info& rinfo);
        template <int Dims>
        friend void experimental::constrain_split(handler& cgh, const range<Dims>& constraint);
        template <typename Hint>
        friend void experimental::hint(handler& cgh, Hint&& hint);
        friend void detail::set_task_name(handler& cgh, const std::string& debug_name);

        detail::task_id m_tid;
        detail::buffer_access_map m_access_map;
        detail::side_effect_map m_side_effects;
        size_t m_non_void_side_effects_count = 0;
        detail::reduction_set m_reductions;
        std::unique_ptr<detail::task> m_task = nullptr;
        size_t m_num_collective_nodes;
        detail::hydration_id m_next_accessor_hydration_id = 1;
        std::optional<std::string> m_usr_def_task_name;
        range<3> m_split_constraint = detail::ones;
        std::vector<std::unique_ptr<detail::hint_base>> m_hints;

        handler(detail::task_id tid, size_t num_collective_nodes) : m_tid(tid), m_num_collective_nodes(num_collective_nodes) {}

        template <typename KernelFlavor, typename KernelName, int Dims, typename... ReductionsAndKernel, size_t... ReductionIndices>
        void parallel_for_reductions_and_kernel(range<Dims> global_range, id<Dims> global_offset,
            typename detail::kernel_flavor_traits<KernelFlavor, Dims>::local_size_type local_size, std::index_sequence<ReductionIndices...> indices,
            ReductionsAndKernel&&... kernel_and_reductions) {
                auto args_tuple = std::forward_as_tuple(kernel_and_reductions...);
                auto&& kernel = std::get<sizeof...(kernel_and_reductions) - 1>(args_tuple);
                parallel_for_kernel_and_reductions<KernelFlavor, KernelName>(
                    global_range, global_offset, local_size, std::forward<decltype(kernel)>(kernel), std::get<ReductionIndices>(args_tuple)...);
        }

        template <typename KernelFlavor, typename KernelName, int Dims, typename Kernel, typename... Reductions>
        void parallel_for_kernel_and_reductions(range<Dims> global_range, id<Dims> global_offset,
            typename detail::kernel_flavor_traits<KernelFlavor, Dims>::local_size_type local_range, Kernel&& kernel, Reductions&... reductions) {
                range<3> granularity = {1, 1, 1};
                if constexpr(detail::kernel_flavor_traits<KernelFlavor, Dims>::has_local_size) {
                        for(int d = 0; d < Dims; ++d) {
                                granularity[d] = local_range[d];
                        }
                }
                const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset),
                    get_constrained_granularity(global_range, detail::range_cast<Dims>(granularity))};
                auto launcher = make_device_kernel_launcher<KernelFlavor, KernelName, Dims>(
                    global_range, global_offset, local_range, std::forward<Kernel>(kernel), std::index_sequence_for<Reductions...>(), reductions...);
                create_device_compute_task(geometry, detail::kernel_debug_name<KernelName>(), std::move(launcher));
        }

        [[nodiscard]] detail::hydration_id add_requirement(const detail::buffer_id bid, std::unique_ptr<detail::range_mapper_base> rm) {
                assert(m_task == nullptr);
                m_access_map.add_access(bid, std::move(rm));
                return m_next_accessor_hydration_id++;
        }

        void add_requirement(const detail::host_object_id hoid, const experimental::side_effect_order order, const bool is_void) {
                assert(m_task == nullptr);
                m_side_effects.add_side_effect(hoid, order);
                if(!is_void) { m_non_void_side_effects_count++; }
        }

        void add_reduction(const detail::reduction_info& rinfo) {
                assert(m_task == nullptr);
                m_reductions.push_back(rinfo);
        }

        template <int Dims>
        void experimental_constrain_split(const range<Dims>& constraint) {
                assert(m_task == nullptr);
                m_split_constraint = detail::range_cast<3>(constraint);
        }

        template <typename Hint>
        void experimental_hint(Hint&& hint) {
                static_assert(std::is_base_of_v<detail::hint_base, std::decay_t<Hint>>, "Hint must extend hint_base");
                static_assert(std::is_move_constructible_v<Hint>, "Hint must be move-constructible");
                for(auto& h : m_hints) {
                        // We currently don't allow more than one hint of the same type for simplicity; this could be loosened in the future.
                        auto& hr = *h; // Need to do this here to avoid -Wpotentially-evaluated-expression
                        if(typeid(hr) == typeid(hint)) { throw std::runtime_error("Providing more than one hint of the same type is not allowed"); }
                        h->validate(hint);
                }
                m_hints.emplace_back(std::make_unique<std::decay_t<Hint>>(std::forward<Hint>(hint)));
        }

        template <int Dims>
        range<3> get_constrained_granularity(const range<Dims>& global_size, const range<Dims>& granularity) const {
                range<3> result = detail::range_cast<3>(granularity);
                for(int i = 0; i < Dims; ++i) {
                        const auto lcm = std::lcm(granularity[i], m_split_constraint[i]);
                        if(lcm == 0) { throw std::runtime_error("Split constraint cannot be 0"); }
                        result[i] = lcm;
                }
                if(global_size % detail::range_cast<Dims>(result) != range<Dims>(detail::zeros)) {
                        throw std::runtime_error(fmt::format("The{}split constraint {} does not evenly divide the kernel global size {}",
                            granularity.size() > 1 ? " effective " : " ", detail::range_cast<Dims>(result), global_size));
                }
                return result;
        }

        void create_host_compute_task(const detail::task_geometry& geometry, detail::host_task_launcher launcher) {
                assert(m_task == nullptr);
                if(geometry.global_size.size() == 0) {
                        // TODO this can be easily supported by not creating a task in case the execution range is empty
                        throw std::runtime_error{"The execution range of distributed host tasks must have at least one item"};
                }
                m_task =
                    detail::task::make_host_compute(m_tid, geometry, std::move(launcher), std::move(m_access_map), std::move(m_side_effects), std::move(m_reductions));

                m_task->set_debug_name(m_usr_def_task_name.value_or(""));
        }

        void create_device_compute_task(const detail::task_geometry& geometry, const std::string& debug_name, detail::device_kernel_launcher launcher) {
                assert(m_task == nullptr);
                if(geometry.global_size.size() == 0) {
                        // TODO unless reductions are involved, this can be easily supported by not creating a task in case the execution range is empty.
                        // Edge case: If the task includes reductions that specify property::reduction::initialize_to_identity, we need to create a task that sets
                        // the buffer state to an empty pending_reduction_state in the graph_generator. This will cause a trivial reduction_command to be generated on
                        // each node that reads from the reduction output buffer, initializing it to the identity value locally.
                        throw std::runtime_error{"The execution range of device tasks must have at least one item"};
                }
                // Note that cgf_diagnostics has a similar check, but we don't catch void side effects there.
                if(!m_side_effects.empty()) { throw std::runtime_error{"Side effects cannot be used in device kernels"}; }
                m_task = detail::task::make_device_compute(m_tid, geometry, std::move(launcher), std::move(m_access_map), std::move(m_reductions));

                m_task->set_debug_name(m_usr_def_task_name.value_or(debug_name));
        }

        void create_collective_task(const detail::collective_group_id cgid, detail::host_task_launcher launcher) {
                assert(m_task == nullptr);
                m_task = detail::task::make_collective(m_tid, cgid, m_num_collective_nodes, std::move(launcher), std::move(m_access_map), std::move(m_side_effects));

                m_task->set_debug_name(m_usr_def_task_name.value_or(""));
        }

        void create_master_node_task(detail::host_task_launcher launcher) {
                assert(m_task == nullptr);
                m_task = detail::task::make_master_node(m_tid, std::move(launcher), std::move(m_access_map), std::move(m_side_effects));

                m_task->set_debug_name(m_usr_def_task_name.value_or(""));
        }

        template <typename KernelFlavor, typename KernelName, int Dims, typename Kernel, size_t... ReductionIndices, typename... Reductions>
        detail::device_kernel_launcher make_device_kernel_launcher(const range<Dims>& global_range, const id<Dims>& global_offset,
            typename detail::kernel_flavor_traits<KernelFlavor, Dims>::local_size_type local_range, Kernel&& kernel,
            std::index_sequence<ReductionIndices...> /* indices */, Reductions... reductions) {
                static_assert(std::is_copy_constructible_v<std::decay_t<Kernel>>, "Kernel functor must be copyable"); // Required for hydration

                // Check whether all accessors are being captured by value etc.
                // Although the diagnostics should always be available, we currently disable them for some test cases.
                if(detail::cgf_diagnostics::is_available()) { detail::cgf_diagnostics::get_instance().check<target::device>(kernel, m_access_map); }

                return [=](sycl::handler& sycl_cgh, const detail::box<3>& execution_range, const std::vector<void*>& reduction_ptrs) {
                        constexpr int sycl_dims = std::max(1, Dims);
                        if constexpr(std::is_same_v<KernelFlavor, detail::simple_kernel_flavor>) {
                                const auto sycl_global_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(execution_range.get_range()));
                                detail::invoke_sycl_parallel_for<KernelName>(sycl_cgh, sycl_global_range,
                                    detail::make_sycl_reduction(reductions, reduction_ptrs[ReductionIndices])...,
                                    detail::bind_simple_kernel(kernel, global_range, global_offset, detail::id_cast<Dims>(execution_range.get_offset())));
                        } else if constexpr(std::is_same_v<KernelFlavor, detail::nd_range_kernel_flavor>) {
                                const auto sycl_global_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(execution_range.get_range()));
                                const auto sycl_local_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(local_range));
                                detail::invoke_sycl_parallel_for<KernelName>(sycl_cgh, sycl::nd_range{sycl_global_range, sycl_local_range},
                                    detail::make_sycl_reduction(reductions, reduction_ptrs[ReductionIndices])...,
                                    detail::bind_nd_range_kernel(kernel, global_range, global_offset, detail::id_cast<Dims>(execution_range.get_offset()),
                                        global_range / local_range, detail::id_cast<Dims>(execution_range.get_offset()) / local_range));
                        } else {
                                static_assert(detail::constexpr_false<KernelFlavor>);
                        }
                };
        }

        template <int Dims, bool Collective, typename Kernel>
        detail::host_task_launcher make_host_task_launcher(const range<3>& global_range, const detail::collective_group_id cgid, Kernel&& kernel) {
                static_assert(Collective || std::is_invocable_v<Kernel> || std::is_invocable_v<Kernel, const partition<Dims>>,
                    "Kernel for host task must be invocable with either no arguments or a celerity::partition<Dims>");
                static_assert(!Collective || std::is_invocable_v<Kernel> || std::is_invocable_v<Kernel, const experimental::collective_partition>,
                    "Kernel for collective host task must be invocable with either no arguments or a celerity::experimental::collective_partition");
                static_assert(std::is_copy_constructible_v<std::decay_t<Kernel>>, "Kernel functor must be copyable"); // Required for hydration
                static_assert(Dims >= 0);

                // Check whether all accessors are being captured by value etc.
                // Although the diagnostics should always be available, we currently disable them for some test cases.
                if(detail::cgf_diagnostics::is_available()) {
                        detail::cgf_diagnostics::get_instance().check<target::host_task>(kernel, m_access_map, m_non_void_side_effects_count);
                }

                return [kernel, global_range](const detail::box<3>& execution_range, const detail::communicator* collective_comm) {
                        (void)global_range;
                        (void)collective_comm;
                        if constexpr(Dims > 0) {
                                if constexpr(Collective) {
                                        static_assert(Dims == 1);
                                        assert(collective_comm != nullptr);
                                        const auto part =
                                            detail::make_collective_partition(detail::range_cast<1>(global_range), detail::box_cast<1>(execution_range), *collective_comm);
                                        kernel(part);
                                } else {
                                        const auto part = detail::make_partition<Dims>(detail::range_cast<Dims>(global_range), detail::box_cast<Dims>(execution_range));
                                        kernel(part);
                                }
                        } else if constexpr(std::is_invocable_v<Kernel, const partition<0>&>) {
                                (void)execution_range;
                                const auto part = detail::make_partition<0>(range<0>(), subrange<0>());
                                kernel(part);
                        } else {
                                (void)execution_range;
                                kernel();
                        }
                };
        }

        std::unique_ptr<detail::task> into_task() && {
                assert(m_task != nullptr);
                for(auto& h : m_hints) {
                        m_task->add_hint(std::move(h));
                }
                return std::move(m_task);
        }
};

namespace detail {

        inline handler make_command_group_handler(const detail::task_id tid, const size_t num_collective_nodes) { return handler(tid, num_collective_nodes); }

        inline std::unique_ptr<detail::task> into_task(handler&& cgh) { return std::move(cgh).into_task(); }

        [[nodiscard]] inline hydration_id add_requirement(handler& cgh, const buffer_id bid, std::unique_ptr<range_mapper_base> rm) {
                return cgh.add_requirement(bid, std::move(rm));
        }

        inline void add_requirement(handler& cgh, const host_object_id hoid, const experimental::side_effect_order order, const bool is_void) {
                return cgh.add_requirement(hoid, order, is_void);
        }

        inline void add_reduction(handler& cgh, const detail::reduction_info& rinfo) { return cgh.add_reduction(rinfo); }

        inline void set_task_name(handler& cgh, const std::string& debug_name) { cgh.m_usr_def_task_name = {debug_name}; }

        // TODO: The _impl functions in detail only exist during the grace period for deprecated reductions on const buffers; move outside again afterwards.
        template <typename DataT, int Dims, typename BinaryOperation>
        auto reduction_impl(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
                static_assert(sycl::has_known_identity_v<BinaryOperation, DataT>,
                    "Celerity does not currently support reductions without an identity. Either specialize "
                    "sycl::known_identity or use the reduction() overload taking an identity at runtime");
                return detail::make_reduction<false>(vars, cgh, combiner, sycl::known_identity_v<BinaryOperation, DataT>, prop_list);
        }

        template <typename DataT, int Dims, typename BinaryOperation>
        auto reduction_impl(
            const buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
                static_assert(!sycl::has_known_identity_v<BinaryOperation, DataT>, "Identity is known to SYCL, remove the identity parameter from reduction()");
                return detail::make_reduction<true>(vars, cgh, combiner, identity, prop_list);
        }

} // namespace detail

template <typename DataT, int Dims, typename BinaryOperation>
auto reduction(buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
        return detail::reduction_impl(vars, cgh, combiner, prop_list);
}
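
// Illustrative usage sketch (not part of this header; `cgh`, `sum_buf` and `N` are hypothetical): reductions
// target a unit-sized buffer (see detail::make_reduction above) and pass a reducer object to the kernel after
// the item argument.
//
//   celerity::buffer<float, 1> sum_buf{celerity::range<1>{1}};
//   cgh.parallel_for(celerity::range<1>{N},
//       celerity::reduction(sum_buf, cgh, sycl::plus<float>{},
//           sycl::property_list{celerity::property::reduction::initialize_to_identity{}}),
//       [=](celerity::item<1> item, auto& sum) { sum += 1.f; });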

template <typename DataT, int Dims, typename BinaryOperation>
auto reduction(buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
        return detail::reduction_impl(vars, cgh, identity, combiner, prop_list);
}

template <typename DataT, int Dims, typename BinaryOperation>
[[deprecated("Creating reduction from const buffer is deprecated, capture buffer by reference instead")]] auto reduction(
    const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
        return detail::reduction_impl(vars, cgh, combiner, prop_list);
}

template <typename DataT, int Dims, typename BinaryOperation>
[[deprecated("Creating reduction from const buffer is deprecated, capture buffer by reference instead")]] auto reduction(
    const buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const sycl::property_list& prop_list = {}) {
        return detail::reduction_impl(vars, cgh, identity, combiner, prop_list);
}

} // namespace celerity

namespace celerity::experimental {
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint) {
        cgh.experimental_constrain_split(constraint);
}

template <typename Hint>
void hint(handler& cgh, Hint&& hint) {
        cgh.experimental_hint(std::forward<Hint>(hint));
}
} // namespace celerity::experimental