celerity / celerity-runtime, build 10216674169 (push, via github)

02 Aug 2024 01:45PM UTC coverage: 94.951% (+2.1%) from 92.884%

fknorr: Remove experimental::user_benchmarker

    user_benchmarker has been obsolete ever since we moved away from
    structured logging as the profiler (CPAT) interface.

2978 of 3372 branches covered (88.32%); branch coverage is included in the aggregate %.
6557 of 6670 relevant lines covered (98.31%)
1534446.4 hits per line

Source file: /include/handler.h (97.03% covered)
#pragma once

#include <memory>
#include <type_traits>
#include <typeinfo>
#include <utility>

#include <CL/sycl.hpp>
#include <fmt/format.h>

#include "buffer.h"
#include "cgf_diagnostics.h"
#include "item.h"
#include "partition.h"
#include "range_mapper.h"
#include "ranges.h"
#include "task.h"
#include "types.h"
#include "workaround.h"

namespace celerity {
class handler;
}

namespace celerity::experimental {

/**
 * Constrains the granularity at which a task's global range can be split into chunks.
 *
 * In some situations an output buffer access is only guaranteed to write to non-overlapping subranges
 * if the task is split in a certain way. For example when computing the row-wise sum of a 2D matrix into
 * a 1D vector, a split constraint is required to ensure that each element of the vector is written by
 * exactly one chunk.
 *
 * Another use case is for performance optimization, for example when the creation of lots of small chunks
 * would result in hardware under-utilization and excessive data transfers.
 *
 * Since ND-range parallel_for kernels are already constrained to be split with group size granularity,
 * adding an additional constraint on top results in an effective constraint of LCM(group size, constraint).
 *
 * The constraint (or effective constraint) must evenly divide the global range.
 * This function has no effect when called for a task without a user-provided global range.
 */
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint);
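
// Example (minimal sketch) for the row-wise-sum scenario described above. `rows`, `cols`, the accessors and the
// surrounding command group are illustrative and not declared in this header:
//
//   // Require chunks to span whole rows so that each element of the 1D output is written by exactly one chunk.
//   celerity::experimental::constrain_split(cgh, celerity::range<2>{1, cols});
//   cgh.parallel_for(celerity::range<2>{rows, cols}, [=](celerity::item<2> it) { /* accumulate into row it[0] */ });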

} // namespace celerity::experimental

namespace celerity {

namespace detail {
    class task_manager;

    handler make_command_group_handler(const task_id tid, const size_t num_collective_nodes);
    std::unique_ptr<task> into_task(handler&& cgh);
    hydration_id add_requirement(handler& cgh, const buffer_id bid, std::unique_ptr<range_mapper_base> rm);
    void add_requirement(handler& cgh, const host_object_id hoid, const experimental::side_effect_order order, const bool is_void);
    void add_reduction(handler& cgh, const reduction_info& rinfo);

    void set_task_name(handler& cgh, const std::string& debug_name);

    struct unnamed_kernel {};

    template <typename KernelName>
    constexpr bool is_unnamed_kernel = std::is_same_v<KernelName, unnamed_kernel>;

    template <typename KernelName>
    std::string kernel_debug_name() {
        return !is_unnamed_kernel<KernelName> ? utils::get_simplified_type_name<KernelName>() : std::string{};
    }

    struct simple_kernel_flavor {};
    struct nd_range_kernel_flavor {};

    template <typename Flavor, int Dims>
    struct kernel_flavor_traits;

    struct no_local_size {};

    template <int Dims>
    struct kernel_flavor_traits<simple_kernel_flavor, Dims> {
        inline static constexpr bool has_local_size = false;
        using local_size_type = no_local_size;
    };

    template <int Dims>
    struct kernel_flavor_traits<nd_range_kernel_flavor, Dims> {
        inline static constexpr bool has_local_size = true;
        using local_size_type = range<Dims>;
    };
} // namespace detail

/**
 * Tag type marking a `handler::host_task` as a master-node task. Do not construct this type directly, but use `celerity::on_master_node`.
 */
class on_master_node_tag {};

/**
 * Pass to `handler::host_task` to select the master-node task overload.
 */
inline constexpr on_master_node_tag on_master_node;

namespace experimental {
    class collective_tag_factory;

    /**
     * Each collective host task is executed within a collective group. If multiple host tasks are scheduled within the same collective group, they are
     * guaranteed to execute in the same order on every node and within a single thread per node. Each group has its own MPI communicator spanning all
     * participating nodes, so MPI operations the user invokes from different collective groups do not race.
     */
    class collective_group {
      public:
        /// Creates a new collective group with a globally unique id. This must only be called from the main thread.
        collective_group() noexcept : m_cgid(s_next_cgid++) {}

      private:
        friend class collective_tag_factory;
        detail::collective_group_id m_cgid;
        inline static detail::collective_group_id s_next_cgid = detail::root_collective_group_id + 1;
    };

    /**
     * Tag type marking a `handler::host_task` as a collective task. Do not construct this type directly, but use `celerity::experimental::collective`
     * or `celerity::experimental::collective(group)`.
     */
    class collective_tag {
      private:
        friend class collective_tag_factory;
        friend class celerity::handler;
        collective_tag(detail::collective_group_id cgid) : m_cgid(cgid) {}
        detail::collective_group_id m_cgid;
    };

    /**
     * The collective group used in collective host tasks when no group is specified explicitly.
     */
    inline const collective_group default_collective_group;

    /**
     * Tag type construction helper. Do not construct this type directly, use `celerity::experimental::collective` instead.
     */
    class collective_tag_factory {
      public:
        operator experimental::collective_tag() const { return default_collective_group.m_cgid; }
        experimental::collective_tag operator()(experimental::collective_group cg) const { return cg.m_cgid; }
    };

    /**
     * Pass to `handler::host_task` to select the collective host task overload.
     *
     * Either as a value to schedule with the `default_collective_group`:
     * ```c++
     * cgh.host_task(celerity::experimental::collective, []...);
     * ```
     *
     * Or by specifying a collective group explicitly:
     * ```c++
     * celerity::experimental::collective_group my_group;
     * ...
     * cgh.host_task(celerity::experimental::collective(my_group), []...);
     * ```
     */
    inline constexpr collective_tag_factory collective;
} // namespace experimental

namespace detail {
    template <typename Kernel, int Dims, typename... Reducers>
    inline void invoke_kernel(const Kernel& kernel, const sycl::id<std::max(1, Dims)>& s_id, const range<Dims>& global_range, const id<Dims>& global_offset,
        const id<Dims>& chunk_offset, Reducers&... reducers) {
        kernel(make_item<Dims>(id_cast<Dims>(id<std::max(1, Dims)>(s_id)) + chunk_offset, global_offset, global_range), reducers...);
    }

    template <typename Kernel, int Dims, typename... Reducers>
    inline void invoke_kernel(const Kernel& kernel, const cl::sycl::nd_item<std::max(1, Dims)>& s_item, const range<Dims>& global_range,
        const id<Dims>& global_offset, const id<Dims>& chunk_offset, const range<Dims>& group_range, const id<Dims>& group_offset, Reducers&... reducers) {
        kernel(make_nd_item<Dims>(s_item, global_range, global_offset, chunk_offset, group_range, group_offset), reducers...);
    }

    template <typename Kernel, int Dims>
    auto bind_simple_kernel(const Kernel& kernel, const range<Dims>& global_range, const id<Dims>& global_offset, const id<Dims>& chunk_offset) {
        return [=](auto s_item_or_id, auto&... reducers) {
            constexpr int sycl_dims = std::max(1, Dims);
            static_assert(std::is_invocable_v<Kernel, celerity::item<Dims>, decltype(reducers)...>,
                "Kernel function must be invocable with celerity::item<Dims> and as many reducer objects as reductions passed to parallel_for");
            if constexpr(CELERITY_WORKAROUND(DPCPP) && std::is_same_v<sycl::id<Dims>, decltype(s_item_or_id)>) {
                // CELERITY_WORKAROUND_LESS_OR_EQUAL: DPC++ passes a sycl::id instead of a sycl::item to kernels alongside reductions
                invoke_kernel(kernel, s_item_or_id, global_range, global_offset, chunk_offset, reducers...);
            } else {
                // Explicit item constructor: ComputeCpp does not pass a sycl::item, but an implicitly convertible sycl::item_base (?) which does not have
                // `sycl::id<> get_id()`
                invoke_kernel(kernel, cl::sycl::item<sycl_dims>{s_item_or_id}.get_id(), global_range, global_offset, chunk_offset, reducers...);
            }
        };
    }

    template <typename Kernel, int Dims>
    auto bind_nd_range_kernel(const Kernel& kernel, const range<Dims>& global_range, const id<Dims>& global_offset, const id<Dims> chunk_offset,
        const range<Dims>& group_range, const id<Dims>& group_offset) {
        return [=](sycl::nd_item<std::max(1, Dims)> s_item, auto&... reducers) {
            static_assert(std::is_invocable_v<Kernel, celerity::nd_item<Dims>, decltype(reducers)...>,
                "Kernel function must be invocable with celerity::nd_item<Dims> and as many reducer objects as reductions passed to parallel_for");
            invoke_kernel(kernel, s_item, global_range, global_offset, chunk_offset, group_range, group_offset, reducers...);
        };
    }

    template <typename KernelName, typename... Params>
    inline void invoke_sycl_parallel_for(cl::sycl::handler& cgh, Params&&... args) {
        static_assert(CELERITY_FEATURE_UNNAMED_KERNELS || !is_unnamed_kernel<KernelName>,
            "Your SYCL implementation does not support unnamed kernels, add a kernel name template parameter to this parallel_for invocation");
        if constexpr(detail::is_unnamed_kernel<KernelName>) {
#if CELERITY_FEATURE_UNNAMED_KERNELS // see static_assert above
            cgh.parallel_for(std::forward<Params>(args)...);
#endif
        } else {
            cgh.parallel_for<KernelName>(std::forward<Params>(args)...);
        }
    }

    template <typename DataT, int Dims, typename BinaryOperation, bool WithExplicitIdentity>
    class reduction_descriptor;

    template <typename DataT, int Dims, typename BinaryOperation, bool WithExplicitIdentity>
    auto make_sycl_reduction(const reduction_descriptor<DataT, Dims, BinaryOperation, WithExplicitIdentity>& d, void* ptr) {
        if constexpr(WithExplicitIdentity) {
            return sycl::reduction(static_cast<DataT*>(ptr), d.m_identity, d.m_op, sycl::property_list{sycl::property::reduction::initialize_to_identity{}});
        } else {
            return sycl::reduction(static_cast<DataT*>(ptr), d.m_op, sycl::property_list{sycl::property::reduction::initialize_to_identity{}});
        }
    }

    template <typename DataT, int Dims, typename BinaryOperation>
    class reduction_descriptor<DataT, Dims, BinaryOperation, false /* WithExplicitIdentity */> {
      public:
        reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT /* identity */, bool include_current_buffer_value)
            : m_bid(bid), m_op(combiner), m_include_current_buffer_value(include_current_buffer_value) {}

      private:
        friend auto make_sycl_reduction<DataT, Dims, BinaryOperation, false>(const reduction_descriptor&, void*);

        buffer_id m_bid;
        BinaryOperation m_op;
        bool m_include_current_buffer_value;
    };

    template <typename DataT, int Dims, typename BinaryOperation>
    class reduction_descriptor<DataT, Dims, BinaryOperation, true /* WithExplicitIdentity */> {
      public:
        reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT identity, bool include_current_buffer_value)
            : m_bid(bid), m_op(combiner), m_identity(identity), m_include_current_buffer_value(include_current_buffer_value) {}

      private:
        friend auto make_sycl_reduction<DataT, Dims, BinaryOperation, true>(const reduction_descriptor&, void*);

        buffer_id m_bid;
        BinaryOperation m_op;
        DataT m_identity{};
        bool m_include_current_buffer_value;
    };

    template <bool WithExplicitIdentity, typename DataT, int Dims, typename BinaryOperation>
    auto make_reduction(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation op, DataT identity, const cl::sycl::property_list& prop_list) {
        if(vars.get_range().size() != 1) {
            // Like SYCL 2020, Celerity only supports reductions to unit-sized buffers. This allows us to avoid tracking different parts of the buffer
            // as distributed_state and pending_reduction_state.
            throw std::runtime_error("Only unit-sized buffers can be reduction targets");
        }

        const auto bid = detail::get_buffer_id(vars);
        const auto include_current_buffer_value = !prop_list.has_property<celerity::property::reduction::initialize_to_identity>();

        const auto rid = detail::runtime::get_instance().create_reduction(detail::make_reducer(op, identity));
        add_reduction(cgh, reduction_info{rid, bid, include_current_buffer_value});

        return detail::reduction_descriptor<DataT, Dims, BinaryOperation, WithExplicitIdentity>{bid, op, identity, include_current_buffer_value};
    }

} // namespace detail

class handler {
  public:
    template <typename KernelName = detail::unnamed_kernel, int Dims, typename... ReductionsAndKernel>
    void parallel_for(range<Dims> global_range, ReductionsAndKernel&&... reductions_and_kernel) {
        static_assert(sizeof...(reductions_and_kernel) > 0, "No kernel given");
        parallel_for_reductions_and_kernel<detail::simple_kernel_flavor, KernelName, Dims, ReductionsAndKernel...>(global_range, id<Dims>(),
            detail::no_local_size{}, std::make_index_sequence<sizeof...(reductions_and_kernel) - 1>{},
            std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
    }

    template <typename KernelName = detail::unnamed_kernel, int Dims, typename... ReductionsAndKernel>
    void parallel_for(range<Dims> global_range, id<Dims> global_offset, ReductionsAndKernel&&... reductions_and_kernel) {
        static_assert(sizeof...(reductions_and_kernel) > 0, "No kernel given");
        parallel_for_reductions_and_kernel<detail::simple_kernel_flavor, KernelName, Dims, ReductionsAndKernel...>(global_range, global_offset,
            detail::no_local_size{}, std::make_index_sequence<sizeof...(reductions_and_kernel) - 1>{},
            std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
    }

    template <typename KernelName = detail::unnamed_kernel, int Dims, typename... ReductionsAndKernel>
    void parallel_for(celerity::nd_range<Dims> execution_range, ReductionsAndKernel&&... reductions_and_kernel) {
        static_assert(sizeof...(reductions_and_kernel) > 0, "No kernel given");
        parallel_for_reductions_and_kernel<detail::nd_range_kernel_flavor, KernelName, Dims, ReductionsAndKernel...>(execution_range.get_global_range(),
            execution_range.get_offset(), execution_range.get_local_range(), std::make_index_sequence<sizeof...(reductions_and_kernel) - 1>{},
            std::forward<ReductionsAndKernel>(reductions_and_kernel)...);
    }
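
    // Example (minimal sketch) of the three parallel_for overloads above; kernel names and ranges are
    // illustrative, and accessor setup is omitted:
    //
    //   cgh.parallel_for<class scale>(celerity::range<1>{1024}, [=](celerity::item<1> it) { /* ... */ });
    //   cgh.parallel_for<class scale_offset>(celerity::range<1>{1024}, celerity::id<1>{512}, [=](celerity::item<1> it) { /* ... */ });
    //   cgh.parallel_for<class stencil>(celerity::nd_range<2>{{512, 512}, {16, 16}}, [=](celerity::nd_item<2> it) { /* ... */ });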

    /**
     * Schedules `kernel` to execute on the master node only. Call via `cgh.host_task(celerity::on_master_node, []...)`. The kernel is assumed to be invocable
     * with the signature `void(const celerity::partition<0>&)` or `void()`.
     *
     * The kernel is executed in a background thread pool and multiple master node tasks may be executed concurrently if they are independent in the
     * task graph, so proper synchronization must be ensured.
     *
     * **Compatibility note:** This replaces master-access tasks from Celerity 0.1 which were executed on the master node's main thread, so this implementation
     * may require different lifetimes for captures. See `celerity::allow_by_ref` for more information on this topic.
     */
    template <typename Functor>
    void host_task(on_master_node_tag /* tag */, Functor&& kernel) {
        auto launcher = make_host_task_launcher<0, false>(detail::zeros, 0, std::forward<Functor>(kernel));
        create_master_node_task(std::move(launcher));
    }
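
    // Example (minimal sketch) of a master-node host task; the body is illustrative:
    //
    //   cgh.host_task(celerity::on_master_node, [=] {
    //       // runs once, on the master node only, in a background thread
    //       fmt::print("checkpoint written\n");
    //   });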

    /**
     * Schedules `kernel` to be executed collectively on all nodes participating in the specified collective group. Call via
     * `cgh.host_task(celerity::experimental::collective, []...)` or `cgh.host_task(celerity::experimental::collective(group), []...)`.
     * The kernel is assumed to be invocable with the signature `void(const celerity::experimental::collective_partition&)`
     * or `void(const celerity::partition<1>&)`.
     *
     * This provides a framework for using arbitrary collective MPI operations in a host task, such as performing collective I/O with parallel HDF5.
     * The local node id, the number of participating nodes, as well as the group MPI communicator can be obtained from the `collective_partition` passed into
     * the kernel.
     *
     * All collective tasks within a collective group are guaranteed to be executed in the same order on all nodes; additionally, all internal MPI operations
     * and all host kernel invocations are executed in a single thread on each host.
     */
    template <typename Functor>
    void host_task(experimental::collective_tag tag, Functor&& kernel) {
        // FIXME: We should not have to know how the global range is determined for collective tasks to create the launcher
        auto launcher = make_host_task_launcher<1, true>(range<3>{m_num_collective_nodes, 1, 1}, tag.m_cgid, std::forward<Functor>(kernel));
        create_collective_task(tag.m_cgid, std::move(launcher));
    }
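
    // Example (minimal sketch) of a collective host task. `get_collective_mpi_comm()` is assumed to be provided by
    // `experimental::collective_partition` (declared in "partition.h"); verify against that header before relying on it:
    //
    //   cgh.host_task(celerity::experimental::collective, [=](celerity::experimental::collective_partition part) {
    //       MPI_Comm comm = part.get_collective_mpi_comm(); // one communicator per collective group
    //       MPI_Barrier(comm); // safe: same order and a single thread per node within this group
    //   });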

    /**
     * Schedules a distributed execution of `kernel` by splitting the iteration space in a runtime-defined manner. The kernel is assumed to be invocable
     * with the signature `void(const celerity::partition<Dims>&)`.
     *
     * The kernel is executed in a background thread pool with multiple host tasks being run concurrently if they are independent in the task graph,
     * so proper synchronization must be ensured. The partition passed into the kernel describes the split each host receives. It may be used with accessors
     * to obtain the per-node portion of a buffer en-bloc; see `celerity::accessor::get_allocation_window` for details.
     *
     * There are no guarantees with respect to the split size or the order in which host tasks are executed across nodes other than
     * the restrictions imposed by dependencies in the task graph. Also, the kernel may be invoked multiple times on one node and not be scheduled on
     * another node. If you need guarantees about execution order, consider using a collective host task instead.
     */
    template <int Dims, typename Functor>
    void host_task(range<Dims> global_range, id<Dims> global_offset, Functor&& kernel) {
        const detail::task_geometry geometry{
            Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(global_range, range<Dims>(detail::ones))};
        auto launcher = make_host_task_launcher<Dims, false>(detail::range_cast<3>(global_range), 0, std::forward<Functor>(kernel));
        create_host_compute_task(geometry, std::move(launcher));
    }

    /**
     * Like `host_task(range<Dims> global_range, id<Dims> global_offset, Functor kernel)`, but with a `global_offset` of zero.
     */
    template <int Dims, typename Functor>
    void host_task(range<Dims> global_range, Functor&& kernel) {
        host_task(global_range, {}, std::forward<Functor>(kernel));
    }
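
    // Example (minimal sketch) of a distributed host task. `get_subrange()` is assumed to be part of the
    // `celerity::partition` interface from "partition.h"; the loop body is illustrative:
    //
    //   cgh.host_task(celerity::range<1>{num_elements}, [=](celerity::partition<1> part) {
    //       const auto sr = part.get_subrange(); // the slice of the global range assigned to this node
    //       for(size_t i = sr.offset[0]; i < sr.offset[0] + sr.range[0]; ++i) { /* ... */ }
    //   });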

  private:
    friend handler detail::make_command_group_handler(const detail::task_id tid, const size_t num_collective_nodes);
    friend std::unique_ptr<detail::task> detail::into_task(handler&& cgh);
    friend detail::hydration_id detail::add_requirement(handler& cgh, const detail::buffer_id bid, std::unique_ptr<detail::range_mapper_base> rm);
    friend void detail::add_requirement(handler& cgh, const detail::host_object_id hoid, const experimental::side_effect_order order, const bool is_void);
    friend void detail::add_reduction(handler& cgh, const detail::reduction_info& rinfo);
    template <int Dims>
    friend void experimental::constrain_split(handler& cgh, const range<Dims>& constraint);
    template <typename Hint>
    friend void experimental::hint(handler& cgh, Hint&& hint);
    friend void detail::set_task_name(handler& cgh, const std::string& debug_name);

    detail::task_id m_tid;
    detail::buffer_access_map m_access_map;
    detail::side_effect_map m_side_effects;
    size_t m_non_void_side_effects_count = 0;
    detail::reduction_set m_reductions;
    std::unique_ptr<detail::task> m_task = nullptr;
    size_t m_num_collective_nodes;
    detail::hydration_id m_next_accessor_hydration_id = 1;
    std::optional<std::string> m_usr_def_task_name;
    range<3> m_split_constraint = detail::ones;
    std::vector<std::unique_ptr<detail::hint_base>> m_hints;

    handler(detail::task_id tid, size_t num_collective_nodes) : m_tid(tid), m_num_collective_nodes(num_collective_nodes) {}

    template <typename KernelFlavor, typename KernelName, int Dims, typename... ReductionsAndKernel, size_t... ReductionIndices>
    void parallel_for_reductions_and_kernel(range<Dims> global_range, id<Dims> global_offset,
        typename detail::kernel_flavor_traits<KernelFlavor, Dims>::local_size_type local_size, std::index_sequence<ReductionIndices...> indices,
        ReductionsAndKernel&&... kernel_and_reductions) {
        auto args_tuple = std::forward_as_tuple(kernel_and_reductions...);
        auto&& kernel = std::get<sizeof...(kernel_and_reductions) - 1>(args_tuple);
        parallel_for_kernel_and_reductions<KernelFlavor, KernelName>(
            global_range, global_offset, local_size, std::forward<decltype(kernel)>(kernel), std::get<ReductionIndices>(args_tuple)...);
    }

    template <typename KernelFlavor, typename KernelName, int Dims, typename Kernel, typename... Reductions>
    void parallel_for_kernel_and_reductions(range<Dims> global_range, id<Dims> global_offset,
        typename detail::kernel_flavor_traits<KernelFlavor, Dims>::local_size_type local_range, Kernel&& kernel, Reductions&... reductions) {
        range<3> granularity = {1, 1, 1};
        if constexpr(detail::kernel_flavor_traits<KernelFlavor, Dims>::has_local_size) {
            for(int d = 0; d < Dims; ++d) {
                granularity[d] = local_range[d];
            }
        }
        const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset),
            get_constrained_granularity(global_range, detail::range_cast<Dims>(granularity))};
        auto launcher = make_device_kernel_launcher<KernelFlavor, KernelName, Dims>(
            global_range, global_offset, local_range, std::forward<Kernel>(kernel), std::index_sequence_for<Reductions...>(), reductions...);
        create_device_compute_task(geometry, detail::kernel_debug_name<KernelName>(), std::move(launcher));
    }

    [[nodiscard]] detail::hydration_id add_requirement(const detail::buffer_id bid, std::unique_ptr<detail::range_mapper_base> rm) {
        assert(m_task == nullptr);
        m_access_map.add_access(bid, std::move(rm));
        return m_next_accessor_hydration_id++;
    }

    void add_requirement(const detail::host_object_id hoid, const experimental::side_effect_order order, const bool is_void) {
        assert(m_task == nullptr);
        m_side_effects.add_side_effect(hoid, order);
        if(!is_void) { m_non_void_side_effects_count++; }
    }

    void add_reduction(const detail::reduction_info& rinfo) {
        assert(m_task == nullptr);
        m_reductions.push_back(rinfo);
    }

    template <int Dims>
    void experimental_constrain_split(const range<Dims>& constraint) {
        assert(m_task == nullptr);
        m_split_constraint = detail::range_cast<3>(constraint);
    }

    template <typename Hint>
    void experimental_hint(Hint&& hint) {
        static_assert(std::is_base_of_v<detail::hint_base, std::decay_t<Hint>>, "Hint must extend hint_base");
        static_assert(std::is_move_constructible_v<Hint>, "Hint must be move-constructible");
        for(auto& h : m_hints) {
            // We currently don't allow more than one hint of the same type for simplicity; this could be loosened in the future.
            auto& hr = *h; // Need to do this here to avoid -Wpotentially-evaluated-expression
            if(typeid(hr) == typeid(hint)) { throw std::runtime_error("Providing more than one hint of the same type is not allowed"); }
            h->validate(hint);
        }
        m_hints.emplace_back(std::make_unique<std::decay_t<Hint>>(std::forward<Hint>(hint)));
    }

    template <int Dims>
    range<3> get_constrained_granularity(const range<Dims>& global_size, const range<Dims>& granularity) const {
        range<3> result = detail::range_cast<3>(granularity);
        for(int i = 0; i < Dims; ++i) {
            const auto lcm = std::lcm(granularity[i], m_split_constraint[i]);
            if(lcm == 0) { throw std::runtime_error("Split constraint cannot be 0"); }
            result[i] = lcm;
        }
        if(global_size % detail::range_cast<Dims>(result) != range<Dims>(detail::zeros)) {
            throw std::runtime_error(fmt::format("The{}split constraint {} does not evenly divide the kernel global size {}",
                granularity.size() > 1 ? " effective " : " ", detail::range_cast<Dims>(result), global_size));
        }
        return result;
    }

    void create_host_compute_task(const detail::task_geometry& geometry, detail::host_task_launcher launcher) {
        assert(m_task == nullptr);
        if(geometry.global_size.size() == 0) {
            // TODO this can be easily supported by not creating a task in case the execution range is empty
            throw std::runtime_error{"The execution range of distributed host tasks must have at least one item"};
        }
        m_task =
            detail::task::make_host_compute(m_tid, geometry, std::move(launcher), std::move(m_access_map), std::move(m_side_effects), std::move(m_reductions));

        m_task->set_debug_name(m_usr_def_task_name.value_or(""));
    }

    void create_device_compute_task(const detail::task_geometry& geometry, const std::string& debug_name, detail::device_kernel_launcher launcher) {
        assert(m_task == nullptr);
        if(geometry.global_size.size() == 0) {
            // TODO unless reductions are involved, this can be easily supported by not creating a task in case the execution range is empty.
            // Edge case: If the task includes reductions that specify property::reduction::initialize_to_identity, we need to create a task that sets
            // the buffer state to an empty pending_reduction_state in the graph_generator. This will cause a trivial reduction_command to be generated on
            // each node that reads from the reduction output buffer, initializing it to the identity value locally.
            throw std::runtime_error{"The execution range of device tasks must have at least one item"};
        }
        // Note that cgf_diagnostics has a similar check, but we don't catch void side effects there.
        if(!m_side_effects.empty()) { throw std::runtime_error{"Side effects cannot be used in device kernels"}; }
        m_task = detail::task::make_device_compute(m_tid, geometry, std::move(launcher), std::move(m_access_map), std::move(m_reductions));

        m_task->set_debug_name(m_usr_def_task_name.value_or(debug_name));
    }

    void create_collective_task(const detail::collective_group_id cgid, detail::host_task_launcher launcher) {
        assert(m_task == nullptr);
        m_task = detail::task::make_collective(m_tid, cgid, m_num_collective_nodes, std::move(launcher), std::move(m_access_map), std::move(m_side_effects));

        m_task->set_debug_name(m_usr_def_task_name.value_or(""));
    }

    void create_master_node_task(detail::host_task_launcher launcher) {
        assert(m_task == nullptr);
        m_task = detail::task::make_master_node(m_tid, std::move(launcher), std::move(m_access_map), std::move(m_side_effects));

        m_task->set_debug_name(m_usr_def_task_name.value_or(""));
    }

    template <typename KernelFlavor, typename KernelName, int Dims, typename Kernel, size_t... ReductionIndices, typename... Reductions>
    detail::device_kernel_launcher make_device_kernel_launcher(const range<Dims>& global_range, const id<Dims>& global_offset,
        typename detail::kernel_flavor_traits<KernelFlavor, Dims>::local_size_type local_range, Kernel&& kernel,
        std::index_sequence<ReductionIndices...> /* indices */, Reductions... reductions) {
        static_assert(std::is_copy_constructible_v<std::decay_t<Kernel>>, "Kernel functor must be copyable"); // Required for hydration

        // Check whether all accessors are being captured by value etc.
        // Although the diagnostics should always be available, we currently disable them for some test cases.
        if(detail::cgf_diagnostics::is_available()) { detail::cgf_diagnostics::get_instance().check<target::device>(kernel, m_access_map); }

        return [=](sycl::handler& sycl_cgh, const detail::box<3>& execution_range, const std::vector<void*>& reduction_ptrs) {
            constexpr int sycl_dims = std::max(1, Dims);
            if constexpr(std::is_same_v<KernelFlavor, detail::simple_kernel_flavor>) {
                const auto sycl_global_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(execution_range.get_range()));
                detail::invoke_sycl_parallel_for<KernelName>(sycl_cgh, sycl_global_range,
                    detail::make_sycl_reduction(reductions, reduction_ptrs[ReductionIndices])...,
                    detail::bind_simple_kernel(kernel, global_range, global_offset, detail::id_cast<Dims>(execution_range.get_offset())));
            } else if constexpr(std::is_same_v<KernelFlavor, detail::nd_range_kernel_flavor>) {
                const auto sycl_global_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(execution_range.get_range()));
                const auto sycl_local_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(local_range));
                detail::invoke_sycl_parallel_for<KernelName>(sycl_cgh, cl::sycl::nd_range{sycl_global_range, sycl_local_range},
                    detail::make_sycl_reduction(reductions, reduction_ptrs[ReductionIndices])...,
                    detail::bind_nd_range_kernel(kernel, global_range, global_offset, detail::id_cast<Dims>(execution_range.get_offset()),
                        global_range / local_range, detail::id_cast<Dims>(execution_range.get_offset()) / local_range));
            } else {
                static_assert(detail::constexpr_false<KernelFlavor>);
            }
        };
    }

544
        template <int Dims, bool Collective, typename Kernel>
545
        detail::host_task_launcher make_host_task_launcher(const range<3>& global_range, const detail::collective_group_id cgid, Kernel&& kernel) {
3,453✔
546
                static_assert(Collective || std::is_invocable_v<Kernel> || std::is_invocable_v<Kernel, const partition<Dims>>,
547
                    "Kernel for host task must be invocable with either no arguments or a celerity::partition<Dims>");
548
                static_assert(!Collective || std::is_invocable_v<Kernel> || std::is_invocable_v<Kernel, const experimental::collective_partition>,
549
                    "Kernel for collective host task must be invocable with either no arguments or a celerity::experimental::collective_partition");
550
                static_assert(std::is_copy_constructible_v<std::decay_t<Kernel>>, "Kernel functor must be copyable"); // Required for hydration
551
                static_assert(Dims >= 0);
552

553
                // Check whether all accessors are being captured by value etc.
554
                // Although the diagnostics should always be available, we currently disable them for some test cases.
555
                if(detail::cgf_diagnostics::is_available()) {
3,453✔
556
                        detail::cgf_diagnostics::get_instance().check<target::host_task>(kernel, m_access_map, m_non_void_side_effects_count);
3,203✔
557
                }
558

559
                return [kernel, global_range](const detail::box<3>& execution_range, const detail::communicator* collective_comm) {
11,878✔
560
                        (void)global_range;
561
                        (void)collective_comm;
562
                        if constexpr(Dims > 0) {
563
                                if constexpr(Collective) {
564
                                        static_assert(Dims == 1);
565
                                        assert(collective_comm != nullptr);
26✔
566
                                        const auto part =
52✔
567
                                            detail::make_collective_partition(detail::range_cast<1>(global_range), detail::box_cast<1>(execution_range), *collective_comm);
52✔
568
                                        kernel(part);
26✔
569
                                } else {
570
                                        const auto part = detail::make_partition<Dims>(detail::range_cast<Dims>(global_range), detail::box_cast<Dims>(execution_range));
14✔
571
                                        kernel(part);
14✔
572
                                }
573
                        } else if constexpr(std::is_invocable_v<Kernel, const partition<0>&>) {
574
                                (void)execution_range;
575
                                const auto part = detail::make_partition<0>(range<0>(), subrange<0>());
3✔
576
                                kernel(part);
3✔
577
                        } else {
578
                                (void)execution_range;
579
                                kernel();
3,110✔
580
                        }
581
                };
9,067✔
582
        }
583

584
        std::unique_ptr<detail::task> into_task() && {
4,442✔
585
                assert(m_task != nullptr);
4,442✔
586
                for(auto& h : m_hints) {
4,512✔
587
                        m_task->add_hint(std::move(h));
70✔
588
                }
589
                return std::move(m_task);
4,442✔
590
        }
591
};
592

593
namespace detail {
594

595
        inline handler make_command_group_handler(const detail::task_id tid, const size_t num_collective_nodes) { return handler(tid, num_collective_nodes); }
4,462✔
596

597
        inline std::unique_ptr<detail::task> into_task(handler&& cgh) { return std::move(cgh).into_task(); }
4,442✔
598

599
        [[nodiscard]] inline hydration_id add_requirement(handler& cgh, const buffer_id bid, std::unique_ptr<range_mapper_base> rm) {
3,595✔
600
                return cgh.add_requirement(bid, std::move(rm));
7,190✔
601
        }
602

603
        inline void add_requirement(handler& cgh, const host_object_id hoid, const experimental::side_effect_order order, const bool is_void) {
73✔
604
                return cgh.add_requirement(hoid, order, is_void);
73✔
605
        }
606

607
        inline void add_reduction(handler& cgh, const detail::reduction_info& rinfo) { return cgh.add_reduction(rinfo); }
121✔
608

609
        inline void set_task_name(handler& cgh, const std::string& debug_name) { cgh.m_usr_def_task_name = {debug_name}; }
295✔
610

611
        // TODO: The _impl functions in detail only exist during the grace period for deprecated reductions on const buffers; move outside again afterwards.
612
        template <typename DataT, int Dims, typename BinaryOperation>
613
        auto reduction_impl(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) {
48✔
614
                static_assert(cl::sycl::has_known_identity_v<BinaryOperation, DataT>,
615
                    "Celerity does not currently support reductions without an identity. Either specialize "
616
                    "cl::sycl::known_identity or use the reduction() overload taking an identity at runtime");
617
                return detail::make_reduction<false>(vars, cgh, combiner, cl::sycl::known_identity_v<BinaryOperation, DataT>, prop_list);
48✔
618
        }
619

620
        template <typename DataT, int Dims, typename BinaryOperation>
621
        auto reduction_impl(
6✔
622
            const buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) {
623
                static_assert(!cl::sycl::has_known_identity_v<BinaryOperation, DataT>, "Identity is known to SYCL, remove the identity parameter from reduction()");
624
                return detail::make_reduction<true>(vars, cgh, combiner, identity, prop_list);
6✔
625
        }
626

627
} // namespace detail
628

629
template <typename DataT, int Dims, typename BinaryOperation>
630
auto reduction(buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) {
47✔
631
        return detail::reduction_impl(vars, cgh, combiner, prop_list);
47✔
632
}
633

634
template <typename DataT, int Dims, typename BinaryOperation>
635
auto reduction(buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) {
6✔
636
        return detail::reduction_impl(vars, cgh, identity, combiner, prop_list);
6✔
637
}
638

639
template <typename DataT, int Dims, typename BinaryOperation>
640
[[deprecated("Creating reduction from const buffer is deprecated, capture buffer by reference instead")]] auto reduction(
1✔
641
    const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) {
642
        return detail::reduction_impl(vars, cgh, combiner, prop_list);
1✔
643
}
644

645
template <typename DataT, int Dims, typename BinaryOperation>
646
[[deprecated("Creating reduction from const buffer is deprecated, capture buffer by reference instead")]] auto reduction(
647
    const buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) {
648
        return detail::reduction_impl(vars, cgh, identity, combiner, prop_list);
649
}
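
// Example (minimal sketch) of declaring a reduction; the buffer, queue submission and kernel body are illustrative.
// `sycl::plus<float>` has a known identity, so the identity-taking overload is not needed here:
//
//   celerity::buffer<float, 1> sum_buf{{1}}; // reductions target unit-sized buffers
//   queue.submit([&](celerity::handler& cgh) {
//       auto sum = celerity::reduction(sum_buf, cgh, sycl::plus<float>{},
//           {celerity::property::reduction::initialize_to_identity{}});
//       cgh.parallel_for<class sum_kernel>(celerity::range<1>{n}, sum,
//           [=](celerity::item<1> it, auto& s) { s += 1.0f; });
//   });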

} // namespace celerity

namespace celerity::experimental {
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint) {
    cgh.experimental_constrain_split(constraint);
}

template <typename Hint>
void hint(handler& cgh, Hint&& hint) {
    cgh.experimental_hint(std::forward<Hint>(hint));
}
} // namespace celerity::experimental