
celerity / celerity-runtime / build 12787716791

15 Jan 2025 11:51AM UTC, coverage: 95.128% (+0.07%) from 95.057%

Pull Request #318: SimSYCL is now thread-safe; remove workarounds
Merge eeff7935c into 277403ad7 (via github / web-flow)

3185 of 3610 branches covered (88.23%); branch coverage is included in the aggregate %.
4 of 4 new or added lines in 1 file covered (100.0%)
2 existing lines in 1 file are now uncovered.
7085 of 7186 relevant lines covered (98.59%)
1913655.04 hits per line
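Since branch coverage is folded into the aggregate figure, the 95.128% can be reproduced by pooling covered lines and branches over their combined totals. The following is only a sanity-check sketch; the pooling formula is an assumption about how Coveralls aggregates, not something stated on this page:

#include <cstdio>

int main() {
	// Figures as reported for this build.
	const double covered_lines = 7085, relevant_lines = 7186;       // 98.59% line coverage
	const double covered_branches = 3185, relevant_branches = 3610; // 88.23% branch coverage
	// Pooling lines and branches reproduces the reported aggregate of ~95.128%.
	const double aggregate = 100.0 * (covered_lines + covered_branches) / (relevant_lines + relevant_branches);
	std::printf("aggregate coverage: %.3f%%\n", aggregate); // prints 95.128%
}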

Source File

/src/backend/sycl_backend.cc (89.86% covered)
#include "backend/sycl_backend.h"

#include "async_event.h"
#include "backend/backend.h"
#include "cgf.h"
#include "closure_hydrator.h"
#include "dense_map.h"
#include "grid.h"
#include "named_threads.h"
#include "nd_memory.h"
#include "system_info.h"
#include "thread_queue.h"
#include "tracy.h"
#include "types.h"
#include "utils.h"
#include "workaround.h"

#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstring>
#include <exception>
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include <fmt/format.h>
#include <fmt/ranges.h>
#include <sycl/sycl.hpp>


namespace celerity::detail::sycl_backend_detail {

bool sycl_event::is_complete() { return m_last.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete; }

std::optional<std::chrono::nanoseconds> sycl_event::get_native_execution_time() {
	if(!m_first.has_value()) return std::nullopt; // avoid the cost of throwing + catching a sycl exception when profiling is disabled
	return std::chrono::nanoseconds(m_last.get_profiling_info<sycl::info::event_profiling::command_end>() //
	                                - m_first->get_profiling_info<sycl::info::event_profiling::command_start>());
}

void delayed_async_event::state::set_value(async_event event) {
	m_event = std::move(event);
	[[maybe_unused]] const bool previously_ready = m_is_ready.exchange(true, std::memory_order_release);
	assert(!previously_ready && "delayed_async_event::state::set_value() called more than once");
}

bool delayed_async_event::is_complete() {
	if(!m_state->m_is_ready.load(std::memory_order_acquire)) return false;
	return m_state->m_event.is_complete();
}

void* delayed_async_event::get_result() {
	assert(m_state->m_is_ready.load(std::memory_order_acquire));
	return m_state->m_event.get_result();
}

std::optional<std::chrono::nanoseconds> delayed_async_event::get_native_execution_time() {
	assert(m_state->m_is_ready.load(std::memory_order_acquire));
	return m_state->m_event.get_native_execution_time();
}

void flush(sycl::queue& queue) {
#if CELERITY_WORKAROUND(ACPP)
	// AdaptiveCpp does not guarantee that command groups are actually scheduled until an explicit await operation, which we cannot insert without
	// blocking the executor loop (see https://github.com/AdaptiveCpp/AdaptiveCpp/issues/599). Instead, we explicitly flush the queue to be able to continue
	// using our polling-based approach.
	queue.get_context().AdaptiveCpp_runtime()->dag().flush_async();
#else
	(void)queue;
#endif
}

// LCOV_EXCL_START
void report_errors(const sycl::exception_list& errors) {
	if(errors.size() == 0) return;

	std::vector<std::string> what;
	for(const auto& e : errors) {
		try {
			std::rethrow_exception(e);
		} catch(sycl::exception& e) { //
			what.push_back(e.what());
		} catch(std::exception& e) { //
			what.push_back(e.what());
		} catch(...) { //
			what.push_back("unknown exception");
		}
	}

	// Errors usually manifest on calls to sycl::event::get_info(), not their actual origin, and therefore will contain many duplicates
	std::sort(what.begin(), what.end());
	what.erase(std::unique(what.begin(), what.end()), what.end());

	utils::panic("asynchronous SYCL errors:\n\t{}", fmt::join(what, "\n\t"));
}
// LCOV_EXCL_STOP

} // namespace celerity::detail::sycl_backend_detail

namespace celerity::detail {

struct sycl_backend::impl {
	struct device_state {
		sycl::device sycl_device;
		sycl::context sycl_context;
		std::vector<sycl::queue> queues;
		std::optional<detail::thread_queue> submission_thread;
		std::atomic_flag active_async_error_check = false;

		device_state() = default;
		explicit device_state(const sycl::device& dev) : sycl_device(dev), sycl_context(sycl_device) {}
	};

	struct host_state {
		sycl::context sycl_context;
		thread_queue alloc_queue;
		std::vector<thread_queue> queues; // TODO naming vs alloc_queue?

		// pass devices to ensure the sycl_context receives the correct platform
		explicit host_state(const std::vector<sycl::device>& all_devices, bool enable_profiling)
		    // DPC++ requires exactly one CUDA device here, but for allocation the sycl_context mostly means "platform".
		    // - TODO assert that all devices belong to the same platform + backend here
		    // - TODO test Celerity on a (SimSYCL) system without GPUs
		    : sycl_context(all_devices.at(0)), //
		      alloc_queue(named_threads::thread_type::alloc, enable_profiling) {}
	};

	system_info system;
	dense_map<device_id, device_state> devices; // thread-safe for read access (not resized after construction)
	host_state host;
	using configuration = sycl_backend::configuration;
	configuration config;

	impl(const std::vector<sycl::device>& devices, const configuration& config)
	    : devices(devices.begin(), devices.end()), host(devices, config.profiling), config(config) //
	{
		// For now, we assume distinct memories per device. TODO some targets (e.g. OpenMP-emulated devices) might deviate from that.
		system.devices.resize(devices.size());
		system.memories.resize(2 + devices.size()); // user + host + device memories
		system.memories[user_memory_id].copy_peers.set(user_memory_id);
		system.memories[host_memory_id].copy_peers.set(host_memory_id);
		system.memories[host_memory_id].copy_peers.set(user_memory_id);
		system.memories[user_memory_id].copy_peers.set(host_memory_id);
		for(device_id did = 0; did < devices.size(); ++did) {
			const memory_id mid = first_device_memory_id + did;
			system.devices[did].native_memory = mid;
			system.memories[mid].copy_peers.set(mid);
			system.memories[mid].copy_peers.set(host_memory_id);
			system.memories[host_memory_id].copy_peers.set(mid);
			// device-to-device copy capabilities are added in the cuda_backend constructor
		}
	}

	thread_queue& get_host_queue(const size_t lane) {
		assert(lane <= host.queues.size());
		if(lane == host.queues.size()) { host.queues.emplace_back(named_threads::task_type_host_queue(lane), config.profiling); }
		return host.queues[lane];
	}

	sycl::queue& get_device_queue(const device_id did, const size_t lane) {
		auto& device = devices[did];
		assert(lane <= device.queues.size());
		if(lane == device.queues.size()) {
			const auto properties = config.profiling ? sycl::property_list{sycl::property::queue::enable_profiling{}, sycl::property::queue::in_order{}}
			                                         : sycl::property_list{sycl::property::queue::in_order{}};
			device.queues.emplace_back(device.sycl_device, sycl::async_handler(sycl_backend_detail::report_errors), properties);
		}
		return device.queues[lane];
	}
};

sycl_backend::sycl_backend(const std::vector<sycl::device>& devices, const configuration& config) : m_impl(new impl(devices, config)) {
	// Initialize a submission thread with hydrator for each device, if they are enabled
	if(m_impl->config.per_device_submission_threads) {
		for(device_id did = 0; did < m_impl->system.devices.size(); ++did) {
			m_impl->devices[did].submission_thread.emplace(named_threads::task_type_device_submitter(did.value), m_impl->config.profiling);
			// no need to wait for the event -> will happen before the first task is submitted
			(void)m_impl->devices[did].submission_thread->submit([did] { closure_hydrator::make_available(); });
		}
	}
}

sycl_backend::~sycl_backend() {
	// If we are using submission threads, tear down their hydrators before they are destroyed
	if(m_impl->config.per_device_submission_threads) {
		for(auto& device : m_impl->devices) {
			// no need to wait for the event -> destruction will wait for the submission thread to finish
			(void)device.submission_thread->submit([] { closure_hydrator::teardown(); });
		}
	}
}

const system_info& sycl_backend::get_system_info() const { return m_impl->system; }

void sycl_backend::init() {
	CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::init", sycl_init);

	// Instantiate the first in-order queue on each device. At least for CUDA systems this will perform device initialization, which can take > 100 ms / device.
	for(device_id did = 0; did < m_impl->system.devices.size(); ++did) {
		(void)m_impl->get_device_queue(did, 0 /* lane */);
	}
}

void* sycl_backend::debug_alloc(const size_t size) {
	const auto ptr = sycl::malloc_host(size, m_impl->host.sycl_context);
#if CELERITY_DETAIL_ENABLE_DEBUG
	memset(ptr, static_cast<int>(sycl_backend_detail::uninitialized_memory_pattern), size);
#endif
	return ptr;
}

void sycl_backend::debug_free(void* const ptr) { sycl::free(ptr, m_impl->host.sycl_context); }

async_event sycl_backend::enqueue_host_alloc(const size_t size, const size_t alignment) {
	return m_impl->host.alloc_queue.submit([this, size, alignment] {
		const auto ptr = sycl::aligned_alloc_host(alignment, size, m_impl->host.sycl_context);
#if CELERITY_DETAIL_ENABLE_DEBUG
		memset(ptr, static_cast<int>(sycl_backend_detail::uninitialized_memory_pattern), size);
#endif
		return ptr;
	});
}

async_event sycl_backend::enqueue_device_alloc(const device_id device, const size_t size, const size_t alignment) {
	return m_impl->host.alloc_queue.submit([this, device, size, alignment] {
		auto& d = m_impl->devices[device];
		const auto ptr = sycl::aligned_alloc_device(alignment, size, d.sycl_device, d.sycl_context);
#if CELERITY_DETAIL_ENABLE_DEBUG
		sycl::queue(d.sycl_context, d.sycl_device, sycl::async_handler(sycl_backend_detail::report_errors), sycl::property::queue::in_order{})
		    .fill(ptr, sycl_backend_detail::uninitialized_memory_pattern, size)
		    .wait_and_throw();
#endif
		return ptr;
	});
}

async_event sycl_backend::enqueue_host_free(void* const ptr) {
	return m_impl->host.alloc_queue.submit([this, ptr] { sycl::free(ptr, m_impl->host.sycl_context); });
}

async_event sycl_backend::enqueue_device_free(const device_id device, void* const ptr) {
	return m_impl->host.alloc_queue.submit([this, device, ptr] { sycl::free(ptr, m_impl->devices[device].sycl_context); });
}

async_event sycl_backend::enqueue_host_task(size_t host_lane, const host_task_launcher& launcher, std::vector<closure_hydrator::accessor_info> accessor_infos,
    const range<3>& global_range, const box<3>& execution_range, const communicator* collective_comm) //
{
	auto& hydrator = closure_hydrator::get_instance();
	hydrator.arm(target::host_task, std::move(accessor_infos));
	auto launch_hydrated = hydrator.hydrate<target::host_task>(launcher);
	return m_impl->get_host_queue(host_lane).submit(
	    [=, launch_hydrated = std::move(launch_hydrated)] { launch_hydrated(global_range, execution_range, collective_comm); });
}

async_event sycl_backend::enqueue_device_kernel(const device_id device, const size_t lane, const device_kernel_launcher& launch,
    std::vector<closure_hydrator::accessor_info> accessor_infos, const box<3>& execution_range, const std::vector<void*>& reduction_ptrs) //
{
	return enqueue_device_work(device, lane, [=, this, acc_infos = std::move(accessor_infos)](sycl::queue& queue) mutable {
		CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
		auto event = queue.submit([&](sycl::handler& sycl_cgh) {
			auto& hydrator = closure_hydrator::get_instance();
			hydrator.arm(target::device, std::move(acc_infos));
			const auto launch_hydrated = hydrator.hydrate<target::device>(sycl_cgh, launch);
			launch_hydrated(sycl_cgh, execution_range, reduction_ptrs);
		});
		sycl_backend_detail::flush(queue);
		return make_async_event<sycl_backend_detail::sycl_event>(std::move(event), m_impl->config.profiling);
	});
}

async_event sycl_backend::enqueue_host_copy(size_t host_lane, const void* const source_base, void* const dest_base, const region_layout& source_layout,
    const region_layout& dest_layout, const region<3>& copy_region, const size_t elem_size) //
{
	return m_impl->get_host_queue(host_lane).submit([=] { nd_copy_host(source_base, dest_base, source_layout, dest_layout, copy_region, elem_size); });
}

void sycl_backend::check_async_errors() {
	for(size_t i = 0; i < m_impl->devices.size(); ++i) {
		auto& device = m_impl->devices[i];
		if(m_impl->config.per_device_submission_threads) {
			// Prevent multiple error checks from being enqueued at the same time
			if(!device.active_async_error_check.test_and_set()) {
				(void)device.submission_thread->submit([&]() {
					for(auto& queue : device.queues) {
						queue.throw_asynchronous();
					}
					device.active_async_error_check.clear();
				});
			}
		} else {
			for(auto& queue : device.queues) {
				queue.throw_asynchronous();
			}
		}
	}
}

system_info& sycl_backend::get_system_info() { return m_impl->system; }

async_event celerity::detail::sycl_backend::enqueue_device_work(
    const device_id device, const size_t lane, const std::function<async_event(sycl::queue&)>& work) {
	// Basic case: no per-device submission threads
	if(!m_impl->config.per_device_submission_threads) { return work(m_impl->get_device_queue(device, lane)); }

	auto& device_state = m_impl->devices[device];
	auto& submission_thread = device_state.submission_thread;
	assert(submission_thread.has_value());

	// Note: this mechanism is quite similar in principle to a std::future/promise,
	//       but implementing it with that caused a 50% (!) slowdown in system-level benchmarks
	const auto async_event_state = std::make_shared<sycl_backend_detail::delayed_async_event::state>();
	auto async_event = make_async_event<sycl_backend_detail::delayed_async_event>(async_event_state);

	(void)submission_thread->submit([this, device, lane, work, async_event_state] {
		auto event = work(m_impl->get_device_queue(device, lane));
		async_event_state->set_value(std::move(event));
	});
	return async_event;
}

bool sycl_backend::is_profiling_enabled() const { return m_impl->config.profiling; }

std::vector<sycl_backend_type> sycl_backend_enumerator::compatible_backends(const sycl::device& device) const {
	std::vector<backend_type> backends{backend_type::generic};
#if CELERITY_WORKAROUND(ACPP) && defined(SYCL_EXT_HIPSYCL_BACKEND_CUDA)
	if(device.get_backend() == sycl::backend::cuda) { backends.push_back(sycl_backend_type::cuda); }
#elif CELERITY_WORKAROUND(DPCPP)
	if(device.get_backend() == sycl::backend::ext_oneapi_cuda) { backends.push_back(sycl_backend_type::cuda); }
#endif
	assert(std::is_sorted(backends.begin(), backends.end()));
	return backends;
}

std::vector<sycl_backend_type> sycl_backend_enumerator::available_backends() const {
	std::vector<backend_type> backends{backend_type::generic};
#if CELERITY_DETAIL_BACKEND_CUDA_ENABLED
	backends.push_back(sycl_backend_type::cuda);
#endif
	assert(std::is_sorted(backends.begin(), backends.end()));
	return backends;
}

bool sycl_backend_enumerator::is_specialized(backend_type type) const {
	switch(type) {
	case backend_type::generic: return false;
	case backend_type::cuda: return true;
	default: utils::unreachable(); // LCOV_EXCL_LINE
	}
}

int sycl_backend_enumerator::get_priority(backend_type type) const {
	switch(type) {
	case backend_type::generic: return 0;
	case backend_type::cuda: return 1;
	default: utils::unreachable(); // LCOV_EXCL_LINE
	}
}

} // namespace celerity::detail

namespace celerity::detail {

std::unique_ptr<backend> make_sycl_backend(const sycl_backend_type type, const std::vector<sycl::device>& devices, const sycl_backend::configuration& config) {
	assert(std::all_of(
	    devices.begin(), devices.end(), [=](const sycl::device& d) { return utils::contains(sycl_backend_enumerator{}.compatible_backends(d), type); }));

	switch(type) {
	case sycl_backend_type::generic: //
		return std::make_unique<sycl_generic_backend>(devices, config);

	case sycl_backend_type::cuda:
#if CELERITY_DETAIL_BACKEND_CUDA_ENABLED
		return std::make_unique<sycl_cuda_backend>(devices, config);
#else
		utils::panic("CUDA backend has not been compiled");
#endif
	}
	utils::unreachable(); // LCOV_EXCL_LINE
}

} // namespace celerity::detail
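
For readers following the enqueue_device_work path above: the release/acquire handshake behind delayed_async_event can be reduced to the standalone sketch below. Names and the delayed_slot type are hypothetical, it uses only the standard library, and it is an illustration of the pattern rather than part of the file under test:

#include <atomic>
#include <cassert>
#include <utility>

// Minimal sketch of a "delayed result" slot: a producer thread publishes a value
// exactly once, while a consumer thread polls for it without blocking.
template <typename T>
class delayed_slot {
  public:
	void publish(T value) {
		m_value = std::move(value); // write the payload first...
		// ...then make it visible; the release ordering pairs with the acquire load in poll()
		[[maybe_unused]] const bool was_ready = m_ready.exchange(true, std::memory_order_release);
		assert(!was_ready && "publish() called more than once");
	}

	// Returns nullptr until publish() has completed; once non-null, the payload write is visible.
	const T* poll() const {
		if(!m_ready.load(std::memory_order_acquire)) return nullptr;
		return &m_value;
	}

  private:
	T m_value{};
	std::atomic<bool> m_ready{false};
};

The release exchange in publish() pairs with the acquire load in poll(), mirroring set_value() and is_complete()/get_result() in the listing, which is presumably what lets the executor's polling loop stay lock-free instead of paying for std::future/promise machinery.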