
celerity / celerity-runtime / build 12787716791

15 Jan 2025 11:51AM UTC, coverage: 95.128% (+0.07%) from 95.057%

Pull Request #318: SimSYCL is now thread-safe; remove workarounds
Merge eeff7935c into 277403ad7 (via github / web-flow)

3185 of 3610 branches covered (88.23%); branch coverage is included in the aggregate %.
4 of 4 new or added lines in 1 file covered (100.0%)
2 existing lines in 1 file are now uncovered.
7085 of 7186 relevant lines covered (98.59%)
1913655.04 hits per line
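Since branch coverage is folded into the aggregate figure, the 95.128% can be reproduced by pooling covered lines and branches over their combined totals. The following is only a sanity-check sketch; the pooling formula is an assumption about how Coveralls aggregates, not something stated on this page:

#include <cstdio>

int main() {
	// Figures as reported for this build.
	const double covered_lines = 7085, relevant_lines = 7186;       // 98.59% line coverage
	const double covered_branches = 3185, relevant_branches = 3610; // 88.23% branch coverage
	// Pooling lines and branches reproduces the reported aggregate of ~95.128%.
	const double aggregate = 100.0 * (covered_lines + covered_branches) / (relevant_lines + relevant_branches);
	std::printf("aggregate coverage: %.3f%%\n", aggregate); // prints 95.128%
}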

Source File

/src/backend/sycl_backend.cc (89.86% covered)
#include "backend/sycl_backend.h"

#include "async_event.h"
#include "backend/backend.h"
#include "cgf.h"
#include "closure_hydrator.h"
#include "dense_map.h"
#include "grid.h"
#include "named_threads.h"
#include "nd_memory.h"
#include "system_info.h"
#include "thread_queue.h"
#include "tracy.h"
#include "types.h"
#include "utils.h"
#include "workaround.h"

#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstring>
#include <exception>
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include <fmt/format.h>
#include <fmt/ranges.h>
#include <sycl/sycl.hpp>


namespace celerity::detail::sycl_backend_detail {

bool sycl_event::is_complete() { return m_last.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete; }

std::optional<std::chrono::nanoseconds> sycl_event::get_native_execution_time() {
	if(!m_first.has_value()) return std::nullopt; // avoid the cost of throwing + catching a sycl exception when profiling is disabled
	return std::chrono::nanoseconds(m_last.get_profiling_info<sycl::info::event_profiling::command_end>() //
	                                - m_first->get_profiling_info<sycl::info::event_profiling::command_start>());
}

void delayed_async_event::state::set_value(async_event event) {
	m_event = std::move(event);
	[[maybe_unused]] const bool previously_ready = m_is_ready.exchange(true, std::memory_order_release);
	assert(!previously_ready && "delayed_async_event::state::set_value() called more than once");
}

bool delayed_async_event::is_complete() {
	if(!m_state->m_is_ready.load(std::memory_order_acquire)) return false;
	return m_state->m_event.is_complete();
}

void* delayed_async_event::get_result() {
	assert(m_state->m_is_ready.load(std::memory_order_acquire));
	return m_state->m_event.get_result();
}

std::optional<std::chrono::nanoseconds> delayed_async_event::get_native_execution_time() {
	assert(m_state->m_is_ready.load(std::memory_order_acquire));
	return m_state->m_event.get_native_execution_time();
}

void flush(sycl::queue& queue) {
#if CELERITY_WORKAROUND(ACPP)
	// AdaptiveCpp does not guarantee that command groups are actually scheduled until an explicit await operation, which we cannot insert without
	// blocking the executor loop (see https://github.com/AdaptiveCpp/AdaptiveCpp/issues/599). Instead, we explicitly flush the queue to be able to continue
	// using our polling-based approach.
	queue.get_context().AdaptiveCpp_runtime()->dag().flush_async();
#else
	(void)queue;
#endif
}

// LCOV_EXCL_START
void report_errors(const sycl::exception_list& errors) {
	if(errors.size() == 0) return;

	std::vector<std::string> what;
	for(const auto& e : errors) {
		try {
			std::rethrow_exception(e);
		} catch(sycl::exception& e) { //
			what.push_back(e.what());
		} catch(std::exception& e) { //
			what.push_back(e.what());
		} catch(...) { //
			what.push_back("unknown exception");
		}
	}

	// Errors usually manifest on calls to sycl::event::get_info(), not their actual origin, and therefore will contain many duplicates
	std::sort(what.begin(), what.end());
	what.erase(std::unique(what.begin(), what.end()), what.end());

	utils::panic("asynchronous SYCL errors:\n\t{}", fmt::join(what, "\n\t"));
}
// LCOV_EXCL_STOP

} // namespace celerity::detail::sycl_backend_detail

namespace celerity::detail {

struct sycl_backend::impl {
	struct device_state {
		sycl::device sycl_device;
		sycl::context sycl_context;
		std::vector<sycl::queue> queues;
		std::optional<detail::thread_queue> submission_thread;
		std::atomic_flag active_async_error_check = false;

		device_state() = default;
		explicit device_state(const sycl::device& dev) : sycl_device(dev), sycl_context(sycl_device) {}
	};

	struct host_state {
		sycl::context sycl_context;
		thread_queue alloc_queue;
		std::vector<thread_queue> queues; // TODO naming vs alloc_queue?

		// pass devices to ensure the sycl_context receives the correct platform
		explicit host_state(const std::vector<sycl::device>& all_devices, bool enable_profiling)
		    // DPC++ requires exactly one CUDA device here, but for allocation the sycl_context mostly means "platform".
		    // - TODO assert that all devices belong to the same platform + backend here
		    // - TODO test Celerity on a (SimSYCL) system without GPUs
		    : sycl_context(all_devices.at(0)), //
		      alloc_queue(named_threads::thread_type::alloc, enable_profiling) {}
	};

	system_info system;
	dense_map<device_id, device_state> devices; // thread-safe for read access (not resized after construction)
	host_state host;
	using configuration = sycl_backend::configuration;
	configuration config;

	impl(const std::vector<sycl::device>& devices, const configuration& config)
	    : devices(devices.begin(), devices.end()), host(devices, config.profiling), config(config) //
	{
		// For now, we assume distinct memories per device. TODO some targets (e.g. OpenMP-emulated devices) might deviate from that.
		system.devices.resize(devices.size());
		system.memories.resize(2 + devices.size()); // user + host + device memories
		system.memories[user_memory_id].copy_peers.set(user_memory_id);
		system.memories[host_memory_id].copy_peers.set(host_memory_id);
		system.memories[host_memory_id].copy_peers.set(user_memory_id);
		system.memories[user_memory_id].copy_peers.set(host_memory_id);
		for(device_id did = 0; did < devices.size(); ++did) {
			const memory_id mid = first_device_memory_id + did;
			system.devices[did].native_memory = mid;
			system.memories[mid].copy_peers.set(mid);
			system.memories[mid].copy_peers.set(host_memory_id);
			system.memories[host_memory_id].copy_peers.set(mid);
			// device-to-device copy capabilities are added in the cuda_backend constructor
		}
	}

	thread_queue& get_host_queue(const size_t lane) {
		assert(lane <= host.queues.size());
		if(lane == host.queues.size()) { host.queues.emplace_back(named_threads::task_type_host_queue(lane), config.profiling); }
		return host.queues[lane];
	}

	sycl::queue& get_device_queue(const device_id did, const size_t lane) {
		auto& device = devices[did];
		assert(lane <= device.queues.size());
		if(lane == device.queues.size()) {
			const auto properties = config.profiling ? sycl::property_list{sycl::property::queue::enable_profiling{}, sycl::property::queue::in_order{}}
			                                         : sycl::property_list{sycl::property::queue::in_order{}};
			device.queues.emplace_back(device.sycl_device, sycl::async_handler(sycl_backend_detail::report_errors), properties);
		}
		return device.queues[lane];
	}
};

sycl_backend::sycl_backend(const std::vector<sycl::device>& devices, const configuration& config) : m_impl(new impl(devices, config)) {
	// Initialize a submission thread with hydrator for each device, if they are enabled
	if(m_impl->config.per_device_submission_threads) {
		for(device_id did = 0; did < m_impl->system.devices.size(); ++did) {
			m_impl->devices[did].submission_thread.emplace(named_threads::task_type_device_submitter(did.value), m_impl->config.profiling);
			// no need to wait for the event -> will happen before the first task is submitted
			(void)m_impl->devices[did].submission_thread->submit([did] { closure_hydrator::make_available(); });
		}
	}
}

sycl_backend::~sycl_backend() {
	// If we are using submission threads, tear down their hydrators before they are destroyed
	if(m_impl->config.per_device_submission_threads) {
		for(auto& device : m_impl->devices) {
			// no need to wait for the event -> destruction will wait for the submission thread to finish
			(void)device.submission_thread->submit([] { closure_hydrator::teardown(); });
		}
	}
}

const system_info& sycl_backend::get_system_info() const { return m_impl->system; }

void sycl_backend::init() {
	CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::init", sycl_init);

	// Instantiate the first in-order queue on each device. At least for CUDA systems this will perform device initialization, which can take > 100 ms / device.
	for(device_id did = 0; did < m_impl->system.devices.size(); ++did) {
		(void)m_impl->get_device_queue(did, 0 /* lane */);
	}
}

void* sycl_backend::debug_alloc(const size_t size) {
	const auto ptr = sycl::malloc_host(size, m_impl->host.sycl_context);
#if CELERITY_DETAIL_ENABLE_DEBUG
	memset(ptr, static_cast<int>(sycl_backend_detail::uninitialized_memory_pattern), size);
#endif
	return ptr;
}

void sycl_backend::debug_free(void* const ptr) { sycl::free(ptr, m_impl->host.sycl_context); }

async_event sycl_backend::enqueue_host_alloc(const size_t size, const size_t alignment) {
	return m_impl->host.alloc_queue.submit([this, size, alignment] {
		const auto ptr = sycl::aligned_alloc_host(alignment, size, m_impl->host.sycl_context);
#if CELERITY_DETAIL_ENABLE_DEBUG
		memset(ptr, static_cast<int>(sycl_backend_detail::uninitialized_memory_pattern), size);
#endif
		return ptr;
	});
}

async_event sycl_backend::enqueue_device_alloc(const device_id device, const size_t size, const size_t alignment) {
	return m_impl->host.alloc_queue.submit([this, device, size, alignment] {
		auto& d = m_impl->devices[device];
		const auto ptr = sycl::aligned_alloc_device(alignment, size, d.sycl_device, d.sycl_context);
#if CELERITY_DETAIL_ENABLE_DEBUG
		sycl::queue(d.sycl_context, d.sycl_device, sycl::async_handler(sycl_backend_detail::report_errors), sycl::property::queue::in_order{})
		    .fill(ptr, sycl_backend_detail::uninitialized_memory_pattern, size)
		    .wait_and_throw();
#endif
		return ptr;
	});
}

async_event sycl_backend::enqueue_host_free(void* const ptr) {
	return m_impl->host.alloc_queue.submit([this, ptr] { sycl::free(ptr, m_impl->host.sycl_context); });
}

async_event sycl_backend::enqueue_device_free(const device_id device, void* const ptr) {
	return m_impl->host.alloc_queue.submit([this, device, ptr] { sycl::free(ptr, m_impl->devices[device].sycl_context); });
}

async_event sycl_backend::enqueue_host_task(size_t host_lane, const host_task_launcher& launcher, std::vector<closure_hydrator::accessor_info> accessor_infos,
    const range<3>& global_range, const box<3>& execution_range, const communicator* collective_comm) //
{
	auto& hydrator = closure_hydrator::get_instance();
	hydrator.arm(target::host_task, std::move(accessor_infos));
	auto launch_hydrated = hydrator.hydrate<target::host_task>(launcher);
	return m_impl->get_host_queue(host_lane).submit(
	    [=, launch_hydrated = std::move(launch_hydrated)] { launch_hydrated(global_range, execution_range, collective_comm); });
}

async_event sycl_backend::enqueue_device_kernel(const device_id device, const size_t lane, const device_kernel_launcher& launch,
    std::vector<closure_hydrator::accessor_info> accessor_infos, const box<3>& execution_range, const std::vector<void*>& reduction_ptrs) //
{
	return enqueue_device_work(device, lane, [=, this, acc_infos = std::move(accessor_infos)](sycl::queue& queue) mutable {
		CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
		auto event = queue.submit([&](sycl::handler& sycl_cgh) {
			auto& hydrator = closure_hydrator::get_instance();
			hydrator.arm(target::device, std::move(acc_infos));
			const auto launch_hydrated = hydrator.hydrate<target::device>(sycl_cgh, launch);
			launch_hydrated(sycl_cgh, execution_range, reduction_ptrs);
		});
		sycl_backend_detail::flush(queue);
		return make_async_event<sycl_backend_detail::sycl_event>(std::move(event), m_impl->config.profiling);
	});
}

async_event sycl_backend::enqueue_host_copy(size_t host_lane, const void* const source_base, void* const dest_base, const region_layout& source_layout,
    const region_layout& dest_layout, const region<3>& copy_region, const size_t elem_size) //
{
	return m_impl->get_host_queue(host_lane).submit([=] { nd_copy_host(source_base, dest_base, source_layout, dest_layout, copy_region, elem_size); });
}

void sycl_backend::check_async_errors() {
	for(size_t i = 0; i < m_impl->devices.size(); ++i) {
		auto& device = m_impl->devices[i];
		if(m_impl->config.per_device_submission_threads) {
			// Prevent multiple error checks from being enqueued at the same time
			if(!device.active_async_error_check.test_and_set()) {
				(void)device.submission_thread->submit([&]() {
					for(auto& queue : device.queues) {
						queue.throw_asynchronous();
					}
					device.active_async_error_check.clear();
				});
			}
		} else {
			for(auto& queue : device.queues) {
				queue.throw_asynchronous();
			}
		}
	}
}

system_info& sycl_backend::get_system_info() { return m_impl->system; }

async_event celerity::detail::sycl_backend::enqueue_device_work(
    const device_id device, const size_t lane, const std::function<async_event(sycl::queue&)>& work) {
	// Basic case: no per-device submission threads
	if(!m_impl->config.per_device_submission_threads) { return work(m_impl->get_device_queue(device, lane)); }

	auto& device_state = m_impl->devices[device];
	auto& submission_thread = device_state.submission_thread;
	assert(submission_thread.has_value());

	// Note: this mechanism is quite similar in principle to a std::future/promise,
	//       but implementing it with that caused a 50% (!) slowdown in system-level benchmarks
	const auto async_event_state = std::make_shared<sycl_backend_detail::delayed_async_event::state>();
	auto async_event = make_async_event<sycl_backend_detail::delayed_async_event>(async_event_state);

	(void)submission_thread->submit([this, device, lane, work, async_event_state] {
		auto event = work(m_impl->get_device_queue(device, lane));
		async_event_state->set_value(std::move(event));
	});
	return async_event;
}

bool sycl_backend::is_profiling_enabled() const { return m_impl->config.profiling; }

std::vector<sycl_backend_type> sycl_backend_enumerator::compatible_backends(const sycl::device& device) const {
	std::vector<backend_type> backends{backend_type::generic};
#if CELERITY_WORKAROUND(ACPP) && defined(SYCL_EXT_HIPSYCL_BACKEND_CUDA)
	if(device.get_backend() == sycl::backend::cuda) { backends.push_back(sycl_backend_type::cuda); }
#elif CELERITY_WORKAROUND(DPCPP)
	if(device.get_backend() == sycl::backend::ext_oneapi_cuda) { backends.push_back(sycl_backend_type::cuda); }
#endif
	assert(std::is_sorted(backends.begin(), backends.end()));
	return backends;
}

std::vector<sycl_backend_type> sycl_backend_enumerator::available_backends() const {
	std::vector<backend_type> backends{backend_type::generic};
#if CELERITY_DETAIL_BACKEND_CUDA_ENABLED
	backends.push_back(sycl_backend_type::cuda);
#endif
	assert(std::is_sorted(backends.begin(), backends.end()));
	return backends;
}

bool sycl_backend_enumerator::is_specialized(backend_type type) const {
	switch(type) {
	case backend_type::generic: return false;
	case backend_type::cuda: return true;
	default: utils::unreachable(); // LCOV_EXCL_LINE
	}
}

int sycl_backend_enumerator::get_priority(backend_type type) const {
	switch(type) {
	case backend_type::generic: return 0;
	case backend_type::cuda: return 1;
	default: utils::unreachable(); // LCOV_EXCL_LINE
	}
}

} // namespace celerity::detail

namespace celerity::detail {

std::unique_ptr<backend> make_sycl_backend(const sycl_backend_type type, const std::vector<sycl::device>& devices, const sycl_backend::configuration& config) {
	assert(std::all_of(
	    devices.begin(), devices.end(), [=](const sycl::device& d) { return utils::contains(sycl_backend_enumerator{}.compatible_backends(d), type); }));

	switch(type) {
	case sycl_backend_type::generic: //
		return std::make_unique<sycl_generic_backend>(devices, config);

	case sycl_backend_type::cuda:
#if CELERITY_DETAIL_BACKEND_CUDA_ENABLED
		return std::make_unique<sycl_cuda_backend>(devices, config);
#else
		utils::panic("CUDA backend has not been compiled");
#endif
	}
	utils::unreachable(); // LCOV_EXCL_LINE
}

} // namespace celerity::detail
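
For readers following the enqueue_device_work path above: the release/acquire handshake behind delayed_async_event can be reduced to the standalone sketch below. Names and the delayed_slot type are hypothetical, it uses only the standard library, and it is an illustration of the pattern rather than part of the file under test:

#include <atomic>
#include <cassert>
#include <utility>

// Minimal sketch of a "delayed result" slot: a producer thread publishes a value
// exactly once, while a consumer thread polls for it without blocking.
template <typename T>
class delayed_slot {
  public:
	void publish(T value) {
		m_value = std::move(value); // write the payload first...
		// ...then make it visible; the release ordering pairs with the acquire load in poll()
		[[maybe_unused]] const bool was_ready = m_ready.exchange(true, std::memory_order_release);
		assert(!was_ready && "publish() called more than once");
	}

	// Returns nullptr until publish() has completed; once non-null, the payload write is visible.
	const T* poll() const {
		if(!m_ready.load(std::memory_order_acquire)) return nullptr;
		return &m_value;
	}

  private:
	T m_value{};
	std::atomic<bool> m_ready{false};
};

The release exchange in publish() pairs with the acquire load in poll(), mirroring set_value() and is_complete()/get_result() in the listing, which is presumably what lets the executor's polling loop stay lock-free instead of paying for std::future/promise machinery.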