• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

celerity / celerity-runtime / 9549024017

17 Jun 2024 01:57PM UTC coverage: 94.675%. Remained the same
9549024017

push

github

fknorr
Implement and test out_of_order_engine

The out-of-order engine keeps track of all instructions that have not yet
completed and decides which instructions to schedule onto which backend
resources at what time, and receives back information on which instructions
have already completed. This will allow us to keep the instruction executor
free of most instruction state tracking.

This new form of scheduling is based on a definition of backends which maintain
an array of in-order thread queues for host work and in-order SYCL queues for
device submissions. This allows the engine to omit host / executor loop
round-trips between consecutive GPU / CPU loads by scheduling successors onto
the same in-order queues to implicitly fulfil dependencies, and thus hide SYCL
and CUDA kernel launch latency.

In the future this could be improved further with support to submit
instructions with dependencies on multiple queues / devices earlier by waiting
on in-flight SYCL events.

3064 of 3419 branches covered (89.62%)

Branch coverage included in aggregate %.

187 of 201 new or added lines in 4 files covered. (93.03%)

20 existing lines in 4 files now uncovered.

7053 of 7267 relevant lines covered (97.06%)

216909.6 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.36
/include/device_queue.h
1
#pragma once
2

3
#include <algorithm>
4
#include <memory>
5
#include <type_traits>
6
#include <variant>
7

8
#include <CL/sycl.hpp>
9

10
#include "backend/backend.h"
11
#include "config.h"
12
#include "log.h"
13
#include "workaround.h"
14

15
namespace celerity {
16
namespace detail {
17

18
        struct auto_select_device {};
19
        using device_selector = std::function<int(const sycl::device&)>;
20
        using device_or_selector = std::variant<auto_select_device, sycl::device, device_selector>;
21

22
        class task;
23

24
        struct device_allocation {
25
                void* ptr = nullptr;
26
                size_t size_bytes = 0;
27
        };
28

29
        class allocation_error : public std::runtime_error {
30
          public:
31
                allocation_error(const std::string& msg) : std::runtime_error(msg) {}
3✔
32
        };
33

34
        /**
35
         * The @p device_queue wraps the actual SYCL queue and is used to submit kernels.
36
         */
37
        class device_queue {
38
          public:
39
                /**
40
                 * @brief Initializes the @p device_queue, selecting an appropriate device in the process.
41
                 *
42
                 * @param cfg The configuration is used to select the appropriate SYCL device.
43
                 * @param user_device_or_selector Optionally a device (which will take precedence over any configuration) or a device selector can be provided.
44
                 */
45
                void init(const config& cfg, const device_or_selector& user_device_or_selector);
46

47
                /**
48
                 * @brief Executes the kernel associated with task @p ctsk over the chunk @p chnk.
49
                 */
50
                template <typename Fn>
51
                cl::sycl::event submit(Fn&& fn) {
252✔
52
                        auto evt = m_sycl_queue->submit([fn = std::forward<Fn>(fn)](cl::sycl::handler& sycl_handler) { fn(sycl_handler); });
504✔
53
#if CELERITY_WORKAROUND(HIPSYCL)
54
#pragma GCC diagnostic push
55
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
56
                        // hipSYCL does not guarantee that command groups are actually scheduled until an explicit await operation, which we cannot insert without
57
                        // blocking the executor loop (see https://github.com/illuhad/hipSYCL/issues/599). Instead, we explicitly flush the queue to be able to continue
58
                        // using our polling-based approach.
59
                        m_sycl_queue->get_context().hipSYCL_runtime()->dag().flush_async();
60
#pragma GCC diagnostic pop
61
#endif
62
                        return evt;
252✔
63
                }
64

65
                template <typename T>
66
                [[nodiscard]] device_allocation malloc(const size_t count) {
201✔
67
                        const size_t size_bytes = count * sizeof(T);
201✔
68
                        assert(m_sycl_queue != nullptr);
201✔
69
                        assert(m_global_mem_allocated_bytes + size_bytes < m_global_mem_total_size_bytes);
201✔
70
                        CELERITY_DEBUG("Allocating {} bytes on device", size_bytes);
402✔
71
                        T* ptr = nullptr;
201✔
72
                        try {
73
                                ptr = sycl::aligned_alloc_device<T>(alignof(T), count, *m_sycl_queue);
201✔
UNCOV
74
                        } catch(sycl::exception& e) {
×
UNCOV
75
                                CELERITY_CRITICAL("sycl::aligned_alloc_device failed with exception: {}", e.what());
×
UNCOV
76
                                ptr = nullptr;
×
77
                        }
78
                        if(ptr == nullptr) {
201✔
79
                                throw allocation_error(fmt::format("Allocation of {} bytes failed; likely out of memory. Currently allocated: {} out of {} bytes.",
3✔
80
                                    count * sizeof(T), m_global_mem_allocated_bytes, m_global_mem_total_size_bytes));
2✔
81
                        }
82
                        m_global_mem_allocated_bytes += size_bytes;
200✔
83
                        return device_allocation{ptr, size_bytes};
200✔
84
                }
85

86
                void free(device_allocation alloc) {
205✔
87
                        assert(m_sycl_queue != nullptr);
205✔
88
                        assert(alloc.size_bytes <= m_global_mem_allocated_bytes);
205✔
89
                        assert(alloc.ptr != nullptr || alloc.size_bytes == 0);
205✔
90
                        CELERITY_DEBUG("Freeing {} bytes on device", alloc.size_bytes);
410✔
91
                        if(alloc.size_bytes != 0) { sycl::free(alloc.ptr, *m_sycl_queue); }
205✔
92
                        m_global_mem_allocated_bytes -= alloc.size_bytes;
205✔
93
                }
205✔
94

95
                size_t get_global_memory_total_size_bytes() const { return m_global_mem_total_size_bytes; }
351✔
96

97
                size_t get_global_memory_allocated_bytes() const { return m_global_mem_allocated_bytes; }
340✔
98

99
                /**
100
                 * @brief Waits until all currently submitted operations have completed.
101
                 */
102
                void wait() { m_sycl_queue->wait_and_throw(); }
191✔
103

104
                /**
105
                 * @brief Returns whether device profiling is enabled.
106
                 */
107
                bool is_profiling_enabled() const { return m_device_profiling_enabled; }
470✔
108

109
                cl::sycl::queue& get_sycl_queue() const {
826✔
110
                        assert(m_sycl_queue != nullptr);
826✔
111
                        return *m_sycl_queue;
826✔
112
                }
113

114
          private:
115
                size_t m_global_mem_total_size_bytes = 0;
116
                size_t m_global_mem_allocated_bytes = 0;
117
                std::unique_ptr<cl::sycl::queue> m_sycl_queue;
118
                bool m_device_profiling_enabled = false;
119

120
                void handle_async_exceptions(cl::sycl::exception_list el) const;
121
        };
122

123
        // Try to find a platform that can provide a unique device for each node using a device selector.
124
        template <typename DeviceT, typename PlatformT, typename SelectorT>
125
        bool try_find_device_per_node(
9✔
126
            std::string& how_selected, DeviceT& device, const std::vector<PlatformT>& platforms, const host_config& host_cfg, SelectorT selector) {
127
                std::vector<std::tuple<DeviceT, size_t>> devices_with_platform_idx;
9✔
128
                for(size_t i = 0; i < platforms.size(); ++i) {
32✔
129
                        auto&& platform = platforms[i];
23✔
130
                        for(auto device : platform.get_devices()) {
142✔
131
                                if(selector(device) == -1) { continue; }
62✔
132
                                devices_with_platform_idx.emplace_back(device, i);
57✔
133
                        }
134
                }
135

136
                std::stable_sort(devices_with_platform_idx.begin(), devices_with_platform_idx.end(),
9✔
137
                    [selector](const auto& a, const auto& b) { return selector(std::get<0>(a)) > selector(std::get<0>(b)); });
128✔
138
                bool same_platform = true;
9✔
139
                bool same_device_type = true;
9✔
140
                if(devices_with_platform_idx.size() >= host_cfg.node_count) {
9✔
141
                        auto [device_from_platform, idx] = devices_with_platform_idx[0];
7✔
142
                        const auto platform = device_from_platform.get_platform();
7✔
143
                        const auto device_type = device_from_platform.template get_info<sycl::info::device::device_type>();
7✔
144

145
                        for(size_t i = 1; i < host_cfg.node_count; ++i) {
31✔
146
                                auto [device_from_platform, idx] = devices_with_platform_idx[i];
12✔
147
                                if(device_from_platform.get_platform() != platform) { same_platform = false; }
12✔
148
                                if(device_from_platform.template get_info<sycl::info::device::device_type>() != device_type) { same_device_type = false; }
12✔
149
                        }
150

151
                        if(!same_platform || !same_device_type) { CELERITY_WARN("Selected devices are of different type and/or do not belong to the same platform"); }
10✔
152

153
                        auto [selected_device_from_platform, selected_idx] = devices_with_platform_idx[host_cfg.local_rank];
7✔
154
                        how_selected = fmt::format("device selector specified: platform {}, device {}", selected_idx, host_cfg.local_rank);
14✔
155
                        device = selected_device_from_platform;
7✔
156
                        return true;
7✔
157
                }
7✔
158

159
                return false;
2✔
160
        }
9✔
161

162
        // Try to find a platform that can provide a unique device for each node.
163
        template <typename DeviceT, typename PlatformT>
164
        bool try_find_device_per_node(
327✔
165
            std::string& how_selected, DeviceT& device, const std::vector<PlatformT>& platforms, const host_config& host_cfg, sycl::info::device_type type) {
166
                for(size_t i = 0; i < platforms.size(); ++i) {
772✔
167
                        auto&& platform = platforms[i];
371✔
168
                        std::vector<DeviceT> platform_devices;
371✔
169

170
                        platform_devices = platform.get_devices(type);
371✔
171
                        if(platform_devices.size() >= host_cfg.node_count) {
371✔
172
                                how_selected = fmt::format("automatically selected platform {}, device {}", i, host_cfg.local_rank);
594✔
173
                                device = platform_devices[host_cfg.local_rank];
297✔
174
                                return true;
297✔
175
                        }
176
                }
177

178
                return false;
30✔
179
        }
180

181
        template <typename DeviceT, typename PlatformT, typename SelectorT>
182
        bool try_find_one_device(
2✔
183
            std::string& how_selected, DeviceT& device, const std::vector<PlatformT>& platforms, const host_config& host_cfg, SelectorT selector) {
184
                std::vector<DeviceT> platform_devices;
2✔
185
                for(auto& p : platforms) {
14✔
186
                        auto p_devices = p.get_devices();
6✔
187
                        platform_devices.insert(platform_devices.end(), p_devices.begin(), p_devices.end());
6✔
188
                }
189

190
                std::stable_sort(platform_devices.begin(), platform_devices.end(), [selector](const auto& a, const auto& b) { return selector(a) > selector(b); });
13✔
191
                if(!platform_devices.empty()) {
2!
192
                        if(selector(platform_devices[0]) == -1) { return false; }
2✔
193
                        device = platform_devices[0];
1✔
194
                        return true;
1✔
195
                }
196

UNCOV
197
                return false;
×
198
        };
2✔
199

200
        template <typename DeviceT, typename PlatformT>
201
        bool try_find_one_device(
9✔
202
            std::string& how_selected, DeviceT& device, const std::vector<PlatformT>& platforms, const host_config& host_cfg, sycl::info::device_type type) {
203
                for(auto& p : platforms) {
16✔
204
                        for(auto& d : p.get_devices(type)) {
17✔
205
                                device = d;
5✔
206
                                return true;
5✔
207
                        }
208
                }
209

210
                return false;
4✔
211
        };
212

213

214
        template <typename DevicePtrOrSelector, typename PlatformT>
215
        auto pick_device(const config& cfg, const DevicePtrOrSelector& user_device_or_selector, const std::vector<PlatformT>& platforms) {
321✔
216
                using DeviceT = typename decltype(std::declval<PlatformT&>().get_devices())::value_type;
217

218
                constexpr bool user_device_provided = std::is_same_v<DevicePtrOrSelector, DeviceT>;
321✔
219
                constexpr bool device_selector_provided = std::is_invocable_r_v<int, DevicePtrOrSelector, DeviceT>;
321✔
220
                constexpr bool auto_select = std::is_same_v<auto_select_device, DevicePtrOrSelector>;
321✔
221
                static_assert(
222
                    user_device_provided ^ device_selector_provided ^ auto_select, "pick_device requires either a device, a selector, or the auto_select_device tag");
223

224
                DeviceT device;
321✔
225
                std::string how_selected = "automatically selected";
642✔
226
                if constexpr(user_device_provided) {
227
                        device = user_device_or_selector;
5✔
228
                        how_selected = "specified by user";
5✔
229
                } else {
230
                        const auto device_cfg = cfg.get_device_config();
316✔
231
                        if(device_cfg != std::nullopt) {
316✔
232
                                how_selected = fmt::format("set by CELERITY_DEVICES: platform {}, device {}", device_cfg->platform_id, device_cfg->device_id);
8✔
233
                                CELERITY_DEBUG("{} platforms available", platforms.size());
8✔
234
                                if(device_cfg->platform_id >= platforms.size()) {
4✔
235
                                        throw std::runtime_error(fmt::format("Invalid platform id {}: Only {} platforms available", device_cfg->platform_id, platforms.size()));
2✔
236
                                }
237
                                const auto devices = platforms[device_cfg->platform_id].get_devices();
3✔
238
                                if(device_cfg->device_id >= devices.size()) {
3✔
239
                                        throw std::runtime_error(fmt::format(
3✔
240
                                            "Invalid device id {}: Only {} devices available on platform {}", device_cfg->device_id, devices.size(), device_cfg->platform_id));
2✔
241
                                }
242
                                device = devices[device_cfg->device_id];
2✔
243
                        } else {
3✔
244
                                const auto host_cfg = cfg.get_host_config();
312✔
245

246
                                if constexpr(!device_selector_provided) {
247
                                        // Try to find a unique GPU per node.
248
                                        if(!try_find_device_per_node(how_selected, device, platforms, host_cfg, sycl::info::device_type::gpu)) {
303✔
249
                                                if(try_find_device_per_node(how_selected, device, platforms, host_cfg, sycl::info::device_type::all)) {
24✔
250
                                                        CELERITY_WARN("No suitable platform found that can provide {} GPU devices, and CELERITY_DEVICES not set", host_cfg.node_count);
36✔
251
                                                } else {
252
                                                        CELERITY_WARN("No suitable platform found that can provide {} devices, and CELERITY_DEVICES not set", host_cfg.node_count);
12✔
253
                                                        // Just use the first available device. Prefer GPUs, but settle for anything.
254
                                                        if(!try_find_one_device(how_selected, device, platforms, host_cfg, sycl::info::device_type::gpu)
6✔
255
                                                            && !try_find_one_device(how_selected, device, platforms, host_cfg, sycl::info::device_type::all)) {
6✔
256
                                                                throw std::runtime_error("Automatic device selection failed: No device available");
1✔
257
                                                        }
258
                                                }
259
                                        }
260
                                } else {
261
                                        // Try to find a unique device per node using a selector.
262
                                        if(!try_find_device_per_node(how_selected, device, platforms, host_cfg, user_device_or_selector)) {
9✔
263
                                                CELERITY_WARN("No suitable platform found that can provide {} devices that match the specified device selector, and "
4✔
264
                                                              "CELERITY_DEVICES not set",
265
                                                    host_cfg.node_count);
266
                                                // Use the first available device according to the selector, but fails if no such device is found.
267
                                                if(!try_find_one_device(how_selected, device, platforms, host_cfg, user_device_or_selector)) {
2✔
268
                                                        throw std::runtime_error("Device selection with device selector failed: No device available");
1✔
269
                                                }
270
                                        }
271
                                }
272
                        }
273
                }
274

275
                const auto platform_name = device.get_platform().template get_info<sycl::info::platform::name>();
590✔
276
                const auto device_name = device.template get_info<sycl::info::device::name>();
317✔
277
                CELERITY_INFO("Using platform '{}', device '{}' ({})", platform_name, device_name, how_selected);
634✔
278

279
                if constexpr(std::is_same_v<DeviceT, sycl::device>) {
280
                        if(backend::get_effective_type(device) == backend::type::generic) {
273!
281
                                if(backend::get_type(device) == backend::type::unknown) {
273!
282
                                        CELERITY_WARN("No backend specialization available for selected platform '{}', falling back to generic. Performance may be degraded.",
546✔
283
                                            device.get_platform().template get_info<sycl::info::platform::name>());
284
                                } else {
UNCOV
285
                                        CELERITY_WARN(
×
286
                                            "Selected platform '{}' is compatible with specialized {} backend, but it has not been compiled. Performance may be degraded.",
287
                                            device.get_platform().template get_info<sycl::info::platform::name>(), backend::get_name(backend::get_type(device)));
288
                                }
289
                        } else {
UNCOV
290
                                CELERITY_DEBUG("Using {} backend for selected platform '{}'.", backend::get_name(backend::get_effective_type(device)),
×
291
                                    device.get_platform().template get_info<sycl::info::platform::name>());
292
                        }
293
                }
294

295
                return device;
634✔
296
        }
325✔
297

298
} // namespace detail
299
} // namespace celerity
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc