celerity / celerity-runtime · build 9549024017
17 Jun 2024 01:57PM UTC · coverage: 94.675% (remained the same)
push · github

fknorr
Implement and test out_of_order_engine

The out-of-order engine keeps track of all instructions that have not yet
completed, decides which instructions to schedule onto which backend
resources at what time, and receives back information on which instructions
have completed. This will allow us to keep the instruction executor free of
most instruction state tracking.
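
As a rough illustration of that division of labour, consider the following
hypothetical sketch; the names below (toy_engine, submit, assign_one,
complete) are invented for this description and do not mirror the real
out_of_order_engine interface:

// Hypothetical sketch only: invented names, not the real out_of_order_engine API.
#include <cstdint>
#include <optional>
#include <unordered_map>
#include <vector>

using instruction_id = std::uint64_t;

class toy_engine {
  public:
    // A new instruction enters the engine together with the ids it depends on.
    void submit(instruction_id iid, std::vector<instruction_id> deps) { m_pending.emplace(iid, std::move(deps)); }

    // Hand out any instruction whose dependencies have all completed
    // (a real engine would additionally track which queue it was assigned to).
    std::optional<instruction_id> assign_one() const {
        for(const auto& [iid, deps] : m_pending) {
            if(deps.empty()) return iid;
        }
        return std::nullopt;
    }

    // The executor reports completion; retire the instruction and unblock dependents.
    void complete(const instruction_id iid) {
        m_pending.erase(iid);
        for(auto& [other, deps] : m_pending) { std::erase(deps, iid); }
    }

  private:
    std::unordered_map<instruction_id, std::vector<instruction_id>> m_pending;
};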

This new form of scheduling is based on a definition of backends that maintain
an array of in-order thread queues for host work and in-order SYCL queues for
device submissions. This allows the engine to omit host / executor-loop
round-trips between consecutive GPU / CPU workloads by scheduling successors
onto the same in-order queue, which implicitly fulfils their dependencies and
thus hides SYCL and CUDA kernel launch latency.
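
For background, the SYCL mechanism being relied on can be shown as a minimal
standalone sketch in plain SYCL 2020 (not celerity code): two kernels submitted
back-to-back to the same in-order queue execute in submission order, so the
dependency between them is fulfilled on the device without the host waiting in
between.

// Minimal plain-SYCL sketch (not celerity code): an in-order queue serializes
// its submissions, so the second kernel runs after the first without any
// host-side wait or event exchange between the two submissions.
#include <sycl/sycl.hpp>

int main() {
    sycl::queue q{sycl::property::queue::in_order{}};
    int* data = sycl::malloc_device<int>(1024, q);

    // producer kernel
    q.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> i) { data[i[0]] = static_cast<int>(i[0]); });

    // consumer kernel: ordered after the producer purely by the queue's
    // in-order property, i.e. the dependency is fulfilled implicitly
    q.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> i) { data[i[0]] *= 2; });

    q.wait(); // single synchronization point at the end
    sycl::free(data, q);
    return 0;
}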

In the future this could be improved further: instructions with dependencies
on multiple queues / devices could be submitted earlier by waiting on
in-flight SYCL events.
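
That improvement maps onto a facility that exists in plain SYCL today; the
following hedged sketch (again standalone SYCL 2020, not the celerity
implementation) makes a submission on one in-order queue depend on an
in-flight event from another queue via handler::depends_on:

// Plain-SYCL sketch (not celerity code): queue b's submission depends on an
// in-flight event from queue a via handler::depends_on, without the host
// blocking on that event first.
#include <sycl/sycl.hpp>

int main() {
    const sycl::device dev{sycl::default_selector_v};
    const sycl::context ctx{dev};
    sycl::queue a{ctx, dev, sycl::property::queue::in_order{}};
    sycl::queue b{ctx, dev, sycl::property::queue::in_order{}};

    int* data = sycl::malloc_device<int>(1024, dev, ctx);

    sycl::event produced = a.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> i) { data[i[0]] = 1; });

    // cross-queue dependency expressed on the device side
    b.submit([&](sycl::handler& cgh) {
        cgh.depends_on(produced);
        cgh.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> i) { data[i[0]] += 1; });
    });

    b.wait();
    sycl::free(data, ctx);
    return 0;
}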

3064 of 3419 branches covered (89.62%); branch coverage is included in the aggregate %.

187 of 201 new or added lines in 4 files covered (93.03%).

20 existing lines in 4 files are now uncovered.

7053 of 7267 relevant lines covered (97.06%).

216,909.6 hits per line.

Source file: /src/task.cc (88.16% covered; lines with no hits in this build are marked "// uncovered")

#include "task.h"

#include "access_modes.h"
#include "utils.h"


namespace celerity {
namespace detail {

    std::unordered_set<buffer_id> buffer_access_map::get_accessed_buffers() const {
        std::unordered_set<buffer_id> result;
        for(const auto& [bid, _] : m_accesses) {
            result.emplace(bid);
        }
        return result;
    }

    std::unordered_set<cl::sycl::access::mode> buffer_access_map::get_access_modes(buffer_id bid) const {
        std::unordered_set<cl::sycl::access::mode> result;
        for(const auto& [b, rm] : m_accesses) {
            if(b == bid) { result.insert(rm->get_access_mode()); }
        }
        return result;
    }

    template <int KernelDims>
    subrange<3> apply_range_mapper(const range_mapper_base* rm, const chunk<KernelDims>& chnk) {
        switch(rm->get_buffer_dimensions()) {
        case 0: return subrange_cast<3>(subrange<0>());
        case 1: return subrange_cast<3>(rm->map_1(chnk));
        case 2: return subrange_cast<3>(rm->map_2(chnk));
        case 3: return rm->map_3(chnk);
        default: assert(false); return subrange<3>{};
        }
    }

    subrange<3> apply_range_mapper(const range_mapper_base* rm, const chunk<3>& chnk, int kernel_dims) {
        switch(kernel_dims) {
        case 0: return apply_range_mapper<0>(rm, chunk_cast<0>(chnk));
        case 1: return apply_range_mapper<1>(rm, chunk_cast<1>(chnk));
        case 2: return apply_range_mapper<2>(rm, chunk_cast<2>(chnk));
        case 3: return apply_range_mapper<3>(rm, chunk_cast<3>(chnk));
        default: assert(!"Unreachable"); return subrange<3>{}; // uncovered
        }
    }

    region<3> buffer_access_map::get_mode_requirements(
        const buffer_id bid, const access_mode mode, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const {
        box_vector<3> boxes;
        for(size_t i = 0; i < m_accesses.size(); ++i) {
            if(m_accesses[i].first != bid || m_accesses[i].second->get_access_mode() != mode) continue;
            boxes.push_back(get_requirements_for_nth_access(i, kernel_dims, sr, global_size));
        }
        return region(std::move(boxes));
    }

    box<3> buffer_access_map::get_requirements_for_nth_access(const size_t n, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const {
        return apply_range_mapper(m_accesses[n].second.get(), chunk<3>{sr.offset, sr.range, global_size}, kernel_dims);
    }

    box_vector<3> buffer_access_map::get_required_contiguous_boxes(
        const buffer_id bid, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const {
        box_vector<3> boxes;
        for(const auto& [a_bid, a_rm] : m_accesses) {
            if(a_bid == bid) {
                const auto accessed_box = box(apply_range_mapper(a_rm.get(), chunk<3>{sr.offset, sr.range, global_size}, kernel_dims));
                if(!accessed_box.empty()) { boxes.push_back(accessed_box); }
            }
        }
        return boxes;
    }

    void side_effect_map::add_side_effect(const host_object_id hoid, const experimental::side_effect_order order) {
        // TODO for multiple side effects on the same hoid, find the weakest order satisfying all of them
        emplace(hoid, order);
    }

    std::string print_task_debug_label(const task& tsk, bool title_case) {
        const auto type_string = [&] {
            switch(tsk.get_type()) {
            case task_type::epoch: return "epoch"; // uncovered
            case task_type::host_compute: return "host-compute task";
            case task_type::device_compute: return "device kernel";
            case task_type::collective: return "collective host task";
            case task_type::master_node: return "master-node host task";
            case task_type::horizon: return "horizon"; // uncovered
            case task_type::fence: return "fence"; // uncovered
            default: return "unknown task"; // uncovered
            }
        }();

        auto label = fmt::format("{} T{}", type_string, tsk.get_id());
        if(title_case) { label[0] = static_cast<char>(std::toupper(label[0])); }
        if(!tsk.get_debug_name().empty()) { fmt::format_to(std::back_inserter(label), " \"{}\"", tsk.get_debug_name()); }
        return label;
    }

    std::unordered_map<buffer_id, region<3>> detect_overlapping_writes(const task& tsk, const box_vector<3>& chunks) {
        const box<3> scalar_reduction_box({0, 0, 0}, {1, 1, 1});

        auto& bam = tsk.get_buffer_access_map();

        // track the union of writes we have checked so far in order to detect an overlap between that union and the next write
        std::unordered_map<buffer_id, region<3>> buffer_write_accumulators;
        // collect overlapping writes in order to report all of them before throwing
        std::unordered_map<buffer_id, region<3>> overlapping_writes;

        for(const auto bid : bam.get_accessed_buffers()) {
            for(const auto& ck : chunks) {
                region<3> writes;
                for(const auto mode : bam.get_access_modes(bid)) {
                    if(access::mode_traits::is_producer(mode)) {
                        const auto req = bam.get_mode_requirements(bid, mode, tsk.get_dimensions(), ck.get_subrange(), tsk.get_global_size());
                        writes = region_union(writes, req);
                    }
                }
                if(!writes.empty()) {
                    auto& write_accumulator = buffer_write_accumulators[bid]; // allow default-insert
                    if(const auto overlap = region_intersection(write_accumulator, writes); !overlap.empty()) {
                        auto& full_overlap = overlapping_writes[bid]; // allow default-insert
                        full_overlap = region_union(full_overlap, overlap);
                    }
                    write_accumulator = region_union(write_accumulator, writes);
                }
            }
        }

        // we already check for accessor-reduction overlaps on task generation, but we still repeat the sanity-check here
        for(const auto& rinfo : tsk.get_reductions()) {
            auto& write_accumulator = buffer_write_accumulators[rinfo.bid]; // allow default-insert
            if(const auto overlap = region_intersection(write_accumulator, scalar_reduction_box); !overlap.empty()) {
                auto& full_overlap = overlapping_writes[rinfo.bid]; // allow default-insert // uncovered
                full_overlap = region_union(full_overlap, overlap); // uncovered
            }
            write_accumulator = region_union(write_accumulator, scalar_reduction_box);
        }

        return overlapping_writes;
    }

} // namespace detail
} // namespace celerity