• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

daisytuner / docc / 28182474503

25 Jun 2026 02:25PM UTC coverage: 61.9% (+0.3%) from 61.644%
28182474503

Pull #812

github

web-flow
Merge aafbca6ca into fe9abd1cb
Pull Request #812: Define Buffer reuse pass for offloaded containers

290 of 317 new or added lines in 2 files covered. (91.48%)

1 existing line in 1 file now uncovered.

38657 of 62451 relevant lines covered (61.9%)

987.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.46
/opt/src/passes/offloading/device_buffer_reuse_pass.cpp
1
#include "sdfg/passes/offloading/device_buffer_reuse_pass.h"
2

3
#include <algorithm>
4
#include <string>
5
#include <tuple>
6
#include <unordered_map>
7
#include <unordered_set>
8
#include <vector>
9

10
#include "sdfg/analysis/dominance_analysis.h"
11
#include "sdfg/analysis/users.h"
12
#include "sdfg/data_flow/access_node.h"
13
#include "sdfg/data_flow/code_node.h"
14
#include "sdfg/data_flow/data_flow_graph.h"
15
#include "sdfg/data_flow/data_flow_node.h"
16
#include "sdfg/data_flow/memlet.h"
17
#include "sdfg/structured_control_flow/block.h"
18
#include "sdfg/structured_control_flow/if_else.h"
19
#include "sdfg/structured_control_flow/sequence.h"
20
#include "sdfg/structured_control_flow/structured_loop.h"
21
#include "sdfg/symbolic/symbolic.h"
22
#include "sdfg/targets/offloading/data_offloading_node.h"
23

24
namespace sdfg {
25
namespace passes {
26

27
// The device container an offloading node operates on: the destination of an ALLOC node's `_dev`
28
// edge, or the source of a FREE node's `_dev` edge.
29
const data_flow::AccessNode* device_access(structured_control_flow::Block& block, offloading::DataOffloadingNode& node) {
150✔
30
    auto& dataflow = block.dataflow();
150✔
31
    if (node.is_alloc()) {
150✔
32
        for (auto& memlet : dataflow.out_edges(node)) {
76✔
33
            if (auto* access = dynamic_cast<const data_flow::AccessNode*>(&memlet.dst())) {
76✔
34
                return access;
76✔
35
            }
76✔
36
        }
76✔
37
    } else if (node.is_free()) {
76✔
38
        for (auto& memlet : dataflow.in_edges(node)) {
74✔
39
            if (auto* access = dynamic_cast<const data_flow::AccessNode*>(&memlet.src())) {
74✔
40
                return access;
74✔
41
            }
74✔
42
        }
74✔
43
    }
74✔
NEW
44
    return nullptr;
×
45
}
150✔
46

47
// Find the liveness User attached to a specific access-node element of a container.
48
analysis::User* user_for_element(analysis::Users& users, const std::string& container, const data_flow::AccessNode* el) {
148✔
49
    for (auto* user : users.uses(container)) {
358✔
50
        if (user->element() == el) {
358✔
51
            return user;
148✔
52
        }
148✔
53
    }
358✔
NEW
54
    return nullptr;
×
55
}
148✔
56

57
BufferReuseMarker::BufferReuseMarker(
58
    builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager, analysis::Users& users
59
)
60
    : visitor::StructuredSDFGVisitor(builder, analysis_manager), users_(users), sdfg_(builder.subject()) {}
34✔
61

62
bool BufferReuseMarker::visit_internal(structured_control_flow::Sequence& parent) {
46✔
63
    for (size_t i = 0; i < parent.size(); i++) {
286✔
64
        auto& current = parent.at(i).first;
240✔
65
        if (auto* block = dynamic_cast<structured_control_flow::Block*>(&current)) {
240✔
66
            process_block(*block);
232✔
67
        } else if (auto* seq = dynamic_cast<structured_control_flow::Sequence*>(&current)) {
232✔
NEW
68
            visit_internal(*seq);
×
69
        } else if (auto* if_else = dynamic_cast<structured_control_flow::IfElse*>(&current)) {
8✔
70
            // Buffers created in different branches are not totally ordered, so they interfere.
71
            std::vector<std::pair<size_t, size_t>> ranges;
6✔
72
            for (size_t c = 0; c < if_else->size(); c++) {
16✔
73
                size_t start = candidates.size();
10✔
74
                visit_internal(if_else->at(c).first);
10✔
75
                ranges.emplace_back(start, candidates.size());
10✔
76
            }
10✔
77
            for (size_t a = 0; a < ranges.size(); a++) {
16✔
78
                for (size_t b = a + 1; b < ranges.size(); b++) {
14✔
79
                    for (size_t x = ranges[a].first; x < ranges[a].second; x++) {
8✔
80
                        for (size_t y = ranges[b].first; y < ranges[b].second; y++) {
8✔
81
                            add_edge(x, y);
4✔
82
                        }
4✔
83
                    }
4✔
84
                }
4✔
85
            }
10✔
86
        } else if (auto* loop = dynamic_cast<structured_control_flow::StructuredLoop*>(&current)) {
6✔
87
            visit_internal(loop->root());
2✔
88
        }
2✔
89
    }
240✔
90
    return false;
46✔
91
}
46✔
92

93
void BufferReuseMarker::add_edge(size_t a, size_t b) {
8✔
94
    if (a == b) {
8✔
NEW
95
        return;
×
NEW
96
    }
×
97
    adjacency[a].insert(b);
8✔
98
    adjacency[b].insert(a);
8✔
99
}
8✔
100

101
// Promote a pending allocation to a live candidate once a using kernel has been found.
102
void BufferReuseMarker::assign(const std::string& container) {
86✔
103
    auto it = unassigned_.find(container);
86✔
104
    if (it == unassigned_.end()) {
86✔
105
        return;
12✔
106
    }
12✔
107
    AllocRec rec = it->second;
74✔
108
    unassigned_.erase(it);
74✔
109

110
    Candidate cand;
74✔
111
    cand.container = container;
74✔
112
    cand.dtype = sdfg_.type(container).primitive_type();
74✔
113
    cand.size = rec.node->size();
74✔
114
    cand.alloc_block = rec.block;
74✔
115
    cand.alloc_node = rec.node;
74✔
116
    cand.alloc_user = user_for_element(users_, container, rec.access);
74✔
117

118
    size_t idx = candidates.size();
74✔
119
    candidates.push_back(std::move(cand));
74✔
120
    adjacency.emplace_back();
74✔
121

122
    // Overlapping live ranges interfere.
123
    for (size_t l : live_) {
74✔
124
        add_edge(idx, l);
4✔
125
    }
4✔
126
    live_.push_back(idx);
74✔
127
    live_index_[container] = idx;
74✔
128
}
74✔
129

130
// End a buffer's live range at its free.
131
void BufferReuseMarker::close(
132
    const std::string& container,
133
    structured_control_flow::Block& block,
134
    offloading::DataOffloadingNode& node,
135
    const data_flow::AccessNode* access
136
) {
74✔
137
    if (live_index_.find(container) == live_index_.end()) {
74✔
138
        // Allocated but never used by a kernel: realise it now so the alloc/free still pair up.
NEW
139
        assign(container);
×
NEW
140
    }
×
141
    auto li = live_index_.find(container);
74✔
142
    if (li == live_index_.end()) {
74✔
NEW
143
        return;
×
NEW
144
    }
×
145
    size_t idx = li->second;
74✔
146
    candidates[idx].free_block = &block;
74✔
147
    candidates[idx].free_node = &node;
74✔
148
    candidates[idx].free_user = user_for_element(users_, container, access);
74✔
149
    live_.erase(std::remove(live_.begin(), live_.end(), idx), live_.end());
74✔
150
    live_index_.erase(li);
74✔
151
}
74✔
152

153
void BufferReuseMarker::process_block(structured_control_flow::Block& block) {
232✔
154
    auto& dataflow = block.dataflow();
232✔
155

156
    // A kernel use of a container is an access node wired to a (non-offloading) code node.
157
    std::unordered_set<std::string> kernel_uses;
232✔
158
    for (auto& edge : dataflow.edges()) {
316✔
159
        const data_flow::AccessNode* access = nullptr;
316✔
160
        const data_flow::DataFlowNode* other = nullptr;
316✔
161
        if (auto* a = dynamic_cast<const data_flow::AccessNode*>(&edge.src())) {
316✔
162
            access = a;
158✔
163
            other = &edge.dst();
158✔
164
        } else if (auto* a = dynamic_cast<const data_flow::AccessNode*>(&edge.dst())) {
158✔
165
            access = a;
158✔
166
            other = &edge.src();
158✔
167
        }
158✔
168
        if (access == nullptr || dynamic_cast<const offloading::DataOffloadingNode*>(other) != nullptr) {
316✔
169
            continue;
152✔
170
        }
152✔
171
        if (dynamic_cast<const data_flow::CodeNode*>(other) != nullptr) {
164✔
172
            kernel_uses.insert(access->data());
164✔
173
        }
164✔
174
    }
164✔
175

176
    // Record pure allocs as pending, disqualify transfer buffers, and gather frees.
177
    std::vector<std::tuple<std::string, offloading::DataOffloadingNode*, const data_flow::AccessNode*>> frees;
232✔
178
    for (auto& n : dataflow.nodes()) {
548✔
179
        auto* off = dynamic_cast<offloading::DataOffloadingNode*>(&n);
548✔
180
        if (off == nullptr) {
548✔
181
            continue;
398✔
182
        }
398✔
183
        const auto* access = device_access(block, *off);
150✔
184
        if (access == nullptr) {
150✔
NEW
185
            continue;
×
NEW
186
        }
×
187
        const std::string& container = access->data();
150✔
188
        if (off->has_transfer()) {
150✔
189
            disqualified_.insert(container);
2✔
190
            unassigned_.erase(container);
2✔
191
        } else if (off->is_alloc()) {
148✔
192
            if (!disqualified_.count(container)) {
74✔
193
                unassigned_[container] = {&block, off, access};
74✔
194
            }
74✔
195
        } else if (off->is_free()) {
74✔
196
            frees.emplace_back(container, off, access);
74✔
197
        }
74✔
198
    }
150✔
199

200
    // Assign allocations that this block's kernels consume, then apply frees.
201
    for (const auto& container : kernel_uses) {
232✔
202
        assign(container);
86✔
203
    }
86✔
204
    for (auto& [container, node, access] : frees) {
232✔
205
        close(container, block, *node, access);
74✔
206
    }
74✔
207
}
232✔
208

209
DeviceBufferReusePass::DeviceBufferReusePass(bool consider_dataflow_branching)
210
    : consider_dataflow_branching_(consider_dataflow_branching) {}
34✔
211

NEW
212
std::string DeviceBufferReusePass::name() { return "DeviceBufferReuse"; }
×
213

214
void add_edge(std::vector<std::unordered_set<size_t>>& adjacency, size_t a, size_t b) {
8✔
215
    if (a == b) {
8✔
NEW
216
        return;
×
NEW
217
    }
×
218
    adjacency[a].insert(b);
8✔
219
    adjacency[b].insert(a);
8✔
220
}
8✔
221

222
// Control-flow ordering of two buffers: 1 if `a` is entirely before `b` (a's free dominates b's
223
// alloc), 2 if `b` is entirely before `a`, 0 if they are not totally ordered (overlap/divergent).
224
int ordering(const Candidate& a, const Candidate& b, analysis::DominanceAnalysis& dominance) {
138✔
225
    if (a.alloc_user == nullptr || a.free_user == nullptr || b.alloc_user == nullptr || b.free_user == nullptr) {
138✔
NEW
226
        return 0;
×
NEW
227
    }
×
228
    if (dominance.dominates(*a.free_user, *b.alloc_user)) {
138✔
229
        return 1;
74✔
230
    }
74✔
231
    if (dominance.dominates(*b.free_user, *a.alloc_user)) {
64✔
232
        return 2;
64✔
233
    }
64✔
NEW
234
    return 0;
×
235
}
64✔
236

237
// Whether a user lies within a candidate's live range [alloc, free].
238
bool in_range(const Candidate& c, analysis::User* user, analysis::DominanceAnalysis& dominance) {
18✔
239
    return dominance.dominates(*c.alloc_user, *user) && dominance.post_dominates(*c.free_user, *user);
18✔
240
};
18✔
241

242
// For a control-flow-ordered pair, decide whether a data dependency forces the later buffer's
243
// use to happen-after the earlier's. If so, the two are not independent dataflow branches.
244
bool data_serialized(
245
    const Candidate& earlier, const Candidate& later, analysis::Users& users, analysis::DominanceAnalysis& dominance
246
) {
10✔
247
    std::unordered_set<std::string> produced;
10✔
248
    for (auto* w : users.writes()) {
56✔
249
        const std::string& c = w->container();
56✔
250
        if (c == earlier.container || c == later.container) {
56✔
251
            continue;
42✔
252
        }
42✔
253
        if (in_range(earlier, w, dominance)) {
14✔
254
            produced.insert(c);
2✔
255
        }
2✔
256
    }
14✔
257
    if (produced.empty()) {
10✔
258
        return false;
8✔
259
    }
8✔
260
    for (auto* u : users.uses()) {
14✔
261
        const std::string& c = u->container();
14✔
262
        if (c == earlier.container || c == later.container) {
14✔
263
            continue;
10✔
264
        }
10✔
265
        if (produced.count(c) && in_range(later, u, dominance)) {
4✔
266
            return true;
2✔
267
        }
2✔
268
    }
4✔
NEW
269
    return false;
×
270
};
2✔
271

272
bool DeviceBufferReusePass::run_pass(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {
34✔
273
    auto& users = analysis_manager.get<analysis::Users>();
34✔
274
    auto& dominance = analysis_manager.get<analysis::DominanceAnalysis>();
34✔
275

276
    // =======================================================================================
277
    // MARK: build candidates and the interference graph in a single visitor traversal.
278
    //
279
    // The marker walks the SDFG in program order with a set of unassigned allocations. An
280
    // allocation becomes a candidate the moment a kernel using it is found, at which point it is
281
    // wired against every buffer currently live (overlapping live ranges interfere). Frees end a
282
    // live range directly. Buffers created in sibling if/else branches interfere too, since they
283
    // are not totally ordered.
284
    // =======================================================================================
285
    BufferReuseMarker marker(builder, analysis_manager, users);
34✔
286
    marker.visit();
34✔
287

288
    auto& candidates = marker.candidates;
34✔
289
    auto& adjacency = marker.adjacency;
34✔
290

291
    if (candidates.size() < 2) {
34✔
292
        return false;
2✔
293
    }
2✔
294

295
    // A buffer whose liveness boundaries could not be resolved is never safe to merge.
296
    for (size_t i = 0; i < candidates.size(); i++) {
104✔
297
        if (candidates[i].alloc_user == nullptr || candidates[i].free_user == nullptr) {
72✔
NEW
298
            for (size_t j = 0; j < candidates.size(); j++) {
×
NEW
299
                add_edge(adjacency, i, j);
×
NEW
300
            }
×
NEW
301
        }
×
302
    }
72✔
303

304
    // Dataflow branching: asynchronous kernels that are control-flow-ordered but exchange no data
305
    // may still run concurrently, so independent sequential buffers must not share storage. This is
306
    // the only interference that depends on data dependencies, so it augments the traversal graph
307
    // here, during marking, using the cached user analysis.
308
    if (consider_dataflow_branching_) {
32✔
309
        for (size_t i = 0; i < candidates.size(); i++) {
20✔
310
            for (size_t j = i + 1; j < candidates.size(); j++) {
24✔
311
                if (adjacency[i].count(j) || candidates[i].dtype != candidates[j].dtype) {
10✔
NEW
312
                    continue;
×
NEW
313
                }
×
314
                int ord = ordering(candidates[i], candidates[j], dominance);
10✔
315
                if (ord == 0) {
10✔
NEW
316
                    add_edge(adjacency, i, j);
×
NEW
317
                    continue;
×
NEW
318
                }
×
319
                const Candidate& earlier = (ord == 1) ? candidates[i] : candidates[j];
10✔
320
                const Candidate& later = (ord == 1) ? candidates[j] : candidates[i];
10✔
321
                if (!data_serialized(earlier, later, users, dominance)) {
10✔
322
                    add_edge(adjacency, i, j);
8✔
323
                }
8✔
324
            }
10✔
325
        }
14✔
326
    }
6✔
327

328
    // =======================================================================================
329
    // COLOR: greedy graph colouring, preferring larger allocations first.
330
    // =======================================================================================
331
    std::vector<size_t> order(candidates.size());
32✔
332
    for (size_t i = 0; i < order.size(); i++) {
104✔
333
        order[i] = i;
72✔
334
    }
72✔
335
    std::stable_sort(order.begin(), order.end(), [&](size_t i, size_t j) {
48✔
336
        return symbolic::is_true(symbolic::Gt(candidates[i].size, candidates[j].size));
48✔
337
    });
48✔
338

339
    std::vector<std::vector<size_t>> colors; // each colour is a list of candidate indices
32✔
340
    for (size_t idx : order) {
72✔
341
        bool placed = false;
72✔
342
        for (auto& members : colors) {
72✔
343
            if (candidates[members.front()].dtype != candidates[idx].dtype) {
42✔
344
                continue;
2✔
345
            }
2✔
346
            bool ok = true;
40✔
347
            for (size_t m : members) {
46✔
348
                if (adjacency[idx].count(m)) {
46✔
349
                    ok = false;
14✔
350
                    break;
14✔
351
                }
14✔
352
            }
46✔
353
            if (ok) {
40✔
354
                members.push_back(idx);
26✔
355
                placed = true;
26✔
356
                break;
26✔
357
            }
26✔
358
        }
40✔
359
        if (!placed) {
72✔
360
            colors.push_back({idx});
46✔
361
        }
46✔
362
    }
72✔
363

364
    // =======================================================================================
365
    // SWEEP: apply every merge in a single pass.
366
    // =======================================================================================
367
    bool changed = false;
32✔
368
    for (auto& members : colors) {
46✔
369
        if (members.size() < 2) {
46✔
370
            continue;
26✔
371
        }
26✔
372

373
        // The colouring order is only a heuristic: with symbolic sizes `Gt` is not a total order
374
        // (e.g. S vs. T compares false both ways), so members.front() is not guaranteed to be the
375
        // largest. It still serves as the shared allocation's name, but the surviving allocation
376
        // must be sized to the symbolic maximum over all members.
377
        const std::string representative = candidates[members.front()].container;
20✔
378
        symbolic::Expression max_size = candidates[members.front()].size;
20✔
379
        for (size_t m : members) {
46✔
380
            max_size = symbolic::max(max_size, candidates[m].size);
46✔
381
        }
46✔
382

383
        // Identify the earliest allocation and latest free in program order; these survive.
384
        size_t earliest = members.front();
20✔
385
        size_t latest = members.front();
20✔
386
        for (size_t m : members) {
46✔
387
            bool is_earliest = true;
46✔
388
            bool is_latest = true;
46✔
389
            for (size_t o : members) {
110✔
390
                if (o == m) {
110✔
391
                    continue;
46✔
392
                }
46✔
393
                if (ordering(candidates[m], candidates[o], dominance) != 1) {
64✔
394
                    is_earliest = false;
32✔
395
                }
32✔
396
                if (ordering(candidates[o], candidates[m], dominance) != 1) {
64✔
397
                    is_latest = false;
32✔
398
                }
32✔
399
            }
64✔
400
            if (is_earliest) {
46✔
401
                earliest = m;
20✔
402
            }
20✔
403
            if (is_latest) {
46✔
404
                latest = m;
20✔
405
            }
20✔
406
        }
46✔
407

408
        // Alias every member onto the representative container.
409
        for (size_t m : members) {
46✔
410
            if (candidates[m].container != representative) {
46✔
411
                builder.rename_container(candidates[m].container, representative);
26✔
412
            }
26✔
413
        }
46✔
414

415
        // Resize the surviving allocation to cover the largest member.
416
        candidates[earliest].alloc_node->set_size(max_size);
20✔
417

418
        // Drop the now-redundant allocations and frees, keeping the earliest alloc / latest free.
419
        for (size_t m : members) {
46✔
420
            if (m != earliest) {
46✔
421
                builder.remove_from_parent(*candidates[m].alloc_block);
26✔
422
            }
26✔
423
            if (m != latest) {
46✔
424
                builder.remove_from_parent(*candidates[m].free_block);
26✔
425
            }
26✔
426
        }
46✔
427

428
        changed = true;
20✔
429
    }
20✔
430

431
    return changed;
32✔
432
}
34✔
433

434
} // namespace passes
435
} // namespace sdfg
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc