daisytuner / docc / build 24348757857 (push, github, web-flow)

13 Apr 2026 02:25PM UTC coverage: 64.469% (+0.09%) from 64.382%

Merge pull request #676 from daisytuner/ellide-host-mem-mgmt

Elides H2D copies when the host data was freshly allocated and not yet initialized before the offload transfer. In that case, the offloaded malloc is enough.

(The host malloc itself is left in the graph; removing it is a task for DDE.)
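For illustration, here is a minimal host-side sketch of the pattern the pass targets (not from this repository; the function, buffer names, and explicit CUDA calls only stand in for the generated offloading nodes): the host buffer is freshly malloc'ed and never written before the transfer, so the H2D copy would only move indeterminate bytes and can be dropped, leaving just the device allocation.

// Illustrative sketch only.
#include <cstdlib>
#include <cuda_runtime.h>

void offload_example(std::size_t n) {
    double* host = static_cast<double*>(std::malloc(n * sizeof(double))); // fresh, never initialized
    double* dev = nullptr;
    cudaMalloc((void**) &dev, n * sizeof(double)); // the offloaded malloc is enough
    // cudaMemcpy(dev, host, n * sizeof(double), cudaMemcpyHostToDevice); // elided: would copy garbage
    // ... kernel writes dev; any later D2H transfer is handled separately ...
    cudaFree(dev);
    std::free(host); // the host malloc stays in the graph; DDE may remove it later
}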

104 of 125 new or added lines in 5 files covered. (83.2%)

1 existing line in 1 file now uncovered.

30553 of 47392 relevant lines covered (64.47%)

584.02 hits per line

Source File
/opt/src/passes/offloading/data_transfer_minimization_pass.cpp
#include "sdfg/passes/offloading/data_transfer_minimization_pass.h"

#include <cstddef>
#include <string>
#include <unordered_set>
#include <utility>

#include "sdfg/analysis/analysis.h"
#include "sdfg/analysis/data_transfer_elimination_analysis.h"
#include "sdfg/analysis/scope_analysis.h"
#include "sdfg/analysis/users.h"
#include "sdfg/data_flow/access_node.h"
#include "sdfg/data_flow/code_node.h"
#include "sdfg/data_flow/data_flow_graph.h"
#include "sdfg/data_flow/library_node.h"
#include "sdfg/data_flow/memlet.h"
#include "sdfg/data_flow/tasklet.h"
#include "sdfg/element.h"
#include "sdfg/exceptions.h"
#include "sdfg/helpers/helpers.h"
#include "sdfg/structured_control_flow/block.h"
#include "sdfg/structured_control_flow/control_flow_node.h"
#include "sdfg/structured_control_flow/sequence.h"
#include "sdfg/symbolic/symbolic.h"
#include "sdfg/targets/cuda/cuda_data_offloading_node.h"
#include "sdfg/targets/offloading/data_offloading_node.h"
#include "sdfg/targets/rocm/rocm_data_offloading_node.h"
#include "sdfg/types/pointer.h"
#include "sdfg/visitor/structured_sdfg_visitor.h"

namespace sdfg {
namespace passes {

DataTransferMinimizationPass::DataTransferMinimizationPass() {}

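// Drops the H2D copy that follows a fresh, never-initialized host allocation: the host-side
// access node and memlet are removed and the offload node is reduced to a plain device malloc.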
bool DataTransferMinimizationPass::eliminate_malloc_first_transfer(
    builder::StructuredSDFGBuilder& builder, analysis::OffloadHolder& malloc_holder, analysis::OffloadHolder& copy_in
) {
    // Get all relevant information
    std::string copy_in_device_container = copy_in.dev_data->data();
    DebugInfo copy_in_dst_debinfo = copy_in.dev_data->debug_info();

    // leave the malloc itself, because we have not proven yet that there is no more D2H transfer that needs it
    // DDE needs to be able to find it

    auto* h2d_block = dynamic_cast<structured_control_flow::Block*>(copy_in.offload_node->get_parent().get_parent());
    builder.remove_memlet(*h2d_block, *copy_in.host_access);
    builder.remove_node(*h2d_block, *copy_in.host_data);
    copy_in.remove_host_side();
    copy_in.offload_node->remove_h2d();

    return true;
}

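// Collapses a redundant D2H/H2D pair: the copy-in is always cleared; the copy-out is cleared
// only when the host data is not read afterwards (otherwise only its associated free is
// dropped). If the two transfers use different device containers, the device pointers are
// aliased via a reference memlet.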
bool DataTransferMinimizationPass::eliminate_transfer_pair(
    builder::StructuredSDFGBuilder& builder,
    analysis::OffloadHolder& copy_out,
    analysis::OffloadHolder& copy_in,
    bool remove_d2h
) {
    // Get all relevant information
    std::string copy_out_device_container = copy_out.dev_data->data();
    std::string copy_in_device_container = copy_in.dev_data->data();
    DebugInfo copy_out_src_debinfo = copy_out.dev_data->debug_info();
    DebugInfo copy_in_dst_debinfo = copy_in.dev_data->debug_info();

    // Remove what you can remove
    if (!remove_d2h && copy_out.offload_node->is_free()) {
        copy_out.offload_node->remove_free();
    } else if (remove_d2h) {
        auto* copy_out_block =
            dynamic_cast<structured_control_flow::Block*>(copy_out.offload_node->get_parent().get_parent());
        builder.clear_code_node_legacy(*copy_out_block, *copy_out.offload_node);
    }
    auto* copy_in_block = dynamic_cast<structured_control_flow::Block*>(copy_in.offload_node->get_parent().get_parent()
    );
    builder.clear_code_node_legacy(*copy_in_block, *copy_in.offload_node);

    // Maps the device pointers if necessary
    if (copy_out_device_container != copy_in_device_container) {
        auto& container_type = builder.subject().type(copy_out_device_container);
        auto ref_type = container_type.clone();
        auto& in_access = builder.add_access(*copy_in_block, copy_out_device_container, copy_out_src_debinfo);
        auto& out_access = builder.add_access(*copy_in_block, copy_in_device_container, copy_in_dst_debinfo);
        builder.add_reference_memlet(
            *copy_in_block,
            in_access,
            out_access,
            {symbolic::zero()},
            *ref_type,
            DebugInfo::merge(copy_out.offload_node->debug_info(), copy_in.offload_node->debug_info())
        );
    }

    return true;
}

bool DataTransferMinimizationPass::
    run_pass(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {
    analysis::DataTransferEliminationAnalysis transfer_analysis(builder.subject(), analysis_manager);
    transfer_analysis.run();

    int removed = 0;

    auto& useless_mallocs = transfer_analysis.empty_malloc_candidates();
    for (auto& [malloc_cand, first_h2d] : useless_mallocs) {
        auto& malloc_holder = *malloc_cand.offload;

        DEBUG_PRINTLN(
            "  Elim malloc: " << "#" << malloc_holder.malloc_node->element_id() << " -> "
                              << (malloc_holder.host_data ? malloc_holder.host_data->data() : "-") << " / "
                              << "h2d+malloc: #" << first_h2d.offload_node->element_id() << " "
                              << (first_h2d.host_data ? first_h2d.host_data->data() : "-") << " -> "
                              << first_h2d.dev_data->data()
        );

        bool success = eliminate_malloc_first_transfer(builder, malloc_holder, first_h2d);

        if (success) {
            ++removed;
        }
    }

    auto& transfer_reuse_candidates = transfer_analysis.transfer_reuse_candidates();
    auto& users = analysis_manager.get<analysis::Users>();

    for (auto& candidate : transfer_reuse_candidates) {
        auto reads = candidate.first.read_count;
        auto& copy_out = *candidate.first.offload;
        auto& copy_in = candidate.second;
        auto& copy_in_container = copy_in.host_data->data();

        // copy from legacy version as hack: checking for users after the copy_in container (because current analysis
        // stops looking at that point)
        // TODO unsafe: this does not cover all ways that still need the data on host. Safe is: only manage device-side
        // things here and let dead-data find the unused host stuff
        auto* read = users.get_user(
            copy_in.host_data->data(), const_cast<data_flow::AccessNode*>(copy_in.host_data), analysis::Use::READ
        );

        for (auto* after_use : users.all_uses_after(*read)) {
            if (after_use->container() == copy_in_container && after_use->use() == analysis::Use::READ &&
                after_use != read) {
                ++reads;
            }
        }

#ifndef NDEBUG
        std::cerr << "  Elim transfer "
                  << "copy-out: #" << copy_out.offload_node->element_id() << " " << copy_out.dev_data->data() << " -> "
                  << (copy_out.host_data ? copy_out.host_data->data() : "-") << " / ";
        if (reads) {
            std::cerr << reads << " reads / ";
        }
        std::cerr << "copy-in: #" << copy_in.offload_node->element_id() << " "
                  << (copy_in.host_data ? copy_in.host_data->data() : "-") << " -> " << copy_in.dev_data->data()
                  << std::endl;
#endif

        bool success = eliminate_transfer_pair(builder, copy_out, copy_in, reads == 0);

        if (success) {
            ++removed;
        }
    }

    return removed > 0;
}

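// Legacy implementation: a structured-SDFG visitor that scans each sequence directly for
// redundant copy-out (D2H) / copy-in (H2D) pairs and rewrites them in place.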
DataTransferMinimizationLegacy::
    DataTransferMinimizationLegacy(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager)
    : visitor::NonStoppingStructuredSDFGVisitor(builder, analysis_manager) {}

bool DataTransferMinimizationLegacy::visit() {
    DEBUG_PRINTLN("Running DataTransferMinimizationPass on " << this->builder_.subject().name());
    return visitor::NonStoppingStructuredSDFGVisitor::visit();
}

bool DataTransferMinimizationLegacy::accept(structured_control_flow::Sequence& sequence) {
    bool applied = false;
    offloading::DataOffloadingNode* copy_out = nullptr;
    structured_control_flow::Block* copy_out_block = nullptr;
    size_t copy_out_index = 0;

    // While a copy-out can be found:
    while (copy_out_index < sequence.size()) {
        // Find a new copy-out
        for (; copy_out_index < sequence.size(); copy_out_index++) {
            if (auto* block = dynamic_cast<structured_control_flow::Block*>(&sequence.at(copy_out_index).first)) {
                if (block->dataflow().library_nodes().size() == 1 && block->dataflow().tasklets().size() == 0) {
                    auto* libnode = *block->dataflow().library_nodes().begin();
                    if (auto* offloading_node = dynamic_cast<offloading::DataOffloadingNode*>(libnode)) {
                        if (offloading_node->is_d2h()) {
                            copy_out = offloading_node;
                            copy_out_block = block;
                            break;
                        }
                    }
                }
            }
        }

        // Find a matching copy-in
        size_t i;
        for (i = copy_out_index; i < sequence.size(); i++) {
            // Child must be a block
            auto* copy_in_block = dynamic_cast<structured_control_flow::Block*>(&sequence.at(i).first);
            if (!copy_in_block) {
                continue;
            }

            // Block must contain exactly one library node
            if (copy_in_block->dataflow().library_nodes().size() != 1 ||
                copy_in_block->dataflow().tasklets().size() != 0) {
                continue;
            }

            // Library node must be an offloading node
            auto* copy_in =
                dynamic_cast<offloading::DataOffloadingNode*>(*copy_in_block->dataflow().library_nodes().begin());
            if (!copy_in) {
                continue;
            }

            // Offloading node must be a copy-in
            if (!copy_in->is_h2d()) {
                continue;
            }

            // Copy-in and copy-out must be redundant
            if (!copy_out->redundant_with(*copy_in)) {
                continue;
            }

            // Get src and dst access nodes for copy-in & -out
            auto [copy_out_src, copy_out_dst] = this->get_src_and_dst(copy_out_block->dataflow(), copy_out);
            auto [copy_in_src, copy_in_dst] = this->get_src_and_dst(copy_in_block->dataflow(), copy_in);

            // Get the write and read users
            auto& users = this->analysis_manager_.get<analysis::Users>();
            analysis::User* write = users.get_user(copy_out_dst->data(), copy_out_dst, analysis::Use::WRITE);
            if (!write) {
                continue;
            }
            analysis::User* read = users.get_user(copy_in_src->data(), copy_in_src, analysis::Use::READ);
            if (!read) {
                continue;
            }

            if (copy_out_dst->data() == copy_in_src->data()) {
                // Ensure that the container is not written between the data transfer nodes
                bool used_between = false;
                for (auto* user : users.all_uses_between(*write, *read)) {
                    if (user->container() == copy_out_dst->data() && user->use() != analysis::Use::READ) {
                        used_between = true;
                        break;
                    }
                }
                if (used_between) {
                    continue;
                }
            } else {
                if (!this->check_container_dependency(
                        copy_out_block, copy_out_dst->data(), copy_in_block, copy_in_src->data()
                    )) {
                    continue;
                }
            }

            // Check whether the container is still read after the data transfer nodes
            bool read_after = false;
            for (auto* user : users.all_uses_after(*write)) {
                if (user->container() == copy_out_dst->data() && user->use() == analysis::Use::READ && user != read) {
                    read_after = true;
                    break;
                }
            }

            // Debug output
            DEBUG_PRINTLN(
                "  Eliminating " << (read_after ? "(" : "") << "copy-out: #" << copy_out->element_id() << " "
                                 << copy_out_src->data() << " -> " << copy_out_dst->data() << (read_after ? ")" : "")
                                 << " / copy-in: #" << copy_in->element_id() << " " << copy_in_src->data() << " -> "
                                 << copy_in_dst->data()
            );

            // Get all relevant information
            std::string copy_out_device_container = copy_out_src->data();
            std::string copy_in_device_container = copy_in_dst->data();
            DebugInfo copy_out_src_debinfo = copy_out_src->debug_info();
            DebugInfo copy_in_dst_debinfo = copy_in_dst->debug_info();

            // Remove the data transfers
            if (read_after && copy_out->is_free()) {
                copy_out->remove_free();
            } else if (!read_after) {
                this->builder_.clear_code_node_legacy(*copy_out_block, *copy_out);
            }
            this->builder_.clear_code_node_legacy(*copy_in_block, *copy_in);

            // Maps the device pointers if necessary
            if (copy_out_device_container != copy_in_device_container) {
                auto& container_type = this->builder_.subject().type(copy_out_device_container);
                auto ref_type = container_type.clone();
                auto& in_access =
                    this->builder_.add_access(*copy_in_block, copy_out_device_container, copy_out_src_debinfo);
                auto& out_access =
                    this->builder_.add_access(*copy_in_block, copy_in_device_container, copy_in_dst_debinfo);
                this->builder_.add_reference_memlet(
                    *copy_in_block,
                    in_access,
                    out_access,
                    {symbolic::zero()},
                    *ref_type,
                    DebugInfo::merge(copy_out->debug_info(), copy_in->debug_info())
                );
            }

            // Invalidate users analysis
            this->analysis_manager_.invalidate<analysis::Users>();
            applied = true;
            break;
        }

        // Skip if no matching copy-in was found
        if (i >= sequence.size()) {
            copy_out_index++;
        }
    }

    return applied;
}

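// Resolves the source and destination access nodes of an offloading node's data transfer
// (CUDA and ROCm nodes use the "_src"/"_dst" connectors).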
std::pair<data_flow::AccessNode*, data_flow::AccessNode*> DataTransferMinimizationLegacy::
    get_src_and_dst(data_flow::DataFlowGraph& dfg, offloading::DataOffloadingNode* offloading_node) {
    if (!offloading_node->has_transfer()) {
        throw InvalidSDFGException(
            "DataTransferMinimization: Cannot get copy access nodes for offloading node without data transfers"
        );
    }
    data_flow::AccessNode *src, *dst;
    if (dynamic_cast<cuda::CUDADataOffloadingNode*>(offloading_node)) {
        src = this->get_in_access(offloading_node, "_src");
        dst = this->get_out_access(offloading_node, "_dst");
    } else if (dynamic_cast<rocm::ROCMDataOffloadingNode*>(offloading_node)) {
        src = this->get_in_access(offloading_node, "_src");
        dst = this->get_out_access(offloading_node, "_dst");
    } else {
        throw InvalidSDFGException(
            "DataTransferMinimization: Unknown offloading node encountered: " + offloading_node->code().value()
        );
    }
    return {src, dst};
}

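// Returns the access node feeding the given input connector, or nullptr if none is attached.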
data_flow::AccessNode* DataTransferMinimizationLegacy::
    get_in_access(data_flow::CodeNode* node, const std::string& dst_conn) {
    auto& dfg = node->get_parent();
    for (auto& iedge : dfg.in_edges(*node)) {
        if (iedge.dst_conn() == dst_conn) {
            return dynamic_cast<data_flow::AccessNode*>(&iedge.src());
        }
    }
    return nullptr;
}

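// Returns the access node attached to the given output connector, or nullptr if none is found.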
data_flow::AccessNode* DataTransferMinimizationLegacy::
    get_out_access(data_flow::CodeNode* node, const std::string& src_conn) {
    auto& dfg = node->get_parent();
    for (auto& oedge : dfg.out_edges(*node)) {
        if (oedge.src_conn() == src_conn) {
            return static_cast<data_flow::AccessNode*>(&oedge.dst());
        }
    }
    return nullptr;
}

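// Heuristic check for transfers that use different containers: both blocks must sit in the
// same parent sequence, and the blocks in between must connect the copy-out captures to the
// copy-in parts by exactly one match.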
bool DataTransferMinimizationLegacy::check_container_dependency(
    structured_control_flow::Block* copy_out_block,
    const std::string& copy_out_container,
    structured_control_flow::Block* copy_in_block,
    const std::string& copy_in_container
) {
    // Simplification: Assume blocks are in the same sequence
    auto& scope_analysis = this->analysis_manager_.get<analysis::ScopeAnalysis>();
    auto* copy_out_block_parent = scope_analysis.parent_scope(copy_out_block);
    auto* copy_in_block_parent = scope_analysis.parent_scope(copy_in_block);
    auto* sequence = dynamic_cast<structured_control_flow::Sequence*>(copy_out_block_parent);
    if (copy_out_block_parent != copy_in_block_parent || !sequence) {
        return false;
    }

    std::unordered_set<std::string> copy_out_container_captures, copy_in_container_parts;
    size_t start = sequence->index(*copy_out_block);
    size_t stop = sequence->index(*copy_in_block);
    for (size_t i = start + 1; i < stop; i++) {
        auto* block = dynamic_cast<structured_control_flow::Block*>(&sequence->at(i).first);
        if (!block) {
            continue;
        }

        auto& dfg = block->dataflow();
        for (auto* access_node : dfg.data_nodes()) {
            if (access_node->data() == copy_in_container) {
                // Only allow constant assignments
                for (auto& iedge : dfg.in_edges(*access_node)) {
                    auto* tasklet = dynamic_cast<data_flow::Tasklet*>(&iedge.src());
                    if (!tasklet || tasklet->code() != data_flow::TaskletCode::assign) {
                        continue;
                    }

                    auto& iedge2 = *dfg.in_edges(*tasklet).begin();
                    if (!dynamic_cast<data_flow::ConstantNode*>(&iedge2.src())) {
                        return false;
                    }
                }

                // Collect H2D container parts
                for (auto& oedge : dfg.out_edges(*access_node)) {
                    if (oedge.type() != data_flow::MemletType::Reference) {
                        continue;
                    }

                    auto* access_node2 = dynamic_cast<data_flow::AccessNode*>(&oedge.dst());
                    if (!access_node2) {
                        continue;
                    }

                    copy_in_container_parts.insert(access_node2->data());
                }
            } else if (access_node->data() == copy_out_container) {
                // Collect D2H container captures
                for (auto& oedge : dfg.out_edges(*access_node)) {
                    if (oedge.type() != data_flow::MemletType::Dereference_Dst) {
                        continue;
                    }

                    auto* access_node2 = dynamic_cast<data_flow::AccessNode*>(&oedge.dst());
                    if (!access_node2) {
                        continue;
                    }

                    copy_out_container_captures.insert(access_node2->data());
                }
            }
        }
    }

    // Find all matches between captures and parts
    size_t matches = 0;
    for (auto& capture : copy_out_container_captures) {
        for (auto& part : copy_in_container_parts) {
            if (capture == part) {
                matches++;
            }
        }
    }

    return (matches == 1);
}

} // namespace passes
} // namespace sdfg