• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

daisytuner / docc / 24215882789

09 Apr 2026 10:12PM UTC coverage: 64.375% (-0.007%) from 64.382%
24215882789

Pull #668

github

web-flow
Merge 6f7f28e8f into bb3981349
Pull Request #668: Offload Memset to GPU

249 of 381 new or added lines in 18 files covered. (65.35%)

189 existing lines in 2 files now uncovered.

29942 of 46512 relevant lines covered (64.37%)

584.42 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.75
/opt/src/transformations/offloading/cublas_data_transfer_extraction.cpp
1
#include "sdfg/transformations/offloading/cublas_data_transfer_extraction.h"
2

3
#include <cassert>
4
#include <cstddef>
5
#include <nlohmann/json_fwd.hpp>
6
#include <string>
7
#include <unordered_map>
8

9
#include "sdfg/analysis/analysis.h"
10
#include "sdfg/analysis/scope_analysis.h"
11
#include "sdfg/builder/structured_sdfg_builder.h"
12
#include "sdfg/data_flow/access_node.h"
13
#include "sdfg/data_flow/library_nodes/math/blas/dot_node.h"
14
#include "sdfg/data_flow/library_nodes/math/blas/gemm_node.h"
15
#include "sdfg/data_flow/library_nodes/math/math.h"
16
#include "sdfg/element.h"
17
#include "sdfg/exceptions.h"
18
#include "sdfg/structured_control_flow/block.h"
19
#include "sdfg/structured_control_flow/sequence.h"
20
#include "sdfg/symbolic/symbolic.h"
21
#include "sdfg/targets/cuda/cuda.h"
22
#include "sdfg/targets/cuda/cuda_data_offloading_node.h"
23
#include "sdfg/transformations/transformation.h"
24
#include "sdfg/types/type.h"
25
#include "sdfg/types/utils.h"
26
#include "symengine/symengine_rcp.h"
27

28
namespace sdfg {
29
namespace cuda {
30

31
std::string CUBLASDataTransferExtraction::create_device_container(
32
    builder::StructuredSDFGBuilder& builder, const types::Pointer& type, const symbolic::Expression& size
33
) {
5✔
34
    auto new_type = type.clone();
5✔
35
    new_type->storage_type(types::StorageType(
5✔
36
        "NV_Generic", size, types::StorageType::AllocationType::Unmanaged, types::StorageType::AllocationType::Unmanaged
5✔
37
    ));
5✔
38
    auto device_container = builder.find_new_name(CUDA_DEVICE_PREFIX);
5✔
39
    builder.add_container(device_container, *new_type);
5✔
40
    return device_container;
5✔
41
}
5✔
42

43
void CUBLASDataTransferExtraction::create_allocate(
44
    builder::StructuredSDFGBuilder& builder,
45
    structured_control_flow::Sequence& sequence,
46
    structured_control_flow::Block& block,
47
    const std::string& device_container,
48
    const symbolic::Expression& size,
49
    const types::Pointer& type
50
) {
×
51
    auto& alloc_block = builder.add_block_before(sequence, block, {}, block.debug_info());
×
52
    auto& d_cont = builder.add_access(alloc_block, device_container);
×
53
    auto& alloc_node = builder.add_library_node<CUDADataOffloadingNode>(
×
54
        alloc_block,
×
55
        this->blas_node_.debug_info(),
×
56
        size,
×
57
        symbolic::zero(),
×
58
        offloading::DataTransferDirection::NONE,
×
59
        offloading::BufferLifecycle::ALLOC
×
60
    );
×
61
    builder.add_computational_memlet(alloc_block, alloc_node, "_ret", d_cont, {}, type);
×
62
}
×
63

64
void CUBLASDataTransferExtraction::create_deallocate(
65
    builder::StructuredSDFGBuilder& builder,
66
    structured_control_flow::Sequence& sequence,
67
    structured_control_flow::Block& block,
68
    const std::string& device_container,
69
    const types::Pointer& type
70
) {
4✔
71
    auto& dealloc_block = builder.add_block_after(sequence, block, {}, block.debug_info());
4✔
72
    auto& d_cont_in = builder.add_access(dealloc_block, device_container);
4✔
73
    auto& d_cont_out = builder.add_access(dealloc_block, device_container);
4✔
74
    auto& dealloc_node = builder.add_library_node<CUDADataOffloadingNode>(
4✔
75
        dealloc_block,
4✔
76
        this->blas_node_.debug_info(),
4✔
77
        SymEngine::null,
4✔
78
        symbolic::zero(),
4✔
79
        offloading::DataTransferDirection::NONE,
4✔
80
        offloading::BufferLifecycle::FREE
4✔
81
    );
4✔
82
    builder.add_computational_memlet(dealloc_block, d_cont_in, dealloc_node, "_ptr", {}, type);
4✔
83
    builder.add_computational_memlet(dealloc_block, dealloc_node, "_ptr", d_cont_out, {}, type);
4✔
84
}
4✔
85

86
void CUBLASDataTransferExtraction::create_copy_to_device(
87
    builder::StructuredSDFGBuilder& builder,
88
    structured_control_flow::Sequence& sequence,
89
    structured_control_flow::Block& block,
90
    const std::string& host_container,
91
    const std::string& device_container,
92
    const symbolic::Expression& size,
93
    const types::Pointer& type
94
) {
×
95
    auto& copy_block = builder.add_block_before(sequence, block, {}, block.debug_info());
×
96
    auto& cont = builder.add_access(copy_block, host_container);
×
97
    auto& d_cont = builder.add_access(copy_block, device_container);
×
98
    auto& copy_node = builder.add_library_node<CUDADataOffloadingNode>(
×
99
        copy_block,
×
100
        this->blas_node_.debug_info(),
×
101
        size,
×
102
        symbolic::zero(),
×
103
        offloading::DataTransferDirection::H2D,
×
104
        offloading::BufferLifecycle::NO_CHANGE
×
105
    );
×
106
    builder.add_computational_memlet(copy_block, cont, copy_node, "_src", {}, type);
×
107
    builder.add_computational_memlet(copy_block, copy_node, "_dst", d_cont, {}, type);
×
108
}
×
109

110
void CUBLASDataTransferExtraction::create_copy_from_device(
111
    builder::StructuredSDFGBuilder& builder,
112
    structured_control_flow::Sequence& sequence,
113
    structured_control_flow::Block& block,
114
    const std::string& host_container,
115
    const std::string& device_container,
116
    const symbolic::Expression& size,
117
    const types::Pointer& type
118
) {
×
119
    auto& copy_block = builder.add_block_after(sequence, block, {}, block.debug_info());
×
120
    auto& cont = builder.add_access(copy_block, host_container);
×
121
    auto& d_cont = builder.add_access(copy_block, device_container);
×
122
    auto& copy_node = builder.add_library_node<CUDADataOffloadingNode>(
×
123
        copy_block,
×
124
        this->blas_node_.debug_info(),
×
125
        size,
×
126
        symbolic::zero(),
×
127
        offloading::DataTransferDirection::D2H,
×
128
        offloading::BufferLifecycle::NO_CHANGE
×
129
    );
×
130
    builder.add_computational_memlet(copy_block, d_cont, copy_node, "_src", {}, type);
×
131
    builder.add_computational_memlet(copy_block, copy_node, "_dst", cont, {}, type);
×
132
}
×
133

134
void CUBLASDataTransferExtraction::create_copy_to_device_with_allocation(
135
    builder::StructuredSDFGBuilder& builder,
136
    structured_control_flow::Sequence& sequence,
137
    structured_control_flow::Block& block,
138
    const std::string& host_container,
139
    const std::string& device_container,
140
    const symbolic::Expression& size,
141
    const types::Pointer& type
142
) {
5✔
143
    auto& copy_block = builder.add_block_before(sequence, block, {}, block.debug_info());
5✔
144
    auto& cont = builder.add_access(copy_block, host_container);
5✔
145
    auto& d_cont = builder.add_access(copy_block, device_container);
5✔
146
    auto& copy_node = builder.add_library_node<CUDADataOffloadingNode>(
5✔
147
        copy_block,
5✔
148
        this->blas_node_.debug_info(),
5✔
149
        size,
5✔
150
        symbolic::zero(),
5✔
151
        offloading::DataTransferDirection::H2D,
5✔
152
        offloading::BufferLifecycle::ALLOC
5✔
153
    );
5✔
154
    builder.add_computational_memlet(copy_block, cont, copy_node, "_src", {}, type);
5✔
155
    builder.add_computational_memlet(copy_block, copy_node, "_dst", d_cont, {}, type);
5✔
156
}
5✔
157

158
void CUBLASDataTransferExtraction::create_copy_from_device_with_deallocation(
159
    builder::StructuredSDFGBuilder& builder,
160
    structured_control_flow::Sequence& sequence,
161
    structured_control_flow::Block& block,
162
    const std::string& host_container,
163
    const std::string& device_container,
164
    const symbolic::Expression& size,
165
    const types::Pointer& type
166
) {
1✔
167
    auto& copy_block = builder.add_block_after(sequence, block, {}, block.debug_info());
1✔
168
    auto& cont = builder.add_access(copy_block, host_container);
1✔
169
    auto& d_cont = builder.add_access(copy_block, device_container);
1✔
170
    auto& copy_node = builder.add_library_node<CUDADataOffloadingNode>(
1✔
171
        copy_block,
1✔
172
        this->blas_node_.debug_info(),
1✔
173
        size,
1✔
174
        symbolic::zero(),
1✔
175
        offloading::DataTransferDirection::D2H,
1✔
176
        offloading::BufferLifecycle::FREE
1✔
177
    );
1✔
178
    builder.add_computational_memlet(copy_block, d_cont, copy_node, "_src", {}, type);
1✔
179
    builder.add_computational_memlet(copy_block, copy_node, "_dst", cont, {}, type);
1✔
180
}
1✔
181

182
// Constructs the transformation for a specific BLAS library node; the node
// reference is stored and must outlive this transformation object.
CUBLASDataTransferExtraction::CUBLASDataTransferExtraction(math::blas::BLASNode& blas_node) : blas_node_(blas_node) {}
183

184
// Unique transformation identifier, used e.g. in serialized JSON descriptions.
std::string CUBLASDataTransferExtraction::name() const { return "CUBLASDataTransferExtraction"; }
185

186
bool CUBLASDataTransferExtraction::
187
    can_be_applied(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {
6✔
188
    // BLAS node must have implementation type CUBLAS without data transfers
189
    if (this->blas_node_.implementation_type().value() != cuda::ImplementationType_CUDAWithTransfers.value()) {
6✔
190
        return false;
2✔
191
    }
2✔
192

193

194
    // Restrict to BLAS nodes in their own block
195
    auto& dfg = this->blas_node_.get_parent();
4✔
196
    if (dfg.nodes().size() != dfg.in_degree(this->blas_node_) + dfg.out_degree(this->blas_node_) + 1) {
4✔
197
        return false;
×
198
    }
×
199

200
    // Supported BLAS nodes
201
    if (dynamic_cast<math::blas::DotNode*>(&this->blas_node_)) {
4✔
202
        return true;
2✔
203
    } else if (dynamic_cast<math::blas::GEMMNode*>(&this->blas_node_)) {
2✔
204
        return true;
2✔
205
    } else {
2✔
206
        return false;
×
207
    }
×
208
}
4✔
209

210
// Rewrites the BLAS node so its operands live in device memory: creates
// device containers, inserts explicit H2D/D2H transfer (and alloc/free)
// blocks around the BLAS block, redirects the node's access nodes to the
// device containers, and finally switches the implementation type to the
// CUDA-without-transfers variant. Throws InvalidSDFGException for
// unsupported precisions or BLAS node kinds.
void CUBLASDataTransferExtraction::
    apply(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {
    // Get data flow graph and block
    auto& dfg = this->blas_node_.get_parent();
    auto* block = dynamic_cast<structured_control_flow::Block*>(dfg.get_parent());
    assert(block);

    // Get sequence (the parent scope the new blocks are inserted into)
    auto& scope_analysis = analysis_manager.get<analysis::ScopeAnalysis>();
    auto* sequence = dynamic_cast<structured_control_flow::Sequence*>(scope_analysis.parent_scope(block));
    assert(sequence);

    // Determine type: map the BLAS precision letter (h/s/d) to the
    // corresponding scalar primitive, then form a pointer type to it.
    types::PrimitiveType precision;
    switch (this->blas_node_.precision()) {
        case math::blas::h:
            precision = types::PrimitiveType::Half;
            break;
        case math::blas::s:
            precision = types::PrimitiveType::Float;
            break;
        case math::blas::d:
            precision = types::PrimitiveType::Double;
            break;
        default:
            throw InvalidSDFGException("CUBLASDataTransferExtraction: Unsupported precision");
    }
    types::Scalar base_type(precision);
    types::Pointer type(base_type);

    // Capture in and out accesses, keyed by the BLAS node's connector names
    // (e.g. "__x", "__A"), so operands can be redirected below.
    std::unordered_map<std::string, data_flow::AccessNode&> in_access, out_access;
    for (auto& iedge : dfg.in_edges(this->blas_node_)) {
        in_access.insert({iedge.dst_conn(), static_cast<data_flow::AccessNode&>(iedge.src())});
    }
    for (auto& oedge : dfg.out_edges(this->blas_node_)) {
        out_access.insert({oedge.src_conn(), static_cast<data_flow::AccessNode&>(oedge.dst())});
    }

    if (auto* dot_node = dynamic_cast<math::blas::DotNode*>(&this->blas_node_)) {
        // DOT: buffer sizes follow the BLAS convention
        // ((n - 1) * inc + 1) elements, scaled by the element byte size.
        auto x_size = symbolic::mul(
            symbolic::add(symbolic::mul(symbolic::sub(dot_node->n(), symbolic::one()), dot_node->incx()), symbolic::one()),
            types::get_contiguous_element_size(type, true)
        );
        auto y_size = symbolic::mul(
            symbolic::add(symbolic::mul(symbolic::sub(dot_node->n(), symbolic::one()), dot_node->incy()), symbolic::one()),
            types::get_contiguous_element_size(type, true)
        );
        auto dx = this->create_device_container(builder, type, x_size);
        auto dy = this->create_device_container(builder, type, y_size);

        // Allocate + copy both input vectors to the device before the block.
        this->create_copy_to_device_with_allocation(
            builder, *sequence, *block, in_access.at("__x").data(), dx, x_size, type
        );
        this->create_copy_to_device_with_allocation(
            builder, *sequence, *block, in_access.at("__y").data(), dy, y_size, type
        );

        // Free both device buffers after the block (no D2H copy here:
        // the vectors are read-only inputs of DOT).
        this->create_deallocate(builder, *sequence, *block, dx, type);
        this->create_deallocate(builder, *sequence, *block, dy, type);

        // Redirect the BLAS node's accesses to the device containers.
        in_access.at("__x").data(dx);
        in_access.at("__y").data(dy);
    } else if (auto* gemm_node = dynamic_cast<math::blas::GEMMNode*>(&this->blas_node_)) {
        // GEMM: A is m x k, B is k x n, C is m x n, in bytes.
        auto elem_size = types::get_contiguous_element_size(type, true);
        auto a_size = symbolic::mul(symbolic::mul(gemm_node->m(), gemm_node->k()), elem_size);
        auto b_size = symbolic::mul(symbolic::mul(gemm_node->k(), gemm_node->n()), elem_size);
        auto c_size = symbolic::mul(symbolic::mul(gemm_node->m(), gemm_node->n()), elem_size);

        auto dA = this->create_device_container(builder, type, a_size);
        auto dB = this->create_device_container(builder, type, b_size);
        auto dC = this->create_device_container(builder, type, c_size);

        // Allocate + copy all three operands to the device (C is also an
        // input because GEMM computes C = alpha*A*B + beta*C).
        this->create_copy_to_device_with_allocation(
            builder, *sequence, *block, in_access.at("__A").data(), dA, a_size, type
        );
        this->create_copy_to_device_with_allocation(
            builder, *sequence, *block, in_access.at("__B").data(), dB, b_size, type
        );
        this->create_copy_to_device_with_allocation(
            builder, *sequence, *block, in_access.at("__C").data(), dC, c_size, type
        );

        // Copy the result C back to the host (freeing dC), then free the
        // read-only inputs dA and dB.
        this->create_copy_from_device_with_deallocation(
            builder, *sequence, *block, out_access.at("__C").data(), dC, c_size, type
        );
        this->create_deallocate(builder, *sequence, *block, dA, type);
        this->create_deallocate(builder, *sequence, *block, dB, type);

        // Redirect the BLAS node's accesses to the device containers.
        in_access.at("__A").data(dA);
        in_access.at("__B").data(dB);
        in_access.at("__C").data(dC);
        out_access.at("__C").data(dC);
    } else {
        // Guarded by can_be_applied(); kept as a hard error for safety.
        throw InvalidSDFGException("CUBLASDataTransferExtraction: Unsupported BLAS type");
    }

    // Change the implementation type to CUBLAS without data transfers
    this->blas_node_.implementation_type() = cuda::ImplementationType_CUDAWithoutTransfers;
}
310

311
void CUBLASDataTransferExtraction::to_json(nlohmann::json& j) const {
    j["transformation_type"] = this->name();

    // BLAS nodes are not loops; they appear as generic elements in GNN data.
    // Use type "unknown" to match the feature extractor's classification.
    nlohmann::json node_desc;
    node_desc["element_id"] = this->blas_node_.element_id();
    node_desc["type"] = "unknown";
    j["subgraph"] = {{"0", node_desc}};

    // Legacy field for backward compatibility.
    j["blas_node_element_id"] = this->blas_node_.element_id();
}
321

322
CUBLASDataTransferExtraction CUBLASDataTransferExtraction::
    from_json(builder::StructuredSDFGBuilder& builder, const nlohmann::json& j) {
    // New format stores the node under subgraph/"0"; fall back to the legacy
    // flat field for older descriptions.
    size_t element_id = 0;
    if (j.contains("subgraph")) {
        element_id = j.at("subgraph").at("0").at("element_id").get<size_t>();
    } else {
        element_id = j.at("blas_node_element_id").get<size_t>();
    }

    // Resolve the element and verify it is actually a BLAS node.
    auto* element = builder.find_element_by_id(element_id);
    if (element == nullptr) {
        throw transformations::
            InvalidTransformationDescriptionException("Element with ID " + std::to_string(element_id) + " not found");
    }
    auto* blas_node = dynamic_cast<math::blas::BLASNode*>(element);
    if (blas_node == nullptr) {
        throw transformations::InvalidTransformationDescriptionException(
            "Element with ID " + std::to_string(element_id) + " is not a BLASNode"
        );
    }

    return CUBLASDataTransferExtraction(*blas_node);
}
345

346
} // namespace cuda
347
} // namespace sdfg
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc