• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

daisytuner / docc / 28106147644

24 Jun 2026 02:32PM UTC coverage: 61.922% (+0.1%) from 61.779%
28106147644

Pull #806

github

web-flow
Merge 2be414d54 into 57cc1db99
Pull Request #806: Map Collapse for Multiple targets in a nested sequence

165 of 185 new or added lines in 2 files covered. (89.19%)

419 existing lines in 30 files now uncovered.

37705 of 60891 relevant lines covered (61.92%)

1004.4 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.57
/opt/src/transformations/out_local_storage.cpp
1
#include "sdfg/transformations/out_local_storage.h"
2

3
#include <algorithm>
4
#include <cstddef>
5
#include <functional>
6
#include <string>
7

8
#include "sdfg/analysis/memory_layout_analysis.h"
9
#include "sdfg/analysis/users.h"
10
#include "sdfg/builder/structured_sdfg_builder.h"
11
#include "sdfg/data_flow/access_node.h"
12
#include "sdfg/data_flow/library_nodes/barrier_local_node.h"
13
#include "sdfg/data_flow/memlet.h"
14
#include "sdfg/passes/structured_control_flow/dead_cfg_elimination.h"
15
#include "sdfg/passes/structured_control_flow/sequence_fusion.h"
16
#include "sdfg/structured_control_flow/if_else.h"
17
#include "sdfg/structured_control_flow/sequence.h"
18
#include "sdfg/structured_control_flow/structured_loop.h"
19
#include "sdfg/symbolic/symbolic.h"
20
#include "sdfg/targets/gpu/gpu_schedule_type.h"
21
#include "sdfg/types/array.h"
22
#include "sdfg/types/pointer.h"
23
#include "sdfg/types/scalar.h"
24

25
namespace sdfg {
26
namespace transformations {
27

28
// Constructs the transformation for one loop / access-node pair.
//
// Stores references to `loop` and `access_node`, remembers the name of the
// container the access node refers to (`access_node.data()`), and keeps the
// requested storage type for the local buffer. No analysis is performed here.
OutLocalStorage::OutLocalStorage(
    structured_control_flow::StructuredLoop& loop,
    const data_flow::AccessNode& access_node,
    const types::StorageType& storage_type
)
    : loop_(loop), access_node_(access_node), container_(access_node.data()), storage_type_(storage_type) {}
41✔
34

35
// Returns the fixed identifier of this transformation.
std::string OutLocalStorage::name() const { return "OutLocalStorage"; }
10✔
36

37
bool OutLocalStorage::can_be_applied(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {
37✔
38
    auto& sdfg = builder.subject();
37✔
39
    auto& body = this->loop_.root();
37✔
40

41
    tile_info_ = TileInfo{};
37✔
42

43
    // Criterion: Container must exist and is pointer
44
    if (!sdfg.exists(this->container_)) {
37✔
UNCOV
45
        return false;
×
46
    }
×
47
    auto& type = sdfg.type(this->container_);
37✔
48
    if (type.type_id() != types::TypeID::Pointer) {
37✔
UNCOV
49
        return false;
×
50
    }
×
51

52
    // Criterion: Container must be used in the loop body
53
    auto& users = analysis_manager.get<analysis::Users>();
37✔
54
    analysis::UsersView body_users(users, body);
37✔
55
    if (body_users.uses(this->container_).empty()) {
37✔
56
        return false;
2✔
57
    }
2✔
58

59
    // Criterion: Container must have writes (this is OutLocalStorage, not InLocalStorage)
60
    if (body_users.writes(this->container_).empty()) {
35✔
61
        return false;
1✔
62
    }
1✔
63

64
    // Determine if container is also read (read-write vs write-only)
65
    tile_info_.has_read = !body_users.reads(this->container_).empty();
34✔
66

67
    auto& mla = analysis_manager.get<analysis::MemoryLayoutAnalysis>();
34✔
68

69
    // Find a representative memlet from the access node to identify its group.
70
    // An access node may have multiple edges belonging to different tile groups.
71
    // We iterate all edges and select the first one whose tile group is valid
72
    // at the target loop level.
73
    const analysis::MemoryTileGroup* group = nullptr;
34✔
74
    auto& dfg = access_node_.get_parent();
34✔
75
    for (auto& memlet : dfg.in_edges(access_node_)) {
34✔
76
        auto* candidate = mla.tile_group_for(loop_, memlet);
27✔
77
        if (candidate) {
27✔
78
            group = candidate;
27✔
79
            break;
27✔
80
        }
27✔
81
    }
27✔
82
    if (!group) {
34✔
83
        for (auto& memlet : dfg.out_edges(access_node_)) {
7✔
84
            auto* candidate = mla.tile_group_for(loop_, memlet);
7✔
85
            if (candidate) {
7✔
86
                group = candidate;
7✔
87
                break;
7✔
88
            }
7✔
89
        }
7✔
90
    }
7✔
91
    if (!group) {
34✔
UNCOV
92
        return false;
×
93
    }
×
94

95
    auto& tile = group->tile;
34✔
96

97
    // Store group memlets for use in apply()
98
    group_memlets_.clear();
34✔
99
    group_memlets_.insert(group->memlets.begin(), group->memlets.end());
34✔
100

101
    // Get overapproximated extents (integer upper bounds)
102
    auto extents = tile.extents_approx();
34✔
103
    if (extents.empty()) {
34✔
UNCOV
104
        return false;
×
105
    }
×
106
    for (auto& ext : extents) {
56✔
107
        if (ext.is_null()) {
56✔
UNCOV
108
            return false;
×
109
        }
×
110
    }
56✔
111

112
    // Store tile info (before substitution, bases/strides stay symbolic)
113
    tile_info_.dimensions = extents;
34✔
114
    tile_info_.bases = tile.min_subset;
34✔
115
    tile_info_.strides = std::vector<symbolic::Expression>(tile.layout.strides().begin(), tile.layout.strides().end());
34✔
116
    tile_info_.offset = tile.layout.offset();
34✔
117

118
    // GPU shared memory: resolve symbolic extents using GPU block sizes and
119
    // require at least one cooperative dimension
120
    if (storage_type_.is_nv_shared()) {
34✔
121
        auto ancestors = ControlFlowNode::parent_chain(loop_);
6✔
122

123
        // Build substitution map: symbolic GPU map bounds → integer block sizes
124
        for (auto* node : ancestors) {
26✔
125
            if (auto* ancestor_map = dynamic_cast<structured_control_flow::Map*>(node)) {
26✔
126
                if (!gpu::is_gpu_schedule(ancestor_map->schedule_type())) {
10✔
UNCOV
127
                    continue;
×
128
                }
×
129
                auto block_size = gpu::gpu_block_size(ancestor_map->schedule_type());
10✔
130
                // Extract symbolic bound from condition: Lt(indvar, BOUND)
131
                auto condition = ancestor_map->condition();
10✔
132
                if (SymEngine::is_a<SymEngine::StrictLessThan>(*condition)) {
10✔
133
                    auto stl = SymEngine::rcp_static_cast<const SymEngine::StrictLessThan>(condition);
10✔
134
                    auto rhs = stl->get_args()[1];
10✔
135
                    auto iter_count = symbolic::sub(rhs, ancestor_map->init());
10✔
136
                    if (!SymEngine::is_a<SymEngine::Integer>(*iter_count)) {
10✔
137
                        // Symbolic bound — substitute with block size in extents and bases
138
                        for (auto& ext : tile_info_.dimensions) {
17✔
139
                            ext = symbolic::simplify(symbolic::subs(ext, iter_count, block_size));
17✔
140
                        }
17✔
141
                        for (auto& base : tile_info_.bases) {
17✔
142
                            base = symbolic::simplify(symbolic::subs(base, iter_count, block_size));
17✔
143
                        }
17✔
144
                    }
10✔
145
                }
10✔
146
            }
10✔
147
        }
26✔
148

149
        // Criterion: All extents must now be provably integer
150
        for (auto& ext : tile_info_.dimensions) {
10✔
151
            if (!SymEngine::is_a<SymEngine::Integer>(*ext)) {
10✔
152
                return false;
2✔
153
            }
2✔
154
        }
10✔
155

156
        // Criterion: At least one cooperative dimension
157
        bool has_cooperative_dim = false;
4✔
158
        for (auto* node : ancestors) {
12✔
159
            if (auto* ancestor_map = dynamic_cast<structured_control_flow::Map*>(node)) {
12✔
160
                if (!gpu::is_gpu_schedule(ancestor_map->schedule_type())) {
6✔
UNCOV
161
                    continue;
×
162
                }
×
163
                bool appears_in_bases = false;
6✔
164
                for (auto& base : tile_info_.bases) {
9✔
165
                    if (symbolic::uses(base, ancestor_map->indvar())) {
9✔
166
                        appears_in_bases = true;
2✔
167
                        break;
2✔
168
                    }
2✔
169
                }
9✔
170
                if (!appears_in_bases) {
6✔
171
                    has_cooperative_dim = true;
4✔
172
                    break;
4✔
173
                }
4✔
174
            }
6✔
175
        }
12✔
176
        if (!has_cooperative_dim) {
4✔
UNCOV
177
            return false;
×
178
        }
×
179
    } else {
28✔
180
        // CPU path: All extents must be provably integer
181
        for (auto& ext : tile_info_.dimensions) {
46✔
182
            if (!SymEngine::is_a<SymEngine::Integer>(*ext)) {
46✔
UNCOV
183
                return false;
×
184
            }
×
185
        }
46✔
186
    }
28✔
187

188
    return true;
32✔
189
}
34✔
190

191
void OutLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {
28✔
192
    auto& sdfg = builder.subject();
28✔
193
    auto& users = analysis_manager.get<analysis::Users>();
28✔
194

195
    auto parent_node = loop_.get_parent();
28✔
196
    auto parent = dynamic_cast<structured_control_flow::Sequence*>(parent_node);
28✔
197
    if (!parent) {
28✔
UNCOV
198
        throw InvalidSDFGException("OutLocalStorage: Parent of loop must be a Sequence!");
×
199
    }
×
200

201
    // Get type information.
202
    auto* memlet = *group_memlets_.begin();
28✔
203
    types::Scalar scalar_type(memlet->base_type().primitive_type());
28✔
204
    types::Pointer pointer_type(scalar_type);
28✔
205

206
    // Create local buffer name
207
    local_name_ = builder.find_new_name("__daisy_out_local_storage_" + this->container_);
28✔
208

209

210
    // Collect varying dimensions (extent > 1) and their sizes.
211
    // Extent-1 dimensions are degenerate (no loop is needed) and must be
212
    // skipped when sizing the buffer, when creating copy indvars, and when
213
    // linearizing into the local buffer.  The bookkeeping must match what
214
    // `build_original_subset` expects (it indexes copy_indices by varying
215
    // dimension only).
216
    std::vector<size_t> varying_dims;
28✔
217
    std::vector<symbolic::Expression> varying_dim_sizes;
28✔
218
    for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
76✔
219
        auto& dim_size = tile_info_.dimensions.at(d);
48✔
220
        if (!symbolic::eq(dim_size, symbolic::integer(1))) {
48✔
221
            varying_dims.push_back(d);
30✔
222
            varying_dim_sizes.push_back(dim_size);
30✔
223
        }
30✔
224
    }
48✔
225

226
    // GPU classification: each ancestor GPU Map is either
227
    //  - per-thread (its Map indvar appears in tile.bases — each thread sees a
228
    //    distinct slice along that dim, so the shared buffer gets its own
229
    //    per-thread slot indexed by the within-block thread_idx), or
230
    //  - cooperative (Map indvar not in bases — all threads along that dim
231
    //    cooperatively load/store the same shared tile, strided by thread_idx).
232
    struct GpuDim {
28✔
233
        gpu::GPUDimension dim;
28✔
234
        symbolic::Symbol map_indvar; // global thread index (== thread_idx + blockIdx * blockDim)
28✔
235
        symbolic::Symbol thread_idx; // within-block thread index (NV_Symbol)
28✔
236
        symbolic::Integer block_size;
28✔
237
        bool is_per_thread;
28✔
238
    };
28✔
239
    std::vector<GpuDim> per_thread_dims; // populated only on GPU path
28✔
240
    std::vector<GpuDim> coop_dims; // populated only on GPU path
28✔
241
    bool is_rocm = false;
28✔
242

243
    if (storage_type_.is_nv_shared()) {
28✔
244
        auto ancestors = ControlFlowNode::parent_chain(loop_);
4✔
245
        for (auto* node : ancestors) {
20✔
246
            auto* m = dynamic_cast<structured_control_flow::Map*>(node);
20✔
247
            if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue;
20✔
248
            if (m->schedule_type().value() == "ROCM") {
8✔
UNCOV
249
                is_rocm = true;
×
250
                break;
×
251
            }
×
252
        }
8✔
253
        const std::string prefix = is_rocm ? "__daisy_hip_thread_idx_" : "__daisy_cuda_thread_idx_";
4✔
254
        auto suffix = [](gpu::GPUDimension d) -> std::string {
8✔
255
            switch (d) {
8✔
256
                case gpu::GPUDimension::X:
4✔
257
                    return "x";
4✔
258
                case gpu::GPUDimension::Y:
4✔
259
                    return "y";
4✔
UNCOV
260
                case gpu::GPUDimension::Z:
×
261
                    return "z";
×
262
            }
8✔
UNCOV
263
            return "?";
×
264
        };
8✔
265
        for (auto* node : ancestors) {
20✔
266
            auto* m = dynamic_cast<structured_control_flow::Map*>(node);
20✔
267
            if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue;
20✔
268
            GpuDim gd;
8✔
269
            gd.dim = gpu::gpu_dimension(m->schedule_type());
8✔
270
            gd.map_indvar = m->indvar();
8✔
271
            gd.thread_idx = symbolic::symbol(prefix + suffix(gd.dim));
8✔
272
            gd.block_size = gpu::gpu_block_size(m->schedule_type());
8✔
273
            gd.is_per_thread = false;
8✔
274
            for (auto& base : tile_info_.bases) {
11✔
275
                if (symbolic::uses(base, m->indvar())) {
11✔
276
                    gd.is_per_thread = true;
3✔
277
                    break;
3✔
278
                }
3✔
279
            }
11✔
280
            (gd.is_per_thread ? per_thread_dims : coop_dims).push_back(gd);
8✔
281
        }
8✔
282
        auto by_dim = [](const GpuDim& a, const GpuDim& b) {
4✔
283
            return static_cast<int>(a.dim) < static_cast<int>(b.dim);
1✔
284
        };
1✔
285
        std::sort(per_thread_dims.begin(), per_thread_dims.end(), by_dim);
4✔
286
        std::sort(coop_dims.begin(), coop_dims.end(), by_dim);
4✔
287

288
        // Ensure within-block thread_idx containers exist. Codegen recognises
289
        // NV_Symbol-typed scalars and substitutes them with threadIdx.{x,y,z}
290
        // (CUDA) or the ROCm equivalent at emission time.
291
        auto ensure_idx = [&](const symbolic::Symbol& sym) {
8✔
292
            if (!sdfg.exists(sym->get_name())) {
8✔
293
                types::Scalar idx_type(types::PrimitiveType::Int32);
8✔
294
                idx_type.storage_type(types::StorageType::NV_Symbol());
8✔
295
                builder.add_container(sym->get_name(), idx_type);
8✔
296
            }
8✔
297
        };
8✔
298
        for (auto& gd : per_thread_dims) ensure_idx(gd.thread_idx);
4✔
299
        for (auto& gd : coop_dims) ensure_idx(gd.thread_idx);
5✔
300
    }
4✔
301

302
    // Buffer dim sizes: [per-thread block sizes (X, Y, Z canonical order)] ++
303
    //                   [varying tile dim sizes (original access-dim order)]
304
    std::vector<symbolic::Expression> buf_dim_sizes;
28✔
305
    for (auto& gd : per_thread_dims) buf_dim_sizes.push_back(gd.block_size);
28✔
306
    for (auto& s : varying_dim_sizes) buf_dim_sizes.push_back(s);
30✔
307

308
    // Total buffer size (number of scalar slots)
309
    symbolic::Expression total_size = symbolic::integer(1);
28✔
310
    for (auto& s : buf_dim_sizes) total_size = symbolic::mul(total_size, s);
33✔
311

312
    // Per-thread index prefix (each thread's fixed buffer coords)
313
    std::vector<symbolic::Expression> per_thread_indices;
28✔
314
    for (auto& gd : per_thread_dims) per_thread_indices.push_back(gd.thread_idx);
28✔
315

316
    // Create the local buffer with specified storage type
317
    types::Array buffer_type(storage_type_, 0, {}, scalar_type, total_size);
28✔
318
    builder.add_container(local_name_, buffer_type);
28✔
319

320
    // Row-major linearization over buf_dim_sizes (leftmost dim = outermost stride)
321
    auto linearize_exprs = [&](const std::vector<symbolic::Expression>& indices) -> symbolic::Expression {
86✔
322
        symbolic::Expression linear_idx = symbolic::integer(0);
86✔
323
        symbolic::Expression stride = symbolic::integer(1);
86✔
324
        for (int i = static_cast<int>(indices.size()) - 1; i >= 0; i--) {
188✔
325
            linear_idx = symbolic::add(linear_idx, symbolic::mul(indices[i], stride));
102✔
326
            stride = symbolic::mul(stride, buf_dim_sizes[i]);
102✔
327
        }
102✔
328
        return linear_idx;
86✔
329
    };
86✔
330

331
    // Helper: build linearized local index from per-dimension indvars (symbols)
332
    auto linearize = [&](const std::vector<symbolic::Symbol>& indvars) -> symbolic::Expression {
38✔
333
        std::vector<symbolic::Expression> exprs(indvars.begin(), indvars.end());
38✔
334
        return linearize_exprs(exprs);
38✔
335
    };
38✔
336

337
    // Helper: build source subset (base[d] + copy_indvar[d]) for original container
338
    auto build_original_subset = [&](const std::vector<symbolic::Expression>& copy_indices) -> data_flow::Subset {
43✔
339
        std::vector<symbolic::Expression> full_indices;
43✔
340
        size_t var_idx = 0;
43✔
341
        for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
117✔
342
            if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) {
74✔
343
                full_indices.push_back(symbolic::add(tile_info_.bases.at(d), copy_indices.at(var_idx++)));
47✔
344
            } else {
47✔
345
                full_indices.push_back(tile_info_.bases.at(d));
27✔
346
            }
27✔
347
        }
74✔
348

349
        symbolic::Expression linear = tile_info_.offset;
43✔
350
        for (size_t d = 0; d < full_indices.size(); d++) {
117✔
351
            linear = symbolic::add(linear, symbolic::mul(tile_info_.strides.at(d), full_indices.at(d)));
74✔
352
        }
74✔
353
        return {linear};
43✔
354
    };
43✔
355

356
    if (storage_type_.is_nv_shared()) {
28✔
357
        // ============================================================
358
        // GPU COOPERATIVE PATH
359
        // ============================================================
360
        // Each thread owns a fixed slot along per-thread buffer dims and
361
        // strides through the varying-flat range with the other threads
362
        // sharing that slot (i.e. threads in cooperative dims only).
363

364
        // Total cooperative-thread count (= 1 if no cooperative dims)
365
        symbolic::Expression total_coop_threads = symbolic::integer(1);
4✔
366
        for (auto& cd : coop_dims) {
5✔
367
            total_coop_threads = symbolic::mul(total_coop_threads, cd.block_size);
5✔
368
        }
5✔
369

370
        // Flat within-block index over cooperative dims only (= 0 if none).
371
        // Row-major: X is least-significant when present.
372
        symbolic::Expression coop_flat = symbolic::integer(0);
4✔
373
        {
4✔
374
            symbolic::Expression stride = symbolic::integer(1);
4✔
375
            for (auto it = coop_dims.rbegin(); it != coop_dims.rend(); ++it) {
9✔
376
                coop_flat = symbolic::add(coop_flat, symbolic::mul(it->thread_idx, stride));
5✔
377
                stride = symbolic::mul(stride, it->block_size);
5✔
378
            }
5✔
379
        }
4✔
380

381
        // Varying-flat size = product of tile dim extents (excluding extent==1).
382
        // This is the address range each thread cooperatively walks within its
383
        // per-thread slot.
384
        symbolic::Expression varying_flat_size = symbolic::integer(1);
4✔
385
        for (auto& s : varying_dim_sizes) {
4✔
386
            varying_flat_size = symbolic::mul(varying_flat_size, s);
4✔
387
        }
4✔
388

389
        // Helper to decompose a flat varying index into per-varying-dim indices
390
        // (row-major), and to build the buffer dest subset (per_thread ++ varying).
391
        auto decompose = [&](const symbolic::Symbol& idx_var) {
5✔
392
            std::vector<symbolic::Expression> result;
5✔
393
            symbolic::Expression remainder = idx_var;
5✔
394
            for (size_t i = 0; i < varying_dim_sizes.size(); i++) {
10✔
395
                if (i + 1 < varying_dim_sizes.size()) {
5✔
UNCOV
396
                    symbolic::Expression divisor = symbolic::integer(1);
×
397
                    for (size_t j = i + 1; j < varying_dim_sizes.size(); j++) {
×
398
                        divisor = symbolic::mul(divisor, varying_dim_sizes[j]);
×
399
                    }
×
400
                    result.push_back(symbolic::div(remainder, divisor));
×
401
                    remainder = symbolic::mod(remainder, divisor);
×
402
                } else {
5✔
403
                    result.push_back(remainder);
5✔
404
                }
5✔
405
            }
5✔
406
            return result;
5✔
407
        };
5✔
408
        auto buf_subset_for = [&](const std::vector<symbolic::Expression>& varying_decomp) -> data_flow::Subset {
5✔
409
            std::vector<symbolic::Expression> dest_indices = per_thread_indices;
5✔
410
            for (auto& v : varying_decomp) dest_indices.push_back(v);
5✔
411
            return {linearize_exprs(dest_indices)};
5✔
412
        };
5✔
413

414
        // INIT: barrier → cooperative copy-in → barrier (if has_read)
415
        if (tile_info_.has_read) {
4✔
416
            // Barrier before init
417
            auto& barrier_block1 = builder.add_block_before(*parent, loop_, {}, loop_.debug_info());
1✔
418
            builder.add_library_node<data_flow::BarrierLocalNode>(barrier_block1, {});
1✔
419

420
            // Cooperative copy-in loop
421
            auto idx_name = builder.find_new_name("__daisy_ols_coop_init_" + this->container_);
1✔
422
            types::Scalar idx_type(types::PrimitiveType::UInt64);
1✔
423
            builder.add_container(idx_name, idx_type);
1✔
424
            auto idx_var = symbolic::symbol(idx_name);
1✔
425

426
            auto& init_loop = builder.add_map_before(
1✔
427
                *parent,
1✔
428
                loop_,
1✔
429
                idx_var,
1✔
430
                symbolic::Lt(idx_var, varying_flat_size),
1✔
431
                coop_flat,
1✔
432
                symbolic::add(idx_var, total_coop_threads),
1✔
433
                structured_control_flow::ScheduleType_Sequential::create(),
1✔
434
                {},
1✔
435
                loop_.debug_info()
1✔
436
            );
1✔
437

438
            auto& init_block = builder.add_block(init_loop.root());
1✔
439
            auto& init_src = builder.add_access(init_block, this->container_);
1✔
440
            auto& init_dst = builder.add_access(init_block, local_name_);
1✔
441
            auto& init_tasklet = builder.add_tasklet(init_block, data_flow::TaskletCode::assign, "_out", {"_in"});
1✔
442

443
            auto init_decomp = decompose(idx_var);
1✔
444
            auto init_src_subset = build_original_subset(init_decomp);
1✔
445
            auto init_dst_subset = buf_subset_for(init_decomp);
1✔
446
            builder.add_computational_memlet(init_block, init_src, init_tasklet, "_in", init_src_subset, pointer_type);
1✔
447
            builder.add_computational_memlet(init_block, init_tasklet, "_out", init_dst, init_dst_subset, buffer_type);
1✔
448

449
            // Barrier after init
450
            auto& barrier_block2 = builder.add_block_before(*parent, loop_, {}, loop_.debug_info());
1✔
451
            builder.add_library_node<data_flow::BarrierLocalNode>(barrier_block2, {});
1✔
452
        }
1✔
453

454
        // WRITEBACK: barrier → cooperative copy-out → barrier
455
        {
4✔
456
            // Barrier before writeback
457
            auto& barrier_block3 = builder.add_block_after(*parent, loop_, {}, loop_.debug_info());
4✔
458
            builder.add_library_node<data_flow::BarrierLocalNode>(barrier_block3, {});
4✔
459

460
            // Cooperative writeback loop
461
            auto idx_name = builder.find_new_name("__daisy_ols_coop_wb_" + this->container_);
4✔
462
            types::Scalar idx_type(types::PrimitiveType::UInt64);
4✔
463
            builder.add_container(idx_name, idx_type);
4✔
464
            auto idx_var = symbolic::symbol(idx_name);
4✔
465

466
            auto& wb_loop = builder.add_map_after(
4✔
467
                *parent,
4✔
468
                loop_,
4✔
469
                idx_var,
4✔
470
                symbolic::Lt(idx_var, varying_flat_size),
4✔
471
                coop_flat,
4✔
472
                symbolic::add(idx_var, total_coop_threads),
4✔
473
                structured_control_flow::ScheduleType_Sequential::create(),
4✔
474
                {},
4✔
475
                loop_.debug_info()
4✔
476
            );
4✔
477

478
            auto& wb_block = builder.add_block(wb_loop.root());
4✔
479
            auto& wb_src = builder.add_access(wb_block, local_name_);
4✔
480
            auto& wb_dst = builder.add_access(wb_block, this->container_);
4✔
481
            auto& wb_tasklet = builder.add_tasklet(wb_block, data_flow::TaskletCode::assign, "_out", {"_in"});
4✔
482

483
            auto wb_decomp = decompose(idx_var);
4✔
484
            auto wb_src_subset = buf_subset_for(wb_decomp);
4✔
485
            auto wb_dst_subset = build_original_subset(wb_decomp);
4✔
486
            builder.add_computational_memlet(wb_block, wb_src, wb_tasklet, "_in", wb_src_subset, buffer_type);
4✔
487
            builder.add_computational_memlet(wb_block, wb_tasklet, "_out", wb_dst, wb_dst_subset, pointer_type);
4✔
488

489
            // Barrier after writeback
490
            auto& barrier_block4 = builder.add_block_after(*parent, loop_, {}, loop_.debug_info());
4✔
491
            builder.add_library_node<data_flow::BarrierLocalNode>(barrier_block4, {});
4✔
492
        }
4✔
493
    } else {
24✔
494
        // ============================================================
495
        // CPU SEQUENTIAL PATH
496
        // ============================================================
497
        if (tile_info_.has_read) {
24✔
498
            std::vector<symbolic::Symbol> init_indvars;
14✔
499
            structured_control_flow::Sequence* init_scope =
14✔
500
                &builder.add_sequence_before(*parent, loop_, {}, loop_.debug_info());
14✔
501
            for (size_t i = 0; i < varying_dims.size(); i++) {
30✔
502
                size_t d = varying_dims[i];
16✔
503
                auto indvar_name =
16✔
504
                    builder.find_new_name("__daisy_ols_init_" + this->container_ + "_d" + std::to_string(d));
16✔
505
                types::Scalar indvar_type(types::PrimitiveType::UInt64);
16✔
506
                builder.add_container(indvar_name, indvar_type);
16✔
507
                auto indvar = symbolic::symbol(indvar_name);
16✔
508
                init_indvars.push_back(indvar);
16✔
509

510
                auto init = symbolic::integer(0);
16✔
511
                auto condition = symbolic::Lt(indvar, varying_dim_sizes[i]);
16✔
512
                auto update = symbolic::add(indvar, symbolic::integer(1));
16✔
513

514
                auto& init_loop = builder.add_map(
16✔
515
                    *init_scope,
16✔
516
                    indvar,
16✔
517
                    condition,
16✔
518
                    init,
16✔
519
                    update,
16✔
520
                    structured_control_flow::ScheduleType_Sequential::create(),
16✔
521
                    {},
16✔
522
                    loop_.debug_info()
16✔
523
                );
16✔
524
                init_scope = &init_loop.root();
16✔
525
            }
16✔
526

527
            // Create init copy block
528
            auto& init_block = builder.add_block(*init_scope);
14✔
529
            auto& init_src = builder.add_access(init_block, this->container_);
14✔
530
            auto& init_dst = builder.add_access(init_block, local_name_);
14✔
531
            auto& init_tasklet = builder.add_tasklet(init_block, data_flow::TaskletCode::assign, "_out", {"_in"});
14✔
532

533
            std::vector<symbolic::Expression> init_exprs(init_indvars.begin(), init_indvars.end());
14✔
534
            auto init_src_subset = build_original_subset(init_exprs);
14✔
535
            data_flow::Subset init_dst_subset = {linearize(init_indvars)};
14✔
536

537
            builder.add_computational_memlet(init_block, init_src, init_tasklet, "_in", init_src_subset, pointer_type);
14✔
538
            builder.add_computational_memlet(init_block, init_tasklet, "_out", init_dst, init_dst_subset, buffer_type);
14✔
539
        }
14✔
540

541
        // Writeback Maps
542
        {
24✔
543
            std::vector<symbolic::Symbol> wb_indvars;
24✔
544
            structured_control_flow::Sequence* wb_scope =
24✔
545
                &builder.add_sequence_after(*parent, loop_, {}, loop_.debug_info());
24✔
546
            for (size_t i = 0; i < varying_dims.size(); i++) {
50✔
547
                size_t d = varying_dims[i];
26✔
548
                auto indvar_name =
26✔
549
                    builder.find_new_name("__daisy_ols_wb_" + this->container_ + "_d" + std::to_string(d));
26✔
550
                types::Scalar indvar_type(types::PrimitiveType::UInt64);
26✔
551
                builder.add_container(indvar_name, indvar_type);
26✔
552
                auto indvar = symbolic::symbol(indvar_name);
26✔
553
                wb_indvars.push_back(indvar);
26✔
554

555
                auto init = symbolic::integer(0);
26✔
556
                auto condition = symbolic::Lt(indvar, varying_dim_sizes[i]);
26✔
557
                auto update = symbolic::add(indvar, symbolic::integer(1));
26✔
558

559
                auto& wb_loop = builder.add_map(
26✔
560
                    *wb_scope,
26✔
561
                    indvar,
26✔
562
                    condition,
26✔
563
                    init,
26✔
564
                    update,
26✔
565
                    structured_control_flow::ScheduleType_Sequential::create(),
26✔
566
                    {},
26✔
567
                    loop_.debug_info()
26✔
568
                );
26✔
569
                wb_scope = &wb_loop.root();
26✔
570
            }
26✔
571

572
            // Create writeback copy block
573
            auto& wb_block = builder.add_block(*wb_scope);
24✔
574
            auto& wb_src = builder.add_access(wb_block, local_name_);
24✔
575
            auto& wb_dst = builder.add_access(wb_block, this->container_);
24✔
576
            auto& wb_tasklet = builder.add_tasklet(wb_block, data_flow::TaskletCode::assign, "_out", {"_in"});
24✔
577

578
            std::vector<symbolic::Expression> wb_exprs(wb_indvars.begin(), wb_indvars.end());
24✔
579
            data_flow::Subset wb_src_subset = {linearize(wb_indvars)};
24✔
580
            auto wb_dst_subset = build_original_subset(wb_exprs);
24✔
581

582
            builder.add_computational_memlet(wb_block, wb_src, wb_tasklet, "_in", wb_src_subset, buffer_type);
24✔
583
            builder.add_computational_memlet(wb_block, wb_tasklet, "_out", wb_dst, wb_dst_subset, pointer_type);
24✔
584
        }
24✔
585
    }
24✔
586

587
    // ==================================================================
588
    // Update accesses in the main loop to use the local buffer
589
    // ==================================================================
590
    auto& mla = analysis_manager.get<analysis::MemoryLayoutAnalysis>();
28✔
591

592
    // Recursive helper to traverse all blocks in the loop body
593
    std::function<void(structured_control_flow::ControlFlowNode&)> rewrite_accesses;
28✔
594
    rewrite_accesses = [&](structured_control_flow::ControlFlowNode& node) {
92✔
595
        if (auto* block = dynamic_cast<structured_control_flow::Block*>(&node)) {
92✔
596
            auto& dfg = block->dataflow();
40✔
597

598
            // Collect access nodes to process (avoid iterator invalidation when splitting)
599
            std::vector<data_flow::AccessNode*> access_nodes;
40✔
600
            for (auto* access_node : dfg.data_nodes()) {
96✔
601
                if (access_node->data() == this->container_) {
96✔
602
                    access_nodes.push_back(access_node);
46✔
603
                }
46✔
604
            }
96✔
605

606
            for (auto* access : access_nodes) {
46✔
607
                // Classify memlets: group vs non-group
608
                struct MemletRewrite {
46✔
609
                    data_flow::Memlet* memlet;
46✔
610
                    data_flow::Subset local_subset;
46✔
611
                    bool is_outgoing;
46✔
612
                };
46✔
613
                std::vector<MemletRewrite> group_rewrites;
46✔
614
                bool all_in_group = true;
46✔
615

616
                // Outgoing memlets (reads from this access node)
617
                for (auto& memlet : dfg.out_edges(*access)) {
46✔
618
                    if (group_memlets_.count(&memlet) == 0) {
16✔
619
                        all_in_group = false;
1✔
620
                        continue;
1✔
621
                    }
1✔
622
                    auto* acc = mla.access(memlet);
15✔
623
                    if (acc && acc->subset.size() == tile_info_.dimensions.size()) {
15✔
624
                        // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]]
625
                        std::vector<symbolic::Expression> local_indices = per_thread_indices;
15✔
626
                        for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
41✔
627
                            if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) {
26✔
628
                                local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d)));
17✔
629
                            }
17✔
630
                        }
26✔
631
                        symbolic::Expression linear_idx = linearize_exprs(local_indices);
15✔
632
                        group_rewrites.push_back({&memlet, {linear_idx}, true});
15✔
633
                    } else {
15✔
634
                        // Memlet is claimed by the group but we cannot rewrite it (no
635
                        // delinearized access info). Leaving it as the original container
636
                        // would create a half-renamed access node. Bail out of renaming
637
                        // to keep the SDFG consistent.
UNCOV
638
                        all_in_group = false;
×
639
                    }
×
640
                }
15✔
641
                // Incoming memlets (writes to this access node)
642
                for (auto& memlet : dfg.in_edges(*access)) {
46✔
643
                    if (group_memlets_.count(&memlet) == 0) {
31✔
644
                        all_in_group = false;
3✔
645
                        continue;
3✔
646
                    }
3✔
647
                    auto* acc = mla.access(memlet);
28✔
648
                    if (acc && acc->subset.size() == tile_info_.dimensions.size()) {
28✔
649
                        // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]]
650
                        std::vector<symbolic::Expression> local_indices = per_thread_indices;
28✔
651
                        for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
76✔
652
                            if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) {
48✔
653
                                local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d)));
30✔
654
                            }
30✔
655
                        }
48✔
656
                        symbolic::Expression linear_idx = linearize_exprs(local_indices);
28✔
657
                        group_rewrites.push_back({&memlet, {linear_idx}, false});
28✔
658
                    } else {
28✔
UNCOV
659
                        all_in_group = false;
×
660
                    }
×
661
                }
28✔
662

663
                if (group_rewrites.empty()) continue;
46✔
664

665
                if (all_in_group) {
43✔
666
                    // Simple case: all memlets in group → rewrite in-place and rename
667
                    for (auto& rw : group_rewrites) {
42✔
668
                        rw.memlet->set_subset(rw.local_subset);
42✔
669
                        rw.memlet->set_base_type(buffer_type);
42✔
670
                    }
42✔
671
                    access->data(local_name_);
42✔
672
                } else {
42✔
673
                    // Mixed case: split — create new local access node, redirect group memlets
674
                    auto& local_access = builder.add_access(*block, local_name_);
1✔
675
                    for (auto& rw : group_rewrites) {
1✔
676
                        if (rw.is_outgoing) {
1✔
677
                            // outgoing: access→tasklet  →  local_access→tasklet
UNCOV
678
                            auto& dst_node = rw.memlet->dst();
×
679
                            auto dst_conn = rw.memlet->dst_conn();
×
680
                            builder.remove_memlet(*block, *rw.memlet);
×
681
                            builder.add_memlet(
×
682
                                *block, local_access, "void", dst_node, dst_conn, rw.local_subset, buffer_type, {}
×
683
                            );
×
684
                        } else {
1✔
685
                            // incoming: tasklet→access  →  tasklet→local_access
686
                            auto& src_node = rw.memlet->src();
1✔
687
                            auto src_conn = rw.memlet->src_conn();
1✔
688
                            builder.remove_memlet(*block, *rw.memlet);
1✔
689
                            builder.add_memlet(
1✔
690
                                *block, src_node, src_conn, local_access, "void", rw.local_subset, buffer_type, {}
1✔
691
                            );
1✔
692
                        }
1✔
693
                    }
1✔
694
                }
1✔
695
            }
43✔
696
        } else if (auto* seq = dynamic_cast<structured_control_flow::Sequence*>(&node)) {
52✔
697
            for (size_t i = 0; i < seq->size(); i++) {
92✔
698
                rewrite_accesses(seq->at(i).first);
52✔
699
            }
52✔
700
        } else if (auto* loop = dynamic_cast<structured_control_flow::StructuredLoop*>(&node)) {
40✔
701
            rewrite_accesses(loop->root());
12✔
702
        } else if (auto* if_else = dynamic_cast<structured_control_flow::IfElse*>(&node)) {
12✔
UNCOV
703
            for (size_t i = 0; i < if_else->size(); i++) {
×
704
                rewrite_accesses(if_else->at(i).first);
×
705
            }
×
706
        }
×
707
    };
92✔
708
    rewrite_accesses(loop_.root());
28✔
709

710
    // Cleanup
711
    analysis_manager.invalidate_all();
28✔
712

713
    passes::SequenceFusion sf_pass;
28✔
714
    passes::DeadCFGElimination dce_pass;
28✔
715
    bool applies = false;
28✔
716
    do {
52✔
717
        applies = false;
52✔
718
        applies |= dce_pass.run(builder, analysis_manager);
52✔
719
        applies |= sf_pass.run(builder, analysis_manager);
52✔
720
    } while (applies);
52✔
721
};
28✔
722

723
/**
 * Serializes this transformation into `j`.
 *
 * Layout written:
 *   - "transformation_type": the transformation name (see name()).
 *   - "parameters"."storage_type": the storage type, written by JSONSerializer.
 *   - "subgraph"."0": the target loop, written by JSONSerializer::serialize_node.
 *   - "subgraph"."1": { "element_id": <access node id>, "type": "access_node" }.
 *
 * Any existing "parameters" / "subgraph" entries in `j` are replaced.
 */
void OutLocalStorage::to_json(nlohmann::json& j) const {
    j["transformation_type"] = this->name();

    // --- parameters -------------------------------------------------------
    j["parameters"] = nlohmann::json::object();
    nlohmann::json& parameters = j["parameters"];

    serializer::JSONSerializer storage_serializer;
    parameters["storage_type"] = nlohmann::json::object();
    storage_serializer.storage_type_to_json(parameters["storage_type"], storage_type_);

    // --- subgraph ---------------------------------------------------------
    // NOTE(review): the `false` constructor argument presumably selects a
    // non-recursive ("flat") node serialization -- confirm against JSONSerializer.
    serializer::JSONSerializer node_serializer(false);
    j["subgraph"] = nlohmann::json::object();
    nlohmann::json& subgraph = j["subgraph"];

    // Entry "0": the loop this transformation is applied to.
    subgraph["0"] = nlohmann::json::object();
    node_serializer.serialize_node(subgraph["0"], loop_);

    // Entry "1": the access node, identified by its element id only.
    subgraph["1"] = nlohmann::json::object();
    nlohmann::json& access_entry = subgraph["1"];
    access_entry["element_id"] = access_node_.element_id();
    access_entry["type"] = "access_node";
}
5✔
740

741
/**
 * Reconstructs an OutLocalStorage transformation from its JSON description.
 *
 * Expected layout of `desc` (mirrors what to_json writes):
 *   - "subgraph"."0"."element_id": id of the target StructuredLoop.
 *   - "subgraph"."1"."element_id": id of the AccessNode.
 *   - "parameters"."storage_type" (optional): storage type of the local buffer;
 *     when absent, StorageType::CPU_Stack() is used.
 *
 * @param builder Builder used to resolve element ids to SDFG elements.
 * @param desc    JSON description of the transformation.
 * @return The reconstructed transformation.
 * @throws InvalidTransformationDescriptionException if the loop id does not
 *         resolve to an element, resolves to an element that is not a
 *         StructuredLoop, or the access-node id does not resolve to an AccessNode.
 * @throws nlohmann::json exceptions (via at()/get()) if a mandatory key is
 *         missing or has the wrong type.
 */
OutLocalStorage OutLocalStorage::from_json(builder::StructuredSDFGBuilder& builder, const nlohmann::json& desc) {
    // Use at() throughout: operator[] on a const json with a missing key is
    // undefined behaviour, whereas at() reports the problem via an exception.
    const auto& subgraph = desc.at("subgraph");

    auto loop_id = subgraph.at("0").at("element_id").get<size_t>();
    auto element = builder.find_element_by_id(loop_id);
    if (!element) {
        throw InvalidTransformationDescriptionException("Element with ID " + std::to_string(loop_id) + " not found.");
    }
    auto loop = dynamic_cast<structured_control_flow::StructuredLoop*>(element);
    if (!loop) {
        // The id resolved, but to something other than a loop; dereferencing
        // the failed cast below would be a null-pointer dereference.
        throw InvalidTransformationDescriptionException(
            "Element with ID " + std::to_string(loop_id) + " is not a StructuredLoop."
        );
    }

    auto access_node_id = subgraph.at("1").at("element_id").get<size_t>();
    auto access_node = dynamic_cast<data_flow::AccessNode*>(builder.find_element_by_id(access_node_id));
    if (!access_node) {
        throw InvalidTransformationDescriptionException(
            "Access node with ID " + std::to_string(access_node_id) + " not found."
        );
    }

    // storage_type is optional; fall back to the CPU stack when not given.
    types::StorageType storage_type = types::StorageType::CPU_Stack();
    if (desc.contains("parameters") && desc.at("parameters").contains("storage_type")) {
        serializer::JSONSerializer ser;
        storage_type = ser.json_to_storage_type(desc.at("parameters").at("storage_type"));
    }

    return OutLocalStorage(*loop, *access_node, storage_type);
}
3✔
766

767
} // namespace transformations
768
} // namespace sdfg
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc