• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

daisytuner / docc / 27500480732

14 Jun 2026 01:33PM UTC coverage: 61.642% (+0.1%) from 61.54%
27500480732

Pull #764

github

web-flow
Merge 3432db11b into 6b2e310be
Pull Request #764: Deprecates monolithic GPU transformations in favor of composable transformations

202 of 224 new or added lines in 2 files covered. (90.18%)

81 existing lines in 6 files now uncovered.

36559 of 59309 relevant lines covered (61.64%)

1132.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.64
/opt/src/transformations/in_local_storage.cpp
1
#include "sdfg/transformations/in_local_storage.h"
2

3
#include <algorithm>
4
#include <cstddef>
5
#include <functional>
6
#include <string>
7

8
#include "sdfg/analysis/memory_layout_analysis.h"
9
#include "sdfg/analysis/scope_analysis.h"
10
#include "sdfg/analysis/users.h"
11
#include "sdfg/builder/structured_sdfg_builder.h"
12
#include "sdfg/data_flow/access_node.h"
13
#include "sdfg/data_flow/library_nodes/barrier_local_node.h"
14
#include "sdfg/data_flow/memlet.h"
15
#include "sdfg/passes/structured_control_flow/dead_cfg_elimination.h"
16
#include "sdfg/passes/structured_control_flow/sequence_fusion.h"
17
#include "sdfg/structured_control_flow/if_else.h"
18
#include "sdfg/structured_control_flow/sequence.h"
19
#include "sdfg/structured_control_flow/structured_loop.h"
20
#include "sdfg/symbolic/symbolic.h"
21
#include "sdfg/targets/gpu/gpu_schedule_type.h"
22
#include "sdfg/types/array.h"
23
#include "sdfg/types/pointer.h"
24
#include "sdfg/types/scalar.h"
25

26
namespace sdfg {
27
namespace transformations {
28

29
/**
 * Binds the transformation to a loop and to one access node inside it.
 *
 * @param loop          Loop in front of which the local buffer is filled.
 * @param access_node   Access node whose container (`access_node.data()`) is staged.
 * @param storage_type  Storage type given to the newly created local buffer.
 */
InLocalStorage::InLocalStorage(
    structured_control_flow::StructuredLoop& loop,
    const data_flow::AccessNode& access_node,
    const types::StorageType& storage_type
)
    : loop_(loop),
      access_node_(access_node),
      container_(access_node.data()),
      storage_type_(storage_type) {}
35✔
35

36
/// Returns the fixed identifier of this transformation (also emitted by to_json()).
std::string InLocalStorage::name() const {
    return std::string("InLocalStorage");
}
7✔
37

38
bool InLocalStorage::can_be_applied(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {
35✔
39
    auto& sdfg = builder.subject();
35✔
40
    auto& body = this->loop_.root();
35✔
41

42
    tile_info_ = TileInfo{};
35✔
43

44
    // Criterion: Container must exist and is pointer
45
    if (!sdfg.exists(this->container_)) {
35✔
46
        return false;
×
47
    }
×
48
    auto& type = sdfg.type(this->container_);
35✔
49
    if (type.type_id() != types::TypeID::Pointer) {
35✔
50
        return false;
×
51
    }
×
52

53
    // Criterion: Container must be used in the loop body
54
    auto& users = analysis_manager.get<analysis::Users>();
35✔
55
    analysis::UsersView body_users(users, body);
35✔
56
    if (body_users.uses(this->container_).empty()) {
35✔
57
        return false;
2✔
58
    }
2✔
59

60
    // Criterion: Container must be read-only within the loop (no writes)
61
    if (!body_users.writes(this->container_).empty()) {
33✔
62
        return false;
1✔
63
    }
1✔
64

65
    // Use MemoryLayoutAnalysis tile group API
66
    // Find a representative memlet from the access node to identify its group.
67
    auto& mla = analysis_manager.get<analysis::MemoryLayoutAnalysis>();
32✔
68
    const analysis::MemoryTileGroup* group = nullptr;
32✔
69
    auto& dfg = access_node_.get_parent();
32✔
70
    for (auto& memlet : dfg.out_edges(access_node_)) {
32✔
71
        auto* candidate = mla.tile_group_for(loop_, memlet);
32✔
72
        if (!candidate) {
32✔
73
            continue;
×
74
        }
×
75

76
        auto extents = candidate->tile.extents_approx();
32✔
77
        if (extents.empty()) {
32✔
78
            continue;
×
79
        }
×
80

81
        // Reject candidates with any unbounded-dependent extent (returned as null).
82
        bool has_null = false;
32✔
83
        for (auto& ext : extents) {
56✔
84
            if (ext.is_null()) {
56✔
85
                has_null = true;
×
86
                break;
×
87
            }
×
88
        }
56✔
89
        if (has_null) {
32✔
90
            continue;
×
91
        }
×
92

93
        // GPU path: accept first valid group (substitution happens later)
94
        if (storage_type_.is_nv_shared()) {
32✔
95
            group = candidate;
7✔
96
            break;
7✔
97
        }
7✔
98

99
        // CPU path: require provably integer extents
100
        bool all_integer = true;
25✔
101
        for (auto& ext : extents) {
43✔
102
            if (!SymEngine::is_a<SymEngine::Integer>(*ext)) {
43✔
103
                all_integer = false;
×
104
                break;
×
105
            }
×
106
        }
43✔
107
        if (all_integer) {
25✔
108
            group = candidate;
25✔
109
            break;
25✔
110
        }
25✔
111
    }
25✔
112
    if (!group) {
32✔
113
        return false;
×
114
    }
×
115

116
    auto& tile = group->tile;
32✔
117
    auto extents = tile.extents_approx();
32✔
118

119
    // Store group memlets for use in apply()
120
    group_memlets_.clear();
32✔
121
    group_memlets_.insert(group->memlets.begin(), group->memlets.end());
32✔
122

123
    // Store tile info (before substitution, bases/strides stay symbolic)
124
    tile_info_.dimensions = extents;
32✔
125
    tile_info_.bases = tile.min_subset;
32✔
126
    tile_info_.strides = std::vector<symbolic::Expression>(tile.layout.strides().begin(), tile.layout.strides().end());
32✔
127
    tile_info_.offset = tile.layout.offset();
32✔
128

129
    // GPU shared memory: resolve symbolic extents using GPU block sizes and
130
    // require at least one cooperative dimension
131
    if (storage_type_.is_nv_shared()) {
32✔
132
        auto ancestors = ControlFlowNode::parent_chain(loop_);
7✔
133

134
        // Build substitution map: symbolic GPU map bounds → integer block sizes
135
        // E.g., Map condition "i < N" with block_size=32 → N=32
136
        for (auto* node : ancestors) {
37✔
137
            if (auto* ancestor_map = dynamic_cast<structured_control_flow::Map*>(node)) {
37✔
138
                if (!gpu::is_gpu_schedule(ancestor_map->schedule_type())) {
13✔
139
                    continue;
×
140
                }
×
141
                auto block_size = gpu::gpu_block_size(ancestor_map->schedule_type());
13✔
142
                // Extract symbolic bound from condition: Lt(indvar, BOUND)
143
                auto condition = ancestor_map->condition();
13✔
144
                if (SymEngine::is_a<SymEngine::StrictLessThan>(*condition)) {
13✔
145
                    auto stl = SymEngine::rcp_static_cast<const SymEngine::StrictLessThan>(condition);
13✔
146
                    auto rhs = stl->get_args()[1];
13✔
147
                    auto iter_count = symbolic::sub(rhs, ancestor_map->init());
13✔
148
                    if (!SymEngine::is_a<SymEngine::Integer>(*iter_count)) {
13✔
149
                        // Symbolic bound — substitute with block size in extents and bases
150
                        for (auto& ext : tile_info_.dimensions) {
16✔
151
                            ext = symbolic::simplify(symbolic::subs(ext, iter_count, block_size));
16✔
152
                        }
16✔
153
                        for (auto& base : tile_info_.bases) {
16✔
154
                            base = symbolic::simplify(symbolic::subs(base, iter_count, block_size));
16✔
155
                        }
16✔
156
                    }
9✔
157
                }
13✔
158
            }
13✔
159
        }
37✔
160

161
        // Also resolve the loop's own bound if symbolic and matches a block size
162
        // E.g., For k = 0..K where K is a parameter — check if K can be resolved
163
        // from any GPU ancestor map
164
        // (Already handled above: if K appears as a GPU map bound, it's substituted)
165

166
        // Criterion: All extents must now be provably integer
167
        for (auto& ext : tile_info_.dimensions) {
13✔
168
            if (!SymEngine::is_a<SymEngine::Integer>(*ext)) {
13✔
169
                return false;
2✔
170
            }
2✔
171
        }
13✔
172

173
        // Criterion: At least one cooperative dimension
174
        bool has_cooperative_dim = false;
5✔
175
        for (auto* node : ancestors) {
16✔
176
            if (auto* ancestor_map = dynamic_cast<structured_control_flow::Map*>(node)) {
16✔
177
                if (!gpu::is_gpu_schedule(ancestor_map->schedule_type())) {
6✔
178
                    continue;
×
179
                }
×
180
                // A GPU dim is cooperative if its indvar does NOT appear in any tile base
181
                bool appears_in_bases = false;
6✔
182
                for (auto& base : tile_info_.bases) {
10✔
183
                    if (symbolic::uses(base, ancestor_map->indvar())) {
10✔
184
                        appears_in_bases = true;
1✔
185
                        break;
1✔
186
                    }
1✔
187
                }
10✔
188
                if (!appears_in_bases) {
6✔
189
                    has_cooperative_dim = true;
5✔
190
                    break;
5✔
191
                }
5✔
192
            }
6✔
193
        }
16✔
194
        if (!has_cooperative_dim) {
5✔
195
            return false;
×
196
        }
×
197
    }
5✔
198

199
    return true;
30✔
200
}
32✔
201

202
void InLocalStorage::apply(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {
23✔
203
    auto& sdfg = builder.subject();
23✔
204

205
    auto parent_node = loop_.get_parent();
23✔
206
    auto parent = dynamic_cast<structured_control_flow::Sequence*>(parent_node);
23✔
207
    if (!parent) {
23✔
208
        throw InvalidSDFGException("InLocalStorage: Parent of loop must be a Sequence!");
×
209
    }
×
210

211
    // We replace all relevant memlets with flat local indices
212
    // Thus, we now use a flat pointer to index into container
213
    // Remark: sdfg.type may return an opaque pointer, so use
214
    //         memlet instead
215
    auto* memlet = *group_memlets_.begin();
23✔
216
    types::Scalar scalar_type(memlet->base_type().primitive_type());
23✔
217
    types::Pointer pointer_type(scalar_type);
23✔
218

219
    // Create local buffer name
220
    local_name_ = builder.find_new_name("__daisy_in_local_storage_" + this->container_);
23✔
221

222
    // Collect varying dimensions (extent > 1) and their sizes
223
    std::vector<size_t> varying_dims;
23✔
224
    std::vector<symbolic::Expression> varying_dim_sizes;
23✔
225
    for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
64✔
226
        auto& dim_size = tile_info_.dimensions.at(d);
41✔
227
        if (!symbolic::eq(dim_size, symbolic::integer(1))) {
41✔
228
            varying_dims.push_back(d);
30✔
229
            varying_dim_sizes.push_back(dim_size);
30✔
230
        }
30✔
231
    }
41✔
232

233
    // GPU classification: each ancestor GPU Map is either
234
    //  - per-thread (its Map indvar appears in tile.bases — each thread sees a
235
    //    distinct slice along that dim, so the shared buffer gets its own
236
    //    per-thread slot indexed by the within-block thread_idx), or
237
    //  - cooperative (Map indvar not in bases — all threads along that dim
238
    //    cooperatively load the same shared tile, strided by thread_idx).
239
    struct GpuDim {
23✔
240
        gpu::GPUDimension dim;
23✔
241
        symbolic::Symbol map_indvar; // global thread index (== thread_idx + blockIdx * blockDim)
23✔
242
        symbolic::Symbol thread_idx; // within-block thread index (NV_Symbol)
23✔
243
        symbolic::Integer block_size;
23✔
244
        bool is_per_thread;
23✔
245
    };
23✔
246
    std::vector<GpuDim> per_thread_dims; // populated only on GPU path
23✔
247
    std::vector<GpuDim> coop_dims; // populated only on GPU path
23✔
248
    bool is_rocm = false;
23✔
249

250
    if (storage_type_.is_nv_shared()) {
23✔
251
        auto ancestors = ControlFlowNode::parent_chain(loop_);
5✔
252
        for (auto* node : ancestors) {
29✔
253
            auto* m = dynamic_cast<structured_control_flow::Map*>(node);
29✔
254
            if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue;
29✔
255
            if (m->schedule_type().value() == "ROCM") {
10✔
NEW
256
                is_rocm = true;
×
NEW
257
                break;
×
NEW
258
            }
×
259
        }
10✔
260
        const std::string prefix = is_rocm ? "__daisy_hip_thread_idx_" : "__daisy_cuda_thread_idx_";
5✔
261
        auto suffix = [](gpu::GPUDimension d) -> std::string {
10✔
262
            switch (d) {
10✔
263
                case gpu::GPUDimension::X:
5✔
264
                    return "x";
5✔
265
                case gpu::GPUDimension::Y:
5✔
266
                    return "y";
5✔
NEW
267
                case gpu::GPUDimension::Z:
×
NEW
268
                    return "z";
×
269
            }
10✔
NEW
270
            return "?";
×
271
        };
10✔
272
        for (auto* node : ancestors) {
29✔
273
            auto* m = dynamic_cast<structured_control_flow::Map*>(node);
29✔
274
            if (!m || !gpu::is_gpu_schedule(m->schedule_type())) continue;
29✔
275
            GpuDim gd;
10✔
276
            gd.dim = gpu::gpu_dimension(m->schedule_type());
10✔
277
            gd.map_indvar = m->indvar();
10✔
278
            gd.thread_idx = symbolic::symbol(prefix + suffix(gd.dim));
10✔
279
            gd.block_size = gpu::gpu_block_size(m->schedule_type());
10✔
280
            gd.is_per_thread = false;
10✔
281
            for (auto& base : tile_info_.bases) {
15✔
282
                if (symbolic::uses(base, m->indvar())) {
15✔
283
                    gd.is_per_thread = true;
4✔
284
                    break;
4✔
285
                }
4✔
286
            }
15✔
287
            (gd.is_per_thread ? per_thread_dims : coop_dims).push_back(gd);
10✔
288
        }
10✔
289
        auto by_dim = [](const GpuDim& a, const GpuDim& b) {
5✔
290
            return static_cast<int>(a.dim) < static_cast<int>(b.dim);
1✔
291
        };
1✔
292
        std::sort(per_thread_dims.begin(), per_thread_dims.end(), by_dim);
5✔
293
        std::sort(coop_dims.begin(), coop_dims.end(), by_dim);
5✔
294

295
        // Ensure within-block thread_idx containers exist. Codegen recognises
296
        // NV_Symbol-typed scalars and substitutes them with threadIdx.{x,y,z}
297
        // (CUDA) or the ROCm equivalent at emission time.
298
        auto ensure_idx = [&](const symbolic::Symbol& sym) {
10✔
299
            if (!sdfg.exists(sym->get_name())) {
10✔
300
                types::Scalar idx_type(types::PrimitiveType::Int32);
8✔
301
                idx_type.storage_type(types::StorageType::NV_Symbol());
8✔
302
                builder.add_container(sym->get_name(), idx_type);
8✔
303
            }
8✔
304
        };
10✔
305
        for (auto& gd : per_thread_dims) ensure_idx(gd.thread_idx);
5✔
306
        for (auto& gd : coop_dims) ensure_idx(gd.thread_idx);
6✔
307
    }
5✔
308

309
    // Buffer dim sizes: [per-thread block sizes (X, Y, Z canonical order)] ++
310
    //                   [varying tile dim sizes (original access-dim order)]
311
    std::vector<symbolic::Expression> buf_dim_sizes;
23✔
312
    for (auto& gd : per_thread_dims) buf_dim_sizes.push_back(gd.block_size);
23✔
313
    for (auto& s : varying_dim_sizes) buf_dim_sizes.push_back(s);
30✔
314

315
    // Total buffer size (number of scalar slots)
316
    symbolic::Expression total_size = symbolic::integer(1);
23✔
317
    for (auto& s : buf_dim_sizes) total_size = symbolic::mul(total_size, s);
34✔
318

319
    // Per-thread index prefix (each thread's fixed buffer coords)
320
    std::vector<symbolic::Expression> per_thread_indices;
23✔
321
    for (auto& gd : per_thread_dims) per_thread_indices.push_back(gd.thread_idx);
23✔
322

323
    // Row-major linearization over buf_dim_sizes (leftmost dim = outermost stride)
324
    auto linearize_exprs = [&](const std::vector<symbolic::Expression>& indices) -> symbolic::Expression {
50✔
325
        symbolic::Expression linear_idx = symbolic::integer(0);
50✔
326
        symbolic::Expression stride = symbolic::integer(1);
50✔
327
        for (int i = static_cast<int>(indices.size()) - 1; i >= 0; i--) {
126✔
328
            linear_idx = symbolic::add(linear_idx, symbolic::mul(indices[i], stride));
76✔
329
            stride = symbolic::mul(stride, buf_dim_sizes[i]);
76✔
330
        }
76✔
331
        return linear_idx;
50✔
332
    };
50✔
333

334
    // Helper: build linearized local index from per-dimension indvars (symbols)
335
    auto linearize = [&](const std::vector<symbolic::Symbol>& indvars) -> symbolic::Expression {
23✔
336
        std::vector<symbolic::Expression> exprs(indvars.begin(), indvars.end());
18✔
337
        return linearize_exprs(exprs);
18✔
338
    };
18✔
339

340
    // Helper: build source subset (base[d] + copy_indvar[d]) for original container
341
    auto build_original_subset = [&](const std::vector<symbolic::Expression>& copy_indices) -> data_flow::Subset {
23✔
342
        std::vector<symbolic::Expression> full_indices;
23✔
343
        size_t var_idx = 0;
23✔
344
        for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
64✔
345
            if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) {
41✔
346
                full_indices.push_back(symbolic::add(tile_info_.bases.at(d), copy_indices.at(var_idx++)));
30✔
347
            } else {
30✔
348
                full_indices.push_back(tile_info_.bases.at(d));
11✔
349
            }
11✔
350
        }
41✔
351

352
        symbolic::Expression linear = tile_info_.offset;
23✔
353
        for (size_t d = 0; d < full_indices.size(); d++) {
64✔
354
            linear = symbolic::add(linear, symbolic::mul(tile_info_.strides.at(d), full_indices.at(d)));
41✔
355
        }
41✔
356
        return {linear};
23✔
357
    };
23✔
358

359
    // ==================================================================
360
    // Branch: GPU cooperative path vs CPU sequential path
361
    // ==================================================================
362
    if (storage_type_.is_nv_shared()) {
23✔
363
        // ============================================================
364
        // GPU COOPERATIVE PATH
365
        // ============================================================
366
        // Each thread owns a fixed slot along per-thread buffer dims and
367
        // strides through the varying-flat range with the other threads
368
        // sharing that slot (i.e. threads in cooperative dims only).
369

370
        // Total cooperative-thread count (= 1 if no cooperative dims)
371
        symbolic::Expression total_coop_threads = symbolic::integer(1);
5✔
372
        for (auto& cd : coop_dims) {
6✔
373
            total_coop_threads = symbolic::mul(total_coop_threads, cd.block_size);
6✔
374
        }
6✔
375

376
        // Flat within-block index over cooperative dims only (= 0 if none).
377
        // Row-major: X is least-significant when present.
378
        symbolic::Expression coop_flat = symbolic::integer(0);
5✔
379
        {
5✔
380
            symbolic::Expression stride = symbolic::integer(1);
5✔
381
            for (auto it = coop_dims.rbegin(); it != coop_dims.rend(); ++it) {
11✔
382
                coop_flat = symbolic::add(coop_flat, symbolic::mul(it->thread_idx, stride));
6✔
383
                stride = symbolic::mul(stride, it->block_size);
6✔
384
            }
6✔
385
        }
5✔
386

387
        // Varying-flat size = product of tile dim extents (excluding extent==1).
388
        // This is the address range each thread cooperatively walks within its
389
        // per-thread slot.
390
        symbolic::Expression varying_flat_size = symbolic::integer(1);
5✔
391
        for (auto& s : varying_dim_sizes) {
5✔
392
            varying_flat_size = symbolic::mul(varying_flat_size, s);
5✔
393
        }
5✔
394

395
        // Create the local buffer with NV_Shared storage
396
        types::Array buffer_type(storage_type_, 0, {}, scalar_type, total_size);
5✔
397
        builder.add_container(local_name_, buffer_type);
5✔
398

399
        // Emit: barrier → cooperative copy loop → barrier → main loop
400
        // 1. Barrier before copy
401
        auto& barrier_block1 = builder.add_block_before(*parent, loop_, {}, loop_.debug_info());
5✔
402
        builder.add_library_node<data_flow::BarrierLocalNode>(barrier_block1, {});
5✔
403

404
        // 2. Cooperative copy: for (idx = coop_flat; idx < varying_flat_size; idx += total_coop_threads)
405
        auto idx_name = builder.find_new_name("__daisy_ils_coop_" + this->container_);
5✔
406
        types::Scalar idx_type(types::PrimitiveType::UInt64);
5✔
407
        builder.add_container(idx_name, idx_type);
5✔
408
        auto idx_var = symbolic::symbol(idx_name);
5✔
409

410
        auto& copy_loop = builder.add_map_before(
5✔
411
            *parent,
5✔
412
            loop_,
5✔
413
            idx_var,
5✔
414
            symbolic::Lt(idx_var, varying_flat_size),
5✔
415
            coop_flat,
5✔
416
            symbolic::add(idx_var, total_coop_threads),
5✔
417
            structured_control_flow::ScheduleType_Sequential::create(),
5✔
418
            {},
5✔
419
            loop_.debug_info()
5✔
420
        );
5✔
421

422
        auto& copy_scope = copy_loop.root();
5✔
423
        auto& copy_block = builder.add_block(copy_scope);
5✔
424
        auto& copy_src = builder.add_access(copy_block, this->container_);
5✔
425
        auto& copy_dst = builder.add_access(copy_block, local_name_);
5✔
426
        auto& copy_tasklet = builder.add_tasklet(copy_block, data_flow::TaskletCode::assign, "_out", {"_in"});
5✔
427

428
        // Decompose idx_var into per-varying-dim indices (row-major).
429
        // For a single varying dim this is just idx_var.
430
        std::vector<symbolic::Expression> varying_decomp;
5✔
431
        symbolic::Expression remainder = idx_var;
5✔
432
        for (size_t i = 0; i < varying_dim_sizes.size(); i++) {
10✔
433
            if (i + 1 < varying_dim_sizes.size()) {
5✔
434
                symbolic::Expression divisor = symbolic::integer(1);
×
NEW
435
                for (size_t j = i + 1; j < varying_dim_sizes.size(); j++) {
×
NEW
436
                    divisor = symbolic::mul(divisor, varying_dim_sizes[j]);
×
437
                }
×
NEW
438
                varying_decomp.push_back(symbolic::div(remainder, divisor));
×
439
                remainder = symbolic::mod(remainder, divisor);
×
440
            } else {
5✔
441
                varying_decomp.push_back(remainder);
5✔
442
            }
5✔
443
        }
5✔
444

445
        // Source = original container at (bases — which already use the global
446
        // Map indvars — plus the varying decomposition along each varying dim).
447
        auto copy_src_subset = build_original_subset(varying_decomp);
5✔
448

449
        // Destination = buffer at (per_thread_indices ++ varying_decomp) linearized.
450
        std::vector<symbolic::Expression> dest_indices = per_thread_indices;
5✔
451
        for (auto& v : varying_decomp) dest_indices.push_back(v);
5✔
452
        data_flow::Subset copy_dst_subset = {linearize_exprs(dest_indices)};
5✔
453

454
        builder.add_computational_memlet(copy_block, copy_src, copy_tasklet, "_in", copy_src_subset, pointer_type);
5✔
455
        builder.add_computational_memlet(copy_block, copy_tasklet, "_out", copy_dst, copy_dst_subset, buffer_type);
5✔
456

457
        // 3. Barrier after copy
458
        auto& barrier_block2 = builder.add_block_before(*parent, loop_, {}, loop_.debug_info());
5✔
459
        builder.add_library_node<data_flow::BarrierLocalNode>(barrier_block2, {});
5✔
460
    } else {
18✔
461
        // ============================================================
462
        // CPU SEQUENTIAL PATH
463
        // ============================================================
464
        // Create the local buffer with specified storage type
465
        types::Array buffer_type(storage_type_, 0, {}, scalar_type, total_size);
18✔
466
        builder.add_container(local_name_, buffer_type);
18✔
467

468
        std::vector<symbolic::Symbol> copy_indvars;
18✔
469
        structured_control_flow::Sequence* copy_scope =
18✔
470
            &builder.add_sequence_before(*parent, loop_, {}, loop_.debug_info());
18✔
471
        for (size_t i = 0; i < varying_dims.size(); i++) {
43✔
472
            size_t d = varying_dims[i];
25✔
473
            auto indvar_name = builder.find_new_name("__daisy_ils_" + this->container_ + "_d" + std::to_string(d));
25✔
474
            types::Scalar indvar_type(types::PrimitiveType::UInt64);
25✔
475
            builder.add_container(indvar_name, indvar_type);
25✔
476
            auto indvar = symbolic::symbol(indvar_name);
25✔
477
            copy_indvars.push_back(indvar);
25✔
478

479
            auto init = symbolic::integer(0);
25✔
480
            auto condition = symbolic::Lt(indvar, varying_dim_sizes[i]);
25✔
481
            auto update = symbolic::add(indvar, symbolic::integer(1));
25✔
482

483
            auto& copy_loop = builder.add_map(
25✔
484
                *copy_scope,
25✔
485
                indvar,
25✔
486
                condition,
25✔
487
                init,
25✔
488
                update,
25✔
489
                structured_control_flow::ScheduleType_Sequential::create(),
25✔
490
                {},
25✔
491
                loop_.debug_info()
25✔
492
            );
25✔
493
            copy_scope = &copy_loop.root();
25✔
494
        }
25✔
495

496
        // Create copy block
497
        auto& copy_block = builder.add_block(*copy_scope);
18✔
498
        auto& copy_src = builder.add_access(copy_block, this->container_);
18✔
499
        auto& copy_dst = builder.add_access(copy_block, local_name_);
18✔
500
        auto& copy_tasklet = builder.add_tasklet(copy_block, data_flow::TaskletCode::assign, "_out", {"_in"});
18✔
501

502
        std::vector<symbolic::Expression> copy_exprs(copy_indvars.begin(), copy_indvars.end());
18✔
503
        auto copy_src_subset = build_original_subset(copy_exprs);
18✔
504
        data_flow::Subset copy_dst_subset = {linearize(copy_indvars)};
18✔
505

506
        builder.add_computational_memlet(copy_block, copy_src, copy_tasklet, "_in", copy_src_subset, pointer_type);
18✔
507
        types::Array buffer_type_ref(storage_type_, 0, {}, scalar_type, total_size);
18✔
508
        builder.add_computational_memlet(copy_block, copy_tasklet, "_out", copy_dst, copy_dst_subset, buffer_type_ref);
18✔
509
    }
18✔
510

511
    // ==================================================================
512
    // Update accesses in the main loop to use the local buffer
513
    // ==================================================================
514
    types::Array buffer_type(storage_type_, 0, {}, scalar_type, total_size);
23✔
515
    auto& mla = analysis_manager.get<analysis::MemoryLayoutAnalysis>();
23✔
516

517
    // Recursive helper to traverse all blocks in the loop body
518
    std::function<void(structured_control_flow::ControlFlowNode&)> rewrite_accesses;
23✔
519
    rewrite_accesses = [&](structured_control_flow::ControlFlowNode& node) {
85✔
520
        if (auto* block = dynamic_cast<structured_control_flow::Block*>(&node)) {
85✔
521
            auto& dfg = block->dataflow();
34✔
522

523
            // Collect access nodes to process (avoid iterator invalidation)
524
            std::vector<data_flow::AccessNode*> access_nodes;
34✔
525
            for (auto* access_node : dfg.data_nodes()) {
99✔
526
                if (access_node->data() == this->container_) {
99✔
527
                    access_nodes.push_back(access_node);
28✔
528
                }
28✔
529
            }
99✔
530

531
            for (auto* access : access_nodes) {
34✔
532
                // Classify memlets: group vs non-group
533
                struct MemletRewrite {
28✔
534
                    data_flow::Memlet* memlet;
28✔
535
                    data_flow::Subset local_subset;
28✔
536
                    bool is_outgoing;
28✔
537
                };
28✔
538
                std::vector<MemletRewrite> group_rewrites;
28✔
539
                bool all_in_group = true;
28✔
540

541
                for (auto& memlet : dfg.out_edges(*access)) {
29✔
542
                    if (group_memlets_.count(&memlet) == 0) {
29✔
543
                        all_in_group = false;
2✔
544
                        continue;
2✔
545
                    }
2✔
546
                    auto* acc = mla.access(memlet);
27✔
547
                    if (acc && acc->subset.size() == tile_info_.dimensions.size()) {
27✔
548
                        // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]]
549
                        std::vector<symbolic::Expression> local_indices = per_thread_indices;
27✔
550
                        for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
76✔
551
                            if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) {
49✔
552
                                local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d)));
38✔
553
                            }
38✔
554
                        }
49✔
555
                        symbolic::Expression linear_idx = linearize_exprs(local_indices);
27✔
556
                        group_rewrites.push_back({&memlet, {linear_idx}, true});
27✔
557
                    }
27✔
558
                }
27✔
559
                for (auto& memlet : dfg.in_edges(*access)) {
28✔
560
                    if (group_memlets_.count(&memlet) == 0) {
×
561
                        all_in_group = false;
×
562
                        continue;
×
563
                    }
×
564
                    auto* acc = mla.access(memlet);
×
565
                    if (acc && acc->subset.size() == tile_info_.dimensions.size()) {
×
566
                        // Buffer index: [per-thread thread_idx (X,Y,Z order)] ++ [varying d: subset[d] - base[d]]
NEW
567
                        std::vector<symbolic::Expression> local_indices = per_thread_indices;
×
568
                        for (size_t d = 0; d < tile_info_.dimensions.size(); d++) {
×
569
                            if (!symbolic::eq(tile_info_.dimensions.at(d), symbolic::integer(1))) {
×
570
                                local_indices.push_back(symbolic::sub(acc->subset.at(d), tile_info_.bases.at(d)));
×
571
                            }
×
572
                        }
×
573
                        symbolic::Expression linear_idx = linearize_exprs(local_indices);
×
574
                        group_rewrites.push_back({&memlet, {linear_idx}, false});
×
575
                    }
×
576
                }
×
577

578
                if (group_rewrites.empty()) continue;
28✔
579

580
                if (all_in_group) {
27✔
581
                    // Simple case: all memlets in group → rewrite in-place and rename
582
                    for (auto& rw : group_rewrites) {
26✔
583
                        rw.memlet->set_subset(rw.local_subset);
26✔
584
                        rw.memlet->set_base_type(buffer_type);
26✔
585
                    }
26✔
586
                    access->data(local_name_);
26✔
587
                } else {
26✔
588
                    // Mixed case: split — create new local access node, redirect group memlets
589
                    auto& local_access = builder.add_access(*block, local_name_);
1✔
590
                    for (auto& rw : group_rewrites) {
1✔
591
                        if (rw.is_outgoing) {
1✔
592
                            // outgoing: access→tasklet  →  local_access→tasklet
593
                            auto& dst_node = rw.memlet->dst();
1✔
594
                            auto dst_conn = rw.memlet->dst_conn();
1✔
595
                            builder.remove_memlet(*block, *rw.memlet);
1✔
596
                            builder.add_memlet(
1✔
597
                                *block, local_access, "void", dst_node, dst_conn, rw.local_subset, buffer_type, {}
1✔
598
                            );
1✔
599
                        } else {
1✔
600
                            // incoming: tasklet→access  →  tasklet→local_access
601
                            auto& src_node = rw.memlet->src();
×
602
                            auto src_conn = rw.memlet->src_conn();
×
603
                            builder.remove_memlet(*block, *rw.memlet);
×
604
                            builder.add_memlet(
×
605
                                *block, src_node, src_conn, local_access, "void", rw.local_subset, buffer_type, {}
×
606
                            );
×
607
                        }
×
608
                    }
1✔
609
                }
1✔
610
            }
27✔
611
        } else if (auto* seq = dynamic_cast<structured_control_flow::Sequence*>(&node)) {
51✔
612
            for (size_t i = 0; i < seq->size(); i++) {
85✔
613
                rewrite_accesses(seq->at(i).first);
48✔
614
            }
48✔
615
        } else if (auto* loop = dynamic_cast<structured_control_flow::StructuredLoop*>(&node)) {
37✔
616
            rewrite_accesses(loop->root());
14✔
617
        } else if (auto* if_else = dynamic_cast<structured_control_flow::IfElse*>(&node)) {
14✔
618
            for (size_t i = 0; i < if_else->size(); i++) {
×
619
                rewrite_accesses(if_else->at(i).first);
×
620
            }
×
621
        }
×
622
    };
85✔
623
    rewrite_accesses(loop_.root());
23✔
624

625
    // Cleanup
626
    analysis_manager.invalidate_all();
23✔
627

628
    passes::SequenceFusion sf_pass;
23✔
629
    passes::DeadCFGElimination dce_pass;
23✔
630
    bool applies = false;
23✔
631
    do {
41✔
632
        applies = false;
41✔
633
        applies |= dce_pass.run(builder, analysis_manager);
41✔
634
        applies |= sf_pass.run(builder, analysis_manager);
41✔
635
    } while (applies);
41✔
636
}
23✔
637

638
/// Serializes this transformation into `j`.
///
/// Writes three keys:
///  - "subgraph": entry "0" holds the loop's element id and its type tag
///    ("for" or "map"); entry "1" holds the access node's element id with
///    the type tag "access_node".
///  - "transformation_type": the value returned by name().
///  - "container": the container name stored in this transformation.
///
/// @param j JSON value that receives the description.
/// @throws std::runtime_error if the loop is neither a For nor a Map.
void InLocalStorage::to_json(nlohmann::json& j) const {
    // Classify the loop first; For is tested before Map, and anything else is rejected.
    const bool is_for = dynamic_cast<structured_control_flow::For*>(&loop_) != nullptr;
    const bool is_map = !is_for && dynamic_cast<structured_control_flow::Map*>(&loop_) != nullptr;
    if (!is_for && !is_map) {
        throw std::runtime_error("Unsupported loop type for serialization of loop: " + loop_.indvar()->get_name());
    }
    const std::string loop_type = is_for ? "for" : "map";

    // Entry "0": the loop this transformation targets.
    nlohmann::json loop_entry;
    loop_entry["element_id"] = this->loop_.element_id();
    loop_entry["type"] = loop_type;

    // Entry "1": the access node this transformation targets.
    nlohmann::json node_entry;
    node_entry["element_id"] = this->access_node_.element_id();
    node_entry["type"] = "access_node";

    nlohmann::json subgraph;
    subgraph["0"] = loop_entry;
    subgraph["1"] = node_entry;

    j["subgraph"] = subgraph;
    j["transformation_type"] = this->name();
    j["container"] = container_;
}
6✔
654

655
/// Reconstructs an InLocalStorage transformation from a JSON description.
///
/// Reads `desc["subgraph"]["0"]["element_id"]` as the loop id and
/// `desc["subgraph"]["1"]["element_id"]` as the access-node id, and resolves
/// both through `builder.find_element_by_id`.
///
/// NOTE(review): no storage type is read from `desc`; the two-argument
/// constructor call below relies on a default declared outside this file's
/// visible part — confirm that default is the intended one.
///
/// @param builder Builder used to look up elements by id.
/// @param desc    JSON description of the transformation.
/// @return The transformation bound to the resolved loop and access node.
/// @throws InvalidTransformationDescriptionException if the loop id does not
///         resolve to an element, the element is not a structured loop, or
///         the access-node id does not resolve to an access node.
/// @throws nlohmann::json exceptions if a required key is missing or an id
///         has the wrong type.
InLocalStorage InLocalStorage::from_json(builder::StructuredSDFGBuilder& builder, const nlohmann::json& desc) {
    // Use at() for every lookup: on a const json, operator[] with a missing key
    // is undefined behavior, whereas at() throws a catchable exception.
    const auto& subgraph = desc.at("subgraph");

    const auto loop_id = subgraph.at("0").at("element_id").get<size_t>();
    auto* element = builder.find_element_by_id(loop_id);
    if (!element) {
        throw InvalidTransformationDescriptionException("Element with ID " + std::to_string(loop_id) + " not found.");
    }
    auto* loop = dynamic_cast<structured_control_flow::StructuredLoop*>(element);
    if (!loop) {
        throw InvalidTransformationDescriptionException(
            "Element with ID " + std::to_string(loop_id) + " is not a structured loop."
        );
    }

    // Read the access-node id once and reuse it for both the lookup and the error message.
    const auto access_node_id = subgraph.at("1").at("element_id").get<size_t>();
    auto* access_node = dynamic_cast<data_flow::AccessNode*>(builder.find_element_by_id(access_node_id));
    if (!access_node) {
        // Reached both when the id is unknown and when the element is not an access node.
        throw InvalidTransformationDescriptionException(
            "Access node with ID " + std::to_string(access_node_id) + " not found."
        );
    }

    return InLocalStorage(*loop, *access_node);
}
1✔
679

680
} // namespace transformations
681
} // namespace sdfg
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc