28303955550

Committed 27 Jun 2026 10:38PM UTC coverage: 61.924% (+0.2%) from 61.754%

Build # 28303955550

Build Type

Pull #814

github

Committed by

web-flow

Commit Message

Merge 89b94697f into 8322f5994

Pull Request Pull Request #814: Adds GPU reduce dispatchers

Coverage Stats

568 of 859 new or added lines in 16 files covered. (66.12%)

3 existing lines in 1 file now uncovered.

39450 of 63707 relevant lines covered (61.92%)

967.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

78.22

/opt/src/transformations/offloading/cuda_parallelize_nested_map.cpp

#include "sdfg/transformations/offloading/cuda_parallelize_nested_map.h"

#include <sdfg/analysis/loop_analysis.h>
#include "sdfg/exceptions.h"
#include "sdfg/structured_control_flow/reduce.h"
#include "sdfg/symbolic/symbolic.h"
#include "sdfg/targets/cuda/cuda.h"
#include "sdfg/types/pointer.h"
#include "sdfg/types/scalar.h"

namespace sdfg {
namespace transformations {

/// Records the loop this transformation targets together with the block size
/// that apply() later writes into the loop's new CUDA schedule.
///
/// @param loop        Loop node to be parallelized.
/// @param block_size  Block size for the CUDA dimension assigned to `loop`.
CUDAParallelizeNestedMap::CUDAParallelizeNestedMap(
    structured_control_flow::StructuredLoop& loop,
    size_t block_size
)
    : loop_(loop),
      block_size_(block_size) {}

/// Returns the fixed identifier of this transformation.
std::string CUDAParallelizeNestedMap::name() const {
    return std::string("CUDAParallelizeNestedMap");
}

/// Decides whether `loop_` can be turned into an additional CUDA dimension
/// nested inside an already CUDA-scheduled parent map.
///
/// The transformation is applicable only if all of the following hold:
///  - `loop_` is currently scheduled sequentially,
///  - if `loop_` is a Reduce, every reduction container is a pointer with a
///    scalar pointee type,
///  - `loop_` has a parent loop, that parent is a Map with a CUDA schedule, and
///    the parent does not already occupy the Z dimension,
///  - the loop condition does not mention the induction variable of any
///    enclosing Map,
///  - the block size is non-zero, and
///  - for a constant iteration count, the resulting grid dimension
///    ceil(iterations / block_size) does not exceed 65535.
///
/// @param builder           Builder whose subject SDFG provides container types.
/// @param analysis_manager  Source of the LoopAnalysis used to walk parent loops.
/// @return true if apply() may be called, false otherwise.
/// @throws InvalidTransformationDescriptionException if `loop_` is neither a
///         Map nor a Reduce.
bool CUDAParallelizeNestedMap::
    can_be_applied(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {
    if (dynamic_cast<structured_control_flow::Map*>(&loop_) == nullptr &&
        dynamic_cast<structured_control_flow::Reduce*>(&loop_) == nullptr) {
        throw InvalidTransformationDescriptionException(
            "CUDAParallelizeNestedMap: can only parallelize Map or Reduce nodes."
        );
    }

    auto& loop_analysis = analysis_manager.get<analysis::LoopAnalysis>();

    // Condition: Check if map is not yet parallelized with CUDA
    if (loop_.schedule_type().value() != ScheduleType_Sequential::value()) {
        return false;
    }

    // Condition: a nested Reduce can only be offloaded when every accumulator is
    // a device-resident pointer to a scalar.
    if (auto* reduce = dynamic_cast<structured_control_flow::Reduce*>(&loop_)) {
        auto& sdfg = builder.subject();
        for (auto& reduction : reduce->reductions()) {
            auto& type = sdfg.type(reduction.container);
            auto* ptr = dynamic_cast<const types::Pointer*>(&type);
            if (ptr == nullptr || !ptr->has_pointee_type() ||
                dynamic_cast<const types::Scalar*>(&ptr->pointee_type()) == nullptr) {
                return false;
            }
        }
    }

    // Condition: Check if parent loop exists
    auto parent = loop_analysis.parent_loop(&loop_);
    if (parent == nullptr) {
        return false;
    }

    // Condition: Check if parent loop is a CUDA map, and not Z dimension (final dimension)
    auto map = dynamic_cast<structured_control_flow::Map*>(parent);
    if (map == nullptr) {
        return false;
    }
    if (map->schedule_type().value() != cuda::ScheduleType_CUDA::value()) {
        return false;
    }
    if (cuda::ScheduleType_CUDA::dimension(map->schedule_type()) == cuda::CUDADimension::Z) {
        return false;
    }

    // Condition: the loop bound must not depend on the induction variable of
    // any enclosing map (the direct parent included).
    auto ancestor = parent;
    while (ancestor) {
        if (auto map_ancestor = dynamic_cast<structured_control_flow::Map*>(ancestor)) {
            auto ancestor_indvar = map_ancestor->indvar();
            for (auto& arg : symbolic::atoms(loop_.condition())) {
                if (symbolic::eq(arg, ancestor_indvar)) {
                    return false;
                }
            }
        }
        ancestor = loop_analysis.parent_loop(ancestor);
    }

    // Note: arbitrary `init` and `stride` are permitted. The CUDA dispatcher
    // emits `<map.indvar> = init + thread_flat_id * stride`, so the body sees
    // the natural strided value; `num_iterations()` accounts for both when
    // computing the grid geometry.

    // Condition: a block size of zero is not a valid launch configuration and
    // would make the grid-size division below divide by zero.
    if (block_size_ == 0) {
        return false;
    }

    // Condition: Resulting CUDA grid dimension must not exceed hardware limits.
    // Y and Z grid dimensions are limited to 65535.
    auto num_iters = loop_.num_iterations();
    if (!num_iters.is_null() && SymEngine::is_a<SymEngine::Integer>(*num_iters)) {
        const int64_t iters = SymEngine::down_cast<const SymEngine::Integer&>(*num_iters).as_int();
        // A non-positive iteration count never needs more than zero blocks, so
        // only positive counts have to be checked against the limit.
        if (iters > 0) {
            constexpr uint64_t max_grid_dim_yz = 65535;
            const uint64_t block = static_cast<uint64_t>(block_size_);
            // ceil(iters / block) written as (iters - 1) / block + 1 so that the
            // intermediate value cannot overflow.
            const uint64_t grid_size = (static_cast<uint64_t>(iters) - 1) / block + 1;
            if (grid_size > max_grid_dim_yz) {
                return false;
            }
        }
    }

    return true;
}

/// Re-schedules `loop_` as the next CUDA dimension below its parent map.
///
/// A parent in dimension X yields a child in Y, a parent in Y yields a child in
/// Z. The new schedule carries `block_size_` as its block size and replaces the
/// loop's current schedule through the builder.
///
/// @param builder           Builder used to update the schedule of `loop_`.
/// @param analysis_manager  Source of the LoopAnalysis used to find the parent loop.
/// @throws InvalidSDFGException if `loop_` has no parent Map with a CUDA
///         schedule, or if that parent already occupies the Z dimension.
void CUDAParallelizeNestedMap::apply(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {
    auto& loop_analysis = analysis_manager.get<analysis::LoopAnalysis>();

    // Validate the parent instead of casting blindly: apply() may be reached
    // without a preceding successful can_be_applied().
    auto* parent_map = dynamic_cast<structured_control_flow::Map*>(loop_analysis.parent_loop(&loop_));
    if (parent_map == nullptr || parent_map->schedule_type().value() != cuda::ScheduleType_CUDA::value()) {
        throw InvalidSDFGException("CUDAParallelizeNestedMap: parent loop must be a CUDA-scheduled map.");
    }

    auto parent_dim = cuda::ScheduleType_CUDA::dimension(parent_map->schedule_type());

    cuda::CUDADimension child_dim;
    if (parent_dim == cuda::CUDADimension::X) {
        child_dim = cuda::CUDADimension::Y;
    } else if (parent_dim == cuda::CUDADimension::Y) {
        child_dim = cuda::CUDADimension::Z;
    } else {
        throw InvalidSDFGException("Parent loop is Z dimension, cannot parallelize nested map.");
    }

    auto new_schedule = cuda::ScheduleType_CUDA::create();
    cuda::ScheduleType_CUDA::dimension(new_schedule, child_dim);
    cuda::ScheduleType_CUDA::block_size(new_schedule, symbolic::integer(block_size_));

    builder.update_schedule_type(loop_, new_schedule);
}

/// Writes a description of this transformation into `j`.
///
/// Produced layout:
///  - "transformation_type": the value of name()
///  - "parameters": object holding "block_size"
///  - "subgraph": object whose entry "0" is the serialized form of `loop_`
///
/// @param j  JSON value that receives the description.
void CUDAParallelizeNestedMap::to_json(nlohmann::json& j) const {
    j["transformation_type"] = this->name();

    auto& parameters = j["parameters"];
    parameters = nlohmann::json::object();
    parameters["block_size"] = block_size_;

    auto& subgraph = j["subgraph"];
    subgraph = nlohmann::json::object();
    auto& node_desc = subgraph["0"];
    node_desc = nlohmann::json::object();

    // NOTE(review): the `false` argument presumably selects a flat
    // (non-recursive) node serialization - confirm against JSONSerializer.
    serializer::JSONSerializer flat_serializer(false);
    flat_serializer.serialize_node(node_desc, loop_);
}

/// Rebuilds the transformation from its JSON description.
///
/// Reads the target element id from `j["subgraph"]["0"]["element_id"]` and the
/// block size from `j["parameters"]["block_size"]`; both entries are required
/// and are accessed with `at()`. No other layout is consulted.
///
/// @param builder  Builder used to look up the element by id.
/// @param j        JSON description of the transformation.
/// @return A transformation bound to the loop with the given element id.
/// @throws InvalidTransformationDescriptionException if the element found for
///         the id is not a StructuredLoop.
CUDAParallelizeNestedMap CUDAParallelizeNestedMap::
    from_json(builder::StructuredSDFGBuilder& builder, const nlohmann::json& j) {
    const size_t element_id = j.at("subgraph").at("0").at("element_id").get<size_t>();
    const size_t requested_block_size = j.at("parameters").at("block_size").get<size_t>();

    auto* element = builder.find_element_by_id(element_id);
    if (auto* target = dynamic_cast<structured_control_flow::StructuredLoop*>(element)) {
        return CUDAParallelizeNestedMap(*target, requested_block_size);
    }

    throw InvalidTransformationDescriptionException("Element with ID " + std::to_string(element_id) + " is not a loop.");
}

} // namespace transformations
} // namespace sdfg

1	#include "sdfg/transformations/offloading/cuda_parallelize_nested_map.h"
2
3	#include <sdfg/analysis/loop_analysis.h>
4	#include "sdfg/exceptions.h"
5	#include "sdfg/structured_control_flow/reduce.h"
6	#include "sdfg/symbolic/symbolic.h"
7	#include "sdfg/targets/cuda/cuda.h"
8	#include "sdfg/types/pointer.h"
9	#include "sdfg/types/scalar.h"
10
11	namespace sdfg {
12	namespace transformations {
13
14	CUDAParallelizeNestedMap::CUDAParallelizeNestedMap(structured_control_flow::StructuredLoop& loop, size_t block_size)
15	: loop_(loop), block_size_(block_size) {}	16✔
16
17	std::string CUDAParallelizeNestedMap::name() const { return "CUDAParallelizeNestedMap"; }	3✔
18
19	bool CUDAParallelizeNestedMap::
20	can_be_applied(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {	12✔
21	if (dynamic_cast<structured_control_flow::Map*>(&loop_) == nullptr &&	12✔
22	dynamic_cast<structured_control_flow::Reduce*>(&loop_) == nullptr) {	12✔
NEW 23	throw InvalidTransformationDescriptionException(	×
NEW 24	"CUDAParallelizeNestedMap: can only parallelize Map or Reduce nodes."	×
NEW 25	);	×
NEW 26	}	×
27
28	auto& loop_analysis = analysis_manager.get<analysis::LoopAnalysis>();	12✔
29
30	// Condition: Check if map is not yet parallelized with CUDA
31	if (loop_.schedule_type().value() != ScheduleType_Sequential::value()) {	12✔
32	return false;	1✔
33	}	1✔
34
35	// Condition: a nested Reduce can only be offloaded when every accumulator is
36	// a device-resident pointer to a scalar.
37	if (auto* reduce = dynamic_cast<structured_control_flow::Reduce*>(&loop_)) {	11✔
NEW 38	auto& sdfg = builder.subject();	×
NEW 39	for (auto& reduction : reduce->reductions()) {	×
NEW 40	auto& type = sdfg.type(reduction.container);	×
NEW 41	auto* ptr = dynamic_cast<const types::Pointer*>(&type);	×
NEW 42	if (ptr == nullptr || !ptr->has_pointee_type() ||	×
NEW 43	dynamic_cast<const types::Scalar*>(&ptr->pointee_type()) == nullptr) {	×
NEW 44	return false;	×
NEW 45	}	×
NEW 46	}	×
NEW 47	}	×
48
49	// Condition: Check if parent loop exists
50	auto parent = loop_analysis.parent_loop(&loop_);	11✔
51	if (parent == nullptr) {	11✔
52	return false;	1✔
53	}	1✔
54
55	// Condition: Check if parent loop is a CUDA map, and not Z dimension (final dimension)
56	if (auto map = dynamic_cast<structured_control_flow::Map*>(parent)) {	10✔
57	if (map->schedule_type().value() != cuda::ScheduleType_CUDA::value()) {	10✔
58	return false;	1✔
59	}	1✔
60	if (cuda::ScheduleType_CUDA::dimension(map->schedule_type()) == cuda::CUDADimension::Z) {	9✔
61	return false;	1✔
62	}	1✔
63	auto parent_indvar = map->indvar();	8✔
64	auto ancestor = parent;	8✔
65	while (ancestor) {	18✔
66	if (auto map_ancestor = dynamic_cast<structured_control_flow::Map*>(ancestor)) {	10✔
67	parent_indvar = map_ancestor->indvar();	9✔
68	for (auto& arg : symbolic::atoms(loop_.condition())) {	11✔
69	if (symbolic::eq(arg, parent_indvar)) {	11✔
70	return false;	×
71	}	×
72	}	11✔
73	}	9✔
74	ancestor = loop_analysis.parent_loop(ancestor);	10✔
75	}	10✔
76	} else {	8✔
77	return false;	×
78	}	×
79
80	// Note: arbitrary `init` and `stride` are permitted. The CUDA dispatcher
81	// emits `<map.indvar> = init + thread_flat_id * stride`, so the body sees
82	// the natural strided value; `num_iterations()` accounts for both when
83	// computing the grid geometry.
84
85	// Condition: Resulting CUDA grid dimension must not exceed hardware limits.
86	// Y and Z grid dimensions are limited to 65535.
87	auto num_iters = loop_.num_iterations();	8✔
88	if (!num_iters.is_null() && SymEngine::is_a<SymEngine::Integer>(*num_iters)) {	8✔
89	int64_t iters = SymEngine::down_cast<const SymEngine::Integer&>(*num_iters).as_int();	6✔
90	int64_t block = static_cast<int64_t>(block_size_);	6✔
91	int64_t grid_size = (iters + block - 1) / block;	6✔
92
93	constexpr int64_t max_grid_dim_yz = 65535;	6✔
94	if (grid_size > max_grid_dim_yz) {	6✔
95	return false;	1✔
96	}	1✔
97	}	6✔
98
99	return true;	7✔
100	}	8✔
101
102	void CUDAParallelizeNestedMap::apply(builder::StructuredSDFGBuilder& builder, analysis::AnalysisManager& analysis_manager) {	5✔
103	auto& loop_analysis = analysis_manager.get<analysis::LoopAnalysis>();	5✔
104	auto parent = loop_analysis.parent_loop(&loop_);	5✔
105
106	auto parent_dim =	5✔
107	cuda::ScheduleType_CUDA::dimension(static_cast<structured_control_flow::Map*>(parent)->schedule_type());	5✔
108
109	cuda::CUDADimension child_dim;	5✔
110	if (parent_dim == cuda::CUDADimension::X) {	5✔
111	child_dim = cuda::CUDADimension::Y;	4✔
112	} else if (parent_dim == cuda::CUDADimension::Y) {	4✔
113	child_dim = cuda::CUDADimension::Z;	1✔
114	} else {	1✔
115	throw InvalidSDFGException("Parent loop is Z dimension, cannot parallelize nested map.");	×
116	}	×
117
118	auto new_schedule = cuda::ScheduleType_CUDA::create();	5✔
119	cuda::ScheduleType_CUDA::dimension(new_schedule, child_dim);	5✔
120	cuda::ScheduleType_CUDA::block_size(new_schedule, symbolic::integer(block_size_));	5✔
121
122	builder.update_schedule_type(loop_, new_schedule);	5✔
123	}	5✔
124
125	void CUDAParallelizeNestedMap::to_json(nlohmann::json& j) const {	1✔
126	j["transformation_type"] = this->name();	1✔
127	j["parameters"] = nlohmann::json::object();	1✔
128	j["parameters"]["block_size"] = block_size_;	1✔
129
130	serializer::JSONSerializer ser_flat(false);	1✔
131	j["subgraph"] = nlohmann::json::object();	1✔
132	j["subgraph"]["0"] = nlohmann::json::object();	1✔
133	ser_flat.serialize_node(j["subgraph"]["0"], loop_);	1✔
134	}	1✔
135
136	CUDAParallelizeNestedMap CUDAParallelizeNestedMap::
137	from_json(builder::StructuredSDFGBuilder& builder, const nlohmann::json& j) {	1✔
138	// Prefer the embedding-compatible representation (subgraph/parameters),
139	// but fall back to legacy fields (loop/block_size) if needed.
140	const auto& subgraph = j.at("subgraph");	1✔
141	const auto& node_desc = subgraph.at("0");	1✔
142	size_t loop_id = node_desc.at("element_id").get<size_t>();	1✔
143
144	size_t block_size = j.at("parameters").at("block_size").get<size_t>();	1✔
145	auto loop = dynamic_cast<structured_control_flow::StructuredLoop*>(builder.find_element_by_id(loop_id));	1✔
146	if (!loop) {	1✔
147	throw InvalidTransformationDescriptionException("Element with ID " + std::to_string(loop_id) + " is not a loop.");	×
148	}	×
149	return CUDAParallelizeNestedMap(*loop, block_size);	1✔
150	}	1✔
151
152	} // namespace transformations
153	} // namespace sdfg

daisytuner / docc / 28303955550

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous