27981272983

Committed 22 Jun 2026 08:18PM UTC coverage: 61.754% (-0.03%) from 61.782%

Build # 27981272983

Build Type

Pull #781

github

Committed by

web-flow

Commit Message

Merge bddaa3724 into fe87d162b

Pull Request Pull Request #781: Extend Segformer benchmarks setup

Coverage Stats

987 of 1432 new or added lines in 62 files covered. (68.92%)

9 existing lines in 7 files now uncovered.

38121 of 61730 relevant lines covered (61.75%)

993.19 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.83

/opt/src/targets/cuda/math/tensor/softmax.cpp

#include "sdfg/targets/cuda/math/tensor/softmax.h"
#include "sdfg/symbolic/symbolic.h"
#include "sdfg/targets/cuda/cuda.h"

namespace sdfg::cuda::tensor {

static constexpr int SOFTMAX_BLOCK_SIZE = 256;

static void emit_softmax_kernel(codegen::PrettyPrinter& ks, const std::string& kernel_name, const std::string& type) {
    // Fused softmax kernel: one block per row, warp-shuffle reductions
    ks << "__global__ void " << kernel_name << "(const " << type << "* __restrict__ input, " << type
       << "* __restrict__ output, int num_rows, int row_size) {" << std::endl;
    ks.setIndent(ks.indent() + 4);

    ks << "int row = blockIdx.x;" << std::endl;
    ks << "if (row >= num_rows) return;" << std::endl;
    ks << std::endl;
    ks << "const " << type << "* row_in = input + row * row_size;" << std::endl;
    ks << type << "* row_out = output + row * row_size;" << std::endl;
    ks << std::endl;

    // Shared memory for cross-warp reduction
    ks << "extern __shared__ " << type << " sdata[];" << std::endl;
    ks << "int lane_id = threadIdx.x & 31;" << std::endl;
    ks << "int warp_id = threadIdx.x >> 5;" << std::endl;
    ks << "int num_warps = (blockDim.x + 31) >> 5;" << std::endl;
    ks << std::endl;

    // Phase 1: find row max
    ks << "// Phase 1: row max" << std::endl;
    ks << type << " thread_max = -INFINITY;" << std::endl;
    ks << "for (int i = threadIdx.x; i < row_size; i += blockDim.x) {" << std::endl;
    ks.setIndent(ks.indent() + 4);
    ks << "thread_max = fmax" << (type == "float" ? "f" : "") << "(thread_max, row_in[i]);" << std::endl;
    ks.setIndent(ks.indent() - 4);
    ks << "}" << std::endl;
    ks << std::endl;

    // Warp-level max reduction
    ks << "// Warp-level max reduction" << std::endl;
    ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;
    ks.setIndent(ks.indent() + 4);
    ks << "thread_max = fmax" << (type == "float" ? "f" : "")
       << "(thread_max, __shfl_xor_sync(0xFFFFFFFF, thread_max, mask));" << std::endl;
    ks.setIndent(ks.indent() - 4);
    ks << "}" << std::endl;
    ks << std::endl;

    // Cross-warp max reduction
    ks << "// Cross-warp max reduction" << std::endl;
    ks << "if (lane_id == 0) sdata[warp_id] = thread_max;" << std::endl;
    ks << "__syncthreads();" << std::endl;
    ks << type << " row_max = (threadIdx.x < num_warps) ? sdata[threadIdx.x] : (" << type << ")(-INFINITY);"
       << std::endl;
    ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;
    ks.setIndent(ks.indent() + 4);
    ks << "row_max = fmax" << (type == "float" ? "f" : "") << "(row_max, __shfl_xor_sync(0xFFFFFFFF, row_max, mask));"
       << std::endl;
    ks.setIndent(ks.indent() - 4);
    ks << "}" << std::endl;
    ks << "if (threadIdx.x == 0) sdata[0] = row_max;" << std::endl;
    ks << "__syncthreads();" << std::endl;
    ks << "row_max = sdata[0];" << std::endl;
    ks << std::endl;

    // Phase 2: exp and sum
    ks << "// Phase 2: exp(x - max) and sum" << std::endl;
    ks << type << " thread_sum = 0;" << std::endl;
    ks << "for (int i = threadIdx.x; i < row_size; i += blockDim.x) {" << std::endl;
    ks.setIndent(ks.indent() + 4);
    ks << type << " val = exp" << (type == "float" ? "f" : "") << "(row_in[i] - row_max);" << std::endl;
    ks << "row_out[i] = val;" << std::endl;
    ks << "thread_sum += val;" << std::endl;
    ks.setIndent(ks.indent() - 4);
    ks << "}" << std::endl;
    ks << std::endl;

    // Warp-level sum reduction
    ks << "// Warp-level sum reduction" << std::endl;
    ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;
    ks.setIndent(ks.indent() + 4);
    ks << "thread_sum += __shfl_xor_sync(0xFFFFFFFF, thread_sum, mask);" << std::endl;
    ks.setIndent(ks.indent() - 4);
    ks << "}" << std::endl;
    ks << std::endl;

    // Cross-warp sum reduction
    ks << "// Cross-warp sum reduction" << std::endl;
    ks << "if (lane_id == 0) sdata[warp_id] = thread_sum;" << std::endl;
    ks << "__syncthreads();" << std::endl;
    ks << type << " row_sum = (threadIdx.x < num_warps) ? sdata[threadIdx.x] : 0;" << std::endl;
    ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;
    ks.setIndent(ks.indent() + 4);
    ks << "row_sum += __shfl_xor_sync(0xFFFFFFFF, row_sum, mask);" << std::endl;
    ks.setIndent(ks.indent() - 4);
    ks << "}" << std::endl;
    ks << "if (threadIdx.x == 0) sdata[0] = row_sum;" << std::endl;
    ks << "__syncthreads();" << std::endl;
    ks << "row_sum = sdata[0];" << std::endl;
    ks << std::endl;

    // Phase 3: normalize
    ks << "// Phase 3: normalize" << std::endl;
    ks << "for (int i = threadIdx.x; i < row_size; i += blockDim.x) {" << std::endl;
    ks.setIndent(ks.indent() + 4);
    ks << "row_out[i] /= row_sum;" << std::endl;
    ks.setIndent(ks.indent() - 4);
    ks << "}" << std::endl;

    ks.setIndent(ks.indent() - 4);
    ks << "}" << std::endl;
}

static void compute_row_dims(
    const sdfg::math::tensor::SoftmaxNode& node,
    codegen::LanguageExtension& lang,
    std::string& num_rows_str,
    std::string& row_size_str
) {
    auto& shape = node.shape();
    auto& axes = node.axes();
    int64_t ndim = static_cast<int64_t>(shape.size());

    // Normalize axes to positive
    std::set<int64_t> reduce_axes;
    for (auto a : axes) {
        reduce_axes.insert(a < 0 ? a + ndim : a);
    }

    // num_rows = product of non-reduced dims, row_size = product of reduced dims
    symbolic::Expression num_rows = symbolic::one();
    symbolic::Expression row_size = symbolic::one();
    for (int64_t i = 0; i < ndim; ++i) {
        if (reduce_axes.count(i)) {
            row_size = symbolic::mul(row_size, shape[i]);
        } else {
            num_rows = symbolic::mul(num_rows, shape[i]);
        }
    }

    num_rows_str = lang.expression(num_rows);
    row_size_str = lang.expression(row_size);
}

static std::string get_type_string(types::PrimitiveType prim_type) {
    switch (prim_type) {
        case types::PrimitiveType::Float:
            return "float";
        case types::PrimitiveType::Double:
            return "double";
        default:
            throw std::runtime_error("Unsupported primitive type for CUDA softmax dispatcher");
    }
}

static void dispatch_softmax_common(
    codegen::CodegenOutput& out,
    std::vector<codegen::DispatchInput>& inputs,
    codegen::LanguageExtension& language_extension,
    const sdfg::math::tensor::SoftmaxNode& node,
    const data_flow::DataFlowGraph& data_flow_graph,
    const std::string& input_ptr,
    const std::string& output_ptr
) {
    auto prim_type = node.primitive_type(data_flow_graph);
    std::string type = get_type_string(prim_type);

    std::string num_rows_str, row_size_str;
    compute_row_dims(node, language_extension, num_rows_str, row_size_str);

    std::string kernel_name = "softmax_kernel_" + std::to_string(node.element_id());

    out.library_snippet_factory.add_global("#include <cuda.h>");
    out.library_snippet_factory.add_global("#include <math.h>");

    // Forward-declare kernel in globals
    out.globals_stream << "__global__ void " << kernel_name << "(const " << type << "* __restrict__ input, " << type
                       << "* __restrict__ output, int num_rows, int row_size);" << std::endl;

    // Emit kernel to .cu file
    auto& kernel_stream = out.library_snippet_factory.require(kernel_name, "cu", true).stream();
    kernel_stream << "#include " << out.library_snippet_factory.header_path().filename() << std::endl << std::endl;
    emit_softmax_kernel(kernel_stream, kernel_name, type);

    // Emit kernel call
    out.stream << "{" << std::endl;
    out.stream.setIndent(out.stream.indent() + 4);

    out.stream << "int __softmax_num_rows = (int)(" << num_rows_str << ");" << std::endl;
    out.stream << "int __softmax_row_size = (int)(" << row_size_str << ");" << std::endl;
    out.stream << "int __softmax_block_size = " << SOFTMAX_BLOCK_SIZE << ";" << std::endl;
    out.stream << "if (__softmax_row_size < __softmax_block_size) __softmax_block_size = __softmax_row_size;"
               << std::endl;
    // Round up to multiple of 32 (warp size)
    out.stream << "__softmax_block_size = ((__softmax_block_size + 31) / 32) * 32;" << std::endl;
    out.stream << "int __softmax_num_warps = __softmax_block_size / 32;" << std::endl;
    out.stream << "size_t __softmax_smem = __softmax_num_warps * sizeof(" << type << ");" << std::endl;
    out.stream << kernel_name << "<<<__softmax_num_rows, __softmax_block_size, __softmax_smem>>>(" << input_ptr << ", "
               << output_ptr << ", __softmax_num_rows, __softmax_row_size);" << std::endl;

    check_cuda_kernel_launch_errors(out.stream, language_extension, false);

    out.stream.setIndent(out.stream.indent() - 4);
    out.stream << "}" << std::endl;
}

// WithTransfers

SoftmaxNodeDispatcher_CUDAWithTransfers::SoftmaxNodeDispatcher_CUDAWithTransfers(
    codegen::LanguageExtension& language_extension,
    const Function& function,
    const data_flow::DataFlowGraph& data_flow_graph,
    const sdfg::math::tensor::SoftmaxNode& node
)
    : codegen::LibraryNodeDispatcher(language_extension, function, data_flow_graph, node) {}

void SoftmaxNodeDispatcher_CUDAWithTransfers::dispatch_code_with_edges(
    codegen::CodegenOutput& out,
    std::vector<codegen::DispatchInput>& inputs,
    std::vector<codegen::DispatchOutput>& outputs
) {
    auto& node = static_cast<const sdfg::math::tensor::SoftmaxNode&>(this->node_);
    auto prim_type = node.primitive_type(this->data_flow_graph_);
    std::string type = get_type_string(prim_type);

    // Connectors: inputs_={"Y", "X"} → inputs[0]=Y (output buffer), inputs[1]=X (input data)
    auto& y_expr = inputs.at(0).expr;
    auto& x_expr = inputs.at(1).expr;

    std::string num_rows_str, row_size_str;
    compute_row_dims(node, this->language_extension_, num_rows_str, row_size_str);

    std::string total_size = "((size_t)(" + num_rows_str + ") * (size_t)(" + row_size_str + ")) * sizeof(" + type + ")";

    out.stream << "{" << std::endl;
    out.stream.setIndent(out.stream.indent() + 4);

    out.stream << "cudaError_t err_cuda;" << std::endl;
    out.stream << type << " *d_input, *d_output;" << std::endl;
    out.stream << "size_t __softmax_total_bytes = " << total_size << ";" << std::endl;

    out.stream << "err_cuda = cudaMalloc((void**) &d_input, __softmax_total_bytes);" << std::endl;
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");
    out.stream << "err_cuda = cudaMalloc((void**) &d_output, __softmax_total_bytes);" << std::endl;
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");

    out.stream << "err_cuda = cudaMemcpy(d_input, " << x_expr << ", __softmax_total_bytes, cudaMemcpyHostToDevice);"
               << std::endl;
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");

    dispatch_softmax_common(out, inputs, this->language_extension_, node, this->data_flow_graph_, "d_input", "d_output");

    out.stream << "err_cuda = cudaMemcpy(" << y_expr << ", d_output, __softmax_total_bytes, cudaMemcpyDeviceToHost);"
               << std::endl;
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");

    out.stream << "err_cuda = cudaFree(d_input);" << std::endl;
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");
    out.stream << "err_cuda = cudaFree(d_output);" << std::endl;
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");

    out.stream.setIndent(out.stream.indent() - 4);
    out.stream << "}" << std::endl;
}

// WithoutTransfers

SoftmaxNodeDispatcher_CUDAWithoutTransfers::SoftmaxNodeDispatcher_CUDAWithoutTransfers(
    codegen::LanguageExtension& language_extension,
    const Function& function,
    const data_flow::DataFlowGraph& data_flow_graph,
    const sdfg::math::tensor::SoftmaxNode& node
)
    : codegen::LibraryNodeDispatcher(language_extension, function, data_flow_graph, node) {}

void SoftmaxNodeDispatcher_CUDAWithoutTransfers::dispatch_code_with_edges(
    codegen::CodegenOutput& out,
    std::vector<codegen::DispatchInput>& inputs,
    std::vector<codegen::DispatchOutput>& outputs
) {
    auto& node = static_cast<const sdfg::math::tensor::SoftmaxNode&>(this->node_);

    // Connectors: inputs_={"Y", "X"} → inputs[0]=Y (output buffer), inputs[1]=X (input data)
    auto& y_expr = inputs.at(0).expr;
    auto& x_expr = inputs.at(1).expr;

    dispatch_softmax_common(out, inputs, this->language_extension_, node, this->data_flow_graph_, x_expr, y_expr);
}

} // namespace sdfg::cuda::tensor

1	#include "sdfg/targets/cuda/math/tensor/softmax.h"
2	#include "sdfg/symbolic/symbolic.h"
3	#include "sdfg/targets/cuda/cuda.h"
4
5	namespace sdfg::cuda::tensor {
6
7	static constexpr int SOFTMAX_BLOCK_SIZE = 256;
8
9	static void emit_softmax_kernel(codegen::PrettyPrinter& ks, const std::string& kernel_name, const std::string& type) {	5✔
10	// Fused softmax kernel: one block per row, warp-shuffle reductions
11	ks << "__global__ void " << kernel_name << "(const " << type << "* __restrict__ input, " << type	5✔
12	<< "* __restrict__ output, int num_rows, int row_size) {" << std::endl;	5✔
13	ks.setIndent(ks.indent() + 4);	5✔
14
15	ks << "int row = blockIdx.x;" << std::endl;	5✔
16	ks << "if (row >= num_rows) return;" << std::endl;	5✔
17	ks << std::endl;	5✔
18	ks << "const " << type << "* row_in = input + row * row_size;" << std::endl;	5✔
19	ks << type << "* row_out = output + row * row_size;" << std::endl;	5✔
20	ks << std::endl;	5✔
21
22	// Shared memory for cross-warp reduction
23	ks << "extern __shared__ " << type << " sdata[];" << std::endl;	5✔
24	ks << "int lane_id = threadIdx.x & 31;" << std::endl;	5✔
25	ks << "int warp_id = threadIdx.x >> 5;" << std::endl;	5✔
26	ks << "int num_warps = (blockDim.x + 31) >> 5;" << std::endl;	5✔
27	ks << std::endl;	5✔
28
29	// Phase 1: find row max
30	ks << "// Phase 1: row max" << std::endl;	5✔
31	ks << type << " thread_max = -INFINITY;" << std::endl;	5✔
32	ks << "for (int i = threadIdx.x; i < row_size; i += blockDim.x) {" << std::endl;	5✔
33	ks.setIndent(ks.indent() + 4);	5✔
34	ks << "thread_max = fmax" << (type == "float" ? "f" : "") << "(thread_max, row_in[i]);" << std::endl;	5✔
35	ks.setIndent(ks.indent() - 4);	5✔
36	ks << "}" << std::endl;	5✔
37	ks << std::endl;	5✔
38
39	// Warp-level max reduction
40	ks << "// Warp-level max reduction" << std::endl;	5✔
41	ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;	5✔
42	ks.setIndent(ks.indent() + 4);	5✔
43	ks << "thread_max = fmax" << (type == "float" ? "f" : "")	5✔
44	<< "(thread_max, __shfl_xor_sync(0xFFFFFFFF, thread_max, mask));" << std::endl;	5✔
45	ks.setIndent(ks.indent() - 4);	5✔
46	ks << "}" << std::endl;	5✔
47	ks << std::endl;	5✔
48
49	// Cross-warp max reduction
50	ks << "// Cross-warp max reduction" << std::endl;	5✔
51	ks << "if (lane_id == 0) sdata[warp_id] = thread_max;" << std::endl;	5✔
52	ks << "__syncthreads();" << std::endl;	5✔
53	ks << type << " row_max = (threadIdx.x < num_warps) ? sdata[threadIdx.x] : (" << type << ")(-INFINITY);"	5✔
54	<< std::endl;	5✔
55	ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;	5✔
56	ks.setIndent(ks.indent() + 4);	5✔
57	ks << "row_max = fmax" << (type == "float" ? "f" : "") << "(row_max, __shfl_xor_sync(0xFFFFFFFF, row_max, mask));"	5✔
58	<< std::endl;	5✔
59	ks.setIndent(ks.indent() - 4);	5✔
60	ks << "}" << std::endl;	5✔
61	ks << "if (threadIdx.x == 0) sdata[0] = row_max;" << std::endl;	5✔
62	ks << "__syncthreads();" << std::endl;	5✔
63	ks << "row_max = sdata[0];" << std::endl;	5✔
64	ks << std::endl;	5✔
65
66	// Phase 2: exp and sum
67	ks << "// Phase 2: exp(x - max) and sum" << std::endl;	5✔
68	ks << type << " thread_sum = 0;" << std::endl;	5✔
69	ks << "for (int i = threadIdx.x; i < row_size; i += blockDim.x) {" << std::endl;	5✔
70	ks.setIndent(ks.indent() + 4);	5✔
71	ks << type << " val = exp" << (type == "float" ? "f" : "") << "(row_in[i] - row_max);" << std::endl;	5✔
72	ks << "row_out[i] = val;" << std::endl;	5✔
73	ks << "thread_sum += val;" << std::endl;	5✔
74	ks.setIndent(ks.indent() - 4);	5✔
75	ks << "}" << std::endl;	5✔
76	ks << std::endl;	5✔
77
78	// Warp-level sum reduction
79	ks << "// Warp-level sum reduction" << std::endl;	5✔
80	ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;	5✔
81	ks.setIndent(ks.indent() + 4);	5✔
82	ks << "thread_sum += __shfl_xor_sync(0xFFFFFFFF, thread_sum, mask);" << std::endl;	5✔
83	ks.setIndent(ks.indent() - 4);	5✔
84	ks << "}" << std::endl;	5✔
85	ks << std::endl;	5✔
86
87	// Cross-warp sum reduction
88	ks << "// Cross-warp sum reduction" << std::endl;	5✔
89	ks << "if (lane_id == 0) sdata[warp_id] = thread_sum;" << std::endl;	5✔
90	ks << "__syncthreads();" << std::endl;	5✔
91	ks << type << " row_sum = (threadIdx.x < num_warps) ? sdata[threadIdx.x] : 0;" << std::endl;	5✔
92	ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;	5✔
93	ks.setIndent(ks.indent() + 4);	5✔
94	ks << "row_sum += __shfl_xor_sync(0xFFFFFFFF, row_sum, mask);" << std::endl;	5✔
95	ks.setIndent(ks.indent() - 4);	5✔
96	ks << "}" << std::endl;	5✔
97	ks << "if (threadIdx.x == 0) sdata[0] = row_sum;" << std::endl;	5✔
98	ks << "__syncthreads();" << std::endl;	5✔
99	ks << "row_sum = sdata[0];" << std::endl;	5✔
100	ks << std::endl;	5✔
101
102	// Phase 3: normalize
103	ks << "// Phase 3: normalize" << std::endl;	5✔
104	ks << "for (int i = threadIdx.x; i < row_size; i += blockDim.x) {" << std::endl;	5✔
105	ks.setIndent(ks.indent() + 4);	5✔
106	ks << "row_out[i] /= row_sum;" << std::endl;	5✔
107	ks.setIndent(ks.indent() - 4);	5✔
108	ks << "}" << std::endl;	5✔
109
110	ks.setIndent(ks.indent() - 4);	5✔
111	ks << "}" << std::endl;	5✔
112	}	5✔
113
114	static void compute_row_dims(
115	const sdfg::math::tensor::SoftmaxNode& node,
116	codegen::LanguageExtension& lang,
117	std::string& num_rows_str,
118	std::string& row_size_str
119	) {	7✔
120	auto& shape = node.shape();	7✔
121	auto& axes = node.axes();	7✔
122	int64_t ndim = static_cast<int64_t>(shape.size());	7✔
123
124	// Normalize axes to positive
125	std::set<int64_t> reduce_axes;	7✔
126	for (auto a : axes) {	7✔
127	reduce_axes.insert(a < 0 ? a + ndim : a);	7✔
128	}	7✔
129
130	// num_rows = product of non-reduced dims, row_size = product of reduced dims
131	symbolic::Expression num_rows = symbolic::one();	7✔
132	symbolic::Expression row_size = symbolic::one();	7✔
133	for (int64_t i = 0; i < ndim; ++i) {	27✔
134	if (reduce_axes.count(i)) {	20✔
135	row_size = symbolic::mul(row_size, shape[i]);	7✔
136	} else {	13✔
137	num_rows = symbolic::mul(num_rows, shape[i]);	13✔
138	}	13✔
139	}	20✔
140
141	num_rows_str = lang.expression(num_rows);	7✔
142	row_size_str = lang.expression(row_size);	7✔
143	}	7✔
144
145	static std::string get_type_string(types::PrimitiveType prim_type) {	7✔
146	switch (prim_type) {	7✔
147	case types::PrimitiveType::Float:	7✔
148	return "float";	7✔
NEW 149	case types::PrimitiveType::Double:	×
NEW 150	return "double";	×
NEW 151	default:	×
NEW 152	throw std::runtime_error("Unsupported primitive type for CUDA softmax dispatcher");	×
153	}	7✔
154	}	7✔
155
156	static void dispatch_softmax_common(
157	codegen::CodegenOutput& out,
158	std::vector<codegen::DispatchInput>& inputs,
159	codegen::LanguageExtension& language_extension,
160	const sdfg::math::tensor::SoftmaxNode& node,
161	const data_flow::DataFlowGraph& data_flow_graph,
162	const std::string& input_ptr,
163	const std::string& output_ptr
164	) {	5✔
165	auto prim_type = node.primitive_type(data_flow_graph);	5✔
166	std::string type = get_type_string(prim_type);	5✔
167
168	std::string num_rows_str, row_size_str;	5✔
169	compute_row_dims(node, language_extension, num_rows_str, row_size_str);	5✔
170
171	std::string kernel_name = "softmax_kernel_" + std::to_string(node.element_id());	5✔
172
173	out.library_snippet_factory.add_global("#include <cuda.h>");	5✔
174	out.library_snippet_factory.add_global("#include <math.h>");	5✔
175
176	// Forward-declare kernel in globals
177	out.globals_stream << "__global__ void " << kernel_name << "(const " << type << "* __restrict__ input, " << type	5✔
178	<< "* __restrict__ output, int num_rows, int row_size);" << std::endl;	5✔
179
180	// Emit kernel to .cu file
181	auto& kernel_stream = out.library_snippet_factory.require(kernel_name, "cu", true).stream();	5✔
182	kernel_stream << "#include " << out.library_snippet_factory.header_path().filename() << std::endl << std::endl;	5✔
183	emit_softmax_kernel(kernel_stream, kernel_name, type);	5✔
184
185	// Emit kernel call
186	out.stream << "{" << std::endl;	5✔
187	out.stream.setIndent(out.stream.indent() + 4);	5✔
188
189	out.stream << "int __softmax_num_rows = (int)(" << num_rows_str << ");" << std::endl;	5✔
190	out.stream << "int __softmax_row_size = (int)(" << row_size_str << ");" << std::endl;	5✔
191	out.stream << "int __softmax_block_size = " << SOFTMAX_BLOCK_SIZE << ";" << std::endl;	5✔
192	out.stream << "if (__softmax_row_size < __softmax_block_size) __softmax_block_size = __softmax_row_size;"	5✔
193	<< std::endl;	5✔
194	// Round up to multiple of 32 (warp size)
195	out.stream << "__softmax_block_size = ((__softmax_block_size + 31) / 32) * 32;" << std::endl;	5✔
196	out.stream << "int __softmax_num_warps = __softmax_block_size / 32;" << std::endl;	5✔
197	out.stream << "size_t __softmax_smem = __softmax_num_warps * sizeof(" << type << ");" << std::endl;	5✔
198	out.stream << kernel_name << "<<<__softmax_num_rows, __softmax_block_size, __softmax_smem>>>(" << input_ptr << ", "	5✔
199	<< output_ptr << ", __softmax_num_rows, __softmax_row_size);" << std::endl;	5✔
200
201	check_cuda_kernel_launch_errors(out.stream, language_extension, false);	5✔
202
203	out.stream.setIndent(out.stream.indent() - 4);	5✔
204	out.stream << "}" << std::endl;	5✔
205	}	5✔
206
207	// WithTransfers
208
209	SoftmaxNodeDispatcher_CUDAWithTransfers::SoftmaxNodeDispatcher_CUDAWithTransfers(
210	codegen::LanguageExtension& language_extension,
211	const Function& function,
212	const data_flow::DataFlowGraph& data_flow_graph,
213	const sdfg::math::tensor::SoftmaxNode& node
214	)
215	: codegen::LibraryNodeDispatcher(language_extension, function, data_flow_graph, node) {}	2✔
216
217	void SoftmaxNodeDispatcher_CUDAWithTransfers::dispatch_code_with_edges(
218	codegen::CodegenOutput& out,
219	std::vector<codegen::DispatchInput>& inputs,
220	std::vector<codegen::DispatchOutput>& outputs
221	) {	2✔
222	auto& node = static_cast<const sdfg::math::tensor::SoftmaxNode&>(this->node_);	2✔
223	auto prim_type = node.primitive_type(this->data_flow_graph_);	2✔
224	std::string type = get_type_string(prim_type);	2✔
225
226	// Connectors: inputs_={"Y", "X"} → inputs[0]=Y (output buffer), inputs[1]=X (input data)
227	auto& y_expr = inputs.at(0).expr;	2✔
228	auto& x_expr = inputs.at(1).expr;	2✔
229
230	std::string num_rows_str, row_size_str;	2✔
231	compute_row_dims(node, this->language_extension_, num_rows_str, row_size_str);	2✔
232
233	std::string total_size = "((size_t)(" + num_rows_str + ") * (size_t)(" + row_size_str + ")) * sizeof(" + type + ")";	2✔
234
235	out.stream << "{" << std::endl;	2✔
236	out.stream.setIndent(out.stream.indent() + 4);	2✔
237
238	out.stream << "cudaError_t err_cuda;" << std::endl;	2✔
239	out.stream << type << " d_input, d_output;" << std::endl;	2✔
240	out.stream << "size_t __softmax_total_bytes = " << total_size << ";" << std::endl;	2✔
241
242	out.stream << "err_cuda = cudaMalloc((void**) &d_input, __softmax_total_bytes);" << std::endl;	2✔
243	cuda_error_checking(out.stream, this->language_extension_, "err_cuda");	2✔
244	out.stream << "err_cuda = cudaMalloc((void**) &d_output, __softmax_total_bytes);" << std::endl;	2✔
245	cuda_error_checking(out.stream, this->language_extension_, "err_cuda");	2✔
246
247	out.stream << "err_cuda = cudaMemcpy(d_input, " << x_expr << ", __softmax_total_bytes, cudaMemcpyHostToDevice);"	2✔
248	<< std::endl;	2✔
249	cuda_error_checking(out.stream, this->language_extension_, "err_cuda");	2✔
250
251	dispatch_softmax_common(out, inputs, this->language_extension_, node, this->data_flow_graph_, "d_input", "d_output");	2✔
252
253	out.stream << "err_cuda = cudaMemcpy(" << y_expr << ", d_output, __softmax_total_bytes, cudaMemcpyDeviceToHost);"	2✔
254	<< std::endl;	2✔
255	cuda_error_checking(out.stream, this->language_extension_, "err_cuda");	2✔
256
257	out.stream << "err_cuda = cudaFree(d_input);" << std::endl;	2✔
258	cuda_error_checking(out.stream, this->language_extension_, "err_cuda");	2✔
259	out.stream << "err_cuda = cudaFree(d_output);" << std::endl;	2✔
260	cuda_error_checking(out.stream, this->language_extension_, "err_cuda");	2✔
261
262	out.stream.setIndent(out.stream.indent() - 4);	2✔
263	out.stream << "}" << std::endl;	2✔
264	}	2✔
265
266	// WithoutTransfers
267
268	SoftmaxNodeDispatcher_CUDAWithoutTransfers::SoftmaxNodeDispatcher_CUDAWithoutTransfers(
269	codegen::LanguageExtension& language_extension,
270	const Function& function,
271	const data_flow::DataFlowGraph& data_flow_graph,
272	const sdfg::math::tensor::SoftmaxNode& node
273	)
274	: codegen::LibraryNodeDispatcher(language_extension, function, data_flow_graph, node) {}	3✔
275
276	void SoftmaxNodeDispatcher_CUDAWithoutTransfers::dispatch_code_with_edges(
277	codegen::CodegenOutput& out,
278	std::vector<codegen::DispatchInput>& inputs,
279	std::vector<codegen::DispatchOutput>& outputs
280	) {	3✔
281	auto& node = static_cast<const sdfg::math::tensor::SoftmaxNode&>(this->node_);	3✔
282
283	// Connectors: inputs_={"Y", "X"} → inputs[0]=Y (output buffer), inputs[1]=X (input data)
284	auto& y_expr = inputs.at(0).expr;	3✔
285	auto& x_expr = inputs.at(1).expr;	3✔
286
287	dispatch_softmax_common(out, inputs, this->language_extension_, node, this->data_flow_graph_, x_expr, y_expr);	3✔
288	}	3✔
289
290	} // namespace sdfg::cuda::tensor

daisytuner / docc / 27981272983

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous