• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

daisytuner / docc / 27981272983

22 Jun 2026 08:18PM UTC coverage: 61.754% (-0.03%) from 61.782%
27981272983

Pull #781

github

web-flow
Merge bddaa3724 into fe87d162b
Pull Request #781: Extend Segformer benchmarks setup

987 of 1432 new or added lines in 62 files covered. (68.92%)

9 existing lines in 7 files now uncovered.

38121 of 61730 relevant lines covered (61.75%)

993.19 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.83
/opt/src/targets/cuda/math/tensor/softmax.cpp
1
#include "sdfg/targets/cuda/math/tensor/softmax.h"
2
#include "sdfg/symbolic/symbolic.h"
3
#include "sdfg/targets/cuda/cuda.h"
4

5
namespace sdfg::cuda::tensor {
6

7
static constexpr int SOFTMAX_BLOCK_SIZE = 256;
8

9
static void emit_softmax_kernel(codegen::PrettyPrinter& ks, const std::string& kernel_name, const std::string& type) {
5✔
10
    // Fused softmax kernel: one block per row, warp-shuffle reductions
11
    ks << "__global__ void " << kernel_name << "(const " << type << "* __restrict__ input, " << type
5✔
12
       << "* __restrict__ output, int num_rows, int row_size) {" << std::endl;
5✔
13
    ks.setIndent(ks.indent() + 4);
5✔
14

15
    ks << "int row = blockIdx.x;" << std::endl;
5✔
16
    ks << "if (row >= num_rows) return;" << std::endl;
5✔
17
    ks << std::endl;
5✔
18
    ks << "const " << type << "* row_in = input + row * row_size;" << std::endl;
5✔
19
    ks << type << "* row_out = output + row * row_size;" << std::endl;
5✔
20
    ks << std::endl;
5✔
21

22
    // Shared memory for cross-warp reduction
23
    ks << "extern __shared__ " << type << " sdata[];" << std::endl;
5✔
24
    ks << "int lane_id = threadIdx.x & 31;" << std::endl;
5✔
25
    ks << "int warp_id = threadIdx.x >> 5;" << std::endl;
5✔
26
    ks << "int num_warps = (blockDim.x + 31) >> 5;" << std::endl;
5✔
27
    ks << std::endl;
5✔
28

29
    // Phase 1: find row max
30
    ks << "// Phase 1: row max" << std::endl;
5✔
31
    ks << type << " thread_max = -INFINITY;" << std::endl;
5✔
32
    ks << "for (int i = threadIdx.x; i < row_size; i += blockDim.x) {" << std::endl;
5✔
33
    ks.setIndent(ks.indent() + 4);
5✔
34
    ks << "thread_max = fmax" << (type == "float" ? "f" : "") << "(thread_max, row_in[i]);" << std::endl;
5✔
35
    ks.setIndent(ks.indent() - 4);
5✔
36
    ks << "}" << std::endl;
5✔
37
    ks << std::endl;
5✔
38

39
    // Warp-level max reduction
40
    ks << "// Warp-level max reduction" << std::endl;
5✔
41
    ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;
5✔
42
    ks.setIndent(ks.indent() + 4);
5✔
43
    ks << "thread_max = fmax" << (type == "float" ? "f" : "")
5✔
44
       << "(thread_max, __shfl_xor_sync(0xFFFFFFFF, thread_max, mask));" << std::endl;
5✔
45
    ks.setIndent(ks.indent() - 4);
5✔
46
    ks << "}" << std::endl;
5✔
47
    ks << std::endl;
5✔
48

49
    // Cross-warp max reduction
50
    ks << "// Cross-warp max reduction" << std::endl;
5✔
51
    ks << "if (lane_id == 0) sdata[warp_id] = thread_max;" << std::endl;
5✔
52
    ks << "__syncthreads();" << std::endl;
5✔
53
    ks << type << " row_max = (threadIdx.x < num_warps) ? sdata[threadIdx.x] : (" << type << ")(-INFINITY);"
5✔
54
       << std::endl;
5✔
55
    ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;
5✔
56
    ks.setIndent(ks.indent() + 4);
5✔
57
    ks << "row_max = fmax" << (type == "float" ? "f" : "") << "(row_max, __shfl_xor_sync(0xFFFFFFFF, row_max, mask));"
5✔
58
       << std::endl;
5✔
59
    ks.setIndent(ks.indent() - 4);
5✔
60
    ks << "}" << std::endl;
5✔
61
    ks << "if (threadIdx.x == 0) sdata[0] = row_max;" << std::endl;
5✔
62
    ks << "__syncthreads();" << std::endl;
5✔
63
    ks << "row_max = sdata[0];" << std::endl;
5✔
64
    ks << std::endl;
5✔
65

66
    // Phase 2: exp and sum
67
    ks << "// Phase 2: exp(x - max) and sum" << std::endl;
5✔
68
    ks << type << " thread_sum = 0;" << std::endl;
5✔
69
    ks << "for (int i = threadIdx.x; i < row_size; i += blockDim.x) {" << std::endl;
5✔
70
    ks.setIndent(ks.indent() + 4);
5✔
71
    ks << type << " val = exp" << (type == "float" ? "f" : "") << "(row_in[i] - row_max);" << std::endl;
5✔
72
    ks << "row_out[i] = val;" << std::endl;
5✔
73
    ks << "thread_sum += val;" << std::endl;
5✔
74
    ks.setIndent(ks.indent() - 4);
5✔
75
    ks << "}" << std::endl;
5✔
76
    ks << std::endl;
5✔
77

78
    // Warp-level sum reduction
79
    ks << "// Warp-level sum reduction" << std::endl;
5✔
80
    ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;
5✔
81
    ks.setIndent(ks.indent() + 4);
5✔
82
    ks << "thread_sum += __shfl_xor_sync(0xFFFFFFFF, thread_sum, mask);" << std::endl;
5✔
83
    ks.setIndent(ks.indent() - 4);
5✔
84
    ks << "}" << std::endl;
5✔
85
    ks << std::endl;
5✔
86

87
    // Cross-warp sum reduction
88
    ks << "// Cross-warp sum reduction" << std::endl;
5✔
89
    ks << "if (lane_id == 0) sdata[warp_id] = thread_sum;" << std::endl;
5✔
90
    ks << "__syncthreads();" << std::endl;
5✔
91
    ks << type << " row_sum = (threadIdx.x < num_warps) ? sdata[threadIdx.x] : 0;" << std::endl;
5✔
92
    ks << "for (int mask = 16; mask > 0; mask >>= 1) {" << std::endl;
5✔
93
    ks.setIndent(ks.indent() + 4);
5✔
94
    ks << "row_sum += __shfl_xor_sync(0xFFFFFFFF, row_sum, mask);" << std::endl;
5✔
95
    ks.setIndent(ks.indent() - 4);
5✔
96
    ks << "}" << std::endl;
5✔
97
    ks << "if (threadIdx.x == 0) sdata[0] = row_sum;" << std::endl;
5✔
98
    ks << "__syncthreads();" << std::endl;
5✔
99
    ks << "row_sum = sdata[0];" << std::endl;
5✔
100
    ks << std::endl;
5✔
101

102
    // Phase 3: normalize
103
    ks << "// Phase 3: normalize" << std::endl;
5✔
104
    ks << "for (int i = threadIdx.x; i < row_size; i += blockDim.x) {" << std::endl;
5✔
105
    ks.setIndent(ks.indent() + 4);
5✔
106
    ks << "row_out[i] /= row_sum;" << std::endl;
5✔
107
    ks.setIndent(ks.indent() - 4);
5✔
108
    ks << "}" << std::endl;
5✔
109

110
    ks.setIndent(ks.indent() - 4);
5✔
111
    ks << "}" << std::endl;
5✔
112
}
5✔
113

114
static void compute_row_dims(
115
    const sdfg::math::tensor::SoftmaxNode& node,
116
    codegen::LanguageExtension& lang,
117
    std::string& num_rows_str,
118
    std::string& row_size_str
119
) {
7✔
120
    auto& shape = node.shape();
7✔
121
    auto& axes = node.axes();
7✔
122
    int64_t ndim = static_cast<int64_t>(shape.size());
7✔
123

124
    // Normalize axes to positive
125
    std::set<int64_t> reduce_axes;
7✔
126
    for (auto a : axes) {
7✔
127
        reduce_axes.insert(a < 0 ? a + ndim : a);
7✔
128
    }
7✔
129

130
    // num_rows = product of non-reduced dims, row_size = product of reduced dims
131
    symbolic::Expression num_rows = symbolic::one();
7✔
132
    symbolic::Expression row_size = symbolic::one();
7✔
133
    for (int64_t i = 0; i < ndim; ++i) {
27✔
134
        if (reduce_axes.count(i)) {
20✔
135
            row_size = symbolic::mul(row_size, shape[i]);
7✔
136
        } else {
13✔
137
            num_rows = symbolic::mul(num_rows, shape[i]);
13✔
138
        }
13✔
139
    }
20✔
140

141
    num_rows_str = lang.expression(num_rows);
7✔
142
    row_size_str = lang.expression(row_size);
7✔
143
}
7✔
144

145
static std::string get_type_string(types::PrimitiveType prim_type) {
7✔
146
    switch (prim_type) {
7✔
147
        case types::PrimitiveType::Float:
7✔
148
            return "float";
7✔
NEW
149
        case types::PrimitiveType::Double:
×
NEW
150
            return "double";
×
NEW
151
        default:
×
NEW
152
            throw std::runtime_error("Unsupported primitive type for CUDA softmax dispatcher");
×
153
    }
7✔
154
}
7✔
155

156
static void dispatch_softmax_common(
157
    codegen::CodegenOutput& out,
158
    std::vector<codegen::DispatchInput>& inputs,
159
    codegen::LanguageExtension& language_extension,
160
    const sdfg::math::tensor::SoftmaxNode& node,
161
    const data_flow::DataFlowGraph& data_flow_graph,
162
    const std::string& input_ptr,
163
    const std::string& output_ptr
164
) {
5✔
165
    auto prim_type = node.primitive_type(data_flow_graph);
5✔
166
    std::string type = get_type_string(prim_type);
5✔
167

168
    std::string num_rows_str, row_size_str;
5✔
169
    compute_row_dims(node, language_extension, num_rows_str, row_size_str);
5✔
170

171
    std::string kernel_name = "softmax_kernel_" + std::to_string(node.element_id());
5✔
172

173
    out.library_snippet_factory.add_global("#include <cuda.h>");
5✔
174
    out.library_snippet_factory.add_global("#include <math.h>");
5✔
175

176
    // Forward-declare kernel in globals
177
    out.globals_stream << "__global__ void " << kernel_name << "(const " << type << "* __restrict__ input, " << type
5✔
178
                       << "* __restrict__ output, int num_rows, int row_size);" << std::endl;
5✔
179

180
    // Emit kernel to .cu file
181
    auto& kernel_stream = out.library_snippet_factory.require(kernel_name, "cu", true).stream();
5✔
182
    kernel_stream << "#include " << out.library_snippet_factory.header_path().filename() << std::endl << std::endl;
5✔
183
    emit_softmax_kernel(kernel_stream, kernel_name, type);
5✔
184

185
    // Emit kernel call
186
    out.stream << "{" << std::endl;
5✔
187
    out.stream.setIndent(out.stream.indent() + 4);
5✔
188

189
    out.stream << "int __softmax_num_rows = (int)(" << num_rows_str << ");" << std::endl;
5✔
190
    out.stream << "int __softmax_row_size = (int)(" << row_size_str << ");" << std::endl;
5✔
191
    out.stream << "int __softmax_block_size = " << SOFTMAX_BLOCK_SIZE << ";" << std::endl;
5✔
192
    out.stream << "if (__softmax_row_size < __softmax_block_size) __softmax_block_size = __softmax_row_size;"
5✔
193
               << std::endl;
5✔
194
    // Round up to multiple of 32 (warp size)
195
    out.stream << "__softmax_block_size = ((__softmax_block_size + 31) / 32) * 32;" << std::endl;
5✔
196
    out.stream << "int __softmax_num_warps = __softmax_block_size / 32;" << std::endl;
5✔
197
    out.stream << "size_t __softmax_smem = __softmax_num_warps * sizeof(" << type << ");" << std::endl;
5✔
198
    out.stream << kernel_name << "<<<__softmax_num_rows, __softmax_block_size, __softmax_smem>>>(" << input_ptr << ", "
5✔
199
               << output_ptr << ", __softmax_num_rows, __softmax_row_size);" << std::endl;
5✔
200

201
    check_cuda_kernel_launch_errors(out.stream, language_extension, false);
5✔
202

203
    out.stream.setIndent(out.stream.indent() - 4);
5✔
204
    out.stream << "}" << std::endl;
5✔
205
}
5✔
206

207
// WithTransfers
208

209
SoftmaxNodeDispatcher_CUDAWithTransfers::SoftmaxNodeDispatcher_CUDAWithTransfers(
210
    codegen::LanguageExtension& language_extension,
211
    const Function& function,
212
    const data_flow::DataFlowGraph& data_flow_graph,
213
    const sdfg::math::tensor::SoftmaxNode& node
214
)
215
    : codegen::LibraryNodeDispatcher(language_extension, function, data_flow_graph, node) {}
2✔
216

217
void SoftmaxNodeDispatcher_CUDAWithTransfers::dispatch_code_with_edges(
218
    codegen::CodegenOutput& out,
219
    std::vector<codegen::DispatchInput>& inputs,
220
    std::vector<codegen::DispatchOutput>& outputs
221
) {
2✔
222
    auto& node = static_cast<const sdfg::math::tensor::SoftmaxNode&>(this->node_);
2✔
223
    auto prim_type = node.primitive_type(this->data_flow_graph_);
2✔
224
    std::string type = get_type_string(prim_type);
2✔
225

226
    // Connectors: inputs_={"Y", "X"} → inputs[0]=Y (output buffer), inputs[1]=X (input data)
227
    auto& y_expr = inputs.at(0).expr;
2✔
228
    auto& x_expr = inputs.at(1).expr;
2✔
229

230
    std::string num_rows_str, row_size_str;
2✔
231
    compute_row_dims(node, this->language_extension_, num_rows_str, row_size_str);
2✔
232

233
    std::string total_size = "((size_t)(" + num_rows_str + ") * (size_t)(" + row_size_str + ")) * sizeof(" + type + ")";
2✔
234

235
    out.stream << "{" << std::endl;
2✔
236
    out.stream.setIndent(out.stream.indent() + 4);
2✔
237

238
    out.stream << "cudaError_t err_cuda;" << std::endl;
2✔
239
    out.stream << type << " *d_input, *d_output;" << std::endl;
2✔
240
    out.stream << "size_t __softmax_total_bytes = " << total_size << ";" << std::endl;
2✔
241

242
    out.stream << "err_cuda = cudaMalloc((void**) &d_input, __softmax_total_bytes);" << std::endl;
2✔
243
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");
2✔
244
    out.stream << "err_cuda = cudaMalloc((void**) &d_output, __softmax_total_bytes);" << std::endl;
2✔
245
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");
2✔
246

247
    out.stream << "err_cuda = cudaMemcpy(d_input, " << x_expr << ", __softmax_total_bytes, cudaMemcpyHostToDevice);"
2✔
248
               << std::endl;
2✔
249
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");
2✔
250

251
    dispatch_softmax_common(out, inputs, this->language_extension_, node, this->data_flow_graph_, "d_input", "d_output");
2✔
252

253
    out.stream << "err_cuda = cudaMemcpy(" << y_expr << ", d_output, __softmax_total_bytes, cudaMemcpyDeviceToHost);"
2✔
254
               << std::endl;
2✔
255
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");
2✔
256

257
    out.stream << "err_cuda = cudaFree(d_input);" << std::endl;
2✔
258
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");
2✔
259
    out.stream << "err_cuda = cudaFree(d_output);" << std::endl;
2✔
260
    cuda_error_checking(out.stream, this->language_extension_, "err_cuda");
2✔
261

262
    out.stream.setIndent(out.stream.indent() - 4);
2✔
263
    out.stream << "}" << std::endl;
2✔
264
}
2✔
265

266
// WithoutTransfers
267

268
SoftmaxNodeDispatcher_CUDAWithoutTransfers::SoftmaxNodeDispatcher_CUDAWithoutTransfers(
269
    codegen::LanguageExtension& language_extension,
270
    const Function& function,
271
    const data_flow::DataFlowGraph& data_flow_graph,
272
    const sdfg::math::tensor::SoftmaxNode& node
273
)
274
    : codegen::LibraryNodeDispatcher(language_extension, function, data_flow_graph, node) {}
3✔
275

276
void SoftmaxNodeDispatcher_CUDAWithoutTransfers::dispatch_code_with_edges(
277
    codegen::CodegenOutput& out,
278
    std::vector<codegen::DispatchInput>& inputs,
279
    std::vector<codegen::DispatchOutput>& outputs
280
) {
3✔
281
    auto& node = static_cast<const sdfg::math::tensor::SoftmaxNode&>(this->node_);
3✔
282

283
    // Connectors: inputs_={"Y", "X"} → inputs[0]=Y (output buffer), inputs[1]=X (input data)
284
    auto& y_expr = inputs.at(0).expr;
3✔
285
    auto& x_expr = inputs.at(1).expr;
3✔
286

287
    dispatch_softmax_common(out, inputs, this->language_extension_, node, this->data_flow_graph_, x_expr, y_expr);
3✔
288
}
3✔
289

290
} // namespace sdfg::cuda::tensor
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc