• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

stillwater-sc / universal / 21833272337

09 Feb 2026 04:44PM UTC coverage: 84.824% (+0.3%) from 84.545%
21833272337

push

github

web-flow
V3.95: Block formats and CI compile caches (#502)

* Incrementing SEMVER to v3.95.1

* axpy/gemm/gemv now protect against out-of-bounds requests

* Don't push a Docker image if the build failed in some way

* Protect against division by zero when the input sample vector is empty

* areal enhancement of integer assignment

The old implementation converted the integer to double, which loses information
for integer values > 2^53.

Now using a constexpr guarded path to pick up the information bits
if the areal is big enough to hold them.

* The new test file static/areal/conversion/integer_conversion.cpp covers:

  1. VerifySmallIntegerConversion — powers of 2, small odd integers, and signed integers within the type's representable range; tests both the
  double-delegation path (fbits < 53) and the native path (fbits >= 53)
  2. VerifyLargeUnsignedIntegerConversion — values beyond 2^53 (2^53+1, 2^54+1, 2^53+3, large powers of 2, UINT64_MAX) with verification that the ubit is
  correctly set when bits are truncated and clear when representation is exact
  3. VerifyLargeSignedIntegerConversion — -(2^53+1), INT64_MIN, INT64_MIN+1, INT64_MAX with sign bit and ubit verification
  4. VerifyIntegerDoubleConsistency — verifies bit-for-bit agreement between the integer assignment path and the double assignment path for integers up to
  2^53 (where both should produce identical results)

* Replaced the duplicated union-based type-punning in to_binary() with:

  1. #include <universal/utility/bit_cast.hpp> — the project's own portable sw::bit_cast backport (constexpr on C++20 compilers, memcpy-based fallback
  otherwise)
  2. print_scalar_bits() helper — a single template function that handles float (via sw::bit_cast<uint32_t>), double (via sw::bit_cast<uint64_t>), and a
  streaming fallback for other scalar types
  3. to_binary() now calls print_scalar_bits() twice (for v.lo() and v.hi()), eliminating the four duplicated union/loop blocks

  This removes undefined behavior (union typ... (continued)

1056 of 1113 new or added lines in 16 files covered. (94.88%)

2 existing lines in 1 file now uncovered.

35738 of 42132 relevant lines covered (84.82%)

6273224.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.06
/include/sw/universal/number/microfloat/microfloat_impl.hpp
1
#pragma once
2
// microfloat_impl.hpp: definition of the microfloat number system for MX/OCP element types
3
//
4
// Copyright (C) 2017 Stillwater Supercomputing, Inc.
5
// SPDX-License-Identifier: MIT
6
//
7
// This file is part of the universal numbers project, which is released under an MIT Open Source license.
8
#include <string>
9
#include <sstream>
10
#include <iostream>
11
#include <iomanip>
12
#include <cstring>
13
#include <cmath>
14

15
#include <universal/number/shared/specific_value_encoding.hpp>
16
#include <universal/number/shared/nan_encoding.hpp>
17
#include <universal/number/shared/infinite_encoding.hpp>
18
#include <universal/number/microfloat/microfloat_fwd.hpp>
19
#include <universal/number/microfloat/exceptions.hpp>
20

21
namespace sw { namespace universal {
22

23
// microfloat: a lightweight floating-point type for MX/OCP block formats
24
// Template parameters:
25
//   _nbits       - total number of bits (4, 6, or 8)
26
//   _es          - number of exponent bits
27
//   _hasInf      - whether the type supports IEEE-like infinity
28
//   _hasNaN      - whether the type supports NaN encoding
29
//   _isSaturating - whether overflow saturates to maxpos/maxneg
30
template<unsigned _nbits, unsigned _es, bool _hasInf, bool _hasNaN, bool _isSaturating>
31
class microfloat {
32
        static_assert(_nbits <= 8, "microfloat is limited to 8 bits");
33
        static_assert(_es < _nbits, "exponent bits must be less than total bits");
34
        static_assert(_es >= 1, "need at least 1 exponent bit");
35

36
        // HELPER methods
37
        template<typename SignedInt,
38
                typename = typename std::enable_if< std::is_integral<SignedInt>::value, SignedInt >::type>
39
        microfloat& convert_signed(SignedInt v) noexcept {
21✔
40
                from_float(static_cast<float>(v));
21✔
41
                return *this;
21✔
42
        }
43
        template<typename UnsignedInt,
44
                typename = typename std::enable_if< std::is_integral<UnsignedInt>::value, UnsignedInt >::type>
45
        microfloat& convert_unsigned(UnsignedInt v) noexcept {
46
                from_float(static_cast<float>(v));
47
                return *this;
48
        }
49
        template<typename Real,
50
                typename = typename std::enable_if< std::is_floating_point<Real>::value, Real >::type>
51
        microfloat& convert_ieee754(Real rhs) noexcept {
537,335✔
52
                from_float(static_cast<float>(rhs));
537,335✔
53
                return *this;
537,335✔
54
        }
55

56
public:
57
        static constexpr unsigned nbits  = _nbits;
58
        static constexpr unsigned es     = _es;
59
        static constexpr unsigned fbits  = nbits - 1u - es;   // fraction bits (without hidden bit)
60
        static constexpr int      bias   = (1 << (es - 1)) - 1;
61
        static constexpr bool     hasInf = _hasInf;
62
        static constexpr bool     hasNaN = _hasNaN;
63
        static constexpr bool     isSaturating = _isSaturating;
64
        static constexpr uint8_t  bitmask = static_cast<uint8_t>((1u << nbits) - 1u);
65

66
        // derived constants
67
        static constexpr uint8_t  sign_mask = static_cast<uint8_t>(1u << (nbits - 1u));
68
        static constexpr uint8_t  exponent_mask = static_cast<uint8_t>(((1u << es) - 1u) << fbits);
69
        static constexpr uint8_t  fraction_mask = static_cast<uint8_t>((1u << fbits) - 1u);
70
        static constexpr unsigned max_exp_code = (1u << es) - 1u;
71

72
        microfloat() = default;
73

74
        constexpr microfloat(const microfloat&) = default;
75
        constexpr microfloat(microfloat&&) = default;
76

77
        constexpr microfloat& operator=(const microfloat&) = default;
78
        constexpr microfloat& operator=(microfloat&&) = default;
79

80
        // specific value constructor
81
        constexpr microfloat(const SpecificValue code) noexcept : _bits{} {
82
                switch (code) {
83
                case SpecificValue::infpos:
84
                        setinf(false);
85
                        break;
86
                case SpecificValue::maxpos:
87
                        maxpos();
88
                        break;
89
                case SpecificValue::minpos:
90
                        minpos();
91
                        break;
92
                case SpecificValue::zero:
93
                default:
94
                        setzero();
95
                        break;
96
                case SpecificValue::minneg:
97
                        minneg();
98
                        break;
99
                case SpecificValue::infneg:
100
                        setinf(true);
101
                        break;
102
                case SpecificValue::maxneg:
103
                        maxneg();
104
                        break;
105
                case SpecificValue::qnan:
106
                case SpecificValue::nar:
107
                        setnan(NAN_TYPE_QUIET);
108
                        break;
109
                case SpecificValue::snan:
110
                        setnan(NAN_TYPE_SIGNALLING);
111
                        break;
112
                }
113
        }
114

115
        // initializers for native types
116
        microfloat(signed char iv)                    noexcept : _bits{} { *this = iv; }
117
        microfloat(short iv)                          noexcept : _bits{} { *this = iv; }
118
        microfloat(int iv)                            noexcept : _bits{} { *this = iv; }
119
        microfloat(long iv)                           noexcept : _bits{} { *this = iv; }
120
        microfloat(long long iv)                      noexcept : _bits{} { *this = iv; }
121
        microfloat(char iv)                           noexcept : _bits{} { *this = iv; }
122
        microfloat(unsigned short iv)                 noexcept : _bits{} { *this = iv; }
123
        microfloat(unsigned int iv)                   noexcept : _bits{} { *this = iv; }
124
        microfloat(unsigned long iv)                  noexcept : _bits{} { *this = iv; }
125
        microfloat(unsigned long long iv)             noexcept : _bits{} { *this = iv; }
126
        explicit microfloat(float iv)                 noexcept : _bits{} { *this = iv; }
537,335✔
127
        explicit microfloat(double iv)                noexcept : _bits{} { *this = iv; }
128

129
        // assignment operators for native types
130
        // Assignment from native types. Every overload narrows its argument
        // to float and encodes it with from_float(); each returns *this.
        microfloat& operator=(signed char rhs)        noexcept { from_float(static_cast<float>(rhs)); return *this; }
        microfloat& operator=(short rhs)              noexcept { from_float(static_cast<float>(rhs)); return *this; }
        microfloat& operator=(int rhs)                noexcept { from_float(static_cast<float>(rhs)); return *this; }
        microfloat& operator=(long rhs)               noexcept { from_float(static_cast<float>(rhs)); return *this; }
        microfloat& operator=(long long rhs)          noexcept { from_float(static_cast<float>(rhs)); return *this; }
        microfloat& operator=(char rhs)               noexcept { from_float(static_cast<float>(rhs)); return *this; }
        microfloat& operator=(unsigned short rhs)     noexcept { from_float(static_cast<float>(rhs)); return *this; }
        microfloat& operator=(unsigned int rhs)       noexcept { from_float(static_cast<float>(rhs)); return *this; }
        microfloat& operator=(unsigned long rhs)      noexcept { from_float(static_cast<float>(rhs)); return *this; }
        microfloat& operator=(unsigned long long rhs) noexcept { from_float(static_cast<float>(rhs)); return *this; }
        microfloat& operator=(float rhs)              noexcept { from_float(rhs); return *this; }
        microfloat& operator=(double rhs)             noexcept { from_float(static_cast<float>(rhs)); return *this; }
142

143
        // conversion operators
144
        explicit operator float()                       const noexcept { return to_float(); }
1,074,197✔
145
        explicit operator double()                      const noexcept { return static_cast<double>(to_float()); }
1✔
146
        explicit operator signed char()                 const noexcept { return static_cast<signed char>(to_float()); }
1✔
147
        explicit operator short()                       const noexcept { return static_cast<short>(to_float()); }
1✔
148
        explicit operator int()                         const noexcept { return static_cast<int>(to_float()); }
149
        explicit operator long()                        const noexcept { return static_cast<long>(to_float()); }
1✔
150
        explicit operator long long()                   const noexcept { return static_cast<long long>(to_float()); }
1✔
151
        explicit operator char()                        const noexcept { return static_cast<char>(to_float()); }
1✔
152
        explicit operator unsigned short()              const noexcept { return static_cast<unsigned short>(to_float()); }
1✔
153
        explicit operator unsigned int()                const noexcept { return static_cast<unsigned int>(to_float()); }
154
        explicit operator unsigned long()               const noexcept { return static_cast<unsigned long>(to_float()); }
1✔
155
        explicit operator unsigned long long()          const noexcept { return static_cast<unsigned long long>(to_float()); }
1✔
156

157
#if LONG_DOUBLE_SUPPORT
        // long double support, compiled only when LONG_DOUBLE_SUPPORT is set:
        // construction and assignment narrow to float via from_float(),
        // conversion widens the decoded float.
        explicit microfloat(long double iv)           noexcept : _bits{} { operator=(iv); }
        microfloat& operator=(long double rhs)        noexcept { from_float(static_cast<float>(rhs)); return *this; }
        explicit operator long double()                 const noexcept { const float value = to_float(); return static_cast<long double>(value); }
#endif
162

163
        // prefix operators
164
        // Unary minus: returns a copy whose sign bit is flipped; exponent and
        // fraction bits are untouched (so the sign of zero/NaN encodings
        // flips as well).
        microfloat operator-() const noexcept {
                microfloat negated(*this);
                negated.setbits(static_cast<unsigned>(_bits) ^ sign_mask);
                return negated;
        }
169

170
        // Pre-increment: step to the next encoding in the direction of
        // increasing value.
        //   - negative values shrink in magnitude; the smallest negative
        //     encoding (magnitude 1) steps to +0
        //   - both zeros (+0 and -0) step to minpos, mirroring operator--()
        //     where both zeros step to minneg; previously -0 was a fixed point
        //     that ++ could never leave
        //   - positive values grow in magnitude until every non-sign bit is
        //     set, after which the value no longer changes. The walk is over
        //     raw encodings, so it passes through Inf/NaN patterns when the
        //     format has them.
        microfloat& operator++() noexcept {
                const uint8_t magnitude = static_cast<uint8_t>(_bits & static_cast<uint8_t>(~sign_mask));
                if (_bits & sign_mask) {
                        if (magnitude == 0u) {
                                _bits = 0x01u; // negative zero: next value up is minpos
                        }
                        else if (magnitude == 1u) {
                                _bits = 0; // minneg: go to +0
                        }
                        else {
                                --_bits; // lowering the magnitude raises a negative value
                        }
                }
                else {
                        // positive: increment unless at max encoding
                        const uint8_t maxMagnitude = static_cast<uint8_t>(bitmask >> 1);
                        if (magnitude < maxMagnitude) {
                                ++_bits;
                        }
                }
                return *this;
        }
193
        // Post-increment: advance this value with the prefix form and return
        // the encoding it held beforehand.
        microfloat operator++(int) noexcept {
                microfloat previous(*this);
                ++(*this);
                return previous;
        }
198
        // Pre-decrement: step to the next encoding in the direction of
        // decreasing value.
        //   - positive values shrink in magnitude; +0 steps to minneg
        //   - negative values (including -0) grow in magnitude until every
        //     non-sign bit is set, after which the value no longer changes
        microfloat& operator--() noexcept {
                const bool negative = (_bits & sign_mask) != 0;
                if (!negative) {
                        if (_bits == 0u) {
                                _bits = static_cast<uint8_t>(sign_mask | 0x01u); // go to minneg
                        }
                        else {
                                --_bits;
                        }
                        return *this;
                }

                // negative: a larger magnitude is a smaller value
                const uint8_t currentMagnitude = static_cast<uint8_t>(_bits & static_cast<uint8_t>(~sign_mask));
                const uint8_t largestMagnitude = static_cast<uint8_t>(bitmask >> 1);
                if (currentMagnitude < largestMagnitude) {
                        ++_bits;
                }
                return *this;
        }
218
        // Post-decrement: step this value down with the prefix form and
        // return the encoding it held beforehand.
        microfloat operator--(int) noexcept {
                microfloat previous(*this);
                --(*this);
                return previous;
        }
223

224
        // arithmetic operators
225
        // Add rhs in place: both operands are decoded to float, summed in
        // float arithmetic, and the sum is re-encoded with from_float().
        microfloat& operator+=(const microfloat& rhs) {
                const float lhsValue = to_float();
                const float rhsValue = rhs.to_float();
                from_float(lhsValue + rhsValue);
                return *this;
        }
230
        // Subtract rhs in place: both operands are decoded to float, the
        // difference is computed in float arithmetic and re-encoded with
        // from_float().
        microfloat& operator-=(const microfloat& rhs) {
                const float lhsValue = to_float();
                const float rhsValue = rhs.to_float();
                from_float(lhsValue - rhsValue);
                return *this;
        }
235
        // Multiply by rhs in place: both operands are decoded to float, the
        // product is computed in float arithmetic and re-encoded with
        // from_float().
        microfloat& operator*=(const microfloat& rhs) {
                const float lhsValue = to_float();
                const float rhsValue = rhs.to_float();
                from_float(lhsValue * rhsValue);
                return *this;
        }
240
        // Divide by rhs in place: both operands are decoded to float, the
        // quotient is computed in float arithmetic (so division by zero yields
        // the float Inf/NaN result) and re-encoded with from_float().
        microfloat& operator/=(const microfloat& rhs) {
                const float lhsValue = to_float();
                const float rhsValue = rhs.to_float();
                from_float(lhsValue / rhsValue);
                return *this;
        }
245

246
        // modifiers
247
        // Reset the encoding to all zero bits, which is +0.
        constexpr void clear() noexcept { _bits = static_cast<uint8_t>(0u); }
        // Set the value to +0.
        constexpr void setzero() noexcept { _bits = static_cast<uint8_t>(0u); }
10,607✔
249

250
        constexpr void setnan(int NaNType = NAN_TYPE_SIGNALLING) noexcept {
3✔
251
                if constexpr (hasNaN) {
252
                        if constexpr (nbits == 8 && es == 4) {
253
                                // e4m3: NaN encodings are 0x7F (positive) and 0xFF (negative)
254
                                // S.1111.111 pattern
255
                                _bits = (NaNType == NAN_TYPE_SIGNALLING) ? 0xFFu : 0x7Fu;
1✔
256
                        }
257
                        else {
258
                                // e5m2 (IEEE-like): all-ones exponent with non-zero fraction
259
                                // quiet NaN: fraction MSB = 1, signaling NaN: fraction MSB = 0 with non-zero fraction
260
                                if (NaNType == NAN_TYPE_SIGNALLING) {
2✔
261
                                        _bits = static_cast<uint8_t>(sign_mask | exponent_mask | 0x01u);
1✔
262
                                }
263
                                else {
264
                                        _bits = static_cast<uint8_t>(exponent_mask | (1u << (fbits - 1u)));
1✔
265
                                }
266
                        }
267
                }
268
                else {
269
                        // no NaN support: set to zero
270
                        _bits = 0;
271
                }
272
                _bits &= bitmask;
3✔
273
        }
3✔
274

275
        constexpr void setinf(bool sign = false) noexcept {
32,842✔
276
                if constexpr (hasInf) {
277
                        // e5m2 (IEEE-like): all-ones exponent, zero fraction
278
                        _bits = exponent_mask;
32,842✔
279
                        if (sign) _bits |= sign_mask;
32,842✔
280
                        _bits &= bitmask;
32,842✔
281
                }
282
                else if constexpr (isSaturating) {
283
                        // saturate to maxpos/maxneg
284
                        if (sign) maxneg(); else maxpos();
285
                }
286
                else {
287
                        _bits = 0;
288
                }
289
        }
32,842✔
290

291
        constexpr void setbit(unsigned i, bool v = true) noexcept {
2✔
292
                if (i < nbits) {
2✔
293
                        uint8_t bit = static_cast<uint8_t>(1u << i);
2✔
294
                        if (v) {
2✔
295
                                _bits |= bit;
1✔
296
                        }
297
                        else {
298
                                _bits &= static_cast<uint8_t>(~bit);
1✔
299
                        }
300
                        _bits &= bitmask;
2✔
301
                }
302
        }
2✔
303
        constexpr void setbits(unsigned value) noexcept { _bits = static_cast<uint8_t>(value & bitmask); }
551,294✔
304

305
        // Set to the smallest positive encoding (only the lowest fraction bit set) and return *this.
        constexpr microfloat& minpos() noexcept { setbits(0x01u); return *this; }
6✔
306

307
        // Set to the largest positive finite encoding and return *this.
        //   - NaN and Inf (e5m2-like): the all-ones exponent is reserved, so
        //     max uses exponent max_exp_code - 1 with an all-ones fraction
        //     (e5m2: 0b0.11110.11 = 0x7B)
        //   - NaN without Inf (e4m3-like): only all-ones exponent + all-ones
        //     fraction is NaN, so max is one fraction step below it
        //     (e4m3: 0b0.1111.110 = 0x7E)
        //   - otherwise: every non-sign bit set
        // The previous version first stored exponent_mask + fraction_mask and
        // then immediately overwrote it; that dead store has been removed.
        // NOTE(review): the NaN-without-Inf branch is taken for any such type,
        // while isnan() applies the single-NaN-pattern rule only to 8-bit
        // es == 4 types - confirm no other NaN-without-Inf configuration is
        // instantiated.
        constexpr microfloat& maxpos() noexcept {
                if constexpr (hasNaN && hasInf) {
                        _bits = static_cast<uint8_t>(((max_exp_code - 1u) << fbits) | fraction_mask);
                }
                else if constexpr (hasNaN && !hasInf) {
                        _bits = static_cast<uint8_t>(exponent_mask | (fraction_mask - 1u));
                }
                else {
                        // No NaN: all bits except the sign are set
                        _bits = static_cast<uint8_t>(bitmask >> 1);
                }
                return *this;
        }
327

328
        // Set to +0 and return *this.
        constexpr microfloat& zero() noexcept { clear(); return *this; }

        // Set to the negative value of smallest magnitude (sign bit plus lowest fraction bit) and return *this.
        constexpr microfloat& minneg() noexcept { _bits = static_cast<uint8_t>(0x01u | sign_mask); return *this; }
5✔
331

332
        // Set to the negative finite value of largest magnitude: the maxpos()
        // encoding with the sign bit added. Returns *this.
        constexpr microfloat& maxneg() noexcept {
                maxpos();
                _bits = static_cast<uint8_t>((_bits | sign_mask) & bitmask);
                return *this;
        }
338

339
        // selectors
340
        constexpr bool iszero() const noexcept {
4,044,519✔
341
                return (_bits == 0x00u) || (_bits == sign_mask);
4,044,519✔
342
        }
343
        constexpr bool isone() const noexcept {
1✔
344
                // 1.0 = sign=0, exponent=bias, fraction=0
345
                uint8_t one_encoding = static_cast<uint8_t>(static_cast<unsigned>(bias) << fbits);
1✔
346
                return _bits == one_encoding;
1✔
347
        }
348
        constexpr bool isodd() const noexcept { return (_bits & 0x01u); }
349
        constexpr bool iseven() const noexcept { return !isodd(); }
350
        constexpr bool ispos() const noexcept { return !isneg(); }
351
        constexpr bool isneg() const noexcept { return (_bits & sign_mask) != 0; }
3,331,776✔
352

353
        constexpr bool isnan(int NaNType = NAN_TYPE_EITHER) const noexcept {
4,126,304✔
354
                if constexpr (!hasNaN) return false;
68,100✔
355

356
                if constexpr (nbits == 8 && es == 4 && !hasInf && hasNaN) {
357
                        // e4m3: NaN is S.1111.111 -> encodings 0x7F and 0xFF
358
                        bool isNaN = (_bits & 0x7Fu) == 0x7Fu;
2,075,386✔
359
                        if (NaNType == NAN_TYPE_EITHER) return isNaN;
2,075,386✔
NEW
360
                        if (NaNType == NAN_TYPE_SIGNALLING) return isNaN && ((_bits & sign_mask) != 0);
×
NEW
361
                        if (NaNType == NAN_TYPE_QUIET) return isNaN && ((_bits & sign_mask) == 0);
×
NEW
362
                        return false;
×
363
                }
364
                else {
365
                        // IEEE-like (e5m2): NaN = all-ones exponent + non-zero fraction
366
                        uint8_t exp = (_bits & exponent_mask);
1,982,818✔
367
                        uint8_t frac = (_bits & fraction_mask);
1,982,818✔
368
                        bool isNaN = (exp == exponent_mask) && (frac != 0);
1,982,818✔
369
                        if (NaNType == NAN_TYPE_EITHER) return isNaN;
1,982,818✔
NEW
370
                        bool isQuietNaN = isNaN && ((frac & (1u << (fbits - 1u))) != 0);
×
NEW
371
                        bool isSignalNaN = isNaN && ((frac & (1u << (fbits - 1u))) == 0);
×
NEW
372
                        if (NaNType == NAN_TYPE_QUIET) return isQuietNaN;
×
NEW
373
                        if (NaNType == NAN_TYPE_SIGNALLING) return isSignalNaN;
×
NEW
374
                        return false;
×
375
                }
376
        }
377

378
        constexpr bool isinf(int InfType = INF_TYPE_EITHER) const noexcept {
2,024,483✔
379
                if constexpr (!hasInf) return false;
293,450✔
380

381
                // IEEE-like: all-ones exponent + zero fraction
382
                uint8_t exp = (_bits & exponent_mask);
1,731,033✔
383
                uint8_t frac = (_bits & fraction_mask);
1,731,033✔
384
                bool inf = (exp == exponent_mask) && (frac == 0);
1,731,033✔
385
                if (!inf) return false;
1,731,033✔
386
                bool negative = isneg();
2,403✔
387
                if (InfType == INF_TYPE_EITHER) return true;
2,403✔
NEW
388
                if (InfType == INF_TYPE_NEGATIVE) return negative;
×
NEW
389
                if (InfType == INF_TYPE_POSITIVE) return !negative;
×
NEW
390
                return false;
×
391
        }
392

393
        constexpr bool   sign()   const noexcept { return isneg(); }
394
        constexpr int    scale()  const noexcept {
395
                int e = static_cast<int>((_bits & exponent_mask) >> fbits);
396
                return e - bias;
397
        }
398
        constexpr uint8_t bits()  const noexcept { return _bits; }
1,027,142✔
399

400
        constexpr bool test(unsigned bitIndex) const noexcept { return at(bitIndex); }
401
        constexpr bool at(unsigned bitIndex) const noexcept {
402
                if (bitIndex < nbits) {
403
                        return (_bits & (1u << bitIndex)) != 0;
404
                }
405
                return false;
406
        }
407
        constexpr uint8_t nibble(unsigned n) const noexcept {
408
                if (n < 2) {
409
                        return static_cast<uint8_t>((_bits >> (n * 4u)) & 0x0Fu);
410
                }
411
                return 0;
412
        }
413
        constexpr uint8_t exponent() const noexcept {
3,328,964✔
414
                return static_cast<uint8_t>((_bits & exponent_mask) >> fbits);
3,328,964✔
415
        }
416
        constexpr uint8_t fraction() const noexcept {
3,328,964✔
417
                return static_cast<uint8_t>(_bits & fraction_mask);
3,328,964✔
418
        }
419

420
        // Convert to float
421
        float to_float() const noexcept {
3,348,957✔
422
                if (iszero()) return 0.0f;
3,348,957✔
423
                if constexpr (hasNaN) {
424
                        if (isnan()) return std::numeric_limits<float>::quiet_NaN();
3,037,979✔
425
                }
426
                if constexpr (hasInf) {
427
                        if (isinf()) {
1,482,031✔
428
                                return isneg() ? -std::numeric_limits<float>::infinity()
409✔
429
                                               :  std::numeric_limits<float>::infinity();
409✔
430
                        }
431
                }
432

433
                bool     s = isneg();
3,328,964✔
434
                unsigned e = exponent();
3,328,964✔
435
                unsigned f = fraction();
3,328,964✔
436

437
                float value;
438
                if (e == 0) {
3,328,964✔
439
                        // subnormal: value = (-1)^s * 2^(1-bias) * (0.fraction)
440
                        float frac = static_cast<float>(f) / static_cast<float>(1u << fbits);
105,666✔
441
                        value = std::ldexp(frac, 1 - bias);
105,666✔
442
                }
443
                else {
444
                        // normal: value = (-1)^s * 2^(e-bias) * (1.fraction)
445
                        float frac = 1.0f + static_cast<float>(f) / static_cast<float>(1u << fbits);
3,223,298✔
446
                        value = std::ldexp(frac, static_cast<int>(e) - bias);
3,223,298✔
447
                }
448
                return s ? -value : value;
3,328,964✔
449
        }
450

451
        // Convert from float with RNE rounding
452
        void from_float(float v) noexcept {
1,142,687✔
453
                if (v != v) { // NaN check
1,142,687✔
454
                        if constexpr (hasNaN) {
NEW
455
                                setnan(NAN_TYPE_QUIET);
×
456
                        }
457
                        else {
NEW
458
                                setzero();
×
459
                        }
460
                        return;
93,956✔
461
                }
462

463
                bool s = std::signbit(v);
1,142,687✔
464
                if (s) v = -v;
1,142,687✔
465

466
                if (std::isinf(v)) {
1,142,687✔
467
                        if constexpr (hasInf) {
468
                                setinf(s);
2✔
469
                        }
470
                        else if constexpr (isSaturating) {
NEW
471
                                if (s) maxneg(); else maxpos();
×
472
                        }
473
                        else {
474
                                setzero();
475
                        }
476
                        return;
2✔
477
                }
478

479
                if (v == 0.0f) {
1,142,685✔
480
                        setzero();
10,597✔
481
                        return;
10,597✔
482
                }
483

484
                // Compute the maxpos value for clamping
485
                microfloat mp;
486
                mp.maxpos();
1,132,088✔
487
                float maxval = mp.to_float();
1,132,088✔
488

489
                if (v >= maxval) {
1,132,088✔
490
                        // Check if we need to round to maxpos or to inf
491
                        if constexpr (hasInf) {
492
                                // Compute the tie-point between maxpos and inf
493
                                // For IEEE-like types, values > maxval round to inf or stay at maxval
494
                                // The tie point is maxval + 0.5 ULP above maxval
495
                                // For simplicity with these small types: if v > maxval, go to inf
496
                                if (v > maxval) {
34,089✔
497
                                        setinf(s);
32,839✔
498
                                        return;
32,839✔
499
                                }
500
                        }
501
                        if constexpr (isSaturating) {
502
                                if (s) maxneg(); else maxpos();
49,268✔
503
                                return;
49,268✔
504
                        }
505
                        // non-saturating without inf: clamp to max
506
                        if (s) maxneg(); else maxpos();
1,250✔
507
                        return;
1,250✔
508
                }
509

510
                // Extract exponent and fraction from the float value
511
                int exp;
512
                float frac = std::frexp(v, &exp);
1,048,731✔
513
                // frexp returns frac in [0.5, 1.0), exp such that v = frac * 2^exp
514
                // We want: v = 1.mantissa * 2^(exp-1)
515
                // so our biased_exp = exp - 1 + bias
516
                exp -= 1; // now v = (2*frac) * 2^exp, and 2*frac in [1.0, 2.0)
1,048,731✔
517
                float significand = 2.0f * frac; // in [1.0, 2.0)
1,048,731✔
518

519
                int biased_exp = exp + bias;
1,048,731✔
520

521
                if (biased_exp <= 0) {
1,048,731✔
522
                        // Subnormal range
523
                        // subnormal: v = f * 2^(1-bias) where f = 0.mantissa in [0, 1)
524
                        float subnormal_frac = v / std::ldexp(1.0f, 1 - bias);
94,693✔
525
                        // subnormal_frac is in [0, 1)
526
                        // Quantize to fbits bits with RNE
527
                        float scaled = subnormal_frac * static_cast<float>(1u << fbits);
94,693✔
528
                        unsigned f_int = rne_round(scaled);
94,693✔
529
                        if (f_int >= (1u << fbits)) {
94,693✔
530
                                // Rounded up to smallest normal
531
                                biased_exp = 1;
2,683✔
532
                                f_int = 0;
2,683✔
533
                                _bits = static_cast<uint8_t>((static_cast<unsigned>(biased_exp) << fbits) | f_int);
2,683✔
534
                        }
535
                        else {
536
                                _bits = static_cast<uint8_t>(f_int);
92,010✔
537
                        }
538
                }
539
                else {
540
                        // Normal range
541
                        // significand is in [1.0, 2.0), we need the fractional part
542
                        float mantissa = significand - 1.0f; // in [0, 1)
954,038✔
543
                        float scaled = mantissa * static_cast<float>(1u << fbits);
954,038✔
544
                        unsigned f_int = rne_round(scaled);
954,038✔
545
                        if (f_int >= (1u << fbits)) {
954,038✔
546
                                // Carry into exponent
547
                                f_int = 0;
58,281✔
548
                                biased_exp += 1;
58,281✔
549
                        }
550
                        // Check for overflow after rounding
551
                        if constexpr (hasNaN && hasInf) {
552
                                // e5m2: max biased exp for normal = max_exp_code - 1
553
                                if (static_cast<unsigned>(biased_exp) >= max_exp_code) {
428,890✔
NEW
554
                                        setinf(s);
×
NEW
555
                                        return;
×
556
                                }
557
                        }
558
                        else if constexpr (hasNaN && !hasInf) {
559
                                // e4m3: max biased exp = max_exp_code, but all-ones exp + all-ones frac = NaN
560
                                if (static_cast<unsigned>(biased_exp) > max_exp_code) {
441,676✔
NEW
561
                                        if (s) maxneg(); else maxpos();
×
NEW
562
                                        return;
×
563
                                }
564
                                if (static_cast<unsigned>(biased_exp) == max_exp_code && f_int >= fraction_mask) {
441,676✔
NEW
565
                                        if (s) maxneg(); else maxpos();
×
NEW
566
                                        return;
×
567
                                }
568
                        }
569
                        else {
570
                                // No NaN, no Inf: all encodings valid
571
                                if (static_cast<unsigned>(biased_exp) > max_exp_code) {
83,472✔
NEW
572
                                        if (s) maxneg(); else maxpos();
×
NEW
573
                                        return;
×
574
                                }
575
                        }
576
                        _bits = static_cast<uint8_t>((static_cast<unsigned>(biased_exp) << fbits) | f_int);
954,038✔
577
                }
578

579
                if (s) _bits |= sign_mask;
1,048,731✔
580
                _bits &= bitmask;
1,048,731✔
581
        }
582

583
protected:
584
        uint8_t _bits;
585

586
private:
587
        // Round-to-nearest-even helper
588
        // Round a non-negative float to the nearest unsigned integer, with
        // exact halves going to the even neighbour. The input is truncated
        // with static_cast<unsigned>, so it must be non-negative and fit in
        // an unsigned.
        static unsigned rne_round(float v) noexcept {
                const unsigned wholePart = static_cast<unsigned>(v);
                const float excess = v - static_cast<float>(wholePart);
                if (excess > 0.5f) {
                        return wholePart + 1u;
                }
                if (excess < 0.5f) {
                        return wholePart;
                }
                // Exactly 0.5: adding the low bit bumps odd values up to even
                return wholePart + (wholePart & 1u);
        }
596

597
        // microfloat - microfloat logic comparisons
598
        template<unsigned n, unsigned e, bool i, bool na, bool s>
599
        friend bool operator==(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs);
600

601
        // microfloat - literal logic comparisons
602
        template<unsigned n, unsigned e, bool i, bool na, bool s>
603
        friend bool operator==(microfloat<n,e,i,na,s> lhs, float rhs);
604

605
        // literal - microfloat logic comparisons
606
        template<unsigned n, unsigned e, bool i, bool na, bool s>
607
        friend bool operator==(float lhs, microfloat<n,e,i,na,s> rhs);
608
};
609

610
////////////////////////    functions   /////////////////////////////////
611

612
template<unsigned n, unsigned e, bool i, bool na, bool s>
613
inline microfloat<n,e,i,na,s> abs(microfloat<n,e,i,na,s> a) {
614
        return (a.isneg() ? -a : a);
615
}
616

617
/// stream operators
618

619
template<unsigned n, unsigned e, bool i, bool na, bool s>
620
inline std::ostream& operator<<(std::ostream& ostr, microfloat<n,e,i,na,s> mf) {
98✔
621
        return ostr << float(mf);
98✔
622
}
623

624
template<unsigned n, unsigned e, bool i, bool na, bool s>
625
inline std::istream& operator>>(std::istream& istr, microfloat<n,e,i,na,s>& p) {
626
        float f;
627
        istr >> f;
628
        p = microfloat<n,e,i,na,s>(f);
629
        return istr;
630
}
631

632
////////////////// string operators
633

634
template<unsigned nbits, unsigned es, bool hasInf, bool hasNaN, bool isSaturating>
635
inline std::string to_binary(microfloat<nbits, es, hasInf, hasNaN, isSaturating> mf, bool bNibbleMarker = false) {
250✔
636
        constexpr unsigned fbits = nbits - 1u - es;
250✔
637
        std::stringstream ss;
250✔
638
        uint8_t bits = mf.bits();
250✔
639
        uint8_t mask = static_cast<uint8_t>(1u << (nbits - 1u));
250✔
640

641
        ss << (bits & mask ? "0b1." : "0b0.");
250✔
642
        mask >>= 1;
250✔
643
        // exponent bits
644
        for (unsigned j = 0; j < es; ++j) {
964✔
645
                if (bNibbleMarker && j > 0 && (j % 4) == 0) ss << '\'';
714✔
646
                ss << ((bits & mask) ? '1' : '0');
714✔
647
                mask >>= 1;
714✔
648
        }
649
        ss << '.';
250✔
650
        // fraction bits
651
        for (unsigned j = 0; j < fbits; ++j) {
838✔
652
                if (bNibbleMarker && j > 0 && (j % 4) == 0) ss << '\'';
588✔
653
                ss << ((bits & mask) ? '1' : '0');
588✔
654
                mask >>= 1;
588✔
655
        }
656
        return ss.str();
500✔
657
}
250✔
658

659
//////////////////////////////////////////////////////////////////////////////////////////////////////
660
// microfloat - microfloat binary logic operators
661

662
template<unsigned n, unsigned e, bool i, bool na, bool s>
663
inline bool operator==(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs) {
38✔
664
        if (lhs.isnan() || rhs.isnan()) return false;
38✔
665
        // +0 == -0
666
        if (lhs.iszero() && rhs.iszero()) return true;
34✔
667
        return lhs._bits == rhs._bits;
29✔
668
}
669

670
template<unsigned n, unsigned e, bool i, bool na, bool s>
671
inline bool operator!=(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs) {
13✔
672
        return !operator==(lhs, rhs);
13✔
673
}
674

675
template<unsigned n, unsigned e, bool i, bool na, bool s>
676
inline bool operator<(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs) {
52✔
677
        if (lhs.isnan() || rhs.isnan()) return false;
52✔
678
        return (float(lhs) - float(rhs)) < 0;
48✔
679
}
680

681
template<unsigned n, unsigned e, bool i, bool na, bool s>
682
inline bool operator>(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs) {
14✔
683
        return operator<(rhs, lhs);
14✔
684
}
685

686
template<unsigned n, unsigned e, bool i, bool na, bool s>
687
inline bool operator<=(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs) {
12✔
688
        return operator<(lhs, rhs) || operator==(lhs, rhs);
12✔
689
}
690

691
template<unsigned n, unsigned e, bool i, bool na, bool s>
692
inline bool operator>=(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs) {
12✔
693
        return !operator<(lhs, rhs);
12✔
694
}
695

696
//////////////////////////////////////////////////////////////////////////////////////////////////////
697
// microfloat - literal binary logic operators
698

699
template<unsigned n, unsigned e, bool i, bool na, bool s>
700
inline bool operator==(microfloat<n,e,i,na,s> lhs, float rhs) {
701
        return operator==(lhs, microfloat<n,e,i,na,s>(rhs));
702
}
703

704
template<unsigned n, unsigned e, bool i, bool na, bool s>
705
inline bool operator!=(microfloat<n,e,i,na,s> lhs, float rhs) {
706
        return !operator==(lhs, microfloat<n,e,i,na,s>(rhs));
707
}
708

709
template<unsigned n, unsigned e, bool i, bool na, bool s>
710
inline bool operator<(microfloat<n,e,i,na,s> lhs, float rhs) {
711
        return operator<(lhs, microfloat<n,e,i,na,s>(rhs));
712
}
713

714
template<unsigned n, unsigned e, bool i, bool na, bool s>
715
inline bool operator>(microfloat<n,e,i,na,s> lhs, float rhs) {
716
        return operator<(microfloat<n,e,i,na,s>(rhs), lhs);
717
}
718

719
template<unsigned n, unsigned e, bool i, bool na, bool s>
720
inline bool operator<=(microfloat<n,e,i,na,s> lhs, float rhs) {
721
        return operator<(lhs, microfloat<n,e,i,na,s>(rhs)) || operator==(lhs, microfloat<n,e,i,na,s>(rhs));
722
}
723

724
template<unsigned n, unsigned e, bool i, bool na, bool s>
725
inline bool operator>=(microfloat<n,e,i,na,s> lhs, float rhs) {
726
        return !operator<(lhs, microfloat<n,e,i,na,s>(rhs));
727
}
728

729
//////////////////////////////////////////////////////////////////////////////////////////////////////
730
// literal - microfloat binary logic operators
731

732
template<unsigned n, unsigned e, bool i, bool na, bool s>
733
inline bool operator==(float lhs, microfloat<n,e,i,na,s> rhs) {
734
        return operator==(microfloat<n,e,i,na,s>(lhs), rhs);
735
}
736

737
template<unsigned n, unsigned e, bool i, bool na, bool s>
738
inline bool operator!=(float lhs, microfloat<n,e,i,na,s> rhs) {
739
        return !operator==(microfloat<n,e,i,na,s>(lhs), rhs);
740
}
741

742
template<unsigned n, unsigned e, bool i, bool na, bool s>
743
inline bool operator<(float lhs, microfloat<n,e,i,na,s> rhs) {
744
        return operator<(microfloat<n,e,i,na,s>(lhs), rhs);
745
}
746

747
template<unsigned n, unsigned e, bool i, bool na, bool s>
748
inline bool operator>(float lhs, microfloat<n,e,i,na,s> rhs) {
749
        return operator<(rhs, microfloat<n,e,i,na,s>(lhs));
750
}
751

752
template<unsigned n, unsigned e, bool i, bool na, bool s>
753
inline bool operator<=(float lhs, microfloat<n,e,i,na,s> rhs) {
754
        return operator<(microfloat<n,e,i,na,s>(lhs), rhs) || operator==(microfloat<n,e,i,na,s>(lhs), rhs);
755
}
756

757
template<unsigned n, unsigned e, bool i, bool na, bool s>
758
inline bool operator>=(float lhs, microfloat<n,e,i,na,s> rhs) {
759
        return !operator<(microfloat<n,e,i,na,s>(lhs), rhs);
760
}
761

762
//////////////////////////////////////////////////////////////////////////////////////////////////////
763
// microfloat - microfloat binary arithmetic operators
764

765
template<unsigned n, unsigned e, bool i, bool na, bool s>
766
inline microfloat<n,e,i,na,s> operator+(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs) {
134,469✔
767
        microfloat<n,e,i,na,s> sum = lhs;
134,469✔
768
        sum += rhs;
134,469✔
769
        return sum;
134,469✔
770
}
771

772
template<unsigned n, unsigned e, bool i, bool na, bool s>
773
inline microfloat<n,e,i,na,s> operator-(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs) {
134,469✔
774
        microfloat<n,e,i,na,s> diff = lhs;
134,469✔
775
        diff -= rhs;
134,469✔
776
        return diff;
134,469✔
777
}
778

779
template<unsigned n, unsigned e, bool i, bool na, bool s>
780
inline microfloat<n,e,i,na,s> operator*(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs) {
134,469✔
781
        microfloat<n,e,i,na,s> mul = lhs;
134,469✔
782
        mul *= rhs;
134,469✔
783
        return mul;
134,469✔
784
}
785

786
template<unsigned n, unsigned e, bool i, bool na, bool s>
787
inline microfloat<n,e,i,na,s> operator/(microfloat<n,e,i,na,s> lhs, microfloat<n,e,i,na,s> rhs) {
133,177✔
788
        microfloat<n,e,i,na,s> ratio = lhs;
133,177✔
789
        ratio /= rhs;
133,177✔
790
        return ratio;
133,177✔
791
}
792

793
//////////////////////////////////////////////////////////////////////////////////////////////////////
794
// microfloat - literal binary arithmetic operators
795

796
template<unsigned n, unsigned e, bool i, bool na, bool s>
797
inline microfloat<n,e,i,na,s> operator+(microfloat<n,e,i,na,s> lhs, float rhs) {
798
        return operator+(lhs, microfloat<n,e,i,na,s>(rhs));
799
}
800

801
template<unsigned n, unsigned e, bool i, bool na, bool s>
802
inline microfloat<n,e,i,na,s> operator-(microfloat<n,e,i,na,s> lhs, float rhs) {
803
        return operator-(lhs, microfloat<n,e,i,na,s>(rhs));
804
}
805

806
template<unsigned n, unsigned e, bool i, bool na, bool s>
807
inline microfloat<n,e,i,na,s> operator*(microfloat<n,e,i,na,s> lhs, float rhs) {
808
        return operator*(lhs, microfloat<n,e,i,na,s>(rhs));
809
}
810

811
template<unsigned n, unsigned e, bool i, bool na, bool s>
812
inline microfloat<n,e,i,na,s> operator/(microfloat<n,e,i,na,s> lhs, float rhs) {
813
        return operator/(lhs, microfloat<n,e,i,na,s>(rhs));
814
}
815

816
//////////////////////////////////////////////////////////////////////////////////////////////////////
817
// literal - microfloat binary arithmetic operators
818

819
template<unsigned n, unsigned e, bool i, bool na, bool s>
820
inline microfloat<n,e,i,na,s> operator+(float lhs, microfloat<n,e,i,na,s> rhs) {
821
        return operator+(microfloat<n,e,i,na,s>(lhs), rhs);
822
}
823

824
template<unsigned n, unsigned e, bool i, bool na, bool s>
825
inline microfloat<n,e,i,na,s> operator-(float lhs, microfloat<n,e,i,na,s> rhs) {
826
        return operator-(microfloat<n,e,i,na,s>(lhs), rhs);
827
}
828

829
template<unsigned n, unsigned e, bool i, bool na, bool s>
830
inline microfloat<n,e,i,na,s> operator*(float lhs, microfloat<n,e,i,na,s> rhs) {
831
        return operator*(microfloat<n,e,i,na,s>(lhs), rhs);
832
}
833

834
template<unsigned n, unsigned e, bool i, bool na, bool s>
835
inline microfloat<n,e,i,na,s> operator/(float lhs, microfloat<n,e,i,na,s> rhs) {
836
        return operator/(microfloat<n,e,i,na,s>(lhs), rhs);
837
}
838

839
}} // namespace sw::universal
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc