• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

stillwater-sc / universal / 26055419319

18 May 2026 07:25PM UTC coverage: 84.218%. First build
26055419319

Pull #869

github

web-flow
Merge e82104a91 into 3e2a68633
Pull Request #869: IBM HFP clean-up

104 of 105 new or added lines in 2 files covered. (99.05%)

46862 of 55644 relevant lines covered (84.22%)

6407468.11 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.22
/include/sw/universal/number/hfloat/hfloat_impl.hpp
1
#pragma once
2
// hfloat_impl.hpp: implementation of IBM System/360 hexadecimal floating-point
3
//
4
// Copyright (C) 2017 Stillwater Supercomputing, Inc.
5
// SPDX-License-Identifier: MIT
6
//
7
// This file is part of the universal numbers project, which is released under an MIT Open Source license.
8
//
9
// IBM System/360 Hexadecimal Floating-Point (1964):
10
//   Format: [sign(1)] [exponent(es)] [hex_fraction(ndigits*4 bits)]
11
//   Value:  (-1)^sign * 16^(exponent - bias) * 0.f1f2...fn
12
//
13
// Key properties:
14
//   - No hidden bit (fraction always has explicit leading hex digit)
15
//   - No NaN, no infinity, no subnormals
16
//   - Truncation rounding only (never rounds up)
17
//   - Overflow saturates to maxpos/maxneg
18
//   - Zero: sign=0, exponent=0, fraction=0
19
//   - Wobbling precision: 0-3 leading zero bits in MSB hex digit
20
//
21
// Standard configurations:
22
//   Short:    hfloat<6, 7>  = 1+7+24 = 32 bits
23
//   Long:     hfloat<14, 7> = 1+7+56 = 64 bits
24
//   Extended: hfloat<28, 7> = 1+7+112 = 120 bits (stored in 128)
25

26
#include <cctype>
27
#include <cstdint>
28
#include <cstring>
29
#include <cmath>
30
#include <string>
31
#include <string_view>
32
#include <sstream>
33
#include <iostream>
34
#include <iomanip>
35
#include <algorithm>
36

37
// supporting types and functions
38
#include <universal/native/ieee754.hpp>
39
#include <universal/utility/decimal_to_binary.hpp>
40
#include <universal/number/shared/nan_encoding.hpp>
41
#include <universal/number/shared/infinite_encoding.hpp>
42
#include <universal/number/shared/specific_value_encoding.hpp>
43
// hfloat exception structure
44
#include <universal/number/hfloat/exceptions.hpp>
45

46
namespace sw { namespace universal {
47

48
///////////////////////////////////////////////////////////////////////////////
49
// hfloat: IBM System/360 hexadecimal floating-point number
50
//
51
// Template parameters:
52
//   ndigits  - number of hexadecimal fraction digits
53
//   es       - exponent bits (7 for standard IBM HFP)
54
//   bt       - block type for storage
55
//
56
template<unsigned _ndigits, unsigned _es, typename bt = std::uint32_t>
57
class hfloat {
58
public:
59
        static constexpr unsigned ndigits     = _ndigits;            // hex fraction digits
60
        static constexpr unsigned es          = _es;                 // exponent bits
61
        static constexpr unsigned fbits       = ndigits * 4u;        // fraction bits
62
        static constexpr unsigned nbits       = 1u + es + fbits;     // total bits
63
        static constexpr int      bias        = (1 << (es - 1));     // exponent bias (64 for es=7)
64
        static constexpr int      emax        = (1 << es) - 1 - bias; // max unbiased exponent
65
        static constexpr int      emin        = -bias;                // min unbiased exponent
66

67
        typedef bt BlockType;
68

69
        static constexpr unsigned bitsInByte  = 8u;
70
        static constexpr unsigned bitsInBlock = sizeof(bt) * bitsInByte;
71
        static constexpr unsigned nrBlocks    = 1u + ((nbits - 1u) / bitsInBlock);
72
        static constexpr unsigned MSU         = nrBlocks - 1u;
73

74
        static constexpr bt ALL_ONES  = bt(~0);
75
        static constexpr bt MSU_MASK  = (nrBlocks * bitsInBlock == nbits) ? ALL_ONES : bt((1ull << (nbits % bitsInBlock)) - 1u);
76
        static constexpr bt BLOCK_MASK = bt(~0);
77

78
        /// trivial constructor
79
        hfloat() = default;
80

81
        hfloat(const hfloat&) = default;
82
        hfloat(hfloat&&) = default;
83

84
        hfloat& operator=(const hfloat&) = default;
85
        hfloat& operator=(hfloat&&) = default;
86

87
        // specific value constructor
88
        constexpr hfloat(const SpecificValue code) noexcept : _block{} {
10✔
89
                switch (code) {
10✔
90
                case SpecificValue::maxpos:
2✔
91
                        maxpos();
2✔
92
                        break;
2✔
93
                case SpecificValue::minpos:
1✔
94
                        minpos();
1✔
95
                        break;
1✔
96
                case SpecificValue::zero:
7✔
97
                default:
98
                        zero();
7✔
99
                        break;
7✔
100
                case SpecificValue::minneg:
×
101
                        minneg();
×
102
                        break;
×
103
                case SpecificValue::maxneg:
×
104
                        maxneg();
×
105
                        break;
×
106
                case SpecificValue::infpos:
×
107
                        maxpos();  // no infinity in HFP, saturate to maxpos
×
108
                        break;
×
109
                case SpecificValue::infneg:
×
110
                        maxneg();  // no infinity in HFP, saturate to maxneg
×
111
                        break;
×
112
                case SpecificValue::nar:
×
113
                case SpecificValue::qnan:
114
                case SpecificValue::snan:
115
                        zero();    // no NaN in HFP, map to zero
×
116
                        break;
×
117
                }
118
        }
10✔
119

120
        // initializers for native types
121
        constexpr explicit hfloat(signed char iv)           noexcept : _block{} { *this = iv; }
122
        constexpr explicit hfloat(short iv)                 noexcept : _block{} { *this = iv; }
123
        constexpr explicit hfloat(int iv)                   noexcept : _block{} { *this = iv; }
18✔
124
        constexpr explicit hfloat(long iv)                  noexcept : _block{} { *this = iv; }
125
        constexpr explicit hfloat(long long iv)             noexcept : _block{} { *this = iv; }
126
        constexpr explicit hfloat(char iv)                  noexcept : _block{} { *this = iv; }
127
        constexpr explicit hfloat(unsigned short iv)        noexcept : _block{} { *this = iv; }
128
        constexpr explicit hfloat(unsigned int iv)          noexcept : _block{} { *this = iv; }
5✔
129
        constexpr explicit hfloat(unsigned long iv)         noexcept : _block{} { *this = iv; }
130
        constexpr explicit hfloat(unsigned long long iv)    noexcept : _block{} { *this = iv; }
131
        constexpr explicit hfloat(float iv)                 noexcept : _block{} { *this = iv; }
132
        constexpr explicit hfloat(double iv)                noexcept : _block{} { *this = iv; }
308✔
133

134
        // assignment operators for native types
135
        constexpr hfloat& operator=(signed char rhs)        noexcept { return convert_signed(rhs); }
136
        constexpr hfloat& operator=(short rhs)              noexcept { return convert_signed(rhs); }
137
        constexpr hfloat& operator=(int rhs)                noexcept { return convert_signed(rhs); }
24✔
138
        constexpr hfloat& operator=(long rhs)               noexcept { return convert_signed(rhs); }
139
        constexpr hfloat& operator=(long long rhs)          noexcept { return convert_signed(rhs); }
140
        // Plain `char` may be signed or unsigned per platform; route through
141
        // the signed conversion via integer promotion so hfloat(char(-1)) on
142
        // signed-char targets yields -1, not UCHAR_MAX (CodeRabbit finding
143
        // from PR #805 applied to the sibling type).
144
        constexpr hfloat& operator=(char rhs)               noexcept { return convert_signed(static_cast<int>(rhs)); }
145
        constexpr hfloat& operator=(unsigned short rhs)     noexcept { return convert_unsigned(rhs); }
146
        constexpr hfloat& operator=(unsigned int rhs)       noexcept { return convert_unsigned(rhs); }
5✔
147
        constexpr hfloat& operator=(unsigned long rhs)      noexcept { return convert_unsigned(rhs); }
148
        constexpr hfloat& operator=(unsigned long long rhs) noexcept { return convert_unsigned(rhs); }
149
        constexpr hfloat& operator=(float rhs)              noexcept { return convert_ieee754(rhs); }
150
        constexpr hfloat& operator=(double rhs)             noexcept { return convert_ieee754(rhs); }
308✔
151

152
        // conversion operators
153
        constexpr explicit operator float()           const noexcept { return float(convert_to_double()); }
154
        constexpr explicit operator double()          const noexcept { return convert_to_double(); }
282✔
155

156
#if LONG_DOUBLE_SUPPORT
157
        constexpr explicit hfloat(long double iv)           noexcept : _block{} { *this = iv; }
158
        constexpr hfloat& operator=(long double rhs)        noexcept { return convert_ieee754(double(rhs)); }
159
        constexpr explicit operator long double()     const noexcept { return (long double)convert_to_double(); }
160
#endif
161

162
        // Unified entry point: assign from a normalized 64-bit binary mantissa
163
        // plus its "frexp" exponent. This is the algorithm core; both parse()
164
        // (which derives mantissa64 from decimal_to_binary) and convert_ieee754
165
        // (which lifts a double's 53-bit mantissa into the 64-bit convention by
166
        // left-shifting 11 bits) call this same routine.
167
        //
168
        // Preconditions:
169
        //   mantissa64 is normalized with MSB at bit 63 (the "implicit 1"
170
        //   position of a normalized binary mantissa). For inputs derived from
171
        //   an IEEE double, the low 11 bits will be zero -- but they don't have
172
        //   to be; parse() fills all 64.
173
        //   bin_exp is the "frexp" exponent: value = mantissa64 * 2^(bin_exp - 64).
174
        constexpr hfloat& assign_from_mantissa64(bool negative, int bin_exp, uint64_t mantissa64) noexcept {
323✔
175
                if (mantissa64 == 0) { setzero(); return *this; }
323✔
176

177
                // Convert binary exponent to base-16 exponent. ceil(bin_exp/4) so
178
                // 0.f * 16^hex_exp == value with f's leading hex digit non-zero.
179
                int hex_exp;
180
                if (bin_exp > 0) hex_exp = (bin_exp + 3) / 4;
323✔
181
                else             hex_exp = bin_exp / 4;  // C truncates toward zero == ceil for negative
68✔
182

183
                // Align mantissa to the hex fraction bit positions. For a 64-bit
184
                // mantissa, mantissa_shift = (bin_exp - 4*hex_exp + fbits) - 64.
185
                int shift = bin_exp - 4 * hex_exp + static_cast<int>(fbits);
323✔
186
                int mantissa_shift = shift - 64;
323✔
187
                uint64_t fraction = 0;
323✔
188
                if (mantissa_shift == 0) {
323✔
189
                        fraction = mantissa64;
×
190
                }
191
                else if (mantissa_shift > 0) {
323✔
192
                        // fraction's natural width exceeds 64 bits -- can't be held in
193
                        // uint64_t. This is unreachable for hfloat configs with fbits
194
                        // <= 64 (the standard hfloat_short / hfloat_long sizes) and is
195
                        // the hfloat_extended (fbits=112) edge case. Best-effort: cap
196
                        // fraction at the max representable in fbits so normalize_and_pack
197
                        // has something non-zero to encode. Full bit-exact conversion
198
                        // for fbits > 64 would need a multi-limb intermediate; left as
199
                        // future work.
200
                        if constexpr (fbits < 64) {
NEW
201
                                fraction = (uint64_t(1) << fbits) - 1u;
×
202
                        } else {
203
                                fraction = ~uint64_t(0);
2✔
204
                        }
205
                }
206
                else if (-mantissa_shift < 64) {
321✔
207
                        // Shift right truncates low bits -- IBM HFP's truncation rounding.
208
                        fraction = mantissa64 >> static_cast<unsigned>(-mantissa_shift);
321✔
209
                }
210
                // else: fraction stays 0 (deep underflow)
211

212
                if constexpr (fbits < 64) {
213
                        fraction &= ((uint64_t(1) << fbits) - 1u);
321✔
214
                }
215

216
                normalize_and_pack(negative, hex_exp, fraction);
323✔
217
                return *this;
323✔
218
        }
219

220
        // prefix operators
221
        constexpr hfloat operator-() const {
26✔
222
                hfloat negated(*this);
26✔
223
                if (!negated.iszero()) {
26✔
224
                        negated.setsign(!negated.sign());
21✔
225
                }
226
                return negated;
26✔
227
        }
228

229
        // arithmetic operators
230
        constexpr hfloat& operator+=(const hfloat& rhs) {
132✔
231
                bool lhs_sign, rhs_sign;
232
                int lhs_exp, rhs_exp;
233
                uint64_t lhs_frac, rhs_frac;
234
                unpack(lhs_sign, lhs_exp, lhs_frac);
132✔
235
                rhs.unpack(rhs_sign, rhs_exp, rhs_frac);
132✔
236

237
                if (rhs.iszero()) return *this;
132✔
238
                if (iszero()) { *this = rhs; return *this; }
128✔
239

240
                // align exponents by shifting the smaller-exponent fraction RIGHT
241
                // by 4*(exp_large - exp_small) bits (hex digit alignment)
242
                int shift = lhs_exp - rhs_exp;
126✔
243
                uint64_t aligned_lhs = lhs_frac;
126✔
244
                uint64_t aligned_rhs = rhs_frac;
126✔
245
                int result_exp;
246

247
                if (shift >= 0) {
126✔
248
                        result_exp = lhs_exp;
93✔
249
                        aligned_rhs >>= (static_cast<unsigned>(shift) * 4u);
93✔
250
                }
251
                else {
252
                        result_exp = rhs_exp;
33✔
253
                        aligned_lhs >>= (static_cast<unsigned>(-shift) * 4u);
33✔
254
                }
255

256
                // add/subtract based on signs
257
                int64_t result_frac;
258
                int64_t a = lhs_sign ? -static_cast<int64_t>(aligned_lhs) : static_cast<int64_t>(aligned_lhs);
126✔
259
                int64_t b = rhs_sign ? -static_cast<int64_t>(aligned_rhs) : static_cast<int64_t>(aligned_rhs);
126✔
260
                result_frac = a + b;
126✔
261

262
                bool result_sign = (result_frac < 0);
126✔
263
                uint64_t abs_frac = static_cast<uint64_t>(result_sign ? -result_frac : result_frac);
126✔
264

265
                normalize_and_pack(result_sign, result_exp, abs_frac);
126✔
266
                return *this;
126✔
267
        }
268

269
        constexpr hfloat& operator-=(const hfloat& rhs) {
63✔
270
                hfloat neg(rhs);
63✔
271
                if (!neg.iszero()) neg.setsign(!neg.sign());
63✔
272
                return operator+=(neg);
63✔
273
        }
274

275
        constexpr hfloat& operator*=(const hfloat& rhs) {
69✔
276
                if (iszero() || rhs.iszero()) { setzero(); return *this; }
69✔
277

278
                bool lhs_sign, rhs_sign;
279
                int lhs_exp, rhs_exp;
280
                uint64_t lhs_frac, rhs_frac;
281
                unpack(lhs_sign, lhs_exp, lhs_frac);
66✔
282
                rhs.unpack(rhs_sign, rhs_exp, rhs_frac);
66✔
283

284
                bool result_sign = (lhs_sign != rhs_sign);
66✔
285
                int result_exp = lhs_exp + rhs_exp;
66✔
286

287
#ifdef __SIZEOF_INT128__
288
                __uint128_t wide = static_cast<__uint128_t>(lhs_frac) * static_cast<__uint128_t>(rhs_frac);
66✔
289
                // The fractions are in 0.f format with fbits fraction bits
290
                // Product has 2*fbits bits, shift right by fbits to get back to fbits
291
                uint64_t result_frac = static_cast<uint64_t>(wide >> fbits);
66✔
292
                normalize_and_pack(result_sign, result_exp, result_frac);
66✔
293
#else
294
                // fallback: delegate through double
295
                double d = double(*this) * double(rhs);
296
                *this = d;
297
#endif
298
                return *this;
66✔
299
        }
300

301
        constexpr hfloat& operator/=(const hfloat& rhs) {
21✔
302
                if (rhs.iszero()) {
21✔
303
#if HFLOAT_THROW_ARITHMETIC_EXCEPTION
304
                        // Throwing in a constant expression is ill-formed; fence the
305
                        // throw under runtime to keep operator/= constexpr-callable.
306
                        // At constant-eval, divide-by-zero falls through to setzero()
307
                        // (IBM HFP has no NaN/inf, so saturation is the closest
308
                        // constexpr-safe behavior).
309
                        if (!std::is_constant_evaluated()) {
310
                                throw hfloat_divide_by_zero();
311
                        }
312
#endif
313
                        setzero();
1✔
314
                        return *this;
1✔
315
                }
316
                if (iszero()) return *this;
20✔
317

318
                bool lhs_sign, rhs_sign;
319
                int lhs_exp, rhs_exp;
320
                uint64_t lhs_frac, rhs_frac;
321
                unpack(lhs_sign, lhs_exp, lhs_frac);
19✔
322
                rhs.unpack(rhs_sign, rhs_exp, rhs_frac);
19✔
323

324
                bool result_sign = (lhs_sign != rhs_sign);
19✔
325
                int result_exp = lhs_exp - rhs_exp;
19✔
326

327
#ifdef __SIZEOF_INT128__
328
                // Scale numerator up by fbits for precision
329
                __uint128_t scaled_num = static_cast<__uint128_t>(lhs_frac) << fbits;
19✔
330
                uint64_t result_frac = static_cast<uint64_t>(scaled_num / static_cast<__uint128_t>(rhs_frac));
19✔
331
                normalize_and_pack(result_sign, result_exp, result_frac);
19✔
332
#else
333
                double d = double(*this) / double(rhs);
334
                *this = d;
335
#endif
336
                return *this;
19✔
337
        }
338

339
        // unary operators
340
        constexpr hfloat& operator++() {
2✔
341
                *this += hfloat(1);
2✔
342
                return *this;
2✔
343
        }
344
        constexpr hfloat operator++(int) {
345
                hfloat tmp(*this);
346
                operator++();
347
                return tmp;
348
        }
349
        constexpr hfloat& operator--() {
2✔
350
                *this -= hfloat(1);
2✔
351
                return *this;
2✔
352
        }
353
        constexpr hfloat operator--(int) {
1✔
354
                hfloat tmp(*this);
1✔
355
                operator--();
1✔
356
                return tmp;
1✔
357
        }
358

359
        // modifiers
360
        constexpr void clear() noexcept {
597✔
361
                for (unsigned i = 0; i < nrBlocks; ++i) _block[i] = bt(0);
1,238✔
362
        }
597✔
363
        constexpr void setzero() noexcept { clear(); }
35✔
364

365
        constexpr void setsign(bool negative = true) noexcept {
82✔
366
                setbit(nbits - 1, negative);
82✔
367
        }
82✔
368

369
        // use un-interpreted raw bits to set the value
370
        constexpr void setbits(uint64_t value) noexcept {
371
                clear();
372
                for (unsigned i = 0; i < nrBlocks; ++i) {
373
                        _block[i] = bt(value & BLOCK_MASK);
374
                        value >>= bitsInBlock;
375
                }
376
                _block[MSU] &= MSU_MASK;
377
        }
378

379
        // create specific number system values of interest
380
        //
381
        // The `max_frac` computations below cap at uint64_t max for fbits >= 64:
382
        // `1ull << 64` and wider are UB. For hfloat configs with fbits > 64
383
        // (hfloat_extended at fbits=112), the high fraction bits stay zero
384
        // since pack()'s fraction parameter is also uint64_t; full-width
385
        // fbits > 64 saturation is future work pending a multi-limb fraction
386
        // path through pack / normalize_and_pack.
387
        constexpr hfloat& maxpos() noexcept {
4✔
388
                clear();
4✔
389
                unsigned biased_exp = (1u << es) - 1u;
4✔
390
                uint64_t max_frac;
391
                if constexpr (fbits < 64) {
392
                        max_frac = (uint64_t(1) << fbits) - 1u;
2✔
393
                } else {
394
                        max_frac = ~uint64_t(0);
2✔
395
                }
396
                pack(false, static_cast<int>(biased_exp) - bias, max_frac);
4✔
397
                return *this;
4✔
398
        }
399
        constexpr hfloat& minpos() noexcept {
1✔
400
                clear();
1✔
401
                pack(false, emin, 1);
1✔
402
                return *this;
1✔
403
        }
404
        constexpr hfloat& zero() noexcept {
7✔
405
                clear();
7✔
406
                return *this;
7✔
407
        }
408
        constexpr hfloat& minneg() noexcept {
×
409
                clear();
×
410
                pack(true, emin, 1);
×
411
                return *this;
×
412
        }
413
        constexpr hfloat& maxneg() noexcept {
1✔
414
                clear();
1✔
415
                unsigned biased_exp = (1u << es) - 1u;
1✔
416
                uint64_t max_frac;
417
                if constexpr (fbits < 64) {
418
                        max_frac = (uint64_t(1) << fbits) - 1u;
1✔
419
                } else {
420
                        max_frac = ~uint64_t(0);
×
421
                }
422
                pack(true, static_cast<int>(biased_exp) - bias, max_frac);
1✔
423
                return *this;
1✔
424
        }
425

426
        // selectors
427
        constexpr bool sign() const noexcept {
1,119✔
428
                return getbit(nbits - 1);
1,119✔
429
        }
430

431
        constexpr bool iszero() const noexcept {
1,896✔
432
                // zero when fraction is all zeros (exponent and sign don't matter)
433
                // In IBM HFP, zero is represented as all-zeros
434
                for (unsigned i = 0; i < nrBlocks; ++i) {
1,996✔
435
                        bt mask = (i == MSU) ? bt(MSU_MASK & ~(bt(1) << ((nbits - 1) % bitsInBlock))) : BLOCK_MASK;
1,927✔
436
                        if ((_block[i] & mask) != 0) return false;
1,927✔
437
                }
438
                return true;
69✔
439
        }
440

441
        constexpr bool isone() const noexcept {
442
                bool s; int e; uint64_t f;
443
                unpack(s, e, f);
444
                // 1.0 = 0.1 * 16^1, so e=1, f = 1 << (fbits-4)  (leading hex digit = 1).
445
                // Gate the shift so `1ull << (fbits - 4)` doesn't UB for fbits >= 68
446
                // (hfloat_extended at fbits=112 would otherwise hit shift-count-overflow);
447
                // for those configs unpack() also caps the fraction at 64 useful bits,
448
                // so we can never construct the wide-fbits "leading hex digit = 1"
449
                // encoding here -- return false consistently.
450
                if constexpr (fbits >= 68) {
451
                        (void)s; (void)e; (void)f;
452
                        return false;
453
                }
454
                else {
455
                        return !s && (e == 1) && (f == (uint64_t(1) << (fbits - 4)));
456
                }
457
        }
458

459
        constexpr bool ispos() const noexcept { return !sign(); }
460
        constexpr bool isneg() const noexcept { return sign(); }
461

462
        // IBM HFP has no NaN or infinity
463
        constexpr bool isinf() const noexcept { return false; }
2✔
464
        constexpr bool isnan() const noexcept { return false; }
2✔
465
        constexpr bool isnan(int) const noexcept { return false; }
466

467
        constexpr int scale() const noexcept {
2✔
468
                if (iszero()) return 0;
2✔
469
                bool s; int e; uint64_t f;
470
                unpack(s, e, f);
2✔
471
                // IBM HFP: value = 0.f * 16^e
472
                // Scale in terms of powers of 2: scale = 4*e + leading_bit_position_of_f - fbits
473
                // But conceptually: scale = 4 * (e - bias_already_removed)
474
                // Find the leading 1 bit of the fraction. unpack() caps the
475
                // fraction at 64 useful bits, so we only need to scan up to bit 63.
476
                // Indices >= 64 would make `f >> i` UB.
477
                constexpr int read_iter = (fbits < 64) ? static_cast<int>(fbits) : 64;
2✔
478
                int leading = -1;
2✔
479
                for (int i = read_iter - 1; i >= 0; --i) {
25✔
480
                        if ((f >> i) & 1) { leading = i; break; }
25✔
481
                }
482
                if (leading < 0) return 0;
2✔
483
                return 4 * e + leading - static_cast<int>(fbits);
2✔
484
        }
485

486
        // convert to string
487
        std::string str(size_t nrDigits = 0) const {
57✔
488
                if (iszero()) return sign() ? std::string("-0") : std::string("0");
65✔
489

490
                double d = convert_to_double();
53✔
491
                std::stringstream ss;
53✔
492
                if (nrDigits > 0) {
53✔
493
                        ss << std::setprecision(static_cast<int>(nrDigits));
53✔
494
                }
495
                ss << d;
53✔
496
                return ss.str();
53✔
497
        }
53✔
498

499
        ///////////////////////////////////////////////////////////////////
500
        // Bit access (public for free functions)
501
        constexpr bool getbit(unsigned pos) const noexcept {
30,542✔
502
                if (pos >= nbits) return false;
30,542✔
503
                unsigned block_idx = pos / bitsInBlock;
30,542✔
504
                unsigned bit_idx = pos % bitsInBlock;
30,542✔
505
                return (_block[block_idx] >> bit_idx) & 1;
30,542✔
506
        }
507

508
        ///////////////////////////////////////////////////////////////////
509
        // Unpack into sign, unbiased exponent, and fraction
510
        constexpr void unpack(bool& s, int& exponent, uint64_t& fraction) const noexcept {
853✔
511
                s = sign();
853✔
512
                if (iszero()) { exponent = 0; fraction = 0; return; }
853✔
513

514
                // Extract exponent field (es bits)
515
                unsigned exp_field = 0;
844✔
516
                unsigned expStart = nbits - 2; // MSB of exponent (just below sign)
844✔
517
                for (unsigned i = 0; i < es; ++i) {
6,752✔
518
                        if (getbit(expStart - i)) {
5,908✔
519
                                exp_field |= (1u << (es - 1 - i));
1,778✔
520
                        }
521
                }
522
                exponent = static_cast<int>(exp_field) - bias;
844✔
523

524
                // Extract fraction. The destination is uint64_t so we read at most
525
                // 64 bits; high fraction bits (i >= 64) for hfloat_extended would
526
                // make `1ull << i` UB and aren't representable in the result type
527
                // anyway. Matches the cap on the pack() write path.
528
                fraction = 0;
844✔
529
                constexpr unsigned read_iter = (fbits < 64) ? fbits : 64u;
844✔
530
                for (unsigned i = 0; i < read_iter; ++i) {
22,204✔
531
                        if (getbit(i)) {
21,360✔
532
                                fraction |= (uint64_t(1) << i);
3,078✔
533
                        }
534
                }
535
        }
536

537
protected:
538
        bt _block[nrBlocks];
539

540
        ///////////////////////////////////////////////////////////////////
541
        // Bit manipulation helpers
542
        constexpr void setbit(unsigned pos, bool value) noexcept {
18,506✔
543
                if (pos >= nbits) return;
18,506✔
544
                unsigned block_idx = pos / bitsInBlock;
18,506✔
545
                unsigned bit_idx = pos % bitsInBlock;
18,506✔
546
                if (value) {
18,506✔
547
                        _block[block_idx] |= bt(1ull << bit_idx);
2,994✔
548
                }
549
                else {
550
                        _block[block_idx] &= bt(~(1ull << bit_idx));
15,512✔
551
                }
552
        }
553

554
        ///////////////////////////////////////////////////////////////////
555
        // Pack sign, unbiased exponent, and fraction
556
        constexpr void pack(bool s, int exponent, uint64_t fraction) noexcept {
549✔
557
                clear();
549✔
558

559
                // set sign
560
                setbit(nbits - 1, s);
549✔
561

562
                // set exponent field
563
                unsigned biased_exp = static_cast<unsigned>(exponent + bias);
549✔
564
                unsigned expStart = nbits - 2;
549✔
565
                for (unsigned i = 0; i < es; ++i) {
4,392✔
566
                        setbit(expStart - i, (biased_exp >> (es - 1 - i)) & 1);
3,843✔
567
                }
568

569
                // set fraction field. The source is a uint64_t so it carries at
570
                // most 64 useful bits; for fbits > 64 (hfloat_extended) the high
571
                // fbits stay zero because `fraction >> i` for i >= 64 is UB
572
                // (compilers typically mask the shift count modulo 64 which
573
                // would re-replicate the low bits at the top -- definitely
574
                // wrong). Multi-limb fraction support for fbits > 64 is future
575
                // work; until then, only the low 64 fraction bits are populated.
576
                constexpr unsigned frac_iter = (fbits < 64) ? fbits : 64u;
549✔
577
                for (unsigned i = 0; i < frac_iter; ++i) {
14,581✔
578
                        setbit(i, (fraction >> i) & 1);
14,032✔
579
                }
580
        }
549✔
581

582
        ///////////////////////////////////////////////////////////////////
583
        // Normalize: ensure leading hex digit is non-zero, then truncate
584
        constexpr void normalize_and_pack(bool s, int exponent, uint64_t fraction) noexcept {
561✔
585
                if (fraction == 0) { setzero(); return; }
561✔
586

587
                // Normalize the leading hex digit. For fbits < 64 the
588
                // `1ull << fbits` thresholds are well-defined and the loops
589
                // produce IBM-HFP-correct truncation. For fbits >= 64 the
590
                // uint64_t fraction cannot represent the full hex significand
591
                // (hfloat_extended is fbits=112); we skip the normalization
592
                // loops since the relevant thresholds would be UB and accept
593
                // that the wide-fbits saturation path is approximate. Full-
594
                // width fbits > 64 conversion needs a multi-limb fraction
595
                // representation and is left as future work.
596
                if constexpr (fbits < 64) {
597
                        while (fraction >= (uint64_t(1) << fbits)) {
560✔
598
                                fraction >>= 4;  // shift right by one hex digit
16✔
599
                                exponent++;
16✔
600
                        }
601
                        while (fraction > 0 && fraction < (uint64_t(1) << (fbits - 4))) {
595✔
602
                                fraction <<= 4;  // shift left by one hex digit
51✔
603
                                exponent--;
51✔
604
                        }
605
                        // Truncate to fbits (IBM HFP truncates, never rounds up).
606
                        fraction &= ((uint64_t(1) << fbits) - 1);
544✔
607
                }
608
                // else: fraction is already at most ~uint64_t(0), which by
609
                // construction fits within fbits >= 64; pack() places it in
610
                // the low 64 fbits and leaves the rest zero.
611

612
                // Check overflow/underflow
613
                if (exponent > emax) {
546✔
614
                        // overflow: saturate to maxpos/maxneg
615
                        if (s) maxneg(); else maxpos();
3✔
616
                        return;
3✔
617
                }
618
                if (exponent < emin) {
543✔
619
                        // underflow: set to zero
620
                        setzero();
×
621
                        return;
×
622
                }
623

624
                pack(s, exponent, fraction);
543✔
625
        }
626

627
        ///////////////////////////////////////////////////////////////////
628
        // Conversion helpers
629

630
        // Convert IEEE-754 double to hfloat
631
        //
632
        // Constexpr-safe rewrite of the original frexp/ldexp implementation:
633
        // - NaN detected via x != x (only NaN is unequal to itself)
634
        // - infinity detected by bracketing against numeric_limits::max()
635
        //   (numeric_limits<double>::max() is constexpr; std::isinf is not)
636
        // - fabs replaced with `negative ? -rhs : rhs`
637
        // - frexp/ldexp replaced with raw IEEE 754 field extraction via
638
        //   sw::bit_cast (constexpr on toolchains exposing std::bit_cast or
639
        //   __builtin_bit_cast; runtime fallback retains the legacy path).
640
        //
641
        // IBM HFP has no NaN and no infinity:
642
        //   NaN double  -> hfloat zero
643
        //   +/- inf     -> hfloat maxpos/maxneg (saturation)
644
        //
645
        // This used to duplicate the encoding algorithm. After the unification
646
        // for issue #849, it just extracts the IEEE double's 53-bit significand
647
        // (with hidden bit), lifts it into the 64-bit normalized mantissa
648
        // convention by left-shifting 11 bits (the headroom between IEEE
649
        // double's 53 and our canonical 64), and delegates to
650
        // `assign_from_mantissa64`. The shift is lossless -- the low 11 bits of
651
        // the wide mantissa are simply zero, since the double had no more
652
        // information to give.
653
        constexpr hfloat& convert_ieee754(double rhs) noexcept {
308✔
654
                if (rhs != rhs) {                       // NaN
308✔
655
                        setzero();
×
656
                        return *this;
×
657
                }
658
                if (rhs == 0.0) {
308✔
659
                        setzero();
14✔
660
                        return *this;
14✔
661
                }
662
                constexpr double dbl_max = std::numeric_limits<double>::max();
294✔
663
                if (rhs >  dbl_max) { maxpos(); return *this; }   // +inf
294✔
664
                if (rhs < -dbl_max) { maxneg(); return *this; }   // -inf
294✔
665

666
                bool negative = (rhs < 0);
294✔
667
                double abs_val = negative ? -rhs : rhs;
294✔
668

669
                // Extract IEEE 754 double fields without std::frexp:
670
                // IEEE 754 double: bias 1023, 52 fraction bits, hidden 1 for normals.
671
                //   normal: value = (1 + rawFrac/2^52) * 2^(rawExp - 1023)
672
                //                 = (2^52 + rawFrac) * 2^(rawExp - 1075)
673
                // frexp would return frac in [0.5, 1) with bin_exp = rawExp - 1022;
674
                // we use the same bin_exp here so the mantissa MSB lies at bit
675
                // (mantissa_width - 1), matching assign_from_mantissa64's contract.
676
                //
677
                // sw::bit_cast is constexpr on toolchains exposing std::bit_cast or
678
                // __builtin_bit_cast (the universally-supported case in C++20+); on
679
                // older toolchains it works at runtime via memcpy but is not
680
                // constexpr.  Calling convert_ieee754() in a constant expression on
681
                // such a toolchain produces a not-a-constant-expression diagnostic
682
                // at the bit_cast invocation -- the correct behavior, vs. silently
683
                // returning zero from a runtime fallback.
684
                uint64_t bits = sw::bit_cast<uint64_t>(abs_val);
294✔
685
                uint64_t rawExp  = (bits >> 52) & 0x7FFu;
294✔
686
                uint64_t rawFrac = bits & ((uint64_t(1) << 52) - 1u);
294✔
687
                if (rawExp == 0) {
294✔
688
                        // Subnormal double values are far below IBM HFP's smallest
689
                        // representable (16^emin); flush to zero per HFP semantics.
690
                        setzero();
×
691
                        return *this;
×
692
                }
693
                int bin_exp = static_cast<int>(rawExp) - 1022;
294✔
694
                uint64_t mantissa53 = (uint64_t(1) << 52) | rawFrac;  // 53-bit, MSB at bit 52
294✔
695
                uint64_t mantissa64 = mantissa53 << 11;               // lift to MSB at bit 63
294✔
696
                return assign_from_mantissa64(negative, bin_exp, mantissa64);
294✔
697
        }
698

699
        // Convert hfloat to IEEE-754 double
700
        //
701
        // Constexpr-safe: replaces std::ldexp(d, n) with a power-of-2 scaling
702
        // loop that multiplies/divides by 2.0 (exactly representable in double).
703
        // At runtime, dispatches to std::ldexp for performance.  The shift range
704
        // per template config is bounded (fbits + 4*emax for hfloat_extended is
705
        // ~360), so the constexpr loop is well-bounded.
706
        constexpr double convert_to_double() const noexcept {
335✔
707
                if (iszero()) return sign() ? -0.0 : 0.0;
335✔
708

709
                bool s; int e; uint64_t f;
710
                unpack(s, e, f);
311✔
711

712
                // value = 0.f * 16^e = f * 2^(-fbits) * 16^e = f * 2^(4*e - fbits)
713
                int shift = 4 * e - static_cast<int>(fbits);
311✔
714
                double result = static_cast<double>(f);
311✔
715
                if (std::is_constant_evaluated()) {
311✔
716
                        // Power-of-2 scaling: 2.0 and 0.5 are exactly representable
717
                        // in IEEE 754 double, so this introduces no rounding error
718
                        // over the bounded iteration count.
719
                        if (shift > 0) {
×
720
                                for (int i = 0; i < shift; ++i) result *= 2.0;
×
721
                        }
722
                        else if (shift < 0) {
×
723
                                for (int i = 0; i < -shift; ++i) result *= 0.5;
×
724
                        }
725
                }
726
                else {
727
                        result = std::ldexp(result, shift);
311✔
728
                }
729
                return s ? -result : result;
311✔
730
        }
731

732
        // Direct integer-to-hfloat packing for fbits <= 56 (hfloat_short and
733
        // hfloat_long).  Avoids the double round-trip that would lose precision
734
        // for int64_t values above 2^53 -- a value like 9007199254740993ULL
735
        // IS representable in hfloat_long (56-bit fraction) but rounds via
736
        // double.  For fbits >= 64 (hfloat_extended) the fraction is wider
737
        // than uint64_t can hold, so we fall back to convert_ieee754; this
738
        // keeps the same precision as pre-promotion code for that variant.
739
        constexpr hfloat& convert_signed(int64_t v) noexcept {
24✔
740
                if (v == 0) {
24✔
741
                        setzero();
1✔
742
                        return *this;
1✔
743
                }
744
                bool negative = (v < 0);
23✔
745
                // Compute |v| as uint64_t without ever negating an int64_t (the
746
                // negation of INT64_MIN would overflow).  Bitwise-safe identity:
747
                // |INT64_MIN| = -(v + 1) + 1, each step stays in range.
748
                uint64_t abs_v = negative
23✔
749
                        ? (static_cast<uint64_t>(-(v + 1)) + 1ull)
23✔
750
                        : static_cast<uint64_t>(v);
751
                if constexpr (fbits >= 64) {
752
                        return convert_ieee754(static_cast<double>(v));
753
                }
754
                else {
755
                        pack_uint64(negative, abs_v);
23✔
756
                        return *this;
23✔
757
                }
758
        }
759

760
        constexpr hfloat& convert_unsigned(uint64_t v) noexcept {
5✔
761
                if (v == 0) {
5✔
762
                        setzero();
1✔
763
                        return *this;
1✔
764
                }
765
                if constexpr (fbits >= 64) {
766
                        return convert_ieee754(static_cast<double>(v));
767
                }
768
                else {
769
                        pack_uint64(false, v);
4✔
770
                        return *this;
4✔
771
                }
772
        }
773

774
        // Pack a uint64_t magnitude into the hfloat's hex representation.
775
        // Precondition: v != 0 and fbits < 64.
776
        //   value = v = 0.f * 16^hex_exp where 0.f's leading hex digit is non-zero.
777
        // Algorithm:
778
        //   p          = highest set bit of v (0-indexed)
779
        //   hex_exp    = p/4 + 1   (number of hex digits to represent v)
780
        //   shift      = fbits - 4*hex_exp
781
        //   fraction   = (shift >= 0) ? v << shift : v >> -shift  (truncate per HFP)
782
        // Worst-case shift left: p=0, fbits<=56 -> shift=fbits-4, v<<shift fits in
783
        // uint64_t since v has only 1 bit.  Worst-case shift right: p=63, shift=fbits-64.
784
        constexpr void pack_uint64(bool negative, uint64_t v) noexcept {
27✔
785
                // Find highest set bit position (loop is bounded to 64 iterations
786
                // and constexpr-clean; v != 0 by precondition).
787
                unsigned p = 63;
27✔
788
                while (((v >> p) & 1u) == 0u) --p;
1,662✔
789
                int hex_exp = static_cast<int>(p / 4u) + 1;
27✔
790
                int shift = static_cast<int>(fbits) - 4 * hex_exp;
27✔
791
                uint64_t fraction = (shift >= 0)
27✔
792
                        ? (v << static_cast<unsigned>(shift))
27✔
793
                        : (v >> static_cast<unsigned>(-shift));
×
794
                normalize_and_pack(negative, hex_exp, fraction);
27✔
795
        }
27✔
796

797
private:
798

799
        // hfloat - hfloat logic comparisons
800
        template<unsigned N, unsigned E, typename B>
801
        friend constexpr bool operator==(const hfloat<N, E, B>& lhs, const hfloat<N, E, B>& rhs);
802

803
        // hfloat - literal logic comparisons
804
        template<unsigned N, unsigned E, typename B>
805
        friend constexpr bool operator==(const hfloat<N, E, B>& lhs, const double rhs);
806

807
        // literal - hfloat logic comparisons
808
        template<unsigned N, unsigned E, typename B>
809
        friend constexpr bool operator==(const double lhs, const hfloat<N, E, B>& rhs);
810
};
811

812

813
////////////////////////    helper functions   /////////////////////////////////
814

815
template<unsigned ndigits, unsigned es, typename BlockType>
816
inline std::string to_binary(const hfloat<ndigits, es, BlockType>& number, bool nibbleMarker = false) {
47✔
817
        using Hfloat = hfloat<ndigits, es, BlockType>;
818
        std::stringstream s;
47✔
819

820
        // sign bit
821
        s << "0b" << (number.sign() ? '1' : '0') << '.';
47✔
822

823
        // exponent field (es bits)
824
        unsigned expStart = Hfloat::nbits - 2;
47✔
825
        for (unsigned i = 0; i < es; ++i) {
376✔
826
                s << (number.getbit(expStart - i) ? '1' : '0');
329✔
827
        }
828
        s << '.';
47✔
829

830
        // fraction field (fbits bits, show in hex-digit groups)
831
        for (int i = static_cast<int>(Hfloat::fbits) - 1; i >= 0; --i) {
1,447✔
832
                s << (number.getbit(static_cast<unsigned>(i)) ? '1' : '0');
1,400✔
833
                if (nibbleMarker && i > 0 && (i % 4 == 0)) s << '\'';
1,400✔
834
        }
835

836
        return s.str();
94✔
837
}
47✔
838

839
template<unsigned ndigits, unsigned es, typename BlockType>
840
inline std::string to_hex(const hfloat<ndigits, es, BlockType>& number) {
9✔
841
        std::stringstream s;
9✔
842

843
        s << (number.sign() ? '-' : '+');
9✔
844
        s << "0x0.";
9✔
845

846
        // extract hex digits from fraction
847
        bool sign_val; int exp_val; uint64_t frac_val;
848
        number.unpack(sign_val, exp_val, frac_val);
9✔
849

850
        for (int i = static_cast<int>(ndigits) - 1; i >= 0; --i) {
123✔
851
                unsigned hex_digit = (frac_val >> (i * 4)) & 0xF;
114✔
852
                s << "0123456789ABCDEF"[hex_digit];
114✔
853
        }
854
        s << " * 16^" << (exp_val);
9✔
855

856
        return s.str();
18✔
857
}
9✔
858

859

860
// native semantic representation: radix-16, delegates to to_hex
861
template<unsigned ndigits, unsigned es, typename BlockType>
862
inline std::string to_native(const hfloat<ndigits, es, BlockType>& v, bool = false) {
863
        return to_hex(v);
864
}
865

866
////////////////////////    HFLOAT functions   /////////////////////////////////
867

868
template<unsigned ndigits, unsigned es, typename BlockType>
869
inline constexpr hfloat<ndigits, es, BlockType> abs(const hfloat<ndigits, es, BlockType>& a) {
870
        hfloat<ndigits, es, BlockType> result(a);
871
        result.setsign(false);
872
        return result;
873
}
874

875
template<unsigned ndigits, unsigned es, typename BlockType>
876
inline constexpr hfloat<ndigits, es, BlockType> fabs(hfloat<ndigits, es, BlockType> a) {
877
        a.setsign(false);
878
        return a;
879
}
880

881

882
////////////////////////  stream operators   /////////////////////////////////
883

884
template<unsigned ndigits, unsigned es, typename BlockType>
885
inline std::ostream& operator<<(std::ostream& ostr, const hfloat<ndigits, es, BlockType>& i) {
57✔
886
        std::stringstream ss;
57✔
887
        std::streamsize prec = ostr.precision();
57✔
888
        std::streamsize width = ostr.width();
57✔
889
        std::ios_base::fmtflags ff;
890
        ff = ostr.flags();
57✔
891
        ss.flags(ff);
57✔
892
        ss << std::setw(width) << std::setprecision(prec) << i.str(size_t(prec));
57✔
893
        return ostr << ss.str();
114✔
894
}
57✔
895

896
template<unsigned ndigits, unsigned es, typename BlockType>
897
inline std::istream& operator>>(std::istream& istr, hfloat<ndigits, es, BlockType>& p) {
2✔
898
        std::string txt;
2✔
899
        if (!(istr >> txt)) {
2✔
900
                // extraction failed (already-bad stream or EOF); failbit set by >>.
901
                return istr;
1✔
902
        }
903
        if (!parse(txt, p)) {
1✔
904
                std::cerr << "unable to parse -" << txt << "- into an hfloat value\n";
1✔
905
                istr.setstate(std::ios::failbit);
1✔
906
        }
907
        return istr;
1✔
908
}
2✔
909

910
////////////////// string operators
911

912
// Parse a decimal floating-point literal into an hfloat. hfloat has no NaN
913
// or Inf encoding, so "nan" / "inf" / "infinity" tokens are rejected. The
914
// decimal payload is converted via decimal_to_binary with 64 bits of
915
// precision and assembled directly into the hfloat hex fraction -- this
916
// gives bit-exact correctly-rounded conversion for hfloat_long's full 56
917
// fbits, which the via-double path could not deliver (double has only 53).
918
//
919
// Resolves #849.
920
template<unsigned ndigits, unsigned es, typename BlockType>
921
bool parse(const std::string& number, hfloat<ndigits, es, BlockType>& value) {
59✔
922
        if (number.empty()) return false;
59✔
923

924
        // Reject special-value tokens up front (hfloat has no NaN or Inf
925
        // encoding) and pre-validate the input grammar so malformed tokens are
926
        // signaled via false return rather than silently mapped to zero by the
927
        // d2b path.
928
        std::size_t pos = 0;
58✔
929
        while (pos < number.size() && std::isspace(static_cast<unsigned char>(number[pos]))) ++pos;
58✔
930
        if (pos >= number.size()) return false;
58✔
931
        const std::size_t numeric_start = pos;
58✔
932
        if (number[pos] == '+' || number[pos] == '-') ++pos;
58✔
933
        if (pos + 3 <= number.size()) {
58✔
934
                char a = static_cast<char>(std::tolower(static_cast<unsigned char>(number[pos])));
50✔
935
                char b = static_cast<char>(std::tolower(static_cast<unsigned char>(number[pos + 1])));
50✔
936
                char c = static_cast<char>(std::tolower(static_cast<unsigned char>(number[pos + 2])));
50✔
937
                if ((a == 'n' && b == 'a' && c == 'n')
50✔
938
                 || (a == 'i' && b == 'n' && c == 'f')) {
46✔
939
                        return false;  // hfloat cannot represent these
11✔
940
                }
941
        }
942
        // Pre-validate the rest as a decimal floating-point literal:
943
        // digits[.digits][eE[+-]digits] or .digits[eE[+-]digits].
944
        bool seen_digit = false;
47✔
945
        bool seen_dot   = false;
47✔
946
        if (pos >= number.size()) return false;
47✔
947
        while (pos < number.size()) {
152✔
948
                char ch = number[pos];
129✔
949
                if (ch >= '0' && ch <= '9') { seen_digit = true; ++pos; continue; }
129✔
950
                if (ch == '.') {
46✔
951
                        if (seen_dot) return false;
25✔
952
                        seen_dot = true;
24✔
953
                        ++pos;
24✔
954
                        continue;
24✔
955
                }
956
                break;
21✔
957
        }
958
        if (!seen_digit) return false;
44✔
959
        if (pos < number.size() && (number[pos] == 'e' || number[pos] == 'E')) {
39✔
960
                ++pos;
16✔
961
                if (pos < number.size() && (number[pos] == '+' || number[pos] == '-')) ++pos;
16✔
962
                bool seen_exp_digit = false;
16✔
963
                while (pos < number.size() && number[pos] >= '0' && number[pos] <= '9') {
46✔
964
                        seen_exp_digit = true;
30✔
965
                        ++pos;
30✔
966
                }
967
                if (!seen_exp_digit) return false;
16✔
968
        }
969
        if (pos != number.size()) return false;  // trailing junk
37✔
970

971
        // Hand the trimmed payload (post-whitespace) to decimal_to_binary with
972
        // 64 bits of precision -- enough to fill hfloat_long's 56 fbits exactly
973
        // and a few extra for hfloat_short's 24 fbits to round correctly. For
974
        // hfloat<28,*> (fbits=112) the result is the same lossy precision the
975
        // pre-existing operator=(double) path already had, since 64 < 112.
976
        std::string_view payload(number.data() + numeric_start, number.size() - numeric_start);
36✔
977
        auto d = ::sw::universal::decimal_to_binary::convert(payload, 64u);
36✔
978
        if (!d.valid) return false;
36✔
979
        if (d.is_zero) {
36✔
980
                value = hfloat<ndigits, es, BlockType>(SpecificValue::zero);
7✔
981
                return true;
7✔
982
        }
983

984
        // Extract the 64-bit normalized mantissa (MSB at bit 63) from d2b's
985
        // bigint. With target_mantissa_bits == 64, mantissa.bit[63] is the
986
        // implicit-1 of the binary expansion and bits [0, 62] are the fraction.
987
        std::uint64_t mantissa64 = 0;
29✔
988
        for (unsigned i = 0; i < 64; ++i) {
1,885✔
989
                if (d.mantissa.at(i)) mantissa64 |= (std::uint64_t{1} << i);
1,856✔
990
        }
991

992
        // d2b's binary_scale is the exponent of mantissa's MSB, so the
993
        // "frexp" exponent (value = 1.frac * 2^(bin_exp - 1)) is binary_scale + 1.
994
        int bin_exp = static_cast<int>(d.binary_scale) + 1;
29✔
995
        value.assign_from_mantissa64(d.negative, bin_exp, mantissa64);
29✔
996
        return true;
29✔
997
}
998

999

1000
//////////////////////////////////////////////////////////////////////////////////////////////////////
1001
// hfloat - hfloat binary logic operators
1002

1003
// hfloat values are normalized by normalize_and_pack so that the leading
1004
// hex digit of the fraction is non-zero -- the (sign, exponent, fraction)
1005
// tuple is unique per value.  We compare tuples directly instead of via
1006
// double() to preserve precision for hfloat_long (56-bit fraction) and
1007
// hfloat_extended, where the double round-trip would lose bits below the
1008
// 53-bit IEEE 754 mantissa.
1009
template<unsigned ndigits, unsigned es, typename BlockType>
1010
inline constexpr bool operator==(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
29✔
1011
        bool lz = lhs.iszero();
29✔
1012
        bool rz = rhs.iszero();
29✔
1013
        if (lz && rz) return true;       // +0 == -0 in HFP (no signed-zero distinction)
29✔
1014
        if (lz || rz) return false;
28✔
1015
        if (lhs.sign() != rhs.sign()) return false;
28✔
1016
        bool ts; int le = 0, re = 0;
28✔
1017
        uint64_t lf = 0, rf = 0;
28✔
1018
        lhs.unpack(ts, le, lf);
28✔
1019
        rhs.unpack(ts, re, rf);
28✔
1020
        return le == re && lf == rf;
28✔
1021
}
1022

1023
template<unsigned ndigits, unsigned es, typename BlockType>
1024
inline constexpr bool operator!=(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
22✔
1025
        return !operator==(lhs, rhs);
22✔
1026
}
1027

1028
template<unsigned ndigits, unsigned es, typename BlockType>
1029
inline constexpr bool operator< (const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
15✔
1030
        bool lz = lhs.iszero();
15✔
1031
        bool rz = rhs.iszero();
15✔
1032
        if (lz && rz) return false;
15✔
1033
        bool ls = lhs.sign();
15✔
1034
        bool rs = rhs.sign();
15✔
1035
        if (lz) return !rs;              // 0 < +x; 0 not < -x
15✔
1036
        if (rz) return ls;               // -x < 0; +x not < 0
14✔
1037
        if (ls != rs) return ls;         // negative < positive
13✔
1038
        // Same sign, both non-zero.  unpack returns (sign, biased-exp-removed,
1039
        // fraction).  For positives, lhs < rhs iff (le < re) || tie + (lf < rf).
1040
        // For negatives, the ordering reverses: lhs < rhs iff |lhs| > |rhs|.
1041
        bool ts; int le = 0, re = 0;
12✔
1042
        uint64_t lf = 0, rf = 0;
12✔
1043
        lhs.unpack(ts, le, lf);
12✔
1044
        rhs.unpack(ts, re, rf);
12✔
1045
        if (ls) {
12✔
1046
                // both negative: lhs < rhs iff |lhs| > |rhs|
1047
                return (le > re) || (le == re && lf > rf);
2✔
1048
        }
1049
        // both positive
1050
        return (le < re) || (le == re && lf < rf);
10✔
1051
}
1052

1053
template<unsigned ndigits, unsigned es, typename BlockType>
1054
inline constexpr bool operator> (const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
2✔
1055
        return operator< (rhs, lhs);
2✔
1056
}
1057

1058
template<unsigned ndigits, unsigned es, typename BlockType>
1059
inline constexpr bool operator<=(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
3✔
1060
        return operator< (lhs, rhs) || operator==(lhs, rhs);
3✔
1061
}
1062

1063
template<unsigned ndigits, unsigned es, typename BlockType>
1064
inline constexpr bool operator>=(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
3✔
1065
        return !operator< (lhs, rhs);
3✔
1066
}
1067

1068
//////////////////////////////////////////////////////////////////////////////////////////////////////
1069
// hfloat - literal binary logic operators
1070
template<unsigned ndigits, unsigned es, typename BlockType>
1071
inline constexpr bool operator==(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1072
        return operator==(lhs, hfloat<ndigits, es, BlockType>(rhs));
1073
}
1074
template<unsigned ndigits, unsigned es, typename BlockType>
1075
inline constexpr bool operator!=(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1076
        return !operator==(lhs, rhs);
1077
}
1078
template<unsigned ndigits, unsigned es, typename BlockType>
1079
inline constexpr bool operator< (const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1080
        return operator<(lhs, hfloat<ndigits, es, BlockType>(rhs));
1081
}
1082
template<unsigned ndigits, unsigned es, typename BlockType>
1083
inline constexpr bool operator> (const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1084
        return operator< (hfloat<ndigits, es, BlockType>(rhs), lhs);
1085
}
1086
template<unsigned ndigits, unsigned es, typename BlockType>
1087
inline constexpr bool operator<=(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1088
        return operator< (lhs, rhs) || operator==(lhs, rhs);
1089
}
1090
template<unsigned ndigits, unsigned es, typename BlockType>
1091
inline constexpr bool operator>=(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1092
        return !operator< (lhs, rhs);
1093
}
1094

1095
//////////////////////////////////////////////////////////////////////////////////////////////////////
1096
// literal - hfloat binary logic operators
1097
template<unsigned ndigits, unsigned es, typename BlockType>
1098
inline constexpr bool operator==(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1099
        return operator==(hfloat<ndigits, es, BlockType>(lhs), rhs);
1100
}
1101
template<unsigned ndigits, unsigned es, typename BlockType>
1102
inline constexpr bool operator!=(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1103
        return !operator==(lhs, rhs);
1104
}
1105
template<unsigned ndigits, unsigned es, typename BlockType>
1106
inline constexpr bool operator< (const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1107
        return operator<(hfloat<ndigits, es, BlockType>(lhs), rhs);
1108
}
1109
template<unsigned ndigits, unsigned es, typename BlockType>
1110
inline constexpr bool operator> (const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1111
        return operator< (rhs, lhs);
1112
}
1113
template<unsigned ndigits, unsigned es, typename BlockType>
1114
inline constexpr bool operator<=(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1115
        return operator< (lhs, rhs) || operator==(lhs, rhs);
1116
}
1117
template<unsigned ndigits, unsigned es, typename BlockType>
1118
inline constexpr bool operator>=(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1119
        return !operator< (lhs, rhs);
1120
}
1121

1122
//////////////////////////////////////////////////////////////////////////////////////////////////////
1123
// hfloat - hfloat binary arithmetic operators
1124
template<unsigned ndigits, unsigned es, typename BlockType>
1125
inline constexpr hfloat<ndigits, es, BlockType> operator+(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
67✔
1126
        hfloat<ndigits, es, BlockType> sum(lhs);
67✔
1127
        sum += rhs;
67✔
1128
        return sum;
67✔
1129
}
1130
template<unsigned ndigits, unsigned es, typename BlockType>
1131
inline constexpr hfloat<ndigits, es, BlockType> operator-(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
61✔
1132
        hfloat<ndigits, es, BlockType> diff(lhs);
61✔
1133
        diff -= rhs;
61✔
1134
        return diff;
61✔
1135
}
1136
template<unsigned ndigits, unsigned es, typename BlockType>
1137
inline constexpr hfloat<ndigits, es, BlockType> operator*(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
69✔
1138
        hfloat<ndigits, es, BlockType> mul(lhs);
69✔
1139
        mul *= rhs;
69✔
1140
        return mul;
69✔
1141
}
1142
template<unsigned ndigits, unsigned es, typename BlockType>
1143
inline constexpr hfloat<ndigits, es, BlockType> operator/(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
21✔
1144
        hfloat<ndigits, es, BlockType> ratio(lhs);
21✔
1145
        ratio /= rhs;
21✔
1146
        return ratio;
21✔
1147
}
1148

1149
//////////////////////////////////////////////////////////////////////////////////////////////////////
1150
// hfloat - literal binary arithmetic operators
1151
template<unsigned ndigits, unsigned es, typename BlockType>
1152
inline constexpr hfloat<ndigits, es, BlockType> operator+(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1153
        return operator+(lhs, hfloat<ndigits, es, BlockType>(rhs));
1154
}
1155
template<unsigned ndigits, unsigned es, typename BlockType>
1156
inline constexpr hfloat<ndigits, es, BlockType> operator-(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1157
        return operator-(lhs, hfloat<ndigits, es, BlockType>(rhs));
1158
}
1159
template<unsigned ndigits, unsigned es, typename BlockType>
1160
inline constexpr hfloat<ndigits, es, BlockType> operator*(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1161
        return operator*(lhs, hfloat<ndigits, es, BlockType>(rhs));
1162
}
1163
template<unsigned ndigits, unsigned es, typename BlockType>
1164
inline constexpr hfloat<ndigits, es, BlockType> operator/(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1165
        return operator/(lhs, hfloat<ndigits, es, BlockType>(rhs));
1166
}
1167

1168
//////////////////////////////////////////////////////////////////////////////////////////////////////
1169
// literal - hfloat binary arithmetic operators
1170
template<unsigned ndigits, unsigned es, typename BlockType>
1171
inline constexpr hfloat<ndigits, es, BlockType> operator+(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1172
        return operator+(hfloat<ndigits, es, BlockType>(lhs), rhs);
1173
}
1174
template<unsigned ndigits, unsigned es, typename BlockType>
1175
inline constexpr hfloat<ndigits, es, BlockType> operator-(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1176
        return operator-(hfloat<ndigits, es, BlockType>(lhs), rhs);
1177
}
1178
template<unsigned ndigits, unsigned es, typename BlockType>
1179
inline constexpr hfloat<ndigits, es, BlockType> operator*(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1180
        return operator*(hfloat<ndigits, es, BlockType>(lhs), rhs);
1181
}
1182
template<unsigned ndigits, unsigned es, typename BlockType>
1183
inline constexpr hfloat<ndigits, es, BlockType> operator/(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1184
        return operator/(hfloat<ndigits, es, BlockType>(lhs), rhs);
1185
}
1186

1187
}} // namespace sw::universal
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc