• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

stillwater-sc / universal / 26061348223

18 May 2026 09:24PM UTC coverage: 84.24%. First build
26061348223

Pull #869

github

web-flow
Merge 92b07a2d1 into 3e2a68633
Pull Request #869: fix: IBM HFP clean-up

85 of 97 new or added lines in 2 files covered. (87.63%)

46900 of 55674 relevant lines covered (84.24%)

6413619.74 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.29
/include/sw/universal/number/hfloat/hfloat_impl.hpp
1
#pragma once
2
// hfloat_impl.hpp: implementation of IBM System/360 hexadecimal floating-point
3
//
4
// Copyright (C) 2017 Stillwater Supercomputing, Inc.
5
// SPDX-License-Identifier: MIT
6
//
7
// This file is part of the universal numbers project, which is released under an MIT Open Source license.
8
//
9
// IBM System/360 Hexadecimal Floating-Point (1964):
10
//   Format: [sign(1)] [exponent(es)] [hex_fraction(ndigits*4 bits)]
11
//   Value:  (-1)^sign * 16^(exponent - bias) * 0.f1f2...fn
12
//
13
// Key properties:
14
//   - No hidden bit (fraction always has explicit leading hex digit)
15
//   - No NaN, no infinity, no subnormals
16
//   - Truncation rounding only (never rounds up)
17
//   - Overflow saturates to maxpos/maxneg
18
//   - Zero: sign=0, exponent=0, fraction=0
19
//   - Wobbling precision: 0-3 leading zero bits in MSB hex digit
20
//
21
// Standard configurations:
22
//   Short:    hfloat<6, 7>  = 1+7+24 = 32 bits
23
//   Long:     hfloat<14, 7> = 1+7+56 = 64 bits
24
//   Extended: hfloat<28, 7> = 1+7+112 = 120 bits (stored in 128)
25

26
#include <cctype>
27
#include <cstdint>
28
#include <cstring>
29
#include <cmath>
30
#include <string>
31
#include <string_view>
32
#include <sstream>
33
#include <iostream>
34
#include <iomanip>
35
#include <algorithm>
36

37
// supporting types and functions
38
#include <universal/native/ieee754.hpp>
39
#include <universal/utility/decimal_to_binary.hpp>
40
#include <universal/number/shared/nan_encoding.hpp>
41
#include <universal/number/shared/infinite_encoding.hpp>
42
#include <universal/number/shared/specific_value_encoding.hpp>
43
// hfloat exception structure
44
#include <universal/number/hfloat/exceptions.hpp>
45

46
namespace sw { namespace universal {
47

48
///////////////////////////////////////////////////////////////////////////////
49
// hfloat: IBM System/360 hexadecimal floating-point number
50
//
51
// Template parameters:
52
//   ndigits  - number of hexadecimal fraction digits
53
//   es       - exponent bits (7 for standard IBM HFP)
54
//   bt       - block type for storage
55
//
56
template<unsigned _ndigits, unsigned _es, typename bt = std::uint32_t>
57
class hfloat {
58
public:
59
        static constexpr unsigned ndigits     = _ndigits;            // hex fraction digits
60
        static constexpr unsigned es          = _es;                 // exponent bits
61
        static constexpr unsigned fbits       = ndigits * 4u;        // fraction bits
62
        static constexpr unsigned nbits       = 1u + es + fbits;     // total bits
63
        static constexpr int      bias        = (1 << (es - 1));     // exponent bias (64 for es=7)
64
        static constexpr int      emax        = (1 << es) - 1 - bias; // max unbiased exponent
65
        static constexpr int      emin        = -bias;                // min unbiased exponent
66

67
        typedef bt BlockType;
68

69
        static constexpr unsigned bitsInByte  = 8u;
70
        static constexpr unsigned bitsInBlock = sizeof(bt) * bitsInByte;
71
        static constexpr unsigned nrBlocks    = 1u + ((nbits - 1u) / bitsInBlock);
72
        static constexpr unsigned MSU         = nrBlocks - 1u;
73

74
        static constexpr bt ALL_ONES  = bt(~0);
75
        static constexpr bt MSU_MASK  = (nrBlocks * bitsInBlock == nbits) ? ALL_ONES : bt((1ull << (nbits % bitsInBlock)) - 1u);
76
        static constexpr bt BLOCK_MASK = bt(~0);
77

78
        /// trivial constructor
79
        hfloat() = default;
80

81
        hfloat(const hfloat&) = default;
82
        hfloat(hfloat&&) = default;
83

84
        hfloat& operator=(const hfloat&) = default;
85
        hfloat& operator=(hfloat&&) = default;
86

87
        // specific value constructor
88
        constexpr hfloat(const SpecificValue code) noexcept : _block{} {
11✔
89
                switch (code) {
11✔
90
                case SpecificValue::maxpos:
3✔
91
                        maxpos();
3✔
92
                        break;
3✔
93
                case SpecificValue::minpos:
1✔
94
                        minpos();
1✔
95
                        break;
1✔
96
                case SpecificValue::zero:
7✔
97
                default:
98
                        zero();
7✔
99
                        break;
7✔
100
                case SpecificValue::minneg:
×
101
                        minneg();
×
102
                        break;
×
103
                case SpecificValue::maxneg:
×
104
                        maxneg();
×
105
                        break;
×
106
                case SpecificValue::infpos:
×
107
                        maxpos();  // no infinity in HFP, saturate to maxpos
×
108
                        break;
×
109
                case SpecificValue::infneg:
×
110
                        maxneg();  // no infinity in HFP, saturate to maxneg
×
111
                        break;
×
112
                case SpecificValue::nar:
×
113
                case SpecificValue::qnan:
114
                case SpecificValue::snan:
115
                        zero();    // no NaN in HFP, map to zero
×
116
                        break;
×
117
                }
118
        }
11✔
119

120
        // initializers for native types
121
        constexpr explicit hfloat(signed char iv)           noexcept : _block{} { *this = iv; }
122
        constexpr explicit hfloat(short iv)                 noexcept : _block{} { *this = iv; }
123
        constexpr explicit hfloat(int iv)                   noexcept : _block{} { *this = iv; }
18✔
124
        constexpr explicit hfloat(long iv)                  noexcept : _block{} { *this = iv; }
125
        constexpr explicit hfloat(long long iv)             noexcept : _block{} { *this = iv; }
126
        constexpr explicit hfloat(char iv)                  noexcept : _block{} { *this = iv; }
127
        constexpr explicit hfloat(unsigned short iv)        noexcept : _block{} { *this = iv; }
128
        constexpr explicit hfloat(unsigned int iv)          noexcept : _block{} { *this = iv; }
5✔
129
        constexpr explicit hfloat(unsigned long iv)         noexcept : _block{} { *this = iv; }
130
        constexpr explicit hfloat(unsigned long long iv)    noexcept : _block{} { *this = iv; }
131
        constexpr explicit hfloat(float iv)                 noexcept : _block{} { *this = iv; }
132
        constexpr explicit hfloat(double iv)                noexcept : _block{} { *this = iv; }
308✔
133

134
        // assignment operators for native types
135
        constexpr hfloat& operator=(signed char rhs)        noexcept { return convert_signed(rhs); }
136
        constexpr hfloat& operator=(short rhs)              noexcept { return convert_signed(rhs); }
137
        constexpr hfloat& operator=(int rhs)                noexcept { return convert_signed(rhs); }
24✔
138
        constexpr hfloat& operator=(long rhs)               noexcept { return convert_signed(rhs); }
139
        constexpr hfloat& operator=(long long rhs)          noexcept { return convert_signed(rhs); }
140
        // Plain `char` may be signed or unsigned per platform; route through
141
        // the signed conversion via integer promotion so hfloat(char(-1)) on
142
        // signed-char targets yields -1, not UCHAR_MAX (CodeRabbit finding
143
        // from PR #805 applied to the sibling type).
144
        constexpr hfloat& operator=(char rhs)               noexcept { return convert_signed(static_cast<int>(rhs)); }
145
        constexpr hfloat& operator=(unsigned short rhs)     noexcept { return convert_unsigned(rhs); }
146
        constexpr hfloat& operator=(unsigned int rhs)       noexcept { return convert_unsigned(rhs); }
5✔
147
        constexpr hfloat& operator=(unsigned long rhs)      noexcept { return convert_unsigned(rhs); }
148
        constexpr hfloat& operator=(unsigned long long rhs) noexcept { return convert_unsigned(rhs); }
149
        constexpr hfloat& operator=(float rhs)              noexcept { return convert_ieee754(rhs); }
150
        constexpr hfloat& operator=(double rhs)             noexcept { return convert_ieee754(rhs); }
308✔
151

152
        // conversion operators
153
        constexpr explicit operator float()           const noexcept { return float(convert_to_double()); }
154
        constexpr explicit operator double()          const noexcept { return convert_to_double(); }
283✔
155

156
#if LONG_DOUBLE_SUPPORT
157
        constexpr explicit hfloat(long double iv)           noexcept : _block{} { *this = iv; }
158
        constexpr hfloat& operator=(long double rhs)        noexcept { return convert_ieee754(double(rhs)); }
159
        constexpr explicit operator long double()     const noexcept { return (long double)convert_to_double(); }
160
#endif
161

162
        // Unified entry point: assign from a normalized 64-bit binary mantissa
163
        // plus its "frexp" exponent. This is the algorithm core; both parse()
164
        // (which derives mantissa64 from decimal_to_binary) and convert_ieee754
165
        // (which lifts a double's 53-bit mantissa into the 64-bit convention by
166
        // left-shifting 11 bits) call this same routine.
167
        //
168
        // Preconditions:
169
        //   mantissa64 is normalized with MSB at bit 63 (the "implicit 1"
170
        //   position of a normalized binary mantissa). For inputs derived from
171
        //   an IEEE double, the low 11 bits will be zero -- but they don't have
172
        //   to be; parse() fills all 64.
173
        //   bin_exp is the "frexp" exponent: value = mantissa64 * 2^(bin_exp - 64).
174
        constexpr hfloat& assign_from_mantissa64(bool negative, int bin_exp, uint64_t mantissa64) noexcept {
326✔
175
                if (mantissa64 == 0) { setzero(); return *this; }
326✔
176

177
                // Convert binary exponent to base-16 exponent. ceil(bin_exp/4) so
178
                // 0.f * 16^hex_exp == value with f's leading hex digit non-zero.
179
                int hex_exp;
180
                if (bin_exp > 0) hex_exp = (bin_exp + 3) / 4;
326✔
181
                else             hex_exp = bin_exp / 4;  // C truncates toward zero == ceil for negative
71✔
182

183
                // Align mantissa to the hex fraction bit positions. For a 64-bit
184
                // mantissa, mantissa_shift = (bin_exp - 4*hex_exp + fbits) - 64.
185
                int shift = bin_exp - 4 * hex_exp + static_cast<int>(fbits);
326✔
186
                int mantissa_shift = shift - 64;
326✔
187
                uint64_t fraction = 0;
326✔
188
                if (mantissa_shift == 0) {
326✔
189
                        fraction = mantissa64;
×
190
                }
191
                else if (mantissa_shift > 0) {
326✔
192
                        // fraction's natural width exceeds 64 bits -- can't be held in
193
                        // uint64_t. This is unreachable for hfloat configs with fbits
194
                        // <= 64 (the standard hfloat_short / hfloat_long sizes) and is
195
                        // the hfloat_extended (fbits=112) path: the 64-bit mantissa must
196
                        // be placed at offset `mantissa_shift` within the fbits-wide
197
                        // fraction (its low `mantissa_shift` bits are truncated to zero
198
                        // -- IBM HFP truncation). Check overflow/underflow first (the
199
                        // usual normalize_and_pack route can't see the wide fraction),
200
                        // then pack sign + exponent and setbit() the mantissa bits at
201
                        // their target high positions. Full bit-exact fbits>64
202
                        // conversion needs a wider intermediate (e.g. __uint128_t) and
203
                        // is left as future work; this delivers hfp64-precision in an
204
                        // hfp128 container.
205
                        if constexpr (fbits >= 64) {
206
                                if (hex_exp > emax) {
207
                                        if (negative) maxneg(); else maxpos();
208
                                        return *this;
209
                                }
210
                                if (hex_exp < emin) {
211
                                        setzero();
212
                                        return *this;
213
                                }
214
                                if (static_cast<unsigned>(mantissa_shift) < 64u) {
215
                                        pack(negative, hex_exp, 0);
216
                                        unsigned offset = static_cast<unsigned>(mantissa_shift);
217
                                        unsigned upper = (offset + 64u <= fbits) ? 64u : (fbits - offset);
218
                                        for (unsigned i = 0; i < upper; ++i) {
219
                                                if ((mantissa64 >> i) & 1) setbit(offset + i, true);
220
                                        }
221
                                        return *this;
222
                                }
223
                                // mantissa_shift >= 64: entire 64-bit mantissa lies beyond
224
                                // bit 63 of the fraction. Saturating to maxpos keeps the
225
                                // exponent meaningful while signaling unrepresentable
226
                                // precision; rare for normal-magnitude inputs.
227
                                fraction = ~uint64_t(0);
228
                        }
229
                        else {
230
                                // fbits < 64 with mantissa_shift > 0 is mathematically
231
                                // unreachable (shift = bin_exp - 4*hex_exp + fbits with
232
                                // (bin_exp - 4*hex_exp) in [0,3], so mantissa_shift =
233
                                // shift - 64 <= fbits + 3 - 64 < 0 when fbits < 61), but
234
                                // keep a defensive fallback for very narrow configs.
NEW
235
                                fraction = (uint64_t(1) << fbits) - 1u;
×
236
                        }
237
                }
238
                else if (-mantissa_shift < 64) {
326✔
239
                        // Shift right truncates low bits -- IBM HFP's truncation rounding.
240
                        fraction = mantissa64 >> static_cast<unsigned>(-mantissa_shift);
326✔
241
                }
242
                // else: fraction stays 0 (deep underflow)
243

244
                if constexpr (fbits < 64) {
245
                        fraction &= ((uint64_t(1) << fbits) - 1u);
326✔
246
                }
247

248
                normalize_and_pack(negative, hex_exp, fraction);
326✔
249
                return *this;
326✔
250
        }
251

252
        // Assign from a wide (target_mantissa_bits == fbits) decimal_to_binary
253
        // result. Delivers full fbits of precision regardless of host integer
254
        // width -- the whole point of routing decimal literals through d2b
255
        // instead of through double. Used by parse() when fbits > 64; the
256
        // uint64_t-based assign_from_mantissa64 above still serves the
257
        // convert_ieee754 (double / long double) entry point, where the input
258
        // is bounded to ~64 mantissa bits regardless.
259
        template<unsigned BigBits>
260
        constexpr hfloat& assign_from_wide_mantissa(bool negative, int bin_exp,
6✔
261
            const ::sw::universal::decimal_to_binary::basic_result<BigBits>& d) noexcept {
262
                // d.mantissa is normalized with MSB at bit (fbits - 1). The hex-
263
                // digit alignment within IBM HFP requires at most a 0..3 bit right
264
                // shift relative to that normalization so the leading hex digit
265
                // (bits [fbits-4, fbits-1]) ends up non-zero.
266
                int hex_exp;
267
                if (bin_exp > 0) hex_exp = (bin_exp + 3) / 4;
6✔
268
                else             hex_exp = bin_exp / 4;  // C truncates toward zero == ceil for negative
2✔
269

270
                if (hex_exp > emax) {
6✔
271
                        if (negative) maxneg(); else maxpos();
1✔
272
                        return *this;
1✔
273
                }
274
                if (hex_exp < emin) {
5✔
NEW
275
                        setzero();
×
NEW
276
                        return *this;
×
277
                }
278

279
                // shift = bin_exp - 4*hex_exp lies in [-3, 0]. Read fraction bit i
280
                // from d.mantissa.at(i + right_shift); the low `right_shift` mantissa
281
                // bits are dropped -- IBM HFP truncation rounding.
282
                int shift = bin_exp - 4 * hex_exp;
5✔
283
                unsigned right_shift = static_cast<unsigned>(-shift);
5✔
284

285
                clear();
5✔
286
                setbit(nbits - 1, negative);
5✔
287
                unsigned biased_exp = static_cast<unsigned>(hex_exp + bias);
5✔
288
                unsigned expStart = nbits - 2;
5✔
289
                for (unsigned i = 0; i < es; ++i) {
40✔
290
                        setbit(expStart - i, (biased_exp >> (es - 1 - i)) & 1);
35✔
291
                }
292
                for (unsigned i = 0; i < fbits; ++i) {
565✔
293
                        if (d.mantissa.at(i + right_shift)) setbit(i, true);
560✔
294
                }
295
                return *this;
5✔
296
        }
297

298
        // prefix operators
299
        constexpr hfloat operator-() const {
26✔
300
                hfloat negated(*this);
26✔
301
                if (!negated.iszero()) {
26✔
302
                        negated.setsign(!negated.sign());
21✔
303
                }
304
                return negated;
26✔
305
        }
306

307
        // arithmetic operators
308
        constexpr hfloat& operator+=(const hfloat& rhs) {
132✔
309
                bool lhs_sign, rhs_sign;
310
                int lhs_exp, rhs_exp;
311
                uint64_t lhs_frac, rhs_frac;
312
                unpack(lhs_sign, lhs_exp, lhs_frac);
132✔
313
                rhs.unpack(rhs_sign, rhs_exp, rhs_frac);
132✔
314

315
                if (rhs.iszero()) return *this;
132✔
316
                if (iszero()) { *this = rhs; return *this; }
128✔
317

318
                // align exponents by shifting the smaller-exponent fraction RIGHT
319
                // by 4*(exp_large - exp_small) bits (hex digit alignment)
320
                int shift = lhs_exp - rhs_exp;
126✔
321
                uint64_t aligned_lhs = lhs_frac;
126✔
322
                uint64_t aligned_rhs = rhs_frac;
126✔
323
                int result_exp;
324

325
                if (shift >= 0) {
126✔
326
                        result_exp = lhs_exp;
93✔
327
                        aligned_rhs >>= (static_cast<unsigned>(shift) * 4u);
93✔
328
                }
329
                else {
330
                        result_exp = rhs_exp;
33✔
331
                        aligned_lhs >>= (static_cast<unsigned>(-shift) * 4u);
33✔
332
                }
333

334
                // add/subtract based on signs
335
                int64_t result_frac;
336
                int64_t a = lhs_sign ? -static_cast<int64_t>(aligned_lhs) : static_cast<int64_t>(aligned_lhs);
126✔
337
                int64_t b = rhs_sign ? -static_cast<int64_t>(aligned_rhs) : static_cast<int64_t>(aligned_rhs);
126✔
338
                result_frac = a + b;
126✔
339

340
                bool result_sign = (result_frac < 0);
126✔
341
                uint64_t abs_frac = static_cast<uint64_t>(result_sign ? -result_frac : result_frac);
126✔
342

343
                normalize_and_pack(result_sign, result_exp, abs_frac);
126✔
344
                return *this;
126✔
345
        }
346

347
        constexpr hfloat& operator-=(const hfloat& rhs) {
63✔
348
                hfloat neg(rhs);
63✔
349
                if (!neg.iszero()) neg.setsign(!neg.sign());
63✔
350
                return operator+=(neg);
63✔
351
        }
352

353
        constexpr hfloat& operator*=(const hfloat& rhs) {
69✔
354
                if (iszero() || rhs.iszero()) { setzero(); return *this; }
69✔
355

356
                bool lhs_sign, rhs_sign;
357
                int lhs_exp, rhs_exp;
358
                uint64_t lhs_frac, rhs_frac;
359
                unpack(lhs_sign, lhs_exp, lhs_frac);
66✔
360
                rhs.unpack(rhs_sign, rhs_exp, rhs_frac);
66✔
361

362
                bool result_sign = (lhs_sign != rhs_sign);
66✔
363
                int result_exp = lhs_exp + rhs_exp;
66✔
364

365
#ifdef __SIZEOF_INT128__
366
                __uint128_t wide = static_cast<__uint128_t>(lhs_frac) * static_cast<__uint128_t>(rhs_frac);
66✔
367
                // The fractions are in 0.f format with fbits fraction bits
368
                // Product has 2*fbits bits, shift right by fbits to get back to fbits
369
                uint64_t result_frac = static_cast<uint64_t>(wide >> fbits);
66✔
370
                normalize_and_pack(result_sign, result_exp, result_frac);
66✔
371
#else
372
                // fallback: delegate through double
373
                double d = double(*this) * double(rhs);
374
                *this = d;
375
#endif
376
                return *this;
66✔
377
        }
378

379
        constexpr hfloat& operator/=(const hfloat& rhs) {
21✔
380
                if (rhs.iszero()) {
21✔
381
#if HFLOAT_THROW_ARITHMETIC_EXCEPTION
382
                        // Throwing in a constant expression is ill-formed; fence the
383
                        // throw under runtime to keep operator/= constexpr-callable.
384
                        // At constant-eval, divide-by-zero falls through to setzero()
385
                        // (IBM HFP has no NaN/inf, so saturation is the closest
386
                        // constexpr-safe behavior).
387
                        if (!std::is_constant_evaluated()) {
388
                                throw hfloat_divide_by_zero();
389
                        }
390
#endif
391
                        setzero();
1✔
392
                        return *this;
1✔
393
                }
394
                if (iszero()) return *this;
20✔
395

396
                bool lhs_sign, rhs_sign;
397
                int lhs_exp, rhs_exp;
398
                uint64_t lhs_frac, rhs_frac;
399
                unpack(lhs_sign, lhs_exp, lhs_frac);
19✔
400
                rhs.unpack(rhs_sign, rhs_exp, rhs_frac);
19✔
401

402
                bool result_sign = (lhs_sign != rhs_sign);
19✔
403
                int result_exp = lhs_exp - rhs_exp;
19✔
404

405
#ifdef __SIZEOF_INT128__
406
                // Scale numerator up by fbits for precision
407
                __uint128_t scaled_num = static_cast<__uint128_t>(lhs_frac) << fbits;
19✔
408
                uint64_t result_frac = static_cast<uint64_t>(scaled_num / static_cast<__uint128_t>(rhs_frac));
19✔
409
                normalize_and_pack(result_sign, result_exp, result_frac);
19✔
410
#else
411
                double d = double(*this) / double(rhs);
412
                *this = d;
413
#endif
414
                return *this;
19✔
415
        }
416

417
        // unary operators
418
        constexpr hfloat& operator++() {
2✔
419
                *this += hfloat(1);
2✔
420
                return *this;
2✔
421
        }
422
        constexpr hfloat operator++(int) {
423
                hfloat tmp(*this);
424
                operator++();
425
                return tmp;
426
        }
427
        constexpr hfloat& operator--() {
2✔
428
                *this -= hfloat(1);
2✔
429
                return *this;
2✔
430
        }
431
        constexpr hfloat operator--(int) {
1✔
432
                hfloat tmp(*this);
1✔
433
                operator--();
1✔
434
                return tmp;
1✔
435
        }
436

437
        // modifiers
438
        constexpr void clear() noexcept {
608✔
439
                for (unsigned i = 0; i < nrBlocks; ++i) _block[i] = bt(0);
1,281✔
440
        }
608✔
441
        constexpr void setzero() noexcept { clear(); }
35✔
442

443
        constexpr void setsign(bool negative = true) noexcept {
82✔
444
                setbit(nbits - 1, negative);
82✔
445
        }
82✔
446

447
        // use un-interpreted raw bits to set the value
448
        constexpr void setbits(uint64_t value) noexcept {
449
                clear();
450
                for (unsigned i = 0; i < nrBlocks; ++i) {
451
                        _block[i] = bt(value & BLOCK_MASK);
452
                        value >>= bitsInBlock;
453
                }
454
                _block[MSU] &= MSU_MASK;
455
        }
456

457
        // create specific number system values of interest
458
        //
459
        // The `max_frac` computations below cap at uint64_t max for fbits >= 64:
460
        // `1ull << 64` and wider are UB. For hfloat configs with fbits > 64
461
        // (hfloat_extended at fbits=112), the high fraction bits stay zero
462
        // since pack()'s fraction parameter is also uint64_t; full-width
463
        // fbits > 64 saturation is future work pending a multi-limb fraction
464
        // path through pack / normalize_and_pack.
465
        constexpr hfloat& maxpos() noexcept {
5✔
466
                clear();
5✔
467
                unsigned biased_exp = (1u << es) - 1u;
5✔
468
                uint64_t max_frac;
469
                if constexpr (fbits < 64) {
470
                        max_frac = (uint64_t(1) << fbits) - 1u;
2✔
471
                } else {
472
                        max_frac = ~uint64_t(0);
3✔
473
                }
474
                pack(false, static_cast<int>(biased_exp) - bias, max_frac);
5✔
475
                return *this;
5✔
476
        }
477
        constexpr hfloat& minpos() noexcept {
1✔
478
                clear();
1✔
479
                pack(false, emin, 1);
1✔
480
                return *this;
1✔
481
        }
482
        constexpr hfloat& zero() noexcept {
7✔
483
                clear();
7✔
484
                return *this;
7✔
485
        }
NEW
486
        constexpr hfloat& minneg() noexcept {
×
NEW
487
                clear();
×
NEW
488
                pack(true, emin, 1);
×
NEW
489
                return *this;
×
490
        }
491
        constexpr hfloat& maxneg() noexcept {
1✔
492
                clear();
1✔
493
                unsigned biased_exp = (1u << es) - 1u;
1✔
494
                uint64_t max_frac;
495
                if constexpr (fbits < 64) {
496
                        max_frac = (uint64_t(1) << fbits) - 1u;
1✔
497
                } else {
NEW
498
                        max_frac = ~uint64_t(0);
×
499
                }
500
                pack(true, static_cast<int>(biased_exp) - bias, max_frac);
1✔
501
                return *this;
1✔
502
        }
503

504
        // selectors
505
        constexpr bool sign() const noexcept {
1,138✔
506
                return getbit(nbits - 1);
1,138✔
507
        }
508

509
        constexpr bool iszero() const noexcept {
1,925✔
510
                // zero when fraction is all zeros (exponent and sign don't matter)
511
                // In IBM HFP, zero is represented as all-zeros
512
                for (unsigned i = 0; i < nrBlocks; ++i) {
2,040✔
513
                        bt mask = (i == MSU) ? bt(MSU_MASK & ~(bt(1) << ((nbits - 1) % bitsInBlock))) : BLOCK_MASK;
1,971✔
514
                        if ((_block[i] & mask) != 0) return false;
1,971✔
515
                }
516
                return true;
69✔
517
        }
518

519
        constexpr bool isone() const noexcept {
520
                bool s; int e; uint64_t f;
521
                unpack(s, e, f);
522
                // 1.0 = 0.1 * 16^1, so e=1, f = 1 << (fbits-4)  (leading hex digit = 1).
523
                // Gate the shift so `1ull << (fbits - 4)` doesn't UB for fbits >= 68
524
                // (hfloat_extended at fbits=112 would otherwise hit shift-count-overflow);
525
                // for those configs unpack() also caps the fraction at 64 useful bits,
526
                // so we can never construct the wide-fbits "leading hex digit = 1"
527
                // encoding here -- return false consistently.
528
                if constexpr (fbits >= 68) {
529
                        (void)s; (void)e; (void)f;
530
                        return false;
531
                }
532
                else {
533
                        return !s && (e == 1) && (f == (uint64_t(1) << (fbits - 4)));
534
                }
535
        }
536

537
        constexpr bool ispos() const noexcept { return !sign(); }
538
        constexpr bool isneg() const noexcept { return sign(); }
539

540
        // IBM HFP has no NaN or infinity
541
        constexpr bool isinf() const noexcept { return false; }
2✔
542
        constexpr bool isnan() const noexcept { return false; }
2✔
543
        constexpr bool isnan(int) const noexcept { return false; }
544

545
        constexpr int scale() const noexcept {
4✔
546
                if (iszero()) return 0;
4✔
547
                bool s; int e; uint64_t f;
548
                unpack(s, e, f);
4✔
549
                (void)s;
550
                // IBM HFP: value = 0.f * 16^e. unpack() returns the fraction's
551
                // low 64 bits (for fbits < 64, that's the entire fraction). For
552
                // fbits >= 64 the leading hex digit lives in bits [fbits-4, fbits-1]
553
                // of the full fraction -- unpack's low-64-bit view sees only the
554
                // truncated tail, not the leading bit, so we read the top 64
555
                // fraction bits directly (bits [fbits-64, fbits-1]).
556
                uint64_t top = 0;
4✔
557
                if constexpr (fbits >= 64) {
558
                        for (unsigned i = 0; i < 64u; ++i) {
65✔
559
                                if (getbit(fbits - 64u + i)) top |= (uint64_t(1) << i);
64✔
560
                        }
561
                }
562
                else {
563
                        top = f;
3✔
564
                }
565
                constexpr int read_iter = (fbits < 64) ? static_cast<int>(fbits) : 64;
4✔
566
                int leading = -1;
4✔
567
                for (int i = read_iter - 1; i >= 0; --i) {
27✔
568
                        if ((top >> i) & 1) { leading = i; break; }
27✔
569
                }
570
                if (leading < 0) return 0;
4✔
571
                constexpr int frac_msb_weight = (fbits < 64) ? static_cast<int>(fbits) : 64;
4✔
572
                return 4 * e + leading - frac_msb_weight;
4✔
573
        }
574

575
        // convert to string
576
        std::string str(size_t nrDigits = 0) const {
63✔
577
                if (iszero()) return sign() ? std::string("-0") : std::string("0");
71✔
578

579
                double d = convert_to_double();
59✔
580
                std::stringstream ss;
59✔
581
                if (nrDigits > 0) {
59✔
582
                        ss << std::setprecision(static_cast<int>(nrDigits));
59✔
583
                }
584
                ss << d;
59✔
585
                return ss.str();
59✔
586
        }
59✔
587

588
        ///////////////////////////////////////////////////////////////////
589
        // Bit access (public for free functions)
590
        constexpr bool getbit(unsigned pos) const noexcept {
32,049✔
591
                if (pos >= nbits) return false;
32,049✔
592
                unsigned block_idx = pos / bitsInBlock;
32,049✔
593
                unsigned bit_idx = pos % bitsInBlock;
32,049✔
594
                return (_block[block_idx] >> bit_idx) & 1;
32,049✔
595
        }
596

597
        ///////////////////////////////////////////////////////////////////
598
        // Unpack into sign, unbiased exponent, and fraction
599
        constexpr void unpack(bool& s, int& exponent, uint64_t& fraction) const noexcept {
864✔
600
                s = sign();
864✔
601
                if (iszero()) { exponent = 0; fraction = 0; return; }
864✔
602

603
                // Extract exponent field (es bits)
604
                unsigned exp_field = 0;
855✔
605
                unsigned expStart = nbits - 2; // MSB of exponent (just below sign)
855✔
606
                for (unsigned i = 0; i < es; ++i) {
6,840✔
607
                        if (getbit(expStart - i)) {
5,985✔
608
                                exp_field |= (1u << (es - 1 - i));
1,812✔
609
                        }
610
                }
611
                exponent = static_cast<int>(exp_field) - bias;
855✔
612

613
                // Extract fraction. The destination is uint64_t so we read at most
614
                // 64 bits; high fraction bits (i >= 64) for hfloat_extended would
615
                // make `1ull << i` UB and aren't representable in the result type
616
                // anyway. Matches the cap on the pack() write path.
617
                fraction = 0;
855✔
618
                constexpr unsigned read_iter = (fbits < 64) ? fbits : 64u;
855✔
619
                for (unsigned i = 0; i < read_iter; ++i) {
22,815✔
620
                        if (getbit(i)) {
21,960✔
621
                                fraction |= (uint64_t(1) << i);
3,287✔
622
                        }
623
                }
624
        }
625

626
protected:
627
        bt _block[nrBlocks];
628

629
        ///////////////////////////////////////////////////////////////////
630
        // Bit manipulation helpers
631
        constexpr void setbit(unsigned pos, bool value) noexcept {
19,021✔
632
                if (pos >= nbits) return;
19,021✔
633
                unsigned block_idx = pos / bitsInBlock;
19,021✔
634
                unsigned bit_idx = pos % bitsInBlock;
19,021✔
635
                if (value) {
19,021✔
636
                        _block[block_idx] |= bt(1ull << bit_idx);
3,366✔
637
                }
638
                else {
639
                        _block[block_idx] &= bt(~(1ull << bit_idx));
15,655✔
640
                }
641
        }
642

643
        ///////////////////////////////////////////////////////////////////
644
        // Pack sign, unbiased exponent, and fraction
645
        constexpr void pack(bool s, int exponent, uint64_t fraction) noexcept {
554✔
646
                clear();
554✔
647

648
                // set sign
649
                setbit(nbits - 1, s);
554✔
650

651
                // set exponent field
652
                unsigned biased_exp = static_cast<unsigned>(exponent + bias);
554✔
653
                unsigned expStart = nbits - 2;
554✔
654
                for (unsigned i = 0; i < es; ++i) {
4,432✔
655
                        setbit(expStart - i, (biased_exp >> (es - 1 - i)) & 1);
3,878✔
656
                }
657

658
                // set fraction field. The source is a uint64_t so it carries at
659
                // most 64 useful bits; for fbits > 64 (hfloat_extended) the high
660
                // fbits stay zero because `fraction >> i` for i >= 64 is UB
661
                // (compilers typically mask the shift count modulo 64 which
662
                // would re-replicate the low bits at the top -- definitely
663
                // wrong). Multi-limb fraction support for fbits > 64 is future
664
                // work; until then, only the low 64 fraction bits are populated.
665
                constexpr unsigned frac_iter = (fbits < 64) ? fbits : 64u;
554✔
666
                for (unsigned i = 0; i < frac_iter; ++i) {
14,802✔
667
                        setbit(i, (fraction >> i) & 1);
14,248✔
668
                }
669
        }
554✔
670

671
        ///////////////////////////////////////////////////////////////////
672
        // Normalize: ensure leading hex digit is non-zero, then truncate
673
        constexpr void normalize_and_pack(bool s, int exponent, uint64_t fraction) noexcept {
564✔
674
                if (fraction == 0) { setzero(); return; }
564✔
675

676
                // Normalize the leading hex digit. For fbits < 64 the
677
                // `1ull << fbits` thresholds are well-defined and the loops
678
                // produce IBM-HFP-correct truncation. For fbits >= 64 the
679
                // uint64_t fraction cannot represent the full hex significand
680
                // (hfloat_extended is fbits=112); we skip the normalization
681
                // loops since the relevant thresholds would be UB and accept
682
                // that the wide-fbits saturation path is approximate. Full-
683
                // width fbits > 64 conversion needs a multi-limb fraction
684
                // representation and is left as future work.
685
                if constexpr (fbits < 64) {
686
                        while (fraction >= (uint64_t(1) << fbits)) {
565✔
687
                                fraction >>= 4;  // shift right by one hex digit
16✔
688
                                exponent++;
16✔
689
                        }
690
                        while (fraction > 0 && fraction < (uint64_t(1) << (fbits - 4))) {
600✔
691
                                fraction <<= 4;  // shift left by one hex digit
51✔
692
                                exponent--;
51✔
693
                        }
694
                        // Truncate to fbits (IBM HFP truncates, never rounds up).
695
                        fraction &= ((uint64_t(1) << fbits) - 1);
549✔
696
                }
697
                // else: fraction is already at most ~uint64_t(0), which by
698
                // construction fits within fbits >= 64; pack() places it in
699
                // the low 64 fbits and leaves the rest zero.
700

701
                // Check overflow/underflow
702
                if (exponent > emax) {
549✔
703
                        // overflow: saturate to maxpos/maxneg
704
                        if (s) maxneg(); else maxpos();
2✔
705
                        return;
2✔
706
                }
707
                if (exponent < emin) {
547✔
708
                        // underflow: set to zero
709
                        setzero();
×
710
                        return;
×
711
                }
712

713
                pack(s, exponent, fraction);
547✔
714
        }
715

716
        ///////////////////////////////////////////////////////////////////
717
        // Conversion helpers
718

719
        // Convert IEEE-754 double to hfloat
720
        //
721
        // Constexpr-safe rewrite of the original frexp/ldexp implementation:
722
        // - NaN detected via x != x (only NaN is unequal to itself)
723
        // - infinity detected by bracketing against numeric_limits::max()
724
        //   (numeric_limits<double>::max() is constexpr; std::isinf is not)
725
        // - fabs replaced with `negative ? -rhs : rhs`
726
        // - frexp/ldexp replaced with raw IEEE 754 field extraction via
727
        //   sw::bit_cast (constexpr on toolchains exposing std::bit_cast or
728
        //   __builtin_bit_cast; runtime fallback retains the legacy path).
729
        //
730
        // IBM HFP has no NaN and no infinity:
731
        //   NaN double  -> hfloat zero
732
        //   +/- inf     -> hfloat maxpos/maxneg (saturation)
733
        //
734
        // This used to duplicate the encoding algorithm. After the unification
735
        // for issue #849, it just extracts the IEEE double's 53-bit significand
736
        // (with hidden bit), lifts it into the 64-bit normalized mantissa
737
        // convention by left-shifting 11 bits (the headroom between IEEE
738
        // double's 53 and our canonical 64), and delegates to
739
        // `assign_from_mantissa64`. The shift is lossless -- the low 11 bits of
740
        // the wide mantissa are simply zero, since the double had no more
741
        // information to give.
742
        constexpr hfloat& convert_ieee754(double rhs) noexcept {
308✔
743
                if (rhs != rhs) {                       // NaN
308✔
744
                        setzero();
×
745
                        return *this;
×
746
                }
747
                if (rhs == 0.0) {
308✔
748
                        setzero();
14✔
749
                        return *this;
14✔
750
                }
751
                constexpr double dbl_max = std::numeric_limits<double>::max();
294✔
752
                if (rhs >  dbl_max) { maxpos(); return *this; }   // +inf
294✔
753
                if (rhs < -dbl_max) { maxneg(); return *this; }   // -inf
294✔
754

755
                bool negative = (rhs < 0);
294✔
756
                double abs_val = negative ? -rhs : rhs;
294✔
757

758
                // Extract IEEE 754 double fields without std::frexp:
759
                // IEEE 754 double: bias 1023, 52 fraction bits, hidden 1 for normals.
760
                //   normal: value = (1 + rawFrac/2^52) * 2^(rawExp - 1023)
761
                //                 = (2^52 + rawFrac) * 2^(rawExp - 1075)
762
                // frexp would return frac in [0.5, 1) with bin_exp = rawExp - 1022;
763
                // we use the same bin_exp here so the mantissa MSB lies at bit
764
                // (mantissa_width - 1), matching assign_from_mantissa64's contract.
765
                //
766
                // sw::bit_cast is constexpr on toolchains exposing std::bit_cast or
767
                // __builtin_bit_cast (the universally-supported case in C++20+); on
768
                // older toolchains it works at runtime via memcpy but is not
769
                // constexpr.  Calling convert_ieee754() in a constant expression on
770
                // such a toolchain produces a not-a-constant-expression diagnostic
771
                // at the bit_cast invocation -- the correct behavior, vs. silently
772
                // returning zero from a runtime fallback.
773
                uint64_t bits = sw::bit_cast<uint64_t>(abs_val);
294✔
774
                uint64_t rawExp  = (bits >> 52) & 0x7FFu;
294✔
775
                uint64_t rawFrac = bits & ((uint64_t(1) << 52) - 1u);
294✔
776
                if (rawExp == 0) {
294✔
777
                        // Subnormal double values are far below IBM HFP's smallest
778
                        // representable (16^emin); flush to zero per HFP semantics.
779
                        setzero();
×
780
                        return *this;
×
781
                }
782
                int bin_exp = static_cast<int>(rawExp) - 1022;
294✔
783
                uint64_t mantissa53 = (uint64_t(1) << 52) | rawFrac;  // 53-bit, MSB at bit 52
294✔
784
                uint64_t mantissa64 = mantissa53 << 11;               // lift to MSB at bit 63
294✔
785
                return assign_from_mantissa64(negative, bin_exp, mantissa64);
294✔
786
        }
787

788
        // Convert hfloat to IEEE-754 double
789
        //
790
        // Constexpr-safe: replaces std::ldexp(d, n) with a power-of-2 scaling
791
        // loop that multiplies/divides by 2.0 (exactly representable in double).
792
        // At runtime, dispatches to std::ldexp for performance.  The shift range
793
        // per template config is bounded (fbits + 4*emax for hfloat_extended is
794
        // ~360), so the constexpr loop is well-bounded.
795
        constexpr double convert_to_double() const noexcept {
342✔
796
                if (iszero()) return sign() ? -0.0 : 0.0;
342✔
797

798
                bool s; int e; uint64_t f;
799
                unpack(s, e, f);
318✔
800

801
                // value = 0.f * 16^e. For fbits < 64 the unpack'd f IS the full
802
                // fraction and value = f * 2^(-fbits) * 16^e = f * 2^(4*e - fbits).
803
                // For fbits >= 64 the leading hex digit lives in the TOP 64 fraction
804
                // bits, which unpack's low-64 view doesn't see; read them directly
805
                // and treat f as representing fraction's bits [fbits-64, fbits-1],
806
                // so value = f * 2^(-64) * 16^e = f * 2^(4*e - 64).
807
                int shift;
808
                if constexpr (fbits >= 64) {
809
                        f = 0;
5✔
810
                        for (unsigned i = 0; i < 64u; ++i) {
325✔
811
                                if (getbit(fbits - 64u + i)) f |= (uint64_t(1) << i);
320✔
812
                        }
813
                        shift = 4 * e - 64;
5✔
814
                }
815
                else {
816
                        shift = 4 * e - static_cast<int>(fbits);
313✔
817
                }
818
                double result = static_cast<double>(f);
318✔
819
                if (std::is_constant_evaluated()) {
318✔
820
                        // Power-of-2 scaling: 2.0 and 0.5 are exactly representable
821
                        // in IEEE 754 double, so this introduces no rounding error
822
                        // over the bounded iteration count.
NEW
823
                        if (shift > 0) {
×
NEW
824
                                for (int i = 0; i < shift; ++i) result *= 2.0;
×
825
                        }
NEW
826
                        else if (shift < 0) {
×
NEW
827
                                for (int i = 0; i < -shift; ++i) result *= 0.5;
×
828
                        }
829
                }
830
                else {
831
                        result = std::ldexp(result, shift);
318✔
832
                }
833
                return s ? -result : result;
318✔
834
        }
835

836
        // Direct integer-to-hfloat packing for fbits <= 56 (hfloat_short and
837
        // hfloat_long).  Avoids the double round-trip that would lose precision
838
        // for int64_t values above 2^53 -- a value like 9007199254740993ULL
839
        // IS representable in hfloat_long (56-bit fraction) but rounds via
840
        // double.  For fbits >= 64 (hfloat_extended) the fraction is wider
841
        // than uint64_t can hold, so we fall back to convert_ieee754; this
842
        // keeps the same precision as pre-promotion code for that variant.
843
        constexpr hfloat& convert_signed(int64_t v) noexcept {
24✔
844
                if (v == 0) {
24✔
845
                        setzero();
1✔
846
                        return *this;
1✔
847
                }
848
                bool negative = (v < 0);
23✔
849
                // Compute |v| as uint64_t without ever negating an int64_t (the
850
                // negation of INT64_MIN would overflow).  Bitwise-safe identity:
851
                // |INT64_MIN| = -(v + 1) + 1, each step stays in range.
852
                uint64_t abs_v = negative
23✔
853
                        ? (static_cast<uint64_t>(-(v + 1)) + 1ull)
23✔
854
                        : static_cast<uint64_t>(v);
855
                if constexpr (fbits >= 64) {
856
                        return convert_ieee754(static_cast<double>(v));
857
                }
858
                else {
859
                        pack_uint64(negative, abs_v);
23✔
860
                        return *this;
23✔
861
                }
862
        }
863

864
        constexpr hfloat& convert_unsigned(uint64_t v) noexcept {
5✔
865
                if (v == 0) {
5✔
866
                        setzero();
1✔
867
                        return *this;
1✔
868
                }
869
                if constexpr (fbits >= 64) {
870
                        return convert_ieee754(static_cast<double>(v));
871
                }
872
                else {
873
                        pack_uint64(false, v);
4✔
874
                        return *this;
4✔
875
                }
876
        }
877

878
        // Pack a uint64_t magnitude into the hfloat's hex representation.
879
        // Precondition: v != 0 and fbits < 64.
880
        //   value = v = 0.f * 16^hex_exp where 0.f's leading hex digit is non-zero.
881
        // Algorithm:
882
        //   p          = highest set bit of v (0-indexed)
883
        //   hex_exp    = p/4 + 1   (number of hex digits to represent v)
884
        //   shift      = fbits - 4*hex_exp
885
        //   fraction   = (shift >= 0) ? v << shift : v >> -shift  (truncate per HFP)
886
        // Worst-case shift left: p=0, fbits<=56 -> shift=fbits-4, v<<shift fits in
887
        // uint64_t since v has only 1 bit.  Worst-case shift right: p=63, shift=fbits-64.
888
        constexpr void pack_uint64(bool negative, uint64_t v) noexcept {
27✔
889
                // Find highest set bit position (loop is bounded to 64 iterations
890
                // and constexpr-clean; v != 0 by precondition).
891
                unsigned p = 63;
27✔
892
                while (((v >> p) & 1u) == 0u) --p;
1,662✔
893
                int hex_exp = static_cast<int>(p / 4u) + 1;
27✔
894
                int shift = static_cast<int>(fbits) - 4 * hex_exp;
27✔
895
                uint64_t fraction = (shift >= 0)
27✔
896
                        ? (v << static_cast<unsigned>(shift))
27✔
897
                        : (v >> static_cast<unsigned>(-shift));
×
898
                normalize_and_pack(negative, hex_exp, fraction);
27✔
899
        }
27✔
900

901
private:
902

903
        // hfloat - hfloat logic comparisons
904
        template<unsigned N, unsigned E, typename B>
905
        friend constexpr bool operator==(const hfloat<N, E, B>& lhs, const hfloat<N, E, B>& rhs);
906

907
        // hfloat - literal logic comparisons
908
        template<unsigned N, unsigned E, typename B>
909
        friend constexpr bool operator==(const hfloat<N, E, B>& lhs, const double rhs);
910

911
        // literal - hfloat logic comparisons
912
        template<unsigned N, unsigned E, typename B>
913
        friend constexpr bool operator==(const double lhs, const hfloat<N, E, B>& rhs);
914
};
915

916

917
////////////////////////    helper functions   /////////////////////////////////
918

919
template<unsigned ndigits, unsigned es, typename BlockType>
920
inline std::string to_binary(const hfloat<ndigits, es, BlockType>& number, bool nibbleMarker = false) {
53✔
921
        using Hfloat = hfloat<ndigits, es, BlockType>;
922
        std::stringstream s;
53✔
923

924
        // sign bit
925
        s << "0b" << (number.sign() ? '1' : '0') << '.';
53✔
926

927
        // exponent field (es bits)
928
        unsigned expStart = Hfloat::nbits - 2;
53✔
929
        for (unsigned i = 0; i < es; ++i) {
424✔
930
                s << (number.getbit(expStart - i) ? '1' : '0');
371✔
931
        }
932
        s << '.';
53✔
933

934
        // fraction field (fbits bits, show in hex-digit groups)
935
        for (int i = static_cast<int>(Hfloat::fbits) - 1; i >= 0; --i) {
1,837✔
936
                s << (number.getbit(static_cast<unsigned>(i)) ? '1' : '0');
1,784✔
937
                if (nibbleMarker && i > 0 && (i % 4 == 0)) s << '\'';
1,784✔
938
        }
939

940
        return s.str();
106✔
941
}
53✔
942

943
template<unsigned ndigits, unsigned es, typename BlockType>
944
inline std::string to_hex(const hfloat<ndigits, es, BlockType>& number) {
9✔
945
        std::stringstream s;
9✔
946

947
        s << (number.sign() ? '-' : '+');
9✔
948
        s << "0x0.";
9✔
949

950
        // extract hex digits from fraction
951
        bool sign_val; int exp_val; uint64_t frac_val;
952
        number.unpack(sign_val, exp_val, frac_val);
9✔
953

954
        for (int i = static_cast<int>(ndigits) - 1; i >= 0; --i) {
123✔
955
                unsigned hex_digit = (frac_val >> (i * 4)) & 0xF;
114✔
956
                s << "0123456789ABCDEF"[hex_digit];
114✔
957
        }
958
        s << " * 16^" << (exp_val);
9✔
959

960
        return s.str();
18✔
961
}
9✔
962

963

964
// native semantic representation: radix-16, delegates to to_hex
965
template<unsigned ndigits, unsigned es, typename BlockType>
966
inline std::string to_native(const hfloat<ndigits, es, BlockType>& v, bool = false) {
967
        return to_hex(v);
968
}
969

970
////////////////////////    HFLOAT functions   /////////////////////////////////
971

972
template<unsigned ndigits, unsigned es, typename BlockType>
973
inline constexpr hfloat<ndigits, es, BlockType> abs(const hfloat<ndigits, es, BlockType>& a) {
974
        hfloat<ndigits, es, BlockType> result(a);
975
        result.setsign(false);
976
        return result;
977
}
978

979
template<unsigned ndigits, unsigned es, typename BlockType>
980
inline constexpr hfloat<ndigits, es, BlockType> fabs(hfloat<ndigits, es, BlockType> a) {
981
        a.setsign(false);
982
        return a;
983
}
984

985

986
////////////////////////  stream operators   /////////////////////////////////
987

988
template<unsigned ndigits, unsigned es, typename BlockType>
989
inline std::ostream& operator<<(std::ostream& ostr, const hfloat<ndigits, es, BlockType>& i) {
63✔
990
        std::stringstream ss;
63✔
991
        std::streamsize prec = ostr.precision();
63✔
992
        std::streamsize width = ostr.width();
63✔
993
        std::ios_base::fmtflags ff;
994
        ff = ostr.flags();
63✔
995
        ss.flags(ff);
63✔
996
        ss << std::setw(width) << std::setprecision(prec) << i.str(size_t(prec));
63✔
997
        return ostr << ss.str();
126✔
998
}
63✔
999

1000
template<unsigned ndigits, unsigned es, typename BlockType>
1001
inline std::istream& operator>>(std::istream& istr, hfloat<ndigits, es, BlockType>& p) {
2✔
1002
        std::string txt;
2✔
1003
        if (!(istr >> txt)) {
2✔
1004
                // extraction failed (already-bad stream or EOF); failbit set by >>.
1005
                return istr;
1✔
1006
        }
1007
        if (!parse(txt, p)) {
1✔
1008
                std::cerr << "unable to parse -" << txt << "- into an hfloat value\n";
1✔
1009
                istr.setstate(std::ios::failbit);
1✔
1010
        }
1011
        return istr;
1✔
1012
}
2✔
1013

1014
////////////////// string operators
1015

1016
// Parse a decimal floating-point literal into an hfloat. hfloat has no NaN
1017
// or Inf encoding, so "nan" / "inf" / "infinity" tokens are rejected. The
1018
// decimal payload is converted via decimal_to_binary with 64 bits of
1019
// precision and assembled directly into the hfloat hex fraction -- this
1020
// gives bit-exact correctly-rounded conversion for hfloat_long's full 56
1021
// fbits, which the via-double path could not deliver (double has only 53).
1022
//
1023
// Resolves #849.
1024
template<unsigned ndigits, unsigned es, typename BlockType>
1025
bool parse(const std::string& number, hfloat<ndigits, es, BlockType>& value) {
68✔
1026
        if (number.empty()) return false;
68✔
1027

1028
        // Reject special-value tokens up front (hfloat has no NaN or Inf
1029
        // encoding) and pre-validate the input grammar so malformed tokens are
1030
        // signaled via false return rather than silently mapped to zero by the
1031
        // d2b path.
1032
        std::size_t pos = 0;
67✔
1033
        while (pos < number.size() && std::isspace(static_cast<unsigned char>(number[pos]))) ++pos;
67✔
1034
        if (pos >= number.size()) return false;
67✔
1035
        const std::size_t numeric_start = pos;
67✔
1036
        if (number[pos] == '+' || number[pos] == '-') ++pos;
67✔
1037
        if (pos + 3 <= number.size()) {
67✔
1038
                char a = static_cast<char>(std::tolower(static_cast<unsigned char>(number[pos])));
59✔
1039
                char b = static_cast<char>(std::tolower(static_cast<unsigned char>(number[pos + 1])));
59✔
1040
                char c = static_cast<char>(std::tolower(static_cast<unsigned char>(number[pos + 2])));
59✔
1041
                if ((a == 'n' && b == 'a' && c == 'n')
59✔
1042
                 || (a == 'i' && b == 'n' && c == 'f')) {
55✔
1043
                        return false;  // hfloat cannot represent these
11✔
1044
                }
1045
        }
1046
        // Pre-validate the rest as a decimal floating-point literal:
1047
        // digits[.digits][eE[+-]digits] or .digits[eE[+-]digits].
1048
        bool seen_digit = false;
56✔
1049
        bool seen_dot   = false;
56✔
1050
        if (pos >= number.size()) return false;
56✔
1051
        while (pos < number.size()) {
387✔
1052
                char ch = number[pos];
360✔
1053
                if (ch >= '0' && ch <= '9') { seen_digit = true; ++pos; continue; }
360✔
1054
                if (ch == '.') {
60✔
1055
                        if (seen_dot) return false;
34✔
1056
                        seen_dot = true;
33✔
1057
                        ++pos;
33✔
1058
                        continue;
33✔
1059
                }
1060
                break;
26✔
1061
        }
1062
        if (!seen_digit) return false;
53✔
1063
        if (pos < number.size() && (number[pos] == 'e' || number[pos] == 'E')) {
48✔
1064
                ++pos;
21✔
1065
                if (pos < number.size() && (number[pos] == '+' || number[pos] == '-')) ++pos;
21✔
1066
                bool seen_exp_digit = false;
21✔
1067
                while (pos < number.size() && number[pos] >= '0' && number[pos] <= '9') {
61✔
1068
                        seen_exp_digit = true;
40✔
1069
                        ++pos;
40✔
1070
                }
1071
                if (!seen_exp_digit) return false;
21✔
1072
        }
1073
        if (pos != number.size()) return false;  // trailing junk
46✔
1074

1075
        // Hand the trimmed payload (post-whitespace) to decimal_to_binary.
1076
        // Request fbits of precision when fbits > 64 so wide-fraction configs
1077
        // (hfloat<28,*> with fbits=112) get bit-exact conversion through the
1078
        // d2b multi-limb mantissa instead of being clipped to double-precision
1079
        // like the legacy via-double path. For fbits <= 64 stay at 64 bits --
1080
        // that pinned the hfp64 0.1 truncation test under issue #849 and gives
1081
        // hfp32 a few guard bits for the d2b path's rounding to settle.
1082
        using H = hfloat<ndigits, es, BlockType>;
1083
        constexpr unsigned target_bits = (H::fbits > 64u) ? H::fbits : 64u;
45✔
1084
        std::string_view payload(number.data() + numeric_start, number.size() - numeric_start);
45✔
1085
        auto d = ::sw::universal::decimal_to_binary::convert(payload, target_bits);
45✔
1086
        if (!d.valid) return false;
45✔
1087
        if (d.is_zero) {
45✔
1088
                value = H(SpecificValue::zero);
7✔
1089
                return true;
7✔
1090
        }
1091

1092
        // d2b's binary_scale is the exponent of mantissa's MSB, so the
1093
        // "frexp" exponent (value = 1.frac * 2^(bin_exp - 1)) is binary_scale + 1.
1094
        int bin_exp = static_cast<int>(d.binary_scale) + 1;
38✔
1095

1096
        if constexpr (H::fbits > 64u) {
1097
                // Wide-fraction path: feed d2b's full multi-limb mantissa directly
1098
                // into the hex-aligned fraction. This is what makes parse() deliver
1099
                // precision beyond what double / long double can carry; the legacy
1100
                // uint64_t intermediate would truncate to ~64 bits and defeat the
1101
                // purpose of routing decimal literals through d2b in the first place.
1102
                value.assign_from_wide_mantissa(d.negative, bin_exp, d);
6✔
1103
        }
1104
        else {
1105
                // Extract the 64-bit normalized mantissa (MSB at bit 63) from d2b's
1106
                // bigint. With target_mantissa_bits == 64, mantissa.bit[63] is the
1107
                // implicit-1 of the binary expansion and bits [0, 62] are the fraction.
1108
                std::uint64_t mantissa64 = 0;
32✔
1109
                for (unsigned i = 0; i < 64; ++i) {
2,080✔
1110
                        if (d.mantissa.at(i)) mantissa64 |= (std::uint64_t{1} << i);
2,048✔
1111
                }
1112
                value.assign_from_mantissa64(d.negative, bin_exp, mantissa64);
32✔
1113
        }
1114
        return true;
38✔
1115
}
1116

1117

1118
//////////////////////////////////////////////////////////////////////////////////////////////////////
1119
// hfloat - hfloat binary logic operators
1120

1121
// hfloat values are normalized by normalize_and_pack so that the leading
1122
// hex digit of the fraction is non-zero -- the (sign, exponent, fraction)
1123
// tuple is unique per value.  We compare tuples directly instead of via
1124
// double() to preserve precision for hfloat_long (56-bit fraction) and
1125
// hfloat_extended, where the double round-trip would lose bits below the
1126
// 53-bit IEEE 754 mantissa.
1127
template<unsigned ndigits, unsigned es, typename BlockType>
1128
inline constexpr bool operator==(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
30✔
1129
        bool lz = lhs.iszero();
30✔
1130
        bool rz = rhs.iszero();
30✔
1131
        if (lz && rz) return true;       // +0 == -0 in HFP (no signed-zero distinction)
30✔
1132
        if (lz || rz) return false;
29✔
1133
        if (lhs.sign() != rhs.sign()) return false;
29✔
1134
        bool ts; int le = 0, re = 0;
29✔
1135
        uint64_t lf = 0, rf = 0;
29✔
1136
        lhs.unpack(ts, le, lf);
29✔
1137
        rhs.unpack(ts, re, rf);
29✔
1138
        return le == re && lf == rf;
29✔
1139
}
1140

1141
template<unsigned ndigits, unsigned es, typename BlockType>
1142
inline constexpr bool operator!=(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
22✔
1143
        return !operator==(lhs, rhs);
22✔
1144
}
1145

1146
template<unsigned ndigits, unsigned es, typename BlockType>
1147
inline constexpr bool operator< (const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
15✔
1148
        bool lz = lhs.iszero();
15✔
1149
        bool rz = rhs.iszero();
15✔
1150
        if (lz && rz) return false;
15✔
1151
        bool ls = lhs.sign();
15✔
1152
        bool rs = rhs.sign();
15✔
1153
        if (lz) return !rs;              // 0 < +x; 0 not < -x
15✔
1154
        if (rz) return ls;               // -x < 0; +x not < 0
14✔
1155
        if (ls != rs) return ls;         // negative < positive
13✔
1156
        // Same sign, both non-zero.  unpack returns (sign, biased-exp-removed,
1157
        // fraction).  For positives, lhs < rhs iff (le < re) || tie + (lf < rf).
1158
        // For negatives, the ordering reverses: lhs < rhs iff |lhs| > |rhs|.
1159
        bool ts; int le = 0, re = 0;
12✔
1160
        uint64_t lf = 0, rf = 0;
12✔
1161
        lhs.unpack(ts, le, lf);
12✔
1162
        rhs.unpack(ts, re, rf);
12✔
1163
        if (ls) {
12✔
1164
                // both negative: lhs < rhs iff |lhs| > |rhs|
1165
                return (le > re) || (le == re && lf > rf);
2✔
1166
        }
1167
        // both positive
1168
        return (le < re) || (le == re && lf < rf);
10✔
1169
}
1170

1171
template<unsigned ndigits, unsigned es, typename BlockType>
1172
inline constexpr bool operator> (const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
2✔
1173
        return operator< (rhs, lhs);
2✔
1174
}
1175

1176
template<unsigned ndigits, unsigned es, typename BlockType>
1177
inline constexpr bool operator<=(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
3✔
1178
        return operator< (lhs, rhs) || operator==(lhs, rhs);
3✔
1179
}
1180

1181
template<unsigned ndigits, unsigned es, typename BlockType>
1182
inline constexpr bool operator>=(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
3✔
1183
        return !operator< (lhs, rhs);
3✔
1184
}
1185

1186
//////////////////////////////////////////////////////////////////////////////////////////////////////
1187
// hfloat - literal binary logic operators
1188
template<unsigned ndigits, unsigned es, typename BlockType>
1189
inline constexpr bool operator==(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1190
        return operator==(lhs, hfloat<ndigits, es, BlockType>(rhs));
1191
}
1192
template<unsigned ndigits, unsigned es, typename BlockType>
1193
inline constexpr bool operator!=(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1194
        return !operator==(lhs, rhs);
1195
}
1196
template<unsigned ndigits, unsigned es, typename BlockType>
1197
inline constexpr bool operator< (const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1198
        return operator<(lhs, hfloat<ndigits, es, BlockType>(rhs));
1199
}
1200
template<unsigned ndigits, unsigned es, typename BlockType>
1201
inline constexpr bool operator> (const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1202
        return operator< (hfloat<ndigits, es, BlockType>(rhs), lhs);
1203
}
1204
template<unsigned ndigits, unsigned es, typename BlockType>
1205
inline constexpr bool operator<=(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1206
        return operator< (lhs, rhs) || operator==(lhs, rhs);
1207
}
1208
template<unsigned ndigits, unsigned es, typename BlockType>
1209
inline constexpr bool operator>=(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1210
        return !operator< (lhs, rhs);
1211
}
1212

1213
//////////////////////////////////////////////////////////////////////////////////////////////////////
1214
// literal - hfloat binary logic operators
1215
template<unsigned ndigits, unsigned es, typename BlockType>
1216
inline constexpr bool operator==(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1217
        return operator==(hfloat<ndigits, es, BlockType>(lhs), rhs);
1218
}
1219
template<unsigned ndigits, unsigned es, typename BlockType>
1220
inline constexpr bool operator!=(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1221
        return !operator==(lhs, rhs);
1222
}
1223
template<unsigned ndigits, unsigned es, typename BlockType>
1224
inline constexpr bool operator< (const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1225
        return operator<(hfloat<ndigits, es, BlockType>(lhs), rhs);
1226
}
1227
template<unsigned ndigits, unsigned es, typename BlockType>
1228
inline constexpr bool operator> (const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1229
        return operator< (rhs, lhs);
1230
}
1231
template<unsigned ndigits, unsigned es, typename BlockType>
1232
inline constexpr bool operator<=(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1233
        return operator< (lhs, rhs) || operator==(lhs, rhs);
1234
}
1235
template<unsigned ndigits, unsigned es, typename BlockType>
1236
inline constexpr bool operator>=(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1237
        return !operator< (lhs, rhs);
1238
}
1239

1240
//////////////////////////////////////////////////////////////////////////////////////////////////////
1241
// hfloat - hfloat binary arithmetic operators
1242
template<unsigned ndigits, unsigned es, typename BlockType>
1243
inline constexpr hfloat<ndigits, es, BlockType> operator+(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
67✔
1244
        hfloat<ndigits, es, BlockType> sum(lhs);
67✔
1245
        sum += rhs;
67✔
1246
        return sum;
67✔
1247
}
1248
template<unsigned ndigits, unsigned es, typename BlockType>
1249
inline constexpr hfloat<ndigits, es, BlockType> operator-(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
61✔
1250
        hfloat<ndigits, es, BlockType> diff(lhs);
61✔
1251
        diff -= rhs;
61✔
1252
        return diff;
61✔
1253
}
1254
template<unsigned ndigits, unsigned es, typename BlockType>
1255
inline constexpr hfloat<ndigits, es, BlockType> operator*(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
69✔
1256
        hfloat<ndigits, es, BlockType> mul(lhs);
69✔
1257
        mul *= rhs;
69✔
1258
        return mul;
69✔
1259
}
1260
template<unsigned ndigits, unsigned es, typename BlockType>
1261
inline constexpr hfloat<ndigits, es, BlockType> operator/(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
21✔
1262
        hfloat<ndigits, es, BlockType> ratio(lhs);
21✔
1263
        ratio /= rhs;
21✔
1264
        return ratio;
21✔
1265
}
1266

1267
//////////////////////////////////////////////////////////////////////////////////////////////////////
1268
// hfloat - literal binary arithmetic operators
1269
template<unsigned ndigits, unsigned es, typename BlockType>
1270
inline constexpr hfloat<ndigits, es, BlockType> operator+(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1271
        return operator+(lhs, hfloat<ndigits, es, BlockType>(rhs));
1272
}
1273
template<unsigned ndigits, unsigned es, typename BlockType>
1274
inline constexpr hfloat<ndigits, es, BlockType> operator-(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1275
        return operator-(lhs, hfloat<ndigits, es, BlockType>(rhs));
1276
}
1277
template<unsigned ndigits, unsigned es, typename BlockType>
1278
inline constexpr hfloat<ndigits, es, BlockType> operator*(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1279
        return operator*(lhs, hfloat<ndigits, es, BlockType>(rhs));
1280
}
1281
template<unsigned ndigits, unsigned es, typename BlockType>
1282
inline constexpr hfloat<ndigits, es, BlockType> operator/(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1283
        return operator/(lhs, hfloat<ndigits, es, BlockType>(rhs));
1284
}
1285

1286
//////////////////////////////////////////////////////////////////////////////////////////////////////
1287
// literal - hfloat binary arithmetic operators
1288
template<unsigned ndigits, unsigned es, typename BlockType>
1289
inline constexpr hfloat<ndigits, es, BlockType> operator+(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1290
        return operator+(hfloat<ndigits, es, BlockType>(lhs), rhs);
1291
}
1292
template<unsigned ndigits, unsigned es, typename BlockType>
1293
inline constexpr hfloat<ndigits, es, BlockType> operator-(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1294
        return operator-(hfloat<ndigits, es, BlockType>(lhs), rhs);
1295
}
1296
template<unsigned ndigits, unsigned es, typename BlockType>
1297
inline constexpr hfloat<ndigits, es, BlockType> operator*(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1298
        return operator*(hfloat<ndigits, es, BlockType>(lhs), rhs);
1299
}
1300
template<unsigned ndigits, unsigned es, typename BlockType>
1301
inline constexpr hfloat<ndigits, es, BlockType> operator/(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1302
        return operator/(hfloat<ndigits, es, BlockType>(lhs), rhs);
1303
}
1304

1305
}} // namespace sw::universal
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc