• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

stillwater-sc / universal / 26065017861

18 May 2026 10:50PM UTC coverage: 84.23%. First build
26065017861

Pull #869

github

web-flow
Merge 21d877662 into 3e2a68633
Pull Request #869: fix: IBM HFP clean-up

78 of 88 new or added lines in 2 files covered. (88.64%)

46911 of 55694 relevant lines covered (84.23%)

6409080.47 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.47
/include/sw/universal/number/hfloat/hfloat_impl.hpp
1
#pragma once
2
// hfloat_impl.hpp: implementation of IBM System/360 hexadecimal floating-point
3
//
4
// Copyright (C) 2017 Stillwater Supercomputing, Inc.
5
// SPDX-License-Identifier: MIT
6
//
7
// This file is part of the universal numbers project, which is released under an MIT Open Source license.
8
//
9
// IBM System/360 Hexadecimal Floating-Point (1964):
10
//   Format: [sign(1)] [exponent(es)] [hex_fraction(ndigits*4 bits)]
11
//   Value:  (-1)^sign * 16^(exponent - bias) * 0.f1f2...fn
12
//
13
// Key properties:
14
//   - No hidden bit (fraction always has explicit leading hex digit)
15
//   - No NaN, no infinity, no subnormals
16
//   - Truncation rounding only (never rounds up)
17
//   - Overflow saturates to maxpos/maxneg
18
//   - Zero: sign=0, exponent=0, fraction=0
19
//   - Wobbling precision: 0-3 leading zero bits in MSB hex digit
20
//
21
// Standard configurations:
22
//   Short:    hfloat<6, 7>  = 1+7+24 = 32 bits
23
//   Long:     hfloat<14, 7> = 1+7+56 = 64 bits
24
//   Extended: hfloat<28, 7> = 1+7+112 = 120 bits (stored in 128)
25

26
#include <cctype>
27
#include <cstdint>
28
#include <cstring>
29
#include <cmath>
30
#include <string>
31
#include <string_view>
32
#include <sstream>
33
#include <iostream>
34
#include <iomanip>
35
#include <algorithm>
36

37
// supporting types and functions
38
#include <universal/native/ieee754.hpp>
39
#include <universal/utility/decimal_to_binary.hpp>
40
#include <universal/number/shared/nan_encoding.hpp>
41
#include <universal/number/shared/infinite_encoding.hpp>
42
#include <universal/number/shared/specific_value_encoding.hpp>
43
// hfloat exception structure
44
#include <universal/number/hfloat/exceptions.hpp>
45

46
namespace sw { namespace universal {
47

48
///////////////////////////////////////////////////////////////////////////////
49
// hfloat: IBM System/360 hexadecimal floating-point number
50
//
51
// Template parameters:
52
//   ndigits  - number of hexadecimal fraction digits
53
//   es       - exponent bits (7 for standard IBM HFP)
54
//   bt       - block type for storage
55
//
56
template<unsigned _ndigits, unsigned _es, typename bt = std::uint32_t>
57
class hfloat {
58
public:
59
        static constexpr unsigned ndigits     = _ndigits;            // hex fraction digits
60
        static constexpr unsigned es          = _es;                 // exponent bits
61
        static constexpr unsigned fbits       = ndigits * 4u;        // fraction bits
62
        static constexpr unsigned nbits       = 1u + es + fbits;     // total bits
63
        static constexpr int      bias        = (1 << (es - 1));     // exponent bias (64 for es=7)
64
        static constexpr int      emax        = (1 << es) - 1 - bias; // max unbiased exponent
65
        static constexpr int      emin        = -bias;                // min unbiased exponent
66

67
        typedef bt BlockType;
68

69
        static constexpr unsigned bitsInByte  = 8u;
70
        static constexpr unsigned bitsInBlock = sizeof(bt) * bitsInByte;
71
        static constexpr unsigned nrBlocks    = 1u + ((nbits - 1u) / bitsInBlock);
72
        static constexpr unsigned MSU         = nrBlocks - 1u;
73

74
        static constexpr bt ALL_ONES  = bt(~0);
75
        static constexpr bt MSU_MASK  = (nrBlocks * bitsInBlock == nbits) ? ALL_ONES : bt((1ull << (nbits % bitsInBlock)) - 1u);
76
        static constexpr bt BLOCK_MASK = bt(~0);
77

78
        /// trivial constructor
79
        hfloat() = default;
80

81
        hfloat(const hfloat&) = default;
82
        hfloat(hfloat&&) = default;
83

84
        hfloat& operator=(const hfloat&) = default;
85
        hfloat& operator=(hfloat&&) = default;
86

87
        // specific value constructor
88
        constexpr hfloat(const SpecificValue code) noexcept : _block{} {
11✔
89
                switch (code) {
11✔
90
                case SpecificValue::maxpos:
3✔
91
                        maxpos();
3✔
92
                        break;
3✔
93
                case SpecificValue::minpos:
1✔
94
                        minpos();
1✔
95
                        break;
1✔
96
                case SpecificValue::zero:
7✔
97
                default:
98
                        zero();
7✔
99
                        break;
7✔
100
                case SpecificValue::minneg:
×
101
                        minneg();
×
102
                        break;
×
103
                case SpecificValue::maxneg:
×
104
                        maxneg();
×
105
                        break;
×
106
                case SpecificValue::infpos:
×
107
                        maxpos();  // no infinity in HFP, saturate to maxpos
×
108
                        break;
×
109
                case SpecificValue::infneg:
×
110
                        maxneg();  // no infinity in HFP, saturate to maxneg
×
111
                        break;
×
112
                case SpecificValue::nar:
×
113
                case SpecificValue::qnan:
114
                case SpecificValue::snan:
115
                        zero();    // no NaN in HFP, map to zero
×
116
                        break;
×
117
                }
118
        }
11✔
119

120
        // initializers for native types
121
        constexpr explicit hfloat(signed char iv)           noexcept : _block{} { *this = iv; }
122
        constexpr explicit hfloat(short iv)                 noexcept : _block{} { *this = iv; }
123
        constexpr explicit hfloat(int iv)                   noexcept : _block{} { *this = iv; }
18✔
124
        constexpr explicit hfloat(long iv)                  noexcept : _block{} { *this = iv; }
125
        constexpr explicit hfloat(long long iv)             noexcept : _block{} { *this = iv; }
126
        constexpr explicit hfloat(char iv)                  noexcept : _block{} { *this = iv; }
127
        constexpr explicit hfloat(unsigned short iv)        noexcept : _block{} { *this = iv; }
128
        constexpr explicit hfloat(unsigned int iv)          noexcept : _block{} { *this = iv; }
5✔
129
        constexpr explicit hfloat(unsigned long iv)         noexcept : _block{} { *this = iv; }
130
        constexpr explicit hfloat(unsigned long long iv)    noexcept : _block{} { *this = iv; }
131
        constexpr explicit hfloat(float iv)                 noexcept : _block{} { *this = iv; }
132
        constexpr explicit hfloat(double iv)                noexcept : _block{} { *this = iv; }
308✔
133

134
        // assignment operators for native types
135
        constexpr hfloat& operator=(signed char rhs)        noexcept { return convert_signed(rhs); }
136
        constexpr hfloat& operator=(short rhs)              noexcept { return convert_signed(rhs); }
137
        constexpr hfloat& operator=(int rhs)                noexcept { return convert_signed(rhs); }
24✔
138
        constexpr hfloat& operator=(long rhs)               noexcept { return convert_signed(rhs); }
139
        constexpr hfloat& operator=(long long rhs)          noexcept { return convert_signed(rhs); }
140
        // Plain `char` may be signed or unsigned per platform; route through
141
        // the signed conversion via integer promotion so hfloat(char(-1)) on
142
        // signed-char targets yields -1, not UCHAR_MAX (CodeRabbit finding
143
        // from PR #805 applied to the sibling type).
144
        constexpr hfloat& operator=(char rhs)               noexcept { return convert_signed(static_cast<int>(rhs)); }
145
        constexpr hfloat& operator=(unsigned short rhs)     noexcept { return convert_unsigned(rhs); }
146
        constexpr hfloat& operator=(unsigned int rhs)       noexcept { return convert_unsigned(rhs); }
5✔
147
        constexpr hfloat& operator=(unsigned long rhs)      noexcept { return convert_unsigned(rhs); }
148
        constexpr hfloat& operator=(unsigned long long rhs) noexcept { return convert_unsigned(rhs); }
149
        constexpr hfloat& operator=(float rhs)              noexcept { return convert_ieee754(rhs); }
150
        constexpr hfloat& operator=(double rhs)             noexcept { return convert_ieee754(rhs); }
308✔
151

152
        // conversion operators
153
        constexpr explicit operator float()           const noexcept { return float(convert_to_double()); }
154
        constexpr explicit operator double()          const noexcept { return convert_to_double(); }
283✔
155

156
#if LONG_DOUBLE_SUPPORT
157
        constexpr explicit hfloat(long double iv)           noexcept : _block{} { *this = iv; }
158
        constexpr hfloat& operator=(long double rhs)        noexcept { return convert_ieee754(double(rhs)); }
159
        constexpr explicit operator long double()     const noexcept { return (long double)convert_to_double(); }
160
#endif
161

162
        // Unified entry point: assign from a normalized 64-bit binary mantissa
163
        // plus its "frexp" exponent. This is the algorithm core; both parse()
164
        // (which derives mantissa64 from decimal_to_binary) and convert_ieee754
165
        // (which lifts a double's 53-bit mantissa into the 64-bit convention by
166
        // left-shifting 11 bits) call this same routine.
167
        //
168
        // Preconditions:
169
        //   mantissa64 is normalized with MSB at bit 63 (the "implicit 1"
170
        //   position of a normalized binary mantissa). For inputs derived from
171
        //   an IEEE double, the low 11 bits will be zero -- but they don't have
172
        //   to be; parse() fills all 64.
173
        //   bin_exp is the "frexp" exponent: value = mantissa64 * 2^(bin_exp - 64).
174
        constexpr hfloat& assign_from_mantissa64(bool negative, int bin_exp, uint64_t mantissa64) noexcept {
326✔
175
                if (mantissa64 == 0) { setzero(); return *this; }
326✔
176

177
                // Convert binary exponent to base-16 exponent. ceil(bin_exp/4) so
178
                // 0.f * 16^hex_exp == value with f's leading hex digit non-zero.
179
                int hex_exp;
180
                if (bin_exp > 0) hex_exp = (bin_exp + 3) / 4;
326✔
181
                else             hex_exp = bin_exp / 4;  // C truncates toward zero == ceil for negative
71✔
182

183
                // Align mantissa to the hex fraction bit positions. For a 64-bit
184
                // mantissa, mantissa_shift = (bin_exp - 4*hex_exp + fbits) - 64.
185
                int shift = bin_exp - 4 * hex_exp + static_cast<int>(fbits);
326✔
186
                int mantissa_shift = shift - 64;
326✔
187
                uint64_t fraction = 0;
326✔
188
                if (mantissa_shift == 0) {
326✔
189
                        fraction = mantissa64;
×
190
                }
191
                else if (mantissa_shift > 0) {
326✔
192
                        // fraction's natural width exceeds 64 bits -- can't be held in
193
                        // uint64_t. This is unreachable for hfloat configs with fbits
194
                        // <= 64 (the standard hfloat_short / hfloat_long sizes) and is
195
                        // the hfloat_extended (fbits=112) path: the 64-bit mantissa must
196
                        // be placed at offset `mantissa_shift` within the fbits-wide
197
                        // fraction (its low `mantissa_shift` bits are truncated to zero
198
                        // -- IBM HFP truncation). Check overflow/underflow first (the
199
                        // usual normalize_and_pack route can't see the wide fraction),
200
                        // then pack sign + exponent and setbit() the mantissa bits at
201
                        // their target high positions. Full bit-exact fbits>64
202
                        // conversion needs a wider intermediate (e.g. __uint128_t) and
203
                        // is left as future work; this delivers hfp64-precision in an
204
                        // hfp128 container.
205
                        if constexpr (fbits >= 64) {
206
                                if (hex_exp > emax) {
207
                                        if (negative) maxneg(); else maxpos();
208
                                        return *this;
209
                                }
210
                                if (hex_exp < emin) {
211
                                        setzero();
212
                                        return *this;
213
                                }
214
                                if (static_cast<unsigned>(mantissa_shift) < 64u) {
215
                                        pack(negative, hex_exp, 0);
216
                                        unsigned offset = static_cast<unsigned>(mantissa_shift);
217
                                        unsigned upper = (offset + 64u <= fbits) ? 64u : (fbits - offset);
218
                                        for (unsigned i = 0; i < upper; ++i) {
219
                                                if ((mantissa64 >> i) & 1) setbit(offset + i, true);
220
                                        }
221
                                        return *this;
222
                                }
223
                                // mantissa_shift >= 64: entire 64-bit mantissa lies beyond
224
                                // bit 63 of the fraction. Saturating to maxpos keeps the
225
                                // exponent meaningful while signaling unrepresentable
226
                                // precision; rare for normal-magnitude inputs.
227
                                fraction = ~uint64_t(0);
228
                        }
229
                        else {
230
                                // fbits < 64 with mantissa_shift > 0 is mathematically
231
                                // unreachable (shift = bin_exp - 4*hex_exp + fbits with
232
                                // (bin_exp - 4*hex_exp) in [0,3], so mantissa_shift =
233
                                // shift - 64 <= fbits + 3 - 64 < 0 when fbits < 61), but
234
                                // keep a defensive fallback for very narrow configs.
NEW
235
                                fraction = (uint64_t(1) << fbits) - 1u;
×
236
                        }
237
                }
238
                else if (-mantissa_shift < 64) {
326✔
239
                        // Shift right truncates low bits -- IBM HFP's truncation rounding.
240
                        fraction = mantissa64 >> static_cast<unsigned>(-mantissa_shift);
326✔
241
                }
242
                // else: fraction stays 0 (deep underflow)
243

244
                if constexpr (fbits < 64) {
245
                        fraction &= ((uint64_t(1) << fbits) - 1u);
326✔
246
                }
247

248
                normalize_and_pack(negative, hex_exp, fraction);
326✔
249
                return *this;
326✔
250
        }
251

252
        // Assign from a wide (target_mantissa_bits == fbits) decimal_to_binary
253
        // result. Delivers full fbits of precision regardless of host integer
254
        // width -- the whole point of routing decimal literals through d2b
255
        // instead of through double. Used by parse() when fbits > 64; the
256
        // uint64_t-based assign_from_mantissa64 above still serves the
257
        // convert_ieee754 (double / long double) entry point, where the input
258
        // is bounded to ~64 mantissa bits regardless.
259
        template<unsigned BigBits>
260
        constexpr hfloat& assign_from_wide_mantissa(bool negative, int bin_exp,
6✔
261
            const ::sw::universal::decimal_to_binary::basic_result<BigBits>& d) noexcept {
262
                // d.mantissa is normalized with MSB at bit (fbits - 1). The hex-
263
                // digit alignment within IBM HFP requires at most a 0..3 bit right
264
                // shift relative to that normalization so the leading hex digit
265
                // (bits [fbits-4, fbits-1]) ends up non-zero.
266
                int hex_exp;
267
                if (bin_exp > 0) hex_exp = (bin_exp + 3) / 4;
6✔
268
                else             hex_exp = bin_exp / 4;  // C truncates toward zero == ceil for negative
2✔
269

270
                if (hex_exp > emax) {
6✔
271
                        if (negative) maxneg(); else maxpos();
1✔
272
                        return *this;
1✔
273
                }
274
                if (hex_exp < emin) {
5✔
NEW
275
                        setzero();
×
NEW
276
                        return *this;
×
277
                }
278

279
                // shift = bin_exp - 4*hex_exp lies in [-3, 0]. Read fraction bit i
280
                // from d.mantissa.at(i + right_shift); the low `right_shift` mantissa
281
                // bits are dropped -- IBM HFP truncation rounding.
282
                int shift = bin_exp - 4 * hex_exp;
5✔
283
                unsigned right_shift = static_cast<unsigned>(-shift);
5✔
284

285
                clear();
5✔
286
                setbit(nbits - 1, negative);
5✔
287
                unsigned biased_exp = static_cast<unsigned>(hex_exp + bias);
5✔
288
                unsigned expStart = nbits - 2;
5✔
289
                for (unsigned i = 0; i < es; ++i) {
40✔
290
                        setbit(expStart - i, (biased_exp >> (es - 1 - i)) & 1);
35✔
291
                }
292
                for (unsigned i = 0; i < fbits; ++i) {
565✔
293
                        if (d.mantissa.at(i + right_shift)) setbit(i, true);
560✔
294
                }
295
                return *this;
5✔
296
        }
297

298
        // prefix operators
299
        constexpr hfloat operator-() const {
26✔
300
                hfloat negated(*this);
26✔
301
                if (!negated.iszero()) {
26✔
302
                        negated.setsign(!negated.sign());
21✔
303
                }
304
                return negated;
26✔
305
        }
306

307
        // arithmetic operators
308
        constexpr hfloat& operator+=(const hfloat& rhs) {
132✔
309
                bool lhs_sign, rhs_sign;
310
                int lhs_exp, rhs_exp;
311
                uint64_t lhs_frac, rhs_frac;
312
                unpack(lhs_sign, lhs_exp, lhs_frac);
132✔
313
                rhs.unpack(rhs_sign, rhs_exp, rhs_frac);
132✔
314

315
                if (rhs.iszero()) return *this;
132✔
316
                if (iszero()) { *this = rhs; return *this; }
128✔
317

318
                // align exponents by shifting the smaller-exponent fraction RIGHT
319
                // by 4*(exp_large - exp_small) bits (hex digit alignment)
320
                int shift = lhs_exp - rhs_exp;
126✔
321
                uint64_t aligned_lhs = lhs_frac;
126✔
322
                uint64_t aligned_rhs = rhs_frac;
126✔
323
                int result_exp;
324

325
                if (shift >= 0) {
126✔
326
                        result_exp = lhs_exp;
93✔
327
                        aligned_rhs >>= (static_cast<unsigned>(shift) * 4u);
93✔
328
                }
329
                else {
330
                        result_exp = rhs_exp;
33✔
331
                        aligned_lhs >>= (static_cast<unsigned>(-shift) * 4u);
33✔
332
                }
333

334
                // add/subtract based on signs
335
                int64_t result_frac;
336
                int64_t a = lhs_sign ? -static_cast<int64_t>(aligned_lhs) : static_cast<int64_t>(aligned_lhs);
126✔
337
                int64_t b = rhs_sign ? -static_cast<int64_t>(aligned_rhs) : static_cast<int64_t>(aligned_rhs);
126✔
338
                result_frac = a + b;
126✔
339

340
                bool result_sign = (result_frac < 0);
126✔
341
                uint64_t abs_frac = static_cast<uint64_t>(result_sign ? -result_frac : result_frac);
126✔
342

343
                normalize_and_pack(result_sign, result_exp, abs_frac);
126✔
344
                return *this;
126✔
345
        }
346

347
        constexpr hfloat& operator-=(const hfloat& rhs) {
63✔
348
                hfloat neg(rhs);
63✔
349
                if (!neg.iszero()) neg.setsign(!neg.sign());
63✔
350
                return operator+=(neg);
63✔
351
        }
352

353
        constexpr hfloat& operator*=(const hfloat& rhs) {
69✔
354
                if (iszero() || rhs.iszero()) { setzero(); return *this; }
69✔
355

356
                bool lhs_sign, rhs_sign;
357
                int lhs_exp, rhs_exp;
358
                uint64_t lhs_frac, rhs_frac;
359
                unpack(lhs_sign, lhs_exp, lhs_frac);
66✔
360
                rhs.unpack(rhs_sign, rhs_exp, rhs_frac);
66✔
361

362
                bool result_sign = (lhs_sign != rhs_sign);
66✔
363
                int result_exp = lhs_exp + rhs_exp;
66✔
364

365
#ifdef __SIZEOF_INT128__
366
                __uint128_t wide = static_cast<__uint128_t>(lhs_frac) * static_cast<__uint128_t>(rhs_frac);
66✔
367
                // The fractions are in 0.f format with fbits fraction bits
368
                // Product has 2*fbits bits, shift right by fbits to get back to fbits
369
                uint64_t result_frac = static_cast<uint64_t>(wide >> fbits);
66✔
370
                normalize_and_pack(result_sign, result_exp, result_frac);
66✔
371
#else
372
                // fallback: delegate through double
373
                double d = double(*this) * double(rhs);
374
                *this = d;
375
#endif
376
                return *this;
66✔
377
        }
378

379
        constexpr hfloat& operator/=(const hfloat& rhs) {
21✔
380
                if (rhs.iszero()) {
21✔
381
#if HFLOAT_THROW_ARITHMETIC_EXCEPTION
382
                        // Throwing in a constant expression is ill-formed; fence the
383
                        // throw under runtime to keep operator/= constexpr-callable.
384
                        // At constant-eval, divide-by-zero falls through to setzero()
385
                        // (IBM HFP has no NaN/inf, so saturation is the closest
386
                        // constexpr-safe behavior).
387
                        if (!std::is_constant_evaluated()) {
388
                                throw hfloat_divide_by_zero();
389
                        }
390
#endif
391
                        setzero();
1✔
392
                        return *this;
1✔
393
                }
394
                if (iszero()) return *this;
20✔
395

396
                bool lhs_sign, rhs_sign;
397
                int lhs_exp, rhs_exp;
398
                uint64_t lhs_frac, rhs_frac;
399
                unpack(lhs_sign, lhs_exp, lhs_frac);
19✔
400
                rhs.unpack(rhs_sign, rhs_exp, rhs_frac);
19✔
401

402
                bool result_sign = (lhs_sign != rhs_sign);
19✔
403
                int result_exp = lhs_exp - rhs_exp;
19✔
404

405
#ifdef __SIZEOF_INT128__
406
                // Scale numerator up by fbits for precision
407
                __uint128_t scaled_num = static_cast<__uint128_t>(lhs_frac) << fbits;
19✔
408
                uint64_t result_frac = static_cast<uint64_t>(scaled_num / static_cast<__uint128_t>(rhs_frac));
19✔
409
                normalize_and_pack(result_sign, result_exp, result_frac);
19✔
410
#else
411
                double d = double(*this) / double(rhs);
412
                *this = d;
413
#endif
414
                return *this;
19✔
415
        }
416

417
        // unary operators
418
        constexpr hfloat& operator++() {
2✔
419
                *this += hfloat(1);
2✔
420
                return *this;
2✔
421
        }
422
        constexpr hfloat operator++(int) {
423
                hfloat tmp(*this);
424
                operator++();
425
                return tmp;
426
        }
427
        constexpr hfloat& operator--() {
2✔
428
                *this -= hfloat(1);
2✔
429
                return *this;
2✔
430
        }
431
        constexpr hfloat operator--(int) {
1✔
432
                hfloat tmp(*this);
1✔
433
                operator--();
1✔
434
                return tmp;
1✔
435
        }
436

437
        // modifiers
438
        constexpr void clear() noexcept {
608✔
439
                for (unsigned i = 0; i < nrBlocks; ++i) _block[i] = bt(0);
1,281✔
440
        }
608✔
441
        constexpr void setzero() noexcept { clear(); }
35✔
442

443
        constexpr void setsign(bool negative = true) noexcept {
82✔
444
                setbit(nbits - 1, negative);
82✔
445
        }
82✔
446

447
        // use un-interpreted raw bits to set the value
448
        constexpr void setbits(uint64_t value) noexcept {
449
                clear();
450
                for (unsigned i = 0; i < nrBlocks; ++i) {
451
                        _block[i] = bt(value & BLOCK_MASK);
452
                        value >>= bitsInBlock;
453
                }
454
                _block[MSU] &= MSU_MASK;
455
        }
456

457
        // create specific number system values of interest
458
        //
459
        // IBM HFP has no subnormals (has_denorm == denorm_absent), so minpos
460
        // and denorm_min coincide at the smallest normalized positive value:
461
        // leading hex digit = 1, smallest exponent. maxpos sets every fraction
462
        // bit (all 28 hex digits = F for hfp128). For fbits >= 64 the pack()
463
        // uint64_t parameter cannot carry the full fraction; we fall back to
464
        // setbit() to populate bits beyond position 63.
465
        constexpr hfloat& maxpos() noexcept {
5✔
466
                clear();
5✔
467
                unsigned biased_exp = (1u << es) - 1u;
5✔
468
                int max_exp = static_cast<int>(biased_exp) - bias;
5✔
469
                if constexpr (fbits < 64) {
470
                        uint64_t max_frac = (uint64_t(1) << fbits) - 1u;
2✔
471
                        pack(false, max_exp, max_frac);
2✔
472
                }
473
                else {
474
                        pack(false, max_exp, 0);
3✔
475
                        for (unsigned i = 0; i < fbits; ++i) setbit(i, true);
339✔
476
                }
477
                return *this;
5✔
478
        }
479
        constexpr hfloat& minpos() noexcept {
1✔
480
                clear();
1✔
481
                // Smallest normalized positive: leading hex digit = 1, i.e. bit
482
                // (fbits - 4) set, all other fraction bits zero. Value = 16^(emin-1).
483
                if constexpr (fbits < 64) {
484
                        pack(false, emin, uint64_t(1) << (fbits - 4));
1✔
485
                }
486
                else {
NEW
487
                        pack(false, emin, 0);
×
NEW
488
                        setbit(fbits - 4, true);
×
489
                }
490
                return *this;
1✔
491
        }
492
        constexpr hfloat& zero() noexcept {
7✔
493
                clear();
7✔
494
                return *this;
7✔
495
        }
496
        constexpr hfloat& minneg() noexcept {
×
497
                clear();
×
498
                if constexpr (fbits < 64) {
NEW
499
                        pack(true, emin, uint64_t(1) << (fbits - 4));
×
500
                }
501
                else {
NEW
502
                        pack(true, emin, 0);
×
NEW
503
                        setbit(fbits - 4, true);
×
504
                }
505
                return *this;
×
506
        }
507
        constexpr hfloat& maxneg() noexcept {
1✔
508
                clear();
1✔
509
                unsigned biased_exp = (1u << es) - 1u;
1✔
510
                int max_exp = static_cast<int>(biased_exp) - bias;
1✔
511
                if constexpr (fbits < 64) {
512
                        uint64_t max_frac = (uint64_t(1) << fbits) - 1u;
1✔
513
                        pack(true, max_exp, max_frac);
1✔
514
                }
515
                else {
NEW
516
                        pack(true, max_exp, 0);
×
NEW
517
                        for (unsigned i = 0; i < fbits; ++i) setbit(i, true);
×
518
                }
519
                return *this;
1✔
520
        }
521

522
        // selectors
523
        constexpr bool sign() const noexcept {
1,129✔
524
                return getbit(nbits - 1);
1,129✔
525
        }
526

527
        constexpr bool iszero() const noexcept {
1,899✔
528
                // zero when fraction is all zeros (exponent and sign don't matter)
529
                // In IBM HFP, zero is represented as all-zeros
530
                for (unsigned i = 0; i < nrBlocks; ++i) {
2,029✔
531
                        bt mask = (i == MSU) ? bt(MSU_MASK & ~(bt(1) << ((nbits - 1) % bitsInBlock))) : BLOCK_MASK;
1,961✔
532
                        if ((_block[i] & mask) != 0) return false;
1,961✔
533
                }
534
                return true;
68✔
535
        }
536

537
        constexpr bool isone() const noexcept {
538
                bool s; int e; uint64_t f;
539
                unpack(s, e, f);
540
                // 1.0 = 0.1 * 16^1, so e=1, f = 1 << (fbits-4)  (leading hex digit = 1).
541
                // Gate the shift so `1ull << (fbits - 4)` doesn't UB for fbits >= 68
542
                // (hfloat_extended at fbits=112 would otherwise hit shift-count-overflow);
543
                // for those configs unpack() also caps the fraction at 64 useful bits,
544
                // so we can never construct the wide-fbits "leading hex digit = 1"
545
                // encoding here -- return false consistently.
546
                if constexpr (fbits >= 68) {
547
                        (void)s; (void)e; (void)f;
548
                        return false;
549
                }
550
                else {
551
                        return !s && (e == 1) && (f == (uint64_t(1) << (fbits - 4)));
552
                }
553
        }
554

555
        constexpr bool ispos() const noexcept { return !sign(); }
556
        constexpr bool isneg() const noexcept { return sign(); }
557

558
        // IBM HFP has no NaN or infinity
559
        constexpr bool isinf() const noexcept { return false; }
2✔
560
        constexpr bool isnan() const noexcept { return false; }
2✔
561
        constexpr bool isnan(int) const noexcept { return false; }
562

563
        constexpr int scale() const noexcept {
4✔
564
                if (iszero()) return 0;
4✔
565
                bool s; int e; uint64_t f;
566
                unpack(s, e, f);
4✔
567
                (void)s;
568
                // IBM HFP: value = 0.f * 16^e. unpack() returns the fraction's
569
                // low 64 bits (for fbits < 64, that's the entire fraction). For
570
                // fbits >= 64 the leading hex digit lives in bits [fbits-4, fbits-1]
571
                // of the full fraction -- unpack's low-64-bit view sees only the
572
                // truncated tail, not the leading bit, so we read the top 64
573
                // fraction bits directly (bits [fbits-64, fbits-1]).
574
                uint64_t top = 0;
4✔
575
                if constexpr (fbits >= 64) {
576
                        for (unsigned i = 0; i < 64u; ++i) {
65✔
577
                                if (getbit(fbits - 64u + i)) top |= (uint64_t(1) << i);
64✔
578
                        }
579
                }
580
                else {
581
                        top = f;
3✔
582
                }
583
                constexpr int read_iter = (fbits < 64) ? static_cast<int>(fbits) : 64;
4✔
584
                int leading = -1;
4✔
585
                for (int i = read_iter - 1; i >= 0; --i) {
7✔
586
                        if ((top >> i) & 1) { leading = i; break; }
7✔
587
                }
588
                if (leading < 0) return 0;
4✔
589
                constexpr int frac_msb_weight = (fbits < 64) ? static_cast<int>(fbits) : 64;
4✔
590
                return 4 * e + leading - frac_msb_weight;
4✔
591
        }
592

593
        // convert to string
594
        std::string str(size_t nrDigits = 0) const {
63✔
595
                if (iszero()) return sign() ? std::string("-0") : std::string("0");
71✔
596

597
                double d = convert_to_double();
59✔
598
                std::stringstream ss;
59✔
599
                if (nrDigits > 0) {
59✔
600
                        ss << std::setprecision(static_cast<int>(nrDigits));
59✔
601
                }
602
                ss << d;
59✔
603
                return ss.str();
59✔
604
        }
59✔
605

606
        ///////////////////////////////////////////////////////////////////
607
        // Bit access (public for free functions)
608
        constexpr bool getbit(unsigned pos) const noexcept {
32,232✔
609
                if (pos >= nbits) return false;
32,232✔
610
                unsigned block_idx = pos / bitsInBlock;
32,232✔
611
                unsigned bit_idx = pos % bitsInBlock;
32,232✔
612
                return (_block[block_idx] >> bit_idx) & 1;
32,232✔
613
        }
614

615
        ///////////////////////////////////////////////////////////////////
616
        // Unpack into sign, unbiased exponent, and fraction
617
        constexpr void unpack(bool& s, int& exponent, uint64_t& fraction) const noexcept {
838✔
618
                s = sign();
838✔
619
                if (iszero()) { exponent = 0; fraction = 0; return; }
838✔
620

621
                // Extract exponent field (es bits)
622
                unsigned exp_field = 0;
830✔
623
                unsigned expStart = nbits - 2; // MSB of exponent (just below sign)
830✔
624
                for (unsigned i = 0; i < es; ++i) {
6,640✔
625
                        if (getbit(expStart - i)) {
5,810✔
626
                                exp_field |= (1u << (es - 1 - i));
1,741✔
627
                        }
628
                }
629
                exponent = static_cast<int>(exp_field) - bias;
830✔
630

631
                // Extract fraction. The destination is uint64_t so we read at most
632
                // 64 bits; high fraction bits (i >= 64) for hfloat_extended would
633
                // make `1ull << i` UB and aren't representable in the result type
634
                // anyway. Matches the cap on the pack() write path.
635
                fraction = 0;
830✔
636
                constexpr unsigned read_iter = (fbits < 64) ? fbits : 64u;
830✔
637
                for (unsigned i = 0; i < read_iter; ++i) {
21,902✔
638
                        if (getbit(i)) {
21,072✔
639
                                fraction |= (uint64_t(1) << i);
2,931✔
640
                        }
641
                }
642
        }
643

644
protected:
645
        bt _block[nrBlocks];
646

647
        ///////////////////////////////////////////////////////////////////
648
        // Bit manipulation helpers
649
        constexpr void setbit(unsigned pos, bool value) noexcept {
19,357✔
650
                if (pos >= nbits) return;
19,357✔
651
                unsigned block_idx = pos / bitsInBlock;
19,357✔
652
                unsigned bit_idx = pos % bitsInBlock;
19,357✔
653
                if (value) {
19,357✔
654
                        _block[block_idx] |= bt(1ull << bit_idx);
3,510✔
655
                }
656
                else {
657
                        _block[block_idx] &= bt(~(1ull << bit_idx));
15,847✔
658
                }
659
        }
660

661
        ///////////////////////////////////////////////////////////////////
662
        // Pack sign, unbiased exponent, and fraction
663
        constexpr void pack(bool s, int exponent, uint64_t fraction) noexcept {
554✔
664
                clear();
554✔
665

666
                // set sign
667
                setbit(nbits - 1, s);
554✔
668

669
                // set exponent field
670
                unsigned biased_exp = static_cast<unsigned>(exponent + bias);
554✔
671
                unsigned expStart = nbits - 2;
554✔
672
                for (unsigned i = 0; i < es; ++i) {
4,432✔
673
                        setbit(expStart - i, (biased_exp >> (es - 1 - i)) & 1);
3,878✔
674
                }
675

676
                // set fraction field. The source is a uint64_t so it carries at
677
                // most 64 useful bits; for fbits > 64 (hfloat_extended) the high
678
                // fbits stay zero because `fraction >> i` for i >= 64 is UB
679
                // (compilers typically mask the shift count modulo 64 which
680
                // would re-replicate the low bits at the top -- definitely
681
                // wrong). Multi-limb fraction support for fbits > 64 is future
682
                // work; until then, only the low 64 fraction bits are populated.
683
                constexpr unsigned frac_iter = (fbits < 64) ? fbits : 64u;
554✔
684
                for (unsigned i = 0; i < frac_iter; ++i) {
14,802✔
685
                        setbit(i, (fraction >> i) & 1);
14,248✔
686
                }
687
        }
554✔
688

689
        ///////////////////////////////////////////////////////////////////
690
        // Normalize: ensure leading hex digit is non-zero, then truncate
691
        constexpr void normalize_and_pack(bool s, int exponent, uint64_t fraction) noexcept {
564✔
692
                if (fraction == 0) { setzero(); return; }
564✔
693

694
                // Normalize the leading hex digit. For fbits < 64 the
695
                // `1ull << fbits` thresholds are well-defined and the loops
696
                // produce IBM-HFP-correct truncation. For fbits >= 64 the
697
                // uint64_t fraction cannot represent the full hex significand
698
                // (hfloat_extended is fbits=112); we skip the normalization
699
                // loops since the relevant thresholds would be UB and accept
700
                // that the wide-fbits saturation path is approximate. Full-
701
                // width fbits > 64 conversion needs a multi-limb fraction
702
                // representation and is left as future work.
703
                if constexpr (fbits < 64) {
704
                        while (fraction >= (uint64_t(1) << fbits)) {
565✔
705
                                fraction >>= 4;  // shift right by one hex digit
16✔
706
                                exponent++;
16✔
707
                        }
708
                        while (fraction > 0 && fraction < (uint64_t(1) << (fbits - 4))) {
600✔
709
                                fraction <<= 4;  // shift left by one hex digit
51✔
710
                                exponent--;
51✔
711
                        }
712
                        // Truncate to fbits (IBM HFP truncates, never rounds up).
713
                        fraction &= ((uint64_t(1) << fbits) - 1);
549✔
714
                }
715
                // else: fraction is already at most ~uint64_t(0), which by
716
                // construction fits within fbits >= 64; pack() places it in
717
                // the low 64 fbits and leaves the rest zero.
718

719
                // Check overflow/underflow
720
                if (exponent > emax) {
549✔
721
                        // overflow: saturate to maxpos/maxneg
722
                        if (s) maxneg(); else maxpos();
2✔
723
                        return;
2✔
724
                }
725
                if (exponent < emin) {
547✔
726
                        // underflow: set to zero
727
                        setzero();
×
728
                        return;
×
729
                }
730

731
                pack(s, exponent, fraction);
547✔
732
        }
733

734
        ///////////////////////////////////////////////////////////////////
735
        // Conversion helpers
736

737
        // Convert IEEE-754 double to hfloat
738
        //
739
        // Constexpr-safe rewrite of the original frexp/ldexp implementation:
740
        // - NaN detected via x != x (only NaN is unequal to itself)
741
        // - infinity detected by bracketing against numeric_limits::max()
742
        //   (numeric_limits<double>::max() is constexpr; std::isinf is not)
743
        // - fabs replaced with `negative ? -rhs : rhs`
744
        // - frexp/ldexp replaced with raw IEEE 754 field extraction via
745
        //   sw::bit_cast (constexpr on toolchains exposing std::bit_cast or
746
        //   __builtin_bit_cast; runtime fallback retains the legacy path).
747
        //
748
        // IBM HFP has no NaN and no infinity:
749
        //   NaN double  -> hfloat zero
750
        //   +/- inf     -> hfloat maxpos/maxneg (saturation)
751
        //
752
        // This used to duplicate the encoding algorithm. After the unification
753
        // for issue #849, it just extracts the IEEE double's 53-bit significand
754
        // (with hidden bit), lifts it into the 64-bit normalized mantissa
755
        // convention by left-shifting 11 bits (the headroom between IEEE
756
        // double's 53 and our canonical 64), and delegates to
757
        // `assign_from_mantissa64`. The shift is lossless -- the low 11 bits of
758
        // the wide mantissa are simply zero, since the double had no more
759
        // information to give.
760
        constexpr hfloat& convert_ieee754(double rhs) noexcept {
308✔
761
                if (rhs != rhs) {                       // NaN
308✔
762
                        setzero();
×
763
                        return *this;
×
764
                }
765
                if (rhs == 0.0) {
308✔
766
                        setzero();
14✔
767
                        return *this;
14✔
768
                }
769
                constexpr double dbl_max = std::numeric_limits<double>::max();
294✔
770
                if (rhs >  dbl_max) { maxpos(); return *this; }   // +inf
294✔
771
                if (rhs < -dbl_max) { maxneg(); return *this; }   // -inf
294✔
772

773
                bool negative = (rhs < 0);
294✔
774
                double abs_val = negative ? -rhs : rhs;
294✔
775

776
                // Extract IEEE 754 double fields without std::frexp:
777
                // IEEE 754 double: bias 1023, 52 fraction bits, hidden 1 for normals.
778
                //   normal: value = (1 + rawFrac/2^52) * 2^(rawExp - 1023)
779
                //                 = (2^52 + rawFrac) * 2^(rawExp - 1075)
780
                // frexp would return frac in [0.5, 1) with bin_exp = rawExp - 1022;
781
                // we use the same bin_exp here so the mantissa MSB lies at bit
782
                // (mantissa_width - 1), matching assign_from_mantissa64's contract.
783
                //
784
                // sw::bit_cast is constexpr on toolchains exposing std::bit_cast or
785
                // __builtin_bit_cast (the universally-supported case in C++20+); on
786
                // older toolchains it works at runtime via memcpy but is not
787
                // constexpr.  Calling convert_ieee754() in a constant expression on
788
                // such a toolchain produces a not-a-constant-expression diagnostic
789
                // at the bit_cast invocation -- the correct behavior, vs. silently
790
                // returning zero from a runtime fallback.
791
                uint64_t bits = sw::bit_cast<uint64_t>(abs_val);
294✔
792
                uint64_t rawExp  = (bits >> 52) & 0x7FFu;
294✔
793
                uint64_t rawFrac = bits & ((uint64_t(1) << 52) - 1u);
294✔
794
                if (rawExp == 0) {
294✔
795
                        // Subnormal double values are far below IBM HFP's smallest
796
                        // representable (16^emin); flush to zero per HFP semantics.
797
                        setzero();
×
798
                        return *this;
×
799
                }
800
                int bin_exp = static_cast<int>(rawExp) - 1022;
294✔
801
                uint64_t mantissa53 = (uint64_t(1) << 52) | rawFrac;  // 53-bit, MSB at bit 52
294✔
802
                uint64_t mantissa64 = mantissa53 << 11;               // lift to MSB at bit 63
294✔
803
                return assign_from_mantissa64(negative, bin_exp, mantissa64);
294✔
804
        }
805

806
        // Convert hfloat to IEEE-754 double
807
        //
808
        // Constexpr-safe: replaces std::ldexp(d, n) with a power-of-2 scaling
809
        // loop that multiplies/divides by 2.0 (exactly representable in double).
810
        // At runtime, dispatches to std::ldexp for performance.  The shift range
811
        // per template config is bounded (fbits + 4*emax for hfloat_extended is
812
        // ~360), so the constexpr loop is well-bounded.
813
        constexpr double convert_to_double() const noexcept {
342✔
814
                if (iszero()) return sign() ? -0.0 : 0.0;
342✔
815

816
                bool s; int e; uint64_t f;
817
                unpack(s, e, f);
318✔
818

819
                // value = 0.f * 16^e. For fbits < 64 the unpack'd f IS the full
820
                // fraction and value = f * 2^(-fbits) * 16^e = f * 2^(4*e - fbits).
821
                // For fbits >= 64 the leading hex digit lives in the TOP 64 fraction
822
                // bits, which unpack's low-64 view doesn't see; read them directly
823
                // and treat f as representing fraction's bits [fbits-64, fbits-1],
824
                // so value = f * 2^(-64) * 16^e = f * 2^(4*e - 64).
825
                int shift;
826
                if constexpr (fbits >= 64) {
827
                        f = 0;
5✔
828
                        for (unsigned i = 0; i < 64u; ++i) {
325✔
829
                                if (getbit(fbits - 64u + i)) f |= (uint64_t(1) << i);
320✔
830
                        }
831
                        shift = 4 * e - 64;
5✔
832
                }
833
                else {
834
                        shift = 4 * e - static_cast<int>(fbits);
313✔
835
                }
836
                double result = static_cast<double>(f);
318✔
837
                if (std::is_constant_evaluated()) {
318✔
838
                        // Power-of-2 scaling: 2.0 and 0.5 are exactly representable
839
                        // in IEEE 754 double, so this introduces no rounding error
840
                        // over the bounded iteration count.
841
                        if (shift > 0) {
×
842
                                for (int i = 0; i < shift; ++i) result *= 2.0;
×
843
                        }
844
                        else if (shift < 0) {
×
845
                                for (int i = 0; i < -shift; ++i) result *= 0.5;
×
846
                        }
847
                }
848
                else {
849
                        result = std::ldexp(result, shift);
318✔
850
                }
851
                return s ? -result : result;
318✔
852
        }
853

854
        // Direct integer-to-hfloat packing for fbits <= 56 (hfloat_short and
855
        // hfloat_long).  Avoids the double round-trip that would lose precision
856
        // for int64_t values above 2^53 -- a value like 9007199254740993ULL
857
        // IS representable in hfloat_long (56-bit fraction) but rounds via
858
        // double.  For fbits >= 64 (hfloat_extended) the fraction is wider
859
        // than uint64_t can hold, so we fall back to convert_ieee754; this
860
        // keeps the same precision as pre-promotion code for that variant.
861
        constexpr hfloat& convert_signed(int64_t v) noexcept {
24✔
862
                if (v == 0) {
24✔
863
                        setzero();
1✔
864
                        return *this;
1✔
865
                }
866
                bool negative = (v < 0);
23✔
867
                // Compute |v| as uint64_t without ever negating an int64_t (the
868
                // negation of INT64_MIN would overflow).  Bitwise-safe identity:
869
                // |INT64_MIN| = -(v + 1) + 1, each step stays in range.
870
                uint64_t abs_v = negative
23✔
871
                        ? (static_cast<uint64_t>(-(v + 1)) + 1ull)
23✔
872
                        : static_cast<uint64_t>(v);
873
                if constexpr (fbits >= 64) {
874
                        return convert_ieee754(static_cast<double>(v));
875
                }
876
                else {
877
                        pack_uint64(negative, abs_v);
23✔
878
                        return *this;
23✔
879
                }
880
        }
881

882
        constexpr hfloat& convert_unsigned(uint64_t v) noexcept {
5✔
883
                if (v == 0) {
5✔
884
                        setzero();
1✔
885
                        return *this;
1✔
886
                }
887
                if constexpr (fbits >= 64) {
888
                        return convert_ieee754(static_cast<double>(v));
889
                }
890
                else {
891
                        pack_uint64(false, v);
4✔
892
                        return *this;
4✔
893
                }
894
        }
895

896
        // Pack a uint64_t magnitude into the hfloat's hex representation.
897
        // Precondition: v != 0 and fbits < 64.
898
        //   value = v = 0.f * 16^hex_exp where 0.f's leading hex digit is non-zero.
899
        // Algorithm:
900
        //   p          = highest set bit of v (0-indexed)
901
        //   hex_exp    = p/4 + 1   (number of hex digits to represent v)
902
        //   shift      = fbits - 4*hex_exp
903
        //   fraction   = (shift >= 0) ? v << shift : v >> -shift  (truncate per HFP)
904
        // Worst-case shift left: p=0, fbits<=56 -> shift=fbits-4, v<<shift fits in
905
        // uint64_t since v has only 1 bit.  Worst-case shift right: p=63, shift=fbits-64.
906
        constexpr void pack_uint64(bool negative, uint64_t v) noexcept {
27✔
907
                // Find highest set bit position (loop is bounded to 64 iterations
908
                // and constexpr-clean; v != 0 by precondition).
909
                unsigned p = 63;
27✔
910
                while (((v >> p) & 1u) == 0u) --p;
1,662✔
911
                int hex_exp = static_cast<int>(p / 4u) + 1;
27✔
912
                int shift = static_cast<int>(fbits) - 4 * hex_exp;
27✔
913
                uint64_t fraction = (shift >= 0)
27✔
914
                        ? (v << static_cast<unsigned>(shift))
27✔
915
                        : (v >> static_cast<unsigned>(-shift));
×
916
                normalize_and_pack(negative, hex_exp, fraction);
27✔
917
        }
27✔
918

919
private:
920

921
        // hfloat - hfloat logic comparisons
922
        template<unsigned N, unsigned E, typename B>
923
        friend constexpr bool operator==(const hfloat<N, E, B>& lhs, const hfloat<N, E, B>& rhs);
924

925
        // hfloat - literal logic comparisons
926
        template<unsigned N, unsigned E, typename B>
927
        friend constexpr bool operator==(const hfloat<N, E, B>& lhs, const double rhs);
928

929
        // literal - hfloat logic comparisons
930
        template<unsigned N, unsigned E, typename B>
931
        friend constexpr bool operator==(const double lhs, const hfloat<N, E, B>& rhs);
932
};
933

934

935
////////////////////////    helper functions   /////////////////////////////////
936

937
template<unsigned ndigits, unsigned es, typename BlockType>
938
inline std::string to_binary(const hfloat<ndigits, es, BlockType>& number, bool nibbleMarker = false) {
53✔
939
        using Hfloat = hfloat<ndigits, es, BlockType>;
940
        std::stringstream s;
53✔
941

942
        // sign bit
943
        s << "0b" << (number.sign() ? '1' : '0') << '.';
53✔
944

945
        // exponent field (es bits)
946
        unsigned expStart = Hfloat::nbits - 2;
53✔
947
        for (unsigned i = 0; i < es; ++i) {
424✔
948
                s << (number.getbit(expStart - i) ? '1' : '0');
371✔
949
        }
950
        s << '.';
53✔
951

952
        // fraction field (fbits bits, show in hex-digit groups)
953
        for (int i = static_cast<int>(Hfloat::fbits) - 1; i >= 0; --i) {
1,837✔
954
                s << (number.getbit(static_cast<unsigned>(i)) ? '1' : '0');
1,784✔
955
                if (nibbleMarker && i > 0 && (i % 4 == 0)) s << '\'';
1,784✔
956
        }
957

958
        return s.str();
106✔
959
}
53✔
960

961
template<unsigned ndigits, unsigned es, typename BlockType>
962
inline std::string to_hex(const hfloat<ndigits, es, BlockType>& number) {
9✔
963
        std::stringstream s;
9✔
964

965
        s << (number.sign() ? '-' : '+');
9✔
966
        s << "0x0.";
9✔
967

968
        // Read exponent and fraction directly from bit storage. unpack() returns
969
        // the fraction as uint64_t, which loses bits for wide configs
970
        // (hfloat_extended has fbits=112); the legacy `frac_val >> (i*4)` loop
971
        // below also had UB for i*4 >= 64, producing wrapped/duplicated digits.
972
        // MSB-first left-fold to assemble the exponent field; safe for any es.
973
        using Hfloat = hfloat<ndigits, es, BlockType>;
974
        unsigned exp_field = 0;
9✔
975
        unsigned expStart  = Hfloat::nbits - 2;
9✔
976
        for (unsigned i = 0; i < es; ++i) {
72✔
977
                exp_field = (exp_field << 1) | (number.getbit(expStart - i) ? 1u : 0u);
63✔
978
        }
979
        int exp_val = static_cast<int>(exp_field) - Hfloat::bias;
9✔
980

981
        for (int i = static_cast<int>(ndigits) - 1; i >= 0; --i) {
123✔
982
                unsigned hex_digit = 0;
114✔
983
                for (unsigned b = 0; b < 4u; ++b) {
570✔
984
                        if (number.getbit(static_cast<unsigned>(i) * 4u + b)) {
456✔
985
                                hex_digit |= (1u << b);
199✔
986
                        }
987
                }
988
                s << "0123456789ABCDEF"[hex_digit];
114✔
989
        }
990
        s << " * 16^" << exp_val;
9✔
991

992
        return s.str();
18✔
993
}
9✔
994

995

996
// native semantic representation: radix-16, delegates to to_hex
997
template<unsigned ndigits, unsigned es, typename BlockType>
998
inline std::string to_native(const hfloat<ndigits, es, BlockType>& v, bool = false) {
999
        return to_hex(v);
1000
}
1001

1002
////////////////////////    HFLOAT functions   /////////////////////////////////
1003

1004
template<unsigned ndigits, unsigned es, typename BlockType>
1005
inline constexpr hfloat<ndigits, es, BlockType> abs(const hfloat<ndigits, es, BlockType>& a) {
1006
        hfloat<ndigits, es, BlockType> result(a);
1007
        result.setsign(false);
1008
        return result;
1009
}
1010

1011
template<unsigned ndigits, unsigned es, typename BlockType>
1012
inline constexpr hfloat<ndigits, es, BlockType> fabs(hfloat<ndigits, es, BlockType> a) {
1013
        a.setsign(false);
1014
        return a;
1015
}
1016

1017

1018
////////////////////////  stream operators   /////////////////////////////////
1019

1020
template<unsigned ndigits, unsigned es, typename BlockType>
1021
inline std::ostream& operator<<(std::ostream& ostr, const hfloat<ndigits, es, BlockType>& i) {
63✔
1022
        std::stringstream ss;
63✔
1023
        std::streamsize prec = ostr.precision();
63✔
1024
        std::streamsize width = ostr.width();
63✔
1025
        std::ios_base::fmtflags ff;
1026
        ff = ostr.flags();
63✔
1027
        ss.flags(ff);
63✔
1028
        ss << std::setw(width) << std::setprecision(prec) << i.str(size_t(prec));
63✔
1029
        return ostr << ss.str();
126✔
1030
}
63✔
1031

1032
template<unsigned ndigits, unsigned es, typename BlockType>
1033
inline std::istream& operator>>(std::istream& istr, hfloat<ndigits, es, BlockType>& p) {
2✔
1034
        std::string txt;
2✔
1035
        if (!(istr >> txt)) {
2✔
1036
                // extraction failed (already-bad stream or EOF); failbit set by >>.
1037
                return istr;
1✔
1038
        }
1039
        if (!parse(txt, p)) {
1✔
1040
                std::cerr << "unable to parse -" << txt << "- into an hfloat value\n";
1✔
1041
                istr.setstate(std::ios::failbit);
1✔
1042
        }
1043
        return istr;
1✔
1044
}
2✔
1045

1046
////////////////// string operators
1047

1048
// Parse a decimal floating-point literal into an hfloat. hfloat has no NaN
1049
// or Inf encoding, so "nan" / "inf" / "infinity" tokens are rejected. The
1050
// decimal payload is converted via decimal_to_binary with 64 bits of
1051
// precision and assembled directly into the hfloat hex fraction -- this
1052
// gives bit-exact correctly-rounded conversion for hfloat_long's full 56
1053
// fbits, which the via-double path could not deliver (double has only 53).
1054
//
1055
// Resolves #849.
1056
template<unsigned ndigits, unsigned es, typename BlockType>
1057
bool parse(const std::string& number, hfloat<ndigits, es, BlockType>& value) {
68✔
1058
        if (number.empty()) return false;
68✔
1059

1060
        // Reject special-value tokens up front (hfloat has no NaN or Inf
1061
        // encoding) and pre-validate the input grammar so malformed tokens are
1062
        // signaled via false return rather than silently mapped to zero by the
1063
        // d2b path.
1064
        std::size_t pos = 0;
67✔
1065
        while (pos < number.size() && std::isspace(static_cast<unsigned char>(number[pos]))) ++pos;
67✔
1066
        if (pos >= number.size()) return false;
67✔
1067
        const std::size_t numeric_start = pos;
67✔
1068
        if (number[pos] == '+' || number[pos] == '-') ++pos;
67✔
1069
        if (pos + 3 <= number.size()) {
67✔
1070
                char a = static_cast<char>(std::tolower(static_cast<unsigned char>(number[pos])));
59✔
1071
                char b = static_cast<char>(std::tolower(static_cast<unsigned char>(number[pos + 1])));
59✔
1072
                char c = static_cast<char>(std::tolower(static_cast<unsigned char>(number[pos + 2])));
59✔
1073
                if ((a == 'n' && b == 'a' && c == 'n')
59✔
1074
                 || (a == 'i' && b == 'n' && c == 'f')) {
55✔
1075
                        return false;  // hfloat cannot represent these
11✔
1076
                }
1077
        }
1078
        // Pre-validate the rest as a decimal floating-point literal:
1079
        // digits[.digits][eE[+-]digits] or .digits[eE[+-]digits].
1080
        bool seen_digit = false;
56✔
1081
        bool seen_dot   = false;
56✔
1082
        if (pos >= number.size()) return false;
56✔
1083
        while (pos < number.size()) {
387✔
1084
                char ch = number[pos];
360✔
1085
                if (ch >= '0' && ch <= '9') { seen_digit = true; ++pos; continue; }
360✔
1086
                if (ch == '.') {
60✔
1087
                        if (seen_dot) return false;
34✔
1088
                        seen_dot = true;
33✔
1089
                        ++pos;
33✔
1090
                        continue;
33✔
1091
                }
1092
                break;
26✔
1093
        }
1094
        if (!seen_digit) return false;
53✔
1095
        if (pos < number.size() && (number[pos] == 'e' || number[pos] == 'E')) {
48✔
1096
                ++pos;
21✔
1097
                if (pos < number.size() && (number[pos] == '+' || number[pos] == '-')) ++pos;
21✔
1098
                bool seen_exp_digit = false;
21✔
1099
                while (pos < number.size() && number[pos] >= '0' && number[pos] <= '9') {
61✔
1100
                        seen_exp_digit = true;
40✔
1101
                        ++pos;
40✔
1102
                }
1103
                if (!seen_exp_digit) return false;
21✔
1104
        }
1105
        if (pos != number.size()) return false;  // trailing junk
46✔
1106

1107
        // Hand the trimmed payload (post-whitespace) to decimal_to_binary.
1108
        // Request fbits of precision when fbits > 64 so wide-fraction configs
1109
        // (hfloat<28,*> with fbits=112) get bit-exact conversion through the
1110
        // d2b multi-limb mantissa instead of being clipped to double-precision
1111
        // like the legacy via-double path. For fbits <= 64 stay at 64 bits --
1112
        // that pinned the hfp64 0.1 truncation test under issue #849 and gives
1113
        // hfp32 a few guard bits for the d2b path's rounding to settle.
1114
        using H = hfloat<ndigits, es, BlockType>;
1115
        constexpr unsigned target_bits = (H::fbits > 64u) ? H::fbits : 64u;
45✔
1116
        std::string_view payload(number.data() + numeric_start, number.size() - numeric_start);
45✔
1117
        auto d = ::sw::universal::decimal_to_binary::convert(payload, target_bits);
45✔
1118
        if (!d.valid) return false;
45✔
1119
        if (d.is_zero) {
45✔
1120
                value = H(SpecificValue::zero);
7✔
1121
                return true;
7✔
1122
        }
1123

1124
        // d2b's binary_scale is the exponent of mantissa's MSB, so the
1125
        // "frexp" exponent (value = 1.frac * 2^(bin_exp - 1)) is binary_scale + 1.
1126
        int bin_exp = static_cast<int>(d.binary_scale) + 1;
38✔
1127

1128
        if constexpr (H::fbits > 64u) {
1129
                // Wide-fraction path: feed d2b's full multi-limb mantissa directly
1130
                // into the hex-aligned fraction. This is what makes parse() deliver
1131
                // precision beyond what double / long double can carry; the legacy
1132
                // uint64_t intermediate would truncate to ~64 bits and defeat the
1133
                // purpose of routing decimal literals through d2b in the first place.
1134
                value.assign_from_wide_mantissa(d.negative, bin_exp, d);
6✔
1135
        }
1136
        else {
1137
                // Extract the 64-bit normalized mantissa (MSB at bit 63) from d2b's
1138
                // bigint. With target_mantissa_bits == 64, mantissa.bit[63] is the
1139
                // implicit-1 of the binary expansion and bits [0, 62] are the fraction.
1140
                std::uint64_t mantissa64 = 0;
32✔
1141
                for (unsigned i = 0; i < 64; ++i) {
2,080✔
1142
                        if (d.mantissa.at(i)) mantissa64 |= (std::uint64_t{1} << i);
2,048✔
1143
                }
1144
                value.assign_from_mantissa64(d.negative, bin_exp, mantissa64);
32✔
1145
        }
1146
        return true;
38✔
1147
}
1148

1149

1150
//////////////////////////////////////////////////////////////////////////////////////////////////////
1151
// hfloat - hfloat binary logic operators
1152

1153
// hfloat values are normalized by normalize_and_pack so that the leading
1154
// hex digit of the fraction is non-zero -- the (sign, exponent, fraction)
1155
// tuple is unique per value.  We compare tuples directly instead of via
1156
// double() to preserve precision for hfloat_long (56-bit fraction) and
1157
// hfloat_extended, where the double round-trip would lose bits below the
1158
// 53-bit IEEE 754 mantissa.
1159
template<unsigned ndigits, unsigned es, typename BlockType>
1160
inline constexpr bool operator==(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
30✔
1161
        bool lz = lhs.iszero();
30✔
1162
        bool rz = rhs.iszero();
30✔
1163
        if (lz && rz) return true;       // +0 == -0 in HFP (no signed-zero distinction)
30✔
1164
        if (lz || rz) return false;
29✔
1165
        if (lhs.sign() != rhs.sign()) return false;
29✔
1166
        bool ts; int le = 0, re = 0;
29✔
1167
        uint64_t lf = 0, rf = 0;
29✔
1168
        lhs.unpack(ts, le, lf);
29✔
1169
        rhs.unpack(ts, re, rf);
29✔
1170
        return le == re && lf == rf;
29✔
1171
}
1172

1173
template<unsigned ndigits, unsigned es, typename BlockType>
1174
inline constexpr bool operator!=(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
22✔
1175
        return !operator==(lhs, rhs);
22✔
1176
}
1177

1178
template<unsigned ndigits, unsigned es, typename BlockType>
1179
inline constexpr bool operator< (const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
15✔
1180
        bool lz = lhs.iszero();
15✔
1181
        bool rz = rhs.iszero();
15✔
1182
        if (lz && rz) return false;
15✔
1183
        bool ls = lhs.sign();
15✔
1184
        bool rs = rhs.sign();
15✔
1185
        if (lz) return !rs;              // 0 < +x; 0 not < -x
15✔
1186
        if (rz) return ls;               // -x < 0; +x not < 0
14✔
1187
        if (ls != rs) return ls;         // negative < positive
13✔
1188
        // Same sign, both non-zero.  unpack returns (sign, biased-exp-removed,
1189
        // fraction).  For positives, lhs < rhs iff (le < re) || tie + (lf < rf).
1190
        // For negatives, the ordering reverses: lhs < rhs iff |lhs| > |rhs|.
1191
        bool ts; int le = 0, re = 0;
12✔
1192
        uint64_t lf = 0, rf = 0;
12✔
1193
        lhs.unpack(ts, le, lf);
12✔
1194
        rhs.unpack(ts, re, rf);
12✔
1195
        if (ls) {
12✔
1196
                // both negative: lhs < rhs iff |lhs| > |rhs|
1197
                return (le > re) || (le == re && lf > rf);
2✔
1198
        }
1199
        // both positive
1200
        return (le < re) || (le == re && lf < rf);
10✔
1201
}
1202

1203
template<unsigned ndigits, unsigned es, typename BlockType>
1204
inline constexpr bool operator> (const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
2✔
1205
        return operator< (rhs, lhs);
2✔
1206
}
1207

1208
template<unsigned ndigits, unsigned es, typename BlockType>
1209
inline constexpr bool operator<=(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
3✔
1210
        return operator< (lhs, rhs) || operator==(lhs, rhs);
3✔
1211
}
1212

1213
template<unsigned ndigits, unsigned es, typename BlockType>
1214
inline constexpr bool operator>=(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
3✔
1215
        return !operator< (lhs, rhs);
3✔
1216
}
1217

1218
//////////////////////////////////////////////////////////////////////////////////////////////////////
1219
// hfloat - literal binary logic operators
1220
template<unsigned ndigits, unsigned es, typename BlockType>
1221
inline constexpr bool operator==(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1222
        return operator==(lhs, hfloat<ndigits, es, BlockType>(rhs));
1223
}
1224
template<unsigned ndigits, unsigned es, typename BlockType>
1225
inline constexpr bool operator!=(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1226
        return !operator==(lhs, rhs);
1227
}
1228
template<unsigned ndigits, unsigned es, typename BlockType>
1229
inline constexpr bool operator< (const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1230
        return operator<(lhs, hfloat<ndigits, es, BlockType>(rhs));
1231
}
1232
template<unsigned ndigits, unsigned es, typename BlockType>
1233
inline constexpr bool operator> (const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1234
        return operator< (hfloat<ndigits, es, BlockType>(rhs), lhs);
1235
}
1236
template<unsigned ndigits, unsigned es, typename BlockType>
1237
inline constexpr bool operator<=(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1238
        return operator< (lhs, rhs) || operator==(lhs, rhs);
1239
}
1240
template<unsigned ndigits, unsigned es, typename BlockType>
1241
inline constexpr bool operator>=(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1242
        return !operator< (lhs, rhs);
1243
}
1244

1245
//////////////////////////////////////////////////////////////////////////////////////////////////////
1246
// literal - hfloat binary logic operators
1247
template<unsigned ndigits, unsigned es, typename BlockType>
1248
inline constexpr bool operator==(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1249
        return operator==(hfloat<ndigits, es, BlockType>(lhs), rhs);
1250
}
1251
template<unsigned ndigits, unsigned es, typename BlockType>
1252
inline constexpr bool operator!=(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1253
        return !operator==(lhs, rhs);
1254
}
1255
template<unsigned ndigits, unsigned es, typename BlockType>
1256
inline constexpr bool operator< (const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1257
        return operator<(hfloat<ndigits, es, BlockType>(lhs), rhs);
1258
}
1259
template<unsigned ndigits, unsigned es, typename BlockType>
1260
inline constexpr bool operator> (const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1261
        return operator< (rhs, lhs);
1262
}
1263
template<unsigned ndigits, unsigned es, typename BlockType>
1264
inline constexpr bool operator<=(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1265
        return operator< (lhs, rhs) || operator==(lhs, rhs);
1266
}
1267
template<unsigned ndigits, unsigned es, typename BlockType>
1268
inline constexpr bool operator>=(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1269
        return !operator< (lhs, rhs);
1270
}
1271

1272
//////////////////////////////////////////////////////////////////////////////////////////////////////
1273
// hfloat - hfloat binary arithmetic operators
1274
template<unsigned ndigits, unsigned es, typename BlockType>
1275
inline constexpr hfloat<ndigits, es, BlockType> operator+(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
67✔
1276
        hfloat<ndigits, es, BlockType> sum(lhs);
67✔
1277
        sum += rhs;
67✔
1278
        return sum;
67✔
1279
}
1280
template<unsigned ndigits, unsigned es, typename BlockType>
1281
inline constexpr hfloat<ndigits, es, BlockType> operator-(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
61✔
1282
        hfloat<ndigits, es, BlockType> diff(lhs);
61✔
1283
        diff -= rhs;
61✔
1284
        return diff;
61✔
1285
}
1286
template<unsigned ndigits, unsigned es, typename BlockType>
1287
inline constexpr hfloat<ndigits, es, BlockType> operator*(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
69✔
1288
        hfloat<ndigits, es, BlockType> mul(lhs);
69✔
1289
        mul *= rhs;
69✔
1290
        return mul;
69✔
1291
}
1292
template<unsigned ndigits, unsigned es, typename BlockType>
1293
inline constexpr hfloat<ndigits, es, BlockType> operator/(const hfloat<ndigits, es, BlockType>& lhs, const hfloat<ndigits, es, BlockType>& rhs) {
21✔
1294
        hfloat<ndigits, es, BlockType> ratio(lhs);
21✔
1295
        ratio /= rhs;
21✔
1296
        return ratio;
21✔
1297
}
1298

1299
//////////////////////////////////////////////////////////////////////////////////////////////////////
1300
// hfloat - literal binary arithmetic operators
1301
template<unsigned ndigits, unsigned es, typename BlockType>
1302
inline constexpr hfloat<ndigits, es, BlockType> operator+(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1303
        return operator+(lhs, hfloat<ndigits, es, BlockType>(rhs));
1304
}
1305
template<unsigned ndigits, unsigned es, typename BlockType>
1306
inline constexpr hfloat<ndigits, es, BlockType> operator-(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1307
        return operator-(lhs, hfloat<ndigits, es, BlockType>(rhs));
1308
}
1309
template<unsigned ndigits, unsigned es, typename BlockType>
1310
inline constexpr hfloat<ndigits, es, BlockType> operator*(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1311
        return operator*(lhs, hfloat<ndigits, es, BlockType>(rhs));
1312
}
1313
template<unsigned ndigits, unsigned es, typename BlockType>
1314
inline constexpr hfloat<ndigits, es, BlockType> operator/(const hfloat<ndigits, es, BlockType>& lhs, const double rhs) {
1315
        return operator/(lhs, hfloat<ndigits, es, BlockType>(rhs));
1316
}
1317

1318
//////////////////////////////////////////////////////////////////////////////////////////////////////
1319
// literal - hfloat binary arithmetic operators
1320
template<unsigned ndigits, unsigned es, typename BlockType>
1321
inline constexpr hfloat<ndigits, es, BlockType> operator+(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1322
        return operator+(hfloat<ndigits, es, BlockType>(lhs), rhs);
1323
}
1324
template<unsigned ndigits, unsigned es, typename BlockType>
1325
inline constexpr hfloat<ndigits, es, BlockType> operator-(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1326
        return operator-(hfloat<ndigits, es, BlockType>(lhs), rhs);
1327
}
1328
template<unsigned ndigits, unsigned es, typename BlockType>
1329
inline constexpr hfloat<ndigits, es, BlockType> operator*(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1330
        return operator*(hfloat<ndigits, es, BlockType>(lhs), rhs);
1331
}
1332
template<unsigned ndigits, unsigned es, typename BlockType>
1333
inline constexpr hfloat<ndigits, es, BlockType> operator/(const double lhs, const hfloat<ndigits, es, BlockType>& rhs) {
1334
        return operator/(hfloat<ndigits, es, BlockType>(lhs), rhs);
1335
}
1336

1337
}} // namespace sw::universal
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc