• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

stillwater-sc / universal / 23609893433

26 Mar 2026 05:55PM UTC coverage: 84.357% (-0.02%) from 84.372%
23609893433

Pull #617

github

web-flow
Merge 3b29626d1 into ffdd14222
Pull Request #617: fix(ci): Suppress Impossible-Case Warnings in Quad-Double API Tests

44364 of 52591 relevant lines covered (84.36%)

6126601.82 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.98
/include/sw/universal/number/cfloat/cfloat_impl.hpp
1
#pragma once
2
// cfloat_impl.hpp: implementation of an arbitrary configuration fixed-size 'classic' floating-point representation
3
// cfloat<> can emulate IEEE-754 floats and the new Deep Learning types, such as 
4
// IEEE-754 half-precision floats
5
// Google bfloat16
6
// NVIDIA TensorFloat 
7
// AMD FP16 and FP32
8
// Microsoft FP8 and FP9
9
// Tesla CFP8, CFP16
10
// 
11
// cfloat<> can also emulate more precise configurations, such as
12
// 80bit IEEE-754 extended precision floats
13
// true 128bit quad precision floats
14
//
15
// Copyright (C) 2017 Stillwater Supercomputing, Inc.
16
// SPDX-License-Identifier: MIT
17
//
18
// This file is part of the universal numbers project, which is released under an MIT Open Source license.
19

20
// compiler specific environment has been delegated to be handled by the
21
// number system include file <universal/number/cfloat/cfloat.hpp>
22
// 
23
// supporting types and functions
24
#include <limits>
25
#include <regex>
26
#include <type_traits>
27
#include <universal/native/ieee754.hpp>
28
#include <universal/native/subnormal.hpp>
29
#include <universal/utility/find_msb.hpp>
30
#include <universal/number/shared/nan_encoding.hpp>
31
#include <universal/number/shared/infinite_encoding.hpp>
32
#include <universal/number/shared/specific_value_encoding.hpp>
33
// arithmetic tracing options
34
#include <universal/number/algorithm/trace_constants.hpp>
35
// cfloat exception structure
36
#include <universal/number/cfloat/exceptions.hpp>
37
// composition types used by cfloat
38
#include <universal/internal/blockbinary/blockbinary.hpp>
39
#include <universal/internal/blocktriple/blocktriple.hpp>
40
#include <universal/number/support/decimal.hpp>
41

42
#ifndef CFLOAT_THROW_ARITHMETIC_EXCEPTION
43
#define CFLOAT_THROW_ARITHMETIC_EXCEPTION 0
44
#endif
45

46
#ifndef TRACE_CONVERSION
47
#define TRACE_CONVERSION 0
48
#endif
49

50
namespace sw { namespace universal {
51

52
/*
53
 * classic floating-point cfloat offers subnormals, max-exponent values, and
54
 * saturation. The default configuration turns off subnormals, max-exponent
55
 * values, and projects values outside of their dynamic range to +-inf
56
 *
57
 * Behavior flags
58
 *   hasSubnormals   : gradual underflow: use all fraction encodings when exponent is all 0's
59
 *   hasMaxExpValues  : reclaim max-exponent encodings as numeric values instead of
60
 *                      reserving the entire all-1s exponent binade for inf/NaN;
61
 *                      only 4 encodings are reserved for +-inf and quiet/signalling NaN
62
 *   isSaturating     : saturate to maxneg or maxpos when value is out of dynamic range
63
 */
64

65
/// <summary>
66
/// decode an cfloat value into its constituent parts
67
/// </summary>
68
/// <typeparam name="bt">block type</typeparam>
69
/// <param name="v">cfloat value to decode (input: const ref)</param>
70
/// <param name="s">sign (output: bool ref)</param>
71
/// <param name="e">exponent (output: blockbinary ref)</param>
72
/// <param name="f">fraction (output: blockbinary ref)</param>
73
template<unsigned nbits, unsigned es, unsigned fbitsPlus1, typename bt,
74
        bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
75
void decode(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& v, bool& s, blockbinary<es, bt>& e, blockbinary<fbitsPlus1, bt>& f) {
800✔
76
        v.sign(s);
800✔
77
        v.exponent(e);
800✔
78
        v.fraction(f);
800✔
79
}
800✔
80

81
/// <summary>
82
/// return the binary scale of the given number
83
/// </summary>
84
/// <typeparam name="bt">Block type used for storage: derived through ADL</typeparam>
85
/// <param name="v">the cfloat number for which we seek to know the binary scale</param>
86
/// <returns>binary scale, i.e. 2^scale, of the value of the cfloat</returns>
87
template<unsigned nbits, unsigned es, typename bt,
88
        bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
89
int scale(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& v) {
329✔
90
        return v.scale();
329✔
91
}
92

93
/// <summary>
94
/// convert a blocktriple to a cfloat. blocktriples come out of the arithmetic
95
/// engine in the form ii.ff...ff and a scale. The conversion must take this
96
/// denormalized form into account during conversion.
97
/// 
98
/// The blocktriple must be in this form to round correctly, as all the bits
99
/// after an arithmetic operation must be taken into account.
100
/// 
101
/// Transformation:
102
///    ii.ff...ff  transform to    s.eee.fffff
103
/// All number systems that depend on blocktriple will need to have
104
/// the rounding decision answered, so that functionality can be
105
/// reused if we locate it inside blocktriple.
106
/// 
107
/// if (srcbits > fbits) // we need to round
108
///     if (ii.00..00 > 1) 
109
///         mask is at srcbits - fbits + 1
110
///     else 
111
///                    mask is at srcbits - fbits
112
/// }
113
/// else {               // no need to round
114
/// }
115
/// </summary>
116
/// <typeparam name="bt">type of the block used for cfloat storage</typeparam>
117
/// <param name="src">the blocktriple to be converted</param>
118
/// <param name="tgt">the resulting cfloat</param>
119
template<unsigned srcbits, BlockTripleOperator op, unsigned nbits, unsigned es, typename bt,
120
        bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
121
inline /*constexpr*/ void convert(const blocktriple<srcbits, op, bt>& src, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& tgt) {
78,061,516✔
122
        using btType = blocktriple<srcbits, op, bt>;
123
        using cfloatType = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
124
        // test special cases
125
        if (src.isnan()) {
78,061,516✔
126
                tgt.setnan(src.sign() ? NAN_TYPE_SIGNALLING : NAN_TYPE_QUIET);
560✔
127
        }
128
        else        if (src.isinf()) {
78,060,956✔
129
                tgt.setinf(src.sign());
560✔
130
        }
131
        else         if (src.iszero()) {
78,060,396✔
132
                tgt.setzero();
57,364✔
133
                tgt.setsign(src.sign()); // preserve sign
57,364✔
134
        }
135
        else {
136
                int significandScale = src.significandscale();
78,003,032✔
137
                int exponent = src.scale() + significandScale;
78,003,032✔
138
                // blocktriple keeps arithmetic results intentionally unnormalized. scale() tracks the source binade,
139
                // while significandScale() reports whether the exact result spilled into the integer headroom above
140
                // the radix. cfloat conversion has to combine both before it can classify the value as normal,
141
                // subnormal, or overflowing.
142
                // special case of underflow
143
                if constexpr (hasSubnormals) {
144
//                        std::cout << "exponent = " << exponent << " bias = " << cfloatType::EXP_BIAS << " exp subnormal = " << cfloatType::MIN_EXP_SUBNORMAL << '\n';
145
                        // why must exponent be less than (minExpSubnormal - 1) to be rounded to zero? 
146
                        // because the half-way value that would round up to minpos is at exp = (minExpSubnormal - 1)
147
                        if (exponent < cfloatType::MIN_EXP_SUBNORMAL) {
54,651,252✔
148
                                tgt.setzero();
192,399✔
149
                                if (exponent == (cfloatType::MIN_EXP_SUBNORMAL - 1)) {
192,399✔
150
                                        // -exponent because we are right shifting and exponent in this range is negative
151
                                        int adjustment = -(exponent + subnormal_reciprocal_shift[es]);
42,879✔
152
                                        std::pair<bool, unsigned> alignment = src.roundingDecision(adjustment);
42,879✔
153
                                        if (alignment.first) ++tgt; // we are minpos
42,879✔
154
                                }
155
                                tgt.setsign(src.sign());
192,399✔
156
                                return;
3,127,281✔
157
                        }
158
                }
159
                else {
160
                        if (exponent + cfloatType::EXP_BIAS <= 0) {  // value is in the subnormal range, which maps to 0
23,351,780✔
161
                                tgt.setzero();
363,536✔
162
                                tgt.setsign(src.sign());
363,536✔
163
                                return;
1,241,231✔
164
                        }
165
                }
166
                // special case of overflow
167
                if constexpr (hasMaxExpValues) {
168
                        if constexpr (isSaturating) {
169
                                if (exponent > cfloatType::MAX_EXP) {
21,098,841✔
170
                                        if (src.sign()) tgt.maxneg(); else tgt.maxpos();
934,372✔
171
                                        return;
934,372✔
172
                                }
173
                        }
174
                        else {
175
                                if (exponent > cfloatType::MAX_EXP) {
30,556,233✔
176
                                        tgt.setinf(src.sign());
2,673,581✔
177
                                        return;
2,673,581✔
178
                                }
179
                        }
180
                }
181
                else {  // no max-exponent values: will saturate at a different encoding: TODO can we hide it all in maxpos?
182
                        if constexpr (isSaturating) {
183
                                if (exponent > cfloatType::MAX_EXP) {
3,704✔
184
                                        if (src.sign()) tgt.maxneg(); else tgt.maxpos();
7✔
185
                                        return;
7✔
186
                                }
187
                        }
188
                        else {
189
                                if (exponent > cfloatType::MAX_EXP) {
25,788,319✔
190
                                        tgt.setinf(src.sign());
204,617✔
191
                                        return;
204,617✔
192
                                }
193
                        }
194
                }
195

196
                // our value needs to go through rounding to be correctly interpreted
197
                // 
198
                // tgt.clear();  // no need as all bits are going to be set by the code below
199

200
                // exponent construction
201
                int adjustment{ 0 };
73,634,520✔
202
                // construct exponent
203
                uint64_t biasedExponent = static_cast<uint64_t>(static_cast<long long>(exponent) + static_cast<long long>(cfloatType::EXP_BIAS)); // this is guaranteed to be positive if exponent in encoding range
73,634,520✔
204
//                        std::cout << "exponent         " << to_binary(biasedExponent) << '\n';        
205
                if constexpr (hasSubnormals) {
206
                        //if (exponent >= cfloatType::MIN_EXP_SUBNORMAL && exponent < cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::MIN_EXP_NORMAL) {
207
                        if (exponent < cfloatType::MIN_EXP_NORMAL) {
51,523,971✔
208
                                // the value is in the subnormal range of the cfloat
209
                                biasedExponent = 0;
35,473,284✔
210
                                // -exponent because we are right shifting and exponent in this range is negative
211
                                adjustment = -(exponent + subnormal_reciprocal_shift[es]);
35,473,284✔
212
                                // this is the right shift adjustment required for subnormal representation due 
213
                                // to the scale of the input number, i.e. the exponent of 2^-adjustment
214
                        }
215
                        else {
216
                                // the value is in the normal range of the cfloat
217
                                biasedExponent = static_cast<uint64_t>(static_cast<long long>(exponent) + static_cast<long long>(cfloatType::EXP_BIAS)); // this is guaranteed to be positive
16,050,687✔
218
                        }
219
                }
220
                else {
221
                        if (exponent < cfloatType::MIN_EXP_NORMAL) biasedExponent = 1ull; // fixup biasedExponent if we are in the subnormal region
22,110,549✔
222
                }
223

224

225
                // get the rounding direction and the LSB right shift: 
226
                std::pair<bool, unsigned> alignment = src.roundingDecision(adjustment);
73,634,520✔
227
                unsigned rightShift = alignment.second;  // this is the shift to get the LSB of the src to the LSB of the tgt
73,634,520✔
228
                // rightShift is also the normalization step: it lines the blocktriple hidden bit up with the cfloat
229
                // hidden-bit position, while the bool tells us whether the discarded tail rounds the retained payload up.
230
                //std::cout << "rightShift       " << rightShift << '\n';
231

232
                if constexpr (btType::bfbits < 65) {
233
                        bool roundup = alignment.first;
73,102,643✔
234
                        //std::cout << "round-up?        " << (roundup ? "yes" : "no") << '\n';
235
                        // we can use a uint64_t to construct the cfloat
236
                        uint64_t raw = (src.sign() ? 1ull : 0ull); // process sign
73,102,643✔
237
                        //std::cout << "raw bits (sign)  " << to_binary(raw) << '\n';
238
                        // construct the fraction bits
239
                        uint64_t fracbits = src.significand_ull(); // get all the bits, including the integer bits
73,102,643✔
240
                        //std::cout << "fracbits         " << to_binary(fracbits) << '\n';
241
                        fracbits >>= rightShift;
73,102,643✔
242
                        //std::cout << "fracbits shifted " << to_binary(fracbits) << '\n';
243
                        fracbits &= cfloatType::ALL_ONES_FR; // remove the hidden bit
73,102,643✔
244
                        //std::cout << "fracbits masked  " << to_binary(fracbits) << '\n';
245
                        if (roundup) ++fracbits;
73,102,643✔
246
                        // Rounding can turn 1.111... into 10.000..., so the carry has to be reinterpreted as an exponent bump.
247
                        // This is the last place where the unrounded blocktriple state can still change the target binade.
248
                        if (fracbits == (1ull << cfloatType::fbits)) { // check for overflow
73,102,643✔
249
                                if (biasedExponent == cfloatType::ALL_ONES_ES) {
433,221✔
250
                                        fracbits = cfloatType::INF_ENCODING; // project to INF
6,424✔
251
                                }
252
                                else {
253
                                        ++biasedExponent;
426,797✔
254
                                        fracbits = 0;
426,797✔
255
                                }
256
                        }
257

258
                        raw <<= es; // shift sign to make room for the exponent bits
73,102,643✔
259
                        raw |= biasedExponent;
73,102,643✔
260
                        //std::cout << "raw bits (exp)   " << to_binary(raw) << '\n';
261
                        raw <<= cfloatType::fbits; // make room for the fraction bits
73,102,643✔
262
                        //std::cout << "raw bits (s+exp) " << to_binary(raw) << '\n';
263
                        raw |= fracbits;
73,102,643✔
264
                        //std::cout << "raw bits (final) " << to_binary(raw) << '\n';
265
                        tgt.setbits(raw);
73,102,643✔
266
//                        std::cout << "raw bits (all)   " << to_binary(raw) << '\n';
267
                        if constexpr (isSaturating) {
268
                                if (tgt.isnan()) {
20,168,159✔
269
                                        if (src.sign()) {
722✔
270
                                                tgt.maxneg();        // map back to maxneg
361✔
271
                                        }
272
                                        else {
273
                                                tgt.maxpos();        // map back to maxpos
361✔
274
                                        }
275
                                }
276
                        }
277
                        else {
278
                                // when you get too far, map it back to +-inf: 
279
                                // TBD: this doesn't appear to be the right algorithm to catch all overflow patterns
280
                                if (tgt.isnan()) tgt.setinf(src.sign());        // map back to +-inf
52,934,484✔
281
                        }
282
                }
283
                else {
284
                        // compose the segments
285
                        bool roundup = alignment.first;
531,877✔
286
                        auto fracbits = src.significand();
531,877✔
287
                        fracbits >>= static_cast<int>(rightShift);
531,877✔
288
                        fracbits.setbit(cfloatType::fbits, false); // remove the hidden bit (matches narrow path: fracbits &= ALL_ONES_FR)
531,877✔
289

290
                        // apply rounding (matches the bfbits < 65 path above)
291
                        if (roundup) fracbits.increment();
531,877✔
292

293
                        // check for rounding overflow: if the fraction carried into
294
                        // the hidden-bit position, clear the fraction and bump the exponent
295
                        if (fracbits.test(cfloatType::fbits)) {
531,877✔
296
                                fracbits.clear();
33✔
297
                                if (biasedExponent == cfloatType::ALL_ONES_ES) {
33✔
298
                                        // rounding overflowed beyond the maximum binade:
299
                                        // handle explicitly because the isnan() remap below
300
                                        // does not catch all configurations (e.g. hasMaxExpValues
301
                                        // with zero fraction is a valid finite encoding, not NaN)
302
                                        if constexpr (isSaturating) {
303
                                                if (src.sign()) tgt.maxneg(); else tgt.maxpos();
×
304
                                        }
305
                                        else {
306
                                                tgt.setinf(src.sign());
×
307
                                        }
308
                                        return;
×
309
                                }
310
                                else {
311
                                        ++biasedExponent;
33✔
312
                                        ++exponent;
33✔
313
                                }
314
                        }
315

316
                        // copy the blocks that contain fraction bits
317
                        tgt.clear();
531,877✔
318
                        for (unsigned b = 0; b < btType::nrBlocks; ++b) {
1,074,250✔
319
                                tgt.setblock(b, fracbits.block(b));
542,373✔
320
                        }
321
                        tgt.setsign(src.sign());
531,877✔
322
                        if (!tgt.setexponent(exponent)) {
531,877✔
323
                                std::cerr << "exponent value is out of range: " << exponent << '\n';
×
324
                        }
325

326
                        // saturation / overflow-to-inf handling (matches the bfbits < 65 path)
327
                        if constexpr (isSaturating) {
328
                                if (tgt.isnan()) {
7✔
329
                                        if (src.sign()) {
×
330
                                                tgt.maxneg();
×
331
                                        }
332
                                        else {
333
                                                tgt.maxpos();
×
334
                                        }
335
                                }
336
                        }
337
                        else {
338
                                if (tgt.isnan()) tgt.setinf(src.sign());
531,870✔
339
                        }
340
                }
341
        }
342
}
343

344

345
/// <summary>
346
/// An arbitrary, fixed-size floating-point number with configurable gradual under/overflow and saturation/non-saturation arithmetic.
347
/// Default configuration offers normal encoding and non-saturating arithmetic.
348
/// /// </summary>
349
/// <typeparam name="nbits">number of bits in the encoding</typeparam>
350
/// <typeparam name="es">number of exponent bits in the encoding</typeparam>
351
/// <typeparam name="bt">the type to use as storage class: one of [uint8_t|uint16_t|uint32_t]</typeparam>
352
/// <typeparam name="hasSubnormals">configure gradual underflow (==subnormals)</typeparam>
353
/// <typeparam name="hasMaxExpValues">reclaim max-exponent encodings as numeric values</typeparam>
354
/// <typeparam name="isSaturating">configure saturation arithmetic</typeparam>
355
template<unsigned _nbits, unsigned _es, typename bt = uint8_t,
356
        bool _hasSubnormals = false, bool _hasMaxExpValues = false, bool _isSaturating = false>
357
class cfloat {
358
public:
359
        static_assert(_nbits > _es + 1ull, "nbits is too small to accomodate the requested number of exponent bits");
360
        static_assert(_es < 21ull, "my God that is a big number, are you trying to break the Interweb?");
361
        static_assert(_es > 0, "number of exponent bits must be bigger than 0 to be a classic floating point number");
362
        // how do you assert on the condition that if es == 1 then subnormals and max-exponent values must be true?
363
        static constexpr bool     subsuper = (_hasSubnormals && _hasMaxExpValues);
364
        static constexpr bool     special = (subsuper ? true : (_es > 1));
365
        static_assert(special, "when es == 1, cfloat must have both subnormals and max-exponent values");
366
        static constexpr unsigned bitsInByte = 8u;
367
        static constexpr unsigned bitsInBlock = sizeof(bt) * bitsInByte;
368
        static_assert(bitsInBlock <= 64, "storage unit for block arithmetic needs to be <= uint64_t"); // TODO: carry propagation on uint64_t requires assembly code
369

370
        static constexpr unsigned nbits = _nbits;
371
        static constexpr unsigned es = _es;
372
        static constexpr unsigned fbits  = nbits - 1u - es;    // number of fraction bits excluding the hidden bit
373
        static constexpr unsigned fhbits = nbits - es;           // number of fraction bits including the hidden bit
374

375
        static constexpr uint64_t  storageMask = (0xFFFFFFFFFFFFFFFFull >> (64ull - bitsInBlock));
376
        static constexpr bt       BLOCK_MASK = bt(~0);
377
        static constexpr bt       ALL_ONES = bt(~0); // block type specific all 1's value
378
        static constexpr uint32_t ALL_ONES_ES = (0xFFFF'FFFFul >> (32 - es));
379
        static constexpr uint64_t topfbits = fbits % 64;
380
        static constexpr uint64_t FR_SHIFT = (topfbits > 0 ? (64 - topfbits) : 0);
381
        static constexpr uint64_t ALL_ONES_FR = (topfbits > 0 ? (0xFFFF'FFFF'FFFF'FFFFull >> FR_SHIFT) : 0ull); // special case for nbits <= 64
382
        static constexpr uint64_t INF_ENCODING = (ALL_ONES_FR & ~1ull);
383

384
        static constexpr unsigned nrBlocks = 1u + ((nbits - 1ull) / bitsInBlock);
385
        static constexpr unsigned MSU = nrBlocks - 1u; // MSU == Most Significant Unit, as MSB is already taken
386
        static constexpr bt       MSU_MASK = (ALL_ONES >> (nrBlocks * bitsInBlock - nbits));
387
        static constexpr unsigned bitsInMSU = bitsInBlock - (nrBlocks * bitsInBlock - nbits);
388
        static constexpr unsigned fBlocks = 1ull + ((fbits - 1ull) / bitsInBlock); // nr of blocks with fraction bits
389
        static constexpr unsigned FSU = fBlocks - 1u;  // FSU = Fraction Significant Unit: the index of the block that contains the most significant fraction bits
390
        static constexpr bt       FSU_MASK = (ALL_ONES >> (fBlocks * bitsInBlock - fbits));
391
        static constexpr unsigned bitsInFSU = bitsInBlock - (fBlocks * bitsInBlock - fbits);
392

393
        static constexpr bt       SIGN_BIT_MASK = bt(bt(1ull) << ((nbits - 1ull) % bitsInBlock));
394
        static constexpr bt       LSB_BIT_MASK = bt(1ull);
395
        static constexpr bool     MSU_CAPTURES_EXP = (1ull + es) <= bitsInMSU;
396
        static constexpr unsigned EXP_SHIFT = (MSU_CAPTURES_EXP ? (1 == nrBlocks ? (nbits - 1ull - es) : (bitsInMSU - 1ull - es)) : 0);
397
        static constexpr bt       MSU_EXP_MASK = ((ALL_ONES << EXP_SHIFT) & ~SIGN_BIT_MASK) & MSU_MASK;
398
        static constexpr int      EXP_BIAS = ((1 << (es - 1u)) - 1l);
399
        static constexpr int      MAX_EXP = (es == 1) ? 1 : ((1 << es) - EXP_BIAS - 1);
400
        static constexpr int      MIN_EXP_NORMAL = 1 - EXP_BIAS;
401
        static constexpr int      MIN_EXP_SUBNORMAL = 1 - EXP_BIAS - int(fbits); // the scale of smallest ULP
402

403
        static constexpr bool     hasSubnormals   = _hasSubnormals;
404
        static constexpr bool     hasMaxExpValues = _hasMaxExpValues;
405
        static constexpr bool     isSaturating    = _isSaturating;
406
        typedef bt BlockType;
407

408
        // constructors
409
        cfloat() = default;
410
        cfloat(const cfloat&) = default;
411
        cfloat& operator=(const cfloat&) = default;
412

413
        // construct a cfloat from another
414
        template<unsigned nnbits, unsigned ees, typename bbt, bool ssub, bool ssup, bool ssat>
415
        cfloat(const cfloat<nnbits, ees, bbt, ssub, ssup, ssat>& rhs) noexcept : _block{} {
10,995✔
416
                if (rhs.isnan()) {
10,995✔
417
                        setnan(rhs.sign() ? NAN_TYPE_SIGNALLING : NAN_TYPE_QUIET);
461✔
418
                }
419
                else if (rhs.isinf()) {
10,534✔
420
                        setinf(rhs.sign());
23✔
421
                }
422
                else if (rhs.iszero()) {
10,511✔
423
                        setzero();
1,125✔
424
                }
425
                else {
426
                        if constexpr (std::is_same_v<bt, bbt>) {
427
                                // Full-precision path: create a blocktriple sized for the TARGET's
428
                                // fraction width, pre-align the source fraction bits into it (with
429
                                // proper rounding-bit collection), then let convert() handle
430
                                // rounding and encoding.
431
                                constexpr unsigned srcFbits = nnbits - 1u - ees;
4,968✔
432
                                using TgtBT = blocktriple<fbits, BlockTripleOperator::ADD, bt>;
433
                                constexpr unsigned addRbits = TgtBT::rbits;
4,968✔
434
                                TgtBT value;
4,968✔
435

436
                                bool srcSign = rhs.sign();
4,968✔
437
                                int srcScale = rhs.scale();
4,968✔
438

439
                                if constexpr (srcFbits < 64 && (fbits + addRbits) < 64) {
440
                                        // Fast path: fits in uint64_t
441
                                        // Guard requires (fbits + addRbits) < 64 to prevent UB from
442
                                        // left-shift of amount (fbits + addRbits - srcFbits) >= 64.
443
                                        uint64_t srcRaw = rhs.fraction_ull();
4,968✔
444
                                        if (rhs.isnormal() || rhs.ismaxexpvalue()) {
4,968✔
445
                                                // Normal and max-exponent values both have a hidden bit
446
                                                srcRaw |= (1ull << srcFbits);
4,968✔
447
                                        }
448
                                        else {
449
                                                // Normalize subnormal: find leading 1, shift to hidden-bit position
450
                                                using SrcCfloat = cfloat<nnbits, ees, bbt, ssub, ssup, ssat>;
451
                                                srcScale = SrcCfloat::MIN_EXP_NORMAL;
×
452
                                                for (int i = static_cast<int>(srcFbits) - 1; i >= 0; --i) {
×
453
                                                        if (srcRaw & (1ull << i)) {
×
454
                                                                unsigned shift = srcFbits - static_cast<unsigned>(i);
×
455
                                                                srcScale -= static_cast<int>(shift);
×
456
                                                                srcRaw <<= shift;
×
457
                                                                break;
×
458
                                                        }
459
                                                }
460
                                        }
461
                                        // Align source hidden bit (at srcFbits) to target position (fbits + addRbits)
462
                                        uint64_t tgtRaw;
463
                                        if constexpr (srcFbits >= fbits + addRbits) {
464
                                                constexpr unsigned rshift = srcFbits - fbits - addRbits;
4,230✔
465
                                                if constexpr (rshift > 0) {
466
                                                        uint64_t stickyMask = (1ull << rshift) - 1;
646✔
467
                                                        bool sticky = (srcRaw & stickyMask) != 0;
646✔
468
                                                        tgtRaw = srcRaw >> rshift;
646✔
469
                                                        if (sticky) tgtRaw |= 1; // fold discarded bits into sticky
646✔
470
                                                }
471
                                                else {
472
                                                        tgtRaw = srcRaw;
3,584✔
473
                                                }
474
                                        }
475
                                        else {
476
                                                constexpr unsigned lshift = fbits + addRbits - srcFbits;
738✔
477
                                                tgtRaw = srcRaw << lshift;
738✔
478
                                        }
479
                                        value.setsign(srcSign);
4,968✔
480
                                        value.setscale(srcScale);
4,968✔
481
                                        value.setbits(tgtRaw);
4,968✔
482
                                }
483
                                else {
484
                                        // Wide path: bit-by-bit copy with alignment
485
                                        int offset = static_cast<int>(fbits + addRbits) - static_cast<int>(srcFbits);
486
                                        // Initialize radix via setbits, then overwrite significand
487
                                        value.setbits(0);
488
                                        bool sticky = false;
489
                                        for (unsigned i = 0; i < srcFbits; ++i) {
490
                                                bool v = rhs.at(i);
491
                                                if (!v) continue;
492
                                                int tgtBit = static_cast<int>(i) + offset;
493
                                                if (tgtBit >= 0 && tgtBit < static_cast<int>(TgtBT::bfbits)) {
494
                                                        value.setbit(static_cast<unsigned>(tgtBit), true);
495
                                                }
496
                                                else if (tgtBit < 0) {
497
                                                        sticky = true;
498
                                                }
499
                                        }
500
                                        if (sticky) value.setbit(0, true);
501
                                        if (rhs.isnormal() || rhs.ismaxexpvalue()) {
502
                                                // Normal and max-exponent values both have a hidden bit
503
                                                value.setbit(static_cast<unsigned>(fbits + addRbits), true);
504
                                        }
505
                                        else {
506
                                                // Normalize subnormal for wide path.
507
                                                // The bit-copy loop above placed source bit i at target position
508
                                                // i + offset = i + (fbits + addRbits - srcFbits).  The blocktriple
509
                                                // scale represents the exponent such that:
510
                                                //   value = 2^scale * significant * 2^(-fbits-addRbits)
511
                                                // For the subnormal with leading 1 at source bit i, the significant
512
                                                // integer has its leading 1 at position i + fbits + addRbits - srcFbits,
513
                                                // giving value = 2^scale * 2^(i - srcFbits) * 1.xxx.
514
                                                // The true subnormal value is 2^MIN_EXP_NORMAL * 2^(i - srcFbits) * 1.xxx,
515
                                                // so scale = MIN_EXP_NORMAL with no further adjustment.
516
                                                using SrcCfloat = cfloat<nnbits, ees, bbt, ssub, ssup, ssat>;
517
                                                srcScale = SrcCfloat::MIN_EXP_NORMAL;
518
                                                // (No srcScale -= shift: the offset alignment already accounts for
519
                                                //  the bit position; subtracting shift would apply it twice.)
520
                                        }
521
                                        value.setnormal(); // clear _zero flag set by setbits(0)
522
                                        value.setsign(srcSign);
523
                                        value.setscale(srcScale);
524
                                }
525
                                convert(value, *this);
4,968✔
526
                        }
527
                        else {
528
                                // Cross-block-type: marshall through double.
529
                                // Safe only when the source fraction fits in double's 52-bit significand.
530
                                constexpr unsigned srcFbitsXB = nnbits - ees - 1u;
4,418✔
531
                                static_assert(srcFbitsXB <= 52u,
532
                                        "cfloat converting constructor: source and target have different "
533
                                        "block types and source has more fraction bits than double can represent; "
534
                                        "use matching block types for full-precision cross-config conversion");
535
                                *this = double(rhs);
4,418✔
536
                        }
537
                }
538
        }
10,995✔
539

540
        // converting constructors
541
        constexpr cfloat(const std::string& stringRep) : _block{} { assign(stringRep); }
1✔
542
        // specific value constructor
543
        constexpr cfloat(const SpecificValue code) noexcept : _block{} {
309✔
544
                switch (code) {
309✔
545
                case SpecificValue::maxpos:
95✔
546
                        maxpos();
95✔
547
                        break;
95✔
548
                case SpecificValue::minpos:
111✔
549
                        minpos();
111✔
550
                        break;
111✔
551
                case SpecificValue::zero:
×
552
                default:
553
                        zero();
×
554
                        break;
×
555
                case SpecificValue::minneg:
×
556
                        minneg();
×
557
                        break;
×
558
                case SpecificValue::maxneg:
48✔
559
                        maxneg();
48✔
560
                        break;
48✔
561
                case SpecificValue::infpos:
21✔
562
                        setinf(false);
21✔
563
                        break;
21✔
564
                case SpecificValue::infneg:
×
565
                        setinf(true);
×
566
                        break;
×
567
                case SpecificValue::nar: // approximation as cfloats don't have a NaR
17✔
568
                case SpecificValue::qnan:
569
                        setnan(NAN_TYPE_QUIET);
17✔
570
                        break;
17✔
571
                case SpecificValue::snan:
17✔
572
                        setnan(NAN_TYPE_SIGNALLING);
17✔
573
                        break;
17✔
574
                }
575
        }
309✔
576

577
        constexpr cfloat(signed char iv)                    noexcept : _block{} { *this = iv; }
578
        constexpr cfloat(short iv)                          noexcept : _block{} { *this = iv; }
579
        constexpr cfloat(int iv)                            noexcept : _block{} { *this = iv; }
114,803✔
580
        constexpr cfloat(long iv)                           noexcept : _block{} { *this = iv; }
581
        constexpr cfloat(long long iv)                      noexcept : _block{} { *this = iv; }
1✔
582
        constexpr cfloat(char iv)                           noexcept : _block{} { *this = iv; }
583
        constexpr cfloat(unsigned short iv)                 noexcept : _block{} { *this = iv; }
584
        constexpr cfloat(unsigned int iv)                   noexcept : _block{} { *this = iv; }
585
        constexpr cfloat(unsigned long iv)                  noexcept : _block{} { *this = iv; }
100✔
586
        constexpr cfloat(unsigned long long iv)             noexcept : _block{} { *this = iv; }
587
        CONSTEXPRESSION cfloat(float iv)                    noexcept : _block{} { *this = iv; }
340,804✔
588
        CONSTEXPRESSION cfloat(double iv)                   noexcept : _block{} { *this = iv; }
515,969✔
589

590
        // assignment operators
591
        constexpr cfloat& operator=(signed char rhs)        noexcept { return convert_signed_integer(rhs); }
592
        constexpr cfloat& operator=(short rhs)              noexcept { return convert_signed_integer(rhs); }
593
        constexpr cfloat& operator=(int rhs)                noexcept { return convert_signed_integer(rhs); }
174,971✔
594
        constexpr cfloat& operator=(long rhs)               noexcept { return convert_signed_integer(rhs); }
595
        constexpr cfloat& operator=(long long rhs)          noexcept { return convert_signed_integer(rhs); }
1✔
596

597
        constexpr cfloat& operator=(char rhs)               noexcept { return convert_unsigned_integer(rhs); }
598
        constexpr cfloat& operator=(unsigned short rhs)     noexcept { return convert_unsigned_integer(rhs); }
599
        constexpr cfloat& operator=(unsigned int rhs)       noexcept { return convert_unsigned_integer(rhs); }
30✔
600
        constexpr cfloat& operator=(unsigned long rhs)      noexcept { return convert_unsigned_integer(rhs); }
110✔
601
        constexpr cfloat& operator=(unsigned long long rhs) noexcept { return convert_unsigned_integer(rhs); }
602

603
        BIT_CAST_CONSTEXPR cfloat& operator=(float rhs)     noexcept { return convert_ieee754(rhs); }
32,488,679✔
604
        BIT_CAST_CONSTEXPR cfloat& operator=(double rhs)    noexcept { return convert_ieee754(rhs); }
105,764,705✔
605

606
        // make conversions to native types explicit
607
        explicit operator int()                       const noexcept { return to_int(); }
59,976✔
608
        explicit operator long()                      const noexcept { return to_long(); }
609
        explicit operator long long()                 const noexcept { return to_long_long(); }
1✔
610
        explicit operator float()                     const noexcept { return to_native<float>(); }
35,177,206✔
611
        explicit operator double()                    const noexcept { return to_native<double>(); }
78,241,504✔
612

613
        // guard long double support to enable ARM and RISC-V embedded environments
614
#if LONG_DOUBLE_SUPPORT
615
        explicit operator long double()               const noexcept { return to_native<long double>(); }
54✔
616
        BIT_CAST_CONSTEXPR cfloat(long double iv)           noexcept : _block{} { *this = iv; }
2✔
617
        BIT_CAST_CONSTEXPR cfloat& operator=(long double rhs)  noexcept { return convert_ieee754(rhs); }
74✔
618
#endif
619

620
        // arithmetic operators
621
        // prefix operator
622
        inline cfloat operator-() const {
8,354,860✔
623
                cfloat tmp(*this);
8,354,860✔
624
                tmp._block[MSU] ^= SIGN_BIT_MASK;
8,354,860✔
625
                return tmp;
8,354,860✔
626
        }
627

628
        cfloat& operator+=(const cfloat& rhs) CFLOAT_EXCEPT {
18,485,463✔
629
                if constexpr (_trace_add) std::cout << "---------------------- ADD -------------------" << std::endl;
630
                // special case handling of the inputs
631
#if CFLOAT_THROW_ARITHMETIC_EXCEPTION
632
                if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
744✔
633
                        throw cfloat_operand_is_nan{};
×
634
                }
635
#else
636
                if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
18,484,719✔
637
                        setnan(NAN_TYPE_SIGNALLING);
236,541✔
638
                        return *this;
236,541✔
639
                }
640
                if (isnan(NAN_TYPE_QUIET) || rhs.isnan(NAN_TYPE_QUIET)) {
18,248,178✔
641
                        setnan(NAN_TYPE_QUIET);
216,964✔
642
                        return *this;
216,964✔
643
                }
644
#endif
645
                // normal + inf  = inf
646
                // normal + -inf = -inf
647
                // inf + normal = inf
648
                // inf + inf    = inf
649
                // inf + -inf    = ?
650
                // -inf + normal = -inf
651
                // -inf + -inf   = -inf
652
                // -inf + inf    = ?
653
                if (isinf()) {
18,031,958✔
654
                        if (rhs.isinf()) {
60,327✔
655
                                if (sign() != rhs.sign()) {
939✔
656
                                        setnan(NAN_TYPE_SIGNALLING);
514✔
657
                                }
658
                                return *this;
939✔
659
                        }
660
                        else {
661
                                return *this;
59,388✔
662
                        }
663
                }
664
                else {
665
                        if (rhs.isinf()) {
17,971,631✔
666
                                *this = rhs;
58,293✔
667
                                return *this;
58,293✔
668
                        }
669
                }
670

671
                if (iszero()) {
17,913,338✔
672
                        *this = rhs;
206,307✔
673
                        return *this;
206,307✔
674
                }
675
                if (rhs.iszero()) return *this;
17,707,031✔
676

677
                // arithmetic operation
678
                blocktriple<fbits, BlockTripleOperator::ADD, bt> a, b, sum;
17,374,994✔
679

680
                // transform the inputs into (sign,scale,significant) 
681
                normalizeAddition(a); 
17,374,994✔
682
                rhs.normalizeAddition(b);
17,374,994✔
683
                sum.add(a, b);
17,374,994✔
684

685
                convert(sum, *this);
17,374,994✔
686

687
                return *this;
17,374,994✔
688
        }
689
        cfloat& operator+=(double rhs) CFLOAT_EXCEPT {
690
                return *this += cfloat(rhs);
691
        }
692
        cfloat& operator-=(const cfloat& rhs) CFLOAT_EXCEPT {
8,469,305✔
693
                if constexpr (_trace_sub) std::cout << "---------------------- SUB -------------------" << std::endl;
694
                if (rhs.isnan()) 
8,469,305✔
695
                        return *this += rhs;
170,204✔
696
                else 
697
                        return *this += -rhs;
8,299,101✔
698
        }
699
        cfloat& operator-=(double rhs) CFLOAT_EXCEPT {
8✔
700
                return *this -= cfloat(rhs);
8✔
701
        }
702
        cfloat& operator*=(const cfloat& rhs) CFLOAT_EXCEPT {
4,312,845✔
703
                if constexpr (_trace_mul) std::cout << "---------------------- MUL -------------------\n";
322✔
704
                // special case handling of the inputs
705
#if CFLOAT_THROW_ARITHMETIC_EXCEPTION
706
                if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
612✔
707
                        throw cfloat_operand_is_nan{};
×
708
                }
709
#else
710
                if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
4,312,233✔
711
                        setnan(NAN_TYPE_SIGNALLING);
124,657✔
712
                        return *this;
124,657✔
713
                }
714
                if (isnan(NAN_TYPE_QUIET) || rhs.isnan(NAN_TYPE_QUIET)) {
4,187,576✔
715
                        setnan(NAN_TYPE_QUIET);
113,355✔
716
                        return *this;
113,355✔
717
                }
718
#endif
719
                //  inf * inf = inf
720
                //  inf * -inf = -inf
721
                // -inf * inf = -inf
722
                // -inf * -inf = inf
723
                //        0 * inf = -nan(ind)
724
                //        inf * 0 = -nan(ind)
725
                bool resultSign = sign() != rhs.sign();
4,074,833✔
726
                if (isinf()) {
4,074,833✔
727
                        if (rhs.iszero()) {
24,457✔
728
                                setnan(NAN_TYPE_QUIET);
1,591✔
729
                        }
730
                        else {
731
                                setsign(resultSign);
22,866✔
732
                        }
733
                        return *this;
24,457✔
734
                }
735
                if (rhs.isinf()) {
4,050,376✔
736
                        if (iszero()) {
25,816✔
737
                                setnan(NAN_TYPE_QUIET);
2,969✔
738
                        }
739
                        else {
740
                                setinf(resultSign);
22,847✔
741
                        }
742
                        return *this;
25,816✔
743
                }
744

745
                if (iszero() || rhs.iszero()) {                        
4,024,560✔
746
                        setzero();
1,876,393✔
747
                        setsign(resultSign); // deal with negative 0
1,876,393✔
748
                        return *this;
1,876,393✔
749
                }
750

751
                // arithmetic operation
752
                blocktriple<fbits, BlockTripleOperator::MUL, bt> a, b, product;
2,148,167✔
753

754
                // transform the inputs into (sign,scale,significant) 
755
                // triples of the correct width
756
                normalizeMultiplication(a);
2,148,167✔
757
                rhs.normalizeMultiplication(b);
2,148,167✔
758
                product.mul(a, b);
2,148,167✔
759
                convert(product, *this);
2,148,167✔
760

761
                if constexpr (_trace_mul) std::cout << to_binary(a) << " : " << a << " *\n" << to_binary(b) << " : " << b << " =\n" << to_binary(product) << " : " << product << '\n';
106✔
762

763
                return *this;
2,148,167✔
764
        }
765
        cfloat& operator*=(double rhs) CFLOAT_EXCEPT {
40✔
766
                return *this *= cfloat(rhs);
40✔
767
        }
768
        cfloat& operator/=(const cfloat& rhs) CFLOAT_EXCEPT {
5,139,192✔
769
                if constexpr (_trace_div) std::cout << "---------------------- DIV -------------------" << std::endl;
770

771
                // special case handling of the inputs
772
                // qnan / qnan = qnan
773
                // qnan / snan = qnan
774
                // snan / qnan = snan
775
                // snan / snan = snan
776
#if CFLOAT_THROW_ARITHMETIC_EXCEPTION
777
                if (rhs.iszero()) throw cfloat_divide_by_zero();
49✔
778
                if (rhs.isnan()) throw cfloat_divide_by_nan();
48✔
779
                if (isnan()) throw cfloat_operand_is_nan();
48✔
780
#else
781
                if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
5,139,143✔
782
                        setnan(NAN_TYPE_SIGNALLING);
164,428✔
783
                        return *this;
164,428✔
784
                }
785
                if (isnan(NAN_TYPE_QUIET) || rhs.isnan(NAN_TYPE_QUIET)) {
4,974,715✔
786
                        setnan(NAN_TYPE_QUIET);
151,601✔
787
                        return *this;
151,601✔
788
                }
789
                if (rhs.iszero()) {
4,823,114✔
790
                        if (iszero()) {
169,760✔
791
                                // zero divide by zero yields quiet NaN (in MSVC it is labeled -nan(ind) for indeterminate)
792
                                setnan(NAN_TYPE_QUIET);
29,299✔
793
                        }
794
                        else {
795
                                // non-zero divide by zero yields INF
796
                                bool resultSign = sign() != rhs.sign();
140,461✔
797
                                setinf(resultSign);
140,461✔
798
                        }
799
                        return *this;
169,760✔
800
                }
801
#endif
802
                //  inf /  inf = -nan(ind)
803
                //  inf / -inf = -nan(ind)
804
                // -inf /  inf = -nan(ind)
805
                // -inf / -inf = -nan(ind)
806
                //        1.0 /  inf = 0
807
                bool resultSign = sign() != rhs.sign();
4,653,402✔
808
                if (isinf()) {
4,653,402✔
809
                        if (rhs.isinf()) {
31,087✔
810
                                // inf divide by inf yields quiet NaN (in MSVC it is labeled -nan(ind) for indeterminate)
811
                                setnan(NAN_TYPE_QUIET);
559✔
812
                                return *this;
559✔
813
                        }
814
                        else {
815
                                // we stay an infinite but may change sign
816
                                setsign(resultSign);
30,528✔
817
                                return *this;
30,528✔
818
                        }
819
                }
820
                else {
821
                        if (rhs.isinf()) {
4,622,315✔
822
                                setzero();
32,833✔
823
                                setsign(resultSign);
32,833✔
824
                                return *this;
32,833✔
825
                        }
826
                }
827

828
                if (iszero()) {
4,589,482✔
829
                        setzero();
139,452✔
830
                        setsign(resultSign); // deal with negative 0
139,452✔
831
                        return *this;
139,452✔
832
                }
833

834
                // arithmetic operation
835
                using BlockTriple = blocktriple<fbits, BlockTripleOperator::DIV, bt>;
836
                BlockTriple a, b, quotient;
4,450,030✔
837

838
                // transform the inputs into (sign,scale,significant) 
839
                // triples of the correct width
840
                normalizeDivision(a);
4,450,030✔
841
                rhs.normalizeDivision(b);
4,450,030✔
842
                quotient.div(a, b);
4,450,030✔
843
                quotient.setradix(BlockTriple::radix);
4,450,030✔
844
                convert(quotient, *this);
4,450,030✔
845

846
                if constexpr (_trace_div) std::cout << to_binary(a) << " : " << a << " /\n" << to_binary(b) << " : " << b << " =\n" << to_binary(quotient) << " : " << quotient << '\n';
847

848
                return *this;
4,450,030✔
849
        }
850
        cfloat& operator/=(double rhs) CFLOAT_EXCEPT {
851
                return *this /= cfloat(rhs);
852
        }
853
        cfloat& reciprocal() CFLOAT_EXCEPT {
854
                cfloat c = 1.0 / *this;
855
                return *this = c;
856
        }
857
        /// <summary>
858
        /// move to the next bit encoding modulo 2^nbits
859
        /// </summary>
860
        /// <typeparam name="bt"></typeparam>
861
        cfloat& operator++() {
4,598,635✔
862
                if constexpr (0 == nrBlocks) {
863
                        return *this;
864
                }
865
                else if constexpr (1 == nrBlocks) {
866
                        if (sign()) {
864,255✔
867
                                if (_block[MSU] == (SIGN_BIT_MASK | 1ul)) { // pattern: 1.00.001 = minneg
467✔
868
                                        _block[MSU] = 0; // pattern: 0.00.000 = +0 
7✔
869
                                }
870
                                else {
871
                                        --_block[MSU];
460✔
872
                                }
873
                                if constexpr (!hasSubnormals) {
874
                                        if (isdenormal()) {
198✔
875
                                                // special case, we need to jump past all the subnormal value encodings which puts us on 0
876
                                                _block[MSU] = 0; // pattern: 0.00.000 = +0
3✔
877
                                        }
878
                                }
879
                        }
880
                        else {
881
                                if constexpr (!hasSubnormals) {
882
                                        if (_block[MSU] == 0) {
384,649✔
883
                                                // special case, we need to jump past all the subnormal value encodings minus 1
884
                                                setfraction(0xFFFF'FFFF'FFFF'FFFFull);
4✔
885
                                        }
886
                                }
887
                                if ((_block[MSU] & (MSU_MASK >> 1)) == (MSU_MASK >> 1)) { // pattern: 0.11.111 = nan
863,788✔
888
                                        _block[MSU] |= SIGN_BIT_MASK; // pattern: 1.11.111 = snan : wrap to the other side of the encoding
×
889
                                }
890
                                else {
891
                                        ++_block[MSU];
863,788✔
892
                                }
893
                        }
894
                }
895
                else {
896
                        if (sign()) {
3,734,380✔
897
                                // special case: pattern: 1.00.001 = minneg transitions to pattern: 0.00.000 = +0 
898
                                if (isminnegencoding()) {
66,947✔
899
                                        setzero();
8✔
900
                                }
901
                                else {
902
                                        //  1111 0000
903
                                        //  1110 1111
904
                                        bool borrow = true;
66,939✔
905
                                        for (unsigned i = 0; i < MSU; ++i) {
199,415✔
906
                                                if (borrow) {
132,476✔
907
                                                        if ((_block[i] & storageMask) == 0) { // block will underflow
67,197✔
908
                                                                --_block[i];
261✔
909
                                                                borrow = true;
261✔
910
                                                        }
911
                                                        else {
912
                                                                --_block[i];
66,936✔
913
                                                                borrow = false;
66,936✔
914
                                                        }
915
                                                }
916
                                        }
917
                                        if (borrow) {
66,939✔
918
                                                --_block[MSU];
3✔
919
                                        }
920
                                        if constexpr (!hasSubnormals) {
921
                                                if (isdenormal()) {
385✔
922
                                                        // special case, we need to jump past all the subnormal value encodings which puts us on 0
923
                                                        setzero(); // pattern: 0.00.000 = +0
3✔
924
                                                }
925
                                        }
926
                                }
927
                        }
928
                        else {
929
                                // special case: pattern: 0.11.111 = nan transitions to pattern: 1.11.111 = snan 
930
                                if (isnanencoding()) {
3,667,433✔
931
                                        setnan(NAN_TYPE_SIGNALLING);
×
932
                                }
933
                                else {
934
                                        if constexpr (!hasSubnormals) {
935
                                                if (iszero()) {
1,720,661✔
936
                                                        // special case, we need to jump past all the subnormal value encodings minus 1 so that the increment code below ends up on normal minpos
937
                                                        setfraction(0xFFFF'FFFF'FFFF'FFFFull);
3✔
938
                                                }
939
                                        }
940
                                        bool carry = true;
3,667,433✔
941
                                        for (unsigned i = 0; i < MSU; ++i) {
7,400,772✔
942
                                                if (carry) {
3,733,339✔
943
                                                        if ((_block[i] & storageMask) == storageMask) { // block will overflow
3,667,688✔
944
                                                                _block[i] = 0;
256✔
945
                                                                carry = true;
256✔
946
                                                        }
947
                                                        else {
948
                                                                ++_block[i];
3,667,432✔
949
                                                                carry = false;
3,667,432✔
950
                                                        }
951
                                                }
952
                                        }
953
                                        if (carry) {
3,667,433✔
954
                                                ++_block[MSU];
1✔
955
                                        }
956
                                }
957
                        }
958
                }
959
                return *this;
4,598,635✔
960
        }
961
        cfloat operator++(int) {
134,800✔
962
                cfloat tmp(*this);
134,800✔
963
                operator++();
134,800✔
964
                return tmp;
134,800✔
965
        }
966
        cfloat& operator--() {
5,014,100✔
967
                if constexpr (0 == nrBlocks) {
968
                        return *this;
969
                }
970
                else if constexpr (1 == nrBlocks) {
971
                        if (sign()) {
942,793✔
972
                                ++_block[MSU];
942,299✔
973
                        }
974
                        else {
975
                                // positive range
976
                                if (_block[MSU] == 0) { // pattern: 0.00.000 = 0
494✔
977
                                        if constexpr (hasSubnormals) {
978
                                                _block[MSU] |= SIGN_BIT_MASK | bt(1u); // pattern: 1.00.001 = minneg 
8✔
979
                                        }
980
                                        else {
981
                                                // special case, we need to jump past all the subnormal value encodings
982
                                                setfraction(0xFFFF'FFFF'FFFF'FFFFull); // set to 0.00.11...11
3✔
983
                                                ++_block[MSU]; // increment into 0.01.0000
3✔
984
                                                _block[MSU] |= SIGN_BIT_MASK; // set to 1.01.0000
3✔
985
                                        }
986
                                }
987
                                else {
988
                                        --_block[MSU];
483✔
989
                                }
990
                                if constexpr (!hasSubnormals) {
991
                                        if (isdenormal()) {
211✔
992
                                                // special case, we need to jump past all the subnormal value encodings which puts us on 0
993
                                                _block[MSU] = 0; // pattern: 0.00.000 = +0
3✔
994
                                        }
995
                                }
996
                        }
997
                }
998
                else {
999
                        if (sign()) {
4,071,307✔
1000
                                bool carry = true;
4,004,349✔
1001
                                for (unsigned i = 0; i < MSU; ++i) {
8,074,235✔
1002
                                        if (carry) {
4,069,886✔
1003
                                                if ((_block[i] & storageMask) == storageMask) { // block will overflow
4,004,604✔
1004
                                                        _block[i] = 0;
256✔
1005
                                                        carry = true;
256✔
1006
                                                }
1007
                                                else {
1008
                                                        ++_block[i];
4,004,348✔
1009
                                                        carry = false;
4,004,348✔
1010
                                                }
1011
                                        }
1012
                                }
1013
                                if (carry) {
4,004,349✔
1014
                                        ++_block[MSU];
1✔
1015
                                }
1016
                        }
1017
                        else {
1018
                                // special case: pattern: 0.00.000 = +0 transitions to pattern: 1.00.001 = minneg
1019
                                if (iszeroencoding()) {
66,958✔
1020
                                        if constexpr (hasSubnormals) {
1021
                                                setsign(true);
8✔
1022
                                                setbit(0, true);
8✔
1023
                                        }
1024
                                        else {
1025
                                                // special case, we need to jump past all the subnormal value encodings 1.01.0000 = minneg normal
1026
                                                setexponent(1 - EXP_BIAS);
2✔
1027
                                                setsign(true);
2✔
1028
                                        }
1029
                                }
1030
                                else {
1031
                                        bool borrow = true;
66,948✔
1032
                                        for (unsigned i = 0; i < MSU; ++i) {
199,441✔
1033
                                                if (borrow) {
132,493✔
1034
                                                        if ((_block[i] & storageMask) == 0) { // block will underflow
67,209✔
1035
                                                                --_block[i];
266✔
1036
                                                                borrow = true;
266✔
1037
                                                        }
1038
                                                        else {
1039
                                                                --_block[i];
66,943✔
1040
                                                                borrow = false;
66,943✔
1041
                                                        }
1042
                                                }
1043
                                        }
1044
                                        if (borrow) {
66,948✔
1045
                                                --_block[MSU];
5✔
1046
                                        }
1047
                                        if constexpr (!hasSubnormals) {
1048
                                                if (isdenormal()) {
384✔
1049
                                                        // special case, we need to jump past all the subnormal value encodings which puts us on 0
1050
                                                        setzero(); // pattern: 0.00.000 = +0
2✔
1051
                                                }
1052
                                        }
1053
                                }
1054
                        }
1055
                }
1056
                return *this;
5,014,100✔
1057
        }
1058
        cfloat operator--(int) {
134,812✔
1059
                cfloat tmp(*this);
134,812✔
1060
                operator--();
134,812✔
1061
                return tmp;
134,812✔
1062
        }
1063

1064
        // modifiers        
1065
        constexpr void clear() noexcept {
143,464,444✔
1066
                if constexpr (0 == nrBlocks) {
1067
                        return;
1068
                }
1069
                else if constexpr (1 == nrBlocks) {
1070
                        _block[0] = bt(0);
41,299,738✔
1071
                }
1072
                else if constexpr (2 == nrBlocks) {
1073
                        _block[0] = bt(0);
48,057,462✔
1074
                        _block[1] = bt(0);
48,057,462✔
1075
                }
1076
                else if constexpr (3 == nrBlocks) {
1077
                        _block[0] = bt(0);
53,973,313✔
1078
                        _block[1] = bt(0);
53,973,313✔
1079
                        _block[2] = bt(0);
53,973,313✔
1080
                }
1081
                else if constexpr (4 == nrBlocks) {
1082
                        _block[0] = bt(0);
43,247✔
1083
                        _block[1] = bt(0);
43,247✔
1084
                        _block[2] = bt(0);
43,247✔
1085
                        _block[3] = bt(0);
43,247✔
1086
                }
1087
                else if constexpr (5 == nrBlocks) {
1088
                        _block[0] = bt(0);
29,999✔
1089
                        _block[1] = bt(0);
29,999✔
1090
                        _block[2] = bt(0);
29,999✔
1091
                        _block[3] = bt(0);
29,999✔
1092
                        _block[4] = bt(0);
29,999✔
1093
                }
1094
                else if constexpr (6 == nrBlocks) {
1095
                        _block[0] = bt(0);
10,016✔
1096
                        _block[1] = bt(0);
10,016✔
1097
                        _block[2] = bt(0);
10,016✔
1098
                        _block[3] = bt(0);
10,016✔
1099
                        _block[4] = bt(0);
10,016✔
1100
                        _block[5] = bt(0);
10,016✔
1101
                }
1102
                else if constexpr (7 == nrBlocks) {
1103
                        _block[0] = bt(0);
9,999✔
1104
                        _block[1] = bt(0);
9,999✔
1105
                        _block[2] = bt(0);
9,999✔
1106
                        _block[3] = bt(0);
9,999✔
1107
                        _block[4] = bt(0);
9,999✔
1108
                        _block[5] = bt(0);
9,999✔
1109
                        _block[6] = bt(0);
9,999✔
1110
                }
1111
                else if constexpr (8 == nrBlocks) {
1112
                        _block[0] = bt(0);
20,323✔
1113
                        _block[1] = bt(0);
20,323✔
1114
                        _block[2] = bt(0);
20,323✔
1115
                        _block[3] = bt(0);
20,323✔
1116
                        _block[4] = bt(0);
20,323✔
1117
                        _block[5] = bt(0);
20,323✔
1118
                        _block[6] = bt(0);
20,323✔
1119
                        _block[7] = bt(0);
20,323✔
1120
                }
1121
                else {
1122
                        for (unsigned i = 0; i < nrBlocks; ++i) {
224,339✔
1123
                                _block[i] = bt(0);
203,992✔
1124
                        }
1125
                }
1126
        }
143,464,444✔
1127
        constexpr void setzero() noexcept { clear(); }
2,696,841✔
1128
        constexpr void setinf(bool sign = true) noexcept {
6,557,300✔
1129
                // the Inf encoding is the pattern 0b0'11...11'11...10 for a +inf, and 0b1'11...11'11...110 for a -inf
1130
                if constexpr (0 == nrBlocks) {
1131
                        return;
1132
                }
1133
                else if constexpr (1 == nrBlocks) {
1134
                        _block[MSU] = sign ? bt(MSU_MASK ^ LSB_BIT_MASK) : bt(~SIGN_BIT_MASK & (MSU_MASK ^ LSB_BIT_MASK));
2,200,830✔
1135
                }
1136
                else if constexpr (2 == nrBlocks) {
1137
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
1,859,556✔
1138
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
1,859,556✔
1139
                }
1140
                else if constexpr (3 == nrBlocks) {
1141
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
2,493,907✔
1142
                        _block[1] = BLOCK_MASK;
2,493,907✔
1143
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
2,493,907✔
1144
                }
1145
                else if constexpr (4 == nrBlocks) {
1146
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
2,710✔
1147
                        _block[1] = BLOCK_MASK;
2,710✔
1148
                        _block[2] = BLOCK_MASK;
2,710✔
1149
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
2,710✔
1150
                }
1151
                else if constexpr (5 == nrBlocks) {
1152
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
88✔
1153
                        _block[1] = BLOCK_MASK;
88✔
1154
                        _block[2] = BLOCK_MASK;
88✔
1155
                        _block[3] = BLOCK_MASK;
88✔
1156
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
88✔
1157
                }
1158
                else if constexpr (6 == nrBlocks) {
1159
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
42✔
1160
                        _block[1] = BLOCK_MASK;
42✔
1161
                        _block[2] = BLOCK_MASK;
42✔
1162
                        _block[3] = BLOCK_MASK;
42✔
1163
                        _block[4] = BLOCK_MASK;
42✔
1164
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
42✔
1165
                }
1166
                else if constexpr (7 == nrBlocks) {
1167
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
42✔
1168
                        _block[1] = BLOCK_MASK;
42✔
1169
                        _block[2] = BLOCK_MASK;
42✔
1170
                        _block[3] = BLOCK_MASK;
42✔
1171
                        _block[4] = BLOCK_MASK;
42✔
1172
                        _block[5] = BLOCK_MASK;
42✔
1173
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
42✔
1174
                }
1175
                else if constexpr (8 == nrBlocks) {
1176
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
81✔
1177
                        _block[1] = BLOCK_MASK;
81✔
1178
                        _block[2] = BLOCK_MASK;
81✔
1179
                        _block[3] = BLOCK_MASK;
81✔
1180
                        _block[4] = BLOCK_MASK;
81✔
1181
                        _block[5] = BLOCK_MASK;
81✔
1182
                        _block[6] = BLOCK_MASK;
81✔
1183
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
81✔
1184
                }
1185
                else {
1186
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
44✔
1187
                        for (unsigned i = 1; i < nrBlocks - 1; ++i) {
396✔
1188
                                _block[i] = BLOCK_MASK;
352✔
1189
                        }
1190
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
44✔
1191
                }        
1192
        }
6,557,300✔
1193
        constexpr void setnan(int NaNType = NAN_TYPE_SIGNALLING) noexcept {
7,120,941✔
1194
                // the NaN encoding is the pattern 0b0'11...11'11...11 for a quiet Nan, and 0b1'11...11'11...111 for a signalling NaN
1195
                if constexpr (0 == nrBlocks) {
1196
                        return;
1197
                }
1198
                else if constexpr (1 == nrBlocks) {
1199
                        // fall through
1200
                }
1201
                else if constexpr (2 == nrBlocks) {
1202
                        _block[0] = BLOCK_MASK;
1,994,742✔
1203
                }
1204
                else if constexpr (3 == nrBlocks) {
1205
                        _block[0] = BLOCK_MASK;
1,048,706✔
1206
                        _block[1] = BLOCK_MASK;
1,048,706✔
1207
                }
1208
                else if constexpr (4 == nrBlocks) {
1209
                        _block[0] = BLOCK_MASK;
29✔
1210
                        _block[1] = BLOCK_MASK;
29✔
1211
                        _block[2] = BLOCK_MASK;
29✔
1212
                }
1213
                else if constexpr (5 == nrBlocks) {
1214
                        _block[0] = BLOCK_MASK;
10✔
1215
                        _block[1] = BLOCK_MASK;
10✔
1216
                        _block[2] = BLOCK_MASK;
10✔
1217
                        _block[3] = BLOCK_MASK;
10✔
1218
                }
1219
                else if constexpr (6 == nrBlocks) {
1220
                        _block[0] = BLOCK_MASK;
4✔
1221
                        _block[1] = BLOCK_MASK;
4✔
1222
                        _block[2] = BLOCK_MASK;
4✔
1223
                        _block[3] = BLOCK_MASK;
4✔
1224
                        _block[4] = BLOCK_MASK;
4✔
1225
                }
1226
                else if constexpr (7 == nrBlocks) {
1227
                        _block[0] = BLOCK_MASK;
4✔
1228
                        _block[1] = BLOCK_MASK;
4✔
1229
                        _block[2] = BLOCK_MASK;
4✔
1230
                        _block[3] = BLOCK_MASK;
4✔
1231
                        _block[4] = BLOCK_MASK;
4✔
1232
                        _block[5] = BLOCK_MASK;
4✔
1233
                }
1234
                else if constexpr (8 == nrBlocks) {
1235
                        _block[0] = BLOCK_MASK;
4✔
1236
                        _block[1] = BLOCK_MASK;
4✔
1237
                        _block[2] = BLOCK_MASK;
4✔
1238
                        _block[3] = BLOCK_MASK;
4✔
1239
                        _block[4] = BLOCK_MASK;
4✔
1240
                        _block[5] = BLOCK_MASK;
4✔
1241
                        _block[6] = BLOCK_MASK;
4✔
1242
                }
1243
                else {
1244
                        for (unsigned i = 0; i < nrBlocks - 1; ++i) {
60✔
1245
                                _block[i] = BLOCK_MASK;
54✔
1246
                        }
1247
                }
1248
                _block[MSU] = NaNType == NAN_TYPE_SIGNALLING ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
7,120,941✔
1249
        }
7,120,941✔
1250
        constexpr void setsign(bool sign = true) {
3,538,655✔
1251
                if (sign) {
3,538,655✔
1252
                        _block[MSU] |= SIGN_BIT_MASK;
632,554✔
1253
                }
1254
                else {
1255
                        _block[MSU] &= ~SIGN_BIT_MASK;
2,906,101✔
1256
                }
1257
        }
3,538,655✔
1258
        constexpr bool setexponent(int scale) {
590,493✔
1259
                if (scale < MIN_EXP_SUBNORMAL || scale > MAX_EXP) return false; // this scale cannot be represented
590,493✔
1260
                if constexpr (nbits < 65) {
1261
                        uint32_t exponentBits = static_cast<uint32_t>(scale + EXP_BIAS);
588,952✔
1262
                        if (scale >= MIN_EXP_SUBNORMAL && scale < MIN_EXP_NORMAL) {
588,952✔
1263
                                // setexponent() only writes the encoding field. In the subnormal interval the true scale is carried
1264
                                // by a zero exponent field plus a left-shifted fraction, so callers must have already denormalized
1265
                                // the significand before asking for this exponent pattern.
1266
                                // we are a subnormal number: all exponent bits are 0
1267
                                exponentBits = 0;
45✔
1268
                        }
1269
                        // TODO: optimize
1270
                        uint32_t mask = (1ul << (es - 1));
588,952✔
1271
                        for (unsigned i = nbits - 2; i > nbits - 2 - es; --i) {
5,083,834✔
1272
                                setbit(i, (mask & exponentBits));
4,494,882✔
1273
                                mask >>= 1;
4,494,882✔
1274
                        }
1275
                }
1276
                else {
1277
                        uint32_t exponentBits = static_cast<uint32_t>(scale + EXP_BIAS);
1,541✔
1278
                        uint32_t mask = (1ul << (es - 1));
1,541✔
1279
                        for (unsigned i = nbits - 2; i > nbits - 2 - es; --i) {
25,156✔
1280
                                setbit(i, (mask & exponentBits));
23,615✔
1281
                                mask >>= 1;
23,615✔
1282
                        }
1283
                }
1284
                return true;
590,493✔
1285
        }
1286
        constexpr void setfraction(uint64_t raw_bits) {
610✔
1287
                // unoptimized as it is not meant to be an end-user API, it is a test API
1288
                // raw_bits is uint64_t so can have at most 64 bits of fraction data
1289
                constexpr unsigned bitsToSet = (fbits < 64) ? fbits : 64;
610✔
1290
                uint64_t mask{ 1ull };
610✔
1291
                for (unsigned i = 0; i < bitsToSet; ++i) {
24,744✔
1292
                        setbit(i, (mask & raw_bits));
24,134✔
1293
                        mask <<= 1;
24,134✔
1294
                }
1295
        }
610✔
1296
        constexpr void setbit(unsigned i, bool v = true) noexcept {
8,381,836✔
1297
                unsigned blockIndex = i / bitsInBlock;
8,381,836✔
1298
                if (blockIndex < nrBlocks) {
8,381,836✔
1299
                        bt block = _block[blockIndex];
8,381,836✔
1300
                        bt null = ~(1ull << (i % bitsInBlock));
8,381,836✔
1301
                        bt bit = bt(v ? 1 : 0);
8,381,836✔
1302
                        bt mask = bt(bit << (i % bitsInBlock));
8,381,836✔
1303
                        _block[blockIndex] = bt((block & null) | mask);
8,381,836✔
1304
                }
1305
        }
8,381,836✔
1306
        constexpr cfloat& setbits(uint64_t raw_bits) noexcept {
306,602,147✔
1307
                if constexpr (0 == nrBlocks) {
1308
                        return *this;
1309
                }
1310
                else if constexpr (1 == nrBlocks) {
1311
                        _block[0] = raw_bits & storageMask;
89,235,969✔
1312
                }
1313
                else if constexpr (2 == nrBlocks) {
1314
                        if constexpr (bitsInBlock < 64) {
1315
                                _block[0] = raw_bits & storageMask;
103,248,488✔
1316
                                raw_bits >>= bitsInBlock;
103,248,488✔
1317
                                _block[1] = raw_bits & storageMask;
103,248,488✔
1318
                        }
1319
                        else {
1320
                                _block[0] = raw_bits & storageMask;
1321
                                _block[1] = 0;
1322
                        }
1323
                }
1324
                else if constexpr (3 == nrBlocks) {
1325
                        if constexpr (bitsInBlock < 64) {
1326
                                _block[0] = raw_bits & storageMask;
113,957,350✔
1327
                                raw_bits >>= bitsInBlock;
113,957,350✔
1328
                                _block[1] = raw_bits & storageMask;
113,957,350✔
1329
                                raw_bits >>= bitsInBlock;
113,957,350✔
1330
                                _block[2] = raw_bits & storageMask;
113,957,350✔
1331
                        }
1332
                        else {
1333
                                _block[0] = raw_bits & storageMask;
1334
                                _block[1] = 0;
1335
                                _block[2] = 0;
1336
                        }
1337
                }
1338
                else if constexpr (4 == nrBlocks) {
1339
                        if constexpr (bitsInBlock < 64) {
1340
                                _block[0] = raw_bits & storageMask;
70,352✔
1341
                                raw_bits >>= bitsInBlock;
70,352✔
1342
                                _block[1] = raw_bits & storageMask;
70,352✔
1343
                                raw_bits >>= bitsInBlock;
70,352✔
1344
                                _block[2] = raw_bits & storageMask;
70,352✔
1345
                                raw_bits >>= bitsInBlock;
70,352✔
1346
                                _block[3] = raw_bits & storageMask;
70,352✔
1347
                        }
1348
                        else {
1349
                                _block[0] = raw_bits & storageMask;
1350
                                _block[1] = 0;
1351
                                _block[2] = 0;
1352
                                _block[3] = 0;
1353
                        }
1354
                }
1355
                else {
1356
                        if constexpr (bitsInBlock < 64) {
1357
                                for (unsigned i = 0; i < nrBlocks; ++i) {
730,525✔
1358
                                        _block[i] = raw_bits & storageMask;
640,537✔
1359
                                        raw_bits >>= bitsInBlock;
640,537✔
1360
                                }
1361
                        }
1362
                        else {
1363
                                _block[0] = raw_bits & storageMask;
1364
                                for (unsigned i = 1; i < nrBlocks; ++i) {
1365
                                        _block[i] = 0;
1366
                                }
1367
                        }
1368
                }
1369
                _block[MSU] &= MSU_MASK; // enforce precondition for fast comparison by properly nulling bits that are outside of nbits
306,602,147✔
1370
                return *this;
306,602,147✔
1371
        }
1372
        constexpr void setblock(unsigned b, bt data) noexcept {
542,373✔
1373
                if (b < nrBlocks) _block[b] = data;
542,373✔
1374
        }
542,373✔
1375
        
1376
        // create specific number system values of interest
1377
        constexpr cfloat& maxpos() noexcept {
936,032✔
1378
                if constexpr (isSaturating) {
1379
                        // in a saturating encoding with max-exponent values we are removing the Inf encoding pattern 0b0'11...11'11...10 for a +inf, 
1380
                        // and 0b1'11...11'11...110 for a -inf and using it as a value
1381
                        if constexpr (hasMaxExpValues) {
1382
                                // maximum positive value has this bit pattern: 0-1...1-111...110, that is, sign = 0, e = 11..11, f = 111...110
1383
                                clear();
935,334✔
1384
                                flip();
935,334✔
1385
                                setbit(nbits - 1ull, false); // sign = 0
935,334✔
1386
                                setbit(0ull, false); // bit0 = 0
935,334✔
1387
                        }
1388
                        else {
1389
                                // maximum positive value has this bit pattern: 0-11...10-111...111, that is, sign = 0, e = 11..10, f = 111...111
1390
                                clear();
413✔
1391
                                flip();
413✔
1392
                                setbit(fbits, false); // set least significant exponent bit to 0
413✔
1393
                                setbit(nbits - 1ull, false); // set sign to 0
413✔
1394
                        }
1395
                }
1396
                else {
1397
                        // the Inf encoding is the pattern 0b0'11...11'11...10 for a +inf, and 0b1'11...11'11...110 for a -inf
1398
                        // the maxpos is the encoding before that
1399
                        if constexpr (hasMaxExpValues) {
1400
                                // maximum positive value has this bit pattern: 0-1...1-111...101, that is, sign = 0, e = 11..11, f = 111...101
1401
                                clear();
113✔
1402
                                flip();
113✔
1403
                                setbit(nbits - 1ull, false); // sign = 0
113✔
1404
                                setbit(1ull, false); // bit1 = 0
113✔
1405
                        }
1406
                        else {
1407
                                // maximum positive value has this bit pattern: 0-1...0-111...111, that is, sign = 0, e = 11..10, f = 111...111
1408
                                clear();
172✔
1409
                                flip();
172✔
1410
                                setbit(fbits, false); // set least significant exponent bit to 0
172✔
1411
                                setbit(nbits - 1ull, false); // set sign to 0
172✔
1412
                        }
1413
                }
1414
                return *this;
936,032✔
1415
        }
1416
        constexpr cfloat& minpos() noexcept {
13,290✔
1417
                // minpos encoding is not impacted by saturating encodings, which only affects maxpos and inf
1418
                if constexpr (hasSubnormals) {
1419
                        // minimum positive value has this bit pattern: 0-000-00...01, that is, sign = 0, e = 000, f = 00001
1420
                        clear();
327✔
1421
                        setbit(0);
327✔
1422
                }
1423
                else {
1424
                        // minimum positive value has this bit pattern: 0-001-00...0, that is, sign = 0, e = 001, f = 0000
1425
                        clear();
12,963✔
1426
                        setbit(fbits);
12,963✔
1427
                }
1428
                return *this;
13,290✔
1429
        }
1430
        constexpr cfloat& zero() noexcept {
1✔
1431
                // the zero value
1432
                clear();
1✔
1433
                return *this;
1✔
1434
        }
1435
        constexpr cfloat& minneg() noexcept {
14✔
1436
                // minneg encoding is not impacted by saturating encodings, which only affects maxpos and inf
1437
                if constexpr (hasSubnormals) {
1438
                        // minimum negative value has this bit pattern: 1-000-00...01, that is, sign = 1, e = 00, f = 00001
1439
                        clear();
8✔
1440
                        setbit(nbits - 1ull);
8✔
1441
                        setbit(0);
8✔
1442
                }
1443
                else {
1444
                        // minimum negative value has this bit pattern: 1-001-00...0, that is, sign = 1, e = 001, f = 0000
1445
                        clear();
6✔
1446
                        setbit(fbits);
6✔
1447
                        setbit(nbits - 1ull);
6✔
1448
                }
1449
                return *this;
14✔
1450
        }
1451
        constexpr cfloat& maxneg() noexcept {
935,702✔
1452
                if constexpr (isSaturating) {
1453
                        // in a saturating encoding with max-exponent values we are removing the Inf encoding pattern 0b0'11...11'11...10 for a +inf, 
1454
                        // and 0b1'11...11'11...110 for a -inf and using it as a value
1455
                        if constexpr (hasMaxExpValues) {
1456
                                // maximum negative value has this bit pattern: 1-1...1-111...110, that is, sign = 1, e = 1..1, f = 111...110
1457
                                clear();
935,240✔
1458
                                flip();
935,240✔
1459
                                setbit(0ull, false);
935,240✔
1460
                        }
1461
                        else {
1462
                                // maximum negative value has this bit pattern: 1-1...0-111...111, that is, sign = 1, e = 11..10, f = 111...111
1463
                                clear();
401✔
1464
                                flip();
401✔
1465
                                setbit(fbits, false);
401✔
1466
                        }
1467
                }
1468
                else {
1469
                        if constexpr (hasMaxExpValues) {
1470
                                // maximum negative value has this bit pattern: 1-1...1-111...101, that is, sign = 1, e = 1..1, f = 111...101
1471
                                clear();
12✔
1472
                                flip();
12✔
1473
                                setbit(1ull, false);
12✔
1474
                        }
1475
                        else {
1476
                                // maximum negative value has this bit pattern: 1-1...0-111...111, that is, sign = 1, e = 11..10, f = 111...111
1477
                                clear();
49✔
1478
                                flip();
49✔
1479
                                setbit(fbits, false);
49✔
1480
                        }
1481
                }
1482
                return *this;
935,702✔
1483
        }
1484

1485

1486
        /// <summary>
1487
        /// assign the value of the string representation to the cfloat
1488
        /// </summary>
1489
        /// <param name="stringRep">decimal scientific notation of a real number to be assigned</param>
1490
        /// <returns>reference to this cfloat</returns>
1491
        /// Clang doesn't support constexpr yet on string manipulations, so we need to make it conditional
1492
        CONSTEXPRESSION cfloat& assign(const std::string& str) noexcept {
7✔
1493
                clear();
7✔
1494
                unsigned nrChars = static_cast<unsigned>(str.size());
7✔
1495
                unsigned nrBits = 0;
7✔
1496
                unsigned nrDots = 0;
7✔
1497
                std::string bits;
7✔
1498
                if (nrChars > 2) {
7✔
1499
                        if (str[0] == '0' && str[1] == 'b') {
7✔
1500
                                for (unsigned i = 2; i < nrChars; ++i) {
211✔
1501
                                        char c = str[i];
204✔
1502
                                        switch (c) {
204✔
1503
                                        case '0':
186✔
1504
                                        case '1':
1505
                                                ++nrBits;
186✔
1506
                                                bits += c;
186✔
1507
                                                break;
186✔
1508
                                        case '.':
14✔
1509
                                                ++nrDots;
14✔
1510
                                                bits += c;
14✔
1511
                                                break;
14✔
1512
                                        case '\'':
4✔
1513
                                                // consume this delimiting character
1514
                                                break;
4✔
1515
                                        default:
×
1516
                                                std::cerr << "string contained a non-standard character: " << c << '\n';
×
1517
                                                return *this;
×
1518
                                        }
1519
                                }
1520
                        }
1521
                        else {
1522
                                std::cerr << "string must start with 0b: instead input pattern was " << str << '\n';
×
1523
                                return *this;
×
1524
                        }
1525
                }
1526
                else {
1527
                        std::cerr << "string is too short\n";
×
1528
                        return *this;
×
1529
                }
1530

1531
                if (nrBits != nbits) {
7✔
1532
                        std::cerr << "number of bits in the string is " << nrBits << " and needs to be " << nbits << '\n';
×
1533
                        return *this;
×
1534
                }
1535
                if (nrDots != 2) {
7✔
1536
                        std::cerr << "number of segment delimiters in string is " << nrDots << " and needs to be 2 for a cfloat<>\n";
×
1537
                        return *this;
×
1538
                }
1539

1540
                // assign the bits
1541
                int field{ 0 };  // three fields: sign, exponent, mantissa: fields are separated by a '.'
7✔
1542
                int nrExponentBits{ -1 };
7✔
1543
                unsigned bit = nrBits;
7✔
1544
                for (unsigned i = 0; i < bits.size(); ++i) {
207✔
1545
                        char c = bits[i];
200✔
1546
                        if (c == '.') {
200✔
1547
                                ++field;
14✔
1548
                                if (field == 2) { // just finished parsing exponent field: we can now check the number of exponent bits
14✔
1549
                                        if (nrExponentBits != es) {
7✔
1550
                                                std::cerr << "provided binary string representation does not contain " << es << " exponent bits. Found " << nrExponentBits << ". Reset to 0\n";
×
1551
                                                clear();
×
1552
                                                return *this;
×
1553
                                        }
1554
                                }
1555
                        }
1556
                        else {
1557
                                setbit(--bit, c == '1');
186✔
1558
                        }
1559
                        if (field == 1) { // exponent field
200✔
1560
                                ++nrExponentBits;
51✔
1561
                        }
1562
                }
1563
                if (field != 2) {
7✔
1564
                        std::cerr << "provided binary string did not contain three fields separated by '.': Reset to 0\n";
×
1565
                        clear();
×
1566
                        return *this;
×
1567
                }
1568
                return *this;
7✔
1569
        }
7✔
1570

1571
        // selectors
1572
        constexpr bool sign() const noexcept { return (_block[MSU] & SIGN_BIT_MASK) == SIGN_BIT_MASK; }
257,866,812✔
1573
        constexpr int  scale() const noexcept {
60,909,310✔
1574
                int e{ 0 };
60,909,310✔
1575
                if constexpr (MSU_CAPTURES_EXP) {
1576
                        e = static_cast<int>((_block[MSU] & ~SIGN_BIT_MASK) >> EXP_SHIFT);
49,257,730✔
1577
                        if (e == 0) {
49,257,730✔
1578
                                // subnormal scale is determined by fraction
1579
                                // subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1580
                                e = (2l - (1l << (es - 1ull))) - 1;
6,532,994✔
1581
                                if constexpr (nbits > 2 + es) {
1582
                                        for (unsigned i = nbits - 2ull - es; i > 0; --i) {
12,592,409✔
1583
                                                if (test(i)) break;
12,487,807✔
1584
                                                --e;
6,081,951✔
1585
                                        }
1586
                                }
1587
                        }
1588
                        else {
1589
                                e -= EXP_BIAS;
42,724,736✔
1590
                        }
1591
                }
1592
                else {
1593
                        blockbinary<es, bt> ebits{};
11,651,928✔
1594
                        exponent(ebits);
11,651,580✔
1595
                        if (ebits.iszero()) {
11,651,580✔
1596
                                // subnormal scale is determined by fraction
1597
                                // subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1598
                                e = (2l - (1l << (es - 1ull))) - 1;
1,599,670✔
1599
                                if constexpr (nbits > 2 + es) {
1600
                                        for (unsigned i = nbits - 2ull - es; i > 0; --i) {
3,160,467✔
1601
                                                if (test(i)) break;
3,153,345✔
1602
                                                --e;
1,560,808✔
1603
                                        }
1604
                                }
1605
                        }
1606
                        else {
1607
                                e = static_cast<int>(unsigned(ebits) - EXP_BIAS);
10,051,910✔
1608
                        }
1609
                }
1610
                return e;
60,909,310✔
1611
        }
1612
        constexpr bool isneg() const noexcept {
558✔
1613
                if (isnan()) return false;
558✔
1614
                return sign(); 
541✔
1615
        }
1616
        constexpr bool ispos() const noexcept { 
34,472✔
1617
                if (isnan()) return false;
34,472✔
1618
                return !sign(); 
34,472✔
1619
        }
1620
        constexpr bool iszero() const noexcept {
428,026,249✔
1621
                // NOTE: this is a very specific design that makes the decsion that
1622
                // for subnormal encodings found in a configuration that doesn't
1623
                // support them, we assume that these values map to 0.
1624
                if constexpr (hasSubnormals) {
1625
                        return iszeroencoding();
248,935,926✔
1626
                }
1627
                else { // all subnormals round to 0
1628
                        blockbinary<es, bt> ebits{};
179,090,763✔
1629
                        exponent(ebits);
179,090,323✔
1630
                        if (ebits.iszero()) return true; else return false;
179,090,323✔
1631
                }
1632
        }
1633
        constexpr bool isone() const noexcept {
1634
                // unbiased exponent = scale = 0, fraction = 0
1635
                int s = scale();
1636
                if (s == 0) {
1637
                        blockbinary<fbits, bt> f{};
1638
                        fraction(f);
1639
                        return f.iszero();
1640
                }
1641
                return false;
1642
        }
1643
        constexpr bool isinf(int InfType = INF_TYPE_EITHER) const noexcept {
379,811,030✔
1644
                // the bit pattern encoding of Inf is independent of the max-exponent value configuration
1645
                bool isNegInf = false;
379,811,030✔
1646
                bool isPosInf = false;
379,811,030✔
1647
                if constexpr (0 == nrBlocks) {
1648
                        return false;
1649
                }
1650
                else if constexpr (1 == nrBlocks) {
1651
                        isNegInf = (_block[MSU] & MSU_MASK) == (MSU_MASK ^ LSB_BIT_MASK);
167,947,426✔
1652
                        isPosInf = (_block[MSU] & MSU_MASK) == ((MSU_MASK ^ SIGN_BIT_MASK) ^ LSB_BIT_MASK);
167,947,426✔
1653
                }
1654
                else if constexpr (2 == nrBlocks) {
1655
                        bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK));
145,271,211✔
1656
                        isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
145,271,211✔
1657
                        isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
145,271,211✔
1658
                }
1659
                else if constexpr (3 == nrBlocks) {
1660
                        bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK)) && (_block[1] == BLOCK_MASK);
66,400,020✔
1661
                        isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
66,400,020✔
1662
                        isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
66,400,020✔
1663
                }
1664
                else if constexpr (4 == nrBlocks) {
1665
                        bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK)) && (_block[1] == BLOCK_MASK) && (_block[2] == BLOCK_MASK);
100,274✔
1666
                        isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
100,274✔
1667
                        isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
100,274✔
1668
                }
1669
                else {
1670
                        bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK));
92,099✔
1671
                        for (unsigned i = 1; i < nrBlocks - 1; ++i) {
93,760✔
1672
                                if (_block[i] != BLOCK_MASK) {
93,436✔
1673
                                        isInf = false;
91,775✔
1674
                                        break;
91,775✔
1675
                                }
1676
                        }
1677
                        isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
92,099✔
1678
                        isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
92,099✔
1679
                }
1680

1681
                return (InfType == INF_TYPE_EITHER ? (isNegInf || isPosInf) :
436,650,312✔
1682
                        (InfType == INF_TYPE_NEGATIVE ? isNegInf :
85,259,374✔
1683
                                (InfType == INF_TYPE_POSITIVE ? isPosInf : false)));
436,651,214✔
1684
        }
56,839,282✔
1685
        constexpr bool isnan(int NaNType = NAN_TYPE_EITHER) const noexcept {
995,610,418✔
1686
                if constexpr (hasMaxExpValues) {
1687
                        return isnanencoding(NaNType);
596,654,987✔
1688
                }
1689
                else {
1690
                        if (ismaxexpvalue()) {
398,955,431✔
1691
                                // all these max-exponent encodings are NANs, except for the encoding representing INF
1692
                                bool isNaN = isinf() ? false : true;
24,519,311✔
1693
                                bool isNegNaN = isNaN && sign();
24,519,311✔
1694
                                bool isPosNaN = isNaN && !sign();
24,519,311✔
1695
                                return (NaNType == NAN_TYPE_EITHER ? (isNaN) :
27,941,451✔
1696
                                        (NaNType == NAN_TYPE_SIGNALLING ? isNegNaN :
4,557,477✔
1697
                                                (NaNType == NAN_TYPE_QUIET ? isPosNaN : false)));
26,789,985✔
1698
                        }
1699
                        else {
1700
                                return false;
374,436,120✔
1701
                        }
1702
                }
1703
        }
24,519,311✔
1704
        // iszeroencoding returns true if it finds a pure -0 or +0 pattern and returns false otherwise
1705
        constexpr bool iszeroencoding() const noexcept {
354,872,071✔
1706
                if constexpr (0 == nrBlocks) {
1707
                        return true;
1708
                }
1709
                else if constexpr (1 == nrBlocks) {
1710
                        return (_block[MSU] & ~SIGN_BIT_MASK) == 0;
134,115,201✔
1711
                }
1712
                else if constexpr (2 == nrBlocks) {
1713
                        return (_block[0] == 0) && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
146,610,492✔
1714
                }
1715
                else if constexpr (3 == nrBlocks) {
1716
                        return (_block[0] == 0) && _block[1] == 0 && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
73,918,353✔
1717
                }
1718
                else if constexpr (4 == nrBlocks) {
1719
                        return (_block[0] == 0) && _block[1] == 0 && _block[2] == 0 && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
135,294✔
1720
                }
1721
                else {
1722
                        for (unsigned i = 0; i < nrBlocks - 1; ++i) if (_block[i] != 0) return false;
338,463✔
1723
                        return (_block[MSU] & ~SIGN_BIT_MASK) == 0;
559✔
1724
                }
1725
        }
1726
        // isminnegencoding returns true if it find the pattern 1.00.00001 and returns false otherwise
1727
        constexpr bool isminnegencoding() const noexcept {
66,947✔
1728
                if constexpr (0 == nrBlocks) {
1729
                        return false;
1730
                }
1731
                else if constexpr (1 == nrBlocks) {
1732
                        return (_block[MSU] & (SIGN_BIT_MASK | 1ul));
1733
                }
1734
                else if constexpr (2 == nrBlocks) {
1735
                        return ((_block[0] == 1ul) && (_block[1] == SIGN_BIT_MASK));
1,408✔
1736
                }
1737
                else if constexpr (3 == nrBlocks) {
1738
                        return ((_block[0] == 1ul) && (_block[1] == 0) && (_block[2] == SIGN_BIT_MASK));
65,536✔
1739
                }
1740
                else if constexpr (4 == nrBlocks) {
1741
                        return ((_block[0] == 1ul) && (_block[1] == 0) && (_block[2] == 0) && (_block[3] == SIGN_BIT_MASK));
3✔
1742
                }
1743
                else {
1744
                        if (_block[0] != 1ul) return false;
×
1745
                        for (unsigned i = 1; i < nrBlocks - 2; ++i) if (_block[i] != 0) return false;
×
1746
                        return (_block[MSU] == SIGN_BIT_MASK);
×
1747
                }
1748
        }
1749
        constexpr bool isnanencoding(int NaNType = NAN_TYPE_EITHER) const noexcept {
600,322,420✔
1750
                // the bit encoding of NaN is independent of the gradual overflow configuration
1751
                bool isNaN = true;
600,322,420✔
1752
                bool isNegNaN = false;
600,322,420✔
1753
                bool isPosNaN = false;
600,322,420✔
1754

1755
                if constexpr (0 == nrBlocks) {
1756
                        return false;
1757
                }
1758
                else if constexpr (1 == nrBlocks) {
1759
                }
1760
                else if constexpr (2 == nrBlocks) {
1761
                        isNaN = (_block[0] == BLOCK_MASK);
223,995,640✔
1762
                }
1763
                else if constexpr (3 == nrBlocks) {
1764
                        isNaN = (_block[0] == BLOCK_MASK) && (_block[1] == BLOCK_MASK);
201,348,401✔
1765
                }
1766
                else if constexpr (4 == nrBlocks) {
1767
                        isNaN = (_block[0] == BLOCK_MASK) && (_block[1] == BLOCK_MASK) && (_block[2] == BLOCK_MASK);
227,093✔
1768
                }
1769
                else {
1770
                        for (unsigned i = 0; i < nrBlocks - 1; ++i) {
270,021✔
1771
                                if (_block[i] != BLOCK_MASK) {
269,977✔
1772
                                        isNaN = false;
269,723✔
1773
                                        break;
269,723✔
1774
                                }
1775
                        }
1776
                }
1777
                isNegNaN = isNaN && ((_block[MSU] & MSU_MASK) == MSU_MASK);
600,322,420✔
1778
                isPosNaN = isNaN && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
600,322,420✔
1779

1780
                return (NaNType == NAN_TYPE_EITHER ? (isNegNaN || isPosNaN) :
816,677,683✔
1781
                        (NaNType == NAN_TYPE_SIGNALLING ? isNegNaN :
324,385,275✔
1782
                                (NaNType == NAN_TYPE_QUIET ? isPosNaN : false)));
816,382,444✔
1783
        }
216,355,263✔
1784
        // isnormal returns true if 0 or exponent bits are not all zero or one, false otherwise
1785
        constexpr bool isnormal() const noexcept {
60,696,740✔
1786
                if (iszeroencoding()) return true; // filter out the one special case
60,696,740✔
1787
                blockbinary<es, bt> e{};
60,697,087✔
1788
                exponent(e);
60,696,739✔
1789
                return !e.iszero() && !e.all();
60,696,739✔
1790
        }
1791
        // isdenormal returns true if exponent bits are all zero, false otherwise
1792
        constexpr bool isdenormal() const noexcept {
45,172,447✔
1793
                if (iszeroencoding()) return false; // filter out the one special case
45,172,447✔
1794
                blockbinary<es, bt> e{};
45,163,668✔
1795
                exponent(e);
45,163,668✔
1796
                return e.iszero(); 
45,163,668✔
1797
        }
1798
        // ismaxexpvalue returns true if exponent bits are all one, false otherwise
1799
        constexpr bool ismaxexpvalue() const noexcept {
402,202,636✔
1800
                blockbinary<es, bt> e{};
402,203,681✔
1801
                exponent(e);
402,202,636✔
1802
                return e.all();
402,202,636✔
1803
        }
1804
        // isinteger is TBD
1805
        constexpr bool isinteger() const noexcept { return false; } // return (floor(*this) == *this) ? true : false; }
1806
        
1807
        template<typename NativeReal>
1808
        constexpr bool inrange(NativeReal v) const noexcept {
9,306,296✔
1809
                // the valid range for this cfloat includes the interval between 
1810
                // maxpos and the value that would round down to maxpos
1811
                bool bIsInRange = true;                
9,306,296✔
1812
                if (v > 0) {
9,306,296✔
1813
                        cfloat c(SpecificValue::maxpos);
4,427,047✔
1814
                        cfloat<nbits + 1, es, BlockType, hasSubnormals, hasMaxExpValues, isSaturating> d{};
8,027,370✔
1815
                        d = NativeReal(c);
4,427,047✔
1816
                        ++d;
4,427,047✔
1817
                        if (v >= NativeReal(d)) bIsInRange = false;
4,427,047✔
1818
                }
1819
                else {
1820
                        cfloat c(SpecificValue::maxneg);
4,879,249✔
1821
                        cfloat<nbits + 1, es, BlockType, hasSubnormals, hasMaxExpValues, isSaturating> d{};
8,816,662✔
1822
                        d = NativeReal(c);
4,879,249✔
1823
                        --d;
4,879,249✔
1824
                        if (v <= NativeReal(d)) bIsInRange = false;
4,879,249✔
1825
                }
1826

1827
                return bIsInRange;
9,306,296✔
1828
        }
1829
        constexpr bool test(unsigned bitIndex) const noexcept {
15,656,659✔
1830
                return at(bitIndex);
15,656,659✔
1831
        }
1832
        constexpr bool at(unsigned bitIndex) const noexcept {
1,878,399,673✔
1833
                if (bitIndex < nbits) {
1,878,399,673✔
1834
                        bt word = _block[bitIndex / bitsInBlock];
1,878,399,673✔
1835
                        bt mask = bt(1ull << (bitIndex % bitsInBlock));
1,878,399,673✔
1836
                        return (word & mask);
1,878,399,673✔
1837
                }
1838
                return false;
×
1839
        }
1840
        constexpr uint8_t nibble(unsigned n) const noexcept {
640✔
1841
                if (n < (1 + ((nbits - 1) >> 2))) {
640✔
1842
                        bt word = _block[(n * 4) / bitsInBlock];
640✔
1843
                        int nibbleIndexInWord = int(n % (bitsInBlock >> 2ull));
640✔
1844
                        bt mask = bt(0xF << (nibbleIndexInWord * 4));
640✔
1845
                        bt nibblebits = bt(mask & word);
640✔
1846
                        return uint8_t(nibblebits >> (nibbleIndexInWord * 4));
640✔
1847
                }
1848
                return 0;
×
1849
        }
1850
        constexpr bt block(unsigned b) const noexcept {
1851
                if (b < nrBlocks) {
1852
                        return _block[b];
1853
                }
1854
                return 0;
1855
        }
1856

1857
        constexpr void sign(bool& s) const {
800✔
1858
                s = sign();
800✔
1859
        }
800✔
1860
        constexpr void exponent(blockbinary<es, bt>& e) const {
805,080,458✔
1861
                e.clear();
805,080,458✔
1862
                if constexpr (0 == nrBlocks) return;
1863
                else if constexpr (1 == nrBlocks) {
1864
                        bt ebits = bt(_block[MSU] & ~SIGN_BIT_MASK);
368,580,044✔
1865
                        e.setbits(uint64_t(ebits >> EXP_SHIFT));
368,580,044✔
1866
                }
1867
                else if constexpr (nrBlocks > 1) {
1868
                        if (MSU_CAPTURES_EXP) {
1869
                                bt ebits = bt(_block[MSU] & ~SIGN_BIT_MASK);
250,086,807✔
1870
                                e.setbits(uint64_t(ebits >> ((nbits - 1ull - es) % bitsInBlock)));
250,086,807✔
1871
                        }
1872
                        else {
1873
                                for (unsigned i = 0; i < es; ++i) { e.setbit(i, at(nbits - 1ull - es + i)); }
841,207,232✔
1874
                        }
1875
                }
1876
        }
805,080,458✔
1877
        template<unsigned targetFractionBits>
1878
        constexpr blockbinary<targetFractionBits, bt>& fraction(blockbinary<targetFractionBits, bt>& f) const {
34,082✔
1879
                static_assert(targetFractionBits >= fbits, "target blockbinary is too small and can't receive all fraction bits");
1880
                f.clear();
34,082✔
1881
                if constexpr (0 == nrBlocks) return f;
1882
                else if constexpr (1 == nrBlocks) {
1883
                        bt fraction = bt(_block[MSU] & ~MSU_EXP_MASK);
999✔
1884
                        f.setbits(fraction);
999✔
1885
                }
1886
                else if constexpr (nrBlocks > 1) {
1887
                        for (unsigned i = 0; i < fbits; ++i) { f.setbit(i, at(i)); }
242,396✔
1888
                }
1889
                return f;
34,082✔
1890
        }
1891
        constexpr uint64_t fraction_ull() const {
59,641,088✔
1892
                uint64_t raw{ 0 };
59,641,088✔
1893
                if constexpr (nbits - es - 1ull < 65ull) { // no-op if precondition doesn't hold
1894
                        if constexpr (1 == nrBlocks) {
1895
                                uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
26,586,261✔
1896
                                raw = fbitMask & uint64_t(_block[0]);
26,586,261✔
1897
                        }
1898
                        else if constexpr (2 == nrBlocks) {
1899
                                uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
22,782,377✔
1900
                                raw = fbitMask & ((uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
22,782,377✔
1901
                        }
1902
                        else if constexpr (3 == nrBlocks) {
1903
                                uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
10,249,633✔
1904
                                raw = fbitMask & ((uint64_t(_block[2]) << (2 * bitsInBlock)) | (uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
10,249,633✔
1905
                        }
1906
                        else if constexpr (4 == nrBlocks) {
1907
                                uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
22,453✔
1908
                                raw = fbitMask & ((uint64_t(_block[3]) << (3 * bitsInBlock)) | (uint64_t(_block[2]) << (2 * bitsInBlock)) | (uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
22,453✔
1909
                        }
1910
                        else {
1911
                                uint64_t mask{ 1 };
364✔
1912
                                for (unsigned i = 0; i < fbits; ++i) { 
15,871✔
1913
                                        if (test(i)) {
15,507✔
1914
                                                raw |= mask;
2,257✔
1915
                                        }
1916
                                        mask <<= 1;
15,507✔
1917
                                }
1918
                        }
1919
                }
1920
                return raw;
59,641,088✔
1921
        }
1922
        // construct the significant from the encoding, returns normalization offset
1923
        constexpr unsigned significant(blockbinary<fhbits, bt>& s, bool isNormal = true) const {
23✔
1924
                unsigned shift = 0;
23✔
1925
                if (iszero()) return 0;
23✔
1926
                if constexpr (0 == nrBlocks) return 0;
1927
                else if constexpr (1 == nrBlocks) {
1928
                        bt significant = bt(_block[MSU] & ~MSU_EXP_MASK & ~SIGN_BIT_MASK);
23✔
1929
                        if (isNormal) {
23✔
1930
                                significant |= (bt(0x1ul) << fbits);
×
1931
                        }
1932
                        else {
1933
                                unsigned msb = find_msb(significant);
23✔
1934
//                                std::cout << "msb : " << msb << " : fhbits : " << fhbits << " : " << to_binary(significant, true) << std::endl;
1935
                                shift = fhbits - msb;
23✔
1936
                                significant <<= shift;
23✔
1937
                        }
1938
                        s.setbits(significant);
23✔
1939
                }
1940
                else if constexpr (nrBlocks > 1) {
1941
                        s.clear();
1942
                        // TODO: design and implement a block-oriented algorithm, this sequential algorithm is super slow
1943
                        if (isNormal) {
1944
                                s.setbit(fbits);
1945
                                for (unsigned i = 0; i < fbits; ++i) { s.setbit(i, at(i)); }
1946
                        }
1947
                        else {
1948
                                // Find the MSB of the subnormal: 
1949
                                unsigned msb = 0;
1950
                                for (unsigned i = 0; i < fbits; ++i) { // msb protected from not being assigned through iszero test at prelude of function
1951
                                        msb = fbits - 1ull - i;
1952
                                        if (test(msb)) break;
1953
                                }
1954
                                //      m-----lsb
1955
                                // h00001010101
1956
                                // 101010100000
1957
                                for (unsigned i = 0; i <= msb; ++i) {
1958
                                        s.setbit(fbits - msb + i, at(i));
1959
                                }
1960
                                shift = fhbits - msb;
1961
                        }
1962
                }
1963
                return shift;
23✔
1964
        }
1965
        template<unsigned targetbits>
1966
        constexpr void bits(blockbinary<targetbits, bt>& b) const {
640✔
1967
                unsigned upperbound = (nbits > targetbits ? targetbits : nbits);
640✔
1968
                b.clear();
640✔
1969
                for (unsigned i = 0; i < upperbound; ++i) { b.setbit(i, at(i)); }
3,840✔
1970
        }
640✔
1971

1972
        // casts to native types
1973
        int to_int() const {
59,976✔
1974
                if (isnan()) return 0;
59,976✔
1975
                if (isinf()) return sign() ? std::numeric_limits<int>::min() : std::numeric_limits<int>::max();
56,644✔
1976
                return int(to_native<float>());
50,308✔
1977
        }
1978
        long to_long() const {
1979
                if (isnan()) return 0;
1980
                if (isinf()) return sign() ? std::numeric_limits<long>::min() : std::numeric_limits<long>::max();
1981
                return long(to_native<double>());
1982
        }
1983
        long long to_long_long() const {
1✔
1984
                if (isnan()) return 0;
1✔
1985
                if (isinf()) return sign() ? std::numeric_limits<long long>::min() : std::numeric_limits<long long>::max();
1✔
1986
                return (long long)(to_native<double>());
1✔
1987
        }
1988

1989
        // transform an cfloat to a native C++ floating-point. We are using the native
1990
        // precision to compute, which means that all sub-values need to be representable 
1991
        // by the native precision.
1992
        // A more accurate approximation would require an adaptive precision algorithm
1993
        // with a final rounding step.
1994
        template<typename TargetFloat>
1995
        TargetFloat to_native() const { 
113,469,073✔
1996
                TargetFloat v{ 0.0 };
113,469,073✔
1997
                if (iszero()) {
113,469,073✔
1998
                        // the optimizer might destroy the sign
1999
                        return sign() ? -TargetFloat(0) : TargetFloat(0);
911,001✔
2000
                }
2001
                else if (isnan()) {
112,558,072✔
2002
                        v = sign() ? std::numeric_limits<TargetFloat>::signaling_NaN() : std::numeric_limits<TargetFloat>::quiet_NaN();
6,139,189✔
2003
                }
2004
                else if (isinf()) {
106,418,883✔
2005
                        v = sign() ? -std::numeric_limits<TargetFloat>::infinity() : std::numeric_limits<TargetFloat>::infinity();
144,171✔
2006
                }
2007
                else { // TODO: this approach has catastrophic cancellation when nbits is large and native target float is too small
2008
                        TargetFloat f{ 0 };
106,274,712✔
2009
                        TargetFloat fbit{ 0.5 };
106,274,712✔
2010
                        for (int i = static_cast<int>(nbits - 2ull - es); i >= 0; --i) {
1,313,969,979✔
2011
                                f += at(static_cast<unsigned>(i)) ? fbit : TargetFloat(0);
1,207,695,267✔
2012
                                fbit *= TargetFloat(0.5);
1,207,695,267✔
2013
                        }
2014
                        blockbinary<es, bt> ebits;
2015
                        exponent(ebits);
106,274,712✔
2016
                        if constexpr (hasSubnormals) {
2017
                                if (ebits.iszero()) {
66,509,025✔
2018
                                        // subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
2019
                                        TargetFloat exponentiation = TargetFloat(subnormal_exponent[es]); // precomputed values for 2^(2-2^(es-1))
14,672,769✔
2020
                                        v = exponentiation * f;  // f is already f/2^fbits
14,672,769✔
2021
                                        return sign() ? -v : v;
14,672,769✔
2022
                                }
2023
                        }
2024
                        else {
2025
                                if (ebits.iszero()) { // underflow to 0
39,765,687✔
2026
                                        // compiler fast float optimization might destroy the sign
2027
                                        return sign() ? -TargetFloat(0) : TargetFloat(0);
×
2028
                                }
2029
                        }
2030
                        if constexpr (hasMaxExpValues) {
2031
                                // regular: (-1)^s * 2^(e+1-2^(es-1)) * (1 + f/2^fbits))
2032
                                int exponent = static_cast<int>(unsigned(ebits) - EXP_BIAS);
52,318,479✔
2033
                                if (-64 < exponent && exponent < 64) {
52,318,479✔
2034
                                        TargetFloat exponentiation = (exponent >= 0 ? TargetFloat(1ull << exponent) : (1.0f / TargetFloat(1ull << -exponent)));
51,994,828✔
2035
                                        v = exponentiation * (TargetFloat(1.0) + f);
51,994,828✔
2036
                                }
51,994,828✔
2037
                                else {
2038
                                        double exponentiation = ipow(exponent);
323,651✔
2039
                                        v = TargetFloat(exponentiation * (1.0 + f));
323,651✔
2040
                                }
2041
                        }
2042
                        else {
2043
                                if (ebits.all()) {
39,283,464✔
2044
                                        // max-exponent values are mapped to quiet NaNs
2045
                                        v = std::numeric_limits<TargetFloat>::quiet_NaN();
×
2046
                                        return v;
×
2047
                                }
2048
                                else {
2049
                                        // regular: (-1)^s * 2^(e+1-2^(es-1)) * (1 + f/2^fbits))
2050
                                        int exponent = static_cast<int>(unsigned(ebits) - EXP_BIAS);
39,283,464✔
2051
                                        if (-64 < exponent && exponent < 64) {
39,283,464✔
2052
                                                TargetFloat exponentiation = (exponent >= 0 ? TargetFloat(1ull << exponent) : (1.0f / TargetFloat(1ull << -exponent)));
39,082,252✔
2053
                                                v = exponentiation * (TargetFloat(1.0) + f);
39,082,252✔
2054
                                        }
39,082,252✔
2055
                                        else {
2056
                                                double exponentiation = ipow(exponent);
201,212✔
2057
                                                v = TargetFloat(exponentiation * (1.0 + f));
201,212✔
2058
                                        }
2059
                                }
2060
                        }
2061
                        v = sign() ? -v : v;
91,601,943✔
2062
                }
2063
                return v;
97,885,303✔
2064
        }
2065

2066
        // convert a cfloat to a blocktriple with the fraction format 1.ffff
2067
        // we are using the same block type so that we can use block copies to move bits around.
2068
        // Since we tend to have at least two exponent bits, this will lead to
2069
        // most cfloat<->blocktriple cases being efficient as the block types are aligned.
2070
        // The relationship between the source cfloat and target blocktriple is not
2071
        // arbitrary, enforce it by setting the blocktriple fbits to the cfloat's (nbits - es - 1)
2072
        template<typename TargetBlockType = bt>
2073
        constexpr void normalize(blocktriple<fbits, BlockTripleOperator::REP, TargetBlockType>& tgt) const {
3,159✔
2074
                // test special cases
2075
                if (isnan()) {
3,159✔
2076
                        tgt.setnan();
253✔
2077
                }
2078
                else if (isinf()) {
2,906✔
2079
                        tgt.setinf();
245✔
2080
                }
2081
                else if (iszero()) {
2,661✔
2082
                        tgt.setzero();
308✔
2083
                }
2084
                else {
2085
                        tgt.setnormal(); // a blocktriple is always normalized
2,353✔
2086
                        int scale = this->scale();
2,353✔
2087
                        tgt.setsign(sign());
2,353✔
2088
                        tgt.setscale(scale);
2,353✔
2089
                        // set significant
2090
                        // we are going to unify to the format 01.ffffeeee
2091
                        // where 'f' is a fraction bit, and 'e' is an extension bit
2092
                        // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2093
                        if (isnormal() || ismaxexpvalue()) {
2,353✔
2094
                                // normal encoding or max-exponent encoding (hasMaxExpValues=true):
2095
                                // significand has a hidden bit at position fbits
2096
                                if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
2097
                                        uint64_t raw = fraction_ull();
1,684✔
2098
                                        raw |= (1ull << fbits);
1,684✔
2099
                                        tgt.setbits(raw);
1,684✔
2100
                                }
2101
                                else {
2102
                                        blockcopy(tgt);
111✔
2103
                                        tgt.setbit(fbits);
111✔
2104
                                }
2105
                        }
2106
                        else { // it is a subnormal encoding in this target cfloat
2107
                                int shift = MIN_EXP_NORMAL - scale;
558✔
2108
                                if (shift < 0) shift = 0; // guard against negative shifts
558✔
2109
                                // Stored subnormals have no hidden bit, but blocktriple REP expects a normalized 1.ffff payload.
2110
                                // Shift the encoded fraction up until the leading 1 reaches the hidden-bit position, while leaving
2111
                                // scale at the value returned by cfloat::scale(); that preserves the original real value in canonical form.
2112
                                if constexpr (fbits < 64) {
2113
                                        uint64_t raw = fraction_ull();
541✔
2114
                                        raw <<= shift;
541✔
2115
                                        raw |= (1ull << fbits);
541✔
2116
                                        tgt.setbits(raw);
541✔
2117
                                }
2118
                                else {
2119
                                        blockcopy(tgt);
17✔
2120
                                        tgt <<= shift;
17✔
2121
                                        tgt.setbit(fbits);
17✔
2122
                                }
2123
                        }
2124
                }
2125
        }
3,159✔
2126

2127
        // normalize a cfloat to a blocktriple used in add/sub, which has the form 00h.fffff
2128
        // that is 3 + fbits, the 3 extra bits are required to be able to use 2's complement 
2129
        // and capture the largest value of an addition/subtraction.
2130
        // TODO: currently abits = 2*fhbits as the worst case input argument size to
2131
        // capture the smallest normal value in aligned form. There is a faster/smaller
2132
        // implementation where the input is constrainted to just the round, guard, and sticky bits.
2133
        constexpr void normalizeAddition(blocktriple<fbits, BlockTripleOperator::ADD, bt>& tgt) const {
39,489,207✔
2134
                using BlockTripleConfiguration = blocktriple<fbits, BlockTripleOperator::ADD, bt>;
2135
                // test special cases
2136
                if (isnan()) {
39,489,207✔
2137
                        tgt.setnan();
396,494✔
2138
                }
2139
                else if (isinf()) {
39,092,713✔
2140
                        tgt.setinf();
326✔
2141
                }
2142
                else if (iszero()) {
39,092,387✔
2143
                        tgt.setzero();
325,672✔
2144
                }
2145
                else {
2146
                        tgt.setnormal(); // a blocktriple is always normalized
38,766,715✔
2147
                        int scale = this->scale();
38,766,715✔
2148
                        tgt.setsign(sign());
38,766,715✔
2149
                        tgt.setscale(scale);
38,766,715✔
2150
                        // set significant
2151
                        // we are going to unify to the format 001.ffffeeee
2152
                        // where 'f' is a fraction bit, and 'e' is an extension bit
2153
                        // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2154
                        if (isnormal()) {
38,766,715✔
2155
                                if constexpr (fbits < 64 && BlockTripleConfiguration::rbits < (64 - fbits)) {
2156
                                        uint64_t raw = fraction_ull();
26,010,304✔
2157
                                        raw |= (1ull << fbits); // add the hidden bit
26,010,304✔
2158
                                        //std::cout << "normalize      : " << *this << '\n';
2159
                                        //std::cout << "significant    : " << to_binary(raw, fbits + 2) << '\n';
2160
                                        raw <<= BlockTripleConfiguration::rbits;  // rounding bits required for correct rounding
26,010,304✔
2161
                                        //std::cout << "rounding shift : " << to_binary(raw, fbits + 2 + BlockTripleConfiguration::rbits) << '\n';
2162
                                        tgt.setbits(raw);
26,010,304✔
2163
                                }
2164
                                else {
2165
                                        blockcopy(tgt);
962✔
2166
                                        tgt.setradix();
962✔
2167
                                        tgt.setbit(fbits); // add the hidden bit
962✔
2168
                                        tgt.bitShift(BlockTripleConfiguration::rbits);  // rounding bits required for correct rounding
962✔
2169
                                }
2170
                        }
2171
                        else {
2172
                                if (isdenormal()) { // it is a subnormal encoding in this target cfloat
12,755,449✔
2173
                                        if constexpr (hasSubnormals) {
2174
                                                if constexpr (BlockTripleConfiguration::rbits < (64 - fbits)) {
2175
                                                        uint64_t raw = fraction_ull();
6,463,784✔
2176
                                                        int shift = MIN_EXP_NORMAL - scale;
6,463,784✔
2177
                                                        raw <<= shift; // shift but do NOT add a hidden bit as the MSB of the subnormal is shifted in the hidden bit position
6,463,784✔
2178
                                                        raw <<= BlockTripleConfiguration::rbits;  // rounding bits required for correct rounding
6,463,784✔
2179
                                                        tgt.setbits(raw);
6,463,784✔
2180
                                                }
2181
                                                else {
2182
                                                        blockcopy(tgt);
2183
                                                        tgt.setradix();
2184
                                                        int shift = MIN_EXP_NORMAL - scale;
2185
                                                        tgt.bitShift(shift + BlockTripleConfiguration::rbits);  // rounding bits required for correct rounding
2186
                                                }
2187
                                        }
2188
                                        else {  // this cfloat has no subnormals
2189
                                                tgt.setzero(tgt.sign()); // preserve the sign
×
2190
                                        }
2191
                                }
2192
                                else {
2193
                                        // by design, a cfloat is either normal, subnormal, or max-exponent value, so this else clause is by deduction covering a max-exponent value
2194
                                        if constexpr (hasMaxExpValues) {
2195
                                                if constexpr (fbits < 64 && BlockTripleConfiguration::rbits < (64 - fbits)) {
2196
                                                        uint64_t raw = fraction_ull();
6,291,665✔
2197
                                                        raw |= (1ull << fbits); // add the hidden bit
6,291,665✔
2198
                                                        raw <<= BlockTripleConfiguration::rbits;  // rounding bits required for correct rounding
6,291,665✔
2199
                                                        tgt.setbits(raw);
6,291,665✔
2200
                                                }
2201
                                                else {
2202
                                                        blockcopy(tgt);
×
2203
                                                        tgt.setradix();
×
2204
                                                        tgt.setbit(fbits); // add the hidden bit
×
2205
                                                        tgt.bitShift(BlockTripleConfiguration::rbits);  // rounding bits required for correct rounding
×
2206
                                                }
2207
                                        }
2208
                                        else {  // this cfloat has no max-exponent values and thus this represents a nan, signalling or quiet determined by the sign
2209
                                                tgt.setnan(tgt.sign());
×
2210
                                        }                        
2211
                                }
2212
                        }
2213
                }
2214
                // tgt.setradix(radix);
2215
        }
39,489,207✔
2216

2217
        // Normalize a cfloat to a blocktriple used in mul, which has the form 0'00001.fffff
2218
        // that is 2*fbits, plus 1 overflow bit, and the radix set at <fbits>.
2219
        // The result radix will go to 2*fbits after multiplication.
2220
        constexpr void normalizeMultiplication(blocktriple<fbits, BlockTripleOperator::MUL, bt>& tgt) const {
13,924,728✔
2221
                // test special cases
2222
                if (isnan()) {
13,924,728✔
2223
                        tgt.setnan();
450,696✔
2224
                }
2225
                else if (isinf()) {
13,474,032✔
2226
                        tgt.setinf();
652✔
2227
                }
2228
                else if (iszero()) {
13,473,380✔
2229
                        tgt.setzero();
450,960✔
2230
                }
2231
                else {
2232
                        tgt.setnormal(); // a blocktriple is always normalized
13,022,420✔
2233
                        int scale = this->scale();
13,022,420✔
2234
                        tgt.setsign(sign());
13,022,420✔
2235
                        tgt.setscale(scale);
13,022,420✔
2236

2237
                        // set significant
2238
                        // we are going to unify to the format 01.ffffeeee
2239
                        // where 'f' is a fraction bit, and 'e' is an extension bit
2240
                        // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2241
                        if (isnormal() || ismaxexpvalue()) {
13,022,420✔
2242
                                if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
2243
                                        uint64_t raw = fraction_ull();
11,803,483✔
2244
                                        raw |= (1ull << fbits);
11,803,483✔
2245
                                        tgt.setbits(raw);
11,803,483✔
2246
                                }
2247
                                else {
2248
                                        blockcopy(tgt);
832✔
2249
                                        tgt.setradix();
832✔
2250
                                        tgt.setbit(fbits); // add the hidden bit
832✔
2251
                                }
2252
                        }
2253
                        else { 
2254
                                // it is a subnormal encoding in this target cfloat
2255
                                if constexpr (hasSubnormals) {
2256
                                        if constexpr (fbits < 64) {
2257
                                                uint64_t raw = fraction_ull();
1,218,105✔
2258
                                                int shift = MIN_EXP_NORMAL - scale;
1,218,105✔
2259
                                                raw <<= shift;
1,218,105✔
2260
                                                raw |= (1ull << fbits);
1,218,105✔
2261
                                                tgt.setbits(raw);
1,218,105✔
2262
                                        }
2263
                                        else {
2264
                                                blockcopy(tgt);
×
2265
                                                int shift = MIN_EXP_NORMAL - scale;
×
2266
                                                tgt.bitShift(shift);
×
2267
                                                tgt.setbit(fbits);
×
2268
                                        }
2269
                                }
2270
                                else { // this cfloat has no subnormals
2271
                                        tgt.setzero(tgt.sign()); // preserve the sign
×
2272
                                }
2273
                        }
2274
                }
2275
                tgt.setradix(fbits); // override the radix with the input scale for accurate value printing
13,924,728✔
2276
        }
13,924,728✔
2277

2278
        // normalize a cfloat to a blocktriple used in div, which has the form 0'00000'00001.fffff
2279
        // that is 3*fbits, plus 1 overflow bit, and the radix set at <fbits>.
2280
        // the result radix will go to 2*fbits after multiplication.
2281
        // TODO: needs implementation
2282
        constexpr void normalizeDivision(blocktriple<fbits, BlockTripleOperator::DIV, bt>& tgt) const {
8,900,366✔
2283
                constexpr unsigned divshift = blocktriple<fbits, BlockTripleOperator::DIV, bt>::divshift;
8,900,366✔
2284
                // test special cases
2285
                if (isnan()) {
8,900,366✔
2286
                        tgt.setnan();
38✔
2287
                }
2288
                else if (isinf()) {
8,900,328✔
2289
                        tgt.setinf();
6✔
2290
                }
2291
                else if (iszero()) {
8,900,322✔
2292
                        tgt.setzero();
44✔
2293
                }
2294
                else {
2295
                        tgt.setnormal(); // a blocktriple is always normalized
8,900,278✔
2296
                        int scale = this->scale();
8,900,278✔
2297
                        tgt.setsign(sign());
8,900,278✔
2298
                        tgt.setscale(scale);
8,900,278✔
2299
                        // set significant
2300
                        // we are going to unify to the format 01.ffffeeee
2301
                        // where 'f' is a fraction bit, and 'e' is an extension bit
2302
                        // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2303
                        if (isnormal() || ismaxexpvalue()) {
8,900,278✔
2304
                                if constexpr (fbits < 64 && divshift < (64 - fbits)) {
2305
                                        uint64_t raw = fraction_ull();
7,400,262✔
2306
                                        raw |= (1ull << fbits);
7,400,262✔
2307
                                        raw <<= divshift; // shift the input value to the output radix
7,400,262✔
2308
                                        tgt.setbits(raw);
7,400,262✔
2309
                                }
2310
                                else {
2311
                                        // brute force copy of blocks
2312
                                        blockcopy(tgt);
1,054,322✔
2313
                                        tgt.setbit(fbits);
1,054,322✔
2314
                                        tgt.bitShift(divshift); // shift the input value to the output radix
1,054,322✔
2315
                                }
2316
                        }
2317
                        else { // it is a subnormal encoding in this target cfloat
2318
                                if constexpr (fbits < 64 && divshift < (64 - fbits)) {
2319
                                        uint64_t raw = fraction_ull();
445,692✔
2320
                                        int shift = MIN_EXP_NORMAL - scale;
445,692✔
2321
                                        raw <<= shift;
445,692✔
2322
                                        raw |= (1ull << fbits);
445,692✔
2323
                                        raw <<= divshift; // shift the input value to the output radix
445,692✔
2324
                                        tgt.setbits(raw);
445,692✔
2325
                                }
2326
                                else {
2327
                                        blockcopy(tgt);
2✔
2328
                                        int shift = MIN_EXP_NORMAL - scale;
2✔
2329
                                        tgt.bitShift(shift);
2✔
2330
                                        tgt.setbit(fbits);
2✔
2331
                                        tgt.bitShift(divshift); // shift the input value to the output radix
2✔
2332
                                }
2333
                        }
2334
                }
2335
                tgt.setradix(blocktriple<fbits, BlockTripleOperator::DIV, bt>::radix);
8,900,366✔
2336
        }
8,900,366✔
2337

2338
        // helper debug function
2339
        void constexprClassParameters() const noexcept {
8✔
2340
                std::cout << "-------------------------------------------------------------\n";
8✔
2341
                std::cout << "type              : " << typeid(*this).name() << '\n';
8✔
2342
                std::cout << "nbits             : " << nbits << '\n';
8✔
2343
                std::cout << "es                : " << es << std::endl;
8✔
2344
                std::cout << "hasSubnormals     : " << (hasSubnormals ? "true" : "false") << '\n';
8✔
2345
                std::cout << "hasMaxExpValues   : " << (hasMaxExpValues ? "true" : "false") << '\n';
8✔
2346
                std::cout << "isSaturating      : " << (isSaturating ? "true" : "false") << '\n';
8✔
2347
                std::cout << "ALL_ONES          : " << to_binary(ALL_ONES, 0, true) << '\n';
8✔
2348
                std::cout << "BLOCK_MASK        : " << to_binary(BLOCK_MASK, 0, true) << '\n';
8✔
2349
                std::cout << "nrBlocks          : " << nrBlocks << '\n';
8✔
2350
                std::cout << "bits in MSU       : " << bitsInMSU << '\n';
8✔
2351
                std::cout << "MSU               : " << MSU << '\n';
8✔
2352
                std::cout << "MSU MASK          : " << to_binary(MSU_MASK, 0, true) << '\n';
8✔
2353
                std::cout << "SIGN_BIT_MASK     : " << to_binary(SIGN_BIT_MASK, 0, true) << '\n';
8✔
2354
                std::cout << "LSB_BIT_MASK      : " << to_binary(LSB_BIT_MASK, 0, true) << '\n';
8✔
2355
                std::cout << "MSU CAPTURES_EXP  : " << (MSU_CAPTURES_EXP ? "yes\n" : "no\n");
8✔
2356
                std::cout << "EXP_SHIFT         : " << EXP_SHIFT << '\n';
8✔
2357
                std::cout << "MSU EXP MASK      : " << to_binary(MSU_EXP_MASK, 0, true) << '\n';
8✔
2358
                std::cout << "ALL_ONE_MASK_ES   : " << to_binary(ALL_ONES_ES) << '\n';
8✔
2359
                std::cout << "EXP_BIAS          : " << EXP_BIAS << '\n';
8✔
2360
                std::cout << "MAX_EXP           : " << MAX_EXP << '\n';
8✔
2361
                std::cout << "MIN_EXP_NORMAL    : " << MIN_EXP_NORMAL << '\n';
8✔
2362
                std::cout << "MIN_EXP_SUBNORMAL : " << MIN_EXP_SUBNORMAL << '\n';
8✔
2363
                std::cout << "fraction Blocks   : " << fBlocks << '\n';
8✔
2364
                std::cout << "bits in FSU       : " << bitsInFSU << '\n';
8✔
2365
                std::cout << "FSU               : " << FSU << '\n';
8✔
2366
                std::cout << "FSU MASK          : " << to_binary(FSU_MASK, 0, true) << '\n';
8✔
2367
                std::cout << "topfbits          : " << topfbits << '\n';
8✔
2368
                std::cout << "ALL_ONE_MASK_FR   : " << to_binary(ALL_ONES_FR) << '\n';
8✔
2369
        }
8✔
2370
        void showLimbs() const {
2371
                for (unsigned b = 0; b < nrBlocks; ++b) {
2372
                        std::cout << to_binary(_block[nrBlocks - b - 1], sizeof(bt) * 8) << ' ';
2373
                }
2374
                std::cout << '\n';
2375
        }
2376

2377
protected:
2378
        // HELPER methods
2379

2380
        /// <summary>
2381
        /// 1's complement of the encoding used to set up specific encoding patterns.
2382
        /// This is not an arithmetic operator that makes sense for floating-point numbers.
2383
        /// </summary>
2384
        /// <returns>reference to this cfloat object</returns>
2385
        constexpr cfloat& flip() noexcept { // in-place one's complement
1,871,734✔
2386
                for (unsigned i = 0; i < nrBlocks; ++i) {
7,118,183✔
2387
                        _block[i] = bt(~_block[i]);
5,246,449✔
2388
                }
2389
                _block[MSU] &= MSU_MASK; // assert precondition of properly nulled leading non-bits
1,871,734✔
2390
                return *this;
1,871,734✔
2391
        }
2392

2393
        /// <summary>
2394
        /// shift left is a bit level encoding helper for fast limb-based conversions between different cfloats
2395
        /// </summary>
2396
        /// <param name="bitsToShift"></param>
2397
        void shiftLeft(unsigned bitsToShift) {
60,251✔
2398
                if (bitsToShift == 0) return;
60,251✔
2399
                if (bitsToShift > nbits) {
60,251✔
2400
                        setzero();
×
2401
                }
2402
                if (bitsToShift >= bitsInBlock) {
60,251✔
2403
                        int blockShift = static_cast<int>(bitsToShift / bitsInBlock);
60,251✔
2404
                        for (int i = static_cast<int>(MSU); i >= blockShift; --i) {
140,526✔
2405
                                _block[i] = _block[i - blockShift];
80,275✔
2406
                        }
2407
                        for (int i = blockShift - 1; i >= 0; --i) {
341,430✔
2408
                                _block[i] = bt(0);
281,179✔
2409
                        }
2410
                        // adjust the shift
2411
                        bitsToShift -= blockShift * bitsInBlock;
60,251✔
2412
                        if (bitsToShift == 0) return;
60,251✔
2413
                }
2414
                if constexpr (MSU > 0) {
2415
                        // construct the mask for the upper bits in the block that need to move to the higher word
2416
                        bt mask = 0xFFFFFFFFFFFFFFFF << (bitsInBlock - bitsToShift);
60,205✔
2417
                        for (unsigned i = MSU; i > 0; --i) {
360,826✔
2418
                                _block[i] <<= bitsToShift;
300,621✔
2419
                                // mix in the bits from the right
2420
                                bt bits = bt(mask & _block[i - 1]);
300,621✔
2421
                                _block[i] |= (bits >> (bitsInBlock - bitsToShift));
300,621✔
2422
                        }
2423
                }
2424
                _block[0] <<= bitsToShift;
60,205✔
2425
        }
2426

2427
        // convert an unsigned integer into a cfloat
2428
        // TODO: this method does not protect against being called with a signed integer
2429
        template<typename Ty>
2430
        constexpr cfloat& convert_unsigned_integer(const Ty& rhs) noexcept {
140✔
2431
                clear();
140✔
2432
                if (0 == rhs) return *this;
140✔
2433

2434
                uint64_t raw = static_cast<uint64_t>(rhs);
137✔
2435
                int msb = static_cast<int>(find_msb(raw)) - 1; // msb > 0 due to zero test above 
137✔
2436
                int exponent = msb;
137✔
2437
                // remove the MSB as it represents the hidden bit in the cfloat representation
2438
                uint64_t hmask = ~(1ull << msb);
137✔
2439
                raw &= hmask;
137✔
2440

2441
                constexpr uint32_t sizeInBits = 8 * sizeof(Ty);
137✔
2442
                uint32_t shift = sizeInBits - exponent - 1;
137✔
2443
                raw <<= shift;
137✔
2444
                raw = round<sizeInBits, uint64_t>(raw, exponent);
137✔
2445

2446
                // construct the target cfloat
2447
                if constexpr (fbits < (64 - es)) {
2448
                        uint64_t biasedExponent = static_cast<uint64_t>(static_cast<long long>(exponent) + static_cast<long long>(EXP_BIAS));
107✔
2449
                        uint64_t bits = 0;
107✔
2450
                        bits <<= es;
107✔
2451
                        bits |= biasedExponent;
107✔
2452
                        bits <<= fbits;
107✔
2453
                        bits |= raw;
107✔
2454
                        setbits(bits);
107✔
2455
                }
2456
                else {
2457
                        setsign(false);
30✔
2458
                        setexponent(exponent);
30✔
2459
                        // For large types, place fraction bits at the TOP of the fraction field
2460
                        // After shift, raw has fraction bits at positions (sizeInBits-2) down to (sizeInBits-1-exponent)
2461
                        // We need to place them at positions (fbits-1) down to (fbits-exponent)
2462
                        for (int i = 0; i < exponent; ++i) {
291✔
2463
                                bool bit = (raw >> (sizeInBits - 2 - i)) & 1;
261✔
2464
                                setbit(static_cast<unsigned>(fbits - 1 - i), bit);
261✔
2465
                        }
2466
                }
2467
                return *this;
137✔
2468
        }
2469
        // convert a signed integer into a cfloat
2470
        // TODO: this method does not protect against being called with a signed integer
2471
        template<typename Ty>
2472
        constexpr cfloat& convert_signed_integer(const Ty& rhs) noexcept {
174,972✔
2473
                clear();
174,972✔
2474
                if (0 == rhs) return *this;
174,972✔
2475
                bool s = (rhs < 0);
33,377✔
2476
                using UnsignedTy = std::make_unsigned_t<Ty>;
2477
                UnsignedTy urhs = static_cast<UnsignedTy>(rhs);
33,377✔
2478
                uint64_t raw = static_cast<uint64_t>(s ? (UnsignedTy(0) - urhs) : urhs);
33,377✔
2479

2480
                int msb = static_cast<int>(find_msb(raw)) - 1; // msb > 0 due to zero test above 
33,377✔
2481
                int exponent = msb;
33,377✔
2482
                // remove the MSB as it represents the hidden bit in the cfloat representation
2483
                uint64_t hmask = ~(1ull << msb);
33,377✔
2484
                raw &= hmask;
33,377✔
2485

2486
                // shift the msb to the msb of the fraction
2487
                constexpr uint32_t sizeInBits = 8 * sizeof(Ty);
33,377✔
2488
                uint32_t shift = sizeInBits - exponent - 1;
33,377✔
2489
                raw <<= shift;
33,377✔
2490
                raw = round<sizeInBits, uint64_t>(raw, exponent);
33,377✔
2491

2492
                // construct the target cfloat
2493
                if constexpr (fbits < (64 - es)) {
2494
                        uint64_t biasedExponent = static_cast<uint64_t>(static_cast<long long>(exponent) + static_cast<long long>(EXP_BIAS));
33,006✔
2495
                        uint64_t bits = (s ? 1ull : 0ull);
33,006✔
2496
                        bits <<= es;
33,006✔
2497
                        bits |= biasedExponent;
33,006✔
2498
                        bits <<= fbits;
33,006✔
2499
                        bits |= raw;
33,006✔
2500
                        setbits(bits);
33,006✔
2501
                }
2502
                else {
2503
                        setsign(s);
371✔
2504
                        setexponent(exponent);
371✔
2505
                        // For large types, place fraction bits at the TOP of the fraction field
2506
                        // After shift, raw has fraction bits at positions (sizeInBits-2) down to (sizeInBits-1-exponent)
2507
                        // We need to place them at positions (fbits-1) down to (fbits-exponent)
2508
                        for (int i = 0; i < exponent; ++i) {
3,120✔
2509
                                bool bit = (raw >> (sizeInBits - 2 - i)) & 1;
2,749✔
2510
                                setbit(static_cast<unsigned>(fbits - 1 - i), bit);
2,749✔
2511
                        }
2512
                }
2513
                return *this;
33,377✔
2514
        }
2515

2516
public:
2517
        template<typename Real>
2518
        CONSTEXPRESSION cfloat& convert_ieee754(Real rhs) noexcept {
138,253,458✔
2519
                if constexpr (nbits == 32 && es == 8 && sizeof(Real) == 4) {
2520
                        // we CANNOT use the native conversion to float as cfloats have max-exponent values
2521
                        // which IEEE-754 does not have and thus a native conversion would destroy
2522
                        // only if the Real type is a float can we use the direct conversion
2523

2524
                        // when our cfloat is a perfect match to single precision IEEE-754
2525
                        bool s{ false };
65,749✔
2526
                        uint64_t rawExponent{ 0 };
65,749✔
2527
                        uint64_t rawFraction{ 0 };
65,749✔
2528
                        uint64_t bits{ 0 };
65,749✔
2529
                        extractFields(rhs, s, rawExponent, rawFraction, bits);
65,749✔
2530
                        if (rawExponent == ieee754_parameter<Real>::eallset) { // nan and inf need to be remapped
65,749✔
2531
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::snanmask) ||
24✔
2532
                                        rawFraction == (ieee754_parameter<Real>::fmask & (ieee754_parameter<Real>::qnanmask | ieee754_parameter<Real>::snanmask))) {
18✔
2533
                                        // 1.11111111.00000000.......00000001 signalling nan
2534
                                        // 0.11111111.00000000000000000000001 signalling nan
2535
                                        // MSVC
2536
                                        // 1.11111111.10000000.......00000001 signalling nan
2537
                                        // 0.11111111.10000000.......00000001 signalling nan
2538
                                        setnan(NAN_TYPE_SIGNALLING);
6✔
2539
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2540
                                        return *this;
6✔
2541
                                }
2542
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::qnanmask)) {
18✔
2543
                                        // 1.11111111.10000000.......00000000 quiet nan
2544
                                        // 0.11111111.10000000.......00000000 quiet nan
2545
                                        setnan(NAN_TYPE_QUIET);
5✔
2546
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2547
                                        return *this;
5✔
2548
                                }
2549
                                if (rawFraction == 0ull) {
13✔
2550
                                        // 1.11111111.0000000.......000000000 -inf
2551
                                        // 0.11111111.0000000.......000000000 +inf
2552
                                        setinf(s);
13✔
2553
                                        return *this;
13✔
2554
                                }
2555
                        }
2556
                        uint64_t raw{ s ? 1ull : 0ull };
65,725✔
2557
                        raw <<= 31;
65,725✔
2558
                        raw |= (rawExponent << fbits);
65,725✔
2559
                        raw |= rawFraction;
65,725✔
2560
                        setbits(raw);
65,725✔
2561
                        return *this;
65,725✔
2562
                }
2563
                else if constexpr (nbits == 64 && es == 11 && sizeof(Real) == 8) {
2564
                        // when our cfloat is a perfect match to double precision IEEE-754
2565
                        bool s{ false };
12,177✔
2566
                        uint64_t rawExponent{ 0 };
12,177✔
2567
                        uint64_t rawFraction{ 0 };
12,177✔
2568
                        uint64_t bits{ 0 };
12,177✔
2569
                        extractFields(rhs, s, rawExponent, rawFraction, bits);
12,177✔
2570
                        if (rawExponent == ieee754_parameter<Real>::eallset) { // nan and inf need to be remapped
12,177✔
2571
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::snanmask) ||
20✔
2572
                                        rawFraction == (ieee754_parameter<Real>::fmask & (ieee754_parameter<Real>::qnanmask | ieee754_parameter<Real>::snanmask))) {
14✔
2573
                                        // 1.11111111.00000000.......00000001 signalling nan
2574
                                        // 0.11111111.00000000000000000000001 signalling nan
2575
                                        // MSVC
2576
                                        // 1.11111111.10000000.......00000001 signalling nan
2577
                                        // 0.11111111.10000000.......00000001 signalling nan
2578
                                        setnan(NAN_TYPE_SIGNALLING);
6✔
2579
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2580
                                        return *this;
6✔
2581
                                }
2582
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::qnanmask)) {
14✔
2583
                                        // 1.11111111.10000000.......00000000 quiet nan
2584
                                        // 0.11111111.10000000.......00000000 quiet nan
2585
                                        setnan(NAN_TYPE_QUIET);
6✔
2586
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2587
                                        return *this;
6✔
2588
                                }
2589
                                if (rawFraction == 0ull) {
8✔
2590
                                        // 1.11111111.0000000.......000000000 -inf
2591
                                        // 0.11111111.0000000.......000000000 +inf
2592
                                        setinf(s);
8✔
2593
                                        return *this;
8✔
2594
                                }
2595
                        }
2596
                        // normal and subnormal handling
2597
                        uint64_t raw{ s ? 1ull : 0ull };
12,157✔
2598
                        raw <<= 63;
12,157✔
2599
                        raw |= (rawExponent << fbits);
12,157✔
2600
                        raw |= rawFraction;
12,157✔
2601
                        setbits(raw);
12,157✔
2602
                        return *this;
12,157✔
2603
                }
2604
                else {
2605
                        clear();
138,175,532✔
2606
                        // extract raw IEEE-754 bits
2607
                        bool s{ false };
138,175,532✔
2608
                        uint64_t rawExponent{ 0 };
138,175,532✔
2609
                        uint64_t rawFraction{ 0 };
138,175,532✔
2610
                        uint64_t bits{ 0 };
138,175,532✔
2611
                        extractFields(rhs, s, rawExponent, rawFraction, bits);
138,175,532✔
2612
                        // special case handling
2613
                        if (rawExponent == ieee754_parameter<Real>::eallset) { // nan and inf
138,175,532✔
2614
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::snanmask) ||
5,101,654✔
2615
                                        rawFraction == (ieee754_parameter<Real>::fmask & (ieee754_parameter<Real>::qnanmask | ieee754_parameter<Real>::snanmask))) {
2,585,964✔
2616
                                        // 1.11111111.00000000.......00000001 signalling nan
2617
                                        // 0.11111111.00000000000000000000001 signalling nan
2618
                                        // MSVC
2619
                                        // 1.11111111.10000000.......00000001 signalling nan
2620
                                        // 0.11111111.10000000.......00000001 signalling nan
2621
                                        setnan(NAN_TYPE_SIGNALLING);
2,521,233✔
2622
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2623
                                        return *this;
2,521,233✔
2624
                                }
2625
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::qnanmask)) {
2,580,421✔
2626
                                        // 1.11111111.10000000.......00000000 quiet nan
2627
                                        // 0.11111111.10000000.......00000000 quiet nan
2628
                                        setnan(NAN_TYPE_QUIET);
2,551,509✔
2629
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2630
                                        return *this;
2,551,509✔
2631
                                }
2632
                                if (rawFraction == 0ull) {
28,912✔
2633
                                        // 1.11111111.0000000.......000000000 -inf
2634
                                        // 0.11111111.0000000.......000000000 +inf
2635
                                        setinf(s);
28,888✔
2636
                                        return *this;
28,888✔
2637
                                }
2638
                        }
2639
                        if (rhs == 0.0) { // IEEE rule: this is valid for + and - 0.0
133,073,902✔
2640
                                setbit(nbits - 1ull, s);
597,893✔
2641
                                return *this;
597,893✔
2642
                        }
2643
        
2644
                        // normal number consists of fbits fraction bits and one hidden bit
2645
                        // subnormal number has no hidden bit
2646
                        int exponent = static_cast<int>(rawExponent) - ieee754_parameter<Real>::bias;  // unbias the exponent
132,476,009✔
2647

2648
                        // normalize subnormal source values so downstream code can treat them as normal
2649
                        // A subnormal has rawExponent == 0 and value = 0.fraction * 2^(1-bias).
2650
                        // We find the MSB of the fraction, shift it into the hidden bit position,
2651
                        // mask it off (like a normal's implicit 1), and adjust the exponent.
2652
                        // Setting rawExponent = 1 causes all downstream if(rawExponent != 0) branches
2653
                        // to take the normal-number path, which already handles normal-to-normal and
2654
                        // normal-to-subnormal target conversions correctly.
2655
                        if (rawExponent == 0 && rawFraction != 0) {
132,476,009✔
2656
                                exponent = 1 - ieee754_parameter<Real>::bias; // effective exponent for subnormals
1,164✔
2657
                                unsigned msb_pos = find_msb(rawFraction); // 1-indexed position of MSB
1,164✔
2658
                                unsigned normalizeShift = static_cast<unsigned>(ieee754_parameter<Real>::fbits) + 1u - msb_pos;
1,164✔
2659
                                rawFraction = (rawFraction << normalizeShift) & static_cast<uint64_t>(ieee754_parameter<Real>::fmask);
1,164✔
2660
                                exponent -= static_cast<int>(normalizeShift);
1,164✔
2661
                                rawExponent = 1; // mark as normalized so downstream paths treat this as a normal number
1,164✔
2662
                        }
2663

2664
                        // check special case of
2665
                        //  1- saturating to maxpos/maxneg, or 
2666
                        //  2- projecting to +-inf 
2667
                        // if the value is out of range.
2668
                        // 
2669
                        // One problem here is that at the rounding cusps of maxpos <-> inf <-> nan
2670
                        // you need to go through the rounding logic to know which encoding you end up
2671
                        // with. 
2672
                        // For each specific cfloat configuration, you can work out these rounding cusps
2673
                        // but they need to go through the value transformation to map them back to native
2674
                        // IEEE-754. That is a complex computation to do as a static constexpr as you need
2675
                        // to construct the value, then evaluate it, and store it.
2676
                        // 
2677
                        // The algorithm used here is to check for the obvious out of range values by
2678
                        // comparing their scale to the max scale this cfloat encoding can represent.
2679
                        // For the rounding cusps, we go through the rounding logic, and then clean up
2680
                        // after rounding using the observation that no conversion from a value can ever
2681
                        // yield the NaN encoding.
2682
                        //
2683
                        // The rounding logic will correctly sort between maxpos and inf, and we clean
2684
                        // up any NaN encodings by projecting back to the configuration's saturation rule.
2685
                        //
2686
                        // We could improve on this by creating the database of rounding cusps and
2687
                        // referencing them with a direct value comparison with the input. That would be
2688
                        // the most performant implementation.
2689
                        if (exponent > MAX_EXP) {
132,476,009✔
2690
                                if constexpr (isSaturating) {
2691
                                        if (s) this->maxneg(); else this->maxpos(); // saturate to maxpos or maxneg
934,374✔
2692
                                }
2693
                                else {
2694
                                        setinf(s);
970,252✔
2695
                                }
2696
                                return *this;
1,904,626✔
2697
                        }
2698
                        if constexpr (hasSubnormals) {
2699
                                if (exponent < MIN_EXP_SUBNORMAL - 1) { 
88,959,189✔
2700
                                        // map to +-0 any values that have a scale less than (MIN_EXP_SUBMORNAL - 1)
2701
                                        this->setbit(nbits - 1, s);
145,377✔
2702
                                        return *this;
145,377✔
2703
                                }
2704
                        }
2705
                        else {
2706
                                if (exponent < MIN_EXP_NORMAL - 1) {
41,612,194✔
2707
                                        // map to +-0 any values that have a scale less than (MIN_EXP_MORNAL - 1)
2708
                                        this->setbit(nbits - 1, s);
271,647✔
2709
                                        return *this;
271,647✔
2710
                                }
2711
                        }
2712

2713
                        /////////////////  
2714
                        /// end of special case processing, move on to value sampling and rounding
2715

2716
#if TRACE_CONVERSION
2717
                        std::cout << '\n';
2718
                        std::cout << "value             : " << rhs << '\n';
2719
                        std::cout << "segments          : " << to_binary(rhs) << '\n';
2720
                        std::cout << "sign     bit      : " << (s ? '1' : '0') << '\n';
2721
                        std::cout << "exponent bits     : " << to_binary(rawExponent, ieee754_parameter<Real>::ebits, true) << '\n';
2722
                        std::cout << "fraction bits     : " << to_binary(rawFraction, ieee754_parameter<Real>::fbits, true) << '\n';
2723
                        std::cout << "exponent value    : " << exponent << '\n';
2724
#endif
2725

2726
                        // do the following scenarios have different rounding bits?
2727
                        // input is normal, cfloat is normal           <-- rounding can happen with native ieee-754 bits
2728
                        // input is normal, cfloat is subnormal
2729
                        // input is subnormal, cfloat is normal
2730
                        // input is subnormal, cfloat is subnormal
2731

2732
                        // The first condition is the relationship between the number 
2733
                        // of fraction bits from the source and the number of fraction bits 
2734
                        // in the target cfloat: these are constexpressions and guard the shifts
2735
                        // input fbits >= cfloat fbits                 <-- need to round
2736
                        // input fbits < cfloat fbits                  <-- no need to round
2737

2738
                        // quick check if we are truncating to 0 for all subnormal values
2739
                        if constexpr (!hasSubnormals) {
2740
                                if (exponent < MIN_EXP_NORMAL) {
41,340,547✔
2741
                                        setsign(s); // rest of the bits, exponent and fraction, are already set correctly
123,840✔
2742
                                        return *this;
123,840✔
2743
                                }
2744
                        }
2745
                        if constexpr (fbits < ieee754_parameter<Real>::fbits) {
2746
                                // this is the common case for cfloats that are smaller in size compared to single and double precision IEEE-754
2747
                                constexpr int rightShift = ieee754_parameter<Real>::fbits - fbits; // this is the bit shift to get the MSB of the src to the MSB of the tgt
129,771,085✔
2748
                                uint32_t biasedExponent{ 0 };
129,771,085✔
2749
                                int adjustment{ 0 }; // right shift adjustment for subnormal representation
129,771,085✔
2750
                                uint64_t mask;
2751
                                if (rawExponent != 0) {
129,771,085✔
2752
                                        // the source real is a normal number, 
2753
//                                        if (exponent >= (MIN_EXP_SUBNORMAL - 1) && exponent < MIN_EXP_NORMAL) {
2754
                                        if (exponent < MIN_EXP_NORMAL) {
129,771,085✔
2755
//                                                exponent = (exponent < MIN_EXP_SUBNORMAL ? MIN_EXP_SUBNORMAL : exponent); // clip to the smallest subnormal exponent, otherwise the adjustment is off
2756
                                                // the value is a subnormal number in this representation: biasedExponent = 0
2757
                                                // add the hidden bit to the fraction bits so the denormalization has the correct MSB
2758
                                                rawFraction |= ieee754_parameter<Real>::hmask;
41,271,898✔
2759

2760
                                                // fraction processing: we have 1 hidden + 23 explicit fraction bits 
2761
                                                // f = 1.ffff 2^exponent * 2^fbits * 2^-(2-2^(es-1)) = 1.ff...ff >> (23 - (-exponent + fbits - (2 -2^(es-1))))
2762
                                                // -exponent because we are right shifting and exponent in this range is negative
2763
                                                adjustment = -(exponent + subnormal_reciprocal_shift[es]);
41,271,898✔
2764
                                                // this is the right shift adjustment required for subnormal representation due 
2765
                                                // to the scale of the input number, i.e. the exponent of 2^-adjustment
2766
                                        }
2767
                                        else {
2768
                                                // the value is a normal number in this representation: common case
2769
                                                biasedExponent = static_cast<uint32_t>(exponent + EXP_BIAS); // project the exponent into the target 
88,499,187✔
2770
                                                // fraction processing
2771
                                                // float structure is: seee'eeee'efff'ffff'ffff'ffff'ffff'ffff, s = sign, e - exponent bit, f = fraction bit
2772
                                                // target structure is for example cfloat<8,2>: seef'ffff
2773
                                                // since both are normals, we can shift the incoming fraction to the target structure bits, and round
2774
                                                // MSB of source = 23 - 1, MSB of target = fbits - 1: shift = MSB of src - MSB of tgt => 23 - fbits
2775
                                                adjustment = 0;
88,499,187✔
2776
                                        }
2777
                                        if constexpr (rightShift > 0) {
2778
                                                // if true we need to round
2779
                                                // round-to-even logic
2780
                                                //  ... lsb | guard  round sticky   round
2781
                                                //       x     0       x     x       down
2782
                                                //       0     1       0     0       down  round to even
2783
                                                //       1     1       0     0        up   round to even
2784
                                                //       x     1       0     1        up
2785
                                                //       x     1       1     0        up
2786
                                                //       x     1       1     1        up
2787
                                                // collect lsb, guard, round, and sticky bits
2788

2789

2790
#if TRACE_CONVERSION
2791
                                                std::cout << "fraction bits     : " << to_binary(rawFraction, ieee754_parameter<Real>::nbits, true) << '\n';
2792
                                                std::cout << "lsb mask bits     : " << to_binary(mask, ieee754_parameter<Real>::nbits, true) << '\n';
2793
#endif
2794
                                                mask = (1ull << (rightShift + adjustment)); // bit mask for the lsb bit
129,771,085✔
2795
                                                bool lsb = (mask & rawFraction);
129,771,085✔
2796
                                                mask >>= 1;
129,771,085✔
2797
                                                bool guard = (mask & rawFraction);
129,771,085✔
2798
                                                mask >>= 1;
129,771,085✔
2799
                                                bool round = (mask & rawFraction);
129,771,085✔
2800
                                                if ((rightShift + adjustment) > 1) {
129,771,085✔
2801
                                                        mask = (0xFFFF'FFFF'FFFF'FFFFull << (rightShift + adjustment - 2));
129,771,085✔
2802
                                                        mask = ~mask;
129,771,085✔
2803
                                                }
2804
                                                else {
2805
                                                        mask = 0;
×
2806
                                                }
2807
#if TRACE_CONVERSION
2808
                                                std::cout << "right shift       : " << rightShift << '\n';
2809
                                                std::cout << "adjustment        : " << adjustment << '\n';
2810
                                                std::cout << "shift to LSB      : " << (rightShift + adjustment) << '\n';
2811
                                                std::cout << "fraction bits     : " << to_binary(rawFraction, ieee754_parameter<Real>::nbits, true) << '\n';
2812
                                                std::cout << "sticky mask bits  : " << to_binary(mask, ieee754_parameter<Real>::nbits, true) << '\n';
2813
#endif
2814
                                                bool sticky = (mask & rawFraction);
129,771,085✔
2815
                                                rawFraction >>= (static_cast<int64_t>(rightShift) + static_cast<int64_t>(adjustment));
129,771,085✔
2816

2817
                                                // execute rounding operation
2818
                                                if (guard) {
129,771,085✔
2819
                                                        if (lsb && (!round && !sticky)) ++rawFraction; // round to even
26,455,668✔
2820
                                                        if (round || sticky) ++rawFraction;
26,455,668✔
2821
                                                        if (rawFraction == (1ull << fbits)) { // overflow
26,455,668✔
2822
                                                                if (biasedExponent == ALL_ONES_ES) { // overflow to INF == .111..01
451,194✔
2823
                                                                        rawFraction = INF_ENCODING;
578✔
2824
                                                                }
2825
                                                                else {
2826
                                                                        ++biasedExponent;
450,616✔
2827
                                                                        rawFraction = 0;
450,616✔
2828
                                                                }
2829
                                                        }
2830
                                                }
2831
#if TRACE_CONVERSION
2832
                                                std::cout << "lsb               : " << (lsb ? "1\n" : "0\n");
2833
                                                std::cout << "guard             : " << (guard ? "1\n" : "0\n");
2834
                                                std::cout << "round             : " << (round ? "1\n" : "0\n");
2835
                                                std::cout << "sticky            : " << (sticky ? "1\n" : "0\n");
2836
                                                std::cout << "rounding decision : " << (lsb && (!round && !sticky) ? "round to even\n" : "-\n");
2837
                                                std::cout << "rounding direction: " << (round || sticky ? "round up\n" : "round down\n");
2838
#endif
2839
                                        }
2840
                                        else { // all bits of the float go into this representation and need to be shifted up, no rounding necessary
2841
                                                int shiftLeft = fbits - ieee754_parameter<Real>::fbits;
2842
                                                rawFraction <<= shiftLeft;
2843
                                        }
2844
#if TRACE_CONVERSION
2845
                                        std::cout << "biased exponent   : " << biasedExponent << " : 0x" << std::hex << biasedExponent << std::dec << '\n';
2846
                                        std::cout << "right shift       : " << rightShift << '\n';
2847
                                        std::cout << "adjustment shift  : " << adjustment << '\n';
2848
                                        std::cout << "sticky bit mask   : " << to_binary(mask, 32, true) << '\n';
2849
                                        std::cout << "fraction bits     : " << to_binary(rawFraction, 32, true) << '\n';
2850
#endif
2851
                                        // construct the target cfloat
2852
                                        bits = (s ? 1ull : 0ull);
129,771,085✔
2853
                                        bits <<= es;
129,771,085✔
2854
                                        bits |= biasedExponent;
129,771,085✔
2855
                                        bits <<= fbits;
129,771,085✔
2856
                                        bits |= rawFraction;
129,771,085✔
2857
#if TRACE_CONVERSION
2858
                                        std::cout << "sign bit          : " << (s ? '1' : '0') << '\n';
2859
                                        std::cout << "biased exponent   : " << biasedExponent << " : 0x" << std::hex << biasedExponent << std::dec << '\n';
2860
                                        std::cout << "fraction bits     : " << to_binary(rawFraction, 32, true) << '\n';
2861
                                        std::cout << "cfloat bits       : " << to_binary(bits, nbits, true) << '\n';
2862
#endif
2863
                                        setbits(bits);
129,771,085✔
2864
                                }
2865
                                else {
2866
                                        // the source real is a subnormal number                                
2867
                                        mask = 0x00FF'FFFFu >> (fbits + exponent + subnormal_reciprocal_shift[es] + 1); // mask for sticky bit 
×
2868

2869
                                        // fraction processing: we have fbits+1 bits = 1 hidden + fbits explicit fraction bits 
2870
                                        // f = 1.ffff  2^exponent * 2^fbits * 2^-(2-2^(es-1)) = 1.ff...ff >> (23 - (-exponent + fbits - (2 -2^(es-1))))
2871
                                        // -exponent because we are right shifting and exponent in this range is negative
2872
                                        adjustment = -(exponent + subnormal_reciprocal_shift[es]); // this is the right shift adjustment due to the scale of the input number, i.e. the exponent of 2^-adjustment
×
2873
#if TRACE_CONVERSION                                        
2874
                                        std::cout << "source is subnormal: TBD\n";
2875
                                        std::cout << "shift to LSB    " << (rightShift + adjustment) << '\n';
2876
                                        std::cout << "adjustment      " << adjustment << '\n';
2877
                                        std::cout << "exponent        " << exponent << '\n';
2878
                                        std::cout << "subnormal shift " << subnormal_reciprocal_shift[es] << '\n';
2879
#endif
2880
                                        if (exponent >= (MIN_EXP_SUBNORMAL - 1) && exponent < MIN_EXP_NORMAL) {
2881
                                                // the value is a subnormal number in this representation
2882
                                        }
2883
                                        else {
2884
                                                // the value is a normal number in this representation
2885
                                        }
2886
                                }
2887
                        }
2888
                        else {
2889
                                // no need to round, but we need to shift left to deliver the bits
2890
                                // cfloat<40,  8> = float
2891
                                // cfloat<48,  9> = float
2892
                                // cfloat<56, 10> = float
2893
                                // cfloat<64, 11> = float
2894
                                // cfloat<64, 10> = double 
2895
                                // can we go from an input subnormal to a cfloat normal? 
2896
                                // yes, for example a cfloat<64,11> assigned to a subnormal float
2897

2898
                                // map exponent into target cfloat encoding
2899
                                constexpr int upshift = fbits - ieee754_parameter<Real>::fbits;
259,434✔
2900
                                uint64_t biasedExponent{ 0 };
259,434✔
2901
                                bool subnormalInTarget = (exponent < MIN_EXP_NORMAL);
259,434✔
2902
                                int denormShift = 0;
259,434✔
2903
                                if (subnormalInTarget) {
259,434✔
2904
                                        biasedExponent = 0;
614✔
2905
                                        denormShift = MIN_EXP_NORMAL - exponent;
614✔
2906
                                }
2907
                                else {
2908
                                        biasedExponent = static_cast<uint64_t>(static_cast<int64_t>(exponent) + EXP_BIAS);
258,820✔
2909
                                }
2910
                                // output processing
2911
                                if constexpr (nbits < 65) {
2912
                                        // we can compose the bits in a native 64-bit unsigned integer
2913
                                        // common case: normal to normal
2914
                                        // reference example: nbits = 40, es = 8, fbits = 31: rhs = float fbits = 23; shift left by (31 - 23) = 8
2915

2916
                                        if (rawExponent != 0) {
199,183✔
2917
                                                // rhs is a normal encoding (or a normalized former-subnormal)
2918
                                                uint64_t raw{ s ? 1ull : 0ull };
199,183✔
2919
                                                raw <<= es;
199,183✔
2920
                                                raw |= biasedExponent;
199,183✔
2921
                                                raw <<= fbits;
199,183✔
2922
                                                if (subnormalInTarget) {
199,183✔
2923
                                                        // add the hidden bit and denormalize the fraction
2924
                                                        uint64_t significand = rawFraction | ieee754_parameter<Real>::hmask;
603✔
2925
                                                        int netShift = upshift - denormShift;
603✔
2926
                                                        if (netShift >= 0) {
603✔
2927
                                                                rawFraction = significand << netShift;
603✔
2928
                                                        }
2929
                                                        else {
2930
                                                                // right shift with round-to-nearest-even
2931
                                                                int rshift = -netShift;
×
2932
                                                                uint64_t lsbMask = (1ull << rshift);
×
2933
                                                                bool lsb    = (significand & lsbMask) != 0;
×
2934
                                                                bool guard  = (rshift >= 1) && ((significand & (lsbMask >> 1)) != 0);
×
2935
                                                                bool round  = (rshift >= 2) && ((significand & (lsbMask >> 2)) != 0);
×
2936
                                                                bool sticky = (rshift >= 3) && ((significand & ((lsbMask >> 2) - 1)) != 0);
×
2937
                                                                rawFraction = significand >> rshift;
×
2938
                                                                if (guard) {
×
2939
                                                                        if (lsb && (!round && !sticky)) ++rawFraction; // round to even
×
2940
                                                                        if (round || sticky) ++rawFraction;
×
2941
                                                                }
2942
                                                        }
2943
                                                }
2944
                                                else {
2945
                                                        rawFraction <<= upshift;
198,580✔
2946
                                                }
2947
                                                raw |= rawFraction;
199,183✔
2948
                                                setbits(raw);
199,183✔
2949
                                        }
2950
                                        else {
2951
                                                // rhs is a subnormal (unreachable after normalization above, kept for safety)
2952
                                        }
2953
                                }
2954
                                else {
2955
                                        // we need to write and shift bits into place
2956
                                        // use cases are cfloats like cfloat<80, 11, bt>
2957
                                        // even though the bits that come in are single or double precision
2958
                                        // we need to write the fields and then shifting them in place
2959
                                        // 
2960
                                        // common case: normal to normal
2961
                                        if constexpr (bitsInBlock < 64) {
2962
                                                if (rawExponent != 0) {
60,251✔
2963
                                                        // reference example: nbits = 128, es = 15, fbits = 112: rhs = float: shift left by (112 - 23) = 89
2964
                                                        setbits(biasedExponent);
60,251✔
2965
                                                        shiftLeft(fbits);
60,251✔
2966
                                                        // if subnormal in target, add hidden bit before copying
2967
                                                        uint64_t fractionToCopy = subnormalInTarget
60,251✔
2968
                                                                ? (rawFraction | ieee754_parameter<Real>::hmask)
60,251✔
2969
                                                                : rawFraction;
2970
                                                        // shift fraction bits
2971
                                                        int bitsToShift = subnormalInTarget ? (upshift - denormShift) : upshift;
60,251✔
2972
                                                        // for subnormal targets near minpos, bitsToShift can be negative (right shift)
2973
                                                        // apply rounding on the 64-bit significand before splitting into blocks
2974
                                                        if (bitsToShift < 0) {
60,251✔
2975
                                                                int rshift = -bitsToShift;
×
2976
                                                                uint64_t lsbMask = (1ull << rshift);
×
2977
                                                                bool lsb    = (fractionToCopy & lsbMask) != 0;
×
2978
                                                                bool guard  = (rshift >= 1) && ((fractionToCopy & (lsbMask >> 1)) != 0);
×
2979
                                                                bool round  = (rshift >= 2) && ((fractionToCopy & (lsbMask >> 2)) != 0);
×
2980
                                                                bool sticky = (rshift >= 3) && ((fractionToCopy & ((lsbMask >> 2) - 1)) != 0);
×
2981
                                                                fractionToCopy >>= rshift;
×
2982
                                                                if (guard) {
×
2983
                                                                        if (lsb && (!round && !sticky)) ++fractionToCopy; // round to even
×
2984
                                                                        if (round || sticky) ++fractionToCopy;
×
2985
                                                                }
2986
                                                                bitsToShift = 0;
×
2987
                                                        }
2988
                                                        bt fractionBlock[nrBlocks]{ 0 };
60,251✔
2989
                                                        // copy fraction bits
2990
                                                        unsigned blocksRequired = (8 * sizeof(fractionToCopy) + 1) / bitsInBlock;
60,251✔
2991
                                                        unsigned maxBlockNr = (blocksRequired < nrBlocks ? blocksRequired : nrBlocks);
60,251✔
2992
                                                        uint64_t mask = static_cast<uint64_t>(ALL_ONES); // set up the block mask
60,251✔
2993
                                                        unsigned shift = 0;
60,251✔
2994
                                                        for (unsigned i = 0; i < maxBlockNr; ++i) {
340,805✔
2995
                                                                fractionBlock[i] = bt((mask & fractionToCopy) >> shift);
280,554✔
2996
                                                                mask <<= bitsInBlock;
280,554✔
2997
                                                                shift += bitsInBlock;
280,554✔
2998
                                                        }
2999
                                                        if (bitsToShift >= static_cast<int>(bitsInBlock)) {
60,251✔
3000
                                                                int blockShift = static_cast<int>(bitsToShift / bitsInBlock);
50,227✔
3001
                                                                for (int i = MSU; i >= blockShift; --i) {
271,048✔
3002
                                                                        fractionBlock[i] = fractionBlock[i - blockShift];
220,821✔
3003
                                                                }
3004
                                                                for (int i = blockShift - 1; i >= 0; --i) {
160,782✔
3005
                                                                        fractionBlock[i] = bt(0);
110,555✔
3006
                                                                }
3007
                                                                // adjust the shift
3008
                                                                bitsToShift -= blockShift * bitsInBlock;
50,227✔
3009
                                                        }
3010
                                                        if (bitsToShift > 0) {
60,251✔
3011
                                                                // construct the mask for the upper bits in the block that need to move to the higher word
3012
                                                                bt bitsToMoveMask = bt(ALL_ONES << (bitsInBlock - bitsToShift));
40,222✔
3013
                                                                for (unsigned i = MSU; i > 0; --i) {
211,144✔
3014
                                                                        fractionBlock[i] <<= bitsToShift;
170,922✔
3015
                                                                        // mix in the bits from the right
3016
                                                                        bt fracbits = static_cast<bt>(bitsToMoveMask & fractionBlock[i - 1]); // operator & yields an int
170,922✔
3017
                                                                        fractionBlock[i] |= (fracbits >> (bitsInBlock - bitsToShift));
170,922✔
3018
                                                                }
3019
                                                                fractionBlock[0] <<= bitsToShift;
40,222✔
3020
                                                        }
3021
                                                        // OR the bits in
3022
                                                        for (unsigned i = 0; i <= MSU; ++i) {
421,705✔
3023
                                                                _block[i] |= fractionBlock[i];
361,454✔
3024
                                                        }
3025
                                                        // enforce precondition for fast comparison by properly nulling bits that are outside of nbits
3026
                                                        _block[MSU] &= MSU_MASK;
60,251✔
3027
                                                        // finally, set the sign bit
3028
                                                        setsign(s);
60,251✔
3029
                                                }
3030
                                                else {
3031
                                                        // rhs is a subnormal
3032
                //                                        std::cerr << "rhs is a subnormal : " << to_binary(rhs) << " : " << rhs << '\n';
3033
                                                }
3034
                                        }
3035
                                        else {
3036
                                                // BlockType is incorrect
3037
                                        }
3038
                                }
3039
                        }
3040
                        // post-processing results to implement saturation and projection after rounding logic
3041
                        if constexpr (isSaturating) {
3042
                                if (isinf(INF_TYPE_POSITIVE) || isnan(NAN_TYPE_QUIET)) {
20,168,379✔
3043
                                        maxpos();
888✔
3044
                                }
3045
                                else if (isinf(INF_TYPE_NEGATIVE) || isnan(NAN_TYPE_SIGNALLING)) {
20,167,491✔
3046
                                        maxneg();
888✔
3047
                                }
3048
                        }
3049
                        else {
3050
                                if (isnan(NAN_TYPE_QUIET)) {
109,862,140✔
3051
                                        setinf(false);
301,420✔
3052
                                }
3053
                                else if (isnan(NAN_TYPE_SIGNALLING)) {
109,560,720✔
3054
                                        setinf(true);
300,372✔
3055
                                }
3056
                        }
3057
                        return *this;  // TODO: unreachable in some configurations        
130,030,519✔
3058
                }
3059
        }
3060

3061
        // post-processing results to implement saturation and projection after rounding logic
3062
        // arithmetic bit operations can't produce NaN encodings, so we need to re-interpret
3063
        // these encodings and 'project' them to the proper values.
3064
        void constexpr post_process() noexcept {
3065
                if constexpr (isSaturating) {
3066
                        if (isinf(INF_TYPE_POSITIVE) || isnan(NAN_TYPE_QUIET)) {
3067
                                maxpos();
3068
                        }
3069
                        else if (isinf(INF_TYPE_NEGATIVE) || isnan(NAN_TYPE_SIGNALLING)) {
3070
                                maxneg();
3071
                        }
3072
                }
3073
                else {
3074
                        if (isnan(NAN_TYPE_QUIET)) {
3075
                                setinf(false);
3076
                        }
3077
                        else if (isnan(NAN_TYPE_SIGNALLING)) {
3078
                                setinf(true);
3079
                        }
3080
                }
3081
        }
3082

3083
protected:
3084

3085
        /// <summary>
3086
        /// round a set of source bits to the present representation.
3087
        /// srcbits is the number of bits of significant in the source representation
3088
        /// </summary>
3089
        /// <typeparam name="StorageType"></typeparam>
3090
        /// <param name="raw"></param>
3091
        /// <returns></returns>
3092
        template<unsigned srcbits, typename StorageType>
3093
        constexpr uint64_t round(StorageType raw, int& exponent) noexcept {
33,514✔
3094
                if constexpr (fhbits < srcbits) {
3095
                        // round to even: lsb guard round sticky
3096
                    // collect guard, round, and sticky bits
3097
                    // this same logic will work for the case where
3098
                    // we only have a guard bit and no round and sticky bits
3099
                    // because the mask logic will make round and sticky both 0
3100
                        constexpr uint32_t shift = srcbits - fhbits - 1ull;
30,530✔
3101
                        StorageType mask = (StorageType(1ull) << shift);
30,530✔
3102
                        bool guard = (mask & raw);
30,530✔
3103
                        mask >>= 1;
30,530✔
3104
                        bool round = (mask & raw);
30,530✔
3105
                        if constexpr (shift > 1u) { // protect against a negative shift
3106
                                StorageType allones(StorageType(~0));
30,530✔
3107
                                mask = StorageType(allones << (shift - 2));
30,530✔
3108
                                mask = ~mask;
30,530✔
3109
                        }
3110
                        else {
3111
                                mask = 0;
3112
                        }
3113
                        bool sticky = (mask & raw);
30,530✔
3114

3115
                        raw >>= (shift + 1);  // shift out the bits we are rounding away
30,530✔
3116
                        bool lsb = (raw & 0x1u);
30,530✔
3117
                        //  ... lsb | guard  round sticky   round
3118
                        //       x     0       x     x       down
3119
                        //       0     1       0     0       down  round to even
3120
                        //       1     1       0     0        up   round to even
3121
                        //       x     1       0     1        up
3122
                        //       x     1       1     0        up
3123
                        //       x     1       1     1        up
3124
                        if (guard) {
30,530✔
3125
                                if (lsb && (!round && !sticky)) ++raw; // round to even
3,168✔
3126
                                if (round || sticky) ++raw;
3,168✔
3127
                                if (raw == (1ull << fbits)) { // overflow
3,168✔
3128
                                        ++exponent;
3,168✔
3129
                                        raw >>= 1u;
3,168✔
3130
                                }
3131
                        }
3132
                }
3133
                else {
3134
                        // Target has more precision than source - need to left-shift to align
3135
                        // For large types where fhbits > 64, the fraction bits cannot fit in
3136
                        // a 64-bit raw after shifting. In this case, skip the shift and let
3137
                        // convert_signed/unsigned_integer place bits using setbit().
3138
                        // The caller positions fraction bits at the top of srcbits, and we
3139
                        // need to ensure they don't overflow 64 bits after our shift.
3140
                        constexpr unsigned shift = fhbits - srcbits;
2,984✔
3141
                        if constexpr (fhbits <= 64 && shift < 64) {
3142
                                raw <<= shift;
2,583✔
3143
                        }
3144
                        // else: raw stays as-is; caller will extract bits and place them
3145
                }
3146
                uint64_t significant = raw;
33,514✔
3147
                return significant;
33,514✔
3148
        }
3149

3150
        template<typename ArgumentBlockType>
3151
        constexpr void copyBits(ArgumentBlockType v) {
3152
                unsigned blocksRequired = (8 * sizeof(v) + 1 ) / bitsInBlock;
3153
                unsigned maxBlockNr = (blocksRequired < nrBlocks ? blocksRequired : nrBlocks);
3154
                bt b{ 0ul }; b = bt(~b);
3155
                ArgumentBlockType mask = ArgumentBlockType(b);
3156
                unsigned shift = 0;
3157
                for (unsigned i = 0; i < maxBlockNr; ++i) {
3158
                        _block[i] = bt((mask & v) >> shift);
3159
                        mask <<= bitsInBlock;
3160
                        shift += bitsInBlock;
3161
                }
3162
        }
3163
        void shiftLeft(int leftShift) {
3164
                if (leftShift == 0) return;
3165
                if (leftShift < 0) return shiftRight(-leftShift);
3166
                if (leftShift > long(nbits)) leftShift = nbits; // clip to max
3167
                if (leftShift >= long(bitsInBlock)) {
3168
                        int blockShift = leftShift / static_cast<int>(bitsInBlock);
3169
                        for (signed i = signed(MSU); i >= blockShift; --i) {
3170
                                _block[i] = _block[i - blockShift];
3171
                        }
3172
                        for (signed i = blockShift - 1; i >= 0; --i) {
3173
                                _block[i] = bt(0);
3174
                        }
3175
                        // adjust the shift
3176
                        leftShift -= (long)(blockShift * bitsInBlock);
3177
                        if (leftShift == 0) return;
3178
                }
3179
                // construct the mask for the upper bits in the block that need to move to the higher word
3180
//                bt mask = static_cast<bt>(0xFFFFFFFFFFFFFFFFull << (bitsInBlock - leftShift));
3181
                bt mask = ALL_ONES;
3182
                mask <<= (bitsInBlock - leftShift);
3183
                for (unsigned i = MSU; i > 0; --i) {
3184
                        _block[i] <<= leftShift;
3185
                        // mix in the bits from the right
3186
                        bt bits = static_cast<bt>(mask & _block[i - 1]);
3187
                        _block[i] |= (bits >> (bitsInBlock - leftShift));
3188
                }
3189
                _block[0] <<= leftShift;
3190
        }
3191
        void shiftRight(int rightShift) {
3192
                if (rightShift == 0) return;
3193
                if (rightShift < 0) return shiftLeft(-rightShift);
3194
                if (rightShift >= long(nbits)) {
3195
                        setzero();
3196
                        return;
3197
                }
3198
                bool signext = sign();
3199
                unsigned blockShift = 0;
3200
                if (rightShift >= long(bitsInBlock)) {
3201
                        blockShift = rightShift / bitsInBlock;
3202
                        if (MSU >= blockShift) {
3203
                                // shift by blocks
3204
                                for (unsigned i = 0; i <= MSU - blockShift; ++i) {
3205
                                        _block[i] = _block[i + blockShift];
3206
                                }
3207
                        }
3208
                        // adjust the shift
3209
                        rightShift -= (long)(blockShift * bitsInBlock);
3210
                        if (rightShift == 0) {
3211
                                // fix up the leading zeros if we have a negative number
3212
                                if (signext) {
3213
                                        // rightShift is guaranteed to be less than nbits
3214
                                        rightShift += (long)(blockShift * bitsInBlock);
3215
                                        for (unsigned i = nbits - rightShift; i < nbits; ++i) {
3216
                                                this->setbit(i);
3217
                                        }
3218
                                }
3219
                                else {
3220
                                        // clean up the blocks we have shifted clean
3221
                                        rightShift += (long)(blockShift * bitsInBlock);
3222
                                        for (unsigned i = nbits - rightShift; i < nbits; ++i) {
3223
                                                this->setbit(i, false);
3224
                                        }
3225
                                }
3226
                                return;  // shift was aligned to block boundary, no per-bit shift needed
3227
                        }
3228
                }
3229

3230
                bt mask = ALL_ONES;
3231
                mask >>= (bitsInBlock - rightShift); // this is a mask for the lower bits in the block that need to move to the lower word
3232
                for (unsigned i = 0; i < MSU; ++i) {  // TODO: can this be improved? we should not have to work on the upper blocks in case we block shifted
3233
                        _block[i] >>= rightShift;
3234
                        // mix in the bits from the left
3235
                        bt bits = static_cast<bt>(mask & _block[i + 1]); // & operator returns an int
3236
                        _block[i] |= (bits << (bitsInBlock - rightShift));
3237
                }
3238
                _block[MSU] >>= rightShift;
3239

3240
                // fix up the leading zeros if we have a negative number
3241
                if (signext) {
3242
                        // bitsToShift is guaranteed to be less than nbits
3243
                        rightShift += (long)(blockShift * bitsInBlock);
3244
                        for (unsigned i = nbits - rightShift; i < nbits; ++i) {
3245
                                this->setbit(i);
3246
                        }
3247
                }
3248
                else {
3249
                        // clean up the blocks we have shifted clean
3250
                        rightShift += (long)(blockShift * bitsInBlock);
3251
                        for (unsigned i = nbits - rightShift; i < nbits; ++i) {
3252
                                this->setbit(i, false);
3253
                        }
3254
                }
3255

3256
                // enforce precondition for fast comparison by properly nulling bits that are outside of nbits
3257
                _block[MSU] &= MSU_MASK;
3258
        }
3259

3260
        // calculate the integer power 2 ^ b using exponentiation by squaring
3261
        double ipow(int exponent) const {
524,863✔
3262
                bool negative = (exponent < 0);
524,863✔
3263
                exponent = negative ? -exponent : exponent;
524,863✔
3264
                double result(1.0);
524,863✔
3265
                double base = 2.0;
524,863✔
3266
                for (;;) {
3267
                        if (exponent % 2) result *= base;
3,846,088✔
3268
                        exponent >>= 1;
3,846,088✔
3269
                        if (exponent == 0) break;
3,846,088✔
3270
                        base *= base;
3,321,225✔
3271
                }
3272
                return (negative ? (1.0 / result) : result);
524,863✔
3273
        }
3274

3275
        template<BlockTripleOperator btop, typename TargetBlockType = bt>
3276
        constexpr void blockcopy(blocktriple<fbits, btop, TargetBlockType>& tgt) const {
1,056,246✔
3277
                // brute force copy of blocks
3278
                if constexpr (1 == fBlocks) {
3279
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0] & FSU_MASK));
1,050,734✔
3280
                }
3281
                else if constexpr (2 == fBlocks) {
3282
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
3,028✔
3283
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1] & FSU_MASK));
3,028✔
3284
                }
3285
                else if constexpr (3 == fBlocks) {
3286
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
204✔
3287
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
204✔
3288
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2] & FSU_MASK));
204✔
3289
                }
3290
                else if constexpr (4 == fBlocks) {
3291
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
1,500✔
3292
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
1,500✔
3293
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2]));
1,500✔
3294
                        tgt.setblock(3, static_cast<TargetBlockType>(_block[3] & FSU_MASK));
1,500✔
3295
                }
3296
                else if constexpr (5 == fBlocks) {
3297
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
×
3298
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
×
3299
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2]));
×
3300
                        tgt.setblock(3, static_cast<TargetBlockType>(_block[3]));
×
3301
                        tgt.setblock(4, static_cast<TargetBlockType>(_block[4] & FSU_MASK));
×
3302
                }
3303
                else if constexpr (6 == fBlocks) {
3304
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
3305
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
3306
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2]));
3307
                        tgt.setblock(3, static_cast<TargetBlockType>(_block[3]));
3308
                        tgt.setblock(4, static_cast<TargetBlockType>(_block[4]));
3309
                        tgt.setblock(5, static_cast<TargetBlockType>(_block[5] & FSU_MASK));
3310
                }
3311
                else if constexpr (7 == fBlocks) {
3312
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
8✔
3313
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
8✔
3314
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2]));
8✔
3315
                        tgt.setblock(3, static_cast<TargetBlockType>(_block[3]));
8✔
3316
                        tgt.setblock(4, static_cast<TargetBlockType>(_block[4]));
8✔
3317
                        tgt.setblock(5, static_cast<TargetBlockType>(_block[5]));
8✔
3318
                        tgt.setblock(6, static_cast<TargetBlockType>(_block[6] & FSU_MASK));
8✔
3319
                }
3320
                else if constexpr (8 == fBlocks) {
3321
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
370✔
3322
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
370✔
3323
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2]));
370✔
3324
                        tgt.setblock(3, static_cast<TargetBlockType>(_block[3]));
370✔
3325
                        tgt.setblock(4, static_cast<TargetBlockType>(_block[4]));
370✔
3326
                        tgt.setblock(5, static_cast<TargetBlockType>(_block[5]));
370✔
3327
                        tgt.setblock(6, static_cast<TargetBlockType>(_block[6]));
370✔
3328
                        tgt.setblock(7, static_cast<TargetBlockType>(_block[7] & FSU_MASK));
370✔
3329
                }
3330
                else {
3331
                        for (unsigned i = 0; i < FSU; ++i) {
4,127✔
3332
                                tgt.setblock(i, static_cast<TargetBlockType>(_block[i]));
3,725✔
3333
                        }
3334
                        tgt.setblock(FSU, static_cast<TargetBlockType>(_block[FSU] & FSU_MASK));
402✔
3335
                }
3336
        }
1,056,246✔
3337

3338
private:
3339
        bt _block[nrBlocks];
3340

3341
        //////////////////////////////////////////////////////////////////////////////
3342
        // friend functions
3343

3344
        // template parameters need names different from class template parameters (for gcc and clang)
3345
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3346
        friend std::ostream& operator<< (std::ostream& ostr, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& r);
3347
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3348
        friend std::istream& operator>> (std::istream& istr, cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& r);
3349

3350
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3351
        friend bool operator==(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3352
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3353
        friend bool operator!=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3354
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3355
        friend bool operator< (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3356
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3357
        friend bool operator> (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3358
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3359
        friend bool operator<=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3360
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3361
        friend bool operator>=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3362
};
3363

3364
///////////////////////////// IOSTREAM operators ///////////////////////////////////////////////
3365

3366
// convert cfloat to decimal fixpnt string, i.e. "-1234.5678"
3367
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3368
std::string to_decimal_fixpnt_string(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& value, long long precision) {
3369
        constexpr unsigned fbits = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::fbits;
3370
        constexpr unsigned bias = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::EXP_BIAS;
3371
        std::stringstream str;
3372
        if (value.iszero()) {
3373
                str << '0';
3374
                return str.str();
3375
        }
3376
        if (value.sign()) str << '-';
3377

3378
        // construct the discretization levels of the fraction part
3379
        support::decimal range, discretizationLevels, step;
3380
        // create the decimal range we are discretizing
3381
        range.setdigit(1);
3382
        range.shiftLeft(fbits); // the decimal range of the fraction
3383
        discretizationLevels.powerOf2(fbits); // calculate the discretization levels of this range
3384
        step = div(range, discretizationLevels);
3385
        // now construct the value of this range by adding the fraction samples
3386
        support::decimal partial, multiplier;
3387
        partial.setzero();  // if you just want the fraction
3388
        multiplier.setdigit(1);
3389
        // convert the fraction part
3390
        for (unsigned i = 0; i < fbits; ++i) {
3391
                if (value.at(i)) {
3392
                        support::add(partial, multiplier);
3393
                }
3394
                support::add(multiplier, multiplier);
3395
        }
3396
        if (value.isdenormal()) {
3397
                support::mul(partial, step);
3398
                support::decimal scale;
3399
                scale.powerOf2(bias - 1ull);
3400
                partial = support::div(partial, scale);
3401
        } 
3402
        else {
3403
                support::add(partial, multiplier); // add the hidden bit
3404
                support::mul(partial, step);
3405
                support::decimal scale;
3406
                int exponent = value.scale();
3407
                if (exponent < 0) {
3408
                        scale.powerOf2(static_cast<unsigned>(-exponent));
3409
                        partial = support::div(partial, scale);
3410
                }
3411
                else {
3412
                        scale.powerOf2(static_cast<unsigned>(exponent));
3413
                        support::mul(partial, scale);
3414
                }
3415
        }
3416

3417
        // the radix is at fbits
3418
        // The partial represents the parts in the range, so we can deduce
3419
        // the number of leading zeros by comparing to the length of range
3420
        int nrLeadingZeros = static_cast<int>(range.size()) - static_cast<int>(partial.size()) - 1;
3421
        if (nrLeadingZeros >= 0) str << "0.";
3422
        for (int i = 0; i < nrLeadingZeros; ++i) str << '0';
3423
        int digitsWritten = (nrLeadingZeros > 0) ? nrLeadingZeros : 0;
3424
        int position = static_cast<int>(partial.size()) - 1;
3425
        for (support::decimal::const_reverse_iterator rit = partial.rbegin(); rit != partial.rend(); ++rit) {
3426
                str << (int)*rit;
3427
                ++digitsWritten;
3428
                if (position == fbits) str << '.';
3429
                --position;
3430
        }
3431
        if (digitsWritten < precision) { // deal with trailing 0s
3432
                for (unsigned i = static_cast<unsigned>(digitsWritten); i < fbits; ++i) {
3433
                        str << '0';
3434
                }
3435
        }
3436

3437
        return str.str();
3438
}
3439

3440
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3441
std::string to_string(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& value, long long precision) {
3442
        constexpr unsigned fbits = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::fbits;
3443
        std::stringstream str;
3444
        if (value.iszero()) {
3445
                str << '0';
3446
                return str.str();
3447
        }
3448
        if (value.sign()) str << '-';
3449

3450
        // denormalize the number to gain access to the most sigificant digits
3451
        // 1.ffff^e
3452
        // scale is e
3453
        // lsbScale is e - fbits
3454
        // shift to get lsb to position 2^0 = (e - fbits)
3455
        std::int64_t scale = value.scale();
3456
//        std::int64_t shift = scale + fbits; // we want the lsb at 2^0
3457
        std::int64_t lsbScale = scale - fbits;  // scale of the lsb
3458
        support::decimal partial, multiplier;
3459
        partial.setzero();
3460

3461
        multiplier.powerOf2(lsbScale);
3462

3463
        // convert the fraction bits 
3464
        for (unsigned i = 0; i < fbits; ++i) {
3465
                if (value.at(i)) {
3466
                        support::add(partial, multiplier);
3467
                }
3468
                support::add(multiplier, multiplier);
3469
        }
3470
        if (!value.isdenormal()) {
3471
                support::add(partial, multiplier); // add the hidden bit
3472
        }
3473
        str << partial;
3474
        return str.str();
3475
}
3476

3477
//////////////////////////////////////////////////////////////////////////////////////////////
3478
/// stream operators
3479

3480
// ostream output generates an ASCII format for the floating-point argument
3481
// Uses native binary-to-decimal conversion via blocktriple::to_string()
3482
// to produce exact output for all cfloat sizes without double conversion.
3483
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3484
inline std::ostream& operator<<(std::ostream& ostr, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& v) {
3,122✔
3485
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3486
        constexpr unsigned cfbits = Cfloat::fbits;
3,122✔
3487

3488
        std::streamsize prec  = ostr.precision();
3,122✔
3489
        std::streamsize width = ostr.width();
3,122✔
3490
        std::ios_base::fmtflags ff = ostr.flags();
3,122✔
3491
        bool bFixed      = (ff & std::ios_base::fixed) == std::ios_base::fixed;
3,122✔
3492
        bool bScientific = (ff & std::ios_base::scientific) == std::ios_base::scientific;
3,122✔
3493
        bool bShowpos    = (ff & std::ios_base::showpos) != 0;
3,122✔
3494
        bool bUppercase  = (ff & std::ios_base::uppercase) != 0;
3,122✔
3495
        bool bInternal   = (ff & std::ios_base::internal) != 0;
3,122✔
3496
        bool bLeft       = (ff & std::ios_base::left) != 0;
3,122✔
3497
        char fillChar    = ostr.fill();
3,122✔
3498

3499
        if constexpr (cfbits == 0) {
3500
                // degenerate cfloat with no fraction bits: fall back to double
3501
                std::ostringstream oss;
3502
                oss.precision(prec);
3503
                if (bFixed) oss << std::fixed;
3504
                if (bScientific) oss << std::scientific;
3505
                if (bUppercase) oss << std::uppercase;
3506
                if (bShowpos) oss << std::showpos;
3507
                oss << static_cast<double>(v);
3508
                std::string s = oss.str();
3509
                if (width > 0 && s.length() < static_cast<size_t>(width)) {
3510
                        size_t pad = static_cast<size_t>(width) - s.length();
3511
                        if (bLeft) { s.append(pad, fillChar); }
3512
                        else { s.insert(0u, pad, fillChar); }
3513
                }
3514
                return ostr << s;
3515
        } else {
3516
                blocktriple<cfbits, BlockTripleOperator::REP, bt> a;
3,122✔
3517
                v.normalize(a);
3,122✔
3518
                return ostr << a.to_string(prec, width, bFixed, bScientific,
6,244✔
3519
                                           bInternal, bLeft, bShowpos, bUppercase, fillChar);
6,244✔
3520
        }
3521
}
3522

3523
// parse a cfloat from a string in either cfloat hex format (nbits.esxHEXVALUEc)
3524
// or a decimal floating-point representation
3525
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3526
bool parse(const std::string& txt, cfloat<nbits,es,bt,hasSubnormals,hasMaxExpValues,isSaturating>& v) {
24✔
3527
        // check if the txt is of the native cfloat form: nbits.esX[0x]hexvaluec
3528
        std::regex cfloat_regex(R"(^[0-9]+\.[0-9]+[xX](0[xX])?[0-9A-Fa-f]+c?$)");
24✔
3529
        if (std::regex_match(txt, cfloat_regex)) {
24✔
3530
                // found a cfloat representation: parse nbits.esxHEXVALUEc
3531
                std::string nbitsStr, esStr, bitStr;
3✔
3532
                auto it = txt.begin();
3✔
3533
                for (; it != txt.end(); ++it) {
9✔
3534
                        if (*it == '.') break;
9✔
3535
                        nbitsStr.append(1, *it);
6✔
3536
                }
3537
                for (++it; it != txt.end(); ++it) {
6✔
3538
                        if (*it == 'x' || *it == 'X') break;
6✔
3539
                        esStr.append(1, *it);
3✔
3540
                }
3541
                for (++it; it != txt.end(); ++it) {
21✔
3542
                        if (*it == 'c') break;
21✔
3543
                        bitStr.append(1, *it);
18✔
3544
                }
3545
                unsigned nbits_in = 0;
3✔
3546
                unsigned es_in = 0;
3✔
3547
                {
3548
                        std::istringstream ss(nbitsStr);
3✔
3549
                        ss >> nbits_in;
3✔
3550
                        if (ss.fail()) return false;
3✔
3551
                }
3✔
3552
                {
3553
                        std::istringstream ss(esStr);
3✔
3554
                        ss >> es_in;
3✔
3555
                        if (ss.fail()) return false;
3✔
3556
                }
3✔
3557
                // native cfloat form must match target configuration
3558
                if (nbits_in != nbits || es_in != es) return false;
3✔
3559
                uint64_t raw = 0;
2✔
3560
                std::istringstream ss(bitStr);
2✔
3561
                ss >> std::hex >> raw;
2✔
3562
                if (ss.fail()) return false;
2✔
3563
                ss >> std::ws;
2✔
3564
                if (!ss.eof()) return false;
2✔
3565
                v.setbits(raw);
2✔
3566
                return true;
2✔
3567
        }
3✔
3568
        else {
3569
                // assume it is a float/double/long double representation
3570
                std::istringstream ss(txt);
21✔
3571
                double d;
3572
                ss >> d;
21✔
3573
                if (ss.fail()) return false;
21✔
3574
                ss >> std::ws;
21✔
3575
                if (!ss.eof()) return false;
21✔
3576
                v = d;
20✔
3577
                return true;
20✔
3578
        }
21✔
3579
}
24✔
3580

3581
// read an ASCII float or cfloat format: nbits.esxNN...NNc, for example: 16.5x7C00c
3582
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3583
inline std::istream& operator>>(std::istream& istr, cfloat<nbits,es,bt,hasSubnormals,hasMaxExpValues,isSaturating>& v) {
17✔
3584
        std::string txt;
17✔
3585
        istr >> txt;
17✔
3586
        if (!parse(txt, v)) {
17✔
3587
                std::cerr << "unable to parse -" << txt << "- into a cfloat value\n";
×
3588
        }
3589
        return istr;
17✔
3590
}
17✔
3591

3592
// encoding helpers
3593

3594
// return the Unit in the Last Position
3595
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3596
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ulp(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& a) {
49✔
3597
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> b(a);
49✔
3598
        return ++b - a;
86✔
3599
}
3600

3601
// transform cfloat to a binary representation
3602
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3603
inline std::string to_binary(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& number, bool nibbleMarker = false) {
1,506✔
3604
        std::stringstream s;
1,506✔
3605
        s << "0b";
1,506✔
3606
        unsigned index = nbits;
1,506✔
3607
        s << (number.at(--index) ? '1' : '0') << '.';
1,506✔
3608

3609
        for (int i = int(es) - 1; i >= 0; --i) {
10,495✔
3610
                s << (number.at(--index) ? '1' : '0');
8,989✔
3611
                if (i > 0 && (i % 4) == 0 && nibbleMarker) s << '\'';
8,989✔
3612
        }
3613

3614
        s << '.';
1,506✔
3615

3616
        constexpr int fbits = nbits - 1ull - es;
1,506✔
3617
        for (int i = fbits - 1; i >= 0; --i) {
32,620✔
3618
                s << (number.at(--index) ? '1' : '0');
31,114✔
3619
                if (i > 0 && (i % 4) == 0 && nibbleMarker) s << '\'';
31,114✔
3620
        }
3621

3622
        return s.str();
3,012✔
3623
}
1,506✔
3624

3625
// transform a cfloat into a triple representation
3626
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3627
inline std::string to_triple(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& number, bool nibbleMarker = true) {
3✔
3628
        std::stringstream s;
3✔
3629
        blocktriple<cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::fbits, BlockTripleOperator::REP, bt> triple;
3✔
3630
        number.normalize(triple);
3✔
3631
        s << to_triple(triple, nibbleMarker);
3✔
3632
        return s.str();
6✔
3633
}
3✔
3634

3635
// Magnitude of a cfloat (equivalent to turning the sign bit off).
3636
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3637
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>
3638
abs(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& v) {
16,939✔
3639
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> a(v);
16,939✔
3640
        a.setsign(false);
16,939✔
3641
        return a;
16,939✔
3642
}
3643

3644
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3645
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>
3646
fabs(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> v) {
40✔
3647
        return abs(v);
40✔
3648
}
3649

3650
////////////////////// debug helpers
3651

3652
// convenience method to gain access to the values of the constexpr variables that govern the cfloat behavior
3653
template<unsigned nbits, unsigned es, typename bt = uint8_t, bool hasSubnormals = false, bool hasMaxExpValues = false, bool isSaturating = false>
3654
void ReportCfloatClassParameters() {
3655
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> a;
3656
        a.constexprClassParameters();
3657
}
3658

3659
//////////////////////////////////////////////////////
3660
/// cfloat - cfloat binary logic operators
3661

3662
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3663
inline bool operator==(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) {
127,134,584✔
3664
        if (lhs.isnan() || rhs.isnan()) return false;
127,134,584✔
3665
        if (lhs.iszero() && rhs.iszero()) return true; // +0 == -0 per IEEE-754 §5.11
125,479,499✔
3666
        for (unsigned i = 0; i < lhs.nrBlocks; ++i) {
385,791,082✔
3667
                if (lhs._block[i] != rhs._block[i]) {
263,569,038✔
3668
                        return false;
2,104,956✔
3669
                }
3670
        }
3671
        return true;
122,222,044✔
3672
}
3673
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3674
inline bool operator!=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return !operator==(lhs, rhs); }
125,736,416✔
3675
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3676
inline bool operator< (const cfloat<nnbits, nes, nbt, nsub, nsup, nsat>& lhs, const cfloat<nnbits, nes, nbt, nsub, nsup, nsat>& rhs) {
5,568,897✔
3677
        if (lhs.isnan() || rhs.isnan()) return false;
5,568,897✔
3678
        // need this as arithmetic difference is defined as snan(indeterminate)
3679
        if (lhs.isinf(INF_TYPE_NEGATIVE) && rhs.isinf(INF_TYPE_NEGATIVE)) return false;
5,453,389✔
3680
        if (lhs.isinf(INF_TYPE_POSITIVE) && rhs.isinf(INF_TYPE_POSITIVE)) return false;
5,453,375✔
3681
        if constexpr (nsub) {
3682
                cfloat<nnbits, nes, nbt, nsub, nsup, nsat> diff = (lhs - rhs);
5,384,416✔
3683
                return (!diff.iszero() && diff.sign()) ? true : false;  // got to guard against -0
5,384,416✔
3684
        }
3685
        else {
3686
                if (lhs.iszero() && rhs.iszero()) return false;  // we need to 'collapse' all zero encodings
68,926✔
3687
                if (lhs.sign() && !rhs.sign()) return true;
68,918✔
3688
                if (!lhs.sign() && rhs.sign()) return false;
51,695✔
3689
                bool positive = lhs.ispos();
34,472✔
3690
                if (positive) {
34,472✔
3691
                        if (lhs.scale() < rhs.scale()) return true;
17,252✔
3692
                        if (lhs.scale() > rhs.scale()) return false;
12,786✔
3693
                }
3694
                else {
3695
                        if (lhs.scale() > rhs.scale()) return true;
17,220✔
3696
                        if (lhs.scale() < rhs.scale()) return false;
12,770✔
3697
                }
3698
                // sign and scale are the same
3699
                if (lhs.scale() == rhs.scale()) {
16,641✔
3700
                        // compare fractions: we do not have subnormals, so we can ignore the hidden bit
3701
                        blockbinary<nnbits - 1ull - nes, nbt> l, r;
3702
                        lhs.fraction(l);
16,641✔
3703
                        rhs.fraction(r);
16,641✔
3704
                        blockbinary<nnbits - nes, nbt> ll, rr; // fbits + 1 so we can 0 extend to honor 2's complement encoding of blockbinary
3705
                        ll.assignWithoutSignExtend(l);
16,641✔
3706
                        rr.assignWithoutSignExtend(r);
16,641✔
3707
                        return (positive ? (ll < rr) : (ll > rr));
16,641✔
3708
                }
3709
                return false;
×
3710
        }
3711
}
3712
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3713
inline bool operator> (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { 
2,794,589✔
3714
        if (lhs.isnan() || rhs.isnan()) return false;
2,794,589✔
3715
        // need this as arithmetic difference is defined as snan(indeterminate)
3716
        if (lhs.isinf(INF_TYPE_NEGATIVE) && rhs.isinf(INF_TYPE_NEGATIVE)) return false;
2,786,475✔
3717
        if (lhs.isinf(INF_TYPE_POSITIVE) && rhs.isinf(INF_TYPE_POSITIVE)) return false;
2,786,461✔
3718
        return  operator< (rhs, lhs); 
2,786,445✔
3719
}
3720
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3721
inline bool operator<=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { 
1,398,017✔
3722
        if (lhs.isnan() || rhs.isnan()) return false;
1,398,017✔
3723
        return !operator> (lhs, rhs); 
1,389,917✔
3724
}
3725
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3726
inline bool operator>=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) {
1,399,400✔
3727
        if (lhs.isnan() || rhs.isnan()) return false;
1,399,400✔
3728
        return !operator< (lhs, rhs); 
1,391,294✔
3729
}
3730

3731
//////////////////////////////////////////////////////
3732
/// cfloat - cfloat binary arithmetic operators
3733

3734
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3735
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
9,974,529✔
3736
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
9,974,529✔
3737
        sum += rhs;
9,974,529✔
3738
        return sum;
9,974,529✔
3739
}
3740
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3741
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
8,460,600✔
3742
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
8,460,600✔
3743
        diff -= rhs;
8,460,600✔
3744
        return diff;
8,460,600✔
3745
}
3746
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3747
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
4,280,467✔
3748
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
4,280,467✔
3749
        mul *= rhs;
4,280,467✔
3750
        return mul;
4,280,467✔
3751
}
3752
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3753
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
5,138,816✔
3754
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
5,138,816✔
3755
        ratio /= rhs;
5,138,816✔
3756
        return ratio;
5,138,815✔
3757
}
3758

3759
/// binary cfloat - literal arithmetic operators
3760

3761
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3762
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(float lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3763
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
3764
        sum += rhs;
3765
        return sum;
3766
}
3767
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3768
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(float lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3769
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3770
        diff -= rhs;
3771
        return diff;
3772
}
3773
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3774
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(float lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3775
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
3776
        mul *= rhs;
3777
        return mul;
3778
}
3779
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3780
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(float lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
2✔
3781
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
2✔
3782
        ratio /= rhs;
2✔
3783
        return ratio;
2✔
3784
}
3785

3786
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3787
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(double lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
4✔
3788
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
4✔
3789
        sum += rhs;
4✔
3790
        return sum;
4✔
3791
}
3792
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3793
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(double lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3794
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3795
        diff -= rhs;
3796
        return diff;
3797
}
3798
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3799
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(double lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
16✔
3800
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
16✔
3801
        mul *= rhs;
16✔
3802
        return mul;
16✔
3803
}
3804
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3805
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(double lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3806
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
3807
        ratio /= rhs;
3808
        return ratio;
3809
}
3810

3811
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3812
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
×
3813
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
×
3814
        sum += rhs;
×
3815
        return sum;
×
3816
}
3817
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3818
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3819
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3820
        diff -= rhs;
3821
        return diff;
3822
}
3823
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3824
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
38✔
3825
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
38✔
3826
        mul *= rhs;
38✔
3827
        return mul;
38✔
3828
}
3829
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3830
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
374✔
3831
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
374✔
3832
        ratio /= rhs;
374✔
3833
        return ratio;
374✔
3834
}
3835

3836
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3837
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(unsigned int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3838
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
3839
        sum += rhs;
3840
        return sum;
3841
}
3842
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3843
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(unsigned int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3844
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3845
        diff -= rhs;
3846
        return diff;
3847
}
3848
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3849
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(unsigned int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3850
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
3851
        mul *= rhs;
3852
        return mul;
3853
}
3854
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3855
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(unsigned int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3856
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
3857
        ratio /= rhs;
3858
        return ratio;
3859
}
3860

3861
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3862
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3863
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
3864
        sum += rhs;
3865
        return sum;
3866
}
3867
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3868
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3869
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3870
        diff -= rhs;
3871
        return diff;
3872
}
3873
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3874
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3875
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
3876
        mul *= rhs;
3877
        return mul;
3878
}
3879
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3880
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3881
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
3882
        ratio /= rhs;
3883
        return ratio;
3884
}
3885

3886
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3887
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(unsigned long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3888
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
3889
        sum += rhs;
3890
        return sum;
3891
}
3892
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3893
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(unsigned long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3894
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3895
        diff -= rhs;
3896
        return diff;
3897
}
3898
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3899
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(unsigned long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3900
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
3901
        mul *= rhs;
3902
        return mul;
3903
}
3904
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3905
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(unsigned long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3906
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
3907
        ratio /= rhs;
3908
        return ratio;
3909
}
3910

3911
///  binary cfloat - literal arithmetic operators
3912

3913
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3914
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
3915
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3916
        Cfloat sum(lhs);
3917
        sum += Cfloat(rhs);
3918
        return sum;
3919
}
3920
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3921
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
3922
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3923
        Cfloat diff(lhs);
3924
        diff -= rhs;
3925
        return diff;
3926
}
3927
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3928
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
3929
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3930
        Cfloat mul(lhs);
3931
        mul *= Cfloat(rhs);
3932
        return mul;
3933
}
3934
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3935
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
3936
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3937
        Cfloat ratio(lhs);
3938
        ratio /= Cfloat(rhs);
3939
        return ratio;
3940
}
3941

3942
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3943
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
3944
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3945
        Cfloat sum(lhs);
3946
        sum += Cfloat(rhs);
3947
        return sum;
3948
}
3949
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3950
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
4✔
3951
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3952
        Cfloat diff(lhs);
4✔
3953
        diff -= rhs;
4✔
3954
        return diff;
4✔
3955
}
3956
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3957
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
3958
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3959
        Cfloat mul(lhs);
3960
        mul *= Cfloat(rhs);
3961
        return mul;
3962
}
3963
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3964
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
3965
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3966
        Cfloat ratio(lhs);
3967
        ratio /= Cfloat(rhs);
3968
        return ratio;
3969
}
3970

3971
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3972
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
3973
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3974
        Cfloat sum(lhs);
3975
        sum += Cfloat(rhs);
3976
        return sum;
3977
}
3978
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3979
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
4✔
3980
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3981
        Cfloat diff(lhs);
4✔
3982
        diff -= rhs;
4✔
3983
        return diff;
4✔
3984
}
3985
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3986
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
3987
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3988
        Cfloat mul(lhs);
3989
        mul *= Cfloat(rhs);
3990
        return mul;
3991
}
3992
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3993
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
3994
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3995
        Cfloat ratio(lhs);
3996
        ratio /= Cfloat(rhs);
3997
        return ratio;
3998
}
3999

4000
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4001
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned int rhs) {
4002
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4003
        Cfloat sum(lhs);
4004
        sum += Cfloat(rhs);
4005
        return sum;
4006
}
4007
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4008
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned int rhs) {
4009
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4010
        Cfloat diff(lhs);
4011
        diff -= rhs;
4012
        return diff;
4013
}
4014
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4015
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned int rhs) {
4016
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4017
        Cfloat mul(lhs);
4018
        mul *= Cfloat(rhs);
4019
        return mul;
4020
}
4021
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4022
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned int rhs) {
4023
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4024
        Cfloat ratio(lhs);
4025
        ratio /= Cfloat(rhs);
4026
        return ratio;
4027
}
4028

4029
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4030
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4031
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4032
        Cfloat sum(lhs);
4033
        sum += Cfloat(rhs);
4034
        return sum;
4035
}
4036
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4037
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4038
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4039
        Cfloat diff(lhs);
4040
        diff -= rhs;
4041
        return diff;
4042
}
4043
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4044
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4045
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4046
        Cfloat mul(lhs);
4047
        mul *= Cfloat(rhs);
4048
        return mul;
4049
}
4050
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4051
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4052
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4053
        Cfloat ratio(lhs);
4054
        ratio /= Cfloat(rhs);
4055
        return ratio;
4056
}
4057

4058
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4059
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned long long rhs) {
4060
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4061
        Cfloat sum(lhs);
4062
        sum += Cfloat(rhs);
4063
        return sum;
4064
}
4065
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4066
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned long long rhs) {
4067
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4068
        Cfloat diff(lhs);
4069
        diff -= rhs;
4070
        return diff;
4071
}
4072
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4073
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned long long rhs) {
4074
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4075
        Cfloat mul(lhs);
4076
        mul *= Cfloat(rhs);
4077
        return mul;
4078
}
4079
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4080
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned long long rhs) {
4081
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
4082
        Cfloat ratio(lhs);
4083
        ratio /= Cfloat(rhs);
4084
        return ratio;
4085
}
4086

4087
///////////////////////////////////////////////////////////////////////
4088
///   binary logic literal comparisons
4089

4090
// cfloat - literal float logic operators
4091
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4092
inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
1✔
4093
        return float(lhs) == rhs;
1✔
4094
}
4095
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4096
inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
1✔
4097
        return float(lhs) != rhs;
1✔
4098
}
4099
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4100
inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
2✔
4101
        return float(lhs) < rhs;
2✔
4102
}
4103
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4104
inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
1✔
4105
        return float(lhs) > rhs;
1✔
4106
}
4107
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4108
inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
1✔
4109
        return float(lhs) <= rhs;
1✔
4110
}
4111
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4112
inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
1✔
4113
        return float(lhs) >= rhs;
1✔
4114
}
4115
// cfloat - literal double logic operators
4116
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4117
inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
1✔
4118
        return double(lhs) == rhs;
1✔
4119
}
4120
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4121
inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
1✔
4122
        return double(lhs) != rhs;
1✔
4123
}
4124
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4125
inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
336✔
4126
        return double(lhs) < rhs;
336✔
4127
}
4128
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4129
inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
892✔
4130
        return double(lhs) > rhs;
892✔
4131
}
4132
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4133
inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
1✔
4134
        return double(lhs) <= rhs;
1✔
4135
}
4136
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4137
inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
1✔
4138
        return double(lhs) >= rhs;
1✔
4139
}
4140

4141
#if LONG_DOUBLE_SUPPORT
4142
// cfloat - literal long double logic operators
4143
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4144
inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4145
        return (long double)(lhs) == rhs;
1✔
4146
}
4147
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4148
inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4149
        return (long double)(lhs) != rhs;
1✔
4150
}
4151
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4152
inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4153
        return (long double)(lhs) < rhs;
1✔
4154
}
4155
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4156
inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4157
        return (long double)(lhs) > rhs;
1✔
4158
}
4159
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4160
inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4161
        return (long double)(lhs) <= rhs;
1✔
4162
}
4163
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4164
inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4165
        return (long double)(lhs) >= rhs;
1✔
4166
}
4167
#endif
4168

4169
// cfloat - literal int logic operators
4170
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4171
inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
11✔
4172
        return operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
11✔
4173
}
4174
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4175
inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
1✔
4176
        return operator!=(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
1✔
4177
}
4178
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4179
inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
107,196✔
4180
        return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
107,196✔
4181
}
4182
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4183
inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
666✔
4184
        return operator<(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs), lhs);
666✔
4185
}
4186
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4187
inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
1✔
4188
        return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs)) || operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
1✔
4189
}
4190
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4191
inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
1✔
4192
        return !operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
1✔
4193
}
4194

4195
// cfloat - long long logic operators
4196
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4197
inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4198
        return operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
4199
}
4200
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4201
inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4202
        return operator!=(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
4203
}
4204
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4205
inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4206
        return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
4207
}
4208
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4209
inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4210
        return operator<(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs), lhs);
4211
}
4212
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4213
inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4214
        return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs)) || operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
4215
}
4216
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4217
inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4218
        return !operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
4219
}
4220

4221
// standard library functions for floating point
4222

4223
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4224
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> frexp(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& x, int* exp) {
57,412✔
4225
        *exp = x.scale();
57,412✔
4226
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> fraction(x);
57,412✔
4227
        fraction.setexponent(0);
57,412✔
4228
        return fraction;
57,412✔
4229
}
4230

4231
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4232
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ldexp(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& x, int exp) {
765✔
4233
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> result(x);
765✔
4234
        int xexp = x.scale();
765✔
4235
        result.setexponent(xexp + exp);  // TODO: this does not work for subnormals
765✔
4236
        return result;
765✔
4237
}
4238

4239
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4240
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> 
4241
fma(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> x,
3✔
4242
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> y,
4243
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> z) {
4244
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> fused{ 0 };
3✔
4245
        constexpr unsigned FBITS = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::fbits;
3✔
4246
        constexpr unsigned EXTRA_FBITS = FBITS+2;
3✔
4247
        constexpr unsigned EXTENDED_PRECISION = nbits + EXTRA_FBITS;
3✔
4248
        // the C++ fma spec indicates that the x*y+z is evaluated in 'infinite' precision
4249
        // with only a single rounding event. The minimum finite precision that would behave like this
4250
        // is the precision where the product x*y does not need to be rounded, which will
4251
        // need at least 2*(fbits+1) mantissa bits to capture all bits that can be
4252
        // generated by the product.
4253
        cfloat<EXTENDED_PRECISION, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> preciseX(x), preciseY(y), preciseZ(z);
3✔
4254
//        ReportValue(preciseX, "extended precision x");
4255
//        ReportValue(preciseY, "extended precision y");
4256
//        ReportValue(preciseZ, "extended precision z");
4257
        cfloat<EXTENDED_PRECISION, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> product = preciseX * preciseY;
3✔
4258
//        ReportValue(product, "extended precision p");
4259
        fused = product + preciseZ;
3✔
4260
        return fused;
3✔
4261
}
4262

4263
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4264
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>&
4265
minpos(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& c) {
4266
        return c.minpos();
4267
}
4268
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4269
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>&
4270
maxpos(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& c) {
4271
        return c.maxpos();
4272
}
4273
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4274
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>&
4275
minneg(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& c) {
4276
        return c.minneg();
4277
}
4278
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4279
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>&
4280
maxneg(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& c) {
4281
        return c.maxneg();
4282
}
4283

4284
}} // namespace sw::universal
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc