• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

stillwater-sc / universal / 23406981793

22 Mar 2026 04:06PM UTC coverage: 84.393% (-0.006%) from 84.399%
23406981793

push

github

web-flow
fix(lns): use subtraction in non-saturating division operator (#607)

44336 of 52535 relevant lines covered (84.39%)

6097486.39 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.62
/include/sw/universal/number/cfloat/cfloat_impl.hpp
1
#pragma once
2
// cfloat_impl.hpp: implementation of an arbitrary configuration fixed-size 'classic' floating-point representation
3
// cfloat<> can emulate IEEE-754 floats and the new Deep Learning types, such as 
4
// IEEE-754 half-precision floats
5
// Google bfloat16
6
// NVIDIA TensorFloat 
7
// AMD FP16 and FP32
8
// Microsoft FP8 and FP9
9
// Tesla CFP8, CFP16
10
// 
11
// cfloat<> can also emulate more precise configurations, such as
12
// 80bit IEEE-754 extended precision floats
13
// true 128bit quad precision floats
14
//
15
// Copyright (C) 2017 Stillwater Supercomputing, Inc.
16
// SPDX-License-Identifier: MIT
17
//
18
// This file is part of the universal numbers project, which is released under an MIT Open Source license.
19

20
// compiler specific environment has been delegated to be handled by the
21
// number system include file <universal/number/cfloat/cfloat.hpp>
22
// 
23
// supporting types and functions
24
#include <limits>
25
#include <regex>
26
#include <type_traits>
27
#include <universal/native/ieee754.hpp>
28
#include <universal/native/subnormal.hpp>
29
#include <universal/utility/find_msb.hpp>
30
#include <universal/number/shared/nan_encoding.hpp>
31
#include <universal/number/shared/infinite_encoding.hpp>
32
#include <universal/number/shared/specific_value_encoding.hpp>
33
// arithmetic tracing options
34
#include <universal/number/algorithm/trace_constants.hpp>
35
// cfloat exception structure
36
#include <universal/number/cfloat/exceptions.hpp>
37
// composition types used by cfloat
38
#include <universal/internal/blockbinary/blockbinary.hpp>
39
#include <universal/internal/blocktriple/blocktriple.hpp>
40
#include <universal/number/support/decimal.hpp>
41

42
#ifndef CFLOAT_THROW_ARITHMETIC_EXCEPTION
43
#define CFLOAT_THROW_ARITHMETIC_EXCEPTION 0
44
#endif
45

46
#ifndef TRACE_CONVERSION
47
#define TRACE_CONVERSION 0
48
#endif
49

50
namespace sw { namespace universal {
51

52
/*
53
 * classic floating-point cfloat offers subnormals, max-exponent values, and
54
 * saturation. The default configuration turns off subnormals, max-exponent
55
 * values, and projects values outside of their dynamic range to +-inf
56
 *
57
 * Behavior flags
58
 *   hasSubnormals   : gradual underflow: use all fraction encodings when exponent is all 0's
59
 *   hasMaxExpValues  : reclaim max-exponent encodings as numeric values instead of
60
 *                      reserving the entire all-1s exponent binade for inf/NaN;
61
 *                      only 4 encodings are reserved for +-inf and quiet/signalling NaN
62
 *   isSaturating     : saturate to maxneg or maxpos when value is out of dynamic range
63
 */
64

65
/// <summary>
66
/// decode an cfloat value into its constituent parts
67
/// </summary>
68
/// <typeparam name="bt">block type</typeparam>
69
/// <param name="v">cfloat value to decode (input: const ref)</param>
70
/// <param name="s">sign (output: bool ref)</param>
71
/// <param name="e">exponent (output: blockbinary ref)</param>
72
/// <param name="f">fraction (output: blockbinary ref)</param>
73
template<unsigned nbits, unsigned es, unsigned fbitsPlus1, typename bt,
74
        bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
75
void decode(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& v, bool& s, blockbinary<es, bt>& e, blockbinary<fbitsPlus1, bt>& f) {
800✔
76
        v.sign(s);
800✔
77
        v.exponent(e);
800✔
78
        v.fraction(f);
800✔
79
}
800✔
80

81
/// <summary>
82
/// return the binary scale of the given number
83
/// </summary>
84
/// <typeparam name="bt">Block type used for storage: derived through ADL</typeparam>
85
/// <param name="v">the cfloat number for which we seek to know the binary scale</param>
86
/// <returns>binary scale, i.e. 2^scale, of the value of the cfloat</returns>
87
template<unsigned nbits, unsigned es, typename bt,
88
        bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
89
int scale(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& v) {
329✔
90
        return v.scale();
329✔
91
}
92

93
/// <summary>
94
/// convert a blocktriple to a cfloat. blocktriples come out of the arithmetic
95
/// engine in the form ii.ff...ff and a scale. The conversion must take this
96
/// denormalized form into account during conversion.
97
/// 
98
/// The blocktriple must be in this form to round correctly, as all the bits
99
/// after an arithmetic operation must be taken into account.
100
/// 
101
/// Transformation:
102
///    ii.ff...ff  transform to    s.eee.fffff
103
/// All number systems that depend on blocktriple will need to have
104
/// the rounding decision answered, so that functionality can be
105
/// reused if we locate it inside blocktriple.
106
/// 
107
/// if (srcbits > fbits) // we need to round
108
///     if (ii.00..00 > 1) 
109
///         mask is at srcbits - fbits + 1
110
///     else 
111
///                    mask is at srcbits - fbits
112
/// }
113
/// else {               // no need to round
114
/// }
115
/// </summary>
116
/// <typeparam name="bt">type of the block used for cfloat storage</typeparam>
117
/// <param name="src">the blocktriple to be converted</param>
118
/// <param name="tgt">the resulting cfloat</param>
119
template<unsigned srcbits, BlockTripleOperator op, unsigned nbits, unsigned es, typename bt,
120
        bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
121
inline /*constexpr*/ void convert(const blocktriple<srcbits, op, bt>& src, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& tgt) {
71,344,711✔
122
        using btType = blocktriple<srcbits, op, bt>;
123
        using cfloatType = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
124
        // test special cases
125
        if (src.isnan()) {
71,344,711✔
126
                tgt.setnan(src.sign() ? NAN_TYPE_SIGNALLING : NAN_TYPE_QUIET);
388✔
127
        }
128
        else        if (src.isinf()) {
71,344,323✔
129
                tgt.setinf(src.sign());
388✔
130
        }
131
        else         if (src.iszero()) {
71,343,935✔
132
                tgt.setzero();
57,157✔
133
                tgt.setsign(src.sign()); // preserve sign
57,157✔
134
        }
135
        else {
136
                int significandScale = src.significandscale();
71,286,778✔
137
                int exponent = src.scale() + significandScale;
71,286,778✔
138
                // special case of underflow
139
                if constexpr (hasSubnormals) {
140
//                        std::cout << "exponent = " << exponent << " bias = " << cfloatType::EXP_BIAS << " exp subnormal = " << cfloatType::MIN_EXP_SUBNORMAL << '\n';
141
                        // why must exponent be less than (minExpSubnormal - 1) to be rounded to zero? 
142
                        // because the half-way value that would round up to minpos is at exp = (minExpSubnormal - 1)
143
                        if (exponent < cfloatType::MIN_EXP_SUBNORMAL) {
54,650,306✔
144
                                tgt.setzero();
192,485✔
145
                                if (exponent == (cfloatType::MIN_EXP_SUBNORMAL - 1)) {
192,485✔
146
                                        // -exponent because we are right shifting and exponent in this range is negative
147
                                        int adjustment = -(exponent + subnormal_reciprocal_shift[es]);
42,882✔
148
                                        std::pair<bool, unsigned> alignment = src.roundingDecision(adjustment);
42,882✔
149
                                        if (alignment.first) ++tgt; // we are minpos
42,882✔
150
                                }
151
                                tgt.setsign(src.sign());
192,485✔
152
                                return;
3,127,239✔
153
                        }
154
                }
155
                else {
156
                        if (exponent + cfloatType::EXP_BIAS <= 0) {  // value is in the subnormal range, which maps to 0
16,636,472✔
157
                                tgt.setzero();
362,507✔
158
                                tgt.setsign(src.sign());
362,507✔
159
                                return;
638,774✔
160
                        }
161
                }
162
                // special case of overflow
163
                if constexpr (hasMaxExpValues) {
164
                        if constexpr (isSaturating) {
165
                                if (exponent > cfloatType::MAX_EXP) {
17,743,113✔
166
                                        if (src.sign()) tgt.maxneg(); else tgt.maxpos();
634,020✔
167
                                        return;
634,020✔
168
                                }
169
                        }
170
                        else {
171
                                if (exponent > cfloatType::MAX_EXP) {
27,200,493✔
172
                                        tgt.setinf(src.sign());
2,373,147✔
173
                                        return;
2,373,147✔
174
                                }
175
                        }
176
                }
177
                else {  // no max-exponent values: will saturate at a different encoding: TODO can we hide it all in maxpos?
178
                        if constexpr (isSaturating) {
179
                                if (exponent > cfloatType::MAX_EXP) {
3,707✔
180
                                        if (src.sign()) tgt.maxneg(); else tgt.maxpos();
7✔
181
                                        return;
7✔
182
                                }
183
                        }
184
                        else {
185
                                if (exponent > cfloatType::MAX_EXP) {
25,784,473✔
186
                                        tgt.setinf(src.sign());
203,847✔
187
                                        return;
203,847✔
188
                                }
189
                        }
190
                }
191

192
                // our value needs to go through rounding to be correctly interpreted
193
                // 
194
                // tgt.clear();  // no need as all bits are going to be set by the code below
195

196
                // exponent construction
197
                int adjustment{ 0 };
67,520,765✔
198
                // construct exponent
199
                uint64_t biasedExponent = static_cast<uint64_t>(static_cast<long long>(exponent) + static_cast<long long>(cfloatType::EXP_BIAS)); // this is guaranteed to be positive if exponent in encoding range
67,520,765✔
200
//                        std::cout << "exponent         " << to_binary(biasedExponent) << '\n';        
201
                if constexpr (hasSubnormals) {
202
                        //if (exponent >= cfloatType::MIN_EXP_SUBNORMAL && exponent < cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::MIN_EXP_NORMAL) {
203
                        if (exponent < cfloatType::MIN_EXP_NORMAL) {
51,523,067✔
204
                                // the value is in the subnormal range of the cfloat
205
                                biasedExponent = 0;
35,473,309✔
206
                                // -exponent because we are right shifting and exponent in this range is negative
207
                                adjustment = -(exponent + subnormal_reciprocal_shift[es]);
35,473,309✔
208
                                // this is the right shift adjustment required for subnormal representation due 
209
                                // to the scale of the input number, i.e. the exponent of 2^-adjustment
210
                        }
211
                        else {
212
                                // the value is in the normal range of the cfloat
213
                                biasedExponent = static_cast<uint64_t>(static_cast<long long>(exponent) + static_cast<long long>(cfloatType::EXP_BIAS)); // this is guaranteed to be positive
16,049,758✔
214
                        }
215
                }
216
                else {
217
                        if (exponent < cfloatType::MIN_EXP_NORMAL) biasedExponent = 1ull; // fixup biasedExponent if we are in the subnormal region
15,997,698✔
218
                }
219

220

221
                // get the rounding direction and the LSB right shift: 
222
                std::pair<bool, unsigned> alignment = src.roundingDecision(adjustment);
67,520,765✔
223
                unsigned rightShift = alignment.second;  // this is the shift to get the LSB of the src to the LSB of the tgt
67,520,765✔
224
                //std::cout << "rightShift       " << rightShift << '\n';
225

226
                if constexpr (btType::bfbits < 65) {
227
                        bool roundup = alignment.first;
66,988,866✔
228
                        //std::cout << "round-up?        " << (roundup ? "yes" : "no") << '\n';
229
                        // we can use a uint64_t to construct the cfloat
230
                        uint64_t raw = (src.sign() ? 1ull : 0ull); // process sign
66,988,866✔
231
                        //std::cout << "raw bits (sign)  " << to_binary(raw) << '\n';
232
                        // construct the fraction bits
233
                        uint64_t fracbits = src.significand_ull(); // get all the bits, including the integer bits
66,988,866✔
234
                        //std::cout << "fracbits         " << to_binary(fracbits) << '\n';
235
                        fracbits >>= rightShift;
66,988,866✔
236
                        //std::cout << "fracbits shifted " << to_binary(fracbits) << '\n';
237
                        fracbits &= cfloatType::ALL_ONES_FR; // remove the hidden bit
66,988,866✔
238
                        //std::cout << "fracbits masked  " << to_binary(fracbits) << '\n';
239
                        if (roundup) ++fracbits;
66,988,866✔
240
                        if (fracbits == (1ull << cfloatType::fbits)) { // check for overflow
66,988,866✔
241
                                if (biasedExponent == cfloatType::ALL_ONES_ES) {
425,458✔
242
                                        fracbits = cfloatType::INF_ENCODING; // project to INF
6,244✔
243
                                }
244
                                else {
245
                                        ++biasedExponent;
419,214✔
246
                                        fracbits = 0;
419,214✔
247
                                }
248
                        }
249

250
                        raw <<= es; // shift sign to make room for the exponent bits
66,988,866✔
251
                        raw |= biasedExponent;
66,988,866✔
252
                        //std::cout << "raw bits (exp)   " << to_binary(raw) << '\n';
253
                        raw <<= cfloatType::fbits; // make room for the fraction bits
66,988,866✔
254
                        //std::cout << "raw bits (s+exp) " << to_binary(raw) << '\n';
255
                        raw |= fracbits;
66,988,866✔
256
                        //std::cout << "raw bits (final) " << to_binary(raw) << '\n';
257
                        tgt.setbits(raw);
66,988,866✔
258
//                        std::cout << "raw bits (all)   " << to_binary(raw) << '\n';
259
                        if constexpr (isSaturating) {
260
                                if (tgt.isnan()) {
17,112,786✔
261
                                        if (src.sign()) {
550✔
262
                                                tgt.maxneg();        // map back to maxneg
275✔
263
                                        }
264
                                        else {
265
                                                tgt.maxpos();        // map back to maxpos
275✔
266
                                        }
267
                                }
268
                        }
269
                        else {
270
                                // when you get too far, map it back to +-inf: 
271
                                // TBD: this doesn't appear to be the right algorithm to catch all overflow patterns
272
                                if (tgt.isnan()) tgt.setinf(src.sign());        // map back to +-inf
49,876,080✔
273
                        }
274
                }
275
                else {
276
                        // compose the segments
277
                        auto fracbits = src.significand();  // why significand? cheesy optimization: we are going to overwrite the hidden bit position anyway when we write the exponent below, so no need to pay the overhead of generating the fraction here.
531,899✔
278
                        //std::cout << "fraction      : " << to_binary(fracbits, true) << '\n';
279
                        fracbits >>= static_cast<int>(rightShift);
531,899✔
280
                        //std::cout << "aligned fbits : " << to_binary(fracbits, true) << '\n';
281

282
                        // copy the blocks that contain fraction bits
283
                        // significand blocks are organized like this:
284
                        //   ADD        iii.ffffrrrrrrrrr          3 integer bits, f fraction bits, and 2*fhbits rounding bits
285
                        //   MUL         ii.ffff'ffff              2 integer bits, 2*f fraction bits
286
                        //   DIV         ii.ffff'ffff'ffff'rrrr    2 integer bits, 3*f fraction bits, and r rounding bits
287
                        //std::cout << "fraction bits : " << to_binary(fracbits, true) << '\n';
288
                        tgt.clear();
531,899✔
289
                        //std::cout << "initial state : " << to_binary(tgt) << " : " << tgt << '\n';
290
                        for (unsigned b = 0; b < btType::nrBlocks; ++b) {
1,074,306✔
291
                                tgt.setblock(b, fracbits.block(b));
542,407✔
292
                        }
293
                        //std::cout << "fraction bits : " << to_binary(tgt, true) << '\n';
294
                        tgt.setsign(src.sign());
531,899✔
295
                        //std::cout << "adding sign   : " << to_binary(tgt) << '\n';
296
                        if (!tgt.setexponent(exponent)) {
531,899✔
297
                                std::cerr << "exponent value is out of range: " << exponent << '\n';
×
298
                        }
299
                        //std::cout << "add exponent  : " << to_binary(tgt) << '\n';
300
                }
301
        }
302
}
303

304

305
/// <summary>
306
/// An arbitrary, fixed-size floating-point number with configurable gradual under/overflow and saturation/non-saturation arithmetic.
307
/// Default configuration offers normal encoding and non-saturating arithmetic.
308
/// /// </summary>
309
/// <typeparam name="nbits">number of bits in the encoding</typeparam>
310
/// <typeparam name="es">number of exponent bits in the encoding</typeparam>
311
/// <typeparam name="bt">the type to use as storage class: one of [uint8_t|uint16_t|uint32_t]</typeparam>
312
/// <typeparam name="hasSubnormals">configure gradual underflow (==subnormals)</typeparam>
313
/// <typeparam name="hasMaxExpValues">reclaim max-exponent encodings as numeric values</typeparam>
314
/// <typeparam name="isSaturating">configure saturation arithmetic</typeparam>
315
template<unsigned _nbits, unsigned _es, typename bt = uint8_t,
316
        bool _hasSubnormals = false, bool _hasMaxExpValues = false, bool _isSaturating = false>
317
class cfloat {
318
public:
319
        static_assert(_nbits > _es + 1ull, "nbits is too small to accomodate the requested number of exponent bits");
320
        static_assert(_es < 21ull, "my God that is a big number, are you trying to break the Interweb?");
321
        static_assert(_es > 0, "number of exponent bits must be bigger than 0 to be a classic floating point number");
322
        // how do you assert on the condition that if es == 1 then subnormals and max-exponent values must be true?
323
        static constexpr bool     subsuper = (_hasSubnormals && _hasMaxExpValues);
324
        static constexpr bool     special = (subsuper ? true : (_es > 1));
325
        static_assert(special, "when es == 1, cfloat must have both subnormals and max-exponent values");
326
        static constexpr unsigned bitsInByte = 8u;
327
        static constexpr unsigned bitsInBlock = sizeof(bt) * bitsInByte;
328
        static_assert(bitsInBlock <= 64, "storage unit for block arithmetic needs to be <= uint64_t"); // TODO: carry propagation on uint64_t requires assembly code
329

330
        static constexpr unsigned nbits = _nbits;
331
        static constexpr unsigned es = _es;
332
        static constexpr unsigned fbits  = nbits - 1u - es;    // number of fraction bits excluding the hidden bit
333
        static constexpr unsigned fhbits = nbits - es;           // number of fraction bits including the hidden bit
334

335
        static constexpr uint64_t  storageMask = (0xFFFFFFFFFFFFFFFFull >> (64ull - bitsInBlock));
336
        static constexpr bt       BLOCK_MASK = bt(~0);
337
        static constexpr bt       ALL_ONES = bt(~0); // block type specific all 1's value
338
        static constexpr uint32_t ALL_ONES_ES = (0xFFFF'FFFFul >> (32 - es));
339
        static constexpr uint64_t topfbits = fbits % 64;
340
        static constexpr uint64_t FR_SHIFT = (topfbits > 0 ? (64 - topfbits) : 0);
341
        static constexpr uint64_t ALL_ONES_FR = (topfbits > 0 ? (0xFFFF'FFFF'FFFF'FFFFull >> FR_SHIFT) : 0ull); // special case for nbits <= 64
342
        static constexpr uint64_t INF_ENCODING = (ALL_ONES_FR & ~1ull);
343

344
        static constexpr unsigned nrBlocks = 1u + ((nbits - 1ull) / bitsInBlock);
345
        static constexpr unsigned MSU = nrBlocks - 1u; // MSU == Most Significant Unit, as MSB is already taken
346
        static constexpr bt       MSU_MASK = (ALL_ONES >> (nrBlocks * bitsInBlock - nbits));
347
        static constexpr unsigned bitsInMSU = bitsInBlock - (nrBlocks * bitsInBlock - nbits);
348
        static constexpr unsigned fBlocks = 1ull + ((fbits - 1ull) / bitsInBlock); // nr of blocks with fraction bits
349
        static constexpr unsigned FSU = fBlocks - 1u;  // FSU = Fraction Significant Unit: the index of the block that contains the most significant fraction bits
350
        static constexpr bt       FSU_MASK = (ALL_ONES >> (fBlocks * bitsInBlock - fbits));
351
        static constexpr unsigned bitsInFSU = bitsInBlock - (fBlocks * bitsInBlock - fbits);
352

353
        static constexpr bt       SIGN_BIT_MASK = bt(bt(1ull) << ((nbits - 1ull) % bitsInBlock));
354
        static constexpr bt       LSB_BIT_MASK = bt(1ull);
355
        static constexpr bool     MSU_CAPTURES_EXP = (1ull + es) <= bitsInMSU;
356
        static constexpr unsigned EXP_SHIFT = (MSU_CAPTURES_EXP ? (1 == nrBlocks ? (nbits - 1ull - es) : (bitsInMSU - 1ull - es)) : 0);
357
        static constexpr bt       MSU_EXP_MASK = ((ALL_ONES << EXP_SHIFT) & ~SIGN_BIT_MASK) & MSU_MASK;
358
        static constexpr int      EXP_BIAS = ((1 << (es - 1u)) - 1l);
359
        static constexpr int      MAX_EXP = (es == 1) ? 1 : ((1 << es) - EXP_BIAS - 1);
360
        static constexpr int      MIN_EXP_NORMAL = 1 - EXP_BIAS;
361
        static constexpr int      MIN_EXP_SUBNORMAL = 1 - EXP_BIAS - int(fbits); // the scale of smallest ULP
362

363
        static constexpr bool     hasSubnormals   = _hasSubnormals;
364
        static constexpr bool     hasMaxExpValues = _hasMaxExpValues;
365
        static constexpr bool     isSaturating    = _isSaturating;
366
        typedef bt BlockType;
367

368
        // constructors
369
        cfloat() = default;
370
        cfloat(const cfloat&) = default;
371
        cfloat& operator=(const cfloat&) = default;
372

373
        // construct a cfloat from another
374
        template<unsigned nnbits, unsigned ees, typename bbt, bool ssub, bool ssup, bool ssat>
375
        cfloat(const cfloat<nnbits, ees, bbt, ssub, ssup, ssat>& rhs) noexcept : _block{} {
6,104✔
376
                if (rhs.isnan()) {
6,104✔
377
                        setnan(rhs.sign() ? NAN_TYPE_SIGNALLING : NAN_TYPE_QUIET);
79✔
378
                }
379
                else if (rhs.isinf()) {
6,025✔
380
                        setinf(rhs.sign());
9✔
381
                }
382
                else if (rhs.iszero()) {
6,016✔
383
                        setzero();
738✔
384
                }
385
                else {
386
                        // TODO: cfloat from another cfloat: marshall through a proper blocktriple
387
                        /*
388
                        if constexpr (std::is_same_v<bt, bbt>) {
389
                                blocktriple<fbits, BlockTripleOperator::REP, bt> value;
390
                                value.setnormal();
391
                                value.setsign(rhs.sign());
392
                                value.setscale(rhs.scale());
393
                                //constexpr unsigned rhsFbits = nnbits - 1ul - ees;
394
                                //blockbinary<rhsFbits, bbt, BinaryNumberType::Signed> fraction;
395
                                //rhs.fraction<rhsFbits>(fraction);
396
                                //std::cout << "fraction : " << to_binary(fraction) << '\n';
397
                                //value.setfraction(fraction);
398
                                convert(value, *this);
399
                        }
400
                        else {
401
                                static_assert(nnbits < 64, "converting constructor marshalls values through native double precision, and rhs has more bits");
402
                                *this = double(rhs); 
403
                        }
404
                        */
405
                        *this = double(rhs);
5,278✔
406
                }
407
        }
6,104✔
408

409
        // converting constructors
410
        constexpr cfloat(const std::string& stringRep) : _block{} { assign(stringRep); }
1✔
411
        // specific value constructor
412
        constexpr cfloat(const SpecificValue code) noexcept : _block{} {
309✔
413
                switch (code) {
309✔
414
                case SpecificValue::maxpos:
95✔
415
                        maxpos();
95✔
416
                        break;
95✔
417
                case SpecificValue::minpos:
111✔
418
                        minpos();
111✔
419
                        break;
111✔
420
                case SpecificValue::zero:
×
421
                default:
422
                        zero();
×
423
                        break;
×
424
                case SpecificValue::minneg:
×
425
                        minneg();
×
426
                        break;
×
427
                case SpecificValue::maxneg:
48✔
428
                        maxneg();
48✔
429
                        break;
48✔
430
                case SpecificValue::infpos:
21✔
431
                        setinf(false);
21✔
432
                        break;
21✔
433
                case SpecificValue::infneg:
×
434
                        setinf(true);
×
435
                        break;
×
436
                case SpecificValue::nar: // approximation as cfloats don't have a NaR
17✔
437
                case SpecificValue::qnan:
438
                        setnan(NAN_TYPE_QUIET);
17✔
439
                        break;
17✔
440
                case SpecificValue::snan:
17✔
441
                        setnan(NAN_TYPE_SIGNALLING);
17✔
442
                        break;
17✔
443
                }
444
        }
309✔
445

446
        constexpr cfloat(signed char iv)                    noexcept : _block{} { *this = iv; }
447
        constexpr cfloat(short iv)                          noexcept : _block{} { *this = iv; }
448
        constexpr cfloat(int iv)                            noexcept : _block{} { *this = iv; }
114,803✔
449
        constexpr cfloat(long iv)                           noexcept : _block{} { *this = iv; }
450
        constexpr cfloat(long long iv)                      noexcept : _block{} { *this = iv; }
1✔
451
        constexpr cfloat(char iv)                           noexcept : _block{} { *this = iv; }
452
        constexpr cfloat(unsigned short iv)                 noexcept : _block{} { *this = iv; }
453
        constexpr cfloat(unsigned int iv)                   noexcept : _block{} { *this = iv; }
454
        constexpr cfloat(unsigned long iv)                  noexcept : _block{} { *this = iv; }
100✔
455
        constexpr cfloat(unsigned long long iv)             noexcept : _block{} { *this = iv; }
456
        CONSTEXPRESSION cfloat(float iv)                    noexcept : _block{} { *this = iv; }
340,804✔
457
        CONSTEXPRESSION cfloat(double iv)                   noexcept : _block{} { *this = iv; }
515,969✔
458

459
        // assignment operators
460
        constexpr cfloat& operator=(signed char rhs)        noexcept { return convert_signed_integer(rhs); }
461
        constexpr cfloat& operator=(short rhs)              noexcept { return convert_signed_integer(rhs); }
462
        constexpr cfloat& operator=(int rhs)                noexcept { return convert_signed_integer(rhs); }
174,971✔
463
        constexpr cfloat& operator=(long rhs)               noexcept { return convert_signed_integer(rhs); }
464
        constexpr cfloat& operator=(long long rhs)          noexcept { return convert_signed_integer(rhs); }
1✔
465

466
        constexpr cfloat& operator=(char rhs)               noexcept { return convert_unsigned_integer(rhs); }
467
        constexpr cfloat& operator=(unsigned short rhs)     noexcept { return convert_unsigned_integer(rhs); }
468
        constexpr cfloat& operator=(unsigned int rhs)       noexcept { return convert_unsigned_integer(rhs); }
30✔
469
        constexpr cfloat& operator=(unsigned long rhs)      noexcept { return convert_unsigned_integer(rhs); }
110✔
470
        constexpr cfloat& operator=(unsigned long long rhs) noexcept { return convert_unsigned_integer(rhs); }
471

472
        BIT_CAST_CONSTEXPR cfloat& operator=(float rhs)     noexcept { return convert_ieee754(rhs); }
32,488,679✔
473
        BIT_CAST_CONSTEXPR cfloat& operator=(double rhs)    noexcept { return convert_ieee754(rhs); }
99,049,156✔
474

475
        // make conversions to native types explicit
476
        explicit operator int()                       const noexcept { return to_int(); }
59,976✔
477
        explicit operator long()                      const noexcept { return to_long(); }
478
        explicit operator long long()                 const noexcept { return to_long_long(); }
1✔
479
        explicit operator float()                     const noexcept { return to_native<float>(); }
35,177,206✔
480
        explicit operator double()                    const noexcept { return to_native<double>(); }
78,237,855✔
481

482
        // guard long double support to enable ARM and RISC-V embedded environments
483
#if LONG_DOUBLE_SUPPORT
484
        explicit operator long double()               const noexcept { return to_native<long double>(); }
54✔
485
        BIT_CAST_CONSTEXPR cfloat(long double iv)           noexcept : _block{} { *this = iv; }
2✔
486
        BIT_CAST_CONSTEXPR cfloat& operator=(long double rhs)  noexcept { return convert_ieee754(rhs); }
74✔
487
#endif
488

489
        // arithmetic operators
490
        // prefix operator
491
        inline cfloat operator-() const {
8,354,869✔
492
                cfloat tmp(*this);
8,354,869✔
493
                tmp._block[MSU] ^= SIGN_BIT_MASK;
8,354,869✔
494
                return tmp;
8,354,869✔
495
        }
496

497
        cfloat& operator+=(const cfloat& rhs) CFLOAT_EXCEPT {
18,485,472✔
498
                if constexpr (_trace_add) std::cout << "---------------------- ADD -------------------" << std::endl;
499
                // special case handling of the inputs
500
#if CFLOAT_THROW_ARITHMETIC_EXCEPTION
501
                if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
744✔
502
                        throw cfloat_operand_is_nan{};
×
503
                }
504
#else
505
                if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
18,484,728✔
506
                        setnan(NAN_TYPE_SIGNALLING);
236,540✔
507
                        return *this;
236,540✔
508
                }
509
                if (isnan(NAN_TYPE_QUIET) || rhs.isnan(NAN_TYPE_QUIET)) {
18,248,188✔
510
                        setnan(NAN_TYPE_QUIET);
216,964✔
511
                        return *this;
216,964✔
512
                }
513
#endif
514
                // normal + inf  = inf
515
                // normal + -inf = -inf
516
                // inf + normal = inf
517
                // inf + inf    = inf
518
                // inf + -inf    = ?
519
                // -inf + normal = -inf
520
                // -inf + -inf   = -inf
521
                // -inf + inf    = ?
522
                if (isinf()) {
18,031,968✔
523
                        if (rhs.isinf()) {
60,327✔
524
                                if (sign() != rhs.sign()) {
939✔
525
                                        setnan(NAN_TYPE_SIGNALLING);
514✔
526
                                }
527
                                return *this;
939✔
528
                        }
529
                        else {
530
                                return *this;
59,388✔
531
                        }
532
                }
533
                else {
534
                        if (rhs.isinf()) {
17,971,641✔
535
                                *this = rhs;
58,294✔
536
                                return *this;
58,294✔
537
                        }
538
                }
539

540
                if (iszero()) {
17,913,347✔
541
                        *this = rhs;
206,239✔
542
                        return *this;
206,239✔
543
                }
544
                if (rhs.iszero()) return *this;
17,707,108✔
545

546
                // arithmetic operation
547
                blocktriple<fbits, BlockTripleOperator::ADD, bt> a, b, sum;
17,375,033✔
548

549
                // transform the inputs into (sign,scale,significant) 
550
                normalizeAddition(a); 
17,375,033✔
551
                rhs.normalizeAddition(b);
17,375,033✔
552
                sum.add(a, b);
17,375,033✔
553

554
                convert(sum, *this);
17,375,033✔
555

556
                return *this;
17,375,033✔
557
        }
558
        cfloat& operator+=(double rhs) CFLOAT_EXCEPT {
559
                return *this += cfloat(rhs);
560
        }
561
        cfloat& operator-=(const cfloat& rhs) CFLOAT_EXCEPT {
8,469,314✔
562
                if constexpr (_trace_sub) std::cout << "---------------------- SUB -------------------" << std::endl;
563
                if (rhs.isnan()) 
8,469,314✔
564
                        return *this += rhs;
170,204✔
565
                else 
566
                        return *this += -rhs;
8,299,110✔
567
        }
568
        cfloat& operator-=(double rhs) CFLOAT_EXCEPT {
8✔
569
                return *this -= cfloat(rhs);
8✔
570
        }
571
        cfloat& operator*=(const cfloat& rhs) CFLOAT_EXCEPT {
4,312,845✔
572
                if constexpr (_trace_mul) std::cout << "---------------------- MUL -------------------\n";
322✔
573
                // special case handling of the inputs
574
#if CFLOAT_THROW_ARITHMETIC_EXCEPTION
575
                if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
612✔
576
                        throw cfloat_operand_is_nan{};
×
577
                }
578
#else
579
                if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
4,312,233✔
580
                        setnan(NAN_TYPE_SIGNALLING);
124,658✔
581
                        return *this;
124,658✔
582
                }
583
                if (isnan(NAN_TYPE_QUIET) || rhs.isnan(NAN_TYPE_QUIET)) {
4,187,575✔
584
                        setnan(NAN_TYPE_QUIET);
113,354✔
585
                        return *this;
113,354✔
586
                }
587
#endif
588
                //  inf * inf = inf
589
                //  inf * -inf = -inf
590
                // -inf * inf = -inf
591
                // -inf * -inf = inf
592
                //        0 * inf = -nan(ind)
593
                //        inf * 0 = -nan(ind)
594
                bool resultSign = sign() != rhs.sign();
4,074,833✔
595
                if (isinf()) {
4,074,833✔
596
                        if (rhs.iszero()) {
24,457✔
597
                                setnan(NAN_TYPE_QUIET);
1,591✔
598
                        }
599
                        else {
600
                                setsign(resultSign);
22,866✔
601
                        }
602
                        return *this;
24,457✔
603
                }
604
                if (rhs.isinf()) {
4,050,376✔
605
                        if (iszero()) {
25,815✔
606
                                setnan(NAN_TYPE_QUIET);
2,969✔
607
                        }
608
                        else {
609
                                setinf(resultSign);
22,846✔
610
                        }
611
                        return *this;
25,815✔
612
                }
613

614
                if (iszero() || rhs.iszero()) {                        
4,024,561✔
615
                        setzero();
1,876,394✔
616
                        setsign(resultSign); // deal with negative 0
1,876,394✔
617
                        return *this;
1,876,394✔
618
                }
619

620
                // arithmetic operation
621
                blocktriple<fbits, BlockTripleOperator::MUL, bt> a, b, product;
2,148,167✔
622

623
                // transform the inputs into (sign,scale,significant) 
624
                // triples of the correct width
625
                normalizeMultiplication(a);
2,148,167✔
626
                rhs.normalizeMultiplication(b);
2,148,167✔
627
                product.mul(a, b);
2,148,167✔
628
                convert(product, *this);
2,148,167✔
629

630
                if constexpr (_trace_mul) std::cout << to_binary(a) << " : " << a << " *\n" << to_binary(b) << " : " << b << " =\n" << to_binary(product) << " : " << product << '\n';
106✔
631

632
                return *this;
2,148,167✔
633
        }
634
        cfloat& operator*=(double rhs) CFLOAT_EXCEPT {
40✔
635
                return *this *= cfloat(rhs);
40✔
636
        }
637
        cfloat& operator/=(const cfloat& rhs) CFLOAT_EXCEPT {
5,139,192✔
638
                if constexpr (_trace_div) std::cout << "---------------------- DIV -------------------" << std::endl;
639

640
                // special case handling of the inputs
641
                // qnan / qnan = qnan
642
                // qnan / snan = qnan
643
                // snan / qnan = snan
644
                // snan / snan = snan
645
#if CFLOAT_THROW_ARITHMETIC_EXCEPTION
646
                if (rhs.iszero()) throw cfloat_divide_by_zero();
49✔
647
                if (rhs.isnan()) throw cfloat_divide_by_nan();
48✔
648
                if (isnan()) throw cfloat_operand_is_nan();
48✔
649
#else
650
                if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
5,139,143✔
651
                        setnan(NAN_TYPE_SIGNALLING);
164,428✔
652
                        return *this;
164,428✔
653
                }
654
                if (isnan(NAN_TYPE_QUIET) || rhs.isnan(NAN_TYPE_QUIET)) {
4,974,715✔
655
                        setnan(NAN_TYPE_QUIET);
151,601✔
656
                        return *this;
151,601✔
657
                }
658
                if (rhs.iszero()) {
4,823,114✔
659
                        if (iszero()) {
169,760✔
660
                                // zero divide by zero yields quiet NaN (in MSVC it is labeled -nan(ind) for indeterminate)
661
                                setnan(NAN_TYPE_QUIET);
29,299✔
662
                        }
663
                        else {
664
                                // non-zero divide by zero yields INF
665
                                bool resultSign = sign() != rhs.sign();
140,461✔
666
                                setinf(resultSign);
140,461✔
667
                        }
668
                        return *this;
169,760✔
669
                }
670
#endif
671
                //  inf /  inf = -nan(ind)
672
                //  inf / -inf = -nan(ind)
673
                // -inf /  inf = -nan(ind)
674
                // -inf / -inf = -nan(ind)
675
                //        1.0 /  inf = 0
676
                bool resultSign = sign() != rhs.sign();
4,653,402✔
677
                if (isinf()) {
4,653,402✔
678
                        if (rhs.isinf()) {
31,087✔
679
                                // inf divide by inf yields quiet NaN (in MSVC it is labeled -nan(ind) for indeterminate)
680
                                setnan(NAN_TYPE_QUIET);
559✔
681
                                return *this;
559✔
682
                        }
683
                        else {
684
                                // we stay an infinite but may change sign
685
                                setsign(resultSign);
30,528✔
686
                                return *this;
30,528✔
687
                        }
688
                }
689
                else {
690
                        if (rhs.isinf()) {
4,622,315✔
691
                                setzero();
32,833✔
692
                                setsign(resultSign);
32,833✔
693
                                return *this;
32,833✔
694
                        }
695
                }
696

697
                if (iszero()) {
4,589,482✔
698
                        setzero();
139,438✔
699
                        setsign(resultSign); // deal with negative 0
139,438✔
700
                        return *this;
139,438✔
701
                }
702

703
                // arithmetic operation
704
                using BlockTriple = blocktriple<fbits, BlockTripleOperator::DIV, bt>;
705
                BlockTriple a, b, quotient;
4,450,044✔
706

707
                // transform the inputs into (sign,scale,significant) 
708
                // triples of the correct width
709
                normalizeDivision(a);
4,450,044✔
710
                rhs.normalizeDivision(b);
4,450,044✔
711
                quotient.div(a, b);
4,450,044✔
712
                quotient.setradix(BlockTriple::radix);
4,450,044✔
713
                convert(quotient, *this);
4,450,044✔
714

715
                if constexpr (_trace_div) std::cout << to_binary(a) << " : " << a << " /\n" << to_binary(b) << " : " << b << " =\n" << to_binary(quotient) << " : " << quotient << '\n';
716

717
                return *this;
4,450,044✔
718
        }
719
        cfloat& operator/=(double rhs) CFLOAT_EXCEPT {
720
                return *this /= cfloat(rhs);
721
        }
722
        cfloat& reciprocal() CFLOAT_EXCEPT {
723
                cfloat c = 1.0 / *this;
724
                return *this = c;
725
        }
726
        /// <summary>
727
        /// move to the next bit encoding modulo 2^nbits
728
        /// </summary>
729
        /// <typeparam name="bt"></typeparam>
730
        cfloat& operator++() {
4,598,638✔
731
                if constexpr (0 == nrBlocks) {
732
                        return *this;
733
                }
734
                else if constexpr (1 == nrBlocks) {
735
                        if (sign()) {
864,255✔
736
                                if (_block[MSU] == (SIGN_BIT_MASK | 1ul)) { // pattern: 1.00.001 = minneg
467✔
737
                                        _block[MSU] = 0; // pattern: 0.00.000 = +0 
7✔
738
                                }
739
                                else {
740
                                        --_block[MSU];
460✔
741
                                }
742
                                if constexpr (!hasSubnormals) {
743
                                        if (isdenormal()) {
198✔
744
                                                // special case, we need to jump past all the subnormal value encodings which puts us on 0
745
                                                _block[MSU] = 0; // pattern: 0.00.000 = +0
3✔
746
                                        }
747
                                }
748
                        }
749
                        else {
750
                                if constexpr (!hasSubnormals) {
751
                                        if (_block[MSU] == 0) {
384,649✔
752
                                                // special case, we need to jump past all the subnormal value encodings minus 1
753
                                                setfraction(0xFFFF'FFFF'FFFF'FFFFull);
4✔
754
                                        }
755
                                }
756
                                if ((_block[MSU] & (MSU_MASK >> 1)) == (MSU_MASK >> 1)) { // pattern: 0.11.111 = nan
863,788✔
757
                                        _block[MSU] |= SIGN_BIT_MASK; // pattern: 1.11.111 = snan : wrap to the other side of the encoding
×
758
                                }
759
                                else {
760
                                        ++_block[MSU];
863,788✔
761
                                }
762
                        }
763
                }
764
                else {
765
                        if (sign()) {
3,734,383✔
766
                                // special case: pattern: 1.00.001 = minneg transitions to pattern: 0.00.000 = +0 
767
                                if (isminnegencoding()) {
66,947✔
768
                                        setzero();
8✔
769
                                }
770
                                else {
771
                                        //  1111 0000
772
                                        //  1110 1111
773
                                        bool borrow = true;
66,939✔
774
                                        for (unsigned i = 0; i < MSU; ++i) {
199,415✔
775
                                                if (borrow) {
132,476✔
776
                                                        if ((_block[i] & storageMask) == 0) { // block will underflow
67,197✔
777
                                                                --_block[i];
261✔
778
                                                                borrow = true;
261✔
779
                                                        }
780
                                                        else {
781
                                                                --_block[i];
66,936✔
782
                                                                borrow = false;
66,936✔
783
                                                        }
784
                                                }
785
                                        }
786
                                        if (borrow) {
66,939✔
787
                                                --_block[MSU];
3✔
788
                                        }
789
                                        if constexpr (!hasSubnormals) {
790
                                                if (isdenormal()) {
385✔
791
                                                        // special case, we need to jump past all the subnormal value encodings which puts us on 0
792
                                                        setzero(); // pattern: 0.00.000 = +0
3✔
793
                                                }
794
                                        }
795
                                }
796
                        }
797
                        else {
798
                                // special case: pattern: 0.11.111 = nan transitions to pattern: 1.11.111 = snan 
799
                                if (isnanencoding()) {
3,667,436✔
800
                                        setnan(NAN_TYPE_SIGNALLING);
×
801
                                }
802
                                else {
803
                                        if constexpr (!hasSubnormals) {
804
                                                if (iszero()) {
1,720,661✔
805
                                                        // special case, we need to jump past all the subnormal value encodings minus 1 so that the increment code below ends up on normal minpos
806
                                                        setfraction(0xFFFF'FFFF'FFFF'FFFFull);
3✔
807
                                                }
808
                                        }
809
                                        bool carry = true;
3,667,436✔
810
                                        for (unsigned i = 0; i < MSU; ++i) {
7,400,769✔
811
                                                if (carry) {
3,733,333✔
812
                                                        if ((_block[i] & storageMask) == storageMask) { // block will overflow
3,667,691✔
813
                                                                _block[i] = 0;
256✔
814
                                                                carry = true;
256✔
815
                                                        }
816
                                                        else {
817
                                                                ++_block[i];
3,667,435✔
818
                                                                carry = false;
3,667,435✔
819
                                                        }
820
                                                }
821
                                        }
822
                                        if (carry) {
3,667,436✔
823
                                                ++_block[MSU];
1✔
824
                                        }
825
                                }
826
                        }
827
                }
828
                return *this;
4,598,638✔
829
        }
830
        cfloat operator++(int) {
134,800✔
831
                cfloat tmp(*this);
134,800✔
832
                operator++();
134,800✔
833
                return tmp;
134,800✔
834
        }
835
        cfloat& operator--() {
5,014,100✔
836
                if constexpr (0 == nrBlocks) {
837
                        return *this;
838
                }
839
                else if constexpr (1 == nrBlocks) {
840
                        if (sign()) {
942,793✔
841
                                ++_block[MSU];
942,299✔
842
                        }
843
                        else {
844
                                // positive range
845
                                if (_block[MSU] == 0) { // pattern: 0.00.000 = 0
494✔
846
                                        if constexpr (hasSubnormals) {
847
                                                _block[MSU] |= SIGN_BIT_MASK | bt(1u); // pattern: 1.00.001 = minneg 
8✔
848
                                        }
849
                                        else {
850
                                                // special case, we need to jump past all the subnormal value encodings
851
                                                setfraction(0xFFFF'FFFF'FFFF'FFFFull); // set to 0.00.11...11
3✔
852
                                                ++_block[MSU]; // increment into 0.01.0000
3✔
853
                                                _block[MSU] |= SIGN_BIT_MASK; // set to 1.01.0000
3✔
854
                                        }
855
                                }
856
                                else {
857
                                        --_block[MSU];
483✔
858
                                }
859
                                if constexpr (!hasSubnormals) {
860
                                        if (isdenormal()) {
211✔
861
                                                // special case, we need to jump past all the subnormal value encodings which puts us on 0
862
                                                _block[MSU] = 0; // pattern: 0.00.000 = +0
3✔
863
                                        }
864
                                }
865
                        }
866
                }
867
                else {
868
                        if (sign()) {
4,071,307✔
869
                                bool carry = true;
4,004,349✔
870
                                for (unsigned i = 0; i < MSU; ++i) {
8,074,235✔
871
                                        if (carry) {
4,069,886✔
872
                                                if ((_block[i] & storageMask) == storageMask) { // block will overflow
4,004,604✔
873
                                                        _block[i] = 0;
256✔
874
                                                        carry = true;
256✔
875
                                                }
876
                                                else {
877
                                                        ++_block[i];
4,004,348✔
878
                                                        carry = false;
4,004,348✔
879
                                                }
880
                                        }
881
                                }
882
                                if (carry) {
4,004,349✔
883
                                        ++_block[MSU];
1✔
884
                                }
885
                        }
886
                        else {
887
                                // special case: pattern: 0.00.000 = +0 transitions to pattern: 1.00.001 = minneg
888
                                if (iszeroencoding()) {
66,958✔
889
                                        if constexpr (hasSubnormals) {
890
                                                setsign(true);
8✔
891
                                                setbit(0, true);
8✔
892
                                        }
893
                                        else {
894
                                                // special case, we need to jump past all the subnormal value encodings 1.01.0000 = minneg normal
895
                                                setexponent(1 - EXP_BIAS);
2✔
896
                                                setsign(true);
2✔
897
                                        }
898
                                }
899
                                else {
900
                                        bool borrow = true;
66,948✔
901
                                        for (unsigned i = 0; i < MSU; ++i) {
199,441✔
902
                                                if (borrow) {
132,493✔
903
                                                        if ((_block[i] & storageMask) == 0) { // block will underflow
67,209✔
904
                                                                --_block[i];
266✔
905
                                                                borrow = true;
266✔
906
                                                        }
907
                                                        else {
908
                                                                --_block[i];
66,943✔
909
                                                                borrow = false;
66,943✔
910
                                                        }
911
                                                }
912
                                        }
913
                                        if (borrow) {
66,948✔
914
                                                --_block[MSU];
5✔
915
                                        }
916
                                        if constexpr (!hasSubnormals) {
917
                                                if (isdenormal()) {
384✔
918
                                                        // special case, we need to jump past all the subnormal value encodings which puts us on 0
919
                                                        setzero(); // pattern: 0.00.000 = +0
2✔
920
                                                }
921
                                        }
922
                                }
923
                        }
924
                }
925
                return *this;
5,014,100✔
926
        }
927
        cfloat operator--(int) {
134,812✔
928
                cfloat tmp(*this);
134,812✔
929
                operator--();
134,812✔
930
                return tmp;
134,812✔
931
        }
932

933
        // modifiers        
934
        constexpr void clear() noexcept {
136,145,286✔
935
                if constexpr (0 == nrBlocks) {
936
                        return;
937
                }
938
                else if constexpr (1 == nrBlocks) {
939
                        _block[0] = bt(0);
41,282,332✔
940
                }
941
                else if constexpr (2 == nrBlocks) {
942
                        _block[0] = bt(0);
46,801,529✔
943
                        _block[1] = bt(0);
46,801,529✔
944
                }
945
                else if constexpr (3 == nrBlocks) {
946
                        _block[0] = bt(0);
47,927,466✔
947
                        _block[1] = bt(0);
47,927,466✔
948
                        _block[2] = bt(0);
47,927,466✔
949
                }
950
                else if constexpr (4 == nrBlocks) {
951
                        _block[0] = bt(0);
43,275✔
952
                        _block[1] = bt(0);
43,275✔
953
                        _block[2] = bt(0);
43,275✔
954
                        _block[3] = bt(0);
43,275✔
955
                }
956
                else if constexpr (5 == nrBlocks) {
957
                        _block[0] = bt(0);
29,999✔
958
                        _block[1] = bt(0);
29,999✔
959
                        _block[2] = bt(0);
29,999✔
960
                        _block[3] = bt(0);
29,999✔
961
                        _block[4] = bt(0);
29,999✔
962
                }
963
                else if constexpr (6 == nrBlocks) {
964
                        _block[0] = bt(0);
10,016✔
965
                        _block[1] = bt(0);
10,016✔
966
                        _block[2] = bt(0);
10,016✔
967
                        _block[3] = bt(0);
10,016✔
968
                        _block[4] = bt(0);
10,016✔
969
                        _block[5] = bt(0);
10,016✔
970
                }
971
                else if constexpr (7 == nrBlocks) {
972
                        _block[0] = bt(0);
9,999✔
973
                        _block[1] = bt(0);
9,999✔
974
                        _block[2] = bt(0);
9,999✔
975
                        _block[3] = bt(0);
9,999✔
976
                        _block[4] = bt(0);
9,999✔
977
                        _block[5] = bt(0);
9,999✔
978
                        _block[6] = bt(0);
9,999✔
979
                }
980
                else if constexpr (8 == nrBlocks) {
981
                        _block[0] = bt(0);
20,323✔
982
                        _block[1] = bt(0);
20,323✔
983
                        _block[2] = bt(0);
20,323✔
984
                        _block[3] = bt(0);
20,323✔
985
                        _block[4] = bt(0);
20,323✔
986
                        _block[5] = bt(0);
20,323✔
987
                        _block[6] = bt(0);
20,323✔
988
                        _block[7] = bt(0);
20,323✔
989
                }
990
                else {
991
                        for (unsigned i = 0; i < nrBlocks; ++i) {
224,339✔
992
                                _block[i] = bt(0);
203,992✔
993
                        }
994
                }
995
        }
136,145,286✔
996
        constexpr void setzero() noexcept { clear(); }
2,695,278✔
997
        constexpr void setinf(bool sign = true) noexcept {
5,953,689✔
998
                // the Inf encoding is the pattern 0b0'11...11'11...10 for a +inf, and 0b1'11...11'11...110 for a -inf
999
                if constexpr (0 == nrBlocks) {
1000
                        return;
1001
                }
1002
                else if constexpr (1 == nrBlocks) {
1003
                        _block[MSU] = sign ? bt(MSU_MASK ^ LSB_BIT_MASK) : bt(~SIGN_BIT_MASK & (MSU_MASK ^ LSB_BIT_MASK));
2,197,734✔
1004
                }
1005
                else if constexpr (2 == nrBlocks) {
1006
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
1,750,668✔
1007
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
1,750,668✔
1008
                }
1009
                else if constexpr (3 == nrBlocks) {
1010
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
2,002,261✔
1011
                        _block[1] = BLOCK_MASK;
2,002,261✔
1012
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
2,002,261✔
1013
                }
1014
                else if constexpr (4 == nrBlocks) {
1015
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
2,740✔
1016
                        _block[1] = BLOCK_MASK;
2,740✔
1017
                        _block[2] = BLOCK_MASK;
2,740✔
1018
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
2,740✔
1019
                }
1020
                else if constexpr (5 == nrBlocks) {
1021
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
79✔
1022
                        _block[1] = BLOCK_MASK;
79✔
1023
                        _block[2] = BLOCK_MASK;
79✔
1024
                        _block[3] = BLOCK_MASK;
79✔
1025
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
79✔
1026
                }
1027
                else if constexpr (6 == nrBlocks) {
1028
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
41✔
1029
                        _block[1] = BLOCK_MASK;
41✔
1030
                        _block[2] = BLOCK_MASK;
41✔
1031
                        _block[3] = BLOCK_MASK;
41✔
1032
                        _block[4] = BLOCK_MASK;
41✔
1033
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
41✔
1034
                }
1035
                else if constexpr (7 == nrBlocks) {
1036
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
35✔
1037
                        _block[1] = BLOCK_MASK;
35✔
1038
                        _block[2] = BLOCK_MASK;
35✔
1039
                        _block[3] = BLOCK_MASK;
35✔
1040
                        _block[4] = BLOCK_MASK;
35✔
1041
                        _block[5] = BLOCK_MASK;
35✔
1042
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
35✔
1043
                }
1044
                else if constexpr (8 == nrBlocks) {
1045
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
86✔
1046
                        _block[1] = BLOCK_MASK;
86✔
1047
                        _block[2] = BLOCK_MASK;
86✔
1048
                        _block[3] = BLOCK_MASK;
86✔
1049
                        _block[4] = BLOCK_MASK;
86✔
1050
                        _block[5] = BLOCK_MASK;
86✔
1051
                        _block[6] = BLOCK_MASK;
86✔
1052
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
86✔
1053
                }
1054
                else {
1055
                        _block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
45✔
1056
                        for (unsigned i = 1; i < nrBlocks - 1; ++i) {
405✔
1057
                                _block[i] = BLOCK_MASK;
360✔
1058
                        }
1059
                        _block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
45✔
1060
                }        
1061
        }
5,953,689✔
1062
        constexpr void setnan(int NaNType = NAN_TYPE_SIGNALLING) noexcept {
7,120,203✔
1063
                // the NaN encoding is the pattern 0b0'11...11'11...11 for a quiet Nan, and 0b1'11...11'11...111 for a signalling NaN
1064
                if constexpr (0 == nrBlocks) {
1065
                        return;
1066
                }
1067
                else if constexpr (1 == nrBlocks) {
1068
                        // fall through
1069
                }
1070
                else if constexpr (2 == nrBlocks) {
1071
                        _block[0] = BLOCK_MASK;
1,994,452✔
1072
                }
1073
                else if constexpr (3 == nrBlocks) {
1074
                        _block[0] = BLOCK_MASK;
1,048,671✔
1075
                        _block[1] = BLOCK_MASK;
1,048,671✔
1076
                }
1077
                else if constexpr (4 == nrBlocks) {
1078
                        _block[0] = BLOCK_MASK;
25✔
1079
                        _block[1] = BLOCK_MASK;
25✔
1080
                        _block[2] = BLOCK_MASK;
25✔
1081
                }
1082
                else if constexpr (5 == nrBlocks) {
1083
                        _block[0] = BLOCK_MASK;
9✔
1084
                        _block[1] = BLOCK_MASK;
9✔
1085
                        _block[2] = BLOCK_MASK;
9✔
1086
                        _block[3] = BLOCK_MASK;
9✔
1087
                }
1088
                else if constexpr (6 == nrBlocks) {
1089
                        _block[0] = BLOCK_MASK;
4✔
1090
                        _block[1] = BLOCK_MASK;
4✔
1091
                        _block[2] = BLOCK_MASK;
4✔
1092
                        _block[3] = BLOCK_MASK;
4✔
1093
                        _block[4] = BLOCK_MASK;
4✔
1094
                }
1095
                else if constexpr (7 == nrBlocks) {
1096
                        _block[0] = BLOCK_MASK;
4✔
1097
                        _block[1] = BLOCK_MASK;
4✔
1098
                        _block[2] = BLOCK_MASK;
4✔
1099
                        _block[3] = BLOCK_MASK;
4✔
1100
                        _block[4] = BLOCK_MASK;
4✔
1101
                        _block[5] = BLOCK_MASK;
4✔
1102
                }
1103
                else if constexpr (8 == nrBlocks) {
1104
                        _block[0] = BLOCK_MASK;
4✔
1105
                        _block[1] = BLOCK_MASK;
4✔
1106
                        _block[2] = BLOCK_MASK;
4✔
1107
                        _block[3] = BLOCK_MASK;
4✔
1108
                        _block[4] = BLOCK_MASK;
4✔
1109
                        _block[5] = BLOCK_MASK;
4✔
1110
                        _block[6] = BLOCK_MASK;
4✔
1111
                }
1112
                else {
1113
                        for (unsigned i = 0; i < nrBlocks - 1; ++i) {
10✔
1114
                                _block[i] = BLOCK_MASK;
9✔
1115
                        }
1116
                }
1117
                _block[MSU] = NaNType == NAN_TYPE_SIGNALLING ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
7,120,203✔
1118
        }
7,120,203✔
1119
        constexpr void setsign(bool sign = true) {
3,537,266✔
1120
                if (sign) {
3,537,266✔
1121
                        _block[MSU] |= SIGN_BIT_MASK;
631,488✔
1122
                }
1123
                else {
1124
                        _block[MSU] &= ~SIGN_BIT_MASK;
2,905,778✔
1125
                }
1126
        }
3,537,266✔
1127
        constexpr bool setexponent(int scale) {
590,515✔
1128
                if (scale < MIN_EXP_SUBNORMAL || scale > MAX_EXP) return false; // this scale cannot be represented
590,515✔
1129
                if constexpr (nbits < 65) {
1130
                        uint32_t exponentBits = static_cast<uint32_t>(scale + EXP_BIAS);
588,974✔
1131
                        if (scale >= MIN_EXP_SUBNORMAL && scale < MIN_EXP_NORMAL) {
588,974✔
1132
                                // we are a subnormal number: all exponent bits are 0
1133
                                exponentBits = 0;
45✔
1134
                        }
1135
                        // TODO: optimize
1136
                        uint32_t mask = (1ul << (es - 1));
588,974✔
1137
                        for (unsigned i = nbits - 2; i > nbits - 2 - es; --i) {
5,084,077✔
1138
                                setbit(i, (mask & exponentBits));
4,495,103✔
1139
                                mask >>= 1;
4,495,103✔
1140
                        }
1141
                }
1142
                else {
1143
                        uint32_t exponentBits = static_cast<uint32_t>(scale + EXP_BIAS);
1,541✔
1144
                        uint32_t mask = (1ul << (es - 1));
1,541✔
1145
                        for (unsigned i = nbits - 2; i > nbits - 2 - es; --i) {
25,156✔
1146
                                setbit(i, (mask & exponentBits));
23,615✔
1147
                                mask >>= 1;
23,615✔
1148
                        }
1149
                }
1150
                return true;
590,515✔
1151
        }
1152
        constexpr void setfraction(uint64_t raw_bits) {
610✔
1153
                // unoptimized as it is not meant to be an end-user API, it is a test API
1154
                // raw_bits is uint64_t so can have at most 64 bits of fraction data
1155
                constexpr unsigned bitsToSet = (fbits < 64) ? fbits : 64;
610✔
1156
                uint64_t mask{ 1ull };
610✔
1157
                for (unsigned i = 0; i < bitsToSet; ++i) {
24,744✔
1158
                        setbit(i, (mask & raw_bits));
24,134✔
1159
                        mask <<= 1;
24,134✔
1160
                }
1161
        }
610✔
1162
        constexpr void setbit(unsigned i, bool v = true) noexcept {
7,478,318✔
1163
                unsigned blockIndex = i / bitsInBlock;
7,478,318✔
1164
                if (blockIndex < nrBlocks) {
7,478,318✔
1165
                        bt block = _block[blockIndex];
7,478,318✔
1166
                        bt null = ~(1ull << (i % bitsInBlock));
7,478,318✔
1167
                        bt bit = bt(v ? 1 : 0);
7,478,318✔
1168
                        bt mask = bt(bit << (i % bitsInBlock));
7,478,318✔
1169
                        _block[blockIndex] = bt((block & null) | mask);
7,478,318✔
1170
                }
1171
        }
7,478,318✔
1172
        constexpr cfloat& setbits(uint64_t raw_bits) noexcept {
294,371,406✔
1173
                if constexpr (0 == nrBlocks) {
1174
                        return *this;
1175
                }
1176
                else if constexpr (1 == nrBlocks) {
1177
                        _block[0] = raw_bits & storageMask;
89,211,090✔
1178
                }
1179
                else if constexpr (2 == nrBlocks) {
1180
                        if constexpr (bitsInBlock < 64) {
1181
                                _block[0] = raw_bits & storageMask;
101,167,970✔
1182
                                raw_bits >>= bitsInBlock;
101,167,970✔
1183
                                _block[1] = raw_bits & storageMask;
101,167,970✔
1184
                        }
1185
                        else {
1186
                                _block[0] = raw_bits & storageMask;
1187
                                _block[1] = 0;
1188
                        }
1189
                }
1190
                else if constexpr (3 == nrBlocks) {
1191
                        if constexpr (bitsInBlock < 64) {
1192
                                _block[0] = raw_bits & storageMask;
103,832,102✔
1193
                                raw_bits >>= bitsInBlock;
103,832,102✔
1194
                                _block[1] = raw_bits & storageMask;
103,832,102✔
1195
                                raw_bits >>= bitsInBlock;
103,832,102✔
1196
                                _block[2] = raw_bits & storageMask;
103,832,102✔
1197
                        }
1198
                        else {
1199
                                _block[0] = raw_bits & storageMask;
1200
                                _block[1] = 0;
1201
                                _block[2] = 0;
1202
                        }
1203
                }
1204
                else if constexpr (4 == nrBlocks) {
1205
                        if constexpr (bitsInBlock < 64) {
1206
                                _block[0] = raw_bits & storageMask;
70,229✔
1207
                                raw_bits >>= bitsInBlock;
70,229✔
1208
                                _block[1] = raw_bits & storageMask;
70,229✔
1209
                                raw_bits >>= bitsInBlock;
70,229✔
1210
                                _block[2] = raw_bits & storageMask;
70,229✔
1211
                                raw_bits >>= bitsInBlock;
70,229✔
1212
                                _block[3] = raw_bits & storageMask;
70,229✔
1213
                        }
1214
                        else {
1215
                                _block[0] = raw_bits & storageMask;
1216
                                _block[1] = 0;
1217
                                _block[2] = 0;
1218
                                _block[3] = 0;
1219
                        }
1220
                }
1221
                else {
1222
                        if constexpr (bitsInBlock < 64) {
1223
                                for (unsigned i = 0; i < nrBlocks; ++i) {
730,733✔
1224
                                        _block[i] = raw_bits & storageMask;
640,718✔
1225
                                        raw_bits >>= bitsInBlock;
640,718✔
1226
                                }
1227
                        }
1228
                        else {
1229
                                _block[0] = raw_bits & storageMask;
1230
                                for (unsigned i = 1; i < nrBlocks; ++i) {
1231
                                        _block[i] = 0;
1232
                                }
1233
                        }
1234
                }
1235
                _block[MSU] &= MSU_MASK; // enforce precondition for fast comparison by properly nulling bits that are outside of nbits
294,371,406✔
1236
                return *this;
294,371,406✔
1237
        }
1238
        constexpr void setblock(unsigned b, bt data) noexcept {
542,407✔
1239
                if (b < nrBlocks) _block[b] = data;
542,407✔
1240
        }
542,407✔
1241
        
1242
        // create specific number system values of interest
1243
        constexpr cfloat& maxpos() noexcept {
635,204✔
1244
                if constexpr (isSaturating) {
1245
                        // in a saturating encoding with max-exponent values we are removing the Inf encoding pattern 0b0'11...11'11...10 for a +inf, 
1246
                        // and 0b1'11...11'11...110 for a -inf and using it as a value
1247
                        if constexpr (hasMaxExpValues) {
1248
                                // maximum positive value has this bit pattern: 0-1...1-111...110, that is, sign = 0, e = 11..11, f = 111...110
1249
                                clear();
634,552✔
1250
                                flip();
634,552✔
1251
                                setbit(nbits - 1ull, false); // sign = 0
634,552✔
1252
                                setbit(0ull, false); // bit0 = 0
634,552✔
1253
                        }
1254
                        else {
1255
                                // maximum positive value has this bit pattern: 0-11...10-111...111, that is, sign = 0, e = 11..10, f = 111...111
1256
                                clear();
413✔
1257
                                flip();
413✔
1258
                                setbit(fbits, false); // set least significant exponent bit to 0
413✔
1259
                                setbit(nbits - 1ull, false); // set sign to 0
413✔
1260
                        }
1261
                }
1262
                else {
1263
                        // the Inf encoding is the pattern 0b0'11...11'11...10 for a +inf, and 0b1'11...11'11...110 for a -inf
1264
                        // the maxpos is the encoding before that
1265
                        if constexpr (hasMaxExpValues) {
1266
                                // maximum positive value has this bit pattern: 0-1...1-111...101, that is, sign = 0, e = 11..11, f = 111...101
1267
                                clear();
70✔
1268
                                flip();
70✔
1269
                                setbit(nbits - 1ull, false); // sign = 0
70✔
1270
                                setbit(1ull, false); // bit1 = 0
70✔
1271
                        }
1272
                        else {
1273
                                // maximum positive value has this bit pattern: 0-1...0-111...111, that is, sign = 0, e = 11..10, f = 111...111
1274
                                clear();
169✔
1275
                                flip();
169✔
1276
                                setbit(fbits, false); // set least significant exponent bit to 0
169✔
1277
                                setbit(nbits - 1ull, false); // set sign to 0
169✔
1278
                        }
1279
                }
1280
                return *this;
635,204✔
1281
        }
1282
        constexpr cfloat& minpos() noexcept {
13,201✔
1283
                // minpos encoding is not impacted by saturating encodings, which only affects maxpos and inf
1284
                if constexpr (hasSubnormals) {
1285
                        // minimum positive value has this bit pattern: 0-000-00...01, that is, sign = 0, e = 000, f = 00001
1286
                        clear();
327✔
1287
                        setbit(0);
327✔
1288
                }
1289
                else {
1290
                        // minimum positive value has this bit pattern: 0-001-00...0, that is, sign = 0, e = 001, f = 0000
1291
                        clear();
12,874✔
1292
                        setbit(fbits);
12,874✔
1293
                }
1294
                return *this;
13,201✔
1295
        }
1296
        constexpr cfloat& zero() noexcept {
1✔
1297
                // the zero value
1298
                clear();
1✔
1299
                return *this;
1✔
1300
        }
1301
        constexpr cfloat& minneg() noexcept {
11✔
1302
                // minneg encoding is not impacted by saturating encodings, which only affects maxpos and inf
1303
                if constexpr (hasSubnormals) {
1304
                        // minimum negative value has this bit pattern: 1-000-00...01, that is, sign = 1, e = 00, f = 00001
1305
                        clear();
8✔
1306
                        setbit(nbits - 1ull);
8✔
1307
                        setbit(0);
8✔
1308
                }
1309
                else {
1310
                        // minimum negative value has this bit pattern: 1-001-00...0, that is, sign = 1, e = 001, f = 0000
1311
                        clear();
3✔
1312
                        setbit(fbits);
3✔
1313
                        setbit(nbits - 1ull);
3✔
1314
                }
1315
                return *this;
11✔
1316
        }
1317
        constexpr cfloat& maxneg() noexcept {
634,959✔
1318
                if constexpr (isSaturating) {
1319
                        // in a saturating encoding with max-exponent values we are removing the Inf encoding pattern 0b0'11...11'11...10 for a +inf, 
1320
                        // and 0b1'11...11'11...110 for a -inf and using it as a value
1321
                        if constexpr (hasMaxExpValues) {
1322
                                // maximum negative value has this bit pattern: 1-1...1-111...110, that is, sign = 1, e = 1..1, f = 111...110
1323
                                clear();
634,501✔
1324
                                flip();
634,501✔
1325
                                setbit(0ull, false);
634,501✔
1326
                        }
1327
                        else {
1328
                                // maximum negative value has this bit pattern: 1-1...0-111...111, that is, sign = 1, e = 11..10, f = 111...111
1329
                                clear();
400✔
1330
                                flip();
400✔
1331
                                setbit(fbits, false);
400✔
1332
                        }
1333
                }
1334
                else {
1335
                        if constexpr (hasMaxExpValues) {
1336
                                // maximum negative value has this bit pattern: 1-1...1-111...101, that is, sign = 1, e = 1..1, f = 111...101
1337
                                clear();
12✔
1338
                                flip();
12✔
1339
                                setbit(1ull, false);
12✔
1340
                        }
1341
                        else {
1342
                                // maximum negative value has this bit pattern: 1-1...0-111...111, that is, sign = 1, e = 11..10, f = 111...111
1343
                                clear();
46✔
1344
                                flip();
46✔
1345
                                setbit(fbits, false);
46✔
1346
                        }
1347
                }
1348
                return *this;
634,959✔
1349
        }
1350

1351

1352
        /// <summary>
1353
        /// assign the value of the string representation to the cfloat
1354
        /// </summary>
1355
        /// <param name="stringRep">decimal scientific notation of a real number to be assigned</param>
1356
        /// <returns>reference to this cfloat</returns>
1357
        /// Clang doesn't support constexpr yet on string manipulations, so we need to make it conditional
1358
        CONSTEXPRESSION cfloat& assign(const std::string& str) noexcept {
7✔
1359
                clear();
7✔
1360
                unsigned nrChars = static_cast<unsigned>(str.size());
7✔
1361
                unsigned nrBits = 0;
7✔
1362
                unsigned nrDots = 0;
7✔
1363
                std::string bits;
7✔
1364
                if (nrChars > 2) {
7✔
1365
                        if (str[0] == '0' && str[1] == 'b') {
7✔
1366
                                for (unsigned i = 2; i < nrChars; ++i) {
211✔
1367
                                        char c = str[i];
204✔
1368
                                        switch (c) {
204✔
1369
                                        case '0':
186✔
1370
                                        case '1':
1371
                                                ++nrBits;
186✔
1372
                                                bits += c;
186✔
1373
                                                break;
186✔
1374
                                        case '.':
14✔
1375
                                                ++nrDots;
14✔
1376
                                                bits += c;
14✔
1377
                                                break;
14✔
1378
                                        case '\'':
4✔
1379
                                                // consume this delimiting character
1380
                                                break;
4✔
1381
                                        default:
×
1382
                                                std::cerr << "string contained a non-standard character: " << c << '\n';
×
1383
                                                return *this;
×
1384
                                        }
1385
                                }
1386
                        }
1387
                        else {
1388
                                std::cerr << "string must start with 0b: instead input pattern was " << str << '\n';
×
1389
                                return *this;
×
1390
                        }
1391
                }
1392
                else {
1393
                        std::cerr << "string is too short\n";
×
1394
                        return *this;
×
1395
                }
1396

1397
                if (nrBits != nbits) {
7✔
1398
                        std::cerr << "number of bits in the string is " << nrBits << " and needs to be " << nbits << '\n';
×
1399
                        return *this;
×
1400
                }
1401
                if (nrDots != 2) {
7✔
1402
                        std::cerr << "number of segment delimiters in string is " << nrDots << " and needs to be 2 for a cfloat<>\n";
×
1403
                        return *this;
×
1404
                }
1405

1406
                // assign the bits
1407
                int field{ 0 };  // three fields: sign, exponent, mantissa: fields are separated by a '.'
7✔
1408
                int nrExponentBits{ -1 };
7✔
1409
                unsigned bit = nrBits;
7✔
1410
                for (unsigned i = 0; i < bits.size(); ++i) {
207✔
1411
                        char c = bits[i];
200✔
1412
                        if (c == '.') {
200✔
1413
                                ++field;
14✔
1414
                                if (field == 2) { // just finished parsing exponent field: we can now check the number of exponent bits
14✔
1415
                                        if (nrExponentBits != es) {
7✔
1416
                                                std::cerr << "provided binary string representation does not contain " << es << " exponent bits. Found " << nrExponentBits << ". Reset to 0\n";
×
1417
                                                clear();
×
1418
                                                return *this;
×
1419
                                        }
1420
                                }
1421
                        }
1422
                        else {
1423
                                setbit(--bit, c == '1');
186✔
1424
                        }
1425
                        if (field == 1) { // exponent field
200✔
1426
                                ++nrExponentBits;
51✔
1427
                        }
1428
                }
1429
                if (field != 2) {
7✔
1430
                        std::cerr << "provided binary string did not contain three fields separated by '.': Reset to 0\n";
×
1431
                        clear();
×
1432
                        return *this;
×
1433
                }
1434
                return *this;
7✔
1435
        }
7✔
1436

1437
        // selectors
1438
        constexpr bool sign() const noexcept { return (_block[MSU] & SIGN_BIT_MASK) == SIGN_BIT_MASK; }
257,854,453✔
1439
        constexpr int  scale() const noexcept {
60,904,306✔
1440
                int e{ 0 };
60,904,306✔
1441
                if constexpr (MSU_CAPTURES_EXP) {
1442
                        e = static_cast<int>((_block[MSU] & ~SIGN_BIT_MASK) >> EXP_SHIFT);
49,256,352✔
1443
                        if (e == 0) {
49,256,352✔
1444
                                // subnormal scale is determined by fraction
1445
                                // subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1446
                                e = (2l - (1l << (es - 1ull))) - 1;
6,532,930✔
1447
                                if constexpr (nbits > 2 + es) {
1448
                                        for (unsigned i = nbits - 2ull - es; i > 0; --i) {
12,592,415✔
1449
                                                if (test(i)) break;
12,487,813✔
1450
                                                --e;
6,082,021✔
1451
                                        }
1452
                                }
1453
                        }
1454
                        else {
1455
                                e -= EXP_BIAS;
42,723,422✔
1456
                        }
1457
                }
1458
                else {
1459
                        blockbinary<es, bt> ebits{};
11,648,335✔
1460
                        exponent(ebits);
11,647,954✔
1461
                        if (ebits.iszero()) {
11,647,954✔
1462
                                // subnormal scale is determined by fraction
1463
                                // subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1464
                                e = (2l - (1l << (es - 1ull))) - 1;
1,599,671✔
1465
                                if constexpr (nbits > 2 + es) {
1466
                                        for (unsigned i = nbits - 2ull - es; i > 0; --i) {
3,160,457✔
1467
                                                if (test(i)) break;
3,153,335✔
1468
                                                --e;
1,560,797✔
1469
                                        }
1470
                                }
1471
                        }
1472
                        else {
1473
                                e = static_cast<int>(unsigned(ebits) - EXP_BIAS);
10,048,283✔
1474
                        }
1475
                }
1476
                return e;
60,904,306✔
1477
        }
1478
        constexpr bool isneg() const noexcept {
558✔
1479
                if (isnan()) return false;
558✔
1480
                return sign(); 
541✔
1481
        }
1482
        constexpr bool ispos() const noexcept { 
34,472✔
1483
                if (isnan()) return false;
34,472✔
1484
                return !sign(); 
34,472✔
1485
        }
1486
        constexpr bool iszero() const noexcept {
421,300,588✔
1487
                // NOTE: this is a very specific design that makes the decsion that
1488
                // for subnormal encodings found in a configuration that doesn't
1489
                // support them, we assume that these values map to 0.
1490
                if constexpr (hasSubnormals) {
1491
                        return iszeroencoding();
248,937,250✔
1492
                }
1493
                else { // all subnormals round to 0
1494
                        blockbinary<es, bt> ebits{};
172,363,796✔
1495
                        exponent(ebits);
172,363,338✔
1496
                        if (ebits.iszero()) return true; else return false;
172,363,338✔
1497
                }
1498
        }
1499
        constexpr bool isone() const noexcept {
1500
                // unbiased exponent = scale = 0, fraction = 0
1501
                int s = scale();
1502
                if (s == 0) {
1503
                        blockbinary<fbits, bt> f{};
1504
                        fraction(f);
1505
                        return f.iszero();
1506
                }
1507
                return false;
1508
        }
1509
        constexpr bool isinf(int InfType = INF_TYPE_EITHER) const noexcept {
373,689,025✔
1510
                // the bit pattern encoding of Inf is independent of the max-exponent value configuration
1511
                bool isNegInf = false;
373,689,025✔
1512
                bool isPosInf = false;
373,689,025✔
1513
                if constexpr (0 == nrBlocks) {
1514
                        return false;
1515
                }
1516
                else if constexpr (1 == nrBlocks) {
1517
                        isNegInf = (_block[MSU] & MSU_MASK) == (MSU_MASK ^ LSB_BIT_MASK);
167,933,109✔
1518
                        isPosInf = (_block[MSU] & MSU_MASK) == ((MSU_MASK ^ SIGN_BIT_MASK) ^ LSB_BIT_MASK);
167,933,109✔
1519
                }
1520
                else if constexpr (2 == nrBlocks) {
1521
                        bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK));
144,226,067✔
1522
                        isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
144,226,067✔
1523
                        isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
144,226,067✔
1524
                }
1525
                else if constexpr (3 == nrBlocks) {
1526
                        bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK)) && (_block[1] == BLOCK_MASK);
61,337,395✔
1527
                        isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
61,337,395✔
1528
                        isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
61,337,395✔
1529
                }
1530
                else if constexpr (4 == nrBlocks) {
1531
                        bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK)) && (_block[1] == BLOCK_MASK) && (_block[2] == BLOCK_MASK);
100,259✔
1532
                        isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
100,259✔
1533
                        isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
100,259✔
1534
                }
1535
                else {
1536
                        bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK));
92,195✔
1537
                        for (unsigned i = 1; i < nrBlocks - 1; ++i) {
93,877✔
1538
                                if (_block[i] != BLOCK_MASK) {
93,557✔
1539
                                        isInf = false;
91,875✔
1540
                                        break;
91,875✔
1541
                                }
1542
                        }
1543
                        isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
92,195✔
1544
                        isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
92,195✔
1545
                }
1546

1547
                return (InfType == INF_TYPE_EITHER ? (isNegInf || isPosInf) :
424,417,842✔
1548
                        (InfType == INF_TYPE_NEGATIVE ? isNegInf :
76,093,526✔
1549
                                (InfType == INF_TYPE_POSITIVE ? isPosInf : false)));
424,418,443✔
1550
        }
50,728,817✔
1551
        constexpr bool isnan(int NaNType = NAN_TYPE_EITHER) const noexcept {
963,294,414✔
1552
                if constexpr (hasMaxExpValues) {
1553
                        return isnanencoding(NaNType);
564,899,810✔
1554
                }
1555
                else {
1556
                        if (ismaxexpvalue()) {
398,394,604✔
1557
                                // all these max-exponent encodings are NANs, except for the encoding representing INF
1558
                                bool isNaN = isinf() ? false : true;
24,515,354✔
1559
                                bool isNegNaN = isNaN && sign();
24,515,354✔
1560
                                bool isPosNaN = isNaN && !sign();
24,515,354✔
1561
                                return (NaNType == NAN_TYPE_EITHER ? (isNaN) :
27,937,088✔
1562
                                        (NaNType == NAN_TYPE_SIGNALLING ? isNegNaN :
4,556,808✔
1563
                                                (NaNType == NAN_TYPE_QUIET ? isPosNaN : false)));
26,785,502✔
1564
                        }
1565
                        else {
1566
                                return false;
373,879,250✔
1567
                        }
1568
                }
1569
        }
24,515,354✔
1570
        // iszeroencoding returns true if it finds a pure -0 or +0 pattern and returns false otherwise
1571
        constexpr bool iszeroencoding() const noexcept {
354,868,575✔
1572
                if constexpr (0 == nrBlocks) {
1573
                        return true;
1574
                }
1575
                else if constexpr (1 == nrBlocks) {
1576
                        return (_block[MSU] & ~SIGN_BIT_MASK) == 0;
134,114,898✔
1577
                }
1578
                else if constexpr (2 == nrBlocks) {
1579
                        return (_block[0] == 0) && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
146,607,156✔
1580
                }
1581
                else if constexpr (3 == nrBlocks) {
1582
                        return (_block[0] == 0) && _block[1] == 0 && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
73,918,370✔
1583
                }
1584
                else if constexpr (4 == nrBlocks) {
1585
                        return (_block[0] == 0) && _block[1] == 0 && _block[2] == 0 && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
135,354✔
1586
                }
1587
                else {
1588
                        for (unsigned i = 0; i < nrBlocks - 1; ++i) if (_block[i] != 0) return false;
338,397✔
1589
                        return (_block[MSU] & ~SIGN_BIT_MASK) == 0;
555✔
1590
                }
1591
        }
1592
        // isminnegencoding returns true if it find the pattern 1.00.00001 and returns false otherwise
1593
        constexpr bool isminnegencoding() const noexcept {
66,947✔
1594
                if constexpr (0 == nrBlocks) {
1595
                        return false;
1596
                }
1597
                else if constexpr (1 == nrBlocks) {
1598
                        return (_block[MSU] & (SIGN_BIT_MASK | 1ul));
1599
                }
1600
                else if constexpr (2 == nrBlocks) {
1601
                        return ((_block[0] == 1ul) && (_block[1] == SIGN_BIT_MASK));
1,408✔
1602
                }
1603
                else if constexpr (3 == nrBlocks) {
1604
                        return ((_block[0] == 1ul) && (_block[1] == 0) && (_block[2] == SIGN_BIT_MASK));
65,536✔
1605
                }
1606
                else if constexpr (4 == nrBlocks) {
1607
                        return ((_block[0] == 1ul) && (_block[1] == 0) && (_block[2] == 0) && (_block[3] == SIGN_BIT_MASK));
3✔
1608
                }
1609
                else {
1610
                        if (_block[0] != 1ul) return false;
×
1611
                        for (unsigned i = 1; i < nrBlocks - 2; ++i) if (_block[i] != 0) return false;
×
1612
                        return (_block[MSU] == SIGN_BIT_MASK);
×
1613
                }
1614
        }
1615
        constexpr bool isnanencoding(int NaNType = NAN_TYPE_EITHER) const noexcept {
568,567,246✔
1616
                // the bit encoding of NaN is independent of the gradual overflow configuration
1617
                bool isNaN = true;
568,567,246✔
1618
                bool isNegNaN = false;
568,567,246✔
1619
                bool isPosNaN = false;
568,567,246✔
1620

1621
                if constexpr (0 == nrBlocks) {
1622
                        return false;
1623
                }
1624
                else if constexpr (1 == nrBlocks) {
1625
                }
1626
                else if constexpr (2 == nrBlocks) {
1627
                        isNaN = (_block[0] == BLOCK_MASK);
218,589,204✔
1628
                }
1629
                else if constexpr (3 == nrBlocks) {
1630
                        isNaN = (_block[0] == BLOCK_MASK) && (_block[1] == BLOCK_MASK);
175,052,132✔
1631
                }
1632
                else if constexpr (4 == nrBlocks) {
1633
                        isNaN = (_block[0] == BLOCK_MASK) && (_block[1] == BLOCK_MASK) && (_block[2] == BLOCK_MASK);
226,905✔
1634
                }
1635
                else {
1636
                        for (unsigned i = 0; i < nrBlocks - 1; ++i) {
270,020✔
1637
                                if (_block[i] != BLOCK_MASK) {
269,982✔
1638
                                        isNaN = false;
269,777✔
1639
                                        break;
269,777✔
1640
                                }
1641
                        }
1642
                }
1643
                isNegNaN = isNaN && ((_block[MSU] & MSU_MASK) == MSU_MASK);
568,567,246✔
1644
                isPosNaN = isNaN && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
568,567,246✔
1645

1646
                return (NaNType == NAN_TYPE_EITHER ? (isNegNaN || isPosNaN) :
772,702,039✔
1647
                        (NaNType == NAN_TYPE_SIGNALLING ? isNegNaN :
306,054,380✔
1648
                                (NaNType == NAN_TYPE_QUIET ? isPosNaN : false)));
772,406,420✔
1649
        }
204,134,793✔
1650
        // isnormal returns true if 0 or exponent bits are not all zero or one, false otherwise
1651
        constexpr bool isnormal() const noexcept {
60,691,906✔
1652
                if (iszeroencoding()) return true; // filter out the one special case
60,691,906✔
1653
                blockbinary<es, bt> e{};
60,692,286✔
1654
                exponent(e);
60,691,905✔
1655
                return !e.iszero() && !e.all();
60,691,905✔
1656
        }
1657
        // isdenormal returns true if exponent bits are all zero, false otherwise
1658
        constexpr bool isdenormal() const noexcept {
45,172,461✔
1659
                if (iszeroencoding()) return false; // filter out the one special case
45,172,461✔
1660
                blockbinary<es, bt> e{};
45,163,682✔
1661
                exponent(e);
45,163,682✔
1662
                return e.iszero(); 
45,163,682✔
1663
        }
1664
        // ismaxexpvalue returns true if exponent bits are all one, false otherwise
1665
        constexpr bool ismaxexpvalue() const noexcept {
401,641,672✔
1666
                blockbinary<es, bt> e{};
401,642,740✔
1667
                exponent(e);
401,641,672✔
1668
                return e.all();
401,641,672✔
1669
        }
1670
        // isinteger is TBD
1671
        constexpr bool isinteger() const noexcept { return false; } // return (floor(*this) == *this) ? true : false; }
1672
        
1673
        template<typename NativeReal>
1674
        constexpr bool inrange(NativeReal v) const noexcept {
9,306,296✔
1675
                // the valid range for this cfloat includes the interval between 
1676
                // maxpos and the value that would round down to maxpos
1677
                bool bIsInRange = true;                
9,306,296✔
1678
                if (v > 0) {
9,306,296✔
1679
                        cfloat c(SpecificValue::maxpos);
4,427,047✔
1680
                        cfloat<nbits + 1, es, BlockType, hasSubnormals, hasMaxExpValues, isSaturating> d{};
8,027,370✔
1681
                        d = NativeReal(c);
4,427,047✔
1682
                        ++d;
4,427,047✔
1683
                        if (v >= NativeReal(d)) bIsInRange = false;
4,427,047✔
1684
                }
1685
                else {
1686
                        cfloat c(SpecificValue::maxneg);
4,879,249✔
1687
                        cfloat<nbits + 1, es, BlockType, hasSubnormals, hasMaxExpValues, isSaturating> d{};
8,816,662✔
1688
                        d = NativeReal(c);
4,879,249✔
1689
                        --d;
4,879,249✔
1690
                        if (v <= NativeReal(d)) bIsInRange = false;
4,879,249✔
1691
                }
1692

1693
                return bIsInRange;
9,306,296✔
1694
        }
1695
        constexpr bool test(unsigned bitIndex) const noexcept {
15,658,791✔
1696
                return at(bitIndex);
15,658,791✔
1697
        }
1698
        constexpr bool at(unsigned bitIndex) const noexcept {
1,855,219,480✔
1699
                if (bitIndex < nbits) {
1,855,219,480✔
1700
                        bt word = _block[bitIndex / bitsInBlock];
1,855,219,480✔
1701
                        bt mask = bt(1ull << (bitIndex % bitsInBlock));
1,855,219,480✔
1702
                        return (word & mask);
1,855,219,480✔
1703
                }
1704
                return false;
×
1705
        }
1706
        constexpr uint8_t nibble(unsigned n) const noexcept {
640✔
1707
                if (n < (1 + ((nbits - 1) >> 2))) {
640✔
1708
                        bt word = _block[(n * 4) / bitsInBlock];
640✔
1709
                        int nibbleIndexInWord = int(n % (bitsInBlock >> 2ull));
640✔
1710
                        bt mask = bt(0xF << (nibbleIndexInWord * 4));
640✔
1711
                        bt nibblebits = bt(mask & word);
640✔
1712
                        return uint8_t(nibblebits >> (nibbleIndexInWord * 4));
640✔
1713
                }
1714
                return 0;
×
1715
        }
1716
        constexpr bt block(unsigned b) const noexcept {
1717
                if (b < nrBlocks) {
1718
                        return _block[b];
1719
                }
1720
                return 0;
1721
        }
1722

1723
        constexpr void sign(bool& s) const {
800✔
1724
                s = sign();
800✔
1725
        }
800✔
1726
        constexpr void exponent(blockbinary<es, bt>& e) const {
797,780,854✔
1727
                e.clear();
797,780,854✔
1728
                if constexpr (0 == nrBlocks) return;
1729
                else if constexpr (1 == nrBlocks) {
1730
                        bt ebits = bt(_block[MSU] & ~SIGN_BIT_MASK);
368,019,899✔
1731
                        e.setbits(uint64_t(ebits >> EXP_SHIFT));
368,019,899✔
1732
                }
1733
                else if constexpr (nrBlocks > 1) {
1734
                        if (MSU_CAPTURES_EXP) {
1735
                                bt ebits = bt(_block[MSU] & ~SIGN_BIT_MASK);
249,397,702✔
1736
                                e.setbits(uint64_t(ebits >> ((nbits - 1ull - es) % bitsInBlock)));
249,397,702✔
1737
                        }
1738
                        else {
1739
                                for (unsigned i = 0; i < es; ++i) { e.setbit(i, at(nbits - 1ull - es + i)); }
811,958,986✔
1740
                        }
1741
                }
1742
        }
797,780,854✔
1743
        template<unsigned targetFractionBits>
1744
        constexpr blockbinary<targetFractionBits, bt>& fraction(blockbinary<targetFractionBits, bt>& f) const {
34,084✔
1745
                static_assert(targetFractionBits >= fbits, "target blockbinary is too small and can't receive all fraction bits");
1746
                f.clear();
34,084✔
1747
                if constexpr (0 == nrBlocks) return f;
1748
                else if constexpr (1 == nrBlocks) {
1749
                        bt fraction = bt(_block[MSU] & ~MSU_EXP_MASK);
1,001✔
1750
                        f.setbits(fraction);
1,001✔
1751
                }
1752
                else if constexpr (nrBlocks > 1) {
1753
                        for (unsigned i = 0; i < fbits; ++i) { f.setbit(i, at(i)); }
242,396✔
1754
                }
1755
                return f;
34,084✔
1756
        }
1757
        constexpr uint64_t fraction_ull() const {
59,636,226✔
1758
                uint64_t raw{ 0 };
59,636,226✔
1759
                if constexpr (nbits - es - 1ull < 65ull) { // no-op if precondition doesn't hold
1760
                        if constexpr (1 == nrBlocks) {
1761
                                uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
26,585,500✔
1762
                                raw = fbitMask & uint64_t(_block[0]);
26,585,500✔
1763
                        }
1764
                        else if constexpr (2 == nrBlocks) {
1765
                                uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
22,778,234✔
1766
                                raw = fbitMask & ((uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
22,778,234✔
1767
                        }
1768
                        else if constexpr (3 == nrBlocks) {
1769
                                uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
10,249,633✔
1770
                                raw = fbitMask & ((uint64_t(_block[2]) << (2 * bitsInBlock)) | (uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
10,249,633✔
1771
                        }
1772
                        else if constexpr (4 == nrBlocks) {
1773
                                uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
22,451✔
1774
                                raw = fbitMask & ((uint64_t(_block[3]) << (3 * bitsInBlock)) | (uint64_t(_block[2]) << (2 * bitsInBlock)) | (uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
22,451✔
1775
                        }
1776
                        else {
1777
                                uint64_t mask{ 1 };
408✔
1778
                                for (unsigned i = 0; i < fbits; ++i) { 
18,051✔
1779
                                        if (test(i)) {
17,643✔
1780
                                                raw |= mask;
3,295✔
1781
                                        }
1782
                                        mask <<= 1;
17,643✔
1783
                                }
1784
                        }
1785
                }
1786
                return raw;
59,636,226✔
1787
        }
1788
        // construct the significant from the encoding, returns normalization offset
1789
        constexpr unsigned significant(blockbinary<fhbits, bt>& s, bool isNormal = true) const {
23✔
1790
                unsigned shift = 0;
23✔
1791
                if (iszero()) return 0;
23✔
1792
                if constexpr (0 == nrBlocks) return 0;
1793
                else if constexpr (1 == nrBlocks) {
1794
                        bt significant = bt(_block[MSU] & ~MSU_EXP_MASK & ~SIGN_BIT_MASK);
23✔
1795
                        if (isNormal) {
23✔
1796
                                significant |= (bt(0x1ul) << fbits);
×
1797
                        }
1798
                        else {
1799
                                unsigned msb = find_msb(significant);
23✔
1800
//                                std::cout << "msb : " << msb << " : fhbits : " << fhbits << " : " << to_binary(significant, true) << std::endl;
1801
                                shift = fhbits - msb;
23✔
1802
                                significant <<= shift;
23✔
1803
                        }
1804
                        s.setbits(significant);
23✔
1805
                }
1806
                else if constexpr (nrBlocks > 1) {
1807
                        s.clear();
1808
                        // TODO: design and implement a block-oriented algorithm, this sequential algorithm is super slow
1809
                        if (isNormal) {
1810
                                s.setbit(fbits);
1811
                                for (unsigned i = 0; i < fbits; ++i) { s.setbit(i, at(i)); }
1812
                        }
1813
                        else {
1814
                                // Find the MSB of the subnormal: 
1815
                                unsigned msb = 0;
1816
                                for (unsigned i = 0; i < fbits; ++i) { // msb protected from not being assigned through iszero test at prelude of function
1817
                                        msb = fbits - 1ull - i;
1818
                                        if (test(msb)) break;
1819
                                }
1820
                                //      m-----lsb
1821
                                // h00001010101
1822
                                // 101010100000
1823
                                for (unsigned i = 0; i <= msb; ++i) {
1824
                                        s.setbit(fbits - msb + i, at(i));
1825
                                }
1826
                                shift = fhbits - msb;
1827
                        }
1828
                }
1829
                return shift;
23✔
1830
        }
1831
        template<unsigned targetbits>
1832
        constexpr void bits(blockbinary<targetbits, bt>& b) const {
640✔
1833
                unsigned upperbound = (nbits > targetbits ? targetbits : nbits);
640✔
1834
                b.clear();
640✔
1835
                for (unsigned i = 0; i < upperbound; ++i) { b.setbit(i, at(i)); }
3,840✔
1836
        }
640✔
1837

1838
        // casts to native types
1839
        int to_int() const {
59,976✔
1840
                if (isnan()) return 0;
59,976✔
1841
                if (isinf()) return sign() ? std::numeric_limits<int>::min() : std::numeric_limits<int>::max();
56,644✔
1842
                return int(to_native<float>());
50,308✔
1843
        }
1844
        long to_long() const {
1845
                if (isnan()) return 0;
1846
                if (isinf()) return sign() ? std::numeric_limits<long>::min() : std::numeric_limits<long>::max();
1847
                return long(to_native<double>());
1848
        }
1849
        long long to_long_long() const {
1✔
1850
                if (isnan()) return 0;
1✔
1851
                if (isinf()) return sign() ? std::numeric_limits<long long>::min() : std::numeric_limits<long long>::max();
1✔
1852
                return (long long)(to_native<double>());
1✔
1853
        }
1854

1855
        // transform an cfloat to a native C++ floating-point. We are using the native
1856
        // precision to compute, which means that all sub-values need to be representable 
1857
        // by the native precision.
1858
        // A more accurate approximation would require an adaptive precision algorithm
1859
        // with a final rounding step.
1860
        template<typename TargetFloat>
1861
        TargetFloat to_native() const { 
113,465,424✔
1862
                TargetFloat v{ 0.0 };
113,465,424✔
1863
                if (iszero()) {
113,465,424✔
1864
                        // the optimizer might destroy the sign
1865
                        return sign() ? -TargetFloat(0) : TargetFloat(0);
910,619✔
1866
                }
1867
                else if (isnan()) {
112,554,805✔
1868
                        v = sign() ? std::numeric_limits<TargetFloat>::signaling_NaN() : std::numeric_limits<TargetFloat>::quiet_NaN();
6,139,174✔
1869
                }
1870
                else if (isinf()) {
106,415,631✔
1871
                        v = sign() ? -std::numeric_limits<TargetFloat>::infinity() : std::numeric_limits<TargetFloat>::infinity();
144,128✔
1872
                }
1873
                else { // TODO: this approach has catastrophic cancellation when nbits is large and native target float is too small
1874
                        TargetFloat f{ 0 };
106,271,503✔
1875
                        TargetFloat fbit{ 0.5 };
106,271,503✔
1876
                        for (int i = static_cast<int>(nbits - 2ull - es); i >= 0; --i) {
1,313,982,625✔
1877
                                f += at(static_cast<unsigned>(i)) ? fbit : TargetFloat(0);
1,207,711,122✔
1878
                                fbit *= TargetFloat(0.5);
1,207,711,122✔
1879
                        }
1880
                        blockbinary<es, bt> ebits;
1881
                        exponent(ebits);
106,271,503✔
1882
                        if constexpr (hasSubnormals) {
1883
                                if (ebits.iszero()) {
66,510,109✔
1884
                                        // subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1885
                                        TargetFloat exponentiation = TargetFloat(subnormal_exponent[es]); // precomputed values for 2^(2-2^(es-1))
14,672,778✔
1886
                                        v = exponentiation * f;  // f is already f/2^fbits
14,672,778✔
1887
                                        return sign() ? -v : v;
14,672,778✔
1888
                                }
1889
                        }
1890
                        else {
1891
                                if (ebits.iszero()) { // underflow to 0
39,761,394✔
1892
                                        // compiler fast float optimization might destroy the sign
1893
                                        return sign() ? -TargetFloat(0) : TargetFloat(0);
×
1894
                                }
1895
                        }
1896
                        if constexpr (hasMaxExpValues) {
1897
                                // regular: (-1)^s * 2^(e+1-2^(es-1)) * (1 + f/2^fbits))
1898
                                int exponent = static_cast<int>(unsigned(ebits) - EXP_BIAS);
52,318,505✔
1899
                                if (-64 < exponent && exponent < 64) {
52,318,505✔
1900
                                        TargetFloat exponentiation = (exponent >= 0 ? TargetFloat(1ull << exponent) : (1.0f / TargetFloat(1ull << -exponent)));
51,994,600✔
1901
                                        v = exponentiation * (TargetFloat(1.0) + f);
51,994,600✔
1902
                                }
51,994,600✔
1903
                                else {
1904
                                        double exponentiation = ipow(exponent);
323,905✔
1905
                                        v = TargetFloat(exponentiation * (1.0 + f));
323,905✔
1906
                                }
1907
                        }
1908
                        else {
1909
                                if (ebits.all()) {
39,280,220✔
1910
                                        // max-exponent values are mapped to quiet NaNs
1911
                                        v = std::numeric_limits<TargetFloat>::quiet_NaN();
×
1912
                                        return v;
×
1913
                                }
1914
                                else {
1915
                                        // regular: (-1)^s * 2^(e+1-2^(es-1)) * (1 + f/2^fbits))
1916
                                        int exponent = static_cast<int>(unsigned(ebits) - EXP_BIAS);
39,280,220✔
1917
                                        if (-64 < exponent && exponent < 64) {
39,280,220✔
1918
                                                TargetFloat exponentiation = (exponent >= 0 ? TargetFloat(1ull << exponent) : (1.0f / TargetFloat(1ull << -exponent)));
39,079,048✔
1919
                                                v = exponentiation * (TargetFloat(1.0) + f);
39,079,048✔
1920
                                        }
39,079,048✔
1921
                                        else {
1922
                                                double exponentiation = ipow(exponent);
201,172✔
1923
                                                v = TargetFloat(exponentiation * (1.0 + f));
201,172✔
1924
                                        }
1925
                                }
1926
                        }
1927
                        v = sign() ? -v : v;
91,598,725✔
1928
                }
1929
                return v;
97,882,027✔
1930
        }
1931

1932
        // convert a cfloat to a blocktriple with the fraction format 1.ffff
1933
        // we are using the same block type so that we can use block copies to move bits around.
1934
        // Since we tend to have at least two exponent bits, this will lead to
1935
        // most cfloat<->blocktriple cases being efficient as the block types are aligned.
1936
        // The relationship between the source cfloat and target blocktriple is not
1937
        // arbitrary, enforce it by setting the blocktriple fbits to the cfloat's (nbits - es - 1)
1938
        template<typename TargetBlockType = bt>
1939
        constexpr void normalize(blocktriple<fbits, BlockTripleOperator::REP, TargetBlockType>& tgt) const {
3,189✔
1940
                // test special cases
1941
                if (isnan()) {
3,189✔
1942
                        tgt.setnan();
253✔
1943
                }
1944
                else if (isinf()) {
2,936✔
1945
                        tgt.setinf();
246✔
1946
                }
1947
                else if (iszero()) {
2,690✔
1948
                        tgt.setzero();
311✔
1949
                }
1950
                else {
1951
                        tgt.setnormal(); // a blocktriple is always normalized
2,379✔
1952
                        int scale = this->scale();
2,379✔
1953
                        tgt.setsign(sign());
2,379✔
1954
                        tgt.setscale(scale);
2,379✔
1955
                        // set significant
1956
                        // we are going to unify to the format 01.ffffeeee
1957
                        // where 'f' is a fraction bit, and 'e' is an extension bit
1958
                        // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
1959
                        if (isnormal() || ismaxexpvalue()) {
2,379✔
1960
                                // normal encoding or max-exponent encoding (hasMaxExpValues=true):
1961
                                // significand has a hidden bit at position fbits
1962
                                if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
1963
                                        uint64_t raw = fraction_ull();
1,710✔
1964
                                        raw |= (1ull << fbits);
1,710✔
1965
                                        tgt.setbits(raw);
1,710✔
1966
                                }
1967
                                else {
1968
                                        blockcopy(tgt);
111✔
1969
                                        tgt.setbit(fbits);
111✔
1970
                                }
1971
                        }
1972
                        else { // it is a subnormal encoding in this target cfloat
1973
                                int shift = MIN_EXP_NORMAL - scale;
558✔
1974
                                if (shift < 0) shift = 0; // guard against negative shifts
558✔
1975
                                if constexpr (fbits < 64) {
1976
                                        uint64_t raw = fraction_ull();
541✔
1977
                                        raw <<= shift;
541✔
1978
                                        raw |= (1ull << fbits);
541✔
1979
                                        tgt.setbits(raw);
541✔
1980
                                }
1981
                                else {
1982
                                        blockcopy(tgt);
17✔
1983
                                        tgt <<= shift;
17✔
1984
                                        tgt.setbit(fbits);
17✔
1985
                                }
1986
                        }
1987
                }
1988
        }
3,189✔
1989

1990
        // normalize a cfloat to a blocktriple used in add/sub, which has the form 00h.fffff
1991
        // that is 3 + fbits, the 3 extra bits are required to be able to use 2's complement 
1992
        // and capture the largest value of an addition/subtraction.
1993
        // TODO: currently abits = 2*fhbits as the worst case input argument size to
1994
        // capture the smallest normal value in aligned form. There is a faster/smaller
1995
        // implementation where the input is constrainted to just the round, guard, and sticky bits.
1996
        constexpr void normalizeAddition(blocktriple<fbits, BlockTripleOperator::ADD, bt>& tgt) const {
39,489,285✔
1997
                using BlockTripleConfiguration = blocktriple<fbits, BlockTripleOperator::ADD, bt>;
1998
                // test special cases
1999
                if (isnan()) {
39,489,285✔
2000
                        tgt.setnan();
396,494✔
2001
                }
2002
                else if (isinf()) {
39,092,791✔
2003
                        tgt.setinf();
326✔
2004
                }
2005
                else if (iszero()) {
39,092,465✔
2006
                        tgt.setzero();
325,672✔
2007
                }
2008
                else {
2009
                        tgt.setnormal(); // a blocktriple is always normalized
38,766,793✔
2010
                        int scale = this->scale();
38,766,793✔
2011
                        tgt.setsign(sign());
38,766,793✔
2012
                        tgt.setscale(scale);
38,766,793✔
2013
                        // set significant
2014
                        // we are going to unify to the format 001.ffffeeee
2015
                        // where 'f' is a fraction bit, and 'e' is an extension bit
2016
                        // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2017
                        if (isnormal()) {
38,766,793✔
2018
                                if constexpr (fbits < 64 && BlockTripleConfiguration::rbits < (64 - fbits)) {
2019
                                        uint64_t raw = fraction_ull();
26,010,368✔
2020
                                        raw |= (1ull << fbits); // add the hidden bit
26,010,368✔
2021
                                        //std::cout << "normalize      : " << *this << '\n';
2022
                                        //std::cout << "significant    : " << to_binary(raw, fbits + 2) << '\n';
2023
                                        raw <<= BlockTripleConfiguration::rbits;  // rounding bits required for correct rounding
26,010,368✔
2024
                                        //std::cout << "rounding shift : " << to_binary(raw, fbits + 2 + BlockTripleConfiguration::rbits) << '\n';
2025
                                        tgt.setbits(raw);
26,010,368✔
2026
                                }
2027
                                else {
2028
                                        blockcopy(tgt);
962✔
2029
                                        tgt.setradix();
962✔
2030
                                        tgt.setbit(fbits); // add the hidden bit
962✔
2031
                                        tgt.bitShift(BlockTripleConfiguration::rbits);  // rounding bits required for correct rounding
962✔
2032
                                }
2033
                        }
2034
                        else {
2035
                                if (isdenormal()) { // it is a subnormal encoding in this target cfloat
12,755,463✔
2036
                                        if constexpr (hasSubnormals) {
2037
                                                if constexpr (BlockTripleConfiguration::rbits < (64 - fbits)) {
2038
                                                        uint64_t raw = fraction_ull();
6,463,791✔
2039
                                                        int shift = MIN_EXP_NORMAL - scale;
6,463,791✔
2040
                                                        raw <<= shift; // shift but do NOT add a hidden bit as the MSB of the subnormal is shifted in the hidden bit position
6,463,791✔
2041
                                                        raw <<= BlockTripleConfiguration::rbits;  // rounding bits required for correct rounding
6,463,791✔
2042
                                                        tgt.setbits(raw);
6,463,791✔
2043
                                                }
2044
                                                else {
2045
                                                        blockcopy(tgt);
2046
                                                        tgt.setradix();
2047
                                                        int shift = MIN_EXP_NORMAL - scale;
2048
                                                        tgt.bitShift(shift + BlockTripleConfiguration::rbits);  // rounding bits required for correct rounding
2049
                                                }
2050
                                        }
2051
                                        else {  // this cfloat has no subnormals
2052
                                                tgt.setzero(tgt.sign()); // preserve the sign
×
2053
                                        }
2054
                                }
2055
                                else {
2056
                                        // by design, a cfloat is either normal, subnormal, or max-exponent value, so this else clause is by deduction covering a max-exponent value
2057
                                        if constexpr (hasMaxExpValues) {
2058
                                                if constexpr (fbits < 64 && BlockTripleConfiguration::rbits < (64 - fbits)) {
2059
                                                        uint64_t raw = fraction_ull();
6,291,672✔
2060
                                                        raw |= (1ull << fbits); // add the hidden bit
6,291,672✔
2061
                                                        raw <<= BlockTripleConfiguration::rbits;  // rounding bits required for correct rounding
6,291,672✔
2062
                                                        tgt.setbits(raw);
6,291,672✔
2063
                                                }
2064
                                                else {
2065
                                                        blockcopy(tgt);
×
2066
                                                        tgt.setradix();
×
2067
                                                        tgt.setbit(fbits); // add the hidden bit
×
2068
                                                        tgt.bitShift(BlockTripleConfiguration::rbits);  // rounding bits required for correct rounding
×
2069
                                                }
2070
                                        }
2071
                                        else {  // this cfloat has no max-exponent values and thus this represents a nan, signalling or quiet determined by the sign
2072
                                                tgt.setnan(tgt.sign());
×
2073
                                        }                        
2074
                                }
2075
                        }
2076
                }
2077
                // tgt.setradix(radix);
2078
        }
39,489,285✔
2079

2080
        // Normalize a cfloat to a blocktriple used in mul, which has the form 0'00001.fffff
2081
        // that is 2*fbits, plus 1 overflow bit, and the radix set at <fbits>.
2082
        // The result radix will go to 2*fbits after multiplication.
2083
        constexpr void normalizeMultiplication(blocktriple<fbits, BlockTripleOperator::MUL, bt>& tgt) const {
13,924,730✔
2084
                // test special cases
2085
                if (isnan()) {
13,924,730✔
2086
                        tgt.setnan();
450,696✔
2087
                }
2088
                else if (isinf()) {
13,474,034✔
2089
                        tgt.setinf();
652✔
2090
                }
2091
                else if (iszero()) {
13,473,382✔
2092
                        tgt.setzero();
450,960✔
2093
                }
2094
                else {
2095
                        tgt.setnormal(); // a blocktriple is always normalized
13,022,422✔
2096
                        int scale = this->scale();
13,022,422✔
2097
                        tgt.setsign(sign());
13,022,422✔
2098
                        tgt.setscale(scale);
13,022,422✔
2099

2100
                        // set significant
2101
                        // we are going to unify to the format 01.ffffeeee
2102
                        // where 'f' is a fraction bit, and 'e' is an extension bit
2103
                        // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2104
                        if (isnormal() || ismaxexpvalue()) {
13,022,422✔
2105
                                if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
2106
                                        uint64_t raw = fraction_ull();
11,803,508✔
2107
                                        raw |= (1ull << fbits);
11,803,508✔
2108
                                        tgt.setbits(raw);
11,803,508✔
2109
                                }
2110
                                else {
2111
                                        blockcopy(tgt);
832✔
2112
                                        tgt.setradix();
832✔
2113
                                        tgt.setbit(fbits); // add the hidden bit
832✔
2114
                                }
2115
                        }
2116
                        else { 
2117
                                // it is a subnormal encoding in this target cfloat
2118
                                if constexpr (hasSubnormals) {
2119
                                        if constexpr (fbits < 64) {
2120
                                                uint64_t raw = fraction_ull();
1,218,082✔
2121
                                                int shift = MIN_EXP_NORMAL - scale;
1,218,082✔
2122
                                                raw <<= shift;
1,218,082✔
2123
                                                raw |= (1ull << fbits);
1,218,082✔
2124
                                                tgt.setbits(raw);
1,218,082✔
2125
                                        }
2126
                                        else {
2127
                                                blockcopy(tgt);
×
2128
                                                int shift = MIN_EXP_NORMAL - scale;
×
2129
                                                tgt.bitShift(shift);
×
2130
                                                tgt.setbit(fbits);
×
2131
                                        }
2132
                                }
2133
                                else { // this cfloat has no subnormals
2134
                                        tgt.setzero(tgt.sign()); // preserve the sign
×
2135
                                }
2136
                        }
2137
                }
2138
                tgt.setradix(fbits); // override the radix with the input scale for accurate value printing
13,924,730✔
2139
        }
13,924,730✔
2140

2141
        // normalize a cfloat to a blocktriple used in div, which has the form 0'00000'00001.fffff
2142
        // that is 3*fbits, plus 1 overflow bit, and the radix set at <fbits>.
2143
        // the result radix will go to 2*fbits after multiplication.
2144
        // TODO: needs implementation
2145
        constexpr void normalizeDivision(blocktriple<fbits, BlockTripleOperator::DIV, bt>& tgt) const {
8,900,394✔
2146
                constexpr unsigned divshift = blocktriple<fbits, BlockTripleOperator::DIV, bt>::divshift;
8,900,394✔
2147
                // test special cases
2148
                if (isnan()) {
8,900,394✔
2149
                        tgt.setnan();
38✔
2150
                }
2151
                else if (isinf()) {
8,900,356✔
2152
                        tgt.setinf();
6✔
2153
                }
2154
                else if (iszero()) {
8,900,350✔
2155
                        tgt.setzero();
44✔
2156
                }
2157
                else {
2158
                        tgt.setnormal(); // a blocktriple is always normalized
8,900,306✔
2159
                        int scale = this->scale();
8,900,306✔
2160
                        tgt.setsign(sign());
8,900,306✔
2161
                        tgt.setscale(scale);
8,900,306✔
2162
                        // set significant
2163
                        // we are going to unify to the format 01.ffffeeee
2164
                        // where 'f' is a fraction bit, and 'e' is an extension bit
2165
                        // so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2166
                        if (isnormal() || ismaxexpvalue()) {
8,900,306✔
2167
                                if constexpr (fbits < 64 && divshift < (64 - fbits)) {
2168
                                        uint64_t raw = fraction_ull();
7,400,309✔
2169
                                        raw |= (1ull << fbits);
7,400,309✔
2170
                                        raw <<= divshift; // shift the input value to the output radix
7,400,309✔
2171
                                        tgt.setbits(raw);
7,400,309✔
2172
                                }
2173
                                else {
2174
                                        // brute force copy of blocks
2175
                                        blockcopy(tgt);
1,054,350✔
2176
                                        tgt.setbit(fbits);
1,054,350✔
2177
                                        tgt.bitShift(divshift); // shift the input value to the output radix
1,054,350✔
2178
                                }
2179
                        }
2180
                        else { // it is a subnormal encoding in this target cfloat
2181
                                if constexpr (fbits < 64 && divshift < (64 - fbits)) {
2182
                                        uint64_t raw = fraction_ull();
445,645✔
2183
                                        int shift = MIN_EXP_NORMAL - scale;
445,645✔
2184
                                        raw <<= shift;
445,645✔
2185
                                        raw |= (1ull << fbits);
445,645✔
2186
                                        raw <<= divshift; // shift the input value to the output radix
445,645✔
2187
                                        tgt.setbits(raw);
445,645✔
2188
                                }
2189
                                else {
2190
                                        blockcopy(tgt);
2✔
2191
                                        int shift = MIN_EXP_NORMAL - scale;
2✔
2192
                                        tgt.bitShift(shift);
2✔
2193
                                        tgt.setbit(fbits);
2✔
2194
                                        tgt.bitShift(divshift); // shift the input value to the output radix
2✔
2195
                                }
2196
                        }
2197
                }
2198
                tgt.setradix(blocktriple<fbits, BlockTripleOperator::DIV, bt>::radix);
8,900,394✔
2199
        }
8,900,394✔
2200

2201
        // helper debug function
2202
        void constexprClassParameters() const noexcept {
8✔
2203
                std::cout << "-------------------------------------------------------------\n";
8✔
2204
                std::cout << "type              : " << typeid(*this).name() << '\n';
8✔
2205
                std::cout << "nbits             : " << nbits << '\n';
8✔
2206
                std::cout << "es                : " << es << std::endl;
8✔
2207
                std::cout << "hasSubnormals     : " << (hasSubnormals ? "true" : "false") << '\n';
8✔
2208
                std::cout << "hasMaxExpValues   : " << (hasMaxExpValues ? "true" : "false") << '\n';
8✔
2209
                std::cout << "isSaturating      : " << (isSaturating ? "true" : "false") << '\n';
8✔
2210
                std::cout << "ALL_ONES          : " << to_binary(ALL_ONES, 0, true) << '\n';
8✔
2211
                std::cout << "BLOCK_MASK        : " << to_binary(BLOCK_MASK, 0, true) << '\n';
8✔
2212
                std::cout << "nrBlocks          : " << nrBlocks << '\n';
8✔
2213
                std::cout << "bits in MSU       : " << bitsInMSU << '\n';
8✔
2214
                std::cout << "MSU               : " << MSU << '\n';
8✔
2215
                std::cout << "MSU MASK          : " << to_binary(MSU_MASK, 0, true) << '\n';
8✔
2216
                std::cout << "SIGN_BIT_MASK     : " << to_binary(SIGN_BIT_MASK, 0, true) << '\n';
8✔
2217
                std::cout << "LSB_BIT_MASK      : " << to_binary(LSB_BIT_MASK, 0, true) << '\n';
8✔
2218
                std::cout << "MSU CAPTURES_EXP  : " << (MSU_CAPTURES_EXP ? "yes\n" : "no\n");
8✔
2219
                std::cout << "EXP_SHIFT         : " << EXP_SHIFT << '\n';
8✔
2220
                std::cout << "MSU EXP MASK      : " << to_binary(MSU_EXP_MASK, 0, true) << '\n';
8✔
2221
                std::cout << "ALL_ONE_MASK_ES   : " << to_binary(ALL_ONES_ES) << '\n';
8✔
2222
                std::cout << "EXP_BIAS          : " << EXP_BIAS << '\n';
8✔
2223
                std::cout << "MAX_EXP           : " << MAX_EXP << '\n';
8✔
2224
                std::cout << "MIN_EXP_NORMAL    : " << MIN_EXP_NORMAL << '\n';
8✔
2225
                std::cout << "MIN_EXP_SUBNORMAL : " << MIN_EXP_SUBNORMAL << '\n';
8✔
2226
                std::cout << "fraction Blocks   : " << fBlocks << '\n';
8✔
2227
                std::cout << "bits in FSU       : " << bitsInFSU << '\n';
8✔
2228
                std::cout << "FSU               : " << FSU << '\n';
8✔
2229
                std::cout << "FSU MASK          : " << to_binary(FSU_MASK, 0, true) << '\n';
8✔
2230
                std::cout << "topfbits          : " << topfbits << '\n';
8✔
2231
                std::cout << "ALL_ONE_MASK_FR   : " << to_binary(ALL_ONES_FR) << '\n';
8✔
2232
        }
8✔
2233
        void showLimbs() const {
2234
                for (unsigned b = 0; b < nrBlocks; ++b) {
2235
                        std::cout << to_binary(_block[nrBlocks - b - 1], sizeof(bt) * 8) << ' ';
2236
                }
2237
                std::cout << '\n';
2238
        }
2239

2240
protected:
2241
        // HELPER methods
2242

2243
        /// <summary>
2244
        /// 1's complement of the encoding used to set up specific encoding patterns.
2245
        /// This is not an arithmetic operator that makes sense for floating-point numbers.
2246
        /// </summary>
2247
        /// <returns>reference to this cfloat object</returns>
2248
        constexpr cfloat& flip() noexcept { // in-place one's complement
1,270,163✔
2249
                for (unsigned i = 0; i < nrBlocks; ++i) {
4,823,082✔
2250
                        _block[i] = bt(~_block[i]);
3,552,919✔
2251
                }
2252
                _block[MSU] &= MSU_MASK; // assert precondition of properly nulled leading non-bits
1,270,163✔
2253
                return *this;
1,270,163✔
2254
        }
2255

2256
        /// <summary>
2257
        /// shift left is a bit level encoding helper for fast limb-based conversions between different cfloats
2258
        /// </summary>
2259
        /// <param name="bitsToShift"></param>
2260
        void shiftLeft(unsigned bitsToShift) {
60,246✔
2261
                if (bitsToShift == 0) return;
60,246✔
2262
                if (bitsToShift > nbits) {
60,246✔
2263
                        setzero();
×
2264
                }
2265
                if (bitsToShift >= bitsInBlock) {
60,246✔
2266
                        int blockShift = static_cast<int>(bitsToShift / bitsInBlock);
60,246✔
2267
                        for (int i = static_cast<int>(MSU); i >= blockShift; --i) {
140,520✔
2268
                                _block[i] = _block[i - blockShift];
80,274✔
2269
                        }
2270
                        for (int i = blockShift - 1; i >= 0; --i) {
341,435✔
2271
                                _block[i] = bt(0);
281,189✔
2272
                        }
2273
                        // adjust the shift
2274
                        bitsToShift -= blockShift * bitsInBlock;
60,246✔
2275
                        if (bitsToShift == 0) return;
60,246✔
2276
                }
2277
                if constexpr (MSU > 0) {
2278
                        // construct the mask for the upper bits in the block that need to move to the higher word
2279
                        bt mask = 0xFFFFFFFFFFFFFFFF << (bitsInBlock - bitsToShift);
60,200✔
2280
                        for (unsigned i = MSU; i > 0; --i) {
360,835✔
2281
                                _block[i] <<= bitsToShift;
300,635✔
2282
                                // mix in the bits from the right
2283
                                bt bits = bt(mask & _block[i - 1]);
300,635✔
2284
                                _block[i] |= (bits >> (bitsInBlock - bitsToShift));
300,635✔
2285
                        }
2286
                }
2287
                _block[0] <<= bitsToShift;
60,200✔
2288
        }
2289

2290
        // convert an unsigned integer into a cfloat
2291
        // TODO: this method does not protect against being called with a signed integer
2292
        template<typename Ty>
2293
        constexpr cfloat& convert_unsigned_integer(const Ty& rhs) noexcept {
140✔
2294
                clear();
140✔
2295
                if (0 == rhs) return *this;
140✔
2296

2297
                uint64_t raw = static_cast<uint64_t>(rhs);
137✔
2298
                int msb = static_cast<int>(find_msb(raw)) - 1; // msb > 0 due to zero test above 
137✔
2299
                int exponent = msb;
137✔
2300
                // remove the MSB as it represents the hidden bit in the cfloat representation
2301
                uint64_t hmask = ~(1ull << msb);
137✔
2302
                raw &= hmask;
137✔
2303

2304
                constexpr uint32_t sizeInBits = 8 * sizeof(Ty);
137✔
2305
                uint32_t shift = sizeInBits - exponent - 1;
137✔
2306
                raw <<= shift;
137✔
2307
                raw = round<sizeInBits, uint64_t>(raw, exponent);
137✔
2308

2309
                // construct the target cfloat
2310
                if constexpr (fbits < (64 - es)) {
2311
                        uint64_t biasedExponent = static_cast<uint64_t>(static_cast<long long>(exponent) + static_cast<long long>(EXP_BIAS));
107✔
2312
                        uint64_t bits = 0;
107✔
2313
                        bits <<= es;
107✔
2314
                        bits |= biasedExponent;
107✔
2315
                        bits <<= fbits;
107✔
2316
                        bits |= raw;
107✔
2317
                        setbits(bits);
107✔
2318
                }
2319
                else {
2320
                        setsign(false);
30✔
2321
                        setexponent(exponent);
30✔
2322
                        // For large types, place fraction bits at the TOP of the fraction field
2323
                        // After shift, raw has fraction bits at positions (sizeInBits-2) down to (sizeInBits-1-exponent)
2324
                        // We need to place them at positions (fbits-1) down to (fbits-exponent)
2325
                        for (int i = 0; i < exponent; ++i) {
291✔
2326
                                bool bit = (raw >> (sizeInBits - 2 - i)) & 1;
261✔
2327
                                setbit(static_cast<unsigned>(fbits - 1 - i), bit);
261✔
2328
                        }
2329
                }
2330
                return *this;
137✔
2331
        }
2332
        // convert a signed integer into a cfloat
2333
        // TODO: this method does not protect against being called with a signed integer
2334
        template<typename Ty>
2335
        constexpr cfloat& convert_signed_integer(const Ty& rhs) noexcept {
174,972✔
2336
                clear();
174,972✔
2337
                if (0 == rhs) return *this;
174,972✔
2338
                bool s = (rhs < 0);
33,377✔
2339
                using UnsignedTy = std::make_unsigned_t<Ty>;
2340
                UnsignedTy urhs = static_cast<UnsignedTy>(rhs);
33,377✔
2341
                uint64_t raw = static_cast<uint64_t>(s ? (UnsignedTy(0) - urhs) : urhs);
33,377✔
2342

2343
                int msb = static_cast<int>(find_msb(raw)) - 1; // msb > 0 due to zero test above 
33,377✔
2344
                int exponent = msb;
33,377✔
2345
                // remove the MSB as it represents the hidden bit in the cfloat representation
2346
                uint64_t hmask = ~(1ull << msb);
33,377✔
2347
                raw &= hmask;
33,377✔
2348

2349
                // shift the msb to the msb of the fraction
2350
                constexpr uint32_t sizeInBits = 8 * sizeof(Ty);
33,377✔
2351
                uint32_t shift = sizeInBits - exponent - 1;
33,377✔
2352
                raw <<= shift;
33,377✔
2353
                raw = round<sizeInBits, uint64_t>(raw, exponent);
33,377✔
2354

2355
                // construct the target cfloat
2356
                if constexpr (fbits < (64 - es)) {
2357
                        uint64_t biasedExponent = static_cast<uint64_t>(static_cast<long long>(exponent) + static_cast<long long>(EXP_BIAS));
33,006✔
2358
                        uint64_t bits = (s ? 1ull : 0ull);
33,006✔
2359
                        bits <<= es;
33,006✔
2360
                        bits |= biasedExponent;
33,006✔
2361
                        bits <<= fbits;
33,006✔
2362
                        bits |= raw;
33,006✔
2363
                        setbits(bits);
33,006✔
2364
                }
2365
                else {
2366
                        setsign(s);
371✔
2367
                        setexponent(exponent);
371✔
2368
                        // For large types, place fraction bits at the TOP of the fraction field
2369
                        // After shift, raw has fraction bits at positions (sizeInBits-2) down to (sizeInBits-1-exponent)
2370
                        // We need to place them at positions (fbits-1) down to (fbits-exponent)
2371
                        for (int i = 0; i < exponent; ++i) {
3,120✔
2372
                                bool bit = (raw >> (sizeInBits - 2 - i)) & 1;
2,749✔
2373
                                setbit(static_cast<unsigned>(fbits - 1 - i), bit);
2,749✔
2374
                        }
2375
                }
2376
                return *this;
33,377✔
2377
        }
2378

2379
public:
2380
        template<typename Real>
2381
        CONSTEXPRESSION cfloat& convert_ieee754(Real rhs) noexcept {
131,537,909✔
2382
                if constexpr (nbits == 32 && es == 8 && sizeof(Real) == 4) {
2383
                        // we CANNOT use the native conversion to float as cfloats have max-exponent values
2384
                        // which IEEE-754 does not have and thus a native conversion would destroy
2385
                        // only if the Real type is a float can we use the direct conversion
2386

2387
                        // when our cfloat is a perfect match to single precision IEEE-754
2388
                        bool s{ false };
65,749✔
2389
                        uint64_t rawExponent{ 0 };
65,749✔
2390
                        uint64_t rawFraction{ 0 };
65,749✔
2391
                        uint64_t bits{ 0 };
65,749✔
2392
                        extractFields(rhs, s, rawExponent, rawFraction, bits);
65,749✔
2393
                        if (rawExponent == ieee754_parameter<Real>::eallset) { // nan and inf need to be remapped
65,749✔
2394
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::snanmask) ||
24✔
2395
                                        rawFraction == (ieee754_parameter<Real>::fmask & (ieee754_parameter<Real>::qnanmask | ieee754_parameter<Real>::snanmask))) {
18✔
2396
                                        // 1.11111111.00000000.......00000001 signalling nan
2397
                                        // 0.11111111.00000000000000000000001 signalling nan
2398
                                        // MSVC
2399
                                        // 1.11111111.10000000.......00000001 signalling nan
2400
                                        // 0.11111111.10000000.......00000001 signalling nan
2401
                                        setnan(NAN_TYPE_SIGNALLING);
6✔
2402
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2403
                                        return *this;
6✔
2404
                                }
2405
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::qnanmask)) {
18✔
2406
                                        // 1.11111111.10000000.......00000000 quiet nan
2407
                                        // 0.11111111.10000000.......00000000 quiet nan
2408
                                        setnan(NAN_TYPE_QUIET);
5✔
2409
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2410
                                        return *this;
5✔
2411
                                }
2412
                                if (rawFraction == 0ull) {
13✔
2413
                                        // 1.11111111.0000000.......000000000 -inf
2414
                                        // 0.11111111.0000000.......000000000 +inf
2415
                                        setinf(s);
13✔
2416
                                        return *this;
13✔
2417
                                }
2418
                        }
2419
                        uint64_t raw{ s ? 1ull : 0ull };
65,725✔
2420
                        raw <<= 31;
65,725✔
2421
                        raw |= (rawExponent << fbits);
65,725✔
2422
                        raw |= rawFraction;
65,725✔
2423
                        setbits(raw);
65,725✔
2424
                        return *this;
65,725✔
2425
                }
2426
                else if constexpr (nbits == 64 && es == 11 && sizeof(Real) == 8) {
2427
                        // when our cfloat is a perfect match to double precision IEEE-754
2428
                        bool s{ false };
12,582✔
2429
                        uint64_t rawExponent{ 0 };
12,582✔
2430
                        uint64_t rawFraction{ 0 };
12,582✔
2431
                        uint64_t bits{ 0 };
12,582✔
2432
                        extractFields(rhs, s, rawExponent, rawFraction, bits);
12,582✔
2433
                        if (rawExponent == ieee754_parameter<Real>::eallset) { // nan and inf need to be remapped
12,582✔
2434
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::snanmask) ||
25✔
2435
                                        rawFraction == (ieee754_parameter<Real>::fmask & (ieee754_parameter<Real>::qnanmask | ieee754_parameter<Real>::snanmask))) {
18✔
2436
                                        // 1.11111111.00000000.......00000001 signalling nan
2437
                                        // 0.11111111.00000000000000000000001 signalling nan
2438
                                        // MSVC
2439
                                        // 1.11111111.10000000.......00000001 signalling nan
2440
                                        // 0.11111111.10000000.......00000001 signalling nan
2441
                                        setnan(NAN_TYPE_SIGNALLING);
7✔
2442
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2443
                                        return *this;
7✔
2444
                                }
2445
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::qnanmask)) {
18✔
2446
                                        // 1.11111111.10000000.......00000000 quiet nan
2447
                                        // 0.11111111.10000000.......00000000 quiet nan
2448
                                        setnan(NAN_TYPE_QUIET);
6✔
2449
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2450
                                        return *this;
6✔
2451
                                }
2452
                                if (rawFraction == 0ull) {
12✔
2453
                                        // 1.11111111.0000000.......000000000 -inf
2454
                                        // 0.11111111.0000000.......000000000 +inf
2455
                                        setinf(s);
12✔
2456
                                        return *this;
12✔
2457
                                }
2458
                        }
2459
                        // normal and subnormal handling
2460
                        uint64_t raw{ s ? 1ull : 0ull };
12,557✔
2461
                        raw <<= 63;
12,557✔
2462
                        raw |= (rawExponent << fbits);
12,557✔
2463
                        raw |= rawFraction;
12,557✔
2464
                        setbits(raw);
12,557✔
2465
                        return *this;
12,557✔
2466
                }
2467
                else {
2468
                        clear();
131,459,578✔
2469
                        // extract raw IEEE-754 bits
2470
                        bool s{ false };
131,459,578✔
2471
                        uint64_t rawExponent{ 0 };
131,459,578✔
2472
                        uint64_t rawFraction{ 0 };
131,459,578✔
2473
                        uint64_t bits{ 0 };
131,459,578✔
2474
                        extractFields(rhs, s, rawExponent, rawFraction, bits);
131,459,578✔
2475
                        // special case handling
2476
                        if (rawExponent == ieee754_parameter<Real>::eallset) { // nan and inf
131,459,578✔
2477
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::snanmask) ||
5,101,261✔
2478
                                        rawFraction == (ieee754_parameter<Real>::fmask & (ieee754_parameter<Real>::qnanmask | ieee754_parameter<Real>::snanmask))) {
2,585,658✔
2479
                                        // 1.11111111.00000000.......00000001 signalling nan
2480
                                        // 0.11111111.00000000000000000000001 signalling nan
2481
                                        // MSVC
2482
                                        // 1.11111111.10000000.......00000001 signalling nan
2483
                                        // 0.11111111.10000000.......00000001 signalling nan
2484
                                        setnan(NAN_TYPE_SIGNALLING);
2,521,146✔
2485
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2486
                                        return *this;
2,521,146✔
2487
                                }
2488
                                if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::qnanmask)) {
2,580,115✔
2489
                                        // 1.11111111.10000000.......00000000 quiet nan
2490
                                        // 0.11111111.10000000.......00000000 quiet nan
2491
                                        setnan(NAN_TYPE_QUIET);
2,551,418✔
2492
                                        //setsign(s);  a cfloat encodes a signalling nan with sign = 1, and a quiet nan with sign = 0
2493
                                        return *this;
2,551,418✔
2494
                                }
2495
                                if (rawFraction == 0ull) {
28,697✔
2496
                                        // 1.11111111.0000000.......000000000 -inf
2497
                                        // 0.11111111.0000000.......000000000 +inf
2498
                                        setinf(s);
28,673✔
2499
                                        return *this;
28,673✔
2500
                                }
2501
                        }
2502
                        if (rhs == 0.0) { // IEEE rule: this is valid for + and - 0.0
126,358,341✔
2503
                                setbit(nbits - 1ull, s);
597,336✔
2504
                                return *this;
597,336✔
2505
                        }
2506
        
2507
                        // normal number consists of fbits fraction bits and one hidden bit
2508
                        // subnormal number has no hidden bit
2509
                        int exponent = static_cast<int>(rawExponent) - ieee754_parameter<Real>::bias;  // unbias the exponent
125,761,005✔
2510

2511
                        // normalize subnormal source values so downstream code can treat them as normal
2512
                        // A subnormal has rawExponent == 0 and value = 0.fraction * 2^(1-bias).
2513
                        // We find the MSB of the fraction, shift it into the hidden bit position,
2514
                        // mask it off (like a normal's implicit 1), and adjust the exponent.
2515
                        // Setting rawExponent = 1 causes all downstream if(rawExponent != 0) branches
2516
                        // to take the normal-number path, which already handles normal-to-normal and
2517
                        // normal-to-subnormal target conversions correctly.
2518
                        if (rawExponent == 0 && rawFraction != 0) {
125,761,005✔
2519
                                exponent = 1 - ieee754_parameter<Real>::bias; // effective exponent for subnormals
1,212✔
2520
                                unsigned msb_pos = find_msb(rawFraction); // 1-indexed position of MSB
1,212✔
2521
                                unsigned normalizeShift = static_cast<unsigned>(ieee754_parameter<Real>::fbits) + 1u - msb_pos;
1,212✔
2522
                                rawFraction = (rawFraction << normalizeShift) & static_cast<uint64_t>(ieee754_parameter<Real>::fmask);
1,212✔
2523
                                exponent -= static_cast<int>(normalizeShift);
1,212✔
2524
                                rawExponent = 1; // mark as normalized so downstream paths treat this as a normal number
1,212✔
2525
                        }
2526

2527
                        // check special case of
2528
                        //  1- saturating to maxpos/maxneg, or 
2529
                        //  2- projecting to +-inf 
2530
                        // if the value is out of range.
2531
                        // 
2532
                        // One problem here is that at the rounding cusps of maxpos <-> inf <-> nan
2533
                        // you need to go through the rounding logic to know which encoding you end up
2534
                        // with. 
2535
                        // For each specific cfloat configuration, you can work out these rounding cusps
2536
                        // but they need to go through the value transformation to map them back to native
2537
                        // IEEE-754. That is a complex computation to do as a static constexpr as you need
2538
                        // to construct the value, then evaluate it, and store it.
2539
                        // 
2540
                        // The algorithm used here is to check for the obvious out of range values by
2541
                        // comparing their scale to the max scale this cfloat encoding can represent.
2542
                        // For the rounding cusps, we go through the rounding logic, and then clean up
2543
                        // after rounding using the observation that no conversion from a value can ever
2544
                        // yield the NaN encoding.
2545
                        //
2546
                        // The rounding logic will correctly sort between maxpos and inf, and we clean
2547
                        // up any NaN encodings by projecting back to the configuration's saturation rule.
2548
                        //
2549
                        // We could improve on this by creating the database of rounding cusps and
2550
                        // referencing them with a direct value comparison with the input. That would be
2551
                        // the most performant implementation.
2552
                        if (exponent > MAX_EXP) {
125,761,005✔
2553
                                if constexpr (isSaturating) {
2554
                                        if (s) this->maxneg(); else this->maxpos(); // saturate to maxpos or maxneg
634,021✔
2555
                                }
2556
                                else {
2557
                                        setinf(s);
669,045✔
2558
                                }
2559
                                return *this;
1,303,066✔
2560
                        }
2561
                        if constexpr (hasSubnormals) {
2562
                                if (exponent < MIN_EXP_SUBNORMAL - 1) { 
88,959,996✔
2563
                                        // map to +-0 any values that have a scale less than (MIN_EXP_SUBMORNAL - 1)
2564
                                        this->setbit(nbits - 1, s);
145,461✔
2565
                                        return *this;
145,461✔
2566
                                }
2567
                        }
2568
                        else {
2569
                                if (exponent < MIN_EXP_NORMAL - 1) {
35,497,943✔
2570
                                        // map to +-0 any values that have a scale less than (MIN_EXP_MORNAL - 1)
2571
                                        this->setbit(nbits - 1, s);
270,875✔
2572
                                        return *this;
270,875✔
2573
                                }
2574
                        }
2575

2576
                        /////////////////  
2577
                        /// end of special case processing, move on to value sampling and rounding
2578

2579
#if TRACE_CONVERSION
2580
                        std::cout << '\n';
2581
                        std::cout << "value             : " << rhs << '\n';
2582
                        std::cout << "segments          : " << to_binary(rhs) << '\n';
2583
                        std::cout << "sign     bit      : " << (s ? '1' : '0') << '\n';
2584
                        std::cout << "exponent bits     : " << to_binary(rawExponent, ieee754_parameter<Real>::ebits, true) << '\n';
2585
                        std::cout << "fraction bits     : " << to_binary(rawFraction, ieee754_parameter<Real>::fbits, true) << '\n';
2586
                        std::cout << "exponent value    : " << exponent << '\n';
2587
#endif
2588

2589
                        // do the following scenarios have different rounding bits?
2590
                        // input is normal, cfloat is normal           <-- rounding can happen with native ieee-754 bits
2591
                        // input is normal, cfloat is subnormal
2592
                        // input is subnormal, cfloat is normal
2593
                        // input is subnormal, cfloat is subnormal
2594

2595
                        // The first condition is the relationship between the number 
2596
                        // of fraction bits from the source and the number of fraction bits 
2597
                        // in the target cfloat: these are constexpressions and guard the shifts
2598
                        // input fbits >= cfloat fbits                 <-- need to round
2599
                        // input fbits < cfloat fbits                  <-- no need to round
2600

2601
                        // quick check if we are truncating to 0 for all subnormal values
2602
                        if constexpr (!hasSubnormals) {
2603
                                if (exponent < MIN_EXP_NORMAL) {
35,227,068✔
2604
                                        setsign(s); // rest of the bits, exponent and fraction, are already set correctly
123,584✔
2605
                                        return *this;
123,584✔
2606
                                }
2607
                        }
2608
                        if constexpr (fbits < ieee754_parameter<Real>::fbits) {
2609
                                // this is the common case for cfloats that are smaller in size compared to single and double precision IEEE-754
2610
                                constexpr int rightShift = ieee754_parameter<Real>::fbits - fbits; // this is the bit shift to get the MSB of the src to the MSB of the tgt
123,658,548✔
2611
                                uint32_t biasedExponent{ 0 };
123,658,548✔
2612
                                int adjustment{ 0 }; // right shift adjustment for subnormal representation
123,658,548✔
2613
                                uint64_t mask;
2614
                                if (rawExponent != 0) {
123,658,548✔
2615
                                        // the source real is a normal number, 
2616
//                                        if (exponent >= (MIN_EXP_SUBNORMAL - 1) && exponent < MIN_EXP_NORMAL) {
2617
                                        if (exponent < MIN_EXP_NORMAL) {
123,658,548✔
2618
//                                                exponent = (exponent < MIN_EXP_SUBNORMAL ? MIN_EXP_SUBNORMAL : exponent); // clip to the smallest subnormal exponent, otherwise the adjustment is off
2619
                                                // the value is a subnormal number in this representation: biasedExponent = 0
2620
                                                // add the hidden bit to the fraction bits so the denormalization has the correct MSB
2621
                                                rawFraction |= ieee754_parameter<Real>::hmask;
41,271,928✔
2622

2623
                                                // fraction processing: we have 1 hidden + 23 explicit fraction bits 
2624
                                                // f = 1.ffff 2^exponent * 2^fbits * 2^-(2-2^(es-1)) = 1.ff...ff >> (23 - (-exponent + fbits - (2 -2^(es-1))))
2625
                                                // -exponent because we are right shifting and exponent in this range is negative
2626
                                                adjustment = -(exponent + subnormal_reciprocal_shift[es]);
41,271,928✔
2627
                                                // this is the right shift adjustment required for subnormal representation due 
2628
                                                // to the scale of the input number, i.e. the exponent of 2^-adjustment
2629
                                        }
2630
                                        else {
2631
                                                // the value is a normal number in this representation: common case
2632
                                                biasedExponent = static_cast<uint32_t>(exponent + EXP_BIAS); // project the exponent into the target 
82,386,620✔
2633
                                                // fraction processing
2634
                                                // float structure is: seee'eeee'efff'ffff'ffff'ffff'ffff'ffff, s = sign, e - exponent bit, f = fraction bit
2635
                                                // target structure is for example cfloat<8,2>: seef'ffff
2636
                                                // since both are normals, we can shift the incoming fraction to the target structure bits, and round
2637
                                                // MSB of source = 23 - 1, MSB of target = fbits - 1: shift = MSB of src - MSB of tgt => 23 - fbits
2638
                                                adjustment = 0;
82,386,620✔
2639
                                        }
2640
                                        if constexpr (rightShift > 0) {
2641
                                                // if true we need to round
2642
                                                // round-to-even logic
2643
                                                //  ... lsb | guard  round sticky   round
2644
                                                //       x     0       x     x       down
2645
                                                //       0     1       0     0       down  round to even
2646
                                                //       1     1       0     0        up   round to even
2647
                                                //       x     1       0     1        up
2648
                                                //       x     1       1     0        up
2649
                                                //       x     1       1     1        up
2650
                                                // collect lsb, guard, round, and sticky bits
2651

2652

2653
#if TRACE_CONVERSION
2654
                                                std::cout << "fraction bits     : " << to_binary(rawFraction, ieee754_parameter<Real>::nbits, true) << '\n';
2655
                                                std::cout << "lsb mask bits     : " << to_binary(mask, ieee754_parameter<Real>::nbits, true) << '\n';
2656
#endif
2657
                                                mask = (1ull << (rightShift + adjustment)); // bit mask for the lsb bit
123,658,548✔
2658
                                                bool lsb = (mask & rawFraction);
123,658,548✔
2659
                                                mask >>= 1;
123,658,548✔
2660
                                                bool guard = (mask & rawFraction);
123,658,548✔
2661
                                                mask >>= 1;
123,658,548✔
2662
                                                bool round = (mask & rawFraction);
123,658,548✔
2663
                                                if ((rightShift + adjustment) > 1) {
123,658,548✔
2664
                                                        mask = (0xFFFF'FFFF'FFFF'FFFFull << (rightShift + adjustment - 2));
123,658,548✔
2665
                                                        mask = ~mask;
123,658,548✔
2666
                                                }
2667
                                                else {
2668
                                                        mask = 0;
×
2669
                                                }
2670
#if TRACE_CONVERSION
2671
                                                std::cout << "right shift       : " << rightShift << '\n';
2672
                                                std::cout << "adjustment        : " << adjustment << '\n';
2673
                                                std::cout << "shift to LSB      : " << (rightShift + adjustment) << '\n';
2674
                                                std::cout << "fraction bits     : " << to_binary(rawFraction, ieee754_parameter<Real>::nbits, true) << '\n';
2675
                                                std::cout << "sticky mask bits  : " << to_binary(mask, ieee754_parameter<Real>::nbits, true) << '\n';
2676
#endif
2677
                                                bool sticky = (mask & rawFraction);
123,658,548✔
2678
                                                rawFraction >>= (static_cast<int64_t>(rightShift) + static_cast<int64_t>(adjustment));
123,658,548✔
2679

2680
                                                // execute rounding operation
2681
                                                if (guard) {
123,658,548✔
2682
                                                        if (lsb && (!round && !sticky)) ++rawFraction; // round to even
24,518,258✔
2683
                                                        if (round || sticky) ++rawFraction;
24,518,258✔
2684
                                                        if (rawFraction == (1ull << fbits)) { // overflow
24,518,258✔
2685
                                                                if (biasedExponent == ALL_ONES_ES) { // overflow to INF == .111..01
443,446✔
2686
                                                                        rawFraction = INF_ENCODING;
398✔
2687
                                                                }
2688
                                                                else {
2689
                                                                        ++biasedExponent;
443,048✔
2690
                                                                        rawFraction = 0;
443,048✔
2691
                                                                }
2692
                                                        }
2693
                                                }
2694
#if TRACE_CONVERSION
2695
                                                std::cout << "lsb               : " << (lsb ? "1\n" : "0\n");
2696
                                                std::cout << "guard             : " << (guard ? "1\n" : "0\n");
2697
                                                std::cout << "round             : " << (round ? "1\n" : "0\n");
2698
                                                std::cout << "sticky            : " << (sticky ? "1\n" : "0\n");
2699
                                                std::cout << "rounding decision : " << (lsb && (!round && !sticky) ? "round to even\n" : "-\n");
2700
                                                std::cout << "rounding direction: " << (round || sticky ? "round up\n" : "round down\n");
2701
#endif
2702
                                        }
2703
                                        else { // all bits of the float go into this representation and need to be shifted up, no rounding necessary
2704
                                                int shiftLeft = fbits - ieee754_parameter<Real>::fbits;
2705
                                                rawFraction <<= shiftLeft;
2706
                                        }
2707
#if TRACE_CONVERSION
2708
                                        std::cout << "biased exponent   : " << biasedExponent << " : 0x" << std::hex << biasedExponent << std::dec << '\n';
2709
                                        std::cout << "right shift       : " << rightShift << '\n';
2710
                                        std::cout << "adjustment shift  : " << adjustment << '\n';
2711
                                        std::cout << "sticky bit mask   : " << to_binary(mask, 32, true) << '\n';
2712
                                        std::cout << "fraction bits     : " << to_binary(rawFraction, 32, true) << '\n';
2713
#endif
2714
                                        // construct the target cfloat
2715
                                        bits = (s ? 1ull : 0ull);
123,658,548✔
2716
                                        bits <<= es;
123,658,548✔
2717
                                        bits |= biasedExponent;
123,658,548✔
2718
                                        bits <<= fbits;
123,658,548✔
2719
                                        bits |= rawFraction;
123,658,548✔
2720
#if TRACE_CONVERSION
2721
                                        std::cout << "sign bit          : " << (s ? '1' : '0') << '\n';
2722
                                        std::cout << "biased exponent   : " << biasedExponent << " : 0x" << std::hex << biasedExponent << std::dec << '\n';
2723
                                        std::cout << "fraction bits     : " << to_binary(rawFraction, 32, true) << '\n';
2724
                                        std::cout << "cfloat bits       : " << to_binary(bits, nbits, true) << '\n';
2725
#endif
2726
                                        setbits(bits);
123,658,548✔
2727
                                }
2728
                                else {
2729
                                        // the source real is a subnormal number                                
2730
                                        mask = 0x00FF'FFFFu >> (fbits + exponent + subnormal_reciprocal_shift[es] + 1); // mask for sticky bit 
×
2731

2732
                                        // fraction processing: we have fbits+1 bits = 1 hidden + fbits explicit fraction bits 
2733
                                        // f = 1.ffff  2^exponent * 2^fbits * 2^-(2-2^(es-1)) = 1.ff...ff >> (23 - (-exponent + fbits - (2 -2^(es-1))))
2734
                                        // -exponent because we are right shifting and exponent in this range is negative
2735
                                        adjustment = -(exponent + subnormal_reciprocal_shift[es]); // this is the right shift adjustment due to the scale of the input number, i.e. the exponent of 2^-adjustment
×
2736
#if TRACE_CONVERSION                                        
2737
                                        std::cout << "source is subnormal: TBD\n";
2738
                                        std::cout << "shift to LSB    " << (rightShift + adjustment) << '\n';
2739
                                        std::cout << "adjustment      " << adjustment << '\n';
2740
                                        std::cout << "exponent        " << exponent << '\n';
2741
                                        std::cout << "subnormal shift " << subnormal_reciprocal_shift[es] << '\n';
2742
#endif
2743
                                        if (exponent >= (MIN_EXP_SUBNORMAL - 1) && exponent < MIN_EXP_NORMAL) {
2744
                                                // the value is a subnormal number in this representation
2745
                                        }
2746
                                        else {
2747
                                                // the value is a normal number in this representation
2748
                                        }
2749
                                }
2750
                        }
2751
                        else {
2752
                                // no need to round, but we need to shift left to deliver the bits
2753
                                // cfloat<40,  8> = float
2754
                                // cfloat<48,  9> = float
2755
                                // cfloat<56, 10> = float
2756
                                // cfloat<64, 11> = float
2757
                                // cfloat<64, 10> = double 
2758
                                // can we go from an input subnormal to a cfloat normal? 
2759
                                // yes, for example a cfloat<64,11> assigned to a subnormal float
2760

2761
                                // map exponent into target cfloat encoding
2762
                                constexpr int upshift = fbits - ieee754_parameter<Real>::fbits;
259,471✔
2763
                                uint64_t biasedExponent{ 0 };
259,471✔
2764
                                bool subnormalInTarget = (exponent < MIN_EXP_NORMAL);
259,471✔
2765
                                int denormShift = 0;
259,471✔
2766
                                if (subnormalInTarget) {
259,471✔
2767
                                        biasedExponent = 0;
637✔
2768
                                        denormShift = MIN_EXP_NORMAL - exponent;
637✔
2769
                                }
2770
                                else {
2771
                                        biasedExponent = static_cast<uint64_t>(static_cast<int64_t>(exponent) + EXP_BIAS);
258,834✔
2772
                                }
2773
                                // output processing
2774
                                if constexpr (nbits < 65) {
2775
                                        // we can compose the bits in a native 64-bit unsigned integer
2776
                                        // common case: normal to normal
2777
                                        // reference example: nbits = 40, es = 8, fbits = 31: rhs = float fbits = 23; shift left by (31 - 23) = 8
2778

2779
                                        if (rawExponent != 0) {
199,225✔
2780
                                                // rhs is a normal encoding (or a normalized former-subnormal)
2781
                                                uint64_t raw{ s ? 1ull : 0ull };
199,225✔
2782
                                                raw <<= es;
199,225✔
2783
                                                raw |= biasedExponent;
199,225✔
2784
                                                raw <<= fbits;
199,225✔
2785
                                                if (subnormalInTarget) {
199,225✔
2786
                                                        // add the hidden bit and denormalize the fraction
2787
                                                        uint64_t significand = rawFraction | ieee754_parameter<Real>::hmask;
631✔
2788
                                                        int netShift = upshift - denormShift;
631✔
2789
                                                        if (netShift >= 0) {
631✔
2790
                                                                rawFraction = significand << netShift;
631✔
2791
                                                        }
2792
                                                        else {
2793
                                                                // right shift with round-to-nearest-even
2794
                                                                int rshift = -netShift;
×
2795
                                                                uint64_t lsbMask = (1ull << rshift);
×
2796
                                                                bool lsb    = (significand & lsbMask) != 0;
×
2797
                                                                bool guard  = (rshift >= 1) && ((significand & (lsbMask >> 1)) != 0);
×
2798
                                                                bool round  = (rshift >= 2) && ((significand & (lsbMask >> 2)) != 0);
×
2799
                                                                bool sticky = (rshift >= 3) && ((significand & ((lsbMask >> 2) - 1)) != 0);
×
2800
                                                                rawFraction = significand >> rshift;
×
2801
                                                                if (guard) {
×
2802
                                                                        if (lsb && (!round && !sticky)) ++rawFraction; // round to even
×
2803
                                                                        if (round || sticky) ++rawFraction;
×
2804
                                                                }
2805
                                                        }
2806
                                                }
2807
                                                else {
2808
                                                        rawFraction <<= upshift;
198,594✔
2809
                                                }
2810
                                                raw |= rawFraction;
199,225✔
2811
                                                setbits(raw);
199,225✔
2812
                                        }
2813
                                        else {
2814
                                                // rhs is a subnormal (unreachable after normalization above, kept for safety)
2815
                                        }
2816
                                }
2817
                                else {
2818
                                        // we need to write and shift bits into place
2819
                                        // use cases are cfloats like cfloat<80, 11, bt>
2820
                                        // even though the bits that come in are single or double precision
2821
                                        // we need to write the fields and then shifting them in place
2822
                                        // 
2823
                                        // common case: normal to normal
2824
                                        if constexpr (bitsInBlock < 64) {
2825
                                                if (rawExponent != 0) {
60,246✔
2826
                                                        // reference example: nbits = 128, es = 15, fbits = 112: rhs = float: shift left by (112 - 23) = 89
2827
                                                        setbits(biasedExponent);
60,246✔
2828
                                                        shiftLeft(fbits);
60,246✔
2829
                                                        // if subnormal in target, add hidden bit before copying
2830
                                                        uint64_t fractionToCopy = subnormalInTarget
60,246✔
2831
                                                                ? (rawFraction | ieee754_parameter<Real>::hmask)
60,246✔
2832
                                                                : rawFraction;
2833
                                                        // shift fraction bits
2834
                                                        int bitsToShift = subnormalInTarget ? (upshift - denormShift) : upshift;
60,246✔
2835
                                                        // for subnormal targets near minpos, bitsToShift can be negative (right shift)
2836
                                                        // apply rounding on the 64-bit significand before splitting into blocks
2837
                                                        if (bitsToShift < 0) {
60,246✔
2838
                                                                int rshift = -bitsToShift;
×
2839
                                                                uint64_t lsbMask = (1ull << rshift);
×
2840
                                                                bool lsb    = (fractionToCopy & lsbMask) != 0;
×
2841
                                                                bool guard  = (rshift >= 1) && ((fractionToCopy & (lsbMask >> 1)) != 0);
×
2842
                                                                bool round  = (rshift >= 2) && ((fractionToCopy & (lsbMask >> 2)) != 0);
×
2843
                                                                bool sticky = (rshift >= 3) && ((fractionToCopy & ((lsbMask >> 2) - 1)) != 0);
×
2844
                                                                fractionToCopy >>= rshift;
×
2845
                                                                if (guard) {
×
2846
                                                                        if (lsb && (!round && !sticky)) ++fractionToCopy; // round to even
×
2847
                                                                        if (round || sticky) ++fractionToCopy;
×
2848
                                                                }
2849
                                                                bitsToShift = 0;
×
2850
                                                        }
2851
                                                        bt fractionBlock[nrBlocks]{ 0 };
60,246✔
2852
                                                        // copy fraction bits
2853
                                                        unsigned blocksRequired = (8 * sizeof(fractionToCopy) + 1) / bitsInBlock;
60,246✔
2854
                                                        unsigned maxBlockNr = (blocksRequired < nrBlocks ? blocksRequired : nrBlocks);
60,246✔
2855
                                                        uint64_t mask = static_cast<uint64_t>(ALL_ONES); // set up the block mask
60,246✔
2856
                                                        unsigned shift = 0;
60,246✔
2857
                                                        for (unsigned i = 0; i < maxBlockNr; ++i) {
340,810✔
2858
                                                                fractionBlock[i] = bt((mask & fractionToCopy) >> shift);
280,564✔
2859
                                                                mask <<= bitsInBlock;
280,564✔
2860
                                                                shift += bitsInBlock;
280,564✔
2861
                                                        }
2862
                                                        if (bitsToShift >= static_cast<int>(bitsInBlock)) {
60,246✔
2863
                                                                int blockShift = static_cast<int>(bitsToShift / bitsInBlock);
50,219✔
2864
                                                                for (int i = MSU; i >= blockShift; --i) {
271,051✔
2865
                                                                        fractionBlock[i] = fractionBlock[i - blockShift];
220,832✔
2866
                                                                }
2867
                                                                for (int i = blockShift - 1; i >= 0; --i) {
160,763✔
2868
                                                                        fractionBlock[i] = bt(0);
110,544✔
2869
                                                                }
2870
                                                                // adjust the shift
2871
                                                                bitsToShift -= blockShift * bitsInBlock;
50,219✔
2872
                                                        }
2873
                                                        if (bitsToShift > 0) {
60,246✔
2874
                                                                // construct the mask for the upper bits in the block that need to move to the higher word
2875
                                                                bt bitsToMoveMask = bt(ALL_ONES << (bitsInBlock - bitsToShift));
40,212✔
2876
                                                                for (unsigned i = MSU; i > 0; --i) {
211,108✔
2877
                                                                        fractionBlock[i] <<= bitsToShift;
170,896✔
2878
                                                                        // mix in the bits from the right
2879
                                                                        bt fracbits = static_cast<bt>(bitsToMoveMask & fractionBlock[i - 1]); // operator & yields an int
170,896✔
2880
                                                                        fractionBlock[i] |= (fracbits >> (bitsInBlock - bitsToShift));
170,896✔
2881
                                                                }
2882
                                                                fractionBlock[0] <<= bitsToShift;
40,212✔
2883
                                                        }
2884
                                                        // OR the bits in
2885
                                                        for (unsigned i = 0; i <= MSU; ++i) {
421,709✔
2886
                                                                _block[i] |= fractionBlock[i];
361,463✔
2887
                                                        }
2888
                                                        // enforce precondition for fast comparison by properly nulling bits that are outside of nbits
2889
                                                        _block[MSU] &= MSU_MASK;
60,246✔
2890
                                                        // finally, set the sign bit
2891
                                                        setsign(s);
60,246✔
2892
                                                }
2893
                                                else {
2894
                                                        // rhs is a subnormal
2895
                //                                        std::cerr << "rhs is a subnormal : " << to_binary(rhs) << " : " << rhs << '\n';
2896
                                                }
2897
                                        }
2898
                                        else {
2899
                                                // BlockType is incorrect
2900
                                        }
2901
                                }
2902
                        }
2903
                        // post-processing results to implement saturation and projection after rounding logic
2904
                        if constexpr (isSaturating) {
2905
                                if (isinf(INF_TYPE_POSITIVE) || isnan(NAN_TYPE_QUIET)) {
17,112,996✔
2906
                                        maxpos();
587✔
2907
                                }
2908
                                else if (isinf(INF_TYPE_NEGATIVE) || isnan(NAN_TYPE_SIGNALLING)) {
17,112,409✔
2909
                                        maxneg();
587✔
2910
                                }
2911
                        }
2912
                        else {
2913
                                if (isnan(NAN_TYPE_QUIET)) {
106,805,023✔
2914
                                        setinf(false);
301,221✔
2915
                                }
2916
                                else if (isnan(NAN_TYPE_SIGNALLING)) {
106,503,802✔
2917
                                        setinf(true);
300,173✔
2918
                                }
2919
                        }
2920
                        return *this;  // TODO: unreachable in some configurations        
123,918,019✔
2921
                }
2922
        }
2923

2924
        // post-processing results to implement saturation and projection after rounding logic
2925
        // arithmetic bit operations can't produce NaN encodings, so we need to re-interpret
2926
        // these encodings and 'project' them to the proper values.
2927
        void constexpr post_process() noexcept {
2928
                if constexpr (isSaturating) {
2929
                        if (isinf(INF_TYPE_POSITIVE) || isnan(NAN_TYPE_QUIET)) {
2930
                                maxpos();
2931
                        }
2932
                        else if (isinf(INF_TYPE_NEGATIVE) || isnan(NAN_TYPE_SIGNALLING)) {
2933
                                maxneg();
2934
                        }
2935
                }
2936
                else {
2937
                        if (isnan(NAN_TYPE_QUIET)) {
2938
                                setinf(false);
2939
                        }
2940
                        else if (isnan(NAN_TYPE_SIGNALLING)) {
2941
                                setinf(true);
2942
                        }
2943
                }
2944
        }
2945

2946
protected:
2947

2948
        /// <summary>
2949
        /// round a set of source bits to the present representation.
2950
        /// srcbits is the number of bits of significant in the source representation
2951
        /// </summary>
2952
        /// <typeparam name="StorageType"></typeparam>
2953
        /// <param name="raw"></param>
2954
        /// <returns></returns>
2955
        template<unsigned srcbits, typename StorageType>
2956
        constexpr uint64_t round(StorageType raw, int& exponent) noexcept {
33,514✔
2957
                if constexpr (fhbits < srcbits) {
2958
                        // round to even: lsb guard round sticky
2959
                    // collect guard, round, and sticky bits
2960
                    // this same logic will work for the case where
2961
                    // we only have a guard bit and no round and sticky bits
2962
                    // because the mask logic will make round and sticky both 0
2963
                        constexpr uint32_t shift = srcbits - fhbits - 1ull;
30,530✔
2964
                        StorageType mask = (StorageType(1ull) << shift);
30,530✔
2965
                        bool guard = (mask & raw);
30,530✔
2966
                        mask >>= 1;
30,530✔
2967
                        bool round = (mask & raw);
30,530✔
2968
                        if constexpr (shift > 1u) { // protect against a negative shift
2969
                                StorageType allones(StorageType(~0));
30,530✔
2970
                                mask = StorageType(allones << (shift - 2));
30,530✔
2971
                                mask = ~mask;
30,530✔
2972
                        }
2973
                        else {
2974
                                mask = 0;
2975
                        }
2976
                        bool sticky = (mask & raw);
30,530✔
2977

2978
                        raw >>= (shift + 1);  // shift out the bits we are rounding away
30,530✔
2979
                        bool lsb = (raw & 0x1u);
30,530✔
2980
                        //  ... lsb | guard  round sticky   round
2981
                        //       x     0       x     x       down
2982
                        //       0     1       0     0       down  round to even
2983
                        //       1     1       0     0        up   round to even
2984
                        //       x     1       0     1        up
2985
                        //       x     1       1     0        up
2986
                        //       x     1       1     1        up
2987
                        if (guard) {
30,530✔
2988
                                if (lsb && (!round && !sticky)) ++raw; // round to even
3,168✔
2989
                                if (round || sticky) ++raw;
3,168✔
2990
                                if (raw == (1ull << fbits)) { // overflow
3,168✔
2991
                                        ++exponent;
3,168✔
2992
                                        raw >>= 1u;
3,168✔
2993
                                }
2994
                        }
2995
                }
2996
                else {
2997
                        // Target has more precision than source - need to left-shift to align
2998
                        // For large types where fhbits > 64, the fraction bits cannot fit in
2999
                        // a 64-bit raw after shifting. In this case, skip the shift and let
3000
                        // convert_signed/unsigned_integer place bits using setbit().
3001
                        // The caller positions fraction bits at the top of srcbits, and we
3002
                        // need to ensure they don't overflow 64 bits after our shift.
3003
                        constexpr unsigned shift = fhbits - srcbits;
2,984✔
3004
                        if constexpr (fhbits <= 64 && shift < 64) {
3005
                                raw <<= shift;
2,583✔
3006
                        }
3007
                        // else: raw stays as-is; caller will extract bits and place them
3008
                }
3009
                uint64_t significant = raw;
33,514✔
3010
                return significant;
33,514✔
3011
        }
3012

3013
        template<typename ArgumentBlockType>
3014
        constexpr void copyBits(ArgumentBlockType v) {
3015
                unsigned blocksRequired = (8 * sizeof(v) + 1 ) / bitsInBlock;
3016
                unsigned maxBlockNr = (blocksRequired < nrBlocks ? blocksRequired : nrBlocks);
3017
                bt b{ 0ul }; b = bt(~b);
3018
                ArgumentBlockType mask = ArgumentBlockType(b);
3019
                unsigned shift = 0;
3020
                for (unsigned i = 0; i < maxBlockNr; ++i) {
3021
                        _block[i] = bt((mask & v) >> shift);
3022
                        mask <<= bitsInBlock;
3023
                        shift += bitsInBlock;
3024
                }
3025
        }
3026
        void shiftLeft(int leftShift) {
3027
                if (leftShift == 0) return;
3028
                if (leftShift < 0) return shiftRight(-leftShift);
3029
                if (leftShift > long(nbits)) leftShift = nbits; // clip to max
3030
                if (leftShift >= long(bitsInBlock)) {
3031
                        int blockShift = leftShift / static_cast<int>(bitsInBlock);
3032
                        for (signed i = signed(MSU); i >= blockShift; --i) {
3033
                                _block[i] = _block[i - blockShift];
3034
                        }
3035
                        for (signed i = blockShift - 1; i >= 0; --i) {
3036
                                _block[i] = bt(0);
3037
                        }
3038
                        // adjust the shift
3039
                        leftShift -= (long)(blockShift * bitsInBlock);
3040
                        if (leftShift == 0) return;
3041
                }
3042
                // construct the mask for the upper bits in the block that need to move to the higher word
3043
//                bt mask = static_cast<bt>(0xFFFFFFFFFFFFFFFFull << (bitsInBlock - leftShift));
3044
                bt mask = ALL_ONES;
3045
                mask <<= (bitsInBlock - leftShift);
3046
                for (unsigned i = MSU; i > 0; --i) {
3047
                        _block[i] <<= leftShift;
3048
                        // mix in the bits from the right
3049
                        bt bits = static_cast<bt>(mask & _block[i - 1]);
3050
                        _block[i] |= (bits >> (bitsInBlock - leftShift));
3051
                }
3052
                _block[0] <<= leftShift;
3053
        }
3054
        void shiftRight(int rightShift) {
3055
                if (rightShift == 0) return;
3056
                if (rightShift < 0) return shiftLeft(-rightShift);
3057
                if (rightShift >= long(nbits)) {
3058
                        setzero();
3059
                        return;
3060
                }
3061
                bool signext = sign();
3062
                unsigned blockShift = 0;
3063
                if (rightShift >= long(bitsInBlock)) {
3064
                        blockShift = rightShift / bitsInBlock;
3065
                        if (MSU >= blockShift) {
3066
                                // shift by blocks
3067
                                for (unsigned i = 0; i <= MSU - blockShift; ++i) {
3068
                                        _block[i] = _block[i + blockShift];
3069
                                }
3070
                        }
3071
                        // adjust the shift
3072
                        rightShift -= (long)(blockShift * bitsInBlock);
3073
                        if (rightShift == 0) {
3074
                                // fix up the leading zeros if we have a negative number
3075
                                if (signext) {
3076
                                        // rightShift is guaranteed to be less than nbits
3077
                                        rightShift += (long)(blockShift * bitsInBlock);
3078
                                        for (unsigned i = nbits - rightShift; i < nbits; ++i) {
3079
                                                this->setbit(i);
3080
                                        }
3081
                                }
3082
                                else {
3083
                                        // clean up the blocks we have shifted clean
3084
                                        rightShift += (long)(blockShift * bitsInBlock);
3085
                                        for (unsigned i = nbits - rightShift; i < nbits; ++i) {
3086
                                                this->setbit(i, false);
3087
                                        }
3088
                                }
3089
                                return;  // shift was aligned to block boundary, no per-bit shift needed
3090
                        }
3091
                }
3092

3093
                bt mask = ALL_ONES;
3094
                mask >>= (bitsInBlock - rightShift); // this is a mask for the lower bits in the block that need to move to the lower word
3095
                for (unsigned i = 0; i < MSU; ++i) {  // TODO: can this be improved? we should not have to work on the upper blocks in case we block shifted
3096
                        _block[i] >>= rightShift;
3097
                        // mix in the bits from the left
3098
                        bt bits = static_cast<bt>(mask & _block[i + 1]); // & operator returns an int
3099
                        _block[i] |= (bits << (bitsInBlock - rightShift));
3100
                }
3101
                _block[MSU] >>= rightShift;
3102

3103
                // fix up the leading zeros if we have a negative number
3104
                if (signext) {
3105
                        // bitsToShift is guaranteed to be less than nbits
3106
                        rightShift += (long)(blockShift * bitsInBlock);
3107
                        for (unsigned i = nbits - rightShift; i < nbits; ++i) {
3108
                                this->setbit(i);
3109
                        }
3110
                }
3111
                else {
3112
                        // clean up the blocks we have shifted clean
3113
                        rightShift += (long)(blockShift * bitsInBlock);
3114
                        for (unsigned i = nbits - rightShift; i < nbits; ++i) {
3115
                                this->setbit(i, false);
3116
                        }
3117
                }
3118

3119
                // enforce precondition for fast comparison by properly nulling bits that are outside of nbits
3120
                _block[MSU] &= MSU_MASK;
3121
        }
3122

3123
        // calculate the integer power 2 ^ b using exponentiation by squaring
3124
        double ipow(int exponent) const {
525,077✔
3125
                bool negative = (exponent < 0);
525,077✔
3126
                exponent = negative ? -exponent : exponent;
525,077✔
3127
                double result(1.0);
525,077✔
3128
                double base = 2.0;
525,077✔
3129
                for (;;) {
3130
                        if (exponent % 2) result *= base;
3,846,298✔
3131
                        exponent >>= 1;
3,846,298✔
3132
                        if (exponent == 0) break;
3,846,298✔
3133
                        base *= base;
3,321,221✔
3134
                }
3135
                return (negative ? (1.0 / result) : result);
525,077✔
3136
        }
3137

3138
        template<BlockTripleOperator btop, typename TargetBlockType = bt>
3139
        constexpr void blockcopy(blocktriple<fbits, btop, TargetBlockType>& tgt) const {
1,056,274✔
3140
                // brute force copy of blocks
3141
                if constexpr (1 == fBlocks) {
3142
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0] & FSU_MASK));
1,050,746✔
3143
                }
3144
                else if constexpr (2 == fBlocks) {
3145
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
3,044✔
3146
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1] & FSU_MASK));
3,044✔
3147
                }
3148
                else if constexpr (3 == fBlocks) {
3149
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
204✔
3150
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
204✔
3151
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2] & FSU_MASK));
204✔
3152
                }
3153
                else if constexpr (4 == fBlocks) {
3154
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
1,500✔
3155
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
1,500✔
3156
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2]));
1,500✔
3157
                        tgt.setblock(3, static_cast<TargetBlockType>(_block[3] & FSU_MASK));
1,500✔
3158
                }
3159
                else if constexpr (5 == fBlocks) {
3160
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
×
3161
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
×
3162
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2]));
×
3163
                        tgt.setblock(3, static_cast<TargetBlockType>(_block[3]));
×
3164
                        tgt.setblock(4, static_cast<TargetBlockType>(_block[4] & FSU_MASK));
×
3165
                }
3166
                else if constexpr (6 == fBlocks) {
3167
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
3168
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
3169
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2]));
3170
                        tgt.setblock(3, static_cast<TargetBlockType>(_block[3]));
3171
                        tgt.setblock(4, static_cast<TargetBlockType>(_block[4]));
3172
                        tgt.setblock(5, static_cast<TargetBlockType>(_block[5] & FSU_MASK));
3173
                }
3174
                else if constexpr (7 == fBlocks) {
3175
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
8✔
3176
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
8✔
3177
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2]));
8✔
3178
                        tgt.setblock(3, static_cast<TargetBlockType>(_block[3]));
8✔
3179
                        tgt.setblock(4, static_cast<TargetBlockType>(_block[4]));
8✔
3180
                        tgt.setblock(5, static_cast<TargetBlockType>(_block[5]));
8✔
3181
                        tgt.setblock(6, static_cast<TargetBlockType>(_block[6] & FSU_MASK));
8✔
3182
                }
3183
                else if constexpr (8 == fBlocks) {
3184
                        tgt.setblock(0, static_cast<TargetBlockType>(_block[0]));
370✔
3185
                        tgt.setblock(1, static_cast<TargetBlockType>(_block[1]));
370✔
3186
                        tgt.setblock(2, static_cast<TargetBlockType>(_block[2]));
370✔
3187
                        tgt.setblock(3, static_cast<TargetBlockType>(_block[3]));
370✔
3188
                        tgt.setblock(4, static_cast<TargetBlockType>(_block[4]));
370✔
3189
                        tgt.setblock(5, static_cast<TargetBlockType>(_block[5]));
370✔
3190
                        tgt.setblock(6, static_cast<TargetBlockType>(_block[6]));
370✔
3191
                        tgt.setblock(7, static_cast<TargetBlockType>(_block[7] & FSU_MASK));
370✔
3192
                }
3193
                else {
3194
                        for (unsigned i = 0; i < FSU; ++i) {
4,127✔
3195
                                tgt.setblock(i, static_cast<TargetBlockType>(_block[i]));
3,725✔
3196
                        }
3197
                        tgt.setblock(FSU, static_cast<TargetBlockType>(_block[FSU] & FSU_MASK));
402✔
3198
                }
3199
        }
1,056,274✔
3200

3201
private:
3202
        bt _block[nrBlocks];
3203

3204
        //////////////////////////////////////////////////////////////////////////////
3205
        // friend functions
3206

3207
        // template parameters need names different from class template parameters (for gcc and clang)
3208
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3209
        friend std::ostream& operator<< (std::ostream& ostr, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& r);
3210
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3211
        friend std::istream& operator>> (std::istream& istr, cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& r);
3212

3213
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3214
        friend bool operator==(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3215
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3216
        friend bool operator!=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3217
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3218
        friend bool operator< (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3219
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3220
        friend bool operator> (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3221
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3222
        friend bool operator<=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3223
        template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3224
        friend bool operator>=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3225
};
3226

3227
///////////////////////////// IOSTREAM operators ///////////////////////////////////////////////
3228

3229
// convert cfloat to decimal fixpnt string, i.e. "-1234.5678"
3230
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3231
std::string to_decimal_fixpnt_string(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& value, long long precision) {
3232
        constexpr unsigned fbits = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::fbits;
3233
        constexpr unsigned bias = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::EXP_BIAS;
3234
        std::stringstream str;
3235
        if (value.iszero()) {
3236
                str << '0';
3237
                return str.str();
3238
        }
3239
        if (value.sign()) str << '-';
3240

3241
        // construct the discretization levels of the fraction part
3242
        support::decimal range, discretizationLevels, step;
3243
        // create the decimal range we are discretizing
3244
        range.setdigit(1);
3245
        range.shiftLeft(fbits); // the decimal range of the fraction
3246
        discretizationLevels.powerOf2(fbits); // calculate the discretization levels of this range
3247
        step = div(range, discretizationLevels);
3248
        // now construct the value of this range by adding the fraction samples
3249
        support::decimal partial, multiplier;
3250
        partial.setzero();  // if you just want the fraction
3251
        multiplier.setdigit(1);
3252
        // convert the fraction part
3253
        for (unsigned i = 0; i < fbits; ++i) {
3254
                if (value.at(i)) {
3255
                        support::add(partial, multiplier);
3256
                }
3257
                support::add(multiplier, multiplier);
3258
        }
3259
        if (value.isdenormal()) {
3260
                support::mul(partial, step);
3261
                support::decimal scale;
3262
                scale.powerOf2(bias - 1ull);
3263
                partial = support::div(partial, scale);
3264
        } 
3265
        else {
3266
                support::add(partial, multiplier); // add the hidden bit
3267
                support::mul(partial, step);
3268
                support::decimal scale;
3269
                int exponent = value.scale();
3270
                if (exponent < 0) {
3271
                        scale.powerOf2(static_cast<unsigned>(-exponent));
3272
                        partial = support::div(partial, scale);
3273
                }
3274
                else {
3275
                        scale.powerOf2(static_cast<unsigned>(exponent));
3276
                        support::mul(partial, scale);
3277
                }
3278
        }
3279

3280
        // the radix is at fbits
3281
        // The partial represents the parts in the range, so we can deduce
3282
        // the number of leading zeros by comparing to the length of range
3283
        int nrLeadingZeros = static_cast<int>(range.size()) - static_cast<int>(partial.size()) - 1;
3284
        if (nrLeadingZeros >= 0) str << "0.";
3285
        for (int i = 0; i < nrLeadingZeros; ++i) str << '0';
3286
        int digitsWritten = (nrLeadingZeros > 0) ? nrLeadingZeros : 0;
3287
        int position = static_cast<int>(partial.size()) - 1;
3288
        for (support::decimal::const_reverse_iterator rit = partial.rbegin(); rit != partial.rend(); ++rit) {
3289
                str << (int)*rit;
3290
                ++digitsWritten;
3291
                if (position == fbits) str << '.';
3292
                --position;
3293
        }
3294
        if (digitsWritten < precision) { // deal with trailing 0s
3295
                for (unsigned i = static_cast<unsigned>(digitsWritten); i < fbits; ++i) {
3296
                        str << '0';
3297
                }
3298
        }
3299

3300
        return str.str();
3301
}
3302

3303
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3304
std::string to_string(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& value, long long precision) {
3305
        constexpr unsigned fbits = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::fbits;
3306
        std::stringstream str;
3307
        if (value.iszero()) {
3308
                str << '0';
3309
                return str.str();
3310
        }
3311
        if (value.sign()) str << '-';
3312

3313
        // denormalize the number to gain access to the most sigificant digits
3314
        // 1.ffff^e
3315
        // scale is e
3316
        // lsbScale is e - fbits
3317
        // shift to get lsb to position 2^0 = (e - fbits)
3318
        std::int64_t scale = value.scale();
3319
//        std::int64_t shift = scale + fbits; // we want the lsb at 2^0
3320
        std::int64_t lsbScale = scale - fbits;  // scale of the lsb
3321
        support::decimal partial, multiplier;
3322
        partial.setzero();
3323

3324
        multiplier.powerOf2(lsbScale);
3325

3326
        // convert the fraction bits 
3327
        for (unsigned i = 0; i < fbits; ++i) {
3328
                if (value.at(i)) {
3329
                        support::add(partial, multiplier);
3330
                }
3331
                support::add(multiplier, multiplier);
3332
        }
3333
        if (!value.isdenormal()) {
3334
                support::add(partial, multiplier); // add the hidden bit
3335
        }
3336
        str << partial;
3337
        return str.str();
3338
}
3339

3340
//////////////////////////////////////////////////////////////////////////////////////////////
3341
/// stream operators
3342

3343
// ostream output generates an ASCII format for the floating-point argument
3344
// Uses native binary-to-decimal conversion via blocktriple::to_string()
3345
// to produce exact output for all cfloat sizes without double conversion.
3346
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3347
inline std::ostream& operator<<(std::ostream& ostr, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& v) {
3,152✔
3348
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3349
        constexpr unsigned cfbits = Cfloat::fbits;
3,152✔
3350

3351
        std::streamsize prec  = ostr.precision();
3,152✔
3352
        std::streamsize width = ostr.width();
3,152✔
3353
        std::ios_base::fmtflags ff = ostr.flags();
3,152✔
3354
        bool bFixed      = (ff & std::ios_base::fixed) == std::ios_base::fixed;
3,152✔
3355
        bool bScientific = (ff & std::ios_base::scientific) == std::ios_base::scientific;
3,152✔
3356
        bool bShowpos    = (ff & std::ios_base::showpos) != 0;
3,152✔
3357
        bool bUppercase  = (ff & std::ios_base::uppercase) != 0;
3,152✔
3358
        bool bInternal   = (ff & std::ios_base::internal) != 0;
3,152✔
3359
        bool bLeft       = (ff & std::ios_base::left) != 0;
3,152✔
3360
        char fillChar    = ostr.fill();
3,152✔
3361

3362
        if constexpr (cfbits == 0) {
3363
                // degenerate cfloat with no fraction bits: fall back to double
3364
                std::ostringstream oss;
3365
                oss.precision(prec);
3366
                if (bFixed) oss << std::fixed;
3367
                if (bScientific) oss << std::scientific;
3368
                if (bUppercase) oss << std::uppercase;
3369
                if (bShowpos) oss << std::showpos;
3370
                oss << static_cast<double>(v);
3371
                std::string s = oss.str();
3372
                if (width > 0 && s.length() < static_cast<size_t>(width)) {
3373
                        size_t pad = static_cast<size_t>(width) - s.length();
3374
                        if (bLeft) { s.append(pad, fillChar); }
3375
                        else { s.insert(0u, pad, fillChar); }
3376
                }
3377
                return ostr << s;
3378
        } else {
3379
                blocktriple<cfbits, BlockTripleOperator::REP, bt> a;
3,152✔
3380
                v.normalize(a);
3,152✔
3381
                return ostr << a.to_string(prec, width, bFixed, bScientific,
6,304✔
3382
                                           bInternal, bLeft, bShowpos, bUppercase, fillChar);
6,304✔
3383
        }
3384
}
3385

3386
// parse a cfloat from a string in either cfloat hex format (nbits.esxHEXVALUEc)
3387
// or a decimal floating-point representation
3388
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3389
bool parse(const std::string& txt, cfloat<nbits,es,bt,hasSubnormals,hasMaxExpValues,isSaturating>& v) {
24✔
3390
        // check if the txt is of the native cfloat form: nbits.esX[0x]hexvaluec
3391
        std::regex cfloat_regex(R"(^[0-9]+\.[0-9]+[xX](0[xX])?[0-9A-Fa-f]+c?$)");
24✔
3392
        if (std::regex_match(txt, cfloat_regex)) {
24✔
3393
                // found a cfloat representation: parse nbits.esxHEXVALUEc
3394
                std::string nbitsStr, esStr, bitStr;
3✔
3395
                auto it = txt.begin();
3✔
3396
                for (; it != txt.end(); ++it) {
9✔
3397
                        if (*it == '.') break;
9✔
3398
                        nbitsStr.append(1, *it);
6✔
3399
                }
3400
                for (++it; it != txt.end(); ++it) {
6✔
3401
                        if (*it == 'x' || *it == 'X') break;
6✔
3402
                        esStr.append(1, *it);
3✔
3403
                }
3404
                for (++it; it != txt.end(); ++it) {
21✔
3405
                        if (*it == 'c') break;
21✔
3406
                        bitStr.append(1, *it);
18✔
3407
                }
3408
                unsigned nbits_in = 0;
3✔
3409
                unsigned es_in = 0;
3✔
3410
                {
3411
                        std::istringstream ss(nbitsStr);
3✔
3412
                        ss >> nbits_in;
3✔
3413
                        if (ss.fail()) return false;
3✔
3414
                }
3✔
3415
                {
3416
                        std::istringstream ss(esStr);
3✔
3417
                        ss >> es_in;
3✔
3418
                        if (ss.fail()) return false;
3✔
3419
                }
3✔
3420
                // native cfloat form must match target configuration
3421
                if (nbits_in != nbits || es_in != es) return false;
3✔
3422
                uint64_t raw = 0;
2✔
3423
                std::istringstream ss(bitStr);
2✔
3424
                ss >> std::hex >> raw;
2✔
3425
                if (ss.fail()) return false;
2✔
3426
                ss >> std::ws;
2✔
3427
                if (!ss.eof()) return false;
2✔
3428
                v.setbits(raw);
2✔
3429
                return true;
2✔
3430
        }
3✔
3431
        else {
3432
                // assume it is a float/double/long double representation
3433
                std::istringstream ss(txt);
21✔
3434
                double d;
3435
                ss >> d;
21✔
3436
                if (ss.fail()) return false;
21✔
3437
                ss >> std::ws;
21✔
3438
                if (!ss.eof()) return false;
21✔
3439
                v = d;
20✔
3440
                return true;
20✔
3441
        }
21✔
3442
}
24✔
3443

3444
// read an ASCII float or cfloat format: nbits.esxNN...NNc, for example: 16.5x7C00c
3445
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3446
inline std::istream& operator>>(std::istream& istr, cfloat<nbits,es,bt,hasSubnormals,hasMaxExpValues,isSaturating>& v) {
17✔
3447
        std::string txt;
17✔
3448
        istr >> txt;
17✔
3449
        if (!parse(txt, v)) {
17✔
3450
                std::cerr << "unable to parse -" << txt << "- into a cfloat value\n";
×
3451
        }
3452
        return istr;
17✔
3453
}
17✔
3454

3455
// encoding helpers
3456

3457
// return the Unit in the Last Position
3458
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3459
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ulp(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& a) {
49✔
3460
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> b(a);
49✔
3461
        return ++b - a;
86✔
3462
}
3463

3464
// transform cfloat to a binary representation
3465
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3466
inline std::string to_binary(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& number, bool nibbleMarker = false) {
1,511✔
3467
        std::stringstream s;
1,511✔
3468
        s << "0b";
1,511✔
3469
        unsigned index = nbits;
1,511✔
3470
        s << (number.at(--index) ? '1' : '0') << '.';
1,511✔
3471

3472
        for (int i = int(es) - 1; i >= 0; --i) {
10,510✔
3473
                s << (number.at(--index) ? '1' : '0');
8,999✔
3474
                if (i > 0 && (i % 4) == 0 && nibbleMarker) s << '\'';
8,999✔
3475
        }
3476

3477
        s << '.';
1,511✔
3478

3479
        constexpr int fbits = nbits - 1ull - es;
1,511✔
3480
        for (int i = fbits - 1; i >= 0; --i) {
32,322✔
3481
                s << (number.at(--index) ? '1' : '0');
30,811✔
3482
                if (i > 0 && (i % 4) == 0 && nibbleMarker) s << '\'';
30,811✔
3483
        }
3484

3485
        return s.str();
3,022✔
3486
}
1,511✔
3487

3488
// transform a cfloat into a triple representation
3489
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3490
inline std::string to_triple(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& number, bool nibbleMarker = true) {
3✔
3491
        std::stringstream s;
3✔
3492
        blocktriple<cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::fbits, BlockTripleOperator::REP, bt> triple;
3✔
3493
        number.normalize(triple);
3✔
3494
        s << to_triple(triple, nibbleMarker);
3✔
3495
        return s.str();
6✔
3496
}
3✔
3497

3498
// Magnitude of a cfloat (equivalent to turning the sign bit off).
3499
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3500
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>
3501
abs(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& v) {
16,952✔
3502
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> a(v);
16,952✔
3503
        a.setsign(false);
16,952✔
3504
        return a;
16,952✔
3505
}
3506

3507
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3508
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>
3509
fabs(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> v) {
40✔
3510
        return abs(v);
40✔
3511
}
3512

3513
////////////////////// debug helpers
3514

3515
// convenience method to gain access to the values of the constexpr variables that govern the cfloat behavior
3516
template<unsigned nbits, unsigned es, typename bt = uint8_t, bool hasSubnormals = false, bool hasMaxExpValues = false, bool isSaturating = false>
3517
void ReportCfloatClassParameters() {
3518
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> a;
3519
        a.constexprClassParameters();
3520
}
3521

3522
//////////////////////////////////////////////////////
3523
/// cfloat - cfloat binary logic operators
3524

3525
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3526
inline bool operator==(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) {
120,418,175✔
3527
        if (lhs.isnan() || rhs.isnan()) return false;
120,418,175✔
3528
        if (lhs.iszero() && rhs.iszero()) return true; // +0 == -0 per IEEE-754 §5.11
118,763,261✔
3529
        for (unsigned i = 0; i < lhs.nrBlocks; ++i) {
360,106,320✔
3530
                if (lhs._block[i] != rhs._block[i]) {
244,599,020✔
3531
                        return false;
2,104,965✔
3532
                }
3533
        }
3534
        return true;
115,507,300✔
3535
}
3536
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3537
inline bool operator!=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return !operator==(lhs, rhs); }
119,020,028✔
3538
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3539
inline bool operator< (const cfloat<nnbits, nes, nbt, nsub, nsup, nsat>& lhs, const cfloat<nnbits, nes, nbt, nsub, nsup, nsat>& rhs) {
5,568,897✔
3540
        if (lhs.isnan() || rhs.isnan()) return false;
5,568,897✔
3541
        // need this as arithmetic difference is defined as snan(indeterminate)
3542
        if (lhs.isinf(INF_TYPE_NEGATIVE) && rhs.isinf(INF_TYPE_NEGATIVE)) return false;
5,453,389✔
3543
        if (lhs.isinf(INF_TYPE_POSITIVE) && rhs.isinf(INF_TYPE_POSITIVE)) return false;
5,453,375✔
3544
        if constexpr (nsub) {
3545
                cfloat<nnbits, nes, nbt, nsub, nsup, nsat> diff = (lhs - rhs);
5,384,416✔
3546
                return (!diff.iszero() && diff.sign()) ? true : false;  // got to guard against -0
5,384,416✔
3547
        }
3548
        else {
3549
                if (lhs.iszero() && rhs.iszero()) return false;  // we need to 'collapse' all zero encodings
68,926✔
3550
                if (lhs.sign() && !rhs.sign()) return true;
68,918✔
3551
                if (!lhs.sign() && rhs.sign()) return false;
51,695✔
3552
                bool positive = lhs.ispos();
34,472✔
3553
                if (positive) {
34,472✔
3554
                        if (lhs.scale() < rhs.scale()) return true;
17,252✔
3555
                        if (lhs.scale() > rhs.scale()) return false;
12,786✔
3556
                }
3557
                else {
3558
                        if (lhs.scale() > rhs.scale()) return true;
17,220✔
3559
                        if (lhs.scale() < rhs.scale()) return false;
12,770✔
3560
                }
3561
                // sign and scale are the same
3562
                if (lhs.scale() == rhs.scale()) {
16,642✔
3563
                        // compare fractions: we do not have subnormals, so we can ignore the hidden bit
3564
                        blockbinary<nnbits - 1ull - nes, nbt> l, r;
3565
                        lhs.fraction(l);
16,642✔
3566
                        rhs.fraction(r);
16,642✔
3567
                        blockbinary<nnbits - nes, nbt> ll, rr; // fbits + 1 so we can 0 extend to honor 2's complement encoding of blockbinary
3568
                        ll.assignWithoutSignExtend(l);
16,642✔
3569
                        rr.assignWithoutSignExtend(r);
16,642✔
3570
                        return (positive ? (ll < rr) : (ll > rr));
16,642✔
3571
                }
3572
                return false;
×
3573
        }
3574
}
3575
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3576
inline bool operator> (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { 
2,794,589✔
3577
        if (lhs.isnan() || rhs.isnan()) return false;
2,794,589✔
3578
        // need this as arithmetic difference is defined as snan(indeterminate)
3579
        if (lhs.isinf(INF_TYPE_NEGATIVE) && rhs.isinf(INF_TYPE_NEGATIVE)) return false;
2,786,475✔
3580
        if (lhs.isinf(INF_TYPE_POSITIVE) && rhs.isinf(INF_TYPE_POSITIVE)) return false;
2,786,461✔
3581
        return  operator< (rhs, lhs); 
2,786,445✔
3582
}
3583
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3584
inline bool operator<=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { 
1,398,017✔
3585
        if (lhs.isnan() || rhs.isnan()) return false;
1,398,017✔
3586
        return !operator> (lhs, rhs); 
1,389,917✔
3587
}
3588
template<unsigned nnbits, unsigned nes, typename nbt, bool nsub, bool nsup, bool nsat>
3589
inline bool operator>=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) {
1,399,400✔
3590
        if (lhs.isnan() || rhs.isnan()) return false;
1,399,400✔
3591
        return !operator< (lhs, rhs); 
1,391,294✔
3592
}
3593

3594
//////////////////////////////////////////////////////
3595
/// cfloat - cfloat binary arithmetic operators
3596

3597
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3598
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
9,974,529✔
3599
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
9,974,529✔
3600
        sum += rhs;
9,974,529✔
3601
        return sum;
9,974,529✔
3602
}
3603
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3604
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
8,460,609✔
3605
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
8,460,609✔
3606
        diff -= rhs;
8,460,609✔
3607
        return diff;
8,460,609✔
3608
}
3609
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3610
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
4,280,467✔
3611
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
4,280,467✔
3612
        mul *= rhs;
4,280,467✔
3613
        return mul;
4,280,467✔
3614
}
3615
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3616
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
5,138,816✔
3617
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
5,138,816✔
3618
        ratio /= rhs;
5,138,816✔
3619
        return ratio;
5,138,815✔
3620
}
3621

3622
/// binary cfloat - literal arithmetic operators
3623

3624
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3625
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(float lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3626
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
3627
        sum += rhs;
3628
        return sum;
3629
}
3630
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3631
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(float lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3632
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3633
        diff -= rhs;
3634
        return diff;
3635
}
3636
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3637
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(float lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3638
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
3639
        mul *= rhs;
3640
        return mul;
3641
}
3642
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3643
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(float lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
2✔
3644
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
2✔
3645
        ratio /= rhs;
2✔
3646
        return ratio;
2✔
3647
}
3648

3649
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3650
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(double lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
4✔
3651
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
4✔
3652
        sum += rhs;
4✔
3653
        return sum;
4✔
3654
}
3655
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3656
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(double lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3657
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3658
        diff -= rhs;
3659
        return diff;
3660
}
3661
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3662
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(double lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
16✔
3663
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
16✔
3664
        mul *= rhs;
16✔
3665
        return mul;
16✔
3666
}
3667
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3668
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(double lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3669
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
3670
        ratio /= rhs;
3671
        return ratio;
3672
}
3673

3674
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3675
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
×
3676
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
×
3677
        sum += rhs;
×
3678
        return sum;
×
3679
}
3680
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3681
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3682
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3683
        diff -= rhs;
3684
        return diff;
3685
}
3686
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3687
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
38✔
3688
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
38✔
3689
        mul *= rhs;
38✔
3690
        return mul;
38✔
3691
}
3692
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3693
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
374✔
3694
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
374✔
3695
        ratio /= rhs;
374✔
3696
        return ratio;
374✔
3697
}
3698

3699
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3700
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(unsigned int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3701
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
3702
        sum += rhs;
3703
        return sum;
3704
}
3705
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3706
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(unsigned int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3707
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3708
        diff -= rhs;
3709
        return diff;
3710
}
3711
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3712
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(unsigned int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3713
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
3714
        mul *= rhs;
3715
        return mul;
3716
}
3717
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3718
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(unsigned int lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3719
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
3720
        ratio /= rhs;
3721
        return ratio;
3722
}
3723

3724
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3725
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3726
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
3727
        sum += rhs;
3728
        return sum;
3729
}
3730
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3731
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3732
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3733
        diff -= rhs;
3734
        return diff;
3735
}
3736
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3737
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3738
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
3739
        mul *= rhs;
3740
        return mul;
3741
}
3742
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3743
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3744
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
3745
        ratio /= rhs;
3746
        return ratio;
3747
}
3748

3749
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3750
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(unsigned long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3751
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> sum(lhs);
3752
        sum += rhs;
3753
        return sum;
3754
}
3755
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3756
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(unsigned long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3757
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> diff(lhs);
3758
        diff -= rhs;
3759
        return diff;
3760
}
3761
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3762
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(unsigned long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3763
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> mul(lhs);
3764
        mul *= rhs;
3765
        return mul;
3766
}
3767
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3768
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(unsigned long long lhs, const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& rhs) {
3769
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ratio(lhs);
3770
        ratio /= rhs;
3771
        return ratio;
3772
}
3773

3774
///  binary cfloat - literal arithmetic operators
3775

3776
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3777
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
3778
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3779
        Cfloat sum(lhs);
3780
        sum += Cfloat(rhs);
3781
        return sum;
3782
}
3783
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3784
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
3785
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3786
        Cfloat diff(lhs);
3787
        diff -= rhs;
3788
        return diff;
3789
}
3790
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3791
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
3792
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3793
        Cfloat mul(lhs);
3794
        mul *= Cfloat(rhs);
3795
        return mul;
3796
}
3797
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3798
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
3799
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3800
        Cfloat ratio(lhs);
3801
        ratio /= Cfloat(rhs);
3802
        return ratio;
3803
}
3804

3805
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3806
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
3807
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3808
        Cfloat sum(lhs);
3809
        sum += Cfloat(rhs);
3810
        return sum;
3811
}
3812
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3813
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
4✔
3814
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3815
        Cfloat diff(lhs);
4✔
3816
        diff -= rhs;
4✔
3817
        return diff;
4✔
3818
}
3819
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3820
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
3821
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3822
        Cfloat mul(lhs);
3823
        mul *= Cfloat(rhs);
3824
        return mul;
3825
}
3826
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3827
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
3828
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3829
        Cfloat ratio(lhs);
3830
        ratio /= Cfloat(rhs);
3831
        return ratio;
3832
}
3833

3834
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3835
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
3836
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3837
        Cfloat sum(lhs);
3838
        sum += Cfloat(rhs);
3839
        return sum;
3840
}
3841
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3842
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
4✔
3843
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3844
        Cfloat diff(lhs);
4✔
3845
        diff -= rhs;
4✔
3846
        return diff;
4✔
3847
}
3848
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3849
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
3850
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3851
        Cfloat mul(lhs);
3852
        mul *= Cfloat(rhs);
3853
        return mul;
3854
}
3855
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3856
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
3857
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3858
        Cfloat ratio(lhs);
3859
        ratio /= Cfloat(rhs);
3860
        return ratio;
3861
}
3862

3863
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3864
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned int rhs) {
3865
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3866
        Cfloat sum(lhs);
3867
        sum += Cfloat(rhs);
3868
        return sum;
3869
}
3870
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3871
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned int rhs) {
3872
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3873
        Cfloat diff(lhs);
3874
        diff -= rhs;
3875
        return diff;
3876
}
3877
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3878
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned int rhs) {
3879
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3880
        Cfloat mul(lhs);
3881
        mul *= Cfloat(rhs);
3882
        return mul;
3883
}
3884
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3885
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned int rhs) {
3886
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3887
        Cfloat ratio(lhs);
3888
        ratio /= Cfloat(rhs);
3889
        return ratio;
3890
}
3891

3892
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3893
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
3894
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3895
        Cfloat sum(lhs);
3896
        sum += Cfloat(rhs);
3897
        return sum;
3898
}
3899
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3900
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
3901
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3902
        Cfloat diff(lhs);
3903
        diff -= rhs;
3904
        return diff;
3905
}
3906
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3907
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
3908
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3909
        Cfloat mul(lhs);
3910
        mul *= Cfloat(rhs);
3911
        return mul;
3912
}
3913
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3914
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
3915
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3916
        Cfloat ratio(lhs);
3917
        ratio /= Cfloat(rhs);
3918
        return ratio;
3919
}
3920

3921
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3922
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned long long rhs) {
3923
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3924
        Cfloat sum(lhs);
3925
        sum += Cfloat(rhs);
3926
        return sum;
3927
}
3928
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3929
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned long long rhs) {
3930
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3931
        Cfloat diff(lhs);
3932
        diff -= rhs;
3933
        return diff;
3934
}
3935
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3936
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned long long rhs) {
3937
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3938
        Cfloat mul(lhs);
3939
        mul *= Cfloat(rhs);
3940
        return mul;
3941
}
3942
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3943
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, unsigned long long rhs) {
3944
        using Cfloat = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>;
3945
        Cfloat ratio(lhs);
3946
        ratio /= Cfloat(rhs);
3947
        return ratio;
3948
}
3949

3950
///////////////////////////////////////////////////////////////////////
3951
///   binary logic literal comparisons
3952

3953
// cfloat - literal float logic operators
3954
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3955
inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
1✔
3956
        return float(lhs) == rhs;
1✔
3957
}
3958
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3959
inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
1✔
3960
        return float(lhs) != rhs;
1✔
3961
}
3962
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3963
inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
2✔
3964
        return float(lhs) < rhs;
2✔
3965
}
3966
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3967
inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
1✔
3968
        return float(lhs) > rhs;
1✔
3969
}
3970
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3971
inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
1✔
3972
        return float(lhs) <= rhs;
1✔
3973
}
3974
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3975
inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, float rhs) {
1✔
3976
        return float(lhs) >= rhs;
1✔
3977
}
3978
// cfloat - literal double logic operators
3979
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3980
inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
1✔
3981
        return double(lhs) == rhs;
1✔
3982
}
3983
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3984
inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
1✔
3985
        return double(lhs) != rhs;
1✔
3986
}
3987
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3988
inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
336✔
3989
        return double(lhs) < rhs;
336✔
3990
}
3991
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3992
inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
892✔
3993
        return double(lhs) > rhs;
892✔
3994
}
3995
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
3996
inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
1✔
3997
        return double(lhs) <= rhs;
1✔
3998
}
3999
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4000
inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, double rhs) {
1✔
4001
        return double(lhs) >= rhs;
1✔
4002
}
4003

4004
#if LONG_DOUBLE_SUPPORT
4005
// cfloat - literal long double logic operators
4006
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4007
inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4008
        return (long double)(lhs) == rhs;
1✔
4009
}
4010
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4011
inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4012
        return (long double)(lhs) != rhs;
1✔
4013
}
4014
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4015
inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4016
        return (long double)(lhs) < rhs;
1✔
4017
}
4018
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4019
inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4020
        return (long double)(lhs) > rhs;
1✔
4021
}
4022
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4023
inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4024
        return (long double)(lhs) <= rhs;
1✔
4025
}
4026
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4027
inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long double rhs) {
1✔
4028
        return (long double)(lhs) >= rhs;
1✔
4029
}
4030
#endif
4031

4032
// cfloat - literal int logic operators
4033
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4034
inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
11✔
4035
        return operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
11✔
4036
}
4037
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4038
inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
1✔
4039
        return operator!=(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
1✔
4040
}
4041
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4042
inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
107,196✔
4043
        return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
107,196✔
4044
}
4045
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4046
inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
666✔
4047
        return operator<(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs), lhs);
666✔
4048
}
4049
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4050
inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
1✔
4051
        return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs)) || operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
1✔
4052
}
4053
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4054
inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, int rhs) {
1✔
4055
        return !operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
1✔
4056
}
4057

4058
// cfloat - long long logic operators
4059
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4060
inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4061
        return operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
4062
}
4063
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4064
inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4065
        return operator!=(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
4066
}
4067
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4068
inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4069
        return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
4070
}
4071
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4072
inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4073
        return operator<(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs), lhs);
4074
}
4075
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4076
inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4077
        return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs)) || operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
4078
}
4079
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4080
inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& lhs, long long rhs) {
4081
        return !operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>(rhs));
4082
}
4083

4084
// standard library functions for floating point
4085

4086
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4087
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> frexp(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& x, int* exp) {
57,412✔
4088
        *exp = x.scale();
57,412✔
4089
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> fraction(x);
57,412✔
4090
        fraction.setexponent(0);
57,412✔
4091
        return fraction;
57,412✔
4092
}
4093

4094
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4095
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> ldexp(const cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& x, int exp) {
765✔
4096
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> result(x);
765✔
4097
        int xexp = x.scale();
765✔
4098
        result.setexponent(xexp + exp);  // TODO: this does not work for subnormals
765✔
4099
        return result;
765✔
4100
}
4101

4102
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4103
inline cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> 
4104
fma(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> x,
3✔
4105
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> y,
4106
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> z) {
4107
        cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> fused{ 0 };
3✔
4108
        constexpr unsigned FBITS = cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>::fbits;
3✔
4109
        constexpr unsigned EXTRA_FBITS = FBITS+2;
3✔
4110
        constexpr unsigned EXTENDED_PRECISION = nbits + EXTRA_FBITS;
3✔
4111
        // the C++ fma spec indicates that the x*y+z is evaluated in 'infinite' precision
4112
        // with only a single rounding event. The minimum finite precision that would behave like this
4113
        // is the precision where the product x*y does not need to be rounded, which will
4114
        // need at least 2*(fbits+1) mantissa bits to capture all bits that can be
4115
        // generated by the product.
4116
        cfloat<EXTENDED_PRECISION, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> preciseX(x), preciseY(y), preciseZ(z);
3✔
4117
//        ReportValue(preciseX, "extended precision x");
4118
//        ReportValue(preciseY, "extended precision y");
4119
//        ReportValue(preciseZ, "extended precision z");
4120
        cfloat<EXTENDED_PRECISION, es, bt, hasSubnormals, hasMaxExpValues, isSaturating> product = preciseX * preciseY;
3✔
4121
//        ReportValue(product, "extended precision p");
4122
        fused = product + preciseZ;
3✔
4123
        return fused;
3✔
4124
}
4125

4126
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4127
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>&
4128
minpos(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& c) {
4129
        return c.minpos();
4130
}
4131
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4132
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>&
4133
maxpos(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& c) {
4134
        return c.maxpos();
4135
}
4136
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4137
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>&
4138
minneg(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& c) {
4139
        return c.minneg();
4140
}
4141
template<unsigned nbits, unsigned es, typename bt, bool hasSubnormals, bool hasMaxExpValues, bool isSaturating>
4142
cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>&
4143
maxneg(cfloat<nbits, es, bt, hasSubnormals, hasMaxExpValues, isSaturating>& c) {
4144
        return c.maxneg();
4145
}
4146

4147
}} // namespace sw::universal
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc