• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

randombit / botan / 20888692252

11 Jan 2026 03:20AM UTC coverage: 90.352% (-0.07%) from 90.42%
20888692252

Pull #5227

github

web-flow
Merge 931b6907a into 9224539eb
Pull Request #5227: Poly1305 optimizations

101900 of 112781 relevant lines covered (90.35%)

12940779.93 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.09
/src/lib/mac/poly1305/poly1305.cpp
1
/*
2
* Derived from poly1305-donna-64.h by Andrew Moon <liquidsun@gmail.com>
3
* in https://github.com/floodyberry/poly1305-donna
4
*
5
* (C) 2014 Andrew Moon
6
* (C) 2014,2025,2026 Jack Lloyd
7
*
8
* Botan is released under the Simplified BSD License (see license.txt)
9
*/
10

11
#include <botan/internal/poly1305.h>
12

13
#include <botan/internal/ct_utils.h>
14
#include <botan/internal/donna128.h>
15
#include <botan/internal/loadstor.h>
16
#include <botan/internal/stl_util.h>
17

18
#if defined(BOTAN_HAS_POLY1305_AVX2) || defined(BOTAN_HAS_POLY1305_AVX512)
19
   #include <botan/internal/cpuid.h>
20
#endif
21

22
namespace Botan {
23

24
namespace {
25

26
// State layout: pad || accum || r || r^2 || r^3 || ... || r^n
27
// This ordering allows extending with more powers of r at the end
28
constexpr size_t PAD_BASE = 0;  // pad[0..1]
29
constexpr size_t H_BASE = 2;    // h[0..2] (accumulator)
30
constexpr size_t R_BASE = 5;    // r^1[0..2], r^2[3..5], r^3[6..8], etc.
31

32
// Multiply two values in radix 2^44 representation mod (2^130 - 5)
33
// h = a * b mod p
34
BOTAN_FORCE_INLINE void poly1305_mul_44(uint64_t& h0,
135,420✔
35
                                        uint64_t& h1,
36
                                        uint64_t& h2,
37
                                        uint64_t a0,
38
                                        uint64_t a1,
39
                                        uint64_t a2,
40
                                        uint64_t b0,
41
                                        uint64_t b1,
42
                                        uint64_t b2) {
43
   constexpr uint64_t M44 = 0xFFFFFFFFFFF;
135,420✔
44
   constexpr uint64_t M42 = 0x3FFFFFFFFFF;
135,420✔
45

46
#if !defined(BOTAN_TARGET_HAS_NATIVE_UINT128)
47
   typedef donna128 uint128_t;
48
#endif
49

50
   const uint64_t s1 = b1 * 20;
135,420✔
51
   const uint64_t s2 = b2 * 20;
135,420✔
52

53
   const uint128_t d0 = uint128_t(a0) * b0 + uint128_t(a1) * s2 + uint128_t(a2) * s1;
135,420✔
54
   const uint64_t c0 = carry_shift(d0, 44);
135,420✔
55

56
   const uint128_t d1 = uint128_t(a0) * b1 + uint128_t(a1) * b0 + uint128_t(a2) * s2 + c0;
135,420✔
57
   const uint64_t c1 = carry_shift(d1, 44);
135,420✔
58

59
   const uint128_t d2 = uint128_t(a0) * b2 + uint128_t(a1) * b1 + uint128_t(a2) * b0 + c1;
135,420✔
60
   const uint64_t c2 = carry_shift(d2, 42);
135,420✔
61

62
   h0 = (d0 & M44) + c2 * 5;
135,420✔
63
   h1 = (d1 & M44) + (h0 >> 44);
135,420✔
64
   h0 &= M44;
135,420✔
65
   h2 = d2 & M42;
135,420✔
66
}
67

68
// Extend powers of r from current max to target
69
void poly1305_extend_powers(secure_vector<uint64_t>& X, size_t target_powers) {
132,814✔
70
   const size_t current_powers = (X.size() - 5) / 3;
132,814✔
71

72
   if(current_powers >= target_powers) {
132,814✔
73
      return;
74
   }
75

76
   // Load r^1 for multiplication
77
   const uint64_t r0 = X[R_BASE + 0];
132,778✔
78
   const uint64_t r1 = X[R_BASE + 1];
132,778✔
79
   const uint64_t r2 = X[R_BASE + 2];
132,778✔
80

81
   X.resize(5 + target_powers * 3);
132,778✔
82

83
   // Compute r^(current+1) through r^target
84
   for(size_t i = current_powers + 1; i <= target_powers; ++i) {
268,198✔
85
      const size_t offset = R_BASE + (i - 1) * 3;
135,420✔
86
      poly1305_mul_44(
135,420✔
87
         X[offset + 0], X[offset + 1], X[offset + 2], X[offset - 3], X[offset - 2], X[offset - 1], r0, r1, r2);
135,420✔
88
   }
89
}
90

91
// Initialize Poly1305 state and precompute powers of r
92
void poly1305_init(secure_vector<uint64_t>& X, const uint8_t key[32]) {
130,136✔
93
   X.clear();
130,136✔
94
   X.reserve(2 + 3 + 2 * 3);
130,136✔
95
   X.resize(2 + 3 + 3);
130,136✔
96

97
   /* Save pad for later (first 2 slots) */
98
   X[PAD_BASE + 0] = load_le<uint64_t>(key, 2);
130,136✔
99
   X[PAD_BASE + 1] = load_le<uint64_t>(key, 3);
130,136✔
100

101
   /* h = 0 (accumulator, next 3 slots) */
102
   X[H_BASE + 0] = 0;
130,136✔
103
   X[H_BASE + 1] = 0;
130,136✔
104
   X[H_BASE + 2] = 0;
130,136✔
105

106
   /* r &= 0xffffffc0ffffffc0ffffffc0fffffff (clamping) */
107
   const uint64_t t0 = load_le<uint64_t>(key, 0);
130,136✔
108
   const uint64_t t1 = load_le<uint64_t>(key, 1);
130,136✔
109

110
   const uint64_t r0 = (t0) & 0xffc0fffffff;
130,136✔
111
   const uint64_t r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
130,136✔
112
   const uint64_t r2 = ((t1 >> 24)) & 0x00ffffffc0f;
130,136✔
113

114
   // Store r^1
115
   X[R_BASE + 0] = r0;
130,136✔
116
   X[R_BASE + 1] = r1;
130,136✔
117
   X[R_BASE + 2] = r2;
130,136✔
118

119
   poly1305_extend_powers(X, 2);
130,136✔
120
}
130,136✔
121

122
// Process a single block: h = (h + m) * r mod p
123
BOTAN_FORCE_INLINE void poly1305_block_single(uint64_t& h0,
380,966✔
124
                                              uint64_t& h1,
125
                                              uint64_t& h2,
126
                                              uint64_t r0,
127
                                              uint64_t r1,
128
                                              uint64_t r2,
129
                                              uint64_t s1,
130
                                              uint64_t s2,
131
                                              const uint8_t* m,
132
                                              uint64_t hibit) {
133
   constexpr uint64_t M44 = 0xFFFFFFFFFFF;
380,966✔
134
   constexpr uint64_t M42 = 0x3FFFFFFFFFF;
380,966✔
135

136
#if !defined(BOTAN_TARGET_HAS_NATIVE_UINT128)
137
   typedef donna128 uint128_t;
138
#endif
139

140
   const uint64_t t0 = load_le<uint64_t>(m, 0);
761,932✔
141
   const uint64_t t1 = load_le<uint64_t>(m, 1);
380,966✔
142

143
   h0 += (t0 & M44);
380,966✔
144
   h1 += ((t0 >> 44) | (t1 << 20)) & M44;
380,966✔
145
   h2 += ((t1 >> 24) & M42) | hibit;
380,966✔
146

147
   const uint128_t d0 = uint128_t(h0) * r0 + uint128_t(h1) * s2 + uint128_t(h2) * s1;
380,966✔
148
   const uint64_t c0 = carry_shift(d0, 44);
380,966✔
149

150
   const uint128_t d1 = uint128_t(h0) * r1 + uint128_t(h1) * r0 + uint128_t(h2) * s2 + c0;
380,966✔
151
   const uint64_t c1 = carry_shift(d1, 44);
380,966✔
152

153
   const uint128_t d2 = uint128_t(h0) * r2 + uint128_t(h1) * r1 + uint128_t(h2) * r0 + c1;
380,966✔
154
   const uint64_t c2 = carry_shift(d2, 42);
380,966✔
155

156
   h0 = (d0 & M44) + c2 * 5;
380,966✔
157
   h1 = (d1 & M44) + (h0 >> 44);
380,966✔
158
   h0 &= M44;
380,966✔
159
   h2 = d2 & M42;
380,966✔
160
}
161

162
// Process two blocks in parallel: h = ((h + m0) * r + m1) * r = (h + m0) * r^2 + m1 * r
163
// The multiplications by r^2 and r are independent, enabling ILP
164
BOTAN_FORCE_INLINE void poly1305_block_pair(uint64_t& h0,
74,528✔
165
                                            uint64_t& h1,
166
                                            uint64_t& h2,
167
                                            uint64_t r0,
168
                                            uint64_t r1,
169
                                            uint64_t r2,
170
                                            uint64_t s1,
171
                                            uint64_t s2,
172
                                            uint64_t rr0,
173
                                            uint64_t rr1,
174
                                            uint64_t rr2,
175
                                            uint64_t ss1,
176
                                            uint64_t ss2,
177
                                            const uint8_t* m,
178
                                            uint64_t hibit) {
179
   constexpr uint64_t M44 = 0xFFFFFFFFFFF;
74,528✔
180
   constexpr uint64_t M42 = 0x3FFFFFFFFFF;
74,528✔
181

182
#if !defined(BOTAN_TARGET_HAS_NATIVE_UINT128)
183
   typedef donna128 uint128_t;
184
#endif
185

186
   // Load first block (will be multiplied by r^2)
187
   const uint64_t m0_t0 = load_le<uint64_t>(m, 0);
149,056✔
188
   const uint64_t m0_t1 = load_le<uint64_t>(m, 1);
74,528✔
189

190
   // Load second block (will be multiplied by r)
191
   const uint64_t m1_t0 = load_le<uint64_t>(m + 16, 0);
74,528✔
192
   const uint64_t m1_t1 = load_le<uint64_t>(m + 16, 1);
74,528✔
193

194
   // Add first block to h
195
   h0 += (m0_t0 & M44);
74,528✔
196
   h1 += ((m0_t0 >> 44) | (m0_t1 << 20)) & M44;
74,528✔
197
   h2 += ((m0_t1 >> 24) & M42) | hibit;
74,528✔
198

199
   // Convert second block to limbs
200
   const uint64_t b0 = (m1_t0 & M44);
74,528✔
201
   const uint64_t b1 = ((m1_t0 >> 44) | (m1_t1 << 20)) & M44;
74,528✔
202
   const uint64_t b2 = ((m1_t1 >> 24) & M42) | hibit;
74,528✔
203

204
   // Compute (h + m0) * r^2 + m1 * r
205
   const uint128_t d0 = uint128_t(h0) * rr0 + uint128_t(h1) * ss2 + uint128_t(h2) * ss1 + uint128_t(b0) * r0 +
74,528✔
206
                        uint128_t(b1) * s2 + uint128_t(b2) * s1;
74,528✔
207
   const uint64_t c0 = carry_shift(d0, 44);
74,528✔
208

209
   const uint128_t d1 = uint128_t(h0) * rr1 + uint128_t(h1) * rr0 + uint128_t(h2) * ss2 + uint128_t(b0) * r1 +
74,528✔
210
                        uint128_t(b1) * r0 + uint128_t(b2) * s2 + c0;
74,528✔
211
   const uint64_t c1 = carry_shift(d1, 44);
74,528✔
212

213
   const uint128_t d2 = uint128_t(h0) * rr2 + uint128_t(h1) * rr1 + uint128_t(h2) * rr0 + uint128_t(b0) * r2 +
74,528✔
214
                        uint128_t(b1) * r1 + uint128_t(b2) * r0 + c1;
74,528✔
215
   const uint64_t c2 = carry_shift(d2, 42);
74,528✔
216

217
   h0 = (d0 & M44) + c2 * 5;
74,528✔
218
   h1 = (d1 & M44) + (h0 >> 44);
74,528✔
219
   h0 &= M44;
74,528✔
220
   h2 = d2 & M42;
74,528✔
221
}
222

223
void poly1305_blocks(secure_vector<uint64_t>& X, const uint8_t* m, size_t blocks, bool is_final = false) {
403,522✔
224
   const uint64_t hibit = is_final ? 0 : (static_cast<uint64_t>(1) << 40);
403,522✔
225

226
   // Load r (at R_BASE + 0)
227
   const uint64_t r0 = X[R_BASE + 0];
403,522✔
228
   const uint64_t r1 = X[R_BASE + 1];
403,522✔
229
   const uint64_t r2 = X[R_BASE + 2];
403,522✔
230
   const uint64_t s1 = r1 * 20;
403,522✔
231
   const uint64_t s2 = r2 * 20;
403,522✔
232

233
   // Load r^2 (at R_BASE + 3)
234
   const uint64_t rr0 = X[R_BASE + 3];
403,522✔
235
   const uint64_t rr1 = X[R_BASE + 4];
403,522✔
236
   const uint64_t rr2 = X[R_BASE + 5];
403,522✔
237

238
   // Precompute
239
   const uint64_t ss1 = rr1 * 20;
403,522✔
240
   const uint64_t ss2 = rr2 * 20;
403,522✔
241

242
   // Load accumulator
243
   uint64_t h0 = X[H_BASE + 0];
403,522✔
244
   uint64_t h1 = X[H_BASE + 1];
403,522✔
245
   uint64_t h2 = X[H_BASE + 2];
403,522✔
246

247
   while(blocks >= 2) {
478,050✔
248
      poly1305_block_pair(h0, h1, h2, r0, r1, r2, s1, s2, rr0, rr1, rr2, ss1, ss2, m, hibit);
74,528✔
249
      m += 32;
74,528✔
250
      blocks -= 2;
74,528✔
251
   }
252

253
   // Final block?
254
   if(blocks > 0) {
403,522✔
255
      poly1305_block_single(h0, h1, h2, r0, r1, r2, s1, s2, m, hibit);
761,932✔
256
   }
257

258
   // Store accumulator
259
   X[H_BASE + 0] = h0;
403,522✔
260
   X[H_BASE + 1] = h1;
403,522✔
261
   X[H_BASE + 2] = h2;
403,522✔
262
}
403,522✔
263

264
void poly1305_finish(secure_vector<uint64_t>& X, uint8_t mac[16]) {
116,108✔
265
   constexpr uint64_t M44 = 0xFFFFFFFFFFF;
116,108✔
266
   constexpr uint64_t M42 = 0x3FFFFFFFFFF;
116,108✔
267

268
   /* fully carry h */
269
   uint64_t h0 = X[H_BASE + 0];
116,108✔
270
   uint64_t h1 = X[H_BASE + 1];
116,108✔
271
   uint64_t h2 = X[H_BASE + 2];
116,108✔
272

273
   uint64_t c = (h1 >> 44);
116,108✔
274
   h1 &= M44;
116,108✔
275
   h2 += c;
116,108✔
276
   c = (h2 >> 42);
116,108✔
277
   h2 &= M42;
116,108✔
278
   h0 += c * 5;
116,108✔
279
   c = (h0 >> 44);
116,108✔
280
   h0 &= M44;
116,108✔
281
   h1 += c;
116,108✔
282
   c = (h1 >> 44);
116,108✔
283
   h1 &= M44;
116,108✔
284
   h2 += c;
116,108✔
285
   c = (h2 >> 42);
116,108✔
286
   h2 &= M42;
116,108✔
287
   h0 += c * 5;
116,108✔
288
   c = (h0 >> 44);
116,108✔
289
   h0 &= M44;
116,108✔
290
   h1 += c;
116,108✔
291

292
   /* compute h + -p */
293
   uint64_t g0 = h0 + 5;
116,108✔
294
   c = (g0 >> 44);
116,108✔
295
   g0 &= M44;
116,108✔
296
   uint64_t g1 = h1 + c;
116,108✔
297
   c = (g1 >> 44);
116,108✔
298
   g1 &= M44;
116,108✔
299
   const uint64_t g2 = h2 + c - (static_cast<uint64_t>(1) << 42);
116,108✔
300

301
   /* select h if h < p, or h + -p if h >= p */
302
   const auto c_mask = CT::Mask<uint64_t>::expand(c);
116,108✔
303
   h0 = c_mask.select(g0, h0);
116,108✔
304
   h1 = c_mask.select(g1, h1);
116,108✔
305
   h2 = c_mask.select(g2, h2);
116,108✔
306

307
   /* h = (h + pad) */
308
   const uint64_t t0 = X[PAD_BASE + 0];
116,108✔
309
   const uint64_t t1 = X[PAD_BASE + 1];
116,108✔
310

311
   h0 += ((t0)&M44);
116,108✔
312
   c = (h0 >> 44);
116,108✔
313
   h0 &= M44;
116,108✔
314
   h1 += (((t0 >> 44) | (t1 << 20)) & M44) + c;
116,108✔
315
   c = (h1 >> 44);
116,108✔
316
   h1 &= M44;
116,108✔
317
   h2 += (((t1 >> 24)) & M42) + c;
116,108✔
318
   h2 &= M42;
116,108✔
319

320
   /* mac = h % (2^128) */
321
   h0 = ((h0) | (h1 << 44));
116,108✔
322
   h1 = ((h1 >> 20) | (h2 << 24));
116,108✔
323

324
   store_le(mac, h0, h1);
116,108✔
325

326
   /* zero out the state */
327
   clear_mem(X.data(), X.size());
116,108✔
328
}
116,108✔
329

330
}  // namespace
331

332
void Poly1305::clear() {
7,186✔
333
   zap(m_poly);
7,186✔
334
   m_buffer.clear();
7,186✔
335
}
7,186✔
336

337
bool Poly1305::has_keying_material() const {
1,237,379✔
338
   // Minimum size: pad(2) + accum(3) + r(3) + r^2(3) = 11
339
   return m_poly.size() >= 11;
1,237,379✔
340
}
341

342
void Poly1305::key_schedule(std::span<const uint8_t> key) {
130,136✔
343
   m_buffer.clear();
130,136✔
344

345
   poly1305_init(m_poly, key.data());
130,136✔
346
}
130,136✔
347

348
void Poly1305::add_data(std::span<const uint8_t> input) {
1,120,583✔
349
   assert_key_material_set();
1,120,583✔
350

351
   BufferSlicer in(input);
1,096,299✔
352

353
   while(!in.empty()) {
3,310,379✔
354
      if(const auto one_block = m_buffer.handle_unaligned_data(in)) {
1,117,781✔
355
         poly1305_blocks(m_poly, one_block->data(), 1);
357,557✔
356
      }
357

358
      if(m_buffer.in_alignment()) {
1,117,781✔
359
         const auto [aligned_data, full_blocks] = m_buffer.aligned_data_to_process(in);
404,770✔
360
         if(full_blocks > 0) {
404,770✔
361
            const uint8_t* data_ptr = aligned_data.data();
47,397✔
362
            size_t blocks_remaining = full_blocks;
47,397✔
363

364
#if defined(BOTAN_HAS_POLY1305_AVX512)
365
            if(blocks_remaining >= 8 * 3 && CPUID::has(CPUID::Feature::AVX512)) {
47,397✔
366
               // Lazily compute r^3 through r^8 on first AVX512 use
367
               poly1305_extend_powers(m_poly, 8);
×
368
               const size_t processed = poly1305_avx512_blocks(m_poly, data_ptr, blocks_remaining);
×
369
               data_ptr += processed * 16;
×
370
               blocks_remaining -= processed;
×
371
            }
372
#endif
373

374
#if defined(BOTAN_HAS_POLY1305_AVX2)
375
            if(blocks_remaining >= 4 * 6 && CPUID::has(CPUID::Feature::AVX2)) {
47,397✔
376
               // Lazily compute r^3 and r^4 on first AVX2 use
377
               poly1305_extend_powers(m_poly, 4);
2,678✔
378
               const size_t processed = poly1305_avx2_blocks(m_poly, data_ptr, blocks_remaining);
2,678✔
379
               data_ptr += processed * 16;
2,678✔
380
               blocks_remaining -= processed;
2,678✔
381
            }
382
#endif
383

384
            if(blocks_remaining > 0) {
47,397✔
385
               poly1305_blocks(m_poly, data_ptr, blocks_remaining);
45,145✔
386
            }
387
         }
388
      }
389
   }
390
}
1,096,299✔
391

392
void Poly1305::final_result(std::span<uint8_t> out) {
116,280✔
393
   assert_key_material_set();
116,280✔
394

395
   if(!m_buffer.in_alignment()) {
116,108✔
396
      const uint8_t final_byte = 0x01;
820✔
397
      m_buffer.append({&final_byte, 1});
820✔
398
      m_buffer.fill_up_with_zeros();
820✔
399
      poly1305_blocks(m_poly, m_buffer.consume().data(), 1, true);
820✔
400
   }
401

402
   poly1305_finish(m_poly, out.data());
116,108✔
403

404
   m_poly.clear();
116,108✔
405
   m_buffer.clear();
116,108✔
406
}
116,108✔
407

408
}  // namespace Botan
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc