• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

randombit / botan / 17116554656

21 Aug 2025 03:48AM UTC coverage: 90.665% (+0.01%) from 90.652%
17116554656

push

github

web-flow
Merge pull request #5061 from reneme/feature/ascon_hash256

100199 of 110516 relevant lines covered (90.66%)

12463064.57 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/lib/block/shacal2/shacal2_avx512/shacal2_avx512.cpp
1
/*
2
* (C) 2025 Jack Lloyd
3
*
4
* Botan is released under the Simplified BSD License (see license.txt)
5
*/
6

7
#include <botan/internal/shacal2.h>
8

9
#include <botan/internal/simd_avx2.h>
10
#include <botan/internal/simd_avx512.h>
11

12
namespace Botan {
13

14
namespace SHACAL2_AVX512_F {
15

16
namespace {
17

18
/*
19
* 8x16 Transpose
20
*
21
* Convert from
22
*
23
* A00 B00 C00 ... H00
24
* A01 B01 C01 ... H01
25
* ..
26
* A15 B15 C15 ... H15
27
*
28
* with two blocks stored in each register, into
29
*
30
* A00 A01 ... A15
31
* B00 B01 ... B15
32
* ...
33
* H00 H01 ... H15
34
*/
35
BOTAN_FN_ISA_AVX512
36
void transpose_in(SIMD_16x32& B0,
×
37
                  SIMD_16x32& B1,
38
                  SIMD_16x32& B2,
39
                  SIMD_16x32& B3,
40
                  SIMD_16x32& B4,
41
                  SIMD_16x32& B5,
42
                  SIMD_16x32& B6,
43
                  SIMD_16x32& B7) {
44
   auto t0 = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
×
45
   auto t1 = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
×
46
   auto t2 = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
×
47
   auto t3 = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
×
48
   auto t4 = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
×
49
   auto t5 = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
×
50
   auto t6 = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
×
51
   auto t7 = _mm512_unpackhi_epi32(B6.raw(), B7.raw());
×
52

53
   auto r0 = _mm512_unpacklo_epi64(t0, t2);
×
54
   auto r1 = _mm512_unpackhi_epi64(t0, t2);
×
55
   auto r2 = _mm512_unpacklo_epi64(t1, t3);
×
56
   auto r3 = _mm512_unpackhi_epi64(t1, t3);
×
57
   auto r4 = _mm512_unpacklo_epi64(t4, t6);
×
58
   auto r5 = _mm512_unpackhi_epi64(t4, t6);
×
59
   auto r6 = _mm512_unpacklo_epi64(t5, t7);
×
60
   auto r7 = _mm512_unpackhi_epi64(t5, t7);
×
61

62
   const __m512i tbl0 = _mm512_set_epi32(27, 19, 26, 18, 25, 17, 24, 16, 11, 3, 10, 2, 9, 1, 8, 0);
×
63
   const __m512i tbl1 = _mm512_add_epi32(tbl0, _mm512_set1_epi32(4));
×
64
   B0 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl0, r4));
×
65
   B1 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl0, r5));
×
66
   B2 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl0, r6));
×
67
   B3 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl0, r7));
×
68
   B4 = SIMD_16x32(_mm512_permutex2var_epi32(r0, tbl1, r4));
×
69
   B5 = SIMD_16x32(_mm512_permutex2var_epi32(r1, tbl1, r5));
×
70
   B6 = SIMD_16x32(_mm512_permutex2var_epi32(r2, tbl1, r6));
×
71
   B7 = SIMD_16x32(_mm512_permutex2var_epi32(r3, tbl1, r7));
×
72
}
×
73

74
/*
* Transpose applied to the eight state registers after the rounds and
* before they are stored; the callers in this file pair it with
* transpose_in.
*
* All eight arguments are read first and then overwritten in place.
*/
BOTAN_FN_ISA_AVX512
void transpose_out(SIMD_16x32& B0,
                   SIMD_16x32& B1,
                   SIMD_16x32& B2,
                   SIMD_16x32& B3,
                   SIMD_16x32& B4,
                   SIMD_16x32& B5,
                   SIMD_16x32& B6,
                   SIMD_16x32& B7) {
   // Stage 1: interleave 32-bit words of each adjacent register pair
   const auto w01_lo = _mm512_unpacklo_epi32(B0.raw(), B1.raw());
   const auto w01_hi = _mm512_unpackhi_epi32(B0.raw(), B1.raw());
   const auto w23_lo = _mm512_unpacklo_epi32(B2.raw(), B3.raw());
   const auto w23_hi = _mm512_unpackhi_epi32(B2.raw(), B3.raw());
   const auto w45_lo = _mm512_unpacklo_epi32(B4.raw(), B5.raw());
   const auto w45_hi = _mm512_unpackhi_epi32(B4.raw(), B5.raw());
   const auto w67_lo = _mm512_unpacklo_epi32(B6.raw(), B7.raw());
   const auto w67_hi = _mm512_unpackhi_epi32(B6.raw(), B7.raw());

   // Stage 2: interleave 64-bit pairs, combining registers 0-3 and 4-7
   const auto q0 = _mm512_unpacklo_epi64(w01_lo, w23_lo);
   const auto q1 = _mm512_unpackhi_epi64(w01_lo, w23_lo);
   const auto q2 = _mm512_unpacklo_epi64(w01_hi, w23_hi);
   const auto q3 = _mm512_unpackhi_epi64(w01_hi, w23_hi);
   const auto q4 = _mm512_unpacklo_epi64(w45_lo, w67_lo);
   const auto q5 = _mm512_unpackhi_epi64(w45_lo, w67_lo);
   const auto q6 = _mm512_unpacklo_epi64(w45_hi, w67_hi);
   const auto q7 = _mm512_unpackhi_epi64(w45_hi, w67_hi);

   // Stage 3: two-source word permutes. Indices 0-15 pick from the first
   // operand and 16-31 from the second; the second table is the first one
   // with every index raised by 8.
   const __m512i idx_a = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);
   const __m512i idx_b = _mm512_add_epi32(idx_a, _mm512_set1_epi32(8));

   const auto p0 = _mm512_permutex2var_epi32(q0, idx_a, q4);
   const auto p1 = _mm512_permutex2var_epi32(q1, idx_a, q5);
   const auto p2 = _mm512_permutex2var_epi32(q2, idx_a, q6);
   const auto p3 = _mm512_permutex2var_epi32(q3, idx_a, q7);
   const auto p4 = _mm512_permutex2var_epi32(q0, idx_b, q4);
   const auto p5 = _mm512_permutex2var_epi32(q1, idx_b, q5);
   const auto p6 = _mm512_permutex2var_epi32(q2, idx_b, q6);
   const auto p7 = _mm512_permutex2var_epi32(q3, idx_b, q7);

   // Stage 4: recombine 128-bit lanes of two registers.
   //   0x44 (0b01000100): lanes 0,1 of the first operand, then lanes 0,1 of the second
   //   0xEE (0b11101110): lanes 2,3 of the first operand, then lanes 2,3 of the second
   B0 = SIMD_16x32(_mm512_shuffle_i32x4(p0, p1, 0x44));
   B1 = SIMD_16x32(_mm512_shuffle_i32x4(p2, p3, 0x44));
   B2 = SIMD_16x32(_mm512_shuffle_i32x4(p0, p1, 0xEE));
   B3 = SIMD_16x32(_mm512_shuffle_i32x4(p2, p3, 0xEE));
   B4 = SIMD_16x32(_mm512_shuffle_i32x4(p4, p5, 0x44));
   B5 = SIMD_16x32(_mm512_shuffle_i32x4(p6, p7, 0x44));
   B6 = SIMD_16x32(_mm512_shuffle_i32x4(p4, p5, 0xEE));
   B7 = SIMD_16x32(_mm512_shuffle_i32x4(p6, p7, 0xEE));
}
×
122

123
template <typename SimdT>
124
void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Fwd(const SimdT& A,
×
125
                                                        const SimdT& B,
126
                                                        const SimdT& C,
127
                                                        SimdT& D,
128
                                                        const SimdT& E,
129
                                                        const SimdT& F,
130
                                                        const SimdT& G,
131
                                                        SimdT& H,
132
                                                        uint32_t RK) {
133
   H += E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
×
134
   D += H;
×
135
   H += A.sigma0() + SimdT::majority(A, B, C);
×
136
}
137

138
template <typename SimdT>
139
void BOTAN_FORCE_INLINE BOTAN_FN_ISA_AVX512 SHACAL2_Rev(const SimdT& A,
×
140
                                                        const SimdT& B,
141
                                                        const SimdT& C,
142
                                                        SimdT& D,
143
                                                        const SimdT& E,
144
                                                        const SimdT& F,
145
                                                        const SimdT& G,
146
                                                        SimdT& H,
147
                                                        uint32_t RK) {
148
   H -= A.sigma0() + SimdT::majority(A, B, C);
×
149
   D -= H;
×
150
   H -= E.sigma1() + SimdT::choose(E, F, G) + SimdT::splat(RK);
×
151
}
152

153
}  // namespace
154

155
}  // namespace SHACAL2_AVX512_F
156

157
/*
* Encrypt blocks in groups of sixteen (SIMD_16x32) and then, if at least
* eight remain, one group of eight (SIMD_8x32).
*
* Fewer than eight trailing blocks are left untouched. The return value
* is the number of blocks actually processed.
*/
size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {
   using namespace SHACAL2_AVX512_F;

   // Byte offsets between consecutive register loads/stores
   constexpr size_t WIDE_STRIDE = 64;    // SIMD_16x32 path
   constexpr size_t NARROW_STRIDE = 32;  // SIMD_8x32 path

   size_t done = 0;

   while(blocks >= 16) {
      SIMD_16x32 A = SIMD_16x32::load_be(in + 0 * WIDE_STRIDE);
      SIMD_16x32 B = SIMD_16x32::load_be(in + 1 * WIDE_STRIDE);
      SIMD_16x32 C = SIMD_16x32::load_be(in + 2 * WIDE_STRIDE);
      SIMD_16x32 D = SIMD_16x32::load_be(in + 3 * WIDE_STRIDE);
      SIMD_16x32 E = SIMD_16x32::load_be(in + 4 * WIDE_STRIDE);
      SIMD_16x32 F = SIMD_16x32::load_be(in + 5 * WIDE_STRIDE);
      SIMD_16x32 G = SIMD_16x32::load_be(in + 6 * WIDE_STRIDE);
      SIMD_16x32 H = SIMD_16x32::load_be(in + 7 * WIDE_STRIDE);

      transpose_in(A, B, C, D, E, F, G, H);

      // 64 rounds, eight per pass; the argument rotation returns to its
      // starting position after every eighth round
      for(size_t r = 0; r < 64; r += 8) {
         SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
         SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
         SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
         SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
         SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
         SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
         SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
         SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
      }

      transpose_out(A, B, C, D, E, F, G, H);

      A.store_be(out + 0 * WIDE_STRIDE);
      B.store_be(out + 1 * WIDE_STRIDE);
      C.store_be(out + 2 * WIDE_STRIDE);
      D.store_be(out + 3 * WIDE_STRIDE);
      E.store_be(out + 4 * WIDE_STRIDE);
      F.store_be(out + 5 * WIDE_STRIDE);
      G.store_be(out + 6 * WIDE_STRIDE);
      H.store_be(out + 7 * WIDE_STRIDE);

      in += 16 * BLOCK_SIZE;
      out += 16 * BLOCK_SIZE;
      blocks -= 16;
      done += 16;
   }

   // Fewer than 16 blocks remain here, so this path runs at most once
   if(blocks >= 8) {
      SIMD_8x32 A = SIMD_8x32::load_be(in + 0 * NARROW_STRIDE);
      SIMD_8x32 B = SIMD_8x32::load_be(in + 1 * NARROW_STRIDE);
      SIMD_8x32 C = SIMD_8x32::load_be(in + 2 * NARROW_STRIDE);
      SIMD_8x32 D = SIMD_8x32::load_be(in + 3 * NARROW_STRIDE);
      SIMD_8x32 E = SIMD_8x32::load_be(in + 4 * NARROW_STRIDE);
      SIMD_8x32 F = SIMD_8x32::load_be(in + 5 * NARROW_STRIDE);
      SIMD_8x32 G = SIMD_8x32::load_be(in + 6 * NARROW_STRIDE);
      SIMD_8x32 H = SIMD_8x32::load_be(in + 7 * NARROW_STRIDE);

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      for(size_t r = 0; r < 64; r += 8) {
         SHACAL2_Fwd(A, B, C, D, E, F, G, H, m_RK[r + 0]);
         SHACAL2_Fwd(H, A, B, C, D, E, F, G, m_RK[r + 1]);
         SHACAL2_Fwd(G, H, A, B, C, D, E, F, m_RK[r + 2]);
         SHACAL2_Fwd(F, G, H, A, B, C, D, E, m_RK[r + 3]);
         SHACAL2_Fwd(E, F, G, H, A, B, C, D, m_RK[r + 4]);
         SHACAL2_Fwd(D, E, F, G, H, A, B, C, m_RK[r + 5]);
         SHACAL2_Fwd(C, D, E, F, G, H, A, B, m_RK[r + 6]);
         SHACAL2_Fwd(B, C, D, E, F, G, H, A, m_RK[r + 7]);
      }

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      A.store_be(out + 0 * NARROW_STRIDE);
      B.store_be(out + 1 * NARROW_STRIDE);
      C.store_be(out + 2 * NARROW_STRIDE);
      D.store_be(out + 3 * NARROW_STRIDE);
      E.store_be(out + 4 * NARROW_STRIDE);
      F.store_be(out + 5 * NARROW_STRIDE);
      G.store_be(out + 6 * NARROW_STRIDE);
      H.store_be(out + 7 * NARROW_STRIDE);

      in += 8 * BLOCK_SIZE;
      out += 8 * BLOCK_SIZE;
      blocks -= 8;
      done += 8;
   }

   return done;
}
244

245
/*
* Decrypt blocks in groups of sixteen (SIMD_16x32) and then, if at least
* eight remain, one group of eight (SIMD_8x32).
*
* Round keys are consumed from m_RK[63] down to m_RK[0]. Fewer than eight
* trailing blocks are left untouched. The return value is the number of
* blocks actually processed.
*/
size_t BOTAN_FN_ISA_AVX512 SHACAL2::avx512_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks) const {
   using namespace SHACAL2_AVX512_F;

   // Byte offsets between consecutive register loads/stores
   constexpr size_t WIDE_STRIDE = 64;    // SIMD_16x32 path
   constexpr size_t NARROW_STRIDE = 32;  // SIMD_8x32 path

   size_t done = 0;

   while(blocks >= 16) {
      SIMD_16x32 A = SIMD_16x32::load_be(in + 0 * WIDE_STRIDE);
      SIMD_16x32 B = SIMD_16x32::load_be(in + 1 * WIDE_STRIDE);
      SIMD_16x32 C = SIMD_16x32::load_be(in + 2 * WIDE_STRIDE);
      SIMD_16x32 D = SIMD_16x32::load_be(in + 3 * WIDE_STRIDE);
      SIMD_16x32 E = SIMD_16x32::load_be(in + 4 * WIDE_STRIDE);
      SIMD_16x32 F = SIMD_16x32::load_be(in + 5 * WIDE_STRIDE);
      SIMD_16x32 G = SIMD_16x32::load_be(in + 6 * WIDE_STRIDE);
      SIMD_16x32 H = SIMD_16x32::load_be(in + 7 * WIDE_STRIDE);

      transpose_in(A, B, C, D, E, F, G, H);

      // 64 inverse rounds, eight per pass; `top` is one past the highest
      // round-key index used in the current pass
      for(size_t top = 64; top != 0; top -= 8) {
         SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[top - 1]);
         SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[top - 2]);
         SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[top - 3]);
         SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[top - 4]);
         SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[top - 5]);
         SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[top - 6]);
         SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[top - 7]);
         SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[top - 8]);
      }

      transpose_out(A, B, C, D, E, F, G, H);

      A.store_be(out + 0 * WIDE_STRIDE);
      B.store_be(out + 1 * WIDE_STRIDE);
      C.store_be(out + 2 * WIDE_STRIDE);
      D.store_be(out + 3 * WIDE_STRIDE);
      E.store_be(out + 4 * WIDE_STRIDE);
      F.store_be(out + 5 * WIDE_STRIDE);
      G.store_be(out + 6 * WIDE_STRIDE);
      H.store_be(out + 7 * WIDE_STRIDE);

      in += 16 * BLOCK_SIZE;
      out += 16 * BLOCK_SIZE;
      blocks -= 16;
      done += 16;
   }

   // Fewer than 16 blocks remain here, so this path runs at most once
   if(blocks >= 8) {
      SIMD_8x32 A = SIMD_8x32::load_be(in + 0 * NARROW_STRIDE);
      SIMD_8x32 B = SIMD_8x32::load_be(in + 1 * NARROW_STRIDE);
      SIMD_8x32 C = SIMD_8x32::load_be(in + 2 * NARROW_STRIDE);
      SIMD_8x32 D = SIMD_8x32::load_be(in + 3 * NARROW_STRIDE);
      SIMD_8x32 E = SIMD_8x32::load_be(in + 4 * NARROW_STRIDE);
      SIMD_8x32 F = SIMD_8x32::load_be(in + 5 * NARROW_STRIDE);
      SIMD_8x32 G = SIMD_8x32::load_be(in + 6 * NARROW_STRIDE);
      SIMD_8x32 H = SIMD_8x32::load_be(in + 7 * NARROW_STRIDE);

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      for(size_t top = 64; top != 0; top -= 8) {
         SHACAL2_Rev(B, C, D, E, F, G, H, A, m_RK[top - 1]);
         SHACAL2_Rev(C, D, E, F, G, H, A, B, m_RK[top - 2]);
         SHACAL2_Rev(D, E, F, G, H, A, B, C, m_RK[top - 3]);
         SHACAL2_Rev(E, F, G, H, A, B, C, D, m_RK[top - 4]);
         SHACAL2_Rev(F, G, H, A, B, C, D, E, m_RK[top - 5]);
         SHACAL2_Rev(G, H, A, B, C, D, E, F, m_RK[top - 6]);
         SHACAL2_Rev(H, A, B, C, D, E, F, G, m_RK[top - 7]);
         SHACAL2_Rev(A, B, C, D, E, F, G, H, m_RK[top - 8]);
      }

      SIMD_8x32::transpose(A, B, C, D, E, F, G, H);

      A.store_be(out + 0 * NARROW_STRIDE);
      B.store_be(out + 1 * NARROW_STRIDE);
      C.store_be(out + 2 * NARROW_STRIDE);
      D.store_be(out + 3 * NARROW_STRIDE);
      E.store_be(out + 4 * NARROW_STRIDE);
      F.store_be(out + 5 * NARROW_STRIDE);
      G.store_be(out + 6 * NARROW_STRIDE);
      H.store_be(out + 7 * NARROW_STRIDE);

      in += 8 * BLOCK_SIZE;
      out += 8 * BLOCK_SIZE;
      blocks -= 8;
      done += 8;
   }

   return done;
}
332

333
}  // namespace Botan
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc