• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

ska-sa / spead2 / 24443950125

15 Apr 2026 08:11AM UTC coverage: 78.747% (-0.01%) from 78.761%
24443950125

Pull #430

github

bmerry
Update pytest to 9.0.3

Addresses a dependabot alert. It's probably not that relevant to the
cases where we use the requirements files (CI environments where we
don't need to worry about other untrusted users in the same container).
Pull Request #430: Update pytest to 9.0.3

5569 of 7072 relevant lines covered (78.75%)

120104.26 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

50.0
/src/common_memcpy.cpp
1
/* Copyright 2016, 2020, 2023-2024 National Research Foundation (SARAO)
2
 *
3
 * This program is free software: you can redistribute it and/or modify it under
4
 * the terms of the GNU Lesser General Public License as published by the Free
5
 * Software Foundation, either version 3 of the License, or (at your option) any
6
 * later version.
7
 *
8
 * This program is distributed in the hope that it will be useful, but WITHOUT
9
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10
 * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
11
 * details.
12
 *
13
 * You should have received a copy of the GNU Lesser General Public License
14
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15
 */
16

17
#include <cstddef>
18
#include <cstdint>
19
#include <cstring>
20
#include <utility>
21
#include <spead2/common_defines.h>
22
#include <spead2/common_features.h>
23
#include <spead2/common_memcpy.h>
24

25
#if SPEAD2_USE_SSE2_STREAM
/* Instantiate the SSE2 variant, memcpy_nontemporal_sse2.
 *
 * common_memcpy_x86.h is used as a template: it is parameterised by the
 * SPEAD2_MEMCPY_* macros defined immediately before it is included.
 * NOTE(review): the header is not visible from this file. It presumably
 * #undefs these macros after use, since they are defined again with
 * different values for the other x86 variants - confirm.
 */
# include <emmintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_sse2
# define SPEAD2_MEMCPY_TARGET "sse2"
# define SPEAD2_MEMCPY_TYPE __m128i               // 128-bit vector type
# define SPEAD2_MEMCPY_LOAD _mm_loadu_si128       // unaligned load
# define SPEAD2_MEMCPY_STORE _mm_stream_si128     // streaming (non-temporal) store
# define SPEAD2_MEMCPY_UNROLL 16
# define SPEAD2_MEMCPY_VZEROUPPER 0
# include "common_memcpy_x86.h"
#endif
36

37
#if SPEAD2_USE_AVX_STREAM
/* Instantiate the AVX variant, memcpy_nontemporal_avx.
 *
 * common_memcpy_x86.h is used as a template: it is parameterised by the
 * SPEAD2_MEMCPY_* macros defined immediately before it is included.
 * NOTE(review): the header is not visible from this file. It presumably
 * #undefs these macros after use, since they are defined again with
 * different values for the other x86 variants - confirm.
 */
# include <immintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx
# define SPEAD2_MEMCPY_TARGET "avx"
# define SPEAD2_MEMCPY_TYPE __m256i               // 256-bit vector type
# define SPEAD2_MEMCPY_LOAD _mm256_loadu_si256    // unaligned load
# define SPEAD2_MEMCPY_STORE _mm256_stream_si256  // streaming (non-temporal) store
# define SPEAD2_MEMCPY_UNROLL 8
# define SPEAD2_MEMCPY_VZEROUPPER 1
# include "common_memcpy_x86.h"
#endif
48

49
#if SPEAD2_USE_AVX512_STREAM
/* Instantiate the AVX-512 variant, memcpy_nontemporal_avx512.
 *
 * common_memcpy_x86.h is used as a template: it is parameterised by the
 * SPEAD2_MEMCPY_* macros defined immediately before it is included.
 * NOTE(review): the header is not visible from this file. It presumably
 * #undefs these macros after use, since they are defined again with
 * different values for the other x86 variants - confirm.
 */
# include <immintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx512
# define SPEAD2_MEMCPY_TARGET "avx512f"
# define SPEAD2_MEMCPY_TYPE __m512i               // 512-bit vector type
# define SPEAD2_MEMCPY_LOAD _mm512_loadu_si512    // unaligned load
# define SPEAD2_MEMCPY_STORE _mm512_stream_si512  // streaming (non-temporal) store
# define SPEAD2_MEMCPY_UNROLL 8
# define SPEAD2_MEMCPY_VZEROUPPER 1
# include "common_memcpy_x86.h"
#endif
60

61
#if SPEAD2_USE_SVE_STREAM
62
# include <atomic>
63
# include <sys/auxv.h>
64
# include <arm_sve.h>
65
#endif
66

67
namespace spead2
68
{
69

70
#if SPEAD2_USE_SVE_STREAM
/**
 * Copy @a n bytes from @a src to @a dest using SVE non-temporal loads and
 * stores.
 *
 * Copies of at least four vectors are done in three phases: a predicated
 * head that advances the source pointer to a vector boundary, a loop
 * moving two full vectors per iteration, and a predicated tail. Shorter
 * copies are handled entirely by the tail loop.
 *
 * @param dest  Destination buffer; declared restrict, so it must not overlap @a src.
 * @param src   Source buffer.
 * @param n     Number of bytes to copy.
 * @return @a dest
 */
[[gnu::target("+sve")]]
void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
    /* The AArch64 memory model says
     *
     * "If an address dependency exists between two Read Memory and an SVE
     * non-temporal vector load instruction generated the second read, then in
     * the absence of any other barrier mechanism to achieve order, the memory
     * accesses can be observed in any order by the other observers within the
     * shareability domain of the memory addresses being accessed."
     *
     * I think that in the C++ memory model, this should only affect
     * std::memory_order_consume (since "carries dependency" is the only time
     * reads are assumed to be ordered in the absence of explicit
     * synchronisation); memory_order_consume is not used anywhere in spead2,
     * the C++ standard discourages it, and it's believed that no compiler
     * actually implements it other than by upgrade to acquire.
     *
     * The user documentation for @ref memcpy_nontemporal indicates this
     * limitation, so we do not insert any barriers here. If it becomes
     * necessary in future, testing on a Grace GH200 (Neoverse V2) chip
     * suggests that it is more efficient to write the address to an atomic
     * and read it back with memory_order_acquire than it is to use
     * atomic_thread_fence.
     */

    std::uint8_t *destc = static_cast<std::uint8_t *>(dest);
    const std::uint8_t *srcc = static_cast<const std::uint8_t *>(src);
    const std::uint64_t vec_bytes = svcntb();  // bytes per SVE vector
    std::size_t i = 0;  // byte offset for next copy

    /* Alignment requires we have data up to the next multiple, and it's
     * not worth unrolling unless we have a reasonable amount of data.
     * For anything smaller, we'll just rely on the tail handling.
     */
    if (n >= 4 * vec_bytes)
    {
        /* Align the source pointer to a multiple of the vector size.
         * Experiments on Grace (Neoverse V2) show that source alignment
         * is more important than destination alignment to throughput.
         *
         * C++ doesn't guarantee the representation of a pointer when
         * cast to uintptr_t, but we're only depending on it for performance,
         * not correctness.
         */
        const std::size_t head = -std::uintptr_t(src) & (vec_bytes - 1);
        // head < vec_bytes, so one predicated load/store covers it
        const svbool_t head_pg = svwhilelt_b8(i, head);
        svstnt1_u8(head_pg, destc, svldnt1_u8(head_pg, srcc));
        i = head;

        // Main loop: two full vectors per iteration, both loads issued
        // before the stores.
        while (i + 2 * vec_bytes <= n)
        {
            svuint8_t data0 = svldnt1_u8(svptrue_b8(), &srcc[i]);
            svuint8_t data1 = svldnt1_u8(svptrue_b8(), &srcc[i + vec_bytes]);
            svstnt1_u8(svptrue_b8(), &destc[i], data0);
            svstnt1_u8(svptrue_b8(), &destc[i + vec_bytes], data1);
            i += 2 * vec_bytes;
        }
    }

    /* Tail: copy whatever remains one (possibly partial) vector at a time.
     * The body always runs once; if nothing remains the predicate has no
     * active lanes. The loop stops when the predicate for the next vector
     * has its first lane inactive, i.e. when i >= n.
     */
    svbool_t pg = svwhilelt_b8(i, n);
    do
    {
        svstnt1_u8(pg, &destc[i], svldnt1_u8(pg, &srcc[i]));
        i += vec_bytes;
        pg = svwhilelt_b8(i, n);
    } while (svptest_first(svptrue_b8(), pg));
    return dest;
}
#endif // SPEAD2_USE_SVE_STREAM
139

140
/**
 * Select the function that implements @ref memcpy_nontemporal on this machine.
 *
 * Each candidate is considered only if it was compiled in (the matching
 * SPEAD2_USE_*_STREAM macro is set) and the running CPU reports the
 * feature. The x86 candidates are tried in the order AVX-512, AVX, SSE2;
 * std::memcpy is the fallback when nothing else matches.
 *
 * This function is named as the resolver in the gnu::ifunc attribute on
 * memcpy_nontemporal, and is called directly from memcpy_nontemporal when
 * SPEAD2_USE_FMV is not set.
 *
 * @param hwcaps  (Only when SPEAD2_USE_SVE_STREAM is set.) Hardware
 *                capability bits, tested against HWCAP_SVE.
 * @return Pointer to the selected copy function.
 */
extern "C" void *(*spead2_resolve_memcpy_nontemporal(
#if SPEAD2_USE_SVE_STREAM
    std::uint64_t hwcaps  // See System V ABI for AArch64
#endif
))(void *, const void *, std::size_t) noexcept
{
    using memcpy_fn = void *(*)(void *, const void *, std::size_t) noexcept;

    /* x86 options */
#if SPEAD2_USE_AVX512_STREAM || SPEAD2_USE_AVX_STREAM || SPEAD2_USE_SSE2_STREAM
    // Set up the CPU feature data before querying it: an ifunc resolver
    // can run before any constructors have been called.
    __builtin_cpu_init();
#endif
#if SPEAD2_USE_AVX512_STREAM
    /* On Skylake server, AVX-512 reduces clock speeds. Use the same logic as
     * Glibc to decide whether AVX-512 is okay: it's okay if either AVX512ER or
     * AVX512-VNNI is present. Glibc only applies that logic to Intel CPUs, but
     * AMD introduced AVX-512 with Zen 4 which also supports AVX512-VNNI (and
     * performs well), so we don't need to distinguish.
     */
    if (__builtin_cpu_supports("avx512f")
        && (__builtin_cpu_supports("avx512er") || __builtin_cpu_supports("avx512vnni")))
        return memcpy_nontemporal_avx512;
#endif
#if SPEAD2_USE_AVX_STREAM
    if (__builtin_cpu_supports("avx"))
        return memcpy_nontemporal_avx;
#endif
#if SPEAD2_USE_SSE2_STREAM
    if (__builtin_cpu_supports("sse2"))
        return memcpy_nontemporal_sse2;
#endif

    /* aarch64 options */
#if SPEAD2_USE_SVE_STREAM
    if (hwcaps & HWCAP_SVE)
        return memcpy_nontemporal_sve;
#endif

    /* Depending on the C library, std::memcpy might or might not be marked
     * as noexcept. If not, we need this explicit cast.
     */
    return (memcpy_fn) std::memcpy;
}
181

182
#if SPEAD2_USE_FMV
183

184
[[gnu::ifunc("spead2_resolve_memcpy_nontemporal")]]
185
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
186

187
#else
188

189
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
190
{
191
#if SPEAD2_USE_SVE_STREAM
192
    static void *(*memcpy_nontemporal_ptr)(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept =
193
        spead2_resolve_memcpy_nontemporal(getauxval(AT_HWCAP));
194
#else
195
    static void *(*memcpy_nontemporal_ptr)(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept =
196
        spead2_resolve_memcpy_nontemporal();
197
#endif
198
    return memcpy_nontemporal_ptr(dest, src, n);
199
}
200

201
#endif
202

203
} // namespace spead2
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc