• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

ska-sa / spead2 / 25305128470

04 May 2026 06:48AM UTC coverage: 78.477% (+0.01%) from 78.463%
25305128470

push

github

bmerry
Import MemoryRegion into spead2.send namespace

It is originally defined in spead2._spead2.send.

5586 of 7118 relevant lines covered (78.48%)

90634.33 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

50.0
/src/common_memcpy.cpp
1
/* Copyright 2016, 2020, 2023-2024 National Research Foundation (SARAO)
2
 *
3
 * This program is free software: you can redistribute it and/or modify it under
4
 * the terms of the GNU Lesser General Public License as published by the Free
5
 * Software Foundation, either version 3 of the License, or (at your option) any
6
 * later version.
7
 *
8
 * This program is distributed in the hope that it will be useful, but WITHOUT
9
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10
 * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
11
 * details.
12
 *
13
 * You should have received a copy of the GNU Lesser General Public License
14
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
15
 */
16

17
#include <cstddef>
18
#include <cstdint>
19
#include <cstring>
20
#include <utility>
21
#include <spead2/common_defines.h>
22
#include <spead2/common_features.h>
23
#include <spead2/common_memcpy.h>
24

25
#if SPEAD2_USE_SSE2_STREAM
26
# include <emmintrin.h>
27
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_sse2
28
# define SPEAD2_MEMCPY_TARGET "sse2"
29
# define SPEAD2_MEMCPY_TYPE __m128i
30
# define SPEAD2_MEMCPY_LOAD _mm_loadu_si128
31
# define SPEAD2_MEMCPY_STORE _mm_stream_si128
32
# define SPEAD2_MEMCPY_UNROLL 16
33
# define SPEAD2_MEMCPY_VZEROUPPER 0
34
# include "common_memcpy_x86.h"
35
#endif
36

37
#if SPEAD2_USE_AVX_STREAM
38
# include <immintrin.h>
39
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx
40
# define SPEAD2_MEMCPY_TARGET "avx"
41
# define SPEAD2_MEMCPY_TYPE __m256i
42
# define SPEAD2_MEMCPY_LOAD _mm256_loadu_si256
43
# define SPEAD2_MEMCPY_STORE _mm256_stream_si256
44
# define SPEAD2_MEMCPY_UNROLL 8
45
# define SPEAD2_MEMCPY_VZEROUPPER 1
46
# include "common_memcpy_x86.h"
47
#endif
48

49
#if SPEAD2_USE_AVX512_STREAM
50
# include <immintrin.h>
51
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx512
52
# define SPEAD2_MEMCPY_TARGET "avx512f"
53
# define SPEAD2_MEMCPY_TYPE __m512i
54
# define SPEAD2_MEMCPY_LOAD _mm512_loadu_si512
55
# define SPEAD2_MEMCPY_STORE _mm512_stream_si512
56
# define SPEAD2_MEMCPY_UNROLL 8
57
# define SPEAD2_MEMCPY_VZEROUPPER 1
58
# include "common_memcpy_x86.h"
59
#endif
60

61
#if SPEAD2_USE_SVE_STREAM
62
# include <atomic>
63
# include <sys/auxv.h>
64
# include <arm_sve.h>
65
#endif
66

67
namespace spead2
68
{
69

70
#if SPEAD2_USE_SVE_STREAM
/**
 * Copy @a n bytes from @a src to @a dest using SVE non-temporal loads and
 * stores.
 *
 * @param dest  Destination buffer (must not overlap @a src)
 * @param src   Source buffer
 * @param n     Number of bytes to copy
 * @return @a dest
 */
[[gnu::target("+sve")]]
void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
    /* A note on ordering. The AArch64 memory model says
     *
     * "If an address dependency exists between two Read Memory and an SVE
     * non-temporal vector load instruction generated the second read, then in
     * the absence of any other barrier mechanism to achieve order, the memory
     * accesses can be observed in any order by the other observers within the
     * shareability domain of the memory addresses being accessed."
     *
     * In terms of the C++ memory model this is believed to matter only for
     * std::memory_order_consume, since "carries dependency" is the only
     * situation in which reads are assumed to be ordered without explicit
     * synchronisation. memory_order_consume is not used anywhere in spead2,
     * the C++ standard discourages it, and it's believed that no compiler
     * actually implements it other than by upgrade to acquire.
     *
     * The user documentation for @ref memcpy_nontemporal indicates this
     * limitation, so no barriers are inserted here. Should one become
     * necessary in future, testing on a Grace GH200 (Neoverse V2) chip
     * suggests that writing the address to an atomic and reading it back
     * with memory_order_acquire is more efficient than using
     * atomic_thread_fence.
     */

    std::uint8_t *const out = static_cast<std::uint8_t *>(dest);
    const std::uint8_t *const in = static_cast<const std::uint8_t *>(src);
    const std::size_t vec_bytes = svcntb();      // bytes per SVE vector
    const svbool_t all_lanes = svptrue_b8();
    std::size_t offset = 0;                      // byte offset of the next copy

    /* Aligning needs data up to the next multiple of the vector size, and
     * unrolling only pays off with a reasonable amount of data. Anything
     * smaller is left entirely to the tail loop below.
     */
    if (n >= 4 * vec_bytes)
    {
        /* Bring the source pointer up to a multiple of the vector size.
         * Experiments on Grace (Neoverse V2) show that source alignment
         * matters more to throughput than destination alignment.
         *
         * C++ doesn't guarantee the representation of a pointer converted
         * to uintptr_t, but this only affects performance, not correctness.
         */
        const std::size_t head = -reinterpret_cast<std::uintptr_t>(src) & (vec_bytes - 1);
        const svbool_t head_lanes = svwhilelt_b8(offset, head);
        svstnt1_u8(head_lanes, out, svldnt1_u8(head_lanes, in));
        offset = head;

        // Main loop: two full vectors per iteration, both loads before both stores.
        const std::size_t stride = 2 * vec_bytes;
        for (; offset + stride <= n; offset += stride)
        {
            const svuint8_t lo = svldnt1_u8(all_lanes, in + offset);
            const svuint8_t hi = svldnt1_u8(all_lanes, in + offset + vec_bytes);
            svstnt1_u8(all_lanes, out + offset, lo);
            svstnt1_u8(all_lanes, out + offset + vec_bytes, hi);
        }
    }

    /* Tail: predicated copies, always executed at least once. The loop stops
     * as soon as the predicate for the next vector has its first lane
     * inactive, i.e. when offset has reached or passed n.
     */
    svbool_t tail_lanes = svwhilelt_b8(offset, n);
    for (;;)
    {
        svstnt1_u8(tail_lanes, out + offset, svldnt1_u8(tail_lanes, in + offset));
        offset += vec_bytes;
        tail_lanes = svwhilelt_b8(offset, n);
        if (!svptest_first(all_lanes, tail_lanes))
            break;
    }
    return dest;
}
#endif // SPEAD2_USE_SVE_STREAM
139

140
/* Choose the implementation to use for memcpy_nontemporal.
 *
 * The candidates are tried in order of preference (AVX-512, AVX, SSE2 on x86;
 * SVE on aarch64), each only if it was enabled at compile time and the CPU
 * reports support for it. If none applies, std::memcpy is returned.
 *
 * When SVE support is compiled in, the function takes the hardware
 * capability bits as its argument; otherwise it takes no arguments.
 */
extern "C" void *(*spead2_resolve_memcpy_nontemporal(
#if SPEAD2_USE_SVE_STREAM
    std::uint64_t hwcaps  // See System V ABI for AArch64
#endif
))(void *, const void *, std::size_t) noexcept
{
    using copy_fn = void *(*)(void *, const void *, std::size_t) noexcept;

    /* x86 options */
#if SPEAD2_USE_SSE2_STREAM || SPEAD2_USE_AVX_STREAM || SPEAD2_USE_AVX512_STREAM
    // Must run before any __builtin_cpu_supports query below.
    __builtin_cpu_init();
#endif

#if SPEAD2_USE_AVX512_STREAM
    /* AVX-512 reduces clock speeds on Skylake server. Follow Glibc's rule for
     * deciding whether AVX-512 is okay: it is if either AVX512ER or
     * AVX512-VNNI is present. Glibc only applies that rule to Intel CPUs,
     * but AMD introduced AVX-512 with Zen 4, which also supports AVX512-VNNI
     * (and performs well), so there is no need to distinguish vendors.
     */
    if (__builtin_cpu_supports("avx512f"))
    {
        if (__builtin_cpu_supports("avx512er") || __builtin_cpu_supports("avx512vnni"))
            return memcpy_nontemporal_avx512;
    }
#endif

#if SPEAD2_USE_AVX_STREAM
    if (__builtin_cpu_supports("avx"))
    {
        return memcpy_nontemporal_avx;
    }
#endif

#if SPEAD2_USE_SSE2_STREAM
    if (__builtin_cpu_supports("sse2"))
    {
        return memcpy_nontemporal_sse2;
    }
#endif

    /* aarch64 options */
#if SPEAD2_USE_SVE_STREAM
    if (hwcaps & HWCAP_SVE)
    {
        return memcpy_nontemporal_sve;
    }
#endif

    /* Fallback. Whether std::memcpy is declared noexcept depends on the C
     * library; when it is not, the explicit cast is what makes this compile.
     */
    return (copy_fn) std::memcpy;
}
181

182
#if SPEAD2_USE_FMV
183

184
[[gnu::ifunc("spead2_resolve_memcpy_nontemporal")]]
185
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
186

187
#else
188

189
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
190
{
191
#if SPEAD2_USE_SVE_STREAM
192
    static void *(*memcpy_nontemporal_ptr)(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept =
193
        spead2_resolve_memcpy_nontemporal(getauxval(AT_HWCAP));
194
#else
195
    static void *(*memcpy_nontemporal_ptr)(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept =
196
        spead2_resolve_memcpy_nontemporal();
197
#endif
198
    return memcpy_nontemporal_ptr(dest, src, n);
199
}
200

201
#endif
202

203
} // namespace spead2
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc