25305128470

Committed 04 May 2026 06:48AM UTC coverage: 78.477% (+0.01%) from 78.463%

Build # 25305128470

Build Type

push

github

Committed by

bmerry

Commit Message

Import MemoryRegion into spead2.send namespace

It is originally defined in spead2._spead2.send.

Coverage Stats

5586 of 7118 relevant lines covered (78.48%)

90634.33 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

50.0

/src/common_memcpy.cpp

/* Copyright 2016, 2020, 2023-2024 National Research Foundation (SARAO)
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <utility>
#include <spead2/common_defines.h>
#include <spead2/common_features.h>
#include <spead2/common_memcpy.h>

#if SPEAD2_USE_SSE2_STREAM
# include <emmintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_sse2
# define SPEAD2_MEMCPY_TARGET "sse2"
# define SPEAD2_MEMCPY_TYPE __m128i
# define SPEAD2_MEMCPY_LOAD _mm_loadu_si128
# define SPEAD2_MEMCPY_STORE _mm_stream_si128
# define SPEAD2_MEMCPY_UNROLL 16
# define SPEAD2_MEMCPY_VZEROUPPER 0
# include "common_memcpy_x86.h"
#endif

#if SPEAD2_USE_AVX_STREAM
# include <immintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx
# define SPEAD2_MEMCPY_TARGET "avx"
# define SPEAD2_MEMCPY_TYPE __m256i
# define SPEAD2_MEMCPY_LOAD _mm256_loadu_si256
# define SPEAD2_MEMCPY_STORE _mm256_stream_si256
# define SPEAD2_MEMCPY_UNROLL 8
# define SPEAD2_MEMCPY_VZEROUPPER 1
# include "common_memcpy_x86.h"
#endif

#if SPEAD2_USE_AVX512_STREAM
# include <immintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx512
# define SPEAD2_MEMCPY_TARGET "avx512f"
# define SPEAD2_MEMCPY_TYPE __m512i
# define SPEAD2_MEMCPY_LOAD _mm512_loadu_si512
# define SPEAD2_MEMCPY_STORE _mm512_stream_si512
# define SPEAD2_MEMCPY_UNROLL 8
# define SPEAD2_MEMCPY_VZEROUPPER 1
# include "common_memcpy_x86.h"
#endif

#if SPEAD2_USE_SVE_STREAM
# include <atomic>
# include <sys/auxv.h>
# include <arm_sve.h>
#endif

namespace spead2
{

#if SPEAD2_USE_SVE_STREAM
/**
 * Copy @a n bytes from @a src to @a dest using SVE non-temporal loads and
 * stores.
 *
 * @param dest  Start of the destination buffer
 * @param src   Start of the source buffer
 * @param n     Number of bytes to copy
 * @return @a dest
 */
[[gnu::target("+sve")]]
void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
    /* A note on memory ordering. The AArch64 memory model says
     *
     * "If an address dependency exists between two Read Memory and an SVE
     * non-temporal vector load instruction generated the second read, then in
     * the absence of any other barrier mechanism to achieve order, the memory
     * accesses can be observed in any order by the other observers within the
     * shareability domain of the memory addresses being accessed."
     *
     * In terms of the C++ memory model this is thought to matter only for
     * std::memory_order_consume, because "carries dependency" is the only
     * situation in which reads are assumed to be ordered without explicit
     * synchronisation. spead2 never uses memory_order_consume, the C++
     * standard discourages it, and it is believed that every compiler
     * implements it by upgrading it to acquire.
     *
     * The user documentation for @ref memcpy_nontemporal states this
     * limitation, so no barriers are inserted here. Should one become
     * necessary, testing on a Grace GH200 (Neoverse V2) chip suggests that
     * writing the address to an atomic and reading it back with
     * memory_order_acquire is cheaper than atomic_thread_fence.
     */

    auto *out = static_cast<std::uint8_t *>(dest);
    const auto *in = static_cast<const std::uint8_t *>(src);
    const std::uint64_t vl = svcntb();    // bytes in one SVE vector
    const svbool_t all = svptrue_b8();    // predicate with every byte lane active
    std::size_t pos = 0;                  // byte offset of the next byte to copy

    /* Aligning consumes data up to the next vector multiple, and unrolling
     * only pays off with a reasonable amount of data. Anything shorter is
     * left entirely to the tail loop below.
     */
    if (n >= 4 * vl)
    {
        /* Copy just enough bytes to bring the source pointer to a multiple
         * of the vector size. Experiments on Grace (Neoverse V2) show that
         * source alignment matters more to throughput than destination
         * alignment.
         *
         * C++ doesn't guarantee the representation of a pointer converted to
         * uintptr_t, but this only affects performance, not correctness.
         */
        const std::size_t head = -reinterpret_cast<std::uintptr_t>(src) & (vl - 1);
        const svbool_t head_lanes = svwhilelt_b8(pos, head);
        svstnt1_u8(head_lanes, out, svldnt1_u8(head_lanes, in));
        pos = head;

        /* Main loop: two full vectors per iteration, both loads issued
         * before either store.
         */
        for (; pos + 2 * vl <= n; pos += 2 * vl)
        {
            const svuint8_t first = svldnt1_u8(all, in + pos);
            const svuint8_t second = svldnt1_u8(all, in + pos + vl);
            svstnt1_u8(all, out + pos, first);
            svstnt1_u8(all, out + pos + vl, second);
        }
    }

    /* Tail: one predicated vector at a time. The body always runs at least
     * once, and repeats for as long as the first lane of the next predicate
     * is active.
     */
    svbool_t tail_lanes = svwhilelt_b8(pos, n);
    do
    {
        svstnt1_u8(tail_lanes, out + pos, svldnt1_u8(tail_lanes, in + pos));
        pos += vl;
        tail_lanes = svwhilelt_b8(pos, n);
    } while (svptest_first(all, tail_lanes));
    return dest;
}
#endif // SPEAD2_USE_SVE_STREAM

/* Choose the copy routine that memcpy_nontemporal should dispatch to.
 *
 * The candidates are tried in a fixed order (AVX-512, AVX, SSE2, then SVE),
 * each only if it was enabled at compile time and the running CPU reports
 * support for it. If none applies, std::memcpy is returned.
 *
 * When SVE support is compiled in, the function takes the hardware
 * capability bits as its single argument; otherwise it takes no arguments.
 */
extern "C" void *(*spead2_resolve_memcpy_nontemporal(
#if SPEAD2_USE_SVE_STREAM
    std::uint64_t hwcaps  // See System V ABI for AArch64
#endif
))(void *, const void *, std::size_t) noexcept
{
    using copy_function = void *(*)(void *, const void *, std::size_t) noexcept;

    /* ---- x86 candidates ---- */
#if SPEAD2_USE_AVX512_STREAM || SPEAD2_USE_AVX_STREAM || SPEAD2_USE_SSE2_STREAM
    // The CPU feature data must be initialised before it is queried below.
    __builtin_cpu_init();
#endif
#if SPEAD2_USE_AVX512_STREAM
    /* AVX-512 lowers clock speeds on Skylake server. Follow Glibc's rule for
     * deciding whether AVX-512 is acceptable: it is if either AVX512ER or
     * AVX512-VNNI is present. Glibc restricts that rule to Intel CPUs, but
     * AMD's first AVX-512 part (Zen 4) also has AVX512-VNNI and performs
     * well, so there is no need to tell the vendors apart.
     */
    if (__builtin_cpu_supports("avx512f"))
    {
        if (__builtin_cpu_supports("avx512er") || __builtin_cpu_supports("avx512vnni"))
            return memcpy_nontemporal_avx512;
    }
#endif
#if SPEAD2_USE_AVX_STREAM
    if (__builtin_cpu_supports("avx"))
        return memcpy_nontemporal_avx;
#endif
#if SPEAD2_USE_SSE2_STREAM
    if (__builtin_cpu_supports("sse2"))
        return memcpy_nontemporal_sse2;
#endif

    /* ---- aarch64 candidates ---- */
#if SPEAD2_USE_SVE_STREAM
    if (hwcaps & HWCAP_SVE)
        return memcpy_nontemporal_sve;
#endif

    /* Fallback. Whether std::memcpy is declared noexcept depends on the C
     * library; the explicit cast covers the case where it is not.
     */
    return reinterpret_cast<copy_function>(std::memcpy);
}

#if SPEAD2_USE_FMV

/* With function multi-versioning enabled, memcpy_nontemporal is declared as
 * an indirect function whose resolver is spead2_resolve_memcpy_nontemporal;
 * no body is defined here.
 */
[[gnu::ifunc("spead2_resolve_memcpy_nontemporal")]]
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;

#else

/* Without function multi-versioning, call the resolver by hand: the chosen
 * implementation is stored in a function-local static (initialised the first
 * time this function runs) and every call is forwarded through it.
 */
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
    using copy_function = void *(*)(void * __restrict__, const void * __restrict__, std::size_t) noexcept;

    static const copy_function resolved =
#if SPEAD2_USE_SVE_STREAM
        // The SVE-aware resolver needs the hardware capability bits.
        spead2_resolve_memcpy_nontemporal(getauxval(AT_HWCAP));
#else
        spead2_resolve_memcpy_nontemporal();
#endif
    return resolved(dest, src, n);
}

#endif

} // namespace spead2

1	/* Copyright 2016, 2020, 2023-2024 National Research Foundation (SARAO)
2	*
3	* This program is free software: you can redistribute it and/or modify it under
4	* the terms of the GNU Lesser General Public License as published by the Free
5	* Software Foundation, either version 3 of the License, or (at your option) any
6	* later version.
7	*
8	* This program is distributed in the hope that it will be useful, but WITHOUT
9	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10	* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
11	* details.
12	*
13	* You should have received a copy of the GNU Lesser General Public License
14	* along with this program. If not, see <http://www.gnu.org/licenses/>.
15	*/
16
17	#include <cstddef>
18	#include <cstdint>
19	#include <cstring>
20	#include <utility>
21	#include <spead2/common_defines.h>
22	#include <spead2/common_features.h>
23	#include <spead2/common_memcpy.h>
24
25	#if SPEAD2_USE_SSE2_STREAM
26	# include <emmintrin.h>
27	# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_sse2
28	# define SPEAD2_MEMCPY_TARGET "sse2"
29	# define SPEAD2_MEMCPY_TYPE __m128i
30	# define SPEAD2_MEMCPY_LOAD _mm_loadu_si128
31	# define SPEAD2_MEMCPY_STORE _mm_stream_si128
32	# define SPEAD2_MEMCPY_UNROLL 16
33	# define SPEAD2_MEMCPY_VZEROUPPER 0
34	# include "common_memcpy_x86.h"
35	#endif
36
37	#if SPEAD2_USE_AVX_STREAM
38	# include <immintrin.h>
39	# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx
40	# define SPEAD2_MEMCPY_TARGET "avx"
41	# define SPEAD2_MEMCPY_TYPE __m256i
42	# define SPEAD2_MEMCPY_LOAD _mm256_loadu_si256
43	# define SPEAD2_MEMCPY_STORE _mm256_stream_si256
44	# define SPEAD2_MEMCPY_UNROLL 8
45	# define SPEAD2_MEMCPY_VZEROUPPER 1
46	# include "common_memcpy_x86.h"
47	#endif
48
49	#if SPEAD2_USE_AVX512_STREAM
50	# include <immintrin.h>
51	# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx512
52	# define SPEAD2_MEMCPY_TARGET "avx512f"
53	# define SPEAD2_MEMCPY_TYPE __m512i
54	# define SPEAD2_MEMCPY_LOAD _mm512_loadu_si512
55	# define SPEAD2_MEMCPY_STORE _mm512_stream_si512
56	# define SPEAD2_MEMCPY_UNROLL 8
57	# define SPEAD2_MEMCPY_VZEROUPPER 1
58	# include "common_memcpy_x86.h"
59	#endif
60
61	#if SPEAD2_USE_SVE_STREAM
62	# include <atomic>
63	# include <sys/auxv.h>
64	# include <arm_sve.h>
65	#endif
66
67	namespace spead2
68	{
69
70	#if SPEAD2_USE_SVE_STREAM
71	[[gnu::target("+sve")]]
72	void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
73	{
74	/* The AArch64 memory model says
75	*
76	* "If an address dependency exists between two Read Memory and an SVE
77	* non-temporal vector load instruction generated the second read, then in
78	* the absence of any other barrier mechanism to achieve order, the memory
79	* accesses can be observed in any order by the other observers within the
80	* shareability domain of the memory addresses being accessed."
81	*
82	* I think that in the C++ memory model, this should only affect
83	* std::memory_order_consume (since "carries dependency" is the only time
84	* reads are assumed to be ordered in the absence of explicit
85	* synchronisation); memory_order_consume is not used anywhere in spead2,
86	* the C++ standard discourages it, and it's believed that no compiler
87	* actually implements it other than by upgrade to acquire.
88	*
89	* The user documentation for @ref memcpy_nontemporal indicates this
90	* limitation, so we do not insert any barriers here. If it becomes
91	* necessary in future, testing on a Grace GH200 (Neoverse V2) chip
92	* suggests that it is more efficient to write the address to an atomic
93	* and read it back with memory_order_acquire than it is to use
94	* atomic_thread_fence.
95	*/
96
97	std::uint8_t *destc = (std::uint8_t *) dest;
98	const std::uint8_t *srcc = (const std::uint8_t *) src;
99	std::size_t i = 0; // byte offset for next copy
100
101	/* Alignment requires we have data up to the next multiple, and it's
102	* not worth unrolling unless we have a reasonable amount of data.
103	* For anything smaller, we'll just rely on the tail handling.
104	*/
105	if (n >= 4 * svcntb())
106	{
107	/* Align the source pointer to a multiple of the vector size.
108	* Experiments on Grace (Neoverse V2) show that source alignment
109	* is more important than destination alignment to throughput.
110	*
111	* C++ doesn't guarantee the representation of a pointer when
112	* cast to uintptr_t, but we're only depending on it for performance,
113	* not correctness.
114	*/
115	std::size_t head = -std::uintptr_t(src) & (svcntb() - 1);
116	svbool_t pg = svwhilelt_b8(i, head);
117	svstnt1_u8(pg, destc, svldnt1_u8(pg, srcc));
118	i = head;
119
120	while (i + 2 * svcntb() <= n)
121	{
122	svuint8_t data0 = svldnt1_u8(svptrue_b8(), &srcc[i]);
123	svuint8_t data1 = svldnt1_u8(svptrue_b8(), &srcc[i + svcntb()]);
124	svstnt1_u8(svptrue_b8(), &destc[i], data0);
125	svstnt1_u8(svptrue_b8(), &destc[i + svcntb()], data1);
126	i += 2 * svcntb();
127	}
128	}
129
130	svbool_t pg = svwhilelt_b8(i, n);
131	do
132	{
133	svstnt1_u8(pg, &destc[i], svldnt1_u8(pg, &srcc[i]));
134	i += svcntb();
135	} while (svptest_first(svptrue_b8(), pg = svwhilelt_b8(i, n)));
136	return dest;
137	}
138	#endif // SPEAD2_USE_SVE_STREAM
139
140	extern "C" void *(*spead2_resolve_memcpy_nontemporal(	7✔
141	#if SPEAD2_USE_SVE_STREAM
142	std::uint64_t hwcaps // See System V ABI for AArch64
143	#endif
144	))(void *, const void *, std::size_t) noexcept
145	{
146	/* x86 options */
147	#if SPEAD2_USE_AVX512_STREAM || SPEAD2_USE_AVX_STREAM || SPEAD2_USE_SSE2_STREAM
148	__builtin_cpu_init();	7✔
149	#endif
150	#if SPEAD2_USE_AVX512_STREAM
151	/* On Skylake server, AVX-512 reduces clock speeds. Use the same logic as
152	* Glibc to decide whether AVX-512 is okay: it's okay if either AVX512ER or
153	* AVX512-VNNI is present. Glibc only applies that logic to Intel CPUs, but
154	* AMD introduced AVX-512 with Zen 4 which also supports AVX512-VNNI (and
155	* performs well), so we don't need to distinguish.
156	*/
157	if (__builtin_cpu_supports("avx512f")	7✔
158	&& (__builtin_cpu_supports("avx512er") || __builtin_cpu_supports("avx512vnni")))	×
159	return memcpy_nontemporal_avx512;	×
160	#endif
161	#if SPEAD2_USE_AVX_STREAM
162	if (__builtin_cpu_supports("avx"))	7✔
163	return memcpy_nontemporal_avx;	7✔
164	#endif
165	#if SPEAD2_USE_SSE2_STREAM
166	if (__builtin_cpu_supports("sse2"))	×
167	return memcpy_nontemporal_sse2;	×
168	#endif
169
170	/* aarch64 options */
171	#if SPEAD2_USE_SVE_STREAM
172	if (hwcaps & HWCAP_SVE)
173	return memcpy_nontemporal_sve;
174	#endif
175
176	/* Depending on the C library, std::memcpy might or might not be marked
177	* as noexcept. If not, we need this explicit cast.
178	*/
179	return (void *(*)(void *, const void *, std::size_t) noexcept) std::memcpy;	×
180	}
181
182	#if SPEAD2_USE_FMV
183
184	[[gnu::ifunc("spead2_resolve_memcpy_nontemporal")]]
185	void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
186
187	#else
188
189	void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
190	{
191	#if SPEAD2_USE_SVE_STREAM
192	static void *(*memcpy_nontemporal_ptr)(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept =
193	spead2_resolve_memcpy_nontemporal(getauxval(AT_HWCAP));
194	#else
195	static void *(*memcpy_nontemporal_ptr)(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept =
196	spead2_resolve_memcpy_nontemporal();
197	#endif
198	return memcpy_nontemporal_ptr(dest, src, n);
199	}
200
201	#endif
202
203	} // namespace spead2

ska-sa / spead2 / 25305128470

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous