24443950125

Committed 15 Apr 2026 08:11AM UTC coverage: 78.747% (-0.01%) from 78.761%

Build # 24443950125

Build Type

Pull #430

github

Committed by

bmerry

Commit Message

Update pytest to 9.0.3

Addresses a dependabot alert. It's probably not that relevant to the
cases where we use the requirements files (CI environments where we
don't need to worry about other untrusted users in the same container).

Pull Request #430: Update pytest to 9.0.3

Coverage Stats

5569 of 7072 relevant lines covered (78.75%)

120104.26 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

50.0

/src/common_memcpy.cpp

/* Copyright 2016, 2020, 2023-2024 National Research Foundation (SARAO)
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <utility>
#include <spead2/common_defines.h>
#include <spead2/common_features.h>
#include <spead2/common_memcpy.h>

/* x86 non-temporal copy routines.
 *
 * Each section below is enabled by its SPEAD2_USE_*_STREAM feature macro. It
 * sets the SPEAD2_MEMCPY_* parameter macros (function name, target string,
 * vector type, load and store intrinsics, unroll factor and a VZEROUPPER
 * flag) and then includes common_memcpy_x86.h. The functions named by
 * SPEAD2_MEMCPY_NAME are the ones returned by the resolver later in this
 * file.
 *
 * NOTE(review): common_memcpy_x86.h is not visible here. It presumably
 * expands the parameters into a function definition and #undef's them
 * afterwards so that the next section can redefine them - confirm against
 * the header.
 */

/* SSE2: 128-bit vectors, streaming stores via _mm_stream_si128. */
#if SPEAD2_USE_SSE2_STREAM
# include <emmintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_sse2
# define SPEAD2_MEMCPY_TARGET "sse2"
# define SPEAD2_MEMCPY_TYPE __m128i
# define SPEAD2_MEMCPY_LOAD _mm_loadu_si128
# define SPEAD2_MEMCPY_STORE _mm_stream_si128
# define SPEAD2_MEMCPY_UNROLL 16
# define SPEAD2_MEMCPY_VZEROUPPER 0
# include "common_memcpy_x86.h"
#endif

/* AVX: 256-bit vectors, streaming stores via _mm256_stream_si256. */
#if SPEAD2_USE_AVX_STREAM
# include <immintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx
# define SPEAD2_MEMCPY_TARGET "avx"
# define SPEAD2_MEMCPY_TYPE __m256i
# define SPEAD2_MEMCPY_LOAD _mm256_loadu_si256
# define SPEAD2_MEMCPY_STORE _mm256_stream_si256
# define SPEAD2_MEMCPY_UNROLL 8
# define SPEAD2_MEMCPY_VZEROUPPER 1
# include "common_memcpy_x86.h"
#endif

/* AVX-512: 512-bit vectors, streaming stores via _mm512_stream_si512. */
#if SPEAD2_USE_AVX512_STREAM
# include <immintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx512
# define SPEAD2_MEMCPY_TARGET "avx512f"
# define SPEAD2_MEMCPY_TYPE __m512i
# define SPEAD2_MEMCPY_LOAD _mm512_loadu_si512
# define SPEAD2_MEMCPY_STORE _mm512_stream_si512
# define SPEAD2_MEMCPY_UNROLL 8
# define SPEAD2_MEMCPY_VZEROUPPER 1
# include "common_memcpy_x86.h"
#endif

#if SPEAD2_USE_SVE_STREAM
# include <atomic>
# include <sys/auxv.h>
# include <arm_sve.h>
#endif

namespace spead2
{

#if SPEAD2_USE_SVE_STREAM
/**
 * Copy @a n bytes from @a src to @a dest using SVE non-temporal loads
 * (svldnt1_u8) and stores (svstnt1_u8).
 *
 * The copy proceeds in up to three phases:
 * -# If there are at least four vectors' worth of data, a predicated partial
 *    vector is copied so that the source offset reaches a multiple of the
 *    vector size.
 * -# Still under that condition, whole vectors are copied two at a time
 *    while at least two vectors remain.
 * -# Whatever is left (possibly nothing) is copied one predicated vector at
 *    a time.
 *
 * No memory barriers are issued; see the comment at the top of the body.
 *
 * @param dest  Destination buffer; must not overlap @a src (restrict-qualified).
 * @param src   Source buffer.
 * @param n     Number of bytes to copy.
 * @return @a dest
 */
[[gnu::target("+sve")]]
void *memcpy_nontemporal_sve(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
    /* The AArch64 memory model says
     *
     * "If an address dependency exists between two Read Memory and an SVE
     * non-temporal vector load instruction generated the second read, then in
     * the absence of any other barrier mechanism to achieve order, the memory
     * accesses can be observed in any order by the other observers within the
     * shareability domain of the memory addresses being accessed."
     *
     * I think that in the C++ memory model, this should only affect
     * std::memory_order_consume (since "carries dependency" is the only time
     * reads are assumed to be ordered in the absence of explicit
     * synchronisation); memory_order_consume is not used anywhere in spead2,
     * the C++ standard discourages it, and it's believed that no compiler
     * actually implements it other than by upgrade to acquire.
     *
     * The user documentation for @ref memcpy_nontemporal indicates this
     * limitation, so we do not insert any barriers here. If it becomes
     * necessary in future, testing on a Grace GH200 (Neoverse V2) chip
     * suggests that it is more efficient to write the address to an atomic
     * and read it back with memory_order_acquire than it is to use
     * atomic_thread_fence.
     */

    std::uint8_t *destc = (std::uint8_t *) dest;
    const std::uint8_t *srcc = (const std::uint8_t *) src;
    std::size_t i = 0;  // byte offset for next copy

    /* Alignment requires we have data up to the next multiple, and it's
     * not worth unrolling unless we have a reasonable amount of data.
     * For anything smaller, we'll just rely on the tail handling.
     */
    if (n >= 4 * svcntb())
    {
        /* Align the source pointer to a multiple of the vector size.
         * Experiments on Grace (Neoverse V2) show that source alignment
         * is more important than destination alignment to throughput.
         *
         * C++ doesn't guarantee the representation of a pointer when
         * cast to uintptr_t, but we're only depending on it for performance,
         * not correctness.
         */
        // head = number of bytes from src up to the next vector-size boundary
        // (0 if src is already on one); it is always less than svcntb().
        std::size_t head = -std::uintptr_t(src) & (svcntb() - 1);
        // Predicate enabling only the first `head` byte lanes; when head is 0
        // no lanes are active and the copy below transfers nothing.
        svbool_t pg = svwhilelt_b8(i, head);
        svstnt1_u8(pg, destc, svldnt1_u8(pg, srcc));
        i = head;

        // Main loop: two full vectors per iteration. Both loads are issued
        // before either store.
        while (i + 2 * svcntb() <= n)
        {
            svuint8_t data0 = svldnt1_u8(svptrue_b8(), &srcc[i]);
            svuint8_t data1 = svldnt1_u8(svptrue_b8(), &srcc[i + svcntb()]);
            svstnt1_u8(svptrue_b8(), &destc[i], data0);
            svstnt1_u8(svptrue_b8(), &destc[i + svcntb()], data1);
            i += 2 * svcntb();
        }
    }

    /* Tail: copy the remaining n - i bytes one predicated vector at a time.
     * The body always runs once; if nothing remains (including n == 0) the
     * predicate has no active lanes, so no bytes are transferred. The loop
     * repeats while the first lane of the recomputed predicate is active,
     * i.e. while i < n.
     */
    svbool_t pg = svwhilelt_b8(i, n);
    do
    {
        svstnt1_u8(pg, &destc[i], svldnt1_u8(pg, &srcc[i]));
        i += svcntb();
    } while (svptest_first(svptrue_b8(), pg = svwhilelt_b8(i, n)));
    return dest;
}
#endif // SPEAD2_USE_SVE_STREAM

/**
 * Choose the implementation that backs @ref memcpy_nontemporal on this CPU.
 *
 * Only the variants compiled in through the SPEAD2_USE_*_STREAM feature
 * macros are considered. On x86 they are tried in the order AVX-512, AVX,
 * SSE2; on aarch64 SVE is tried. If none of them is usable, std::memcpy is
 * returned instead.
 *
 * The function has C linkage so that its unmangled name can be given to the
 * gnu::ifunc attribute.
 *
 * @param hwcaps  (SVE builds only) hardware capability bits, as obtained
 *                from getauxval(AT_HWCAP).
 * @return Pointer to the selected copy function.
 */
extern "C" void *(*spead2_resolve_memcpy_nontemporal(
#if SPEAD2_USE_SVE_STREAM
    std::uint64_t hwcaps  // See System V ABI for AArch64
#endif
))(void *, const void *, std::size_t) noexcept
{
    using memcpy_fn = void *(*)(void *, const void *, std::size_t) noexcept;

    /* ---- x86 candidates ---- */
#if SPEAD2_USE_AVX512_STREAM || SPEAD2_USE_AVX_STREAM || SPEAD2_USE_SSE2_STREAM
    // The CPU feature data must be initialised explicitly: an ifunc resolver
    // can run before the constructors that would normally set it up.
    __builtin_cpu_init();
#endif

#if SPEAD2_USE_AVX512_STREAM
    {
        /* AVX-512 lowers clock speeds on Skylake server parts. Follow Glibc's
         * heuristic and only accept AVX-512 when AVX512ER or AVX512-VNNI is
         * also present. Glibc restricts the heuristic to Intel, but AMD's
         * first AVX-512 generation (Zen 4) has AVX512-VNNI and performs
         * well, so no vendor check is needed.
         */
        const bool has_avx512f = __builtin_cpu_supports("avx512f");
        const bool avx512_is_fast =
            __builtin_cpu_supports("avx512er") || __builtin_cpu_supports("avx512vnni");
        if (has_avx512f && avx512_is_fast)
        {
            return memcpy_nontemporal_avx512;
        }
    }
#endif

#if SPEAD2_USE_AVX_STREAM
    if (__builtin_cpu_supports("avx"))
    {
        return memcpy_nontemporal_avx;
    }
#endif

#if SPEAD2_USE_SSE2_STREAM
    if (__builtin_cpu_supports("sse2"))
    {
        return memcpy_nontemporal_sse2;
    }
#endif

    /* ---- aarch64 candidates ---- */
#if SPEAD2_USE_SVE_STREAM
    if ((hwcaps & HWCAP_SVE) != 0)
    {
        return memcpy_nontemporal_sve;
    }
#endif

    /* Fallback. Some C libraries declare std::memcpy noexcept and some do
     * not, so cast explicitly to the noexcept pointer type.
     */
    return (memcpy_fn) std::memcpy;
}

#if SPEAD2_USE_FMV

/* With function multi-versioning, memcpy_nontemporal is an indirect function:
 * its ifunc attribute names spead2_resolve_memcpy_nontemporal as the
 * resolver that supplies the implementation.
 */
[[gnu::ifunc("spead2_resolve_memcpy_nontemporal")]]
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;

#else

/**
 * Copy @a n bytes from @a src to @a dest with the implementation chosen by
 * spead2_resolve_memcpy_nontemporal.
 *
 * The resolver runs once, when the function-local static is initialised on
 * the first call; later calls reuse the stored pointer.
 *
 * @return Whatever the selected implementation returns.
 */
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
    using impl_fn = void *(*)(void * __restrict__, const void * __restrict__, std::size_t) noexcept;

    // SVE builds pass the hardware capability bits to the resolver; other
    // builds use the zero-argument form.
    static const impl_fn selected_impl = spead2_resolve_memcpy_nontemporal(
#if SPEAD2_USE_SVE_STREAM
        getauxval(AT_HWCAP)
#endif
    );
    return selected_impl(dest, src, n);
}

#endif

} // namespace spead2

1	/* Copyright 2016, 2020, 2023-2024 National Research Foundation (SARAO)
2	*
3	* This program is free software: you can redistribute it and/or modify it under
4	* the terms of the GNU Lesser General Public License as published by the Free
5	* Software Foundation, either version 3 of the License, or (at your option) any
6	* later version.
7	*
8	* This program is distributed in the hope that it will be useful, but WITHOUT
9	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10	* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
11	* details.
12	*
13	* You should have received a copy of the GNU Lesser General Public License
14	* along with this program. If not, see <http://www.gnu.org/licenses/>.
15	*/
16
17	#include <cstddef>
18	#include <cstdint>
19	#include <cstring>
20	#include <utility>
21	#include <spead2/common_defines.h>
22	#include <spead2/common_features.h>
23	#include <spead2/common_memcpy.h>
24
25	#if SPEAD2_USE_SSE2_STREAM
26	# include <emmintrin.h>
27	# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_sse2
28	# define SPEAD2_MEMCPY_TARGET "sse2"
29	# define SPEAD2_MEMCPY_TYPE __m128i
30	# define SPEAD2_MEMCPY_LOAD _mm_loadu_si128
31	# define SPEAD2_MEMCPY_STORE _mm_stream_si128
32	# define SPEAD2_MEMCPY_UNROLL 16
33	# define SPEAD2_MEMCPY_VZEROUPPER 0
34	# include "common_memcpy_x86.h"
35	#endif
36
37	#if SPEAD2_USE_AVX_STREAM
38	# include <immintrin.h>
39	# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx
40	# define SPEAD2_MEMCPY_TARGET "avx"
41	# define SPEAD2_MEMCPY_TYPE __m256i
42	# define SPEAD2_MEMCPY_LOAD _mm256_loadu_si256
43	# define SPEAD2_MEMCPY_STORE _mm256_stream_si256
44	# define SPEAD2_MEMCPY_UNROLL 8
45	# define SPEAD2_MEMCPY_VZEROUPPER 1
46	# include "common_memcpy_x86.h"
47	#endif
48
49	#if SPEAD2_USE_AVX512_STREAM
50	# include <immintrin.h>
51	# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx512
52	# define SPEAD2_MEMCPY_TARGET "avx512f"
53	# define SPEAD2_MEMCPY_TYPE __m512i
54	# define SPEAD2_MEMCPY_LOAD _mm512_loadu_si512
55	# define SPEAD2_MEMCPY_STORE _mm512_stream_si512
56	# define SPEAD2_MEMCPY_UNROLL 8
57	# define SPEAD2_MEMCPY_VZEROUPPER 1
58	# include "common_memcpy_x86.h"
59	#endif
60
61	#if SPEAD2_USE_SVE_STREAM
62	# include <atomic>
63	# include <sys/auxv.h>
64	# include <arm_sve.h>
65	#endif
66
67	namespace spead2
68	{
69
70	#if SPEAD2_USE_SVE_STREAM
71	[[gnu::target("+sve")]]
72	void memcpy_nontemporal_sve(void __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
73	{
74	/* The AArch64 memory model says
75	*
76	* "If an address dependency exists between two Read Memory and an SVE
77	* non-temporal vector load instruction generated the second read, then in
78	* the absence of any other barrier mechanism to achieve order, the memory
79	* accesses can be observed in any order by the other observers within the
80	* shareability domain of the memory addresses being accessed."
81	*
82	* I think that in the C++ memory model, this should only affect
83	* std::memory_order_consume (since "carries dependency" is the only time
84	* reads are assumed to be ordered in the absence of explicit
85	* synchronisation); memory_order_consume is not used anywhere in spead2,
86	* the C++ standard discourages it, and it's believed that no compiler
87	* actually implements it other than by upgrade to acquire.
88	*
89	* The user documentation for @ref memcpy_nontemporal indicates this
90	* limitation, so we do not insert any barriers here. If it becomes
91	* necessary in future, testing on a Grace GH200 (Neoverse V2) chip
92	* suggests that it is more efficient to write the address to an atomic
93	* and read it back with memory_order_acquire than it is to use
94	* atomic_thread_fence.
95	*/
96
97	std::uint8_t destc = (std::uint8_t ) dest;
98	const std::uint8_t srcc = (const std::uint8_t ) src;
99	std::size_t i = 0; // byte offset for next copy
100
101	/* Alignment requires we have data up to the next multiple, and it's
102	* not worth unrolling unless we have a reasonable amount of data.
103	* For anything smaller, we'll just rely on the tail handling.
104	*/
105	if (n >= 4 * svcntb())
106	{
107	/* Align the source pointer to a multiple of the vector size.
108	* Experiments on Grace (Neoverse V2) show that source alignment
109	* is more important than destination alignment to throughput.
110	*
111	* C++ doesn't guarantee the representation of a pointer when
112	* cast to uintptr_t, but we're only depending on it for performance,
113	* not correctness.
114	*/
115	std::size_t head = -std::uintptr_t(src) & (svcntb() - 1);
116	svbool_t pg = svwhilelt_b8(i, head);
117	svstnt1_u8(pg, destc, svldnt1_u8(pg, srcc));
118	i = head;
119
120	while (i + 2 * svcntb() <= n)
121	{
122	svuint8_t data0 = svldnt1_u8(svptrue_b8(), &srcc[i]);
123	svuint8_t data1 = svldnt1_u8(svptrue_b8(), &srcc[i + svcntb()]);
124	svstnt1_u8(svptrue_b8(), &destc[i], data0);
125	svstnt1_u8(svptrue_b8(), &destc[i + svcntb()], data1);
126	i += 2 * svcntb();
127	}
128	}
129
130	svbool_t pg = svwhilelt_b8(i, n);
131	do
132	{
133	svstnt1_u8(pg, &destc[i], svldnt1_u8(pg, &srcc[i]));
134	i += svcntb();
135	} while (svptest_first(svptrue_b8(), pg = svwhilelt_b8(i, n)));
136	return dest;
137	}
138	#endif // SPEAD2_USE_SVE_STREAM
139
140	extern "C" void (spead2_resolve_memcpy_nontemporal(	7✔
141	#if SPEAD2_USE_SVE_STREAM
142	std::uint64_t hwcaps // See System V AVI for AArch64
143	#endif
144	))(void , const void , std::size_t) noexcept
145	{
146	/* x86 options */
147	#if SPEAD2_USE_AVX512_STREAM \|\| SPEAD2_USE_AVX_STREAM \|\| SPEAD2_USE_SSE2_STREAM
148	__builtin_cpu_init();	7✔
149	#endif
150	#if SPEAD2_USE_AVX512_STREAM
151	/* On Skylake server, AVX-512 reduces clock speeds. Use the same logic as
152	* Glibc to decide whether AVX-512 is okay: it's okay if either AVX512ER or
153	* AVX512-VNNI is present. Glibc only applies that logic to Intel CPUs, but
154	* AMD introduced AVX-512 with Zen 4 which also supports AVX512-VNNI (and
155	* performs well), so we don't need to distinguish.
156	*/
157	if (__builtin_cpu_supports("avx512f")	7✔
158	&& (__builtin_cpu_supports("avx512er") \|\| __builtin_cpu_supports("avx512vnni")))	7✔
159	return memcpy_nontemporal_avx512;	7✔
160	#endif
161	#if SPEAD2_USE_AVX_STREAM
162	if (__builtin_cpu_supports("avx"))	×
163	return memcpy_nontemporal_avx;	×
164	#endif
165	#if SPEAD2_USE_SSE2_STREAM
166	if (__builtin_cpu_supports("sse2"))	×
167	return memcpy_nontemporal_sse2;	×
168	#endif
169
170	/* aarch64 options */
171	#if SPEAD2_USE_SVE_STREAM
172	if (hwcaps & HWCAP_SVE)
173	return memcpy_nontemporal_sve;
174	#endif
175
176	/* Depending on the C library, std::memcpy might or might not be marked
177	* as noexcept. If not, we need this explicit cast.
178	*/
179	return (void ()(void , const void , std::size_t) noexcept) std::memcpy;	×
180	}
181
182	#if SPEAD2_USE_FMV
183
184	[[gnu::ifunc("spead2_resolve_memcpy_nontemporal")]]
185	void memcpy_nontemporal(void __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
186
187	#else
188
189	void memcpy_nontemporal(void __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
190	{
191	#if SPEAD2_USE_SVE_STREAM
192	static void (memcpy_nontemporal_ptr)(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept =
193	spead2_resolve_memcpy_nontemporal(getauxval(AT_HWCAP));
194	#else
195	static void (memcpy_nontemporal_ptr)(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept =
196	spead2_resolve_memcpy_nontemporal();
197	#endif
198	return memcpy_nontemporal_ptr(dest, src, n);
199	}
200
201	#endif
202
203	} // namespace spead2

ska-sa / spead2 / 24443950125

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous