• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37596

pending completion
#37596

push

local

web-flow
🤖 [master] Bump the Pkg stdlib from 2c04d5a98 to b044bf6a2 (#50851)

Co-authored-by: Dilum Aluthge <dilum@aluthge.com>

71913 of 84418 relevant lines covered (85.19%)

32144286.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.61
/stdlib/Random/src/XoshiroSimd.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
module XoshiroSimd
4
# Getting the xoroshiro RNG to reliably vectorize is somewhat of a hassle without Simd.jl.
5
import ..Random: TaskLocalRNG, rand, rand!, Xoshiro, CloseOpen01, UnsafeView,
6
                 SamplerType, SamplerTrivial
7
using Base: BitInteger_types
8
using Base.Libc: memcpy
9
using Core.Intrinsics: llvmcall
10

11
# Vector-width. Influences random stream.
12
xoshiroWidth() = Val(8)
13,752,421✔
13
# Simd threshold. Influences random stream.
14
simdThreshold(::Type{T}) where T = 64
13,748,378✔
15
simdThreshold(::Type{Bool}) = 640
4,043✔
16

17
@inline _rotl45(x::UInt64) = (x<<45)|(x>>19)
52,261,206✔
18
@inline _shl17(x::UInt64) = x<<17
52,261,206✔
19
@inline _rotl23(x::UInt64) = (x<<23)|(x>>41)
52,261,206✔
20
@inline _plus(x::UInt64,y::UInt64) = x+y
104,522,412✔
21
@inline _xor(x::UInt64,y::UInt64) = xor(x,y)
261,306,030✔
22
@inline _and(x::UInt64, y::UInt64) = x & y
8,374✔
23
@inline _or(x::UInt64, y::UInt64) = x | y
×
24
@inline _lshr(x, y::Int32) = _lshr(x, y % Int64)
×
25
@inline _lshr(x::UInt64, y::Int64) = llvmcall("""
4,752✔
26
    %res = lshr i64 %0, %1
27
    ret i64 %res
28
    """,
29
    UInt64,
30
    Tuple{UInt64, Int64},
31
    x, y)
32

33
@inline _bits2float(x::UInt64, ::Type{Float64}) = reinterpret(UInt64, Float64(x >>> 11) * 0x1.0p-53)
7,009✔
34
@inline function _bits2float(x::UInt64, ::Type{Float32})
2,652✔
35
    #=
36
    # this implementation uses more high bits, but is harder to vectorize
37
    x = x >>> 16  # discard low 16 bits
38
    u = Float32(x >>> 24) * Float32(0x1.0p-24)
39
    l = Float32(x & 0x00ffffff) * Float32(0x1.0p-24)
40
    =#
41
    ui = (x>>>32) % UInt32
2,652✔
42
    li = x % UInt32
2,652✔
43
    u = Float32(ui >>> 8) * Float32(0x1.0p-24)
2,652✔
44
    l = Float32(li >>> 8) * Float32(0x1.0p-24)
2,652✔
45
    (UInt64(reinterpret(UInt32, u)) << 32) | UInt64(reinterpret(UInt32, l))
2,652✔
46
end
47

48
# required operations. These could be written more concisely with `ntuple`, but the compiler
49
# sometimes refuses to properly vectorize.
50
for N in [4,8,16]
51
    let code, s, fshl = "llvm.fshl.v$(N)i64",
52
        VT = :(NTuple{$N, VecElement{UInt64}})
53

54
        s = ntuple(_->VecElement(UInt64(45)), N)
×
55
        @eval @inline _rotl45(x::$VT) = ccall($fshl, llvmcall, $VT, ($VT, $VT, $VT), x, x, $s)
5,179,338✔
56

57
        s = ntuple(_->VecElement(UInt64(23)), N)
×
58
        @eval @inline _rotl23(x::$VT) = ccall($fshl, llvmcall, $VT, ($VT, $VT, $VT), x, x, $s)
5,179,338✔
59

60
        code = """
61
        %lshiftOp = shufflevector <1 x i64> <i64 17>, <1 x i64> undef, <$N x i32> zeroinitializer
62
        %res = shl <$N x i64> %0, %lshiftOp
63
        ret <$N x i64> %res
64
        """
65
        @eval @inline _shl17(x::$VT) = llvmcall($code, $VT, Tuple{$VT}, x)
5,179,338✔
66

67
        code = """
68
        %res = add <$N x i64> %1, %0
69
        ret <$N x i64> %res
70
        """
71
        @eval @inline _plus(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y)
10,358,676✔
72

73
        code = """
74
        %res = xor <$N x i64> %1, %0
75
        ret <$N x i64> %res
76
        """
77
        @eval @inline _xor(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y)
25,896,690✔
78

79
        code = """
80
        %res = and <$N x i64> %1, %0
81
        ret <$N x i64> %res
82
        """
83
        @eval @inline _and(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y)
392✔
84

85
        code = """
86
        %res = or <$N x i64> %1, %0
87
        ret <$N x i64> %res
88
        """
89
        @eval @inline _or(x::$VT, y::$VT) = llvmcall($code, $VT, Tuple{$VT, $VT}, x, y)
×
90

91
        code = """
92
        %tmp = insertelement <1 x i64> undef, i64 %1, i32 0
93
        %shift = shufflevector <1 x i64> %tmp, <1 x i64> %tmp, <$N x i32> zeroinitializer
94
        %res = lshr <$N x i64> %0, %shift
95
        ret <$N x i64> %res
96
        """
97
        @eval @inline _lshr(x::$VT, y::Int64) = llvmcall($code, $VT, Tuple{$VT, Int64}, x, y)
392✔
98

99
        code = """
100
        %shiftamt = shufflevector <1 x i64> <i64 11>, <1 x i64> undef, <$N x i32> zeroinitializer
101
        %sh = lshr <$N x i64> %0, %shiftamt
102
        %f = uitofp <$N x i64> %sh to <$N x double>
103
        %scale = shufflevector <1 x double> <double 0x3ca0000000000000>, <1 x double> undef, <$N x i32> zeroinitializer
104
        %m = fmul <$N x double> %f, %scale
105
        %i = bitcast <$N x double> %m to <$N x i64>
106
        ret <$N x i64> %i
107
        """
108
        @eval @inline _bits2float(x::$VT, ::Type{Float64}) = llvmcall($code, $VT, Tuple{$VT}, x)
4,542,410✔
109

110
        code = """
111
        %as32 = bitcast <$N x i64> %0 to <$(2N) x i32>
112
        %shiftamt = shufflevector <1 x i32> <i32 8>, <1 x i32> undef, <$(2N) x i32> zeroinitializer
113
        %sh = lshr <$(2N) x i32> %as32, %shiftamt
114
        %f = uitofp <$(2N) x i32> %sh to <$(2N) x float>
115
        %scale = shufflevector <1 x float> <float 0x3e70000000000000>, <1 x float> undef, <$(2N) x i32> zeroinitializer
116
        %m = fmul <$(2N) x float> %f, %scale
117
        %i = bitcast <$(2N) x float> %m to <$N x i64>
118
        ret <$N x i64> %i
119
        """
120
        @eval @inline _bits2float(x::$VT, ::Type{Float32}) = llvmcall($code, $VT, Tuple{$VT}, x)
14,819✔
121
    end
122
end
123

124

125
function forkRand(rng::Union{TaskLocalRNG, Xoshiro}, ::Val{N}) where N
61,358✔
126
    # constants have nothing up their sleeve. For more discussion, cf rng_split in task.c
127
    # 0x02011ce34bce797f == hash(UInt(1))|0x01
128
    # 0x5a94851fb48a6e05 == hash(UInt(2))|0x01
129
    # 0x3688cf5d48899fa7 == hash(UInt(3))|0x01
130
    # 0x867b4bb4c42e5661 == hash(UInt(4))|0x01
131
    s0 = ntuple(i->VecElement(0x02011ce34bce797f * rand(rng, UInt64)), Val(N))
552,222✔
132
    s1 = ntuple(i->VecElement(0x5a94851fb48a6e05 * rand(rng, UInt64)), Val(N))
552,222✔
133
    s2 = ntuple(i->VecElement(0x3688cf5d48899fa7 * rand(rng, UInt64)), Val(N))
552,222✔
134
    s3 = ntuple(i->VecElement(0x867b4bb4c42e5661 * rand(rng, UInt64)), Val(N))
552,222✔
135
    (s0, s1, s2, s3)
61,358✔
136
end
137

138
_id(x, T) = x
52,867,226✔
139

140
@inline function xoshiro_bulk(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, T::Union{Type{UInt8}, Type{Bool}, Type{Float32}, Type{Float64}}, ::Val{N}, f::F = _id) where {N, F}
27,484,500✔
141
    if len >= simdThreshold(T)
27,484,500✔
142
        written = xoshiro_bulk_simd(rng, dst, len, T, Val(N), f)
61,358✔
143
        len -= written
61,358✔
144
        dst += written
61,358✔
145
    end
146
    if len != 0
13,752,421✔
147
        xoshiro_bulk_nosimd(rng, dst, len, T, f)
13,708,543✔
148
    end
149
    nothing
13,752,421✔
150
end
151

152
@noinline function xoshiro_bulk_nosimd(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, f::F) where {T, F}
13,704,504✔
153
    if rng isa TaskLocalRNG
13,704,504✔
154
        task = current_task()
13,704,504✔
155
        s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3
13,704,504✔
156
    else
157
        (; s0, s1, s2, s3) = rng::Xoshiro
×
158
    end
159

160
    i = 0
13,704,504✔
161
    while i+8 <= len
65,172,390✔
162
        res = _plus(_rotl23(_plus(s0,s3)),s0)
51,467,886✔
163
        unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), f(res, T))
51,467,886✔
164
        t = _shl17(s1)
51,467,886✔
165
        s2 = _xor(s2, s0)
51,467,886✔
166
        s3 = _xor(s3, s1)
51,467,886✔
167
        s1 = _xor(s1, s2)
51,467,886✔
168
        s0 = _xor(s0, s3)
51,467,886✔
169
        s2 = _xor(s2, t)
51,467,886✔
170
        s3 = _rotl45(s3)
51,467,886✔
171
        i += 8
51,467,886✔
172
    end
51,467,886✔
173
    if i < len
13,704,504✔
174
        res = _plus(_rotl23(_plus(s0,s3)),s0)
786,941✔
175
        t = _shl17(s1)
786,941✔
176
        s2 = _xor(s2, s0)
786,941✔
177
        s3 = _xor(s3, s1)
786,941✔
178
        s1 = _xor(s1, s2)
786,941✔
179
        s0 = _xor(s0, s3)
786,941✔
180
        s2 = _xor(s2, t)
786,941✔
181
        s3 = _rotl45(s3)
786,941✔
182
        ref = Ref(f(res, T))
786,941✔
183
        # TODO: This may make the random-stream dependent on system endianness
184
        GC.@preserve ref memcpy(dst+i, Base.unsafe_convert(Ptr{Cvoid}, ref), len-i)
786,941✔
185
    end
186
    if rng isa TaskLocalRNG
13,704,504✔
187
        task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3
13,704,504✔
188
    else
189
       rng.s0, rng.s1, rng.s2, rng.s3 =  s0, s1, s2, s3
×
190
    end
191
    nothing
13,704,504✔
192
end
193

194
@noinline function xoshiro_bulk_nosimd(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, f)
4,039✔
195
    if rng isa TaskLocalRNG
4,039✔
196
        task = current_task()
4,039✔
197
        s0, s1, s2, s3 = task.rngState0, task.rngState1, task.rngState2, task.rngState3
4,039✔
198
    else
199
        (; s0, s1, s2, s3) = rng::Xoshiro
×
200
    end
201

202
    i = 0
4,039✔
203
    while i+8 <= len
6,796✔
204
        res = _plus(_rotl23(_plus(s0,s3)),s0)
2,757✔
205
        shift = 0
2,757✔
206
        while i+8 <= len && shift < 8
7,509✔
207
            resLoc = _and(_lshr(res, shift), 0x0101010101010101)
4,752✔
208
            unsafe_store!(reinterpret(Ptr{UInt64}, dst + i), resLoc)
4,752✔
209
            i += 8
4,752✔
210
            shift += 1
4,752✔
211
        end
4,752✔
212

213
        t = _shl17(s1)
2,757✔
214
        s2 = _xor(s2, s0)
2,757✔
215
        s3 = _xor(s3, s1)
2,757✔
216
        s1 = _xor(s1, s2)
2,757✔
217
        s0 = _xor(s0, s3)
2,757✔
218
        s2 = _xor(s2, t)
2,757✔
219
        s3 = _rotl45(s3)
2,757✔
220
    end
2,757✔
221
    if i < len
4,039✔
222
        # we may overgenerate some bytes here, if len mod 64 <= 56 and len mod 8 != 0
223
        res = _plus(_rotl23(_plus(s0,s3)),s0)
3,622✔
224
        resLoc = _and(res, 0x0101010101010101)
3,622✔
225
        ref = Ref(resLoc)
3,622✔
226
        GC.@preserve ref memcpy(dst+i, Base.unsafe_convert(Ptr{Cvoid}, ref), len-i)
3,622✔
227
        t = _shl17(s1)
3,622✔
228
        s2 = _xor(s2, s0)
3,622✔
229
        s3 = _xor(s3, s1)
3,622✔
230
        s1 = _xor(s1, s2)
3,622✔
231
        s0 = _xor(s0, s3)
3,622✔
232
        s2 = _xor(s2, t)
3,622✔
233
        s3 = _rotl45(s3)
3,622✔
234
    end
235
    if rng isa TaskLocalRNG
4,039✔
236
        task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3
4,039✔
237
    else
238
        rng.s0, rng.s1, rng.s2, rng.s3 = s0, s1, s2, s3
×
239
    end
240
    nothing
4,039✔
241
end
242

243

244
@noinline function xoshiro_bulk_simd(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}, f::F) where {T,N,F}
61,343✔
245
    s0, s1, s2, s3 = forkRand(rng, Val(N))
61,343✔
246

247
    i = 0
61,343✔
248
    while i + 8*N <= len
5,240,632✔
249
        res = _plus(_rotl23(_plus(s0,s3)),s0)
5,179,289✔
250
        t = _shl17(s1)
5,179,289✔
251
        s2 = _xor(s2, s0)
5,179,289✔
252
        s3 = _xor(s3, s1)
5,179,289✔
253
        s1 = _xor(s1, s2)
5,179,289✔
254
        s0 = _xor(s0, s3)
5,179,289✔
255
        s2 = _xor(s2, t)
5,179,289✔
256
        s3 = _rotl45(s3)
5,179,289✔
257
        unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i), f(res, T))
5,179,289✔
258
        i += 8*N
5,179,289✔
259
    end
5,179,289✔
260
    return i
61,343✔
261
end
262

263
@noinline function xoshiro_bulk_simd(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, ::Val{N}, f) where {N}
15✔
264
    s0, s1, s2, s3 = forkRand(rng, Val(N))
15✔
265
    msk = ntuple(i->VecElement(0x0101010101010101), Val(N))
135✔
266
    i = 0
15✔
267
    while i + 64*N <= len
64✔
268
        res = _plus(_rotl23(_plus(s0,s3)),s0)
49✔
269
        t = _shl17(s1)
49✔
270
        s2 = _xor(s2, s0)
49✔
271
        s3 = _xor(s3, s1)
49✔
272
        s1 = _xor(s1, s2)
49✔
273
        s0 = _xor(s0, s3)
49✔
274
        s2 = _xor(s2, t)
49✔
275
        s3 = _rotl45(s3)
49✔
276
        for k=0:7
49✔
277
            tmp = _lshr(res, k)
392✔
278
            toWrite = _and(tmp, msk)
392✔
279
            unsafe_store!(reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + i + k*N*8), toWrite)
392✔
280
        end
735✔
281
        i += 64*N
49✔
282
    end
49✔
283
    return i
15✔
284
end
285

286

287
function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Array{Float32}, ::SamplerTrivial{CloseOpen01{Float32}})
704✔
288
    GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*4, Float32, xoshiroWidth(), _bits2float)
704✔
289
    dst
704✔
290
end
291

292
function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Array{Float64}, ::SamplerTrivial{CloseOpen01{Float64}})
19,638✔
293
    GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*8, Float64, xoshiroWidth(), _bits2float)
19,638✔
294
    dst
19,638✔
295
end
296

297
for T in BitInteger_types
298
    @eval function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Union{Array{$T}, UnsafeView{$T}}, ::SamplerType{$T})
13,728,036✔
299
        GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst)*sizeof($T), UInt8, xoshiroWidth())
13,728,036✔
300
        dst
13,728,036✔
301
    end
302
end
303

304
function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Array{Bool}, ::SamplerType{Bool})
4,043✔
305
    GC.@preserve dst xoshiro_bulk(rng, convert(Ptr{UInt8}, pointer(dst)), length(dst), Bool, xoshiroWidth())
4,043✔
306
    dst
4,043✔
307
end
308

309
end # module
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc