• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37518

pending completion
#37518

push

local

web-flow
improve effects of  `objectid` and `getindex(::Dict)` (#49447)

This commit also marks `Module` type as `identityfree`.

Co-authored-by: Shuhei Kadowaki <aviatesk@gmail.com>

10 of 10 new or added lines in 2 files covered. (100.0%)

72092 of 83276 relevant lines covered (86.57%)

31697257.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.61
/stdlib/Random/src/XoshiroSimd.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
module XoshiroSimd
4
# Getting the xoshiro RNG to reliably vectorize is somewhat of a hassle without SIMD.jl.
5
import ..Random: TaskLocalRNG, rand, rand!, Xoshiro, CloseOpen01, UnsafeView,
6
                 SamplerType, SamplerTrivial
7
using Base: BitInteger_types
8
using Core.Intrinsics: llvmcall
9

10
# Vector-width. Influences random stream.
11
# Number of 64-bit lanes used by the bulk SIMD generator, lifted to the type
# domain so callers can dispatch on it. Changing it changes the random stream.
function xoshiroWidth()
    return Val(8)
end
1,746,350✔
12
# Simd threshold. Influences random stream.
13
# Default byte count at or above which `xoshiro_bulk` switches to the SIMD
# kernel. Applies to every element type without a more specific method.
function simdThreshold(::Type{T}) where {T}
    return 64
end
1,742,306✔
14
# `Bool` output uses a larger SIMD threshold (in bytes) than other types.
function simdThreshold(::Type{Bool})
    return 640
end
4,044✔
15

16
# Scalar rotate-left of a 64-bit word by 45 bit positions.
@inline function _rotl45(x::UInt64)
    return bitrotate(x, 45)
end
4,237,428✔
17
# Scalar logical shift-left of a 64-bit word by 17 bit positions.
@inline function _shl17(x::UInt64)
    return x << 17
end
4,237,428✔
18
# Scalar rotate-left of a 64-bit word by 23 bit positions.
@inline function _rotl23(x::UInt64)
    return bitrotate(x, 23)
end
4,237,428✔
19
# Scalar wrapping addition of two 64-bit words.
@inline function _plus(x::UInt64, y::UInt64)
    return x + y
end
8,474,856✔
20
# Scalar bitwise exclusive-or of two 64-bit words.
@inline function _xor(x::UInt64, y::UInt64)
    return x ⊻ y
end
21,187,140✔
21
# Scalar bitwise and of two 64-bit words.
@inline function _and(x::UInt64, y::UInt64)
    return x & y
end
8,407✔
22
# Scalar bitwise or of two 64-bit words.
@inline function _or(x::UInt64, y::UInt64)
    return x | y
end
×
23
# Accept a 32-bit shift count by widening it to `Int64` and forwarding to the
# `Int64` methods of `_lshr`.
@inline function _lshr(x, y::Int32)
    return _lshr(x, Int64(y))
end
×
24
# Scalar logical shift-right of `x` by `y` bits, emitted directly as an LLVM
# `lshr` instruction. The callers in this file only pass shift counts 0..7.
@inline function _lshr(x::UInt64, y::Int64)
    return llvmcall("""
        %res = lshr i64 %0, %1
        ret i64 %res
        """,
        UInt64,
        Tuple{UInt64, Int64},
        x, y)
end
31

32
# Turn 64 random bits into the bit pattern of a `Float64`: the top 53 bits
# are converted to a float and scaled by 2^-53, and the result is returned
# reinterpreted as `UInt64` so it can be stored like any other word.
@inline function _bits2float(x::UInt64, ::Type{Float64})
    top53 = x >>> 11
    return reinterpret(UInt64, Float64(top53) * 0x1.0p-53)
end
7,432✔
33
# Turn 64 random bits into the bit patterns of two packed `Float32` values.
# Each 32-bit half of `x` contributes its top 24 bits, scaled by 2^-24; the
# float built from the high half lands in the high 32 bits of the result.
@inline function _bits2float(x::UInt64, ::Type{Float32})
    # An alternative that uses more high bits, but is harder to vectorize:
    #     x = x >>> 16  # discard low 16 bits
    #     u = Float32(x >>> 24) * Float32(0x1.0p-24)
    #     l = Float32(x & 0x00ffffff) * Float32(0x1.0p-24)
    scale = Float32(0x1.0p-24)
    hi = Float32(((x >>> 32) % UInt32) >>> 8) * scale
    lo = Float32((x % UInt32) >>> 8) * scale
    hibits = UInt64(reinterpret(UInt32, hi)) << 32
    lobits = UInt64(reinterpret(UInt32, lo))
    return hibits | lobits
end
46

47
# required operations. These could be written more concisely with `ntuple`, but the compiler
48
# sometimes refuses to properly vectorize.
49
# Generate the vector primitives for 4-, 8- and 16-lane vectors of UInt64.
# Each method operates on `NTuple{N, VecElement{UInt64}}` and is defined with
# hand-written LLVM IR (or the `llvm.fshl` intrinsic for rotations).
for N in (4, 8, 16)
    let VT = :(NTuple{$N, VecElement{UInt64}}),
        fshl = "llvm.fshl.v$(N)i64",
        by45 = ntuple(_->VecElement(UInt64(45)), N),
        by23 = ntuple(_->VecElement(UInt64(23)), N)

        # Rotate-left is a funnel shift of `x` with itself.
        @eval @inline _rotl45(x::$VT) = ccall($fshl, llvmcall, $VT, ($VT, $VT, $VT), x, x, $by45)
        @eval @inline _rotl23(x::$VT) = ccall($fshl, llvmcall, $VT, ($VT, $VT, $VT), x, x, $by23)

        # Lane-wise shift-left by the constant 17.
        local shl17_ir = """
        %lshiftOp = shufflevector <1 x i64> <i64 17>, <1 x i64> undef, <$N x i32> zeroinitializer
        %res = shl <$N x i64> %0, %lshiftOp
        ret <$N x i64> %res
        """
        @eval @inline _shl17(x::$VT) = llvmcall($shl17_ir, $VT, Tuple{$VT}, x)

        # Lane-wise addition.
        local add_ir = """
        %res = add <$N x i64> %1, %0
        ret <$N x i64> %res
        """
        @eval @inline _plus(x::$VT, y::$VT) = llvmcall($add_ir, $VT, Tuple{$VT, $VT}, x, y)

        # Lane-wise exclusive-or.
        local xor_ir = """
        %res = xor <$N x i64> %1, %0
        ret <$N x i64> %res
        """
        @eval @inline _xor(x::$VT, y::$VT) = llvmcall($xor_ir, $VT, Tuple{$VT, $VT}, x, y)

        # Lane-wise and.
        local and_ir = """
        %res = and <$N x i64> %1, %0
        ret <$N x i64> %res
        """
        @eval @inline _and(x::$VT, y::$VT) = llvmcall($and_ir, $VT, Tuple{$VT, $VT}, x, y)

        # Lane-wise or.
        local or_ir = """
        %res = or <$N x i64> %1, %0
        ret <$N x i64> %res
        """
        @eval @inline _or(x::$VT, y::$VT) = llvmcall($or_ir, $VT, Tuple{$VT, $VT}, x, y)

        # Lane-wise logical shift-right by a scalar count that is broadcast
        # to every lane.
        local lshr_ir = """
        %tmp = insertelement <1 x i64> undef, i64 %1, i32 0
        %shift = shufflevector <1 x i64> %tmp, <1 x i64> %tmp, <$N x i32> zeroinitializer
        %res = lshr <$N x i64> %0, %shift
        ret <$N x i64> %res
        """
        @eval @inline _lshr(x::$VT, y::Int64) = llvmcall($lshr_ir, $VT, Tuple{$VT, Int64}, x, y)

        # Per lane: shift right by 11, convert to double, scale by the
        # constant 0x3ca0000000000000, and return the double's bit pattern.
        local f64_ir = """
        %shiftamt = shufflevector <1 x i64> <i64 11>, <1 x i64> undef, <$N x i32> zeroinitializer
        %sh = lshr <$N x i64> %0, %shiftamt
        %f = uitofp <$N x i64> %sh to <$N x double>
        %scale = shufflevector <1 x double> <double 0x3ca0000000000000>, <1 x double> undef, <$N x i32> zeroinitializer
        %m = fmul <$N x double> %f, %scale
        %i = bitcast <$N x double> %m to <$N x i64>
        ret <$N x i64> %i
        """
        @eval @inline _bits2float(x::$VT, ::Type{Float64}) = llvmcall($f64_ir, $VT, Tuple{$VT}, x)

        # View the vector as 2N 32-bit lanes; per lane: shift right by 8,
        # convert to float, scale, and return the packed bit patterns.
        local f32_ir = """
        %as32 = bitcast <$N x i64> %0 to <$(2N) x i32>
        %shiftamt = shufflevector <1 x i32> <i32 8>, <1 x i32> undef, <$(2N) x i32> zeroinitializer
        %sh = lshr <$(2N) x i32> %as32, %shiftamt
        %f = uitofp <$(2N) x i32> %sh to <$(2N) x float>
        %scale = shufflevector <1 x float> <float 0x3e70000000000000>, <1 x float> undef, <$(2N) x i32> zeroinitializer
        %m = fmul <$(2N) x float> %f, %scale
        %i = bitcast <$(2N) x float> %m to <$N x i64>
        ret <$N x i64> %i
        """
        @eval @inline _bits2float(x::$VT, ::Type{Float32}) = llvmcall($f32_ir, $VT, Tuple{$VT}, x)
    end
end
122

123

124
# Derive the four N-lane state vectors for the SIMD generator from `rng`.
# Every lane of every vector is one fresh `rand(rng, UInt64)` draw multiplied
# by a fixed odd constant; the draws for s0 come first, then s1, s2 and s3.
function forkRand(rng::Union{TaskLocalRNG, Xoshiro}, ::Val{N}) where N
    # constants have nothing up their sleeve. For more discussion, cf rng_split in task.c
    # 0x02011ce34bce797f == hash(UInt(1))|0x01
    # 0x5a94851fb48a6e05 == hash(UInt(2))|0x01
    # 0x3688cf5d48899fa7 == hash(UInt(3))|0x01
    # 0x867b4bb4c42e5661 == hash(UInt(4))|0x01
    lanes0 = ntuple(_ -> VecElement(0x02011ce34bce797f * rand(rng, UInt64)), Val(N))
    lanes1 = ntuple(_ -> VecElement(0x5a94851fb48a6e05 * rand(rng, UInt64)), Val(N))
    lanes2 = ntuple(_ -> VecElement(0x3688cf5d48899fa7 * rand(rng, UInt64)), Val(N))
    lanes3 = ntuple(_ -> VecElement(0x867b4bb4c42e5661 * rand(rng, UInt64)), Val(N))
    return (lanes0, lanes1, lanes2, lanes3)
end
136

137
# Identity post-processing hook: returns the raw bits unchanged and ignores
# the requested output type `T`. Used as the default `f` of `xoshiro_bulk`.
function _id(x, T)
    return x
end
4,852,136✔
138

139
# Fill `len` bytes starting at `dst` with random data for output type `T`.
# If `len` reaches `simdThreshold(T)`, the SIMD kernel runs first and reports
# how many bytes it wrote; any remaining bytes are produced by the scalar
# kernel. `f(bits, T)` post-processes each generated word before it is stored.
# Always returns `nothing`.
@inline function xoshiro_bulk(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, T::Union{Type{UInt8}, Type{Bool}, Type{Float32}, Type{Float64}}, ::Val{N}, f::F = _id) where {N, F}
    remaining = len
    cursor = dst
    if remaining >= simdThreshold(T)
        done = xoshiro_bulk_simd(rng, cursor, remaining, T, Val(N), f)
        remaining -= done
        cursor += done
    end
    if remaining != 0
        xoshiro_bulk_nosimd(rng, cursor, remaining, T, f)
    end
    return nothing
end
150

151
# Scalar bulk kernel: writes `len` bytes at `dst`, one 64-bit word per
# generator step, advancing the state of `rng` itself (the task-local state
# for `TaskLocalRNG`, the fields s0..s3 for `Xoshiro`). Each word is passed
# through `f(word, T)` before being stored. Returns `nothing`.
@noinline function xoshiro_bulk_nosimd(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, f::F) where {T, F}
    # Pull the four state words into locals.
    if rng isa TaskLocalRNG
        task = current_task()
        a, b, c, d = task.rngState0, task.rngState1, task.rngState2, task.rngState3
    else
        xo = rng::Xoshiro
        a, b, c, d = xo.s0, xo.s1, xo.s2, xo.s3
    end

    # Whole 8-byte words.
    pos = 0
    while pos+8 <= len
        word = _plus(_rotl23(_plus(a, d)), a)
        unsafe_store!(reinterpret(Ptr{UInt64}, dst + pos), f(word, T))
        tmp = _shl17(b)
        c = _xor(c, a)
        d = _xor(d, b)
        b = _xor(b, c)
        a = _xor(a, d)
        c = _xor(c, tmp)
        d = _rotl45(d)
        pos += 8
    end

    # Trailing 1..7 bytes: generate one more word and copy only its first
    # `len - pos` bytes.
    if pos < len
        word = _plus(_rotl23(_plus(a, d)), a)
        tmp = _shl17(b)
        c = _xor(c, a)
        d = _xor(d, b)
        b = _xor(b, c)
        a = _xor(a, d)
        c = _xor(c, tmp)
        d = _rotl45(d)
        boxed = Ref(f(word, T))
        # TODO: This may make the random-stream dependent on system endianness
        ccall(:memcpy, Ptr{Cvoid}, (Ptr{UInt8}, Ptr{UInt64}, Csize_t), dst+pos, boxed, len-pos)
    end

    # Write the advanced state back to where it came from.
    if rng isa TaskLocalRNG
        task.rngState0, task.rngState1, task.rngState2, task.rngState3 = a, b, c, d
    else
        rng.s0, rng.s1, rng.s2, rng.s3 = a, b, c, d
    end
    return nothing
end
192

193
# Scalar bulk kernel for `Bool` output: every byte written is 0x00 or 0x01.
# One generated word yields up to eight 8-byte stores, by shifting it right
# by 0..7 bits and masking with 0x0101010101010101. Advances the state of
# `rng` itself; `f` is accepted for signature compatibility and not used.
# Returns `nothing`.
@noinline function xoshiro_bulk_nosimd(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, f)
    # Pull the four state words into locals.
    if rng isa TaskLocalRNG
        task = current_task()
        a, b, c, d = task.rngState0, task.rngState1, task.rngState2, task.rngState3
    else
        xo = rng::Xoshiro
        a, b, c, d = xo.s0, xo.s1, xo.s2, xo.s3
    end

    pos = 0
    while pos+8 <= len
        word = _plus(_rotl23(_plus(a, d)), a)
        # Reuse this word for up to eight consecutive 8-byte stores.
        bitidx = 0
        while pos+8 <= len && bitidx < 8
            bools = _and(_lshr(word, bitidx), 0x0101010101010101)
            unsafe_store!(reinterpret(Ptr{UInt64}, dst + pos), bools)
            pos += 8
            bitidx += 1
        end

        tmp = _shl17(b)
        c = _xor(c, a)
        d = _xor(d, b)
        b = _xor(b, c)
        a = _xor(a, d)
        c = _xor(c, tmp)
        d = _rotl45(d)
    end

    if pos < len
        # we may overgenerate some bytes here, if len mod 64 <= 56 and len mod 8 != 0
        word = _plus(_rotl23(_plus(a, d)), a)
        bools = _and(word, 0x0101010101010101)
        boxed = Ref(bools)
        ccall(:memcpy, Ptr{Cvoid}, (Ptr{UInt8}, Ptr{UInt64}, Csize_t), dst+pos, boxed, len-pos)
        tmp = _shl17(b)
        c = _xor(c, a)
        d = _xor(d, b)
        b = _xor(b, c)
        a = _xor(a, d)
        c = _xor(c, tmp)
        d = _rotl45(d)
    end

    # Write the advanced state back to where it came from.
    if rng isa TaskLocalRNG
        task.rngState0, task.rngState1, task.rngState2, task.rngState3 = a, b, c, d
    else
        rng.s0, rng.s1, rng.s2, rng.s3 = a, b, c, d
    end
    return nothing
end
241

242

243
# SIMD bulk kernel: runs N generator lanes in lock-step, seeded from `rng`
# via `forkRand`, and stores one N-word vector (8*N bytes) per step after
# passing it through `f(vec, T)`. Only whole vectors are written; the return
# value is the number of bytes written, which the caller uses to finish the
# remainder with the scalar kernel.
@noinline function xoshiro_bulk_simd(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{T}, ::Val{N}, f::F) where {T,N,F}
    a, b, c, d = forkRand(rng, Val(N))

    written = 0
    while written + 8*N <= len
        vec = _plus(_rotl23(_plus(a, d)), a)
        tmp = _shl17(b)
        c = _xor(c, a)
        d = _xor(d, b)
        b = _xor(b, c)
        a = _xor(a, d)
        c = _xor(c, tmp)
        d = _rotl45(d)
        out = reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + written)
        unsafe_store!(out, f(vec, T))
        written += 8*N
    end
    return written
end
261

262
# SIMD bulk kernel for `Bool` output. Each step generates one N-word vector
# and expands it into eight vector stores (64*N bytes) by shifting right by
# 0..7 bits and masking every byte down to its lowest bit. Only whole 64*N
# byte groups are written; returns the number of bytes written. `f` is
# accepted for signature compatibility and not used.
@noinline function xoshiro_bulk_simd(rng::Union{TaskLocalRNG, Xoshiro}, dst::Ptr{UInt8}, len::Int, ::Type{Bool}, ::Val{N}, f) where {N}
    a, b, c, d = forkRand(rng, Val(N))
    lowbits = ntuple(_ -> VecElement(0x0101010101010101), Val(N))
    written = 0
    while written + 64*N <= len
        vec = _plus(_rotl23(_plus(a, d)), a)
        tmp = _shl17(b)
        c = _xor(c, a)
        d = _xor(d, b)
        b = _xor(b, c)
        a = _xor(a, d)
        c = _xor(c, tmp)
        d = _rotl45(d)
        for k in 0:7
            shifted = _lshr(vec, k)
            bools = _and(shifted, lowbits)
            out = reinterpret(Ptr{NTuple{N,VecElement{UInt64}}}, dst + written + k*N*8)
            unsafe_store!(out, bools)
        end
        written += 64*N
    end
    return written
end
284

285

286
# Fill a `Float32` array in place through the bulk generator, converting
# each generated word with `_bits2float`. Returns `dst`.
function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Array{Float32}, ::SamplerTrivial{CloseOpen01{Float32}})
    nbytes = length(dst) * sizeof(Float32)
    GC.@preserve dst begin
        ptr = convert(Ptr{UInt8}, pointer(dst))
        xoshiro_bulk(rng, ptr, nbytes, Float32, xoshiroWidth(), _bits2float)
    end
    return dst
end
290

291
# Fill a `Float64` array in place through the bulk generator, converting
# each generated word with `_bits2float`. Returns `dst`.
function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Array{Float64}, ::SamplerTrivial{CloseOpen01{Float64}})
    nbytes = length(dst) * sizeof(Float64)
    GC.@preserve dst begin
        ptr = convert(Ptr{UInt8}, pointer(dst))
        xoshiro_bulk(rng, ptr, nbytes, Float64, xoshiroWidth(), _bits2float)
    end
    return dst
end
295

296
# Define `rand!` for every bit-integer element type. The destination is
# treated as raw bytes (`UInt8`) and filled with unmodified generator output.
for T in BitInteger_types
    @eval function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Union{Array{$T}, UnsafeView{$T}}, ::SamplerType{$T})
        nbytes = length(dst) * sizeof($T)
        GC.@preserve dst begin
            ptr = convert(Ptr{UInt8}, pointer(dst))
            xoshiro_bulk(rng, ptr, nbytes, UInt8, xoshiroWidth())
        end
        return dst
    end
end
302

303
# Fill a `Bool` array in place through the bulk generator's `Bool` kernels,
# one byte per element. Returns `dst`.
function rand!(rng::Union{TaskLocalRNG, Xoshiro}, dst::Array{Bool}, ::SamplerType{Bool})
    nbytes = length(dst)
    GC.@preserve dst begin
        ptr = convert(Ptr{UInt8}, pointer(dst))
        xoshiro_bulk(rng, ptr, nbytes, Bool, xoshiroWidth())
    end
    return dst
end
307

308
end # module
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc