• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37832

07 Jul 2024 09:07PM UTC coverage: 86.717% (-0.8%) from 87.5%
#37832

push

local

web-flow
add support for indexing in `@atomic` macro (#54707)

Following the discussion in #54642

Implemented:
- [x] `modifyindex_atomic!`, `swapindex_atomic!`, `replaceindex_atomic!`
for `GenericMemory`
- [x] `getindex_atomic`, `setindex_atomic!`, `setindexonce_atomic!` for
`GenericMemory`
- [x] add support for references in `@atomic` macros
- [x] add support for vararg indices in `@atomic` macros 
- [x] tests
- [x] update docstrings with example usage
- ~[ ] update Atomics section of the manual (?)~
- [x] news

@oscardssmith @vtjnash 

# New `@atomic` transformations implemented here:
```julia
julia> @macroexpand (@atomic a[i1,i2])
:(Base.getindex_atomic(a, :sequentially_consistent, i1, i2))

julia> @macroexpand (@atomic order a[i1,i2])
:(Base.getindex_atomic(a, order, i1, i2))

julia> @macroexpand (@atomic a[i1,i2] = 2.0)
:(Base.setindex_atomic!(a, :sequentially_consistent, 2.0, i1, i2))

julia> @macroexpand (@atomic order a[i1,i2] = 2.0)
:(Base.setindex_atomic!(a, order, 2.0, i1, i2))

julia> @macroexpand (@atomicswap a[i1,i2] = 2.0)
:(Base.swapindex_atomic!(a, :sequentially_consistent, 2.0, i1, i2))

julia> @macroexpand (@atomicswap order a[i1,i2] = 2.0)
:(Base.swapindex_atomic!(a, order, 2.0, i1, i2))

julia> @macroexpand (@atomic a[i1,i2] += 2.0)
:((Base.modifyindex_atomic!(a, :sequentially_consistent, +, 2.0, i1, i2))[2])

julia> @macroexpand (@atomic order a[i1,i2] += 2.0)
:((Base.modifyindex_atomic!(a, order, +, 2.0, i1, i2))[2])

julia> @macroexpand (@atomiconce a[i1,i2] = 2.0)
:(Base.setindexonce_atomic!(a, :sequentially_consistent, :sequentially_consistent, 2.0, i1, i2))

julia> @macroexpand (@atomiconce o1 o2 a[i1,i2] = 2.0)
:(Base.setindexonce_atomic!(a, o1, o2, 2.0, i1, i2))

julia> @macroexpand (@atomicreplace a[i1,i2] (2.0=>3.0))
:(Base.replaceindex_atomic!(a, :sequentially_consistent, :sequentially_consistent, 2.0, 3.0, i1, i2))

julia> @macroexpand (@atomicreplace o1 o2 a[i1,i2] (2.0=>3.0)... (continued)

66 of 73 new or added lines in 3 files covered. (90.41%)

879 existing lines in 26 files now uncovered.

76630 of 88368 relevant lines covered (86.72%)

15209469.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.01
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
9
    string::AbstractString
57✔
10
    index::Integer
11
end
12
# Throw a `StringIndexError` for string `s` at index `i` (converted to `Int`).
# Marked `@noinline` so the throwing code is not inlined into callers.
@noinline function string_index_err(s::AbstractString, i::Integer)
    throw(StringIndexError(s, Int(i)))
end
14
# Print a `StringIndexError` to `io`. The message always names the invalid index;
# when that index lies within the code units of the string, the closest valid
# character index at or before it (and the following one, if there is one) is
# reported together with the escaped character found there.
function Base.showerror(io::IO, exc::StringIndexError)
    str = exc.string
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
    # Outside the code-unit range there is no nearby character to suggest.
    firstindex(str) <= exc.index <= ncodeunits(str) || return nothing
    iprev = thisind(str, exc.index)
    inext = nextind(str, iprev)
    escprev = escape_string(str[iprev:iprev])
    if inext > ncodeunits(str)
        # `iprev` starts the last character, so only one index can be offered.
        print(io, ", valid nearby index [$iprev]=>'$escprev'")
    else
        escnext = escape_string(str[inext:inext])
        print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
    end
    return nothing
end
29

30
# Inclusive range test `lo ≤ b ≤ hi` for integers of one common type. Both
# comparisons are always evaluated and combined with `&` (no short-circuit).
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    return (b ≥ lo) & (hi ≥ b)
end
311,855,795✔
31

32
"""
33
    String <: AbstractString
34

35
The default string type in Julia, used by e.g. string literals.
36

37
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
38
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
39
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
40
that the underlying byte sequence is valid as UTF-8.
41
"""
42
String
43

44
## constructors and conversions ##
45

46
# String constructor docstring from boot.jl, workaround for #16730
47
# and the unavailability of @doc in boot.jl context.
48
"""
49
    String(v::AbstractVector{UInt8})
50

51
Create a new `String` object using the data buffer from byte vector `v`.
52
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
53
modification of `v` cannot affect the contents of the resulting string.
54
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
55
`AbstractVector` types, `String(v)` already makes a copy.
56

57
When possible, the memory of `v` will be used without copying when the `String`
58
object is created. This is guaranteed to be the case for byte vectors returned
59
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
60
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
61
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
62
to guarantee consistent behavior.
63
"""
64
String(v::AbstractVector{UInt8}) = String(copyto!(StringMemory(length(v)), v))
5,225,710✔
65
function String(v::Memory{UInt8})
328✔
66
    len = length(v)
6,268,173✔
67
    len == 0 && return ""
6,268,173✔
68
    return ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), v, len)
6,268,122✔
69
end
70
function String(v::Vector{UInt8})
5,864✔
71
    #return ccall(:jl_array_to_string, Ref{String}, (Any,), v)
72
    len = length(v)
9,536,008✔
73
    len == 0 && return ""
9,536,008✔
74
    ref = v.ref
8,561,738✔
75
    if ref.ptr_or_offset == ref.mem.ptr
8,561,741✔
76
        str = ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), ref.mem, len)
8,561,740✔
77
    else
78
        str = ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), ref, len)
1✔
79
    end
80
    # optimized empty!(v); sizehint!(v, 0) calls
81
    setfield!(v, :size, (0,))
8,561,741✔
82
    setfield!(v, :ref, memoryref(Memory{UInt8}()))
8,561,738✔
83
    return str
8,561,741✔
84
end
85

86
"""
87
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
88

89
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
90
(The pointer can be safely freed afterwards.) If `length` is specified
91
(the length of the data in bytes), the string does not have to be NUL-terminated.
92

93
This function is labeled "unsafe" because it will crash if `p` is not
94
a valid memory address to data of the requested length.
95
"""
96
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
54✔
97
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
947,627✔
98
    ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
947,626✔
99
end
100
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
1,080✔
101
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
6,916,998✔
102
    ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
6,916,997✔
103
end
104

105
# This is @assume_effects :effect_free :nothrow :terminates_globally @ccall jl_alloc_string(n::Csize_t)::Ref{String},
106
# but the macro is not available at this time in bootstrap, so we write it manually.
107
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String}, Expr(:call, Expr(:core, :svec), :Csize_t), 1, QuoteNode((:ccall,0x000e)), :(convert(Csize_t, n))))
188,196,215✔
108

109
"""
110
    String(s::AbstractString)
111

112
Create a new `String` from an existing `AbstractString`.
113
"""
114
String(s::AbstractString) = print_to_string(s)
461✔
115
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
5,087,711✔
116

117
unsafe_wrap(::Type{Memory{UInt8}}, s::String) = ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
20,967,041✔
118
function unsafe_wrap(::Type{Vector{UInt8}}, s::String)
3✔
119
    mem = unsafe_wrap(Memory{UInt8}, s)
61,258✔
120
    view(mem, eachindex(mem))
122,076✔
121
end
122

123
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
37,539✔
124
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
37,526✔
125
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
126

127
String(s::CodeUnits{UInt8,String}) = s.s
3✔
128

129
## low-level functions ##
130

131
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
803,647,072✔
132
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
435,627,744✔
133

134
ncodeunits(s::String) = Core.sizeof(s)
864,482,903✔
135
codeunit(s::String) = UInt8
×
136

137
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
×
138
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
509✔
139
    @boundscheck checkbounds(s, i)
406,992,414✔
140
    b = GC.@preserve s unsafe_load(pointer(s, i))
406,992,406✔
141
    return b
406,992,406✔
142
end
143

144
## comparison ##
145

146
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
5,073,260✔
147

148
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
6,041,843✔
149
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
150
    GC.@preserve a b begin
10,194,476✔
151
        pa = unsafe_convert(Ptr{UInt8}, a)
9,701,978✔
152
        pb = unsafe_convert(Ptr{UInt8}, b)
10,194,476✔
153
        memcmp(pa, pb, len % Csize_t) % Int
10,194,476✔
154
    end
155
end
156

157
function cmp(a::String, b::String)
1✔
158
    al, bl = sizeof(a), sizeof(b)
5,073,260✔
159
    c = _memcmp(a, b)
5,073,260✔
160
    return c < 0 ? -1 : c > 0 ? +1 : cmp(al,bl)
9,642,937✔
161
end
162

163
==(a::String, b::String) = a===b
12,512,516✔
164

165
typemin(::Type{String}) = ""
1✔
166
typemin(::String) = typemin(String)
1✔
167

168
## thisind, nextind ##
169

170
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
249,908,517✔
171

172
# s should be String or SubString{String}
173
@inline function _thisind_str(s, i::Int)
174
    i == 0 && return 0
128,877,306✔
175
    n = ncodeunits(s)
129,056,081✔
176
    i == n + 1 && return i
129,056,081✔
177
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
129,033,117✔
178
    @inbounds b = codeunit(s, i)
129,033,093✔
179
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
235,994,949✔
180
    (@noinline function _thisind_continued(s, i, n) # mark the rest of the function as a slow-path
43,778,778✔
181
        local b
×
182
        @inbounds b = codeunit(s, i-1)
21,889,390✔
183
        between(b, 0b11000000, 0b11110111) && return i-1
21,889,390✔
184
        (b & 0xc0 == 0x80) & (i-2 > 0) || return i
21,304,016✔
185
        @inbounds b = codeunit(s, i-2)
8,910,070✔
186
        between(b, 0b11100000, 0b11110111) && return i-2
8,910,070✔
187
        (b & 0xc0 == 0x80) & (i-3 > 0) || return i
9,657,142✔
188
        @inbounds b = codeunit(s, i-3)
1,842,066✔
189
        between(b, 0b11110000, 0b11110111) && return i-3
1,842,066✔
190
        return i
1,056,047✔
191
    end)(s, i, n)
192
end
193

194
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
49,208,448✔
195

196
# s should be String or SubString{String}
197
@inline function _nextind_str(s, i::Int)
198
    i == 0 && return 1
33,730,383✔
199
    n = ncodeunits(s)
33,302,016✔
200
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
33,302,028✔
201
    @inbounds l = codeunit(s, i)
33,302,004✔
202
    between(l, 0x80, 0xf7) || return i+1
65,695,300✔
203
    (@noinline function _nextind_continued(s, i, n, l) # mark the rest of the function as a slow-path
1,817,384✔
204
        if l < 0xc0
908,692✔
205
            # handle invalid codeunit index by scanning back to the start of this index
206
            # (which may be the same as this index)
207
            i′ = @inbounds thisind(s, i)
69,068✔
208
            i′ >= i && return i+1
34,534✔
209
            i = i′
×
210
            @inbounds l = codeunit(s, i)
17,495✔
211
            (l < 0x80) | (0xf8 ≤ l) && return i+1
17,495✔
212
            @assert l >= 0xc0
17,495✔
213
        end
214
        # first continuation byte
215
        (i += 1) > n && return i
891,653✔
216
        @inbounds b = codeunit(s, i)
890,895✔
217
        b & 0xc0 ≠ 0x80 && return i
890,895✔
218
        ((i += 1) > n) | (l < 0xe0) && return i
878,248✔
219
        # second continuation byte
220
        @inbounds b = codeunit(s, i)
863,307✔
221
        b & 0xc0 ≠ 0x80 && return i
863,307✔
222
        ((i += 1) > n) | (l < 0xf0) && return i
861,487✔
223
        # third continuation byte
224
        @inbounds b = codeunit(s, i)
279,638✔
225
        return ifelse(b & 0xc0 ≠ 0x80, i, i+1)
279,638✔
226
    end)(s, i, n, l)
227
end
228

229
## checking UTF-8 & ASCII validity ##
230
#=
231
    The UTF-8 Validation is performed by a shift based DFA.
232
    ┌───────────────────────────────────────────────────────────────────┐
233
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
234
    │                               ├────────3────────┐           │     │
235
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
236
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
237
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
238
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
239
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
240
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
241
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
242
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
243
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
244
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
245
    │                      │        │     ├─┤               │        │  │
246
    │                      │        └─4──►│6├─────1,9───────┘        │  │
247
    │          INVALID     │              └─┘                        │  │
248
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
249
    │          ┌▼───┴┐                                                  │
250
    │          │  2  ◄─── All undefined transitions result in state 2   │
251
    │          └─────┘                                                  │
252
    └───────────────────────────────────────────────────────────────────┘
253

254
        Validation States
255
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
256
                        If the DFA ends in this state the string is ASCII only
257
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
258
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
259
                    as seen by all 2s in that column of table below
260
            3 -> One valid continuation byte needed to return to state 1
261
        4,5,6 -> Two valid continuation bytes needed to return to state 1
262
        7,8,9 -> Three valid continuation bytes needed to return to state 1
263

264
                        Current State
265
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
266
                0 | 0  1  2  2  2  2  2  2  2  2
267
                1 | 2  2  2  1  3  2  3  2  4  4
268
                2 | 3  3  2  2  2  2  2  2  2  2
269
                3 | 4  4  2  2  2  2  2  2  2  2
270
                4 | 6  6  2  2  2  2  2  2  2  2
271
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
272
    Class       6 | 8  8  2  2  2  2  2  2  2  2
273
                7 | 2  2  2  1  3  3  2  4  4  2
274
                8 | 2  2  2  2  2  2  2  2  2  2
275
                9 | 2  2  2  1  3  2  3  4  4  2
276
               10 | 5  5  2  2  2  2  2  2  2  2
277
               11 | 7  7  2  2  2  2  2  2  2  2
278

279
           Shifts | 0  4 10 14 18 24  8 20 12 26
280

281
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
282
    the rows the correct shift was a result.
283

284
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
285
    the current state then masking the result with 0b11110 gives the shift for the new state
286

287

288
=#
289

290
# State type used by the UTF-8 DFA. A state is stored as its shift amount, so the
# next state is obtained by shifting a table row by the current state and masking.
const _UTF8DFAState = UInt32

# Table of 256 `_UTF8DFAState` values, one per byte value. Each entry is the
# shift-encoded transition row of that byte's character class: for every current
# state, the shift of the next state is stored at the bit offset given by the
# current state's own shift.
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
    num_classes = 12
    num_states = 10

    # These shifts were derived using a SMT solver
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]

    # Character class of every byte value, 16 bytes per row (0x00-0x0f first)
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]

    # These are the rows discussed in comments above:
    # one row per character class, one column per current state
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  2  4  4;
                     3  3  2  2  2  2  2  2  2  2;
                     4  4  2  2  2  2  2  2  2  2;
                     6  6  2  2  2  2  2  2  2  2;
                     9  9  2  2  2  2  2  2  2  2;
                     8  8  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  3  2  4  4  2;
                     2  2  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  4  4  2;
                     5  5  2  2  2  2  2  2  2  2;
                     7  7  2  2  2  2  2  2  2  2]

    # This converts the state_arrays into the shift encoded _UTF8DFAState
    class_row = zeros(_UTF8DFAState, num_classes)

    for i = 1:num_classes
        row = _UTF8DFAState(0)
        for j in 1:num_states
            # Calculate the shift required for the next state
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
            # Shift the next state into the position of the current state
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
        end
        class_row[i]=row
    end

    # Expand from one row per class to one row per byte value
    map(c->class_row[c+1],character_classes)
end
348

349

350
const _UTF8_DFA_ASCII = _UTF8DFAState(0) #This state represents the start and end of any valid string
351
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) #Shift of state 1: every character seen so far is complete and at least one of them was non-ASCII
352
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop
353

354
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
355
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
54,209✔
356

357
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
5,248✔
358
    for i = first:last
25,821✔
359
       @inbounds state = _utf_dfa_step(state, bytes[i])
54,209✔
360
    end
82,597✔
361
    return (state)
25,821✔
362
end
363

364
@inline function  _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU}
365
    n=first
10✔
366
    while n <= last - chunk_size
40✔
367
        _isascii(cu,n,n+chunk_size-1) || return n
30✔
368
        n += chunk_size
30✔
369
    end
30✔
370
    n= last-chunk_size+1
10✔
371
    _isascii(cu,n,last) || return n
10✔
372
    return nothing
10✔
373
end
374

375
##
376

377
# Classifications of string
378
    # 0: neither valid ASCII nor UTF-8
379
    # 1: valid ASCII
380
    # 2: valid UTF-8
381
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
24✔
382

383

384
function byte_string_classify(bytes::AbstractVector{UInt8})
24✔
385
    chunk_size = 1024
20,430✔
386
    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
20,430✔
387
    n = length(bytes)
20,430✔
388
    if n > chunk_threshold
20,430✔
389
        start = _find_nonascii_chunk(chunk_size,bytes,1,n)
10✔
390
        isnothing(start) && return 1
10✔
391
    else
392
        _isascii(bytes,1,n) && return 1
20,420✔
393
        start = 1
20,189✔
394
    end
395
    return _byte_string_classify_nonascii(bytes,start,n)
20,189✔
396
end
397

398
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
20,189✔
399
    chunk_size = 256
20,189✔
400

401
    start = first
20,189✔
402
    stop = min(last,first + chunk_size - 1)
20,189✔
403
    state = _UTF8_DFA_ACCEPT
20,189✔
404
    while start <= last
24,926✔
405
        # try to process ascii chunks
406
        while state == _UTF8_DFA_ACCEPT
20,189✔
407
            _isascii(bytes,start,stop) || break
20,189✔
408
            (start = start + chunk_size) <= last || break
×
409
            stop = min(last,stop + chunk_size)
×
410
        end
×
411
        # Process non ascii chunk
412
        state = _isvalid_utf8_dfa(state,bytes,start,stop)
48,577✔
413
        state == _UTF8_DFA_INVALID && return 0
20,189✔
414

415
        start = start + chunk_size
4,737✔
416
        stop = min(last,stop + chunk_size)
4,737✔
417
    end
4,737✔
418
    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
4,737✔
419
end
420

421
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
20,452✔
422
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
24✔
423

424
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
193✔
425

UNCOV
426
# True when the two high bits of `c` are `10`, i.e. `c & 0xc0` equals `0x80`.
is_valid_continuation(c) = (c & 0xc0) == 0x80
×
427

428
## required core functionality ##
429

430
@inline function iterate(s::String, i::Int=firstindex(s))
6,325✔
431
    (i % UInt) - 1 < ncodeunits(s) || return nothing
78,816,421✔
432
    b = @inbounds codeunit(s, i)
65,803,966✔
433
    u = UInt32(b) << 24
65,803,966✔
434
    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
130,606,518✔
435
    return @noinline iterate_continued(s, i, u)
1,001,414✔
436
end
437

438
# duck-type s so that external UTF-8 string packages like StringViews can hook in
439
# Multi-byte continuation of `iterate`. `u` carries the code unit found at index
# `i` in its top 8 bits. Up to three following code units of the form 10xxxxxx
# are merged into `u`; the result is the `Char` with those bits and the index at
# which scanning stopped.
function iterate_continued(s, i::Int, u::UInt32)
    # below 0xc0: no continuation bytes are consumed, just step past this unit
    u < 0xc0000000 && return reinterpret(Char, u), i + 1
    stop = ncodeunits(s)

    # first continuation byte
    i += 1
    i > stop && return reinterpret(Char, u), i
    @inbounds cu = codeunit(s, i)
    cu & 0xc0 == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(cu) << 16

    # second continuation byte (only read when the lead byte is ≥ 0xe0)
    i += 1
    (i > stop) | (u < 0xe0000000) && return reinterpret(Char, u), i
    @inbounds cu = codeunit(s, i)
    cu & 0xc0 == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(cu) << 8

    # third continuation byte (only read when the lead byte is ≥ 0xf0)
    i += 1
    (i > stop) | (u < 0xf0000000) && return reinterpret(Char, u), i
    @inbounds cu = codeunit(s, i)
    cu & 0xc0 == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(cu)
    return reinterpret(Char, u), i + 1
end
460

461
@propagate_inbounds function getindex(s::String, i::Int)
1,970✔
462
    b = codeunit(s, i)
17,772,816✔
463
    u = UInt32(b) << 24
17,772,816✔
464
    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
34,735,953✔
465
    return getindex_continued(s, i, u)
809,646✔
466
end
467

468
# duck-type s so that external UTF-8 string packages like StringViews can hook in
469
# Multi-byte continuation of `getindex`. `u` carries the code unit found at
# index `i` in its top 8 bits. Up to three following code units of the form
# 10xxxxxx are merged into `u`, and the `Char` with those bits is returned.
# When `u < 0xc0000000` and `i` is not a valid character index, a
# `StringIndexError` is raised through `string_index_err`.
function getindex_continued(s, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        (@inbounds isvalid(s, i)) || string_index_err(s, i)
        return reinterpret(Char, u)
    end
    stop = ncodeunits(s)

    i += 1
    i > stop && return reinterpret(Char, u)
    @inbounds cu = codeunit(s, i) # cont byte 1
    cu & 0xc0 == 0x80 || return reinterpret(Char, u)
    u |= UInt32(cu) << 16

    i += 1
    (i > stop) | (u < 0xe0000000) && return reinterpret(Char, u)
    @inbounds cu = codeunit(s, i) # cont byte 2
    cu & 0xc0 == 0x80 || return reinterpret(Char, u)
    u |= UInt32(cu) << 8

    i += 1
    (i > stop) | (u < 0xf0000000) && return reinterpret(Char, u)
    @inbounds cu = codeunit(s, i) # cont byte 3
    cu & 0xc0 == 0x80 || return reinterpret(Char, u)
    u |= UInt32(cu)
    return reinterpret(Char, u)
end
494

495
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
4✔
496

497
@inline function getindex(s::String, r::UnitRange{Int})
67,378✔
498
    isempty(r) && return ""
1,001,964✔
499
    i, j = first(r), last(r)
73,189✔
500
    @boundscheck begin
978,530✔
501
        checkbounds(s, r)
978,536✔
502
        @inbounds isvalid(s, i) || string_index_err(s, i)
976,117✔
503
        @inbounds isvalid(s, j) || string_index_err(s, j)
978,525✔
504
    end
505
    j = nextind(s, j) - 1
1,957,045✔
506
    n = j - i + 1
978,523✔
507
    ss = _string_n(n)
978,523✔
508
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
978,523✔
509
    return ss
978,523✔
510
end
511

512
# nothrow because we know the start and end indices are valid
513
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
853,014✔
514

515
# effects needed because @inbounds
516
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
5✔
517
    @boundscheck begin
269,538✔
518
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
269,538✔
519
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
269,542✔
520
    end
521
    j < i && return 0
269,534✔
522
    @inbounds i, k = thisind(s, i), i
466,804✔
523
    c = j - i + (i == k)
233,402✔
524
    @inbounds length_continued(s, i, j, c)
233,402✔
525
end
526

527
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
528
    i < n || return c
1,430,533✔
529
    b = codeunit(s, i)
742,299✔
530
    while true
3,345,340✔
531
        while true
47,164,627✔
532
            (i += 1) ≤ n || return c
47,906,164✔
533
            0xc0 ≤ b ≤ 0xf7 && break
46,423,090✔
534
            b = codeunit(s, i)
43,819,287✔
535
        end
43,819,287✔
536
        l = b
×
537
        b = codeunit(s, i) # cont byte 1
2,603,803✔
538
        c -= (x = b & 0xc0 == 0x80)
2,603,803✔
539
        x & (l ≥ 0xe0) || continue
2,603,803✔
540

541
        (i += 1) ≤ n || return c
2,209,532✔
542
        b = codeunit(s, i) # cont byte 2
2,208,020✔
543
        c -= (x = b & 0xc0 == 0x80)
2,208,020✔
544
        x & (l ≥ 0xf0) || continue
3,792,378✔
545

546
        (i += 1) ≤ n || return c
623,668✔
547
        b = codeunit(s, i) # cont byte 3
623,656✔
548
        c -= (b & 0xc0 == 0x80)
623,656✔
549
    end
2,603,041✔
550
end
551

552
## overload methods for efficiency ##
553

554
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
229,048,091✔
555

556
isascii(s::String) = isascii(codeunits(s))
675,545✔
557

558
# don't assume effects for general integers since we cannot know their implementation
559
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
311,322✔
560

561
"""
562
    repeat(c::AbstractChar, r::Integer) -> String
563

564
Repeat a character `r` times. This can equivalently be accomplished by calling
565
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
566

567
# Examples
568
```jldoctest
569
julia> repeat('A', 3)
570
"AAA"
571
```
572
"""
573
function repeat(c::AbstractChar, r::Integer)
311,155✔
574
    c = Char(c)::Char
311,380✔
575
    r == 0 && return ""
311,380✔
576
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
234,597✔
577
    u = bswap(reinterpret(UInt32, c))
234,593✔
578
    n = 4 - (leading_zeros(u | 0xff) >> 3)
234,593✔
579
    s = _string_n(n*r)
234,595✔
580
    p = pointer(s)
234,593✔
581
    GC.@preserve s if n == 1
234,593✔
582
        memset(p, u % UInt8, r)
234,475✔
583
    elseif n == 2
118✔
584
        p16 = reinterpret(Ptr{UInt16}, p)
6✔
585
        for i = 1:r
6✔
586
            unsafe_store!(p16, u % UInt16, i)
20✔
587
        end
25✔
588
    elseif n == 3
112✔
589
        b1 = (u >> 0) % UInt8
108✔
590
        b2 = (u >> 8) % UInt8
108✔
591
        b3 = (u >> 16) % UInt8
108✔
592
        for i = 0:r-1
108✔
593
            unsafe_store!(p, b1, 3i + 1)
580✔
594
            unsafe_store!(p, b2, 3i + 2)
580✔
595
            unsafe_store!(p, b3, 3i + 3)
580✔
596
        end
580✔
597
    elseif n == 4
4✔
598
        p32 = reinterpret(Ptr{UInt32}, p)
4✔
599
        for i = 1:r
4✔
600
            unsafe_store!(p32, u, i)
8✔
601
        end
234,601✔
602
    end
603
    return s
234,591✔
604
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc