• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37999

02 Feb 2025 07:22AM UTC coverage: 17.218% (-8.3%) from 25.515%
#37999

push

local

web-flow
bpart: Start tracking backedges for bindings (#57213)

This PR adds limited backedge support for Bindings. There are two
classes of bindings that get backedges:

1. Cross-module `GlobalRef` bindings (new in this PR)
2. Any globals accesses through intrinsics (i.e. those with forward
edges from #57009)

This is a time/space trade-off for invalidation. As a result of the
first category, invalidating a binding now only needs to scan all the
methods defined in the same module as the binding. At the same time, it
is anticipated that most binding references are to bindings in the same
module, keeping the list of bindings that need explicit (back)edges
small.

7 of 30 new or added lines in 3 files covered. (23.33%)

4235 existing lines in 124 files now uncovered.

7882 of 45779 relevant lines covered (17.22%)

98289.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

29.83
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
9
    string::AbstractString
10
    index::Integer
11
end
12
@noinline string_index_err(s::AbstractString, i::Integer) =
×
13
    throw(StringIndexError(s, Int(i)))
14
function Base.showerror(io::IO, exc::StringIndexError)
×
15
    s = exc.string
×
16
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
×
17
    if firstindex(s) <= exc.index <= ncodeunits(s)
×
18
        iprev = thisind(s, exc.index)
×
19
        inext = nextind(s, iprev)
×
20
        escprev = escape_string(s[iprev:iprev])
×
21
        if inext <= ncodeunits(s)
×
22
            escnext = escape_string(s[inext:inext])
×
23
            print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
×
24
        else
25
            print(io, ", valid nearby index [$iprev]=>'$escprev'")
×
26
        end
27
    end
28
end
29

30
@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo ≤ b) & (b ≤ hi)
1,469,910✔
31

32
"""
33
    String <: AbstractString
34

35
The default string type in Julia, used by e.g. string literals.
36

37
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
38
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
39
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
40
that the underlying byte sequence is valid as UTF-8.
41
"""
42
String
43

44
## constructors and conversions ##
45

46
# String constructor docstring from boot.jl, workaround for #16730
47
# and the unavailability of @doc in boot.jl context.
48
"""
49
    String(v::AbstractVector{UInt8})
50

51
Create a new `String` object using the data buffer from byte vector `v`.
52
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
53
modification of `v` cannot affect the contents of the resulting string.
54
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
55
`AbstractVector` types, `String(v)` already makes a copy.
56

57
When possible, the memory of `v` will be used without copying when the `String`
58
object is created. This is guaranteed to be the case for byte vectors returned
59
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
60
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
61
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
62
to guarantee consistent behavior.
63
"""
64
String(v::AbstractVector{UInt8}) = unsafe_takestring(copyto!(StringMemory(length(v)), v))
21✔
65
function String(v::Vector{UInt8})
66
    #return ccall(:jl_array_to_string, Ref{String}, (Any,), v)
67
    len = length(v)
560,802✔
68
    len == 0 && return ""
560,802✔
69
    ref = v.ref
303,314✔
70
    if ref.ptr_or_offset == ref.mem.ptr
308,357✔
71
        str = ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), ref.mem, len)
308,357✔
72
    else
73
        str = ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), ref, len)
×
74
    end
75
    # optimized empty!(v); sizehint!(v, 0) calls
76
    setfield!(v, :size, (0,))
308,357✔
77
    setfield!(v, :ref, memoryref(Memory{UInt8}()))
303,314✔
78
    return str
308,357✔
79
end
80

81
"Create a string re-using the memory, if possible.
82
Mutating or reading the memory after calling this function is undefined behaviour."
83
function unsafe_takestring(m::Memory{UInt8})
84
    isempty(m) ? "" : ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), m, length(m))
122,102✔
85
end
86

87
"""
88
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
89

90
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
91
(The pointer can be safely freed afterwards.) If `length` is specified
92
(the length of the data in bytes), the string does not have to be NUL-terminated.
93

94
This function is labeled "unsafe" because it will crash if `p` is not
95
a valid memory address to data of the requested length.
96
"""
97
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
98
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
229,851✔
99
    ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
229,851✔
100
end
101
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
102
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
584✔
103
    ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
584✔
104
end
105

106
# This is `@assume_effects :total !:consistent @ccall jl_alloc_string(n::Csize_t)::Ref{String}`,
107
# but the macro is not available at this time in bootstrap, so we write it manually.
108
const _string_n_override = 0x04ee
109
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String},
98,052✔
110
    :(Core.svec(Csize_t)), 1, QuoteNode((:ccall, _string_n_override)), :(convert(Csize_t, n))))
111

112
"""
113
    String(s::AbstractString)
114

115
Create a new `String` from an existing `AbstractString`.
116
"""
117
String(s::AbstractString) = print_to_string(s)
×
118
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
497✔
119

120
unsafe_wrap(::Type{Memory{UInt8}}, s::String) = ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
68,210✔
UNCOV
121
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = wrap(Array, unsafe_wrap(Memory{UInt8}, s))
×
122

123
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
16,796✔
124
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
16,796✔
125
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
126

127
String(s::CodeUnits{UInt8,String}) = s.s
×
128

129
## low-level functions ##
130

131
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
5,538,202✔
132
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
4,954,430✔
133

134
ncodeunits(s::String) = Core.sizeof(s)
5,935,843✔
135
codeunit(s::String) = UInt8
×
136

137
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
×
138
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
139
    @boundscheck checkbounds(s, i)
4,667,338✔
140
    b = GC.@preserve s unsafe_load(pointer(s, i))
4,667,338✔
141
    return b
4,667,338✔
142
end
143

144
## comparison ##
145

146
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
178,611✔
147

148
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
480,235✔
149
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
150
    GC.@preserve a b begin
480,235✔
151
        pa = unsafe_convert(Ptr{UInt8}, a)
480,235✔
152
        pb = unsafe_convert(Ptr{UInt8}, b)
480,235✔
153
        memcmp(pa, pb, len % Csize_t) % Int
480,235✔
154
    end
155
end
156

157
function cmp(a::String, b::String)
158
    al, bl = sizeof(a), sizeof(b)
178,611✔
159
    c = _memcmp(a, b)
178,611✔
160
    return c < 0 ? -1 : c > 0 ? +1 : cmp(al,bl)
243,572✔
161
end
162

163
==(a::String, b::String) = a===b
241,607✔
164

165
typemin(::Type{String}) = ""
×
166
typemin(::String) = typemin(String)
×
167

168
## thisind, nextind ##
169

170
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
20,730✔
171

172
# s should be String or SubString{String}
173
@inline function _thisind_str(s, i::Int)
174
    i == 0 && return 0
15,911✔
175
    n = ncodeunits(s)
15,899✔
176
    i == n + 1 && return i
15,899✔
177
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
15,899✔
178
    @inbounds b = codeunit(s, i)
15,899✔
179
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
31,291✔
180
    (@noinline function _thisind_continued(s, i, n) # mark the rest of the function as a slow-path
×
181
        local b
182
        @inbounds b = codeunit(s, i-1)
183
        between(b, 0b11000000, 0b11110111) && return i-1
184
        (b & 0xc0 == 0x80) & (i-2 > 0) || return i
185
        @inbounds b = codeunit(s, i-2)
186
        between(b, 0b11100000, 0b11110111) && return i-2
187
        (b & 0xc0 == 0x80) & (i-3 > 0) || return i
188
        @inbounds b = codeunit(s, i-3)
189
        between(b, 0b11110000, 0b11110111) && return i-3
190
        return i
191
    end)(s, i, n)
192
end
193

194
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
1,054✔
195

196
# s should be String or SubString{String}
197
@inline function _nextind_str(s, i::Int)
198
    i == 0 && return 1
529✔
199
    n = ncodeunits(s)
529✔
200
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
529✔
201
    @inbounds l = codeunit(s, i)
529✔
202
    between(l, 0x80, 0xf7) || return i+1
1,058✔
UNCOV
203
    (@noinline function _nextind_continued(s, i, n, l) # mark the rest of the function as a slow-path
×
204
        if l < 0xc0
205
            # handle invalid codeunit index by scanning back to the start of this index
206
            # (which may be the same as this index)
207
            i′ = @inbounds thisind(s, i)
208
            i′ >= i && return i+1
209
            i = i′
210
            @inbounds l = codeunit(s, i)
211
            (l < 0x80) | (0xf8 ≤ l) && return i+1
212
            @assert l >= 0xc0 "invalid codeunit"
213
        end
214
        # first continuation byte
215
        (i += 1) > n && return i
216
        @inbounds b = codeunit(s, i)
217
        b & 0xc0 ≠ 0x80 && return i
218
        ((i += 1) > n) | (l < 0xe0) && return i
219
        # second continuation byte
220
        @inbounds b = codeunit(s, i)
221
        b & 0xc0 ≠ 0x80 && return i
222
        ((i += 1) > n) | (l < 0xf0) && return i
223
        # third continuation byte
224
        @inbounds b = codeunit(s, i)
225
        return ifelse(b & 0xc0 ≠ 0x80, i, i+1)
226
    end)(s, i, n, l)
227
end
228

229
## checking UTF-8 & ASCII validity ##
230
#=
231
    The UTF-8 Validation is performed by a shift based DFA.
232
    ┌───────────────────────────────────────────────────────────────────┐
233
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
234
    │                               ├────────3────────┐           │     │
235
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
236
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
237
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
238
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
239
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
240
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
241
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
242
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
243
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
244
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
245
    │                      │        │     ├─┤               │        │  │
246
    │                      │        └─4──►│6├─────1,9───────┘        │  │
247
    │          INVALID     │              └─┘                        │  │
248
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
249
    │          ┌▼───┴┐                                                  │
250
    │          │  2  ◄─── All undefined transitions result in state 2   │
251
    │          └─────┘                                                  │
252
    └───────────────────────────────────────────────────────────────────┘
253

254
        Validation States
255
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
256
                        If the DFA ends in this state the string is ASCII only
257
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
258
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
259
                    as seen by all 2s in that column of the table below
260
            3 -> One valid continuation byte needed to return to state 1
261
        4,5,6 -> Two valid continuation bytes needed to return to state 1
262
        7,8,9 -> Three valid continuation bytes needed to return to state 1
263

264
                        Current State
265
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
266
                0 | 0  1  2  2  2  2  2  2  2  2
267
                1 | 2  2  2  1  3  2  3  2  4  4
268
                2 | 3  3  2  2  2  2  2  2  2  2
269
                3 | 4  4  2  2  2  2  2  2  2  2
270
                4 | 6  6  2  2  2  2  2  2  2  2
271
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
272
    Class       6 | 8  8  2  2  2  2  2  2  2  2
273
                7 | 2  2  2  1  3  3  2  4  4  2
274
                8 | 2  2  2  2  2  2  2  2  2  2
275
                9 | 2  2  2  1  3  2  3  4  4  2
276
               10 | 5  5  2  2  2  2  2  2  2  2
277
               11 | 7  7  2  2  2  2  2  2  2  2
278

279
           Shifts | 0  4 10 14 18 24  8 20 12 26
280

281
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
282
    the rows the correct shift was a result.
283

284
    Each character class row encodes 10 states with shifts as defined above. By shifting the bits of a row by
285
    the current state's shift and then masking the result with 0b11110 (0x1E), we obtain the shift for the new state
286

287

288
=#
289

290
#State type used by UTF-8 DFA
291
const _UTF8DFAState = UInt32
292
# Fill the table with 256 `_UTF8DFAState` (UInt32) values representing the DFA transitions for all bytes
293
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
294
    num_classes=12
295
    num_states=10
296
    bit_per_state = 6
297

298
    # These shifts were derived using an SMT solver
299
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
300

301
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
302
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
303
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
304
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
305
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
306
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
307
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
308
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
309
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
310
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
311
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
312
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
313
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
314
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
315
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
316
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
317

318
    # These are the rows discussed in comments above
319
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
320
                     2  2  2  1  3  2  3  2  4  4;
321
                     3  3  2  2  2  2  2  2  2  2;
322
                     4  4  2  2  2  2  2  2  2  2;
323
                     6  6  2  2  2  2  2  2  2  2;
324
                     9  9  2  2  2  2  2  2  2  2;
325
                     8  8  2  2  2  2  2  2  2  2;
326
                     2  2  2  1  3  3  2  4  4  2;
327
                     2  2  2  2  2  2  2  2  2  2;
328
                     2  2  2  1  3  2  3  4  4  2;
329
                     5  5  2  2  2  2  2  2  2  2;
330
                     7  7  2  2  2  2  2  2  2  2]
331

332
    #This converts the state_arrays into the shift encoded _UTF8DFAState
333
    class_row = zeros(_UTF8DFAState, num_classes)
334

335
    for i = 1:num_classes
336
        row = _UTF8DFAState(0)
337
        for j in 1:num_states
338
            #Calculate the shift required for the next state
339
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
340
            #Shift the next state into the position of the current state
341
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
342
        end
343
        class_row[i]=row
344
    end
345

346
    map(c->class_row[c+1],character_classes)
×
347
end
348

349

350
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Start state; the DFA stays here only while every byte seen so far is ASCII
351
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) #This state represents the start and end of any valid string
352
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop
353

354
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
355
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
×
356

357
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
358
    for i = first:last
×
359
       @inbounds state = _utf_dfa_step(state, bytes[i])
×
360
    end
×
361
    return (state)
×
362
end
363

364
@inline function  _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU}
365
    n=first
×
366
    while n <= last - chunk_size
×
367
        _isascii(cu,n,n+chunk_size-1) || return n
×
368
        n += chunk_size
×
369
    end
×
370
    n= last-chunk_size+1
×
371
    _isascii(cu,n,last) || return n
×
372
    return nothing
×
373
end
374

375
##
376

377
# Classifications of string
378
    # 0: neither valid ASCII nor UTF-8
379
    # 1: valid ASCII
380
    # 2: valid UTF-8
381
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
×
382

383

384
function byte_string_classify(bytes::AbstractVector{UInt8})
385
    chunk_size = 1024
×
386
    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
×
387
    n = length(bytes)
×
388
    if n > chunk_threshold
×
389
        start = _find_nonascii_chunk(chunk_size,bytes,1,n)
×
390
        isnothing(start) && return 1
×
391
    else
392
        _isascii(bytes,1,n) && return 1
×
393
        start = 1
×
394
    end
395
    return _byte_string_classify_nonascii(bytes,start,n)
×
396
end
397

398
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
×
399
    chunk_size = 256
×
400

401
    start = first
×
402
    stop = min(last,first + chunk_size - 1)
×
403
    state = _UTF8_DFA_ACCEPT
×
404
    while start <= last
×
405
        # try to process ascii chunks
406
        while state == _UTF8_DFA_ACCEPT
×
407
            _isascii(bytes,start,stop) || break
×
408
            (start = start + chunk_size) <= last || break
×
409
            stop = min(last,stop + chunk_size)
×
410
        end
×
411
        # Process non ascii chunk
412
        state = _isvalid_utf8_dfa(state,bytes,start,stop)
×
413
        state == _UTF8_DFA_INVALID && return 0
×
414

415
        start = start + chunk_size
×
416
        stop = min(last,stop + chunk_size)
×
417
    end
×
418
    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
×
419
end
420

421
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
×
422
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
×
423

424
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
×
425

426
is_valid_continuation(c) = c & 0xc0 == 0x80
×
427

428
## required core functionality ##
429

430
@inline function iterate(s::String, i::Int=firstindex(s))
431
    (i % UInt) - 1 < ncodeunits(s) || return nothing
1,306,897✔
432
    b = @inbounds codeunit(s, i)
1,302,115✔
433
    u = UInt32(b) << 24
1,302,115✔
434
    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
2,604,229✔
435
    return @noinline iterate_continued(s, i, u)
1✔
436
end
437

438
# duck-type s so that external UTF-8 string packages like StringViews can hook in
UNCOV
439
function iterate_continued(s, i::Int, u::UInt32)
×
UNCOV
440
    u < 0xc0000000 && (i += 1; @goto ret)
×
UNCOV
441
    n = ncodeunits(s)
×
442
    # first continuation byte
UNCOV
443
    (i += 1) > n && @goto ret
×
UNCOV
444
    @inbounds b = codeunit(s, i)
×
UNCOV
445
    b & 0xc0 == 0x80 || @goto ret
×
UNCOV
446
    u |= UInt32(b) << 16
×
447
    # second continuation byte
UNCOV
448
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
×
UNCOV
449
    @inbounds b = codeunit(s, i)
×
UNCOV
450
    b & 0xc0 == 0x80 || @goto ret
×
UNCOV
451
    u |= UInt32(b) << 8
×
452
    # third continuation byte
UNCOV
453
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
×
454
    @inbounds b = codeunit(s, i)
×
455
    b & 0xc0 == 0x80 || @goto ret
×
456
    u |= UInt32(b); i += 1
×
UNCOV
457
@label ret
×
UNCOV
458
    return reinterpret(Char, u), i
×
459
end
460

461
@propagate_inbounds function getindex(s::String, i::Int)
462
    b = codeunit(s, i)
150,844✔
463
    u = UInt32(b) << 24
150,844✔
464
    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
301,688✔
UNCOV
465
    return getindex_continued(s, i, u)
×
466
end
467

468
# duck-type s so that external UTF-8 string packages like StringViews can hook in
UNCOV
469
function getindex_continued(s, i::Int, u::UInt32)
×
UNCOV
470
    if u < 0xc0000000
×
471
        # called from `getindex` which checks bounds
472
        @inbounds isvalid(s, i) && @goto ret
×
473
        string_index_err(s, i)
×
474
    end
UNCOV
475
    n = ncodeunits(s)
×
476

UNCOV
477
    (i += 1) > n && @goto ret
×
UNCOV
478
    @inbounds b = codeunit(s, i) # cont byte 1
×
UNCOV
479
    b & 0xc0 == 0x80 || @goto ret
×
UNCOV
480
    u |= UInt32(b) << 16
×
481

UNCOV
482
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
×
UNCOV
483
    @inbounds b = codeunit(s, i) # cont byte 2
×
UNCOV
484
    b & 0xc0 == 0x80 || @goto ret
×
UNCOV
485
    u |= UInt32(b) << 8
×
486

UNCOV
487
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
×
488
    @inbounds b = codeunit(s, i) # cont byte 3
×
489
    b & 0xc0 == 0x80 || @goto ret
×
490
    u |= UInt32(b)
×
UNCOV
491
@label ret
×
UNCOV
492
    return reinterpret(Char, u)
×
493
end
494

495
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
×
496

497
@inline function getindex(s::String, r::UnitRange{Int})
498
    isempty(r) && return ""
509✔
499
    i, j = first(r), last(r)
507✔
500
    @boundscheck begin
509✔
501
        checkbounds(s, r)
509✔
502
        @inbounds isvalid(s, i) || string_index_err(s, i)
509✔
503
        @inbounds isvalid(s, j) || string_index_err(s, j)
509✔
504
    end
505
    j = nextind(s, j) - 1
1,018✔
506
    n = j - i + 1
509✔
507
    ss = _string_n(n)
509✔
508
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
509✔
509
    return ss
509✔
510
end
511

512
# nothrow because we know the start and end indices are valid
UNCOV
513
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
×
514

515
# effects needed because @inbounds
UNCOV
516
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
×
UNCOV
517
    @boundscheck begin
×
UNCOV
518
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
×
UNCOV
519
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
×
520
    end
UNCOV
521
    j < i && return 0
×
UNCOV
522
    @inbounds i, k = thisind(s, i), i
×
UNCOV
523
    c = j - i + (i == k)
×
UNCOV
524
    @inbounds length_continued(s, i, j, c)
×
525
end
526

UNCOV
527
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
×
UNCOV
528
    i < n || return c
×
UNCOV
529
    b = codeunit(s, i)
×
UNCOV
530
    while true
×
UNCOV
531
        while true
×
UNCOV
532
            (i += 1) ≤ n || return c
×
UNCOV
533
            0xc0 ≤ b ≤ 0xf7 && break
×
UNCOV
534
            b = codeunit(s, i)
×
UNCOV
535
        end
×
536
        l = b
×
537
        b = codeunit(s, i) # cont byte 1
×
538
        c -= (x = b & 0xc0 == 0x80)
×
539
        x & (l ≥ 0xe0) || continue
×
540

541
        (i += 1) ≤ n || return c
×
542
        b = codeunit(s, i) # cont byte 2
×
543
        c -= (x = b & 0xc0 == 0x80)
×
544
        x & (l ≥ 0xf0) || continue
×
545

546
        (i += 1) ≤ n || return c
×
547
        b = codeunit(s, i) # cont byte 3
×
548
        c -= (b & 0xc0 == 0x80)
×
549
    end
×
550
end
551

552
## overload methods for efficiency ##
553

554
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
9,176✔
555

556
isascii(s::String) = isascii(codeunits(s))
13✔
557

558
# don't assume effects for general integers since we cannot know their implementation
559
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
442✔
560

561
"""
562
    repeat(c::AbstractChar, r::Integer) -> String
563

564
Repeat a character `r` times. This can equivalently be accomplished by calling
565
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
566

567
# Examples
568
```jldoctest
569
julia> repeat('A', 3)
570
"AAA"
571
```
572
"""
UNCOV
573
function repeat(c::AbstractChar, r::Integer)
×
UNCOV
574
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
×
UNCOV
575
    r = UInt(r)::UInt
×
576
    c = Char(c)::Char
×
UNCOV
577
    r == 0 && return ""
×
UNCOV
578
    u = bswap(reinterpret(UInt32, c))
×
UNCOV
579
    n = 4 - (leading_zeros(u | 0xff) >> 3)
×
UNCOV
580
    s = _string_n(n*r)
×
UNCOV
581
    p = pointer(s)
×
UNCOV
582
    GC.@preserve s if n == 1
×
UNCOV
583
        memset(p, u % UInt8, r)
×
584
    elseif n == 2
×
585
        p16 = reinterpret(Ptr{UInt16}, p)
×
586
        for i = 1:r
×
587
            unsafe_store!(p16, u % UInt16, i)
×
588
        end
×
589
    elseif n == 3
×
590
        b1 = (u >> 0) % UInt8
×
591
        b2 = (u >> 8) % UInt8
×
592
        b3 = (u >> 16) % UInt8
×
593
        for i = 0:r-1
×
594
            unsafe_store!(p, b1, 3i + 1)
×
595
            unsafe_store!(p, b2, 3i + 2)
×
596
            unsafe_store!(p, b3, 3i + 3)
×
597
        end
×
598
    elseif n == 4
×
599
        p32 = reinterpret(Ptr{UInt32}, p)
×
600
        for i = 1:r
×
601
            unsafe_store!(p32, u, i)
×
UNCOV
602
        end
×
603
    end
UNCOV
604
    return s
×
605
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc