• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #38077

19 May 2025 08:49PM UTC coverage: 25.74% (-0.006%) from 25.746%
#38077

push

local

web-flow
avoid deadlock if crashing inside profile_wr_lock (#58452)

The rd/wr lock distinction here was supposed to help prevent deadlocks
by allowing recursion (even over signals), but did not account for
crashes causing recursion while holding the wr lock. Make these lock
acquires fail-able if they would cause deadlock.

12825 of 49826 relevant lines covered (25.74%)

715601.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

50.0
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
9
    # the string that was being indexed
    string::AbstractString
×
10
    # the rejected index (`string_index_err` stores it as an `Int`)
    index::Integer
11
end
12
# Out-of-line thrower: raises a `StringIndexError` for `s` with the index
# normalized to `Int`.
@noinline function string_index_err(s::AbstractString, i::Integer)
    throw(StringIndexError(s, Int(i)))
end
14
# Print a `StringIndexError`. When the offending index lies within the code units
# of the string, also print the closest valid index (or indices) and the
# escaped character found there.
function Base.showerror(io::IO, exc::StringIndexError)
    str = exc.string
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
    # No hint is possible for an index outside the code units.
    (firstindex(str) <= exc.index <= ncodeunits(str)) || return nothing
    iprev = thisind(str, exc.index)
    inext = nextind(str, iprev)
    escprev = escape_string(str[iprev:iprev])
    if inext > ncodeunits(str)
        # the character at `iprev` is the last one
        print(io, ", valid nearby index [$iprev]=>'$escprev'")
    else
        escnext = escape_string(str[inext:inext])
        print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
    end
    return nothing
end
29

30
# Inclusive range test `lo ≤ b ≤ hi`; both comparisons are always evaluated and
# combined with a non-short-circuiting `&`.
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    not_below = lo ≤ b
    not_above = b ≤ hi
    return not_below & not_above
end
2,537,001✔
31

32
"""
33
    String <: AbstractString
34

35
The default string type in Julia, used by e.g. string literals.
36

37
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
38
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
39
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
40
that the underlying byte sequence is valid as UTF-8.
41
"""
42
String
43

44
## constructors and conversions ##
45

46
# String constructor docstring from boot.jl, workaround for #16730
47
# and the unavailability of @doc in boot.jl context.
48
"""
49
    String(v::AbstractVector{UInt8})
50

51
Create a new `String` object using the data buffer from byte vector `v`.
52
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
53
modification of `v` cannot affect the contents of the resulting string.
54
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
55
`AbstractVector` types, `String(v)` already makes a copy.
56

57
When possible, the memory of `v` will be used without copying when the `String`
58
object is created. This is guaranteed to be the case for byte vectors returned
59
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
60
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
61
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
62
to guarantee consistent behavior.
63
"""
64
String(v::AbstractVector{UInt8}) = unsafe_takestring(copyto!(StringMemory(length(v)), v)) # generic path: copy `v` into a fresh `StringMemory`, then take that buffer as the string
22✔
65
# Convert a `Vector{UInt8}` into a `String` and leave `v` empty afterwards.
function String(v::Vector{UInt8})
    nbytes = length(v)
    nbytes == 0 && return ""
    memref = v.ref
    # Two runtime entry points: one takes the whole backing memory (used when the
    # vector's reference sits at the memory's own pointer), the other takes a
    # pointer plus a length.
    str = if memref.ptr_or_offset == memref.mem.ptr
        ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), memref.mem, nbytes)
    else
        ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), memref, nbytes)
    end
    # optimized empty!(v); sizehint!(v, 0) calls
    setfield!(v, :size, (0,))
    setfield!(v, :ref, memoryref(Memory{UInt8}()))
    return str
end
80

81
"""
82
    unsafe_takestring(m::Memory{UInt8})::String
83

84
Create a `String` from `m`, changing the interpretation of the contents of `m`.
85
This is done without copying, if possible. Thus, any access to `m` after
86
calling this function, either to read or to write, is undefined behaviour.
87
"""
88
function unsafe_takestring(m::Memory{UInt8})
    # An empty buffer maps to the empty string literal without a runtime call.
    if isempty(m)
        return ""
    end
    return ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), m, length(m))
end
91

92
"""
93
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
94

95
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
96
(The pointer can be safely freed afterwards.) If `length` is specified
97
(the length of the data in bytes), the string does not have to be NUL-terminated.
98

99
This function is labeled "unsafe" because it will crash if `p` is not
100
a valid memory address to data of the requested length.
101
"""
102
# Length-taking method: builds the string from `len` bytes at `p` via
# `jl_pchar_to_string`. Throws `ArgumentError` for a NULL pointer.
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
106
# NUL-terminated method: builds the string via `jl_cstr_to_string`.
# Throws `ArgumentError` for a NULL pointer.
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
110

111
# This is `@assume_effects :total !:consistent @ccall jl_alloc_string(n::Csize_t)::Ref{String}`,
112
# but the macro is not available at this time in bootstrap, so we write it manually.
113
# NOTE(review): presumably the bit-encoded form of the effects override described in
# the comment above — confirm against the compiler's effects-override encoding.
const _string_n_override = 0x04ee
114
# Hand-built `foreigncall` to the runtime function `jl_alloc_string`: `n` is converted
# to `Csize_t`, the result is typed `Ref{String}`, and `_string_n_override` is attached
# as the call's effects override.
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String},
234,278✔
115
    :(Core.svec(Csize_t)), 1, QuoteNode((:ccall, _string_n_override, false)), :(convert(Csize_t, n))))
116

117
"""
118
    String(s::AbstractString)
119

120
Create a new `String` from an existing `AbstractString`.
121
"""
122
String(s::AbstractString) = print_to_string(s) # generic fallback: delegates to `print_to_string`
×
123
# Build a `String` from a `Symbol` by reading the C string at the `Ptr{UInt8}` that
# `unsafe_convert` yields for it; `:total` effects are asserted for this method.
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
1,620✔
124

125
# Obtain a `Memory{UInt8}` for `s` through the runtime call `jl_string_to_genericmemory`.
# NOTE(review): presumably this aliases the string's bytes rather than copying — confirm.
unsafe_wrap(::Type{Memory{UInt8}}, s::String) = ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
96,309✔
126
# `Vector{UInt8}` form: wraps the result of `unsafe_wrap(Memory{UInt8}, s)` in an `Array`.
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = wrap(Array, unsafe_wrap(Memory{UInt8}, s))
85✔
127

128
# Copy the code units of `s` into a newly allocated vector of the same length.
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
17,370✔
129
# Delegates to the `CodeUnits` method through `codeunits(s)`.
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
17,370✔
130
# Same conversion as `Vector{UInt8}(s)`, for the `Array{UInt8}` spelling.
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
131

132
# Return the string held in the `CodeUnits` wrapper's `s` field.
String(s::CodeUnits{UInt8,String}) = s.s
×
133

134
## low-level functions ##
135

136
# `Ptr{UInt8}` for `s`, obtained through `unsafe_convert`.
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
3,772,961✔
137
# Pointer for `s` advanced by `i - 1` bytes, i.e. `i` is treated as a 1-based offset.
# No bounds check is performed here.
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
2,829,239✔
138

139
# The number of code units of a `String` is `Core.sizeof(s)`.
ncodeunits(s::String) = Core.sizeof(s)
4,395,561✔
140
# The code unit type of `String` is `UInt8`.
codeunit(s::String) = UInt8
×
141

142
# Normalize any `Integer` index to `Int` and forward to the `Int` method.
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
×
143
# Load the `i`-th byte of `s`. The bounds check can be elided by callers using
# `@inbounds`; `s` is kept rooted while its pointer is dereferenced.
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
    @boundscheck checkbounds(s, i)
    return GC.@preserve s unsafe_load(pointer(s, i))
end
148

149
## comparison ##
150

151
# `String`/`String` entry point: forwards to the generic two-argument method via
# `@invoke`, with `:total` effects asserted for this combination.
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
172,139✔
152

153
# Compare only the common prefix: the length passed on is the smaller of the two `sizeof`s.
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
485,507✔
154
# Byte-wise comparison of the first `len` bytes of `a` and `b` using `memcmp`;
# the result is converted to `Int` with `%`. Both arguments stay rooted while
# their pointers are in use.
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
    result = GC.@preserve a b begin
        ptr_a = unsafe_convert(Ptr{UInt8}, a)
        ptr_b = unsafe_convert(Ptr{UInt8}, b)
        memcmp(ptr_a, ptr_b, len % Csize_t) % Int
    end
    return result
end
161

162
# Three-way comparison of two `String`s: decided by the bytes of the common
# prefix first; if those are equal, by comparing the two sizes.
function cmp(a::String, b::String)
    prefix_order = _memcmp(a, b)
    prefix_order < 0 && return -1
    prefix_order > 0 && return +1
    return cmp(sizeof(a), sizeof(b))
end
167

168
# `String` equality is decided by `===`.
==(a::String, b::String) = a===b
284,019✔
169

170
# The smallest `String` is the empty string.
typemin(::Type{String}) = ""
×
171
# Instance form: forwards to the type-level method.
typemin(::String) = typemin(String)
×
172

173
## thisind, nextind ##
174

175
# `String` method: forwards to `_thisind_str`, propagating the caller's inbounds
# context to it.
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
745,098✔
176

177
# s should be String or SubString{String}
178
# Shared `thisind` implementation. Returns `0` for `i == 0`, `i` itself for
# `i == ncodeunits(s) + 1`, and otherwise the index of the lead byte whose
# sequence covers code unit `i` (or `i` when no such lead byte precedes it).
# Throws `BoundsError` for any other out-of-range `i` unless bounds checks are elided.
@inline function _thisind_str(s, i::Int)
    if i == 0
        return 0
    end
    n = ncodeunits(s)
    if i == n + 1
        return i
    end
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    cu = @inbounds codeunit(s, i)
    # Fast path: `i` is not a continuation byte, or there is nothing before it.
    if !((cu & 0xc0 == 0x80) & (i - 1 > 0))
        return i
    end
    # Slow path, kept out of line: step back up to three code units looking for
    # a lead byte that announces a sequence long enough to reach `i`.
    return (@noinline function _thisind_continued(s, i, n)
        prev = @inbounds codeunit(s, i - 1)
        between(prev, 0b11000000, 0b11110111) && return i - 1
        (prev & 0xc0 == 0x80) & (i - 2 > 0) || return i
        prev = @inbounds codeunit(s, i - 2)
        between(prev, 0b11100000, 0b11110111) && return i - 2
        (prev & 0xc0 == 0x80) & (i - 3 > 0) || return i
        prev = @inbounds codeunit(s, i - 3)
        between(prev, 0b11110000, 0b11110111) && return i - 3
        return i
    end)(s, i, n)
end
198

199
# `String` method: forwards to `_nextind_str`, propagating the caller's inbounds
# context to it.
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
342,984✔
200

201
# s should be String or SubString{String}
202
# Shared `nextind` implementation. Returns `1` for `i == 0`; otherwise the index
# just past the character that code unit `i` belongs to. Throws `BoundsError`
# when `i` is outside `1:ncodeunits(s)` unless bounds checks are elided.
@inline function _nextind_str(s, i::Int)
    if i == 0
        return 1
    end
    n = ncodeunits(s)
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    lead = @inbounds codeunit(s, i)
    # Fast path: bytes outside 0x80:0xf7 always occupy exactly one index.
    if !between(lead, 0x80, 0xf7)
        return i + 1
    end
    # Slow path, kept out of line.
    return (@noinline function _nextind_continued(s, i, n, lead)
        if lead < 0xc0
            # `i` points at a continuation byte: scan back to the start of the
            # character it belongs to (which may be `i` itself)
            start = @inbounds thisind(s, i)
            start >= i && return i + 1
            i = start
            lead = @inbounds codeunit(s, i)
            (lead < 0x80) | (0xf8 ≤ lead) && return i + 1
            @assert lead >= 0xc0 "invalid codeunit"
        end
        # first continuation byte
        i += 1
        i > n && return i
        cont = @inbounds codeunit(s, i)
        cont & 0xc0 ≠ 0x80 && return i
        i += 1
        (i > n) | (lead < 0xe0) && return i
        # second continuation byte
        cont = @inbounds codeunit(s, i)
        cont & 0xc0 ≠ 0x80 && return i
        i += 1
        (i > n) | (lead < 0xf0) && return i
        # third continuation byte
        cont = @inbounds codeunit(s, i)
        return ifelse(cont & 0xc0 ≠ 0x80, i, i + 1)
    end)(s, i, n, lead)
end
233

234
## checking UTF-8 & ASCII validity ##
235
#=
236
    The UTF-8 Validation is performed by a shift based DFA.
237
    ┌───────────────────────────────────────────────────────────────────┐
238
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
239
    │                               ├────────3────────┐           │     │
240
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
241
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
242
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
243
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
244
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
245
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
246
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
247
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
248
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
249
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
250
    │                      │        │     ├─┤               │        │  │
251
    │                      │        └─4──►│6├─────1,9───────┘        │  │
252
    │          INVALID     │              └─┘                        │  │
253
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
254
    │          ┌▼───┴┐                                                  │
255
    │          │  2  ◄─── All undefined transitions result in state 2   │
256
    │          └─────┘                                                  │
257
    └───────────────────────────────────────────────────────────────────┘
258

259
        Validation States
260
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
261
                        If the DFA ends in this state the string is ASCII only
262
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
263
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
264
                    as seen by all 1s in that column of table below
265
            3 -> One valid continuation byte needed to return to state 1
266
        4,5,6 -> Two valid continuation bytes needed to return to state 1
267
        7,8,9 -> Three valid continuation bytes needed to return to state 1
268

269
                        Current State
270
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
271
                0 | 0  1  2  2  2  2  2  2  2  2
272
                1 | 2  2  2  1  3  2  3  2  4  4
273
                2 | 3  3  2  2  2  2  2  2  2  2
274
                3 | 4  4  2  2  2  2  2  2  2  2
275
                4 | 6  6  2  2  2  2  2  2  2  2
276
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
277
    Class       6 | 8  8  2  2  2  2  2  2  2  2
278
                7 | 2  2  2  1  3  3  2  4  4  2
279
                8 | 2  2  2  2  2  2  2  2  2  2
280
                9 | 2  2  2  1  3  2  3  4  4  2
281
               10 | 5  5  2  2  2  2  2  2  2  2
282
               11 | 7  7  2  2  2  2  2  2  2  2
283

284
           Shifts | 0  4 10 14 18 24  8 20 12 26
285

286
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
287
    the rows the correct shift was a result.
288

289
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
290
    the current state then masking the result with 0b11110 (0x1E) gives the shift for the new state
291

292

293
=#
294

295
# State type used by the UTF-8 DFA. A state is represented by its shift amount
# (see `state_shifts` below), not by its ordinal number.
const _UTF8DFAState = UInt32
# Fill the table with 256 `_UTF8DFAState` values, one per byte value. Each entry is the
# shift-encoded transition row of that byte's character class.
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
    num_classes = 12
    num_states = 10

    # These shifts were derived using a SMT solver; entry `k+1` is the shift that
    # represents state `k`.
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]

    # Character class of every byte value 0x00:0xff, 16 bytes per row.
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]

    # These are the rows discussed in comments above: row = character class,
    # column = current state, entry = next state.
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  2  4  4;
                     3  3  2  2  2  2  2  2  2  2;
                     4  4  2  2  2  2  2  2  2  2;
                     6  6  2  2  2  2  2  2  2  2;
                     9  9  2  2  2  2  2  2  2  2;
                     8  8  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  3  2  4  4  2;
                     2  2  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  4  4  2;
                     5  5  2  2  2  2  2  2  2  2;
                     7  7  2  2  2  2  2  2  2  2]

    # This converts the state_arrays into the shift encoded _UTF8DFAState
    class_row = zeros(_UTF8DFAState, num_classes)

    for i = 1:num_classes
        row = _UTF8DFAState(0)
        for j in 1:num_states
            # Calculate the shift required for the next state
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
            # Shift the next state into the position of the current state
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
        end
        class_row[i]=row
    end

    # One encoded row per byte value, looked up through the byte's character class.
    map(c->class_row[c+1],character_classes)
end
353

354

355
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # State 0 (shift 0): the start state; the DFA stays here only while every byte so far is ASCII
356
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) # State 1 (shift 4): a complete valid character has just ended, after at least one non-ASCII character was seen
357
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # State 2 (shift 10): reached on invalid input and never left, so callers can stop as soon as they see it
358

359
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
360
# One DFA transition: fetch the encoded row for `byte`, shift it right by the current
# state, and mask with 0x1E to obtain the next state (itself a shift amount).
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
×
361

362
# Run the DFA over `bytes[first:last]` starting from `state` and return the state
# reached. Indices are not bounds-checked.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
    current = state
    for idx in first:last
        current = @inbounds _utf_dfa_step(current, bytes[idx])
    end
    return current
end
368

369
# Scan `cu[first:last]` in chunks of `chunk_size` code units and return the start
# index of the first chunk that contains a non-ASCII code unit, or `nothing` when
# every code unit is ASCII. The final chunk is aligned to end at `last`, so it may
# overlap the previous one.
@inline function _find_nonascii_chunk(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    n = first
    while n <= last - chunk_size
        _isascii(cu, n, n + chunk_size - 1) || return n
        n += chunk_size
    end
    # Clamp the tail start to `first`: when the range is shorter than one chunk,
    # `last - chunk_size + 1` lies before `first` and must not be scanned.
    n = max(first, last - chunk_size + 1)
    _isascii(cu, n, last) || return n
    return nothing
end
379

380
##
381

382
# Classifications of string
383
    # 0: neither valid ASCII nor UTF-8
384
    # 1: valid ASCII
385
    # 2: valid UTF-8
386
 # String entry point: classify the code units of `s`.
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
×
387

388

389
# Classify `bytes`: 1 when everything is ASCII, 2 when it is valid UTF-8 with
# non-ASCII content, 0 otherwise. Inputs longer than 1.5 chunks are first scanned
# chunk-wise for the first non-ASCII chunk; shorter inputs get one ASCII check
# and, failing that, are validated from index 1.
function byte_string_classify(bytes::AbstractVector{UInt8})
    chunk_size = 1024
    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
    nbytes = length(bytes)
    if nbytes <= chunk_threshold
        _isascii(bytes, 1, nbytes) && return 1
        return _byte_string_classify_nonascii(bytes, 1, nbytes)
    end
    first_nonascii = _find_nonascii_chunk(chunk_size, bytes, 1, nbytes)
    isnothing(first_nonascii) && return 1
    return _byte_string_classify_nonascii(bytes, first_nonascii, nbytes)
end
402

403
# Validate `bytes[first:last]` as UTF-8 in 256-byte chunks. Returns 2 when the DFA
# ends in the accept state and 0 otherwise (including as soon as the invalid
# state is reached).
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
    chunk_size = 256

    lo = first
    hi = min(last, first + chunk_size - 1)
    state = _UTF8_DFA_ACCEPT
    while lo <= last
        # While sitting between characters, whole ASCII chunks can be skipped
        # with the cheaper ASCII check.
        while state == _UTF8_DFA_ACCEPT
            if !_isascii(bytes, lo, hi)
                break
            end
            lo = lo + chunk_size
            if lo > last
                break
            end
            hi = min(last, hi + chunk_size)
        end
        # Feed the current (non-ASCII or mid-character) chunk through the DFA.
        state = _isvalid_utf8_dfa(state, bytes, lo, hi)
        if state == _UTF8_DFA_INVALID
            return 0
        end

        lo = lo + chunk_size
        hi = min(last, hi + chunk_size)
    end
    return state == _UTF8_DFA_ACCEPT ? 2 : 0
end
425

426
# Bytes are a valid `String` when their classification is non-zero (ASCII or UTF-8).
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
×
427
# Same test for an `AbstractString`, classified through its code units.
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
×
428

429
# One-argument form: validates the code units of `s` as a `String`.
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
×
430

431
# True when the top two bits of `c` are `10`, i.e. `c` has the form of a UTF-8
# continuation byte.
function is_valid_continuation(c)
    top_bits = c & 0xc0
    return top_bits == 0x80
end
×
432

433
## required core functionality ##
434

435
# Iteration protocol for `String`: returns `(char, next_index)` or `nothing` once
# `i` is outside `1:ncodeunits(s)`.
@inline function iterate(s::String, i::Int=firstindex(s))
    # A single unsigned comparison rejects both `i < 1` and `i > ncodeunits(s)`.
    if !((i % UInt) - 1 < ncodeunits(s))
        return nothing
    end
    lead = @inbounds codeunit(s, i)
    u = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        # possible multi-byte sequence: handled out of line
        return @noinline iterate_continued(s, i, u)
    end
    return reinterpret(Char, u), i+1
end
442

443
# duck-type s so that external UTF-8 string packages like StringViews can hook in
444
# Slow path of `iterate`: `u` holds the first byte of the character in its top 8
# bits. Up to three continuation bytes are appended to `u`, stopping at the end of
# the data, at the first non-continuation byte, or once the length announced by
# the lead byte is reached. Returns the character and the index following it.
function iterate_continued(s, i::Int, u::UInt32)
    # a stray continuation byte is a one-byte character
    if u < 0xc0000000
        return reinterpret(Char, u), i + 1
    end
    n = ncodeunits(s)
    # first continuation byte
    i += 1
    if i > n
        return reinterpret(Char, u), i
    end
    b = @inbounds codeunit(s, i)
    if b & 0xc0 != 0x80
        return reinterpret(Char, u), i
    end
    u |= UInt32(b) << 16
    # second continuation byte
    i += 1
    if (i > n) | (u < 0xe0000000)
        return reinterpret(Char, u), i
    end
    b = @inbounds codeunit(s, i)
    if b & 0xc0 != 0x80
        return reinterpret(Char, u), i
    end
    u |= UInt32(b) << 8
    # third continuation byte
    i += 1
    if (i > n) | (u < 0xf0000000)
        return reinterpret(Char, u), i
    end
    b = @inbounds codeunit(s, i)
    if b & 0xc0 != 0x80
        return reinterpret(Char, u), i
    end
    u |= UInt32(b)
    return reinterpret(Char, u), i + 1
end
465

466
# Character starting at code unit `i`. The bounds check happens in `codeunit`
# and follows the caller's inbounds context.
@propagate_inbounds function getindex(s::String, i::Int)
    lead = codeunit(s, i)
    u = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        # possible multi-byte sequence: handled out of line
        return getindex_continued(s, i, u)
    end
    return reinterpret(Char, u)
end
472

473
# duck-type s so that external UTF-8 string packages like StringViews can hook in
474
# Slow path of `getindex`: `u` holds the first byte of the character in its top 8
# bits. A continuation byte at `i` is accepted only if `i` is a valid index;
# otherwise a `StringIndexError` is thrown. For a lead byte, up to three
# continuation bytes are appended to `u`.
function getindex_continued(s, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        (@inbounds isvalid(s, i)) && return reinterpret(Char, u)
        string_index_err(s, i)
    end
    n = ncodeunits(s)

    i += 1
    if i > n
        return reinterpret(Char, u)
    end
    b = @inbounds codeunit(s, i) # cont byte 1
    if b & 0xc0 != 0x80
        return reinterpret(Char, u)
    end
    u |= UInt32(b) << 16

    i += 1
    if (i > n) | (u < 0xe0000000)
        return reinterpret(Char, u)
    end
    b = @inbounds codeunit(s, i) # cont byte 2
    if b & 0xc0 != 0x80
        return reinterpret(Char, u)
    end
    u |= UInt32(b) << 8

    i += 1
    if (i > n) | (u < 0xf0000000)
        return reinterpret(Char, u)
    end
    b = @inbounds codeunit(s, i) # cont byte 3
    if b & 0xc0 != 0x80
        return reinterpret(Char, u)
    end
    u |= UInt32(b)
    return reinterpret(Char, u)
end
499

500
# Any integer unit range is normalized to `UnitRange{Int}` and indexed again.
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
×
501

502
# Substring `s[r]` as a new `String`. Unless bounds checks are elided, throws
# `BoundsError` for an out-of-range `r` and `StringIndexError` when either end of
# `r` is not a valid character index.
@inline function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    lo, hi = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        @inbounds isvalid(s, lo) || string_index_err(s, lo)
        @inbounds isvalid(s, hi) || string_index_err(s, hi)
    end
    # extend to the last code unit of the character starting at `hi`
    last_byte = nextind(s, hi) - 1
    nbytes = last_byte - lo + 1
    out = _string_n(nbytes)
    GC.@preserve s out unsafe_copyto!(pointer(out), pointer(s, lo), nbytes)
    return out
end
516

517
# nothrow because we know the start and end indices are valid
518
# Character count of the whole string: the count starts at `ncodeunits(s)` and
# `length_continued` subtracts one for each continuation byte it accepts.
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
11✔
519

520
# effects needed because @inbounds
521
# Number of characters of `s` from index `i` through `j`. Throws `BoundsError`
# unless `0 < i ≤ ncodeunits(s)+1` and `0 ≤ j < ncodeunits(s)+1` (when bounds
# checks are enabled); returns 0 when `j < i`.
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        limit = ncodeunits(s) + 1
        0 < i ≤ limit || throw(BoundsError(s, i))
        0 ≤ j < limit || throw(BoundsError(s, j))
    end
    j < i && return 0
    start = @inbounds thisind(s, i)
    # Initial count is the number of code units from `start` to `j`, less one when
    # `i` itself was not the start of a character.
    count = j - start + (start == i)
    return @inbounds length_continued(s, start, j, count)
end
531

532
# Character-counting kernel. `c` comes in as a code-unit count and is reduced by
# one for every continuation byte that follows a lead byte within the length that
# lead byte announces. Scans from `i` up to `n` and returns the adjusted count.
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
    i < n || return c
    b = codeunit(s, i)
    while true
        # skip forward until `b` is a lead byte (0xc0:0xf7); `i` already points
        # at the byte after it when the loop is left
        while true
            i += 1
            i ≤ n || return c
            0xc0 ≤ b ≤ 0xf7 && break
            b = codeunit(s, i)
        end
        lead = b
        b = codeunit(s, i) # cont byte 1
        iscont = b & 0xc0 == 0x80
        c -= iscont
        iscont & (lead ≥ 0xe0) || continue

        i += 1
        i ≤ n || return c
        b = codeunit(s, i) # cont byte 2
        iscont = b & 0xc0 == 0x80
        c -= iscont
        iscont & (lead ≥ 0xf0) || continue

        i += 1
        i ≤ n || return c
        b = codeunit(s, i) # cont byte 3
        c -= (b & 0xc0 == 0x80)
    end
end
556

557
## overload methods for efficiency ##
558

559
# `i` is a valid index when it is in bounds and is its own `thisind`.
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
596,660✔
560

561
# Delegates to the `isascii` method for the string's code units.
isascii(s::String) = isascii(codeunits(s))
2,212✔
562

563
# don't assume effects for general integers since we cannot know their implementation
564
# `Char` with a `BitInteger` count: same implementation as the `Integer` method
# (reached through `@invoke`), with `:foldable` effects asserted.
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
7,923✔
565

566
"""
567
    repeat(c::AbstractChar, r::Integer)::String
568

569
Repeat a character `r` times. This can equivalently be accomplished by calling
570
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
571

572
# Examples
573
```jldoctest
574
julia> repeat('A', 3)
575
"AAA"
576
```
577
"""
578
# Throws `ArgumentError` for a negative `r` and `OverflowError` when the byte size
# of the result cannot be represented in a `UInt`.
function repeat(c::AbstractChar, r::Integer)
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    r = UInt(r)::UInt
    c = Char(c)::Char
    r == 0 && return ""
    # Byte-swap so that the first encoded byte of `c` ends up in the low byte of `u`.
    u = bswap(reinterpret(UInt32, c))
    # Number of bytes used by `c` (1 to 4): 4 minus the count of leading zero bytes.
    n = 4 - (leading_zeros(u | 0xff) >> 3)
    # `n*r` is computed in `UInt` and would wrap silently, producing a buffer that
    # is too small for the stores below; reject such sizes up front.
    r > typemax(UInt) ÷ (n % UInt) &&
        throw(OverflowError("repeat: the size of the result overflows UInt"))
    s = _string_n(n*r)
    p = pointer(s)
    GC.@preserve s if n == 1
        memset(p, u % UInt8, r)
    elseif n == 2
        p16 = reinterpret(Ptr{UInt16}, p)
        for i = 1:r
            unsafe_store!(p16, u % UInt16, i)
        end
    elseif n == 3
        # no 3-byte store exists, so write the three bytes one at a time
        b1 = (u >> 0) % UInt8
        b2 = (u >> 8) % UInt8
        b3 = (u >> 16) % UInt8
        for i = 0:r-1
            unsafe_store!(p, b1, 3i + 1)
            unsafe_store!(p, b2, 3i + 2)
            unsafe_store!(p, b3, 3i + 3)
        end
    elseif n == 4
        p32 = reinterpret(Ptr{UInt32}, p)
        for i = 1:r
            unsafe_store!(p32, u, i)
        end
    end
    return s
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc