• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / 1572

01 Feb 2026 09:55PM UTC coverage: 76.677% (-0.07%) from 76.749%
1572

push

buildkite

web-flow
docs: clarify 'using A, B' semantics (#60856)

Resolves #36090

62889 of 82018 relevant lines covered (76.68%)

23269256.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.09
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
9
    string::AbstractString
10✔
10
    index::Int
11
end
12
# Throw a `StringIndexError` for string `s` at index `i` (converted to `Int`).
# Marked `@noinline`; `s` is not specialized on.
@noinline function string_index_err((@nospecialize s::AbstractString), i::Integer)
    idx = Int(i)
    throw(StringIndexError(s, idx))
end
14
# Print a description of `exc` to `io`. When the offending index lies within the
# code-unit range of the string, the nearest valid index (and the one following
# it, if any) is listed together with the escaped character found there.
function showerror(io::IO, exc::StringIndexError)
    str, idx = exc.string, exc.index
    print(io, "StringIndexError: ", "invalid index [$(idx)]")
    # No neighbouring indices can be suggested for an index outside the string.
    (firstindex(str) <= idx <= ncodeunits(str)) || return nothing
    before = thisind(str, idx)
    after = nextind(str, before)
    before_txt = escape_string(str[before:before])
    if after > ncodeunits(str)
        print(io, ", valid nearby index [$before]=>'$before_txt'")
    else
        after_txt = escape_string(str[after:after])
        print(io, ", valid nearby indices [$before]=>'$before_txt', [$after]=>'$after_txt'")
    end
    return nothing
end
29

30
# `true` iff `lo ≤ b ≤ hi`; uses non-short-circuiting `&` on the two comparisons.
@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (b ≥ lo) & (hi ≥ b)
748,300,890✔
31

32
"""
33
    String <: AbstractString
34

35
The default string type in Julia, used by e.g. string literals.
36

37
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
38
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
39
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
40
that the underlying byte sequence is valid as UTF-8.
41
"""
42
String
43

44
## constructors and conversions ##
45

46
# String constructor docstring from boot.jl, workaround for #16730
47
# and the unavailability of @doc in boot.jl context.
48
"""
49
    String(v::AbstractVector{UInt8})
50

51
Create a new `String` object using the data buffer from byte vector `v`.
52
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
53
modification of `v` cannot affect the contents of the resulting string.
54
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
55
`AbstractVector` types, `String(v)` already makes a copy.
56

57
When possible, the memory of `v` will be used without copying when the `String`
58
object is created. This is guaranteed to be the case for byte vectors returned
59
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
60
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
61
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
62
to guarantee consistent behavior.
63
"""
64
String(v::AbstractVector{UInt8}) = unsafe_takestring(copyto!(StringMemory(length(v)), v))
14,309,287✔
65

66
function String(v::Vector{UInt8})
    nbytes = length(v)
    if nbytes == 0
        return ""
    end
    mref = v.ref
    # Two runtime entry points: when the vector's ref points at the start of its
    # memory the whole `Memory` object is passed, otherwise the ref itself is
    # passed as a byte pointer together with the length.
    str = if mref.ptr_or_offset == mref.mem.ptr
        ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), mref.mem, nbytes)
    else
        ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), mref, nbytes)
    end
    # optimized empty!(v); sizehint!(v, 0) calls
    setfield!(v, :size, (0,))
    setfield!(v, :ref, memoryref(Memory{UInt8}()))
    return str
end
80

81
"""
82
    unsafe_takestring(m::Memory{UInt8})::String
83

84
Create a `String` from `m`, changing the interpretation of the contents of `m`.
85
This is done without copying, if possible. Thus, any access to `m` after
86
calling this function, either to read or to write, is undefined behavior.
87
"""
88
function unsafe_takestring(m::Memory{UInt8})
    # Empty memory maps to the empty string without calling into the runtime.
    isempty(m) && return ""
    return ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), m, length(m))
end
91

92
"""
93
    takestring!(x) -> String
94

95
Create a string from the content of `x`, emptying `x`.
96

97
# Examples
98
```jldoctest
99
julia> v = [0x61, 0x62, 0x63];
100

101
julia> s = takestring!(v)
102
"abc"
103

104
julia> isempty(v)
105
true
106
```
107
"""
108
function takestring!(v::Vector{UInt8})
    return String(v)
end
×
109

110
"""
111
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
112
    unsafe_string(p::Cstring)
113

114
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
115
(The pointer can be safely freed afterwards.) If `length` is specified
116
(the length of the data in bytes), the string does not have to be NUL-terminated.
117

118
This function is labeled "unsafe" because it will crash if `p` is not
119
a valid memory address to data of the requested length.
120
"""
121
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    # A NULL pointer is rejected up front instead of being handed to the runtime.
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
125
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    # A NULL pointer is rejected up front instead of being handed to the runtime.
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
129

130
# This is `@assume_effects :total !:consistent @ccall jl_alloc_string(n::Csize_t)::Ref{String}`,
131
# but the macro is not available at this time in bootstrap, so we write it manually.
132
const _string_n_override = 0x04ee
133
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String},
140,175,851✔
134
    :(Core.svec(Csize_t)), 1, QuoteNode((:ccall, _string_n_override, false)), :(convert(Csize_t, n))))
135

136
"""
137
    String(s::AbstractString)
138

139
Create a new `String` from an existing `AbstractString`.
140
"""
141
String(s::AbstractString) = print_to_string(s)
934✔
142
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
6,224,209✔
143

144
unsafe_wrap(::Type{Memory{UInt8}}, s::String) = ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
24,523,167✔
145
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = wrap(Array, unsafe_wrap(Memory{UInt8}, s))
116,810✔
146

147
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
62,918✔
148
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
62,878✔
149
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
150

151
String(s::CodeUnits{UInt8,String}) = s.s
2✔
152

153
## low-level functions ##
154

155
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
1,622,962,301✔
156
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
729,310,028✔
157

158
ncodeunits(s::String) = Core.sizeof(s)
1,148,057,118✔
159
codeunit(s::String) = UInt8
15,702,104✔
160

161
codeunit(s::String, i::Integer) = codeunit(s, Int(i)::Int)
4✔
162
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
13,678✔
163
    @boundscheck checkbounds(s, i)
719,735,540✔
164
    b = GC.@preserve s unsafe_load(pointer(s, i))
719,735,540✔
165
    return b
715,120,931✔
166
end
167

168
## comparison ##
169

170
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
852,982✔
171

172
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
2,023,347✔
173
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
40✔
174
    GC.@preserve a b begin
2,288,731✔
175
        pa = unsafe_convert(Ptr{UInt8}, a)
2,288,731✔
176
        pb = unsafe_convert(Ptr{UInt8}, b)
2,288,731✔
177
        memcmp(pa, pb, len % Csize_t) % Int
2,288,731✔
178
    end
179
end
180

181
# Three-way comparison of two `String`s: the sign of the byte-wise comparison of
# their common prefix decides; if that is zero, the byte lengths are compared.
function cmp(a::String, b::String)
    c = _memcmp(a, b)
    c < 0 && return -1
    c > 0 && return +1
    return cmp(sizeof(a), sizeof(b))
end
186

187
==(a::String, b::String) = a===b
22,604,845✔
188

189
typemin(::Type{String}) = ""
×
190
typemin(::String) = typemin(String)
×
191

192
## thisind, nextind ##
193

194
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
114,804,029✔
195

196
# s should be String or SubString{String}
197
@inline function _thisind_str(s, i::Int)
6,722✔
198
    i == 0 && return 0
57,891,667✔
199
    n = ncodeunits(s)
57,774,809✔
200
    i == n + 1 && return i
57,774,809✔
201
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
57,774,795✔
202
    @inbounds b = codeunit(s, i)
57,774,795✔
203
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
106,477,755✔
204
    (@noinline function _thisind_continued(s, i, n) # mark the rest of the function as a slow-path
8,821,066✔
205
        local b
173,970✔
206
        @inbounds b = codeunit(s, i-1)
173,970✔
207
        between(b, 0b11000000, 0b11110111) && return i-1
173,970✔
208
        (b & 0xc0 == 0x80) & (i-2 > 0) || return i
112,703✔
209
        @inbounds b = codeunit(s, i-2)
112,703✔
210
        between(b, 0b11100000, 0b11110111) && return i-2
112,703✔
211
        (b & 0xc0 == 0x80) & (i-3 > 0) || return i
20✔
212
        @inbounds b = codeunit(s, i-3)
20✔
213
        between(b, 0b11110000, 0b11110111) && return i-3
20✔
214
        return i
×
215
    end)(s, i, n)
216
end
217

218
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
30,193,990✔
219

220
# s should be String or SubString{String}
221
@inline function _nextind_str(s, i::Int)
2,966✔
222
    i == 0 && return 1
92,335,445✔
223
    n = ncodeunits(s)
92,320,925✔
224
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
92,320,925✔
225
    @inbounds l = codeunit(s, i)
92,320,925✔
226
    between(l, 0x80, 0xf7) || return i+1
184,427,152✔
227
    (@noinline function _nextind_continued(s, i, n, l) # mark the rest of the function as a slow-path
267,324✔
228
        if l < 0xc0
52,630✔
229
            # handle invalid codeunit index by scanning back to the start of this index
230
            # (which may be the same as this index)
231
            i′ = @inbounds thisind(s, i)
×
232
            i′ >= i && return i+1
×
233
            i = i′
×
234
            @inbounds l = codeunit(s, i)
×
235
            (l < 0x80) | (0xf8 ≤ l) && return i+1
×
236
            @assert l >= 0xc0 "invalid codeunit"
×
237
        end
238
        # first continuation byte
239
        (i += 1) > n && return i
52,630✔
240
        @inbounds b = codeunit(s, i)
52,630✔
241
        b & 0xc0 ≠ 0x80 && return i
52,630✔
242
        ((i += 1) > n) | (l < 0xe0) && return i
52,630✔
243
        # second continuation byte
244
        @inbounds b = codeunit(s, i)
52,604✔
245
        b & 0xc0 ≠ 0x80 && return i
52,604✔
246
        ((i += 1) > n) | (l < 0xf0) && return i
52,604✔
247
        # third continuation byte
248
        @inbounds b = codeunit(s, i)
×
249
        return ifelse(b & 0xc0 ≠ 0x80, i, i+1)
×
250
    end)(s, i, n, l)
251
end
252

253
## checking UTF-8 & ASCII validity ##
254
#=
255
    The UTF-8 Validation is performed by a shift based DFA.
256
    ┌───────────────────────────────────────────────────────────────────┐
257
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
258
    │                               ├────────3────────┐           │     │
259
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
260
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
261
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
262
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
263
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
264
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
265
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
266
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
267
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
268
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
269
    │                      │        │     ├─┤               │        │  │
270
    │                      │        └─4──►│6├─────1,9───────┘        │  │
271
    │          INVALID     │              └─┘                        │  │
272
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
273
    │          ┌▼───┴┐                                                  │
274
    │          │  2  ◄─── All undefined transitions result in state 2   │
275
    │          └─────┘                                                  │
276
    └───────────────────────────────────────────────────────────────────┘
277

278
        Validation States
279
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
280
                        If the DFA ends in this state the string is ASCII only
281
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
282
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
283
                    as seen by all 2s in that column of the table below
284
            3 -> One valid continuation byte needed to return to state 1
285
        4,5,6 -> Two valid continuation bytes needed to return to state 1
286
        7,8,9 -> Three valid continuation bytes needed to return to state 1
287

288
                        Current State
289
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
290
                0 | 0  1  2  2  2  2  2  2  2  2
291
                1 | 2  2  2  1  3  2  3  2  4  4
292
                2 | 3  3  2  2  2  2  2  2  2  2
293
                3 | 4  4  2  2  2  2  2  2  2  2
294
                4 | 6  6  2  2  2  2  2  2  2  2
295
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
296
    Class       6 | 8  8  2  2  2  2  2  2  2  2
297
                7 | 2  2  2  1  3  3  2  4  4  2
298
                8 | 2  2  2  2  2  2  2  2  2  2
299
                9 | 2  2  2  1  3  2  3  4  4  2
300
               10 | 5  5  2  2  2  2  2  2  2  2
301
               11 | 7  7  2  2  2  2  2  2  2  2
302

303
           Shifts | 0  4 10 14 18 24  8 20 12 26
304

305
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
306
    the rows the correct shift was a result.
307

308
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
309
    the current state and then masking the result with 0b11110 (0x1E) gives the shift for the new state
310

311

312
=#
313

314
#State type used by UTF-8 DFA
315
const _UTF8DFAState = UInt32
316
# Fill the table with 256 _UTF8DFAState (UInt32) values representing the DFA transitions for all bytes
317
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
318
    num_classes=12
319
    num_states=10
320
    bit_per_state = 6
321

322
    # These shifts were derived using a SMT solver
323
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
324

325
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
326
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
327
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
328
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
329
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
330
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
331
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
332
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
333
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
334
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
335
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
336
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
337
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
338
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
339
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
340
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
341

342
    # These are the rows discussed in comments above
343
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
344
                     2  2  2  1  3  2  3  2  4  4;
345
                     3  3  2  2  2  2  2  2  2  2;
346
                     4  4  2  2  2  2  2  2  2  2;
347
                     6  6  2  2  2  2  2  2  2  2;
348
                     9  9  2  2  2  2  2  2  2  2;
349
                     8  8  2  2  2  2  2  2  2  2;
350
                     2  2  2  1  3  3  2  4  4  2;
351
                     2  2  2  2  2  2  2  2  2  2;
352
                     2  2  2  1  3  2  3  4  4  2;
353
                     5  5  2  2  2  2  2  2  2  2;
354
                     7  7  2  2  2  2  2  2  2  2]
355

356
    #This converts the state_arrays into the shift encoded _UTF8DFAState
357
    class_row = zeros(_UTF8DFAState, num_classes)
358

359
    for i = 1:num_classes
360
        row = _UTF8DFAState(0)
361
        for j in 1:num_states
362
            #Calculate the shift required for the next state
363
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
364
            #Shift the next state into the position of the current state
365
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
366
        end
367
        class_row[i]=row
368
    end
369

370
    map(c->class_row[c+1],character_classes)
×
371
end
372

373

374
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Start state; the DFA only stays here while the input is ASCII-only
375
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) #This state represents the start and end of any valid string
376
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop
377

378
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
379
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
110,816✔
380

381
# Feed `bytes[first:last]` through the DFA one byte at a time, starting from
# `state`, and return the state reached after the last byte.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
    for pos in first:last
        @inbounds state = _utf_dfa_step(state, bytes[pos])
    end
    return state
end
387

388
# Scan `cu[first:last]` in windows of `chunk_size` elements and return the start
# index of the first window for which `_isascii` is false, or `nothing` if every
# window passes.
@inline function _find_nonascii_chunk(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    pos = first
    while pos <= last - chunk_size
        if !_isascii(cu, pos, pos + chunk_size - 1)
            return pos
        end
        pos += chunk_size
    end
    # The final window is aligned to end at `last`, so it may overlap the
    # previously checked window.
    pos = last - chunk_size + 1
    return _isascii(cu, pos, last) ? nothing : pos
end
398

399
##
400

401
# Classifications of string
402
    # 0: neither valid ASCII nor UTF-8
403
    # 1: valid ASCII
404
    # 2: valid UTF-8
405
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
48✔
406

407

408
# Classify `bytes` as 0 (neither valid ASCII nor UTF-8), 1 (valid ASCII) or
# 2 (valid UTF-8).
#
# Inputs longer than `chunk_threshold` are first scanned chunk-wise for the first
# non-ASCII chunk; shorter inputs get a single ASCII check. Anything that is not
# pure ASCII is handed to `_byte_string_classify_nonascii` from the first
# position that may contain non-ASCII data.
function byte_string_classify(bytes::AbstractVector{UInt8})
    chunk_size = 1024
    chunk_threshold = chunk_size + (chunk_size ÷ 2)
    # Use the vector's own index range rather than assuming `1:length(bytes)`,
    # so vectors with non-1-based indexing are scanned over valid indices.
    # For ordinary 1-based vectors this is exactly `1` and `length(bytes)`.
    lo = Int(firstindex(bytes))
    hi = Int(lastindex(bytes))
    if length(bytes) > chunk_threshold
        start = _find_nonascii_chunk(chunk_size, bytes, lo, hi)
        isnothing(start) && return 1
    else
        _isascii(bytes, lo, hi) && return 1
        start = lo
    end
    return _byte_string_classify_nonascii(bytes, start, hi)
end
421

422
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
40,620✔
423
    chunk_size = 256
40,620✔
424

425
    start = first
40,620✔
426
    stop = min(last,first + chunk_size - 1)
40,620✔
427
    state = _UTF8_DFA_ACCEPT
40,620✔
428
    while start <= last
50,330✔
429
        # try to process ascii chunks
430
        while state == _UTF8_DFA_ACCEPT
40,620✔
431
            _isascii(bytes,start,stop) || break
40,620✔
432
            (start = start + chunk_size) <= last || break
×
433
            stop = min(last,stop + chunk_size)
×
434
        end
×
435
        # Process non ascii chunk
436
        state = _isvalid_utf8_dfa(state,bytes,start,stop)
99,552✔
437
        state == _UTF8_DFA_INVALID && return 0
40,620✔
438

439
        start = start + chunk_size
9,710✔
440
        stop = min(last,stop + chunk_size)
9,710✔
441
    end
9,710✔
442
    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
9,710✔
443
end
444

445
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
42,073✔
446
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
48✔
447

448
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
1,555✔
449

450
is_valid_continuation(c) = c & 0xc0 == 0x80
855✔
451

452
## required core functionality ##
453

454
@inline function iterate(s::String, i::Int=firstindex(s))
    # One unsigned comparison rejects both `i < 1` (wraps to a huge value) and
    # `i > ncodeunits(s)`.
    if !((i % UInt) - 1 < ncodeunits(s))
        return nothing
    end
    lead = @inbounds codeunit(s, i)
    bits = UInt32(lead) << 24
    # Lead bytes in 0x80:0xf7 take the out-of-line path; every other byte is
    # returned as a one-code-unit `Char`.
    if between(lead, 0x80, 0xf7)
        return @noinline iterate_continued(s, i, bits)
    end
    return reinterpret(Char, bits), i+1
end
461

462
# duck-type s so that external UTF-8 string packages like StringViews can hook in
463
function iterate_continued(s, i::Int, u::UInt32)
258,685✔
464
    @label _ begin
258,685✔
465
        u < 0xc0000000 && (i += 1; break _)
258,685✔
466
        n = ncodeunits(s)
244,032✔
467
        # first continuation byte
468
        (i += 1) > n && break _
244,032✔
469
        @inbounds b = codeunit(s, i)
240,894✔
470
        b & 0xc0 == 0x80 || break _
240,894✔
471
        u |= UInt32(b) << 16
232,247✔
472
        # second continuation byte
473
        ((i += 1) > n) | (u < 0xe0000000) && break _
232,247✔
474
        @inbounds b = codeunit(s, i)
144,578✔
475
        b & 0xc0 == 0x80 || break _
144,578✔
476
        u |= UInt32(b) << 8
144,578✔
477
        # third continuation byte
478
        ((i += 1) > n) | (u < 0xf0000000) && break _
144,578✔
479
        @inbounds b = codeunit(s, i)
2,061✔
480
        b & 0xc0 == 0x80 || break _
2,061✔
481
        u |= UInt32(b); i += 1
2,061✔
482
    end
483
    return reinterpret(Char, u), i
258,685✔
484
end
485

486
@propagate_inbounds function getindex(s::String, i::Int)
    lead = codeunit(s, i)
    bits = UInt32(lead) << 24
    # Lead bytes in 0x80:0xf7 are finished by `getindex_continued`; every other
    # byte is returned directly as a `Char` built from that single code unit.
    if between(lead, 0x80, 0xf7)
        return getindex_continued(s, i, bits)
    end
    return reinterpret(Char, bits)
end
492

493
# duck-type s so that external UTF-8 string packages like StringViews can hook in
494
function getindex_continued(s, i::Int, u::UInt32)
17,547✔
495
    @label _ begin
17,547✔
496
        if u < 0xc0000000
17,547✔
497
            # called from `getindex` which checks bounds
498
            @inbounds isvalid(s, i) && break _
×
499
            string_index_err(s, i)
×
500
        end
501
        n = ncodeunits(s)
17,547✔
502

503
        (i += 1) > n && break _
17,547✔
504
        @inbounds b = codeunit(s, i) # cont byte 1
17,547✔
505
        b & 0xc0 == 0x80 || break _
17,547✔
506
        u |= UInt32(b) << 16
17,547✔
507

508
        ((i += 1) > n) | (u < 0xe0000000) && break _
17,547✔
509
        @inbounds b = codeunit(s, i) # cont byte 2
17,521✔
510
        b & 0xc0 == 0x80 || break _
17,521✔
511
        u |= UInt32(b) << 8
17,521✔
512

513
        ((i += 1) > n) | (u < 0xf0000000) && break _
17,521✔
514
        @inbounds b = codeunit(s, i) # cont byte 3
40✔
515
        b & 0xc0 == 0x80 || break _
40✔
516
        u |= UInt32(b)
40✔
517
    end
518
    return reinterpret(Char, u)
17,547✔
519
end
520

521
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
8✔
522

523
@inline function getindex(s::String, r::UnitRange{Int})
    if isempty(r)
        return ""
    end
    lo = first(r)
    hi = last(r)
    @boundscheck begin
        checkbounds(s, r)
        # Both endpoints must be valid indices of `s`.
        @inbounds isvalid(s, lo) || string_index_err(s, lo)
        @inbounds isvalid(s, hi) || string_index_err(s, hi)
    end
    # Extend the end to the code unit just before the next index after `hi`.
    stop = nextind(s, hi) - 1
    nbytes = stop - lo + 1
    out = _string_n(nbytes)
    GC.@preserve s out unsafe_copyto!(pointer(out), pointer(s, lo), nbytes)
    return out
end
537

538
# nothrow because we know the start and end indices are valid
539
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
105,940✔
540

541
# effects needed because @inbounds
542
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    if j < i
        return 0
    end
    start = @inbounds thisind(s, i)
    # Initial count over start:j, plus one when `i` itself was already the
    # index returned by `thisind`; `length_continued` adjusts it from there.
    total = j - start + (start == i)
    return @inbounds length_continued(s, start, j, total)
end
552

553
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
4✔
554
    i < n || return c
165,051✔
555
    b = codeunit(s, i)
161,537✔
556
    while true
910,477✔
557
        while true
3,410,029✔
558
            (i += 1) ≤ n || return c
19,427,796✔
559
            0xc0 ≤ b ≤ 0xf7 && break
19,107,318✔
560
            b = codeunit(s, i)
18,357,080✔
561
        end
18,357,080✔
562
        l = b
10✔
563
        b = codeunit(s, i) # cont byte 1
750,238✔
564
        c -= (x = b & 0xc0 == 0x80)
750,238✔
565
        x & (l ≥ 0xe0) || continue
750,238✔
566

567
        (i += 1) ≤ n || return c
60,470✔
568
        b = codeunit(s, i) # cont byte 2
57,874✔
569
        c -= (x = b & 0xc0 == 0x80)
57,874✔
570
        x & (l ≥ 0xf0) || continue
115,748✔
571

572
        (i += 1) ≤ n || return c
×
573
        b = codeunit(s, i) # cont byte 3
×
574
        c -= (b & 0xc0 == 0x80)
×
575
    end
748,940✔
576
end
577

578
## overload methods for efficiency ##
579

580
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
98,936,661✔
581

582
isascii(s::String) = isascii(codeunits(s))
6,101,021✔
583

584
# don't assume effects for general integers since we cannot know their implementation
585
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
7,802,736✔
586

587
"""
588
    repeat(c::AbstractChar, r::Integer)::String
589

590
Repeat a character `r` times. This can equivalently be accomplished by calling
591
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
592

593
# Examples
594
```jldoctest
595
julia> repeat('A', 3)
596
"AAA"
597
```
598
"""
599
function repeat(c::AbstractChar, r::Integer)
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    r = UInt(r)::UInt
    c = Char(c)::Char
    r == 0 && return ""
    # `u` holds the byte-swapped raw bits of `c`; `n` counts the bytes of `u` up
    # to and including its highest non-zero byte, so it is always in 1:4.
    u = bswap(reinterpret(UInt32, c))
    n = 4 - (leading_zeros(u | 0xff) >> 3)
    # `n*r` is computed in wrapping `UInt` arithmetic. If it wrapped, a string
    # smaller than the `n*r` bytes stored below would be allocated and the
    # stores would write past its end, so refuse such requests up front.
    r > typemax(UInt) ÷ UInt(n) && throw(OutOfMemoryError())
    s = _string_n(n*r)
    p = pointer(s)
    GC.@preserve s if n == 1
        memset(p, u % UInt8, r)
    elseif n == 2
        p16 = reinterpret(Ptr{UInt16}, p)
        for i = 1:r
            unsafe_store!(p16, u % UInt16, i)
        end
    elseif n == 3
        # No 3-byte store is available, so the three bytes are written one by one.
        b1 = (u >> 0) % UInt8
        b2 = (u >> 8) % UInt8
        b3 = (u >> 16) % UInt8
        for i = 0:r-1
            unsafe_store!(p, b1, 3i + 1)
            unsafe_store!(p, b2, 3i + 2)
            unsafe_store!(p, b3, 3i + 3)
        end
    elseif n == 4
        p32 = reinterpret(Ptr{UInt32}, p)
        for i = 1:r
            unsafe_store!(p32, u, i)
        end
    end
    return s
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc