• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / 1407

29 Aug 2025 07:56PM UTC coverage: 74.877% (+0.05%) from 74.829%
1407

push

buildkite

web-flow
`Base`: bootstrap: eliminate `Array`-specific `length` methods (#57627)

Three methods of `length` are deleted.

Made possible by moving the following methods to earlier within the
bootstrapping:
* `size(a::Array)`
* `length(t::AbstractArray)`

Improves abstract return type inference of `f(x::Array) = length(a)`,
presumably because the method count is now low enough for the
world-splitting optimization to kick in.

2 of 2 new or added lines in 1 file covered. (100.0%)

211 existing lines in 12 files now uncovered.

63899 of 85339 relevant lines covered (74.88%)

20883607.26 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.57
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
    # the string that was being indexed
    string::AbstractString
    # the offending index into `string`
    index::Int
end
12
# Throw a `StringIndexError` for string `s` and index `i` (converted to `Int`).
# Declared `@noinline` with `s` unspecialized, so callers only pay for a plain call.
@noinline function string_index_err((@nospecialize s::AbstractString), i::Integer)
    throw(StringIndexError(s, Int(i)))
end
14
function Base.showerror(io::IO, exc::StringIndexError)
    str = exc.string
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
    # Nearby valid indices can only be suggested when the index lies inside the string.
    firstindex(str) <= exc.index <= ncodeunits(str) || return nothing
    iprev = thisind(str, exc.index)
    inext = nextind(str, iprev)
    escprev = escape_string(str[iprev:iprev])
    if inext > ncodeunits(str)
        # `iprev` starts the last character: there is no following index to report
        print(io, ", valid nearby index [$iprev]=>'$escprev'")
    else
        escnext = escape_string(str[inext:inext])
        print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
    end
end
29

30
# Inclusive range test `lo ≤ b ≤ hi`; both comparisons are always evaluated and
# combined with the non-short-circuiting `&`.
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    return (b ≥ lo) & (hi ≥ b)
end
515,548,429✔
31

32
"""
33
    String <: AbstractString
34

35
The default string type in Julia, used by e.g. string literals.
36

37
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
38
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
39
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
40
that the underlying byte sequence is valid as UTF-8.
41
"""
42
String
43

44
## constructors and conversions ##
45

46
# String constructor docstring from boot.jl, workaround for #16730
47
# and the unavailability of @doc in boot.jl context.
48
"""
49
    String(v::AbstractVector{UInt8})
50

51
Create a new `String` object using the data buffer from byte vector `v`.
52
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
53
modification of `v` cannot affect the contents of the resulting string.
54
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
55
`AbstractVector` types, `String(v)` already makes a copy.
56

57
When possible, the memory of `v` will be used without copying when the `String`
58
object is created. This is guaranteed to be the case for byte vectors returned
59
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
60
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
61
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
62
to guarantee consistent behavior.
63
"""
64
function String(v::AbstractVector{UInt8})
    # copy the bytes into fresh string-backed memory, then take that memory as a `String`
    mem = StringMemory(length(v))
    copyto!(mem, v)
    return unsafe_takestring(mem)
end
10,904,405✔
65

66
function String(v::Vector{UInt8})
    len = length(v)
    # an empty vector maps to the empty string literal; `v` is not modified in that case
    len == 0 && return ""
    ref = v.ref
    if ref.ptr_or_offset == ref.mem.ptr
        # `v` begins at the start of its backing memory: hand the memory object itself to
        # the runtime (presumably so it can be reused without a copy -- confirm against
        # `jl_genericmemory_to_string`)
        str = ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), ref.mem, len)
    else
        # `v` begins at an offset into its memory: build the string from the data pointer
        str = ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), ref, len)
    end
    # optimized empty!(v); sizehint!(v, 0) calls
    setfield!(v, :size, (0,))
    setfield!(v, :ref, memoryref(Memory{UInt8}()))
    return str
end
80

81
"""
82
    unsafe_takestring(m::Memory{UInt8})::String
83

84
Create a `String` from `m`, changing the interpretation of the contents of `m`.
85
This is done without copying, if possible. Thus, any access to `m` after
86
calling this function, either to read or to write, is undefined behavior.
87
"""
88
function unsafe_takestring(m::Memory{UInt8})
    # empty memory needs no runtime call: return the empty string literal
    isempty(m) && return ""
    return ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), m, length(m))
end
91

92
"""
93
    takestring!(x) -> String
94

95
Create a string from the content of `x`, emptying `x`.
96

97
# Examples
98
```jldoctest
99
julia> v = [0x61, 0x62, 0x63];
100

101
julia> s = takestring!(v)
102
"abc"
103

104
julia> isempty(v)
105
true
106
```
107
"""
108
function takestring!(v::Vector{UInt8})
    # `String(::Vector{UInt8})` already empties `v` while building the string
    return String(v)
end
×
109

110
"""
111
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
112

113
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
114
(The pointer can be safely freed afterwards.) If `length` is specified
115
(the length of the data in bytes), the string does not have to be NUL-terminated.
116

117
This function is labeled "unsafe" because it will crash if `p` is not
118
a valid memory address to data of the requested length.
119
"""
120
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    # a NULL pointer is rejected up front instead of being passed to the runtime
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
124
# Length-less variant: the runtime's `jl_cstr_to_string` determines the length itself.
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
128

129
# This is `@assume_effects :total !:consistent @ccall jl_alloc_string(n::Csize_t)::Ref{String}`,
130
# but the macro is not available at this time in bootstrap, so we write it manually.
131
# NOTE(review): `0x04ee` is presumably the bit-encoded form of the effects named in the
# comment above (`:total !:consistent`) -- confirm against the compiler's effects-override
# encoding before changing it.
const _string_n_override = 0x04ee
# `_string_n(n)` allocates a `String` of `n` bytes via the runtime's `jl_alloc_string`;
# `n` is converted to `Csize_t` first.
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String},
    :(Core.svec(Csize_t)), 1, QuoteNode((:ccall, _string_n_override, false)), :(convert(Csize_t, n))))
134

135
"""
136
    String(s::AbstractString)
137

138
Create a new `String` from an existing `AbstractString`.
139
"""
140
String(s::AbstractString) = print_to_string(s)
# A `Symbol` is converted by taking a byte pointer to it and copying the C string found there.
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
2,120,467✔
142

143
# Obtain a `Memory{UInt8}` for the bytes of `s` via the runtime's `jl_string_to_genericmemory`.
# NOTE(review): presumably this shares storage with `s` (hence "unsafe"), so the result
# must not be mutated -- confirm.
unsafe_wrap(::Type{Memory{UInt8}}, s::String) = ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
# Same memory as the method above, wrapped in an `Array`.
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = wrap(Array, unsafe_wrap(Memory{UInt8}, s))
48,787✔
145

146
# Copy the code units of a `String` into a freshly allocated `Vector{UInt8}`;
# the `String` methods go through `codeunits(s)`.
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))

# A code-unit wrapper converts back by returning the string stored in its `s` field.
String(s::CodeUnits{UInt8,String}) = s.s
2✔
151

152
## low-level functions ##
153

154
# Byte pointer to the string's data.
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
# Byte pointer to code unit `i` (1-based: `i == 1` gives `pointer(s)`); no bounds check.
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1

# A `String` has `Core.sizeof(s)` code units, each of type `UInt8`.
ncodeunits(s::String) = Core.sizeof(s)
codeunit(s::String) = UInt8

# Normalize any `Integer` index to `Int` and defer to the `Int` method.
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
2✔
161
# Load code unit `i` of `s`. Bounds are checked unless the caller elides the check;
# `s` is kept rooted while its raw pointer is dereferenced.
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
    @boundscheck checkbounds(s, i)
    return GC.@preserve s unsafe_load(pointer(s, i))
end
166

167
## comparison ##
168

169
# For two `String`s the generic two-argument method below is invoked directly, with its
# effects asserted as `:total`.
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})

# Compare only the common prefix, i.e. `min(sizeof(a), sizeof(b))` bytes.
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
3,197,441✔
172
# Compare the first `len` bytes behind `a` and `b` with `memcmp`, returning its result
# as an `Int`. Both arguments stay rooted while their raw pointers are in use.
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
    return GC.@preserve a b begin
        ptr_a = unsafe_convert(Ptr{UInt8}, a)
        ptr_b = unsafe_convert(Ptr{UInt8}, b)
        nbytes = len % Csize_t
        memcmp(ptr_a, ptr_b, nbytes) % Int
    end
end
179

180
# Three-way comparison of two `String`s: first by the bytes of their common prefix,
# then, if those are equal, by byte length.
function cmp(a::String, b::String)
    c = _memcmp(a, b)
    c < 0 && return -1
    c > 0 && return +1
    return cmp(sizeof(a), sizeof(b))
end
185

186
# Equality of two `String`s is delegated to `===`.
==(a::String, b::String) = a===b

# The empty string is the minimum of the `String` type; the instance method forwards to the type method.
typemin(::Type{String}) = ""
typemin(::String) = typemin(String)
×
190

191
## thisind, nextind ##
192

193
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)

# s should be String or SubString{String}
@inline function _thisind_str(s, i::Int)
    # `0` and `ncodeunits(s) + 1` are accepted and returned unchanged
    i == 0 && return 0
    n = ncodeunits(s)
    i == n + 1 && return i
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds b = codeunit(s, i)
    # fast path: byte `i` is not a continuation byte (`10xxxxxx`), or `i == 1`
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
    (@noinline function _thisind_continued(s, i, n) # mark the rest of the function as a slow-path
        local b
        # Walk back at most three bytes. A lead byte found `k` bytes back only claims
        # byte `i` when its bit pattern announces a sequence longer than `k` bytes;
        # in every other case `i` itself is returned.
        @inbounds b = codeunit(s, i-1)
        between(b, 0b11000000, 0b11110111) && return i-1
        (b & 0xc0 == 0x80) & (i-2 > 0) || return i
        @inbounds b = codeunit(s, i-2)
        between(b, 0b11100000, 0b11110111) && return i-2
        (b & 0xc0 == 0x80) & (i-3 > 0) || return i
        @inbounds b = codeunit(s, i-3)
        between(b, 0b11110000, 0b11110111) && return i-3
        return i
    end)(s, i, n)
end
216

217
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)

# s should be String or SubString{String}
@inline function _nextind_str(s, i::Int)
    # index 0 is accepted: the index after it is 1
    i == 0 && return 1
    n = ncodeunits(s)
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds l = codeunit(s, i)
    # fast path: a byte outside 0x80:0xf7 is a single code unit wide
    between(l, 0x80, 0xf7) || return i+1
    (@noinline function _nextind_continued(s, i, n, l) # mark the rest of the function as a slow-path
        if l < 0xc0
            # handle invalid codeunit index by scanning back to the start of this index
            # (which may be the same as this index)
            i′ = @inbounds thisind(s, i)
            i′ >= i && return i+1
            i = i′
            @inbounds l = codeunit(s, i)
            (l < 0x80) | (0xf8 ≤ l) && return i+1
            @assert l >= 0xc0 "invalid codeunit"
        end
        # From here on `l` is a lead byte at index `i`. Each step below consumes one
        # continuation byte and stops early at the end of the string, at a byte that is
        # not a continuation byte, or once the length announced by `l` has been reached.
        # first continuation byte
        (i += 1) > n && return i
        @inbounds b = codeunit(s, i)
        b & 0xc0 ≠ 0x80 && return i
        ((i += 1) > n) | (l < 0xe0) && return i
        # second continuation byte
        @inbounds b = codeunit(s, i)
        b & 0xc0 ≠ 0x80 && return i
        ((i += 1) > n) | (l < 0xf0) && return i
        # third continuation byte
        @inbounds b = codeunit(s, i)
        return ifelse(b & 0xc0 ≠ 0x80, i, i+1)
    end)(s, i, n, l)
end
251

252
## checking UTF-8 & ASCII validity ##
253
#=
254
    The UTF-8 Validation is performed by a shift based DFA.
255
    ┌───────────────────────────────────────────────────────────────────┐
256
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
257
    │                               ├────────3────────┐           │     │
258
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
259
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
260
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
261
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
262
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
263
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
264
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
265
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
266
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
267
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
268
    │                      │        │     ├─┤               │        │  │
269
    │                      │        └─4──►│6├─────1,9───────┘        │  │
270
    │          INVALID     │              └─┘                        │  │
271
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
272
    │          ┌▼───┴┐                                                  │
273
    │          │  2  ◄─── All undefined transitions result in state 2   │
274
    │          └─────┘                                                  │
275
    └───────────────────────────────────────────────────────────────────┘
276

277
        Validation States
278
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
279
                        If the DFA ends in this state the string is ASCII only
280
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
281
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
282
                    as seen by all 2s in that column of the table below
283
            3 -> One valid continuation byte needed to return to state 1
284
        4,5,6 -> Two valid continuation bytes needed to return to state 1
285
        7,8,9 -> Three valid continuation bytes needed to return to state 1
286

287
                        Current State
288
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
289
                0 | 0  1  2  2  2  2  2  2  2  2
290
                1 | 2  2  2  1  3  2  3  2  4  4
291
                2 | 3  3  2  2  2  2  2  2  2  2
292
                3 | 4  4  2  2  2  2  2  2  2  2
293
                4 | 6  6  2  2  2  2  2  2  2  2
294
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
295
    Class       6 | 8  8  2  2  2  2  2  2  2  2
296
                7 | 2  2  2  1  3  3  2  4  4  2
297
                8 | 2  2  2  2  2  2  2  2  2  2
298
                9 | 2  2  2  1  3  2  3  4  4  2
299
               10 | 5  5  2  2  2  2  2  2  2  2
300
               11 | 7  7  2  2  2  2  2  2  2  2
301

302
           Shifts | 0  4 10 14 18 24  8 20 12 26
303

304
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
305
    the rows the correct shift was a result.
306

307
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
308
    the current state and then masking the result with 0b11110 (0x1E) gives the shift for the new state
309

310

311
=#
312

313
#State type used by UTF-8 DFA
314
const _UTF8DFAState = UInt32
# Fill the table with 256 `_UTF8DFAState` (UInt32) entries, one per possible byte value.
# Each entry packs, for every current state, the shift of the next state; it is read back
# by shifting the entry right by the current state's shift and masking.
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
    num_classes=12
    num_states=10

    # These shifts were derived using a SMT solver
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]

    # Character class of each byte value 0x00:0xff, 16 bytes per row
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]

    # These are the rows discussed in comments above
    # (row = character class, column = current state, entry = next state)
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  2  4  4;
                     3  3  2  2  2  2  2  2  2  2;
                     4  4  2  2  2  2  2  2  2  2;
                     6  6  2  2  2  2  2  2  2  2;
                     9  9  2  2  2  2  2  2  2  2;
                     8  8  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  3  2  4  4  2;
                     2  2  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  4  4  2;
                     5  5  2  2  2  2  2  2  2  2;
                     7  7  2  2  2  2  2  2  2  2]

    #This converts the state_arrays into the shift encoded _UTF8DFAState
    class_row = zeros(_UTF8DFAState, num_classes)

    for i = 1:num_classes
        row = _UTF8DFAState(0)
        for j in 1:num_states
            #Calculate the shift required for the next state
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
            #Shift the next state into the position of the current state
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
        end
        class_row[i]=row
    end

    # One encoded row per byte value, looked up through the byte's character class
    map(c->class_row[c+1],character_classes)
end
371

372

373
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Start state; the DFA only stays here while every byte seen so far is ASCII
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) # Valid complete-character state, entered once a non-ASCII character has been seen
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop
376

377
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
378
@inline function _utf_dfa_step(state::_UTF8DFAState, byte::UInt8)
    # look up the encoded row for this byte, move the field of the current state to the
    # bottom and keep only the bits selected by the mask
    row = @inbounds _UTF8_DFA_TABLE[byte+1]
    return (row >> state) & _UTF8DFAState(0x0000001E)
end
110,816✔
379

380
# Feed `bytes[first:last]` through the DFA, starting from `state`, and return the state
# reached. An empty index range returns `state` unchanged.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
    for idx in first:last
        byte = @inbounds bytes[idx]
        state = _utf_dfa_step(state, byte)
    end
    return state
end
386

387
# Scan `cu[first:last]` in windows of `chunk_size` elements and return the start index of
# the first window that `_isascii` rejects, or `nothing` when every window passes. The
# final window is aligned to end exactly at `last`, so it may overlap the previous one.
@inline function _find_nonascii_chunk(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    pos = first
    limit = last - chunk_size
    while pos <= limit
        _isascii(cu, pos, pos + chunk_size - 1) || return pos
        pos += chunk_size
    end
    pos = limit + 1
    return _isascii(cu, pos, last) ? nothing : pos
end
397

398
##
399

400
# Classifications of string
401
    # 0: neither valid ASCII nor UTF-8
402
    # 1: valid ASCII
403
    # 2: valid UTF-8
404
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
48✔
405

406

407
# Classify a byte vector. Returns
#   0: neither valid ASCII nor UTF-8
#   1: valid ASCII
#   2: valid UTF-8
# Indices are taken from `firstindex`/`lastindex`, so vectors whose indexing does not
# start at 1 are scanned over their actual index range.
function byte_string_classify(bytes::AbstractVector{UInt8})
    chunk_size = 1024
    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
    ifirst = firstindex(bytes)
    ilast = lastindex(bytes)
    if length(bytes) > chunk_threshold
        # long input: skip the leading all-ASCII chunks before running the DFA
        start = _find_nonascii_chunk(chunk_size, bytes, ifirst, ilast)
        isnothing(start) && return 1
    else
        _isascii(bytes, ifirst, ilast) && return 1
        start = ifirst
    end
    return _byte_string_classify_nonascii(bytes, start, ilast)
end
420

421
# Classify `bytes[first:last]`, which is processed in chunks of 256 bytes. Returns 0 as
# soon as the DFA reaches the invalid state or when the input ends in a state other than
# `_UTF8_DFA_ACCEPT`; returns 2 otherwise.
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
    chunk_size = 256

    start = first
    stop = min(last,first + chunk_size - 1)
    state = _UTF8_DFA_ACCEPT
    while start <= last
        # try to process ascii chunks
        # (only while no multi-byte sequence is pending, i.e. the state is ACCEPT)
        while state == _UTF8_DFA_ACCEPT
            _isascii(bytes,start,stop) || break
            (start = start + chunk_size) <= last || break
            stop = min(last,stop + chunk_size)
        end
        # Process non ascii chunk
        state = _isvalid_utf8_dfa(state,bytes,start,stop)
        state == _UTF8_DFA_INVALID && return 0

        start = start + chunk_size
        stop = min(last,stop + chunk_size)
    end
    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
end
443

444
# Bytes or a string are valid for `String` when their classification is non-zero.
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0

# Validity of a string value is decided on its code units.
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
1,555✔
448

449
# A continuation byte has the bit pattern `10xxxxxx`.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
871✔
450

451
## required core functionality ##
452

453
@inline function iterate(s::String, i::Int=firstindex(s))
    # single unsigned comparison for `1 <= i <= ncodeunits(s)`: for `i <= 0` the
    # subtraction wraps around to a huge value
    (i % UInt) - 1 < ncodeunits(s) || return nothing
    b = @inbounds codeunit(s, i)
    # the first byte goes into the most significant byte of the `Char` bit pattern
    u = UInt32(b) << 24
    # a byte outside 0x80:0xf7 forms a `Char` on its own
    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
    return @noinline iterate_continued(s, i, u)
end
460

461
# duck-type s so that external UTF-8 string packages like StringViews can hook in
462
# Slow path of `iterate`: `u` holds the first byte at index `i` in its top 8 bits.
# Returns the assembled `Char` and the index following the bytes consumed.
function iterate_continued(s, i::Int, u::UInt32)
    # a first byte below 0xc0 is returned as a one-byte `Char`
    u < 0xc0000000 && (i += 1; @goto ret)
    n = ncodeunits(s)
    # first continuation byte
    (i += 1) > n && @goto ret
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 16
    # second continuation byte
    # (not read when the lead byte is below 0xe0)
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 8
    # third continuation byte
    # (not read when the lead byte is below 0xf0)
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b); i += 1
@label ret
    return reinterpret(Char, u), i
end
483

484
@propagate_inbounds function getindex(s::String, i::Int)
    # bounds checking happens in `codeunit` unless the caller elided it
    b = codeunit(s, i)
    # the first byte goes into the most significant byte of the `Char` bit pattern
    u = UInt32(b) << 24
    # a byte outside 0x80:0xf7 forms a `Char` on its own
    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
    return getindex_continued(s, i, u)
end
490

491
# duck-type s so that external UTF-8 string packages like StringViews can hook in
492
# Slow path of `getindex`: `u` holds the byte at index `i` in its top 8 bits.
function getindex_continued(s, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        # a first byte below 0xc0 is only accepted when `i` is a valid index;
        # otherwise a `StringIndexError` is thrown
        @inbounds isvalid(s, i) && @goto ret
        string_index_err(s, i)
    end
    n = ncodeunits(s)

    (i += 1) > n && @goto ret
    @inbounds b = codeunit(s, i) # cont byte 1
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 16

    # stop after one continuation byte when the lead byte is below 0xe0
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
    @inbounds b = codeunit(s, i) # cont byte 2
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 8

    # stop after two continuation bytes when the lead byte is below 0xf0
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
    @inbounds b = codeunit(s, i) # cont byte 3
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b)
@label ret
    return reinterpret(Char, u)
end
517

518
# Any integer unit range is normalized to a `UnitRange{Int}` and handled by the method below.
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]

@inline function getindex(s::String, r::UnitRange{Int})
    # an empty range yields the empty string without any bounds or validity check
    isempty(r) && return ""
    i, j = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        # both endpoints must be valid character indices
        @inbounds isvalid(s, i) || string_index_err(s, i)
        @inbounds isvalid(s, j) || string_index_err(s, j)
    end
    # extend `j` to the last code unit of the character that starts at `j`
    j = nextind(s, j) - 1
    n = j - i + 1
    ss = _string_n(n)
    # copy the `n` bytes into the new string; both strings stay rooted during the raw copy
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
    return ss
end
534

535
# nothrow because we know the start and end indices are valid
536
# Count over the whole string, starting from the code-unit count as the initial value.
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))

# effects needed because @inbounds
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        # `i` may be `ncodeunits(s) + 1` and `j` may be `0`
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    j < i && return 0
    # move `i` back to the start of its character, remembering the original value in `k`
    @inbounds i, k = thisind(s, i), i
    # initial count: one per code unit in `i:j`, minus one when `i` was moved back
    c = j - i + (i == k)
    @inbounds length_continued(s, i, j, c)
end
549

550
# Scan code units `i:n` of `s`, starting from the count `c`, and subtract one from `c` for
# every continuation byte (`10xxxxxx`) that follows a lead byte within the length that
# lead byte announces. Returns the adjusted count once the scan reaches `n`.
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
    i < n || return c
    b = codeunit(s, i)
    while true
        # skip ahead until `b` is a lead byte (0xc0:0xf7); `i` already points past `b`
        while true
            (i += 1) ≤ n || return c
            0xc0 ≤ b ≤ 0xf7 && break
            b = codeunit(s, i)
        end
        l = b
        b = codeunit(s, i) # cont byte 1
        c -= (x = b & 0xc0 == 0x80)
        # go on only if that byte was a continuation and `l` announces at least 3 bytes
        x & (l ≥ 0xe0) || continue

        (i += 1) ≤ n || return c
        b = codeunit(s, i) # cont byte 2
        c -= (x = b & 0xc0 == 0x80)
        # go on only if that byte was a continuation and `l` announces 4 bytes
        x & (l ≥ 0xf0) || continue

        (i += 1) ≤ n || return c
        b = codeunit(s, i) # cont byte 3
        c -= (b & 0xc0 == 0x80)
    end
end
574

575
## overload methods for efficiency ##
576

577
# An index is valid when it is in bounds and is the start of its own character.
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i

# ASCII-ness of a `String` is decided on its code units.
isascii(s::String) = isascii(codeunits(s))

# don't assume effects for general integers since we cannot know their implementation
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
7,187,317✔
583

584
"""
585
    repeat(c::AbstractChar, r::Integer)::String
586

587
Repeat a character `r` times. This can equivalently be accomplished by calling
588
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
589

590
# Examples
591
```jldoctest
592
julia> repeat('A', 3)
593
"AAA"
594
```
595
"""
596
function repeat(c::AbstractChar, r::Integer)
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    r = UInt(r)::UInt
    c = Char(c)::Char
    r == 0 && return ""
    # byte-swapped so that the first encoded byte of `c` sits in the lowest byte of `u`
    u = bswap(reinterpret(UInt32, c))
    # number of bytes in the encoding of `c`: 1 to 4
    n = 4 - (leading_zeros(u | 0xff) >> 3)
    # `n*r` is computed in wrapping `UInt` arithmetic. Without this guard a huge `r`
    # combined with a multi-byte character would allocate a string that is too small
    # and the unchecked stores below would write past its end.
    r > typemax(UInt) ÷ UInt(n) && throw(OverflowError("repeated string size overflows"))
    s = _string_n(n*r)
    p = pointer(s)
    GC.@preserve s if n == 1
        memset(p, u % UInt8, r)
    elseif n == 2
        # store the two bytes of each character as one 16-bit unit
        p16 = reinterpret(Ptr{UInt16}, p)
        for i = 1:r
            unsafe_store!(p16, u % UInt16, i)
        end
    elseif n == 3
        # no 3-byte store exists: write the three bytes of each character individually
        b1 = (u >> 0) % UInt8
        b2 = (u >> 8) % UInt8
        b3 = (u >> 16) % UInt8
        for i = 0:r-1
            unsafe_store!(p, b1, 3i + 1)
            unsafe_store!(p, b2, 3i + 2)
            unsafe_store!(p, b3, 3i + 3)
        end
    elseif n == 4
        # store the four bytes of each character as one 32-bit unit
        p32 = reinterpret(Ptr{UInt32}, p)
        for i = 1:r
            unsafe_store!(p32, u, i)
        end
    end
    return s
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc