• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / 1541

08 Dec 2025 04:29PM UTC coverage: 76.699% (-0.04%) from 76.74%
1541

push

buildkite

web-flow
codegen: implement `sret_union` ABI for pointer-ful types (#55045)

This effectively expands our existing `union` ABI to cover both of these
existing cases:
 - `sret`  ABI (which can stack-allocate a _single pointer-ful_ type)
 - `union` ABI (which can stack-allocate _many pointer-free_ types)

This provides some nice speed-ups for temporary "wrappers":
```julia
const v = Any[]
@noinline maybe_wrapped(i) = (i % 32 != 0) ? Some(v) : nothing
function foo()
    count = 0
    for i = 1:1_000_000
        count += (maybe_wrapped(i) !== nothing) ? 1 : 0
    end
    return count
end
```

On this PR this gives:
```julia
julia> @btime foo()
  1.675 ms (0 allocations: 0 bytes)
968750
```

compared to current master:
```julia
julia> @btime foo()
  6.877 ms (968750 allocations: 14.78 MiB)
968750
```

Co-authored-by: Gabriel Baraldi <baraldigabriel@gmail.com>
Co-authored-by: Jameson Nash <vtjnash@gmail.com>

62469 of 81447 relevant lines covered (76.7%)

22102469.65 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.06
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
9
    # The string that was being accessed (`str` in the docstring above).
    string::AbstractString
10✔
10
    # The index into `string` that was not valid (`i` in the docstring above).
    index::Int
11
end
12
# Out-of-line (`@noinline`) helper that throws a `StringIndexError` for string `s`
# and index `i`; the index is converted to `Int` before it is stored in the exception.
@noinline function string_index_err((@nospecialize s::AbstractString), i::Integer)
    throw(StringIndexError(s, Int(i)))
end
14
# Print a description of `exc` to `io`. When the offending index lies inside the code
# units of the string, the nearest valid index (and the one following it, if it exists)
# is printed together with the escaped character found there.
function showerror(io::IO, exc::StringIndexError)
    s = exc.string
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
    # Outside the code-unit range there is no neighbouring character to suggest.
    (firstindex(s) <= exc.index <= ncodeunits(s)) || return nothing
    iprev = thisind(s, exc.index)
    inext = nextind(s, iprev)
    escprev = escape_string(s[iprev:iprev])
    if inext > ncodeunits(s)
        # `iprev` is the last character of the string: only one suggestion.
        print(io, ", valid nearby index [$iprev]=>'$escprev'")
    else
        escnext = escape_string(s[inext:inext])
        print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
    end
    return nothing
end
29

30
# Inclusive range test `lo ≤ b ≤ hi`, combined with the non-short-circuiting `&`.
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    return (lo ≤ b) & (b ≤ hi)
end
511,991,350✔
31

32
"""
33
    String <: AbstractString
34

35
The default string type in Julia, used by e.g. string literals.
36

37
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
38
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
39
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
40
that the underlying byte sequence is valid as UTF-8.
41
"""
42
String
43

44
## constructors and conversions ##
45

46
# String constructor docstring from boot.jl, workaround for #16730
47
# and the unavailability of @doc in boot.jl context.
48
"""
49
    String(v::AbstractVector{UInt8})
50

51
Create a new `String` object using the data buffer from byte vector `v`.
52
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
53
modification of `v` cannot affect the contents of the resulting string.
54
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
55
`AbstractVector` types, `String(v)` already makes a copy.
56

57
When possible, the memory of `v` will be used without copying when the `String`
58
object is created. This is guaranteed to be the case for byte vectors returned
59
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
60
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
61
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
62
to guarantee consistent behavior.
63
"""
64
function String(v::AbstractVector{UInt8})
    # Copy the bytes into freshly allocated string memory, then take ownership of it.
    buf = StringMemory(length(v))
    copyto!(buf, v)
    return unsafe_takestring(buf)
end
14,282,807✔
65

66
function String(v::Vector{UInt8})
    nbytes = length(v)
    nbytes == 0 && return ""
    r = v.ref
    # If the vector's reference points at the start of its backing memory, the memory
    # object itself is handed to the runtime; otherwise the string is built from the
    # `nbytes` bytes starting at the reference.
    str = if r.ptr_or_offset == r.mem.ptr
        ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), r.mem, nbytes)
    else
        ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), r, nbytes)
    end
    # optimized empty!(v); sizehint!(v, 0) calls
    setfield!(v, :size, (0,))
    setfield!(v, :ref, memoryref(Memory{UInt8}()))
    return str
end
80

81
"""
82
    unsafe_takestring(m::Memory{UInt8})::String
83

84
Create a `String` from `m`, changing the interpretation of the contents of `m`.
85
This is done without copying, if possible. Thus, any access to `m` after
86
calling this function, either to read or to write, is undefined behavior.
87
"""
88
function unsafe_takestring(m::Memory{UInt8})
    # Empty memory maps to the empty string literal without calling into the runtime.
    if isempty(m)
        return ""
    end
    return ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), m, length(m))
end
91

92
"""
93
    takestring!(x) -> String
94

95
Create a string from the content of `x`, emptying `x`.
96

97
# Examples
98
```jldoctest
99
julia> v = [0x61, 0x62, 0x63];
100

101
julia> s = takestring!(v)
102
"abc"
103

104
julia> isempty(v)
105
true
106
```
107
"""
108
function takestring!(v::Vector{UInt8})
    return String(v)
end
×
109

110
"""
111
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
112
    unsafe_string(p::Cstring)
113

114
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
115
(The pointer can be safely freed afterwards.) If `length` is specified
116
(the length of the data in bytes), the string does not have to be NUL-terminated.
117

118
This function is labeled "unsafe" because it will crash if `p` is not
119
a valid memory address to data of the requested length.
120
"""
121
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    # A NULL pointer is rejected up front instead of being passed to the runtime.
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
125
# Length-less method: the data at `p` is read as a NUL-terminated C string.
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
129

130
# This is `@assume_effects :total !:consistent @ccall jl_alloc_string(n::Csize_t)::Ref{String}`,
131
# but the macro is not available at this time in bootstrap, so we write it manually.
132
# Effects-override value placed in the calling-convention tuple of the `:foreigncall`
# built below (stands in for the `@assume_effects` annotation described above).
const _string_n_override = 0x04ee
133
# `_string_n(n)` calls `jl_alloc_string` with `n` converted to `Csize_t` and returns the
# resulting `String`. The `Expr(:foreigncall, ...)` is spliced in by hand with `@eval`.
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String},
135,622,235✔
134
    :(Core.svec(Csize_t)), 1, QuoteNode((:ccall, _string_n_override, false)), :(convert(Csize_t, n))))
135

136
"""
137
    String(s::AbstractString)
138

139
Create a new `String` from an existing `AbstractString`.
140
"""
141
function String(s::AbstractString)
    return print_to_string(s)
end
934✔
142
# Build a `String` from the name of symbol `s` by reading the C string the symbol
# converts to; the call's effects are asserted as `:total`.
@assume_effects :total function String(s::Symbol)
    name_ptr = unsafe_convert(Ptr{UInt8}, s)
    return unsafe_string(name_ptr)
end
2,267,287✔
143

144
# Obtain a `Memory{UInt8}` for the code units of `s` from the runtime.
# NOTE(review): presumably this aliases the string's storage rather than copying it,
# hence the "unsafe" name — confirm against `jl_string_to_genericmemory`.
function unsafe_wrap(::Type{Memory{UInt8}}, s::String)
    return ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
end
123,081,912✔
145
# `Vector{UInt8}` variant: wrap the `Memory{UInt8}` obtained for `s` in an `Array`.
function unsafe_wrap(::Type{Vector{UInt8}}, s::String)
    mem = unsafe_wrap(Memory{UInt8}, s)
    return wrap(Array, mem)
end
73,583✔
146

147
# Copy the code units of a `String` into a newly allocated byte vector.
function Vector{UInt8}(s::CodeUnits{UInt8,String})
    out = Vector{UInt8}(undef, length(s))
    return copyto!(out, s)
end
62,744✔
148
# Byte-vector copy of the code units of `s`.
function Vector{UInt8}(s::String)
    units = codeunits(s)
    return Vector{UInt8}(units)
end
62,704✔
149
# `Array{UInt8}` spelling of the same conversion; the result is a `Vector{UInt8}`.
function Array{UInt8}(s::String)
    units = codeunits(s)
    return Vector{UInt8}(units)
end
×
150

151
# The code-units wrapper already holds its parent string in field `s`; return it as-is.
function String(s::CodeUnits{UInt8,String})
    return s.s
end
2✔
152

153
## low-level functions ##
154

155
# Raw pointer to the first code unit of `s`.
function pointer(s::String)
    return unsafe_convert(Ptr{UInt8}, s)
end
1,355,182,052✔
156
# Raw pointer to code unit `i` of `s` (1-based); no bounds check is performed here.
function pointer(s::String, i::Integer)
    base = pointer(s)
    return base + Int(i)::Int - 1
end
569,036,745✔
157

158
# Number of code units in `s`, taken from `Core.sizeof`.
function ncodeunits(s::String)
    return Core.sizeof(s)
end
981,969,952✔
159
# The code unit type of `String` is `UInt8`.
function codeunit(s::String)
    return UInt8
end
15,973,004✔
160

161
# Generic-integer entry point: convert the index to `Int` and use the `Int` method.
function codeunit(s::String, i::Integer)
    idx = Int(i)::Int
    return codeunit(s, idx)
end
4✔
162
# Load code unit `i` of `s` straight from the string's memory. The index is validated
# with `checkbounds` unless the caller elided the check with `@inbounds`.
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
    @boundscheck checkbounds(s, i)
    # keep `s` rooted while its memory is read through a raw pointer
    return GC.@preserve s unsafe_load(pointer(s, i))
end
167

168
## comparison ##
169

170
# `String`/`String` entry point: forwards to the generic two-argument `_memcmp` method
# through `@invoke`, with the call's effects asserted as `:total`.
@assume_effects :total function _memcmp(a::String, b::String)
    return @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},
                           b::Union{Ptr{UInt8},AbstractString})
end
841,832✔
171

172
# Compare only the common prefix: the first `min(sizeof(a), sizeof(b))` bytes.
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString})
    len = min(sizeof(a), sizeof(b))
    return _memcmp(a, b, len)
end
1,998,355✔
173
# Byte-wise comparison of the first `len` bytes of `a` and `b` via `memcmp`.
# The result is converted to `Int`.
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
    nbytes = len % Csize_t
    # both arguments stay rooted while their raw pointers are in use
    res = GC.@preserve a b memcmp(unsafe_convert(Ptr{UInt8}, a),
                                  unsafe_convert(Ptr{UInt8}, b), nbytes)
    return res % Int
end
180

181
# Three-way comparison of two `String`s: the common prefix decides first (via `_memcmp`),
# and when it is equal the byte sizes are compared. Returns -1, 0 or +1.
function cmp(a::String, b::String)
    c = _memcmp(a, b)
    c < 0 && return -1
    c > 0 && return +1
    # equal common prefix: the string with fewer bytes sorts first
    return cmp(sizeof(a), sizeof(b))
end
186

187
# `String` equality is defined as egality (`===`).
function ==(a::String, b::String)
    return a === b
end
22,031,589✔
188

189
# The minimum `String` is the empty string.
function typemin(::Type{String})
    return ""
end
×
190
# Instance form: same result as `typemin(String)`.
function typemin(::String)
    return typemin(String)
end
×
191

192
## thisind, nextind ##
193

194
# `String` method of `thisind`; the work is done by `_thisind_str`.
@propagate_inbounds function thisind(s::String, i::Int)
    return _thisind_str(s, i)
end
115,776,298✔
195

196
# s should be String or SubString{String}
197
# Return the index of the first code unit of the character that code unit `i` belongs
# to. `0` and `ncodeunits(s) + 1` are returned unchanged; any other out-of-range index
# throws a `BoundsError` (unless bounds checking was elided by the caller).
@inline function _thisind_str(s, i::Int)
    i == 0 && return 0
    ncu = ncodeunits(s)
    i == ncu + 1 && return i
    @boundscheck between(i, 1, ncu) || throw(BoundsError(s, i))
    @inbounds cu = codeunit(s, i)
    # Fast path: a non-continuation byte, or the very first byte, starts its own character.
    if !((cu & 0xc0 == 0x80) & (i - 1 > 0))
        return i
    end
    # mark the rest of the function as a slow-path
    return (@noinline function _thisind_continued(s, i, n)
        local b
        # one byte back: a lead byte of a 2-, 3- or 4-byte sequence owns index `i`
        @inbounds b = codeunit(s, i - 1)
        between(b, 0b11000000, 0b11110111) && return i - 1
        if !((b & 0xc0 == 0x80) & (i - 2 > 0))
            return i
        end
        # two bytes back: only a 3- or 4-byte lead can reach this far
        @inbounds b = codeunit(s, i - 2)
        between(b, 0b11100000, 0b11110111) && return i - 2
        if !((b & 0xc0 == 0x80) & (i - 3 > 0))
            return i
        end
        # three bytes back: only a 4-byte lead can reach this far
        @inbounds b = codeunit(s, i - 3)
        between(b, 0b11110000, 0b11110111) && return i - 3
        return i
    end)(s, i, ncu)
end
217

218
# `String` method of `nextind`; the work is done by `_nextind_str`.
@propagate_inbounds function nextind(s::String, i::Int)
    return _nextind_str(s, i)
end
30,661,111✔
219

220
# s should be String or SubString{String}
221
# Return the index of the character start that follows code unit `i`. `i == 0` yields 1;
# an index outside `1:ncodeunits(s)` throws a `BoundsError` (unless bounds checking was
# elided by the caller).
@inline function _nextind_str(s, i::Int)
    i == 0 && return 1
    ncu = ncodeunits(s)
    @boundscheck between(i, 1, ncu) || throw(BoundsError(s, i))
    @inbounds first_byte = codeunit(s, i)
    # Fast path: bytes outside 0x80:0xf7 always occupy exactly one index.
    if !between(first_byte, 0x80, 0xf7)
        return i + 1
    end
    # mark the rest of the function as a slow-path
    return (@noinline function _nextind_continued(s, i, n, l)
        if l < 0xc0
            # handle invalid codeunit index by scanning back to the start of this index
            # (which may be the same as this index)
            start = @inbounds thisind(s, i)
            start >= i && return i + 1
            i = start
            @inbounds l = codeunit(s, i)
            if (l < 0x80) | (0xf8 ≤ l)
                return i + 1
            end
            @assert l >= 0xc0 "invalid codeunit"
        end
        # first continuation byte
        i += 1
        i > n && return i
        @inbounds cont = codeunit(s, i)
        (cont & 0xc0) ≠ 0x80 && return i
        i += 1
        if (i > n) | (l < 0xe0)
            return i
        end
        # second continuation byte
        @inbounds cont = codeunit(s, i)
        (cont & 0xc0) ≠ 0x80 && return i
        i += 1
        if (i > n) | (l < 0xf0)
            return i
        end
        # third continuation byte
        @inbounds cont = codeunit(s, i)
        return ifelse((cont & 0xc0) ≠ 0x80, i, i + 1)
    end)(s, i, ncu, first_byte)
end
252

253
## checking UTF-8 & ASCII validity ##
254
#=
255
    The UTF-8 Validation is performed by a shift based DFA.
256
    ┌───────────────────────────────────────────────────────────────────┐
257
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
258
    │                               ├────────3────────┐           │     │
259
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
260
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
261
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
262
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
263
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
264
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
265
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
266
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
267
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
268
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
269
    │                      │        │     ├─┤               │        │  │
270
    │                      │        └─4──►│6├─────1,9───────┘        │  │
271
    │          INVALID     │              └─┘                        │  │
272
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
273
    │          ┌▼───┴┐                                                  │
274
    │          │  2  ◄─── All undefined transitions result in state 2   │
275
    │          └─────┘                                                  │
276
    └───────────────────────────────────────────────────────────────────┘
277

278
        Validation States
279
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
280
                        If the DFA ends in this state the string is ASCII only
281
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
282
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
283
                    as seen by all 2s in that column of table below
284
            3 -> One valid continuation byte needed to return to state 1
285
        4,5,6 -> Two valid continuation bytes needed to return to state 1
286
        7,8,9 -> Three valid continuation bytes needed to return to state 1
287

288
                        Current State
289
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
290
                0 | 0  1  2  2  2  2  2  2  2  2
291
                1 | 2  2  2  1  3  2  3  2  4  4
292
                2 | 3  3  2  2  2  2  2  2  2  2
293
                3 | 4  4  2  2  2  2  2  2  2  2
294
                4 | 6  6  2  2  2  2  2  2  2  2
295
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
296
    Class       6 | 8  8  2  2  2  2  2  2  2  2
297
                7 | 2  2  2  1  3  3  2  4  4  2
298
                8 | 2  2  2  2  2  2  2  2  2  2
299
                9 | 2  2  2  1  3  2  3  4  4  2
300
               10 | 5  5  2  2  2  2  2  2  2  2
301
               11 | 7  7  2  2  2  2  2  2  2  2
302

303
           Shifts | 0  4 10 14 18 24  8 20 12 26
304

305
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
306
    the rows the correct shift was a result.
307

308
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
309
    the current state and then masking the result with 0b11110 (0x1E) gives the shift for the new state
310

311

312
=#
313

314
#State type used by UTF-8 DFA
315
const _UTF8DFAState = UInt32
# Fill the table with 256 `_UTF8DFAState` (UInt32) values, one per possible byte, each
# holding the shift-encoded DFA transitions for that byte's character class.
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
    num_classes = 12
    num_states = 10

    # These shifts were derived using a SMT solver; entry `k+1` is the bit offset that
    # represents state `k`.
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]

    # Character class of every byte value 0x00:0xff (16 bytes per row).
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]

    # These are the rows discussed in comments above:
    # row = character class, column = current state, entry = next state.
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  2  4  4;
                     3  3  2  2  2  2  2  2  2  2;
                     4  4  2  2  2  2  2  2  2  2;
                     6  6  2  2  2  2  2  2  2  2;
                     9  9  2  2  2  2  2  2  2  2;
                     8  8  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  3  2  4  4  2;
                     2  2  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  4  4  2;
                     5  5  2  2  2  2  2  2  2  2;
                     7  7  2  2  2  2  2  2  2  2]

    # This converts the state_arrays into the shift encoded _UTF8DFAState
    class_row = zeros(_UTF8DFAState, num_classes)

    for i = 1:num_classes
        row = _UTF8DFAState(0)
        for j in 1:num_states
            # Calculate the shift required for the next state
            to_shift = UInt8(state_shifts[state_arrays[i, j] + 1])
            # Shift the next state into the position of the current state
            row |= _UTF8DFAState(to_shift) << state_shifts[j]
        end
        class_row[i] = row
    end

    # Expand from one entry per class to one entry per byte value.
    map(c -> class_row[c + 1], character_classes)
end
372

373

374
# The three named DFA states below are stored as their shift encodings (0, 4 and 10 are
# the first three entries of `state_shifts`, i.e. states 0, 1 and 2).
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Start state; the DFA only stays here while the input seen so far is ASCII
375
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) #This state represents the start and end of any valid string
376
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop
377

378
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
379
# One DFA transition: look up the packed row for `byte`, shift it right by the current
# (shift-encoded) `state` and mask out the shift encoding of the next state.
@inline function _utf_dfa_step(state::_UTF8DFAState, byte::UInt8)
    packed = @inbounds _UTF8_DFA_TABLE[byte+1]
    return (packed >> state) & _UTF8DFAState(0x0000001E)
end
110,816✔
380

381
# Feed `bytes[first:last]` through the DFA starting from `state` and return the state
# reached after the last byte. Indices are not bounds-checked (`@inbounds`).
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
    for idx in first:last
        @inbounds byte = bytes[idx]
        state = _utf_dfa_step(state, byte)
    end
    return state
end
387

388
# Scan `cu[first:last]` in windows of `chunk_size` elements and return the start index of
# the first window that `_isascii` rejects, or `nothing` when every window is ASCII.
@inline function _find_nonascii_chunk(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    pos = first
    while pos <= last - chunk_size
        _isascii(cu, pos, pos + chunk_size - 1) || return pos
        pos += chunk_size
    end
    # Final window: the last `chunk_size` elements (may overlap the previous window).
    pos = last - chunk_size + 1
    return _isascii(cu, pos, last) ? nothing : pos
end
398

399
##
400

401
# Classifications of string
402
    # 0: neither valid ASCII nor UTF-8
403
    # 1: valid ASCII
404
    # 2: valid UTF-8
405
 # String entry point: classify the code units of `s`.
 function byte_string_classify(s::AbstractString)
     return byte_string_classify(codeunits(s))
 end
48✔
406

407

408
# Classify `bytes`: 1 for ASCII, 2 for valid UTF-8, 0 for neither.
# Inputs longer than 1.5 chunks are scanned chunk-wise for the first non-ASCII chunk, so
# the DFA only has to run from that point on; shorter inputs get a single ASCII check.
function byte_string_classify(bytes::AbstractVector{UInt8})
    chunk_size = 1024
    chunk_threshold = chunk_size + (chunk_size ÷ 2)
    n = length(bytes)
    if n <= chunk_threshold
        _isascii(bytes, 1, n) && return 1
        start = 1
    else
        start = _find_nonascii_chunk(chunk_size, bytes, 1, n)
        start === nothing && return 1
    end
    return _byte_string_classify_nonascii(bytes, start, n)
end
421

422
# Validate `bytes[first:last]` as UTF-8 in chunks of 256 bytes. Returns 2 when the DFA
# ends in the ACCEPT state and 0 otherwise (including as soon as it becomes INVALID).
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
    chunk_size = 256

    lo = first
    hi = min(last, first + chunk_size - 1)
    state = _UTF8_DFA_ACCEPT
    while lo <= last
        # try to process ascii chunks; only possible between complete characters
        while state == _UTF8_DFA_ACCEPT
            _isascii(bytes, lo, hi) || break
            lo += chunk_size
            lo <= last || break
            hi = min(last, hi + chunk_size)
        end
        # Process non ascii chunk
        state = _isvalid_utf8_dfa(state, bytes, lo, hi)
        if state == _UTF8_DFA_INVALID
            return 0
        end

        lo += chunk_size
        hi = min(last, hi + chunk_size)
    end
    return state == _UTF8_DFA_ACCEPT ? 2 : 0
end
444

445
# A byte vector is a valid `String` when it classifies as ASCII (1) or UTF-8 (2).
function isvalid(::Type{String}, bytes::AbstractVector{UInt8})
    class = @inline byte_string_classify(bytes)
    return class ≠ 0
end
42,073✔
446
# Same check for an `AbstractString` argument.
function isvalid(::Type{String}, s::AbstractString)
    class = @inline byte_string_classify(s)
    return class ≠ 0
end
48✔
447

448
# One-argument form: validate the code units of `s` as a `String`.
@inline function isvalid(s::AbstractString)
    return @inline isvalid(String, codeunits(s))
end
1,555✔
449

450
# `true` when the top two bits of `c` are `10`, i.e. `c` has the form of a UTF-8
# continuation byte.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
831✔
451

452
## required core functionality ##
453

454
# Iteration protocol for `String`: return `(char, next_index)` for the character starting
# at code unit `i`, or `nothing` when `i` is outside `1:ncodeunits(s)`.
@inline function iterate(s::String, i::Int=firstindex(s))
    # a single unsigned comparison rejects both `i < 1` and `i > ncodeunits(s)`
    if !((i % UInt) - 1 < ncodeunits(s))
        return nothing
    end
    b = @inbounds codeunit(s, i)
    u = UInt32(b) << 24
    if between(b, 0x80, 0xf7)
        # possible multi-byte sequence: handled out of line
        return @noinline iterate_continued(s, i, u)
    end
    return reinterpret(Char, u), i+1
end
461

462
# duck-type s so that external UTF-8 string packages like StringViews can hook in
463
# Slow path of `iterate`: `u` holds the first byte of the character in its top 8 bits.
# Up to three continuation bytes are merged into `u`; decoding stops at the end of the
# data, at the first non-continuation byte, or when the lead byte asks for no more.
# Returns the (possibly malformed) `Char` and the index of the next character.
function iterate_continued(s, i::Int, u::UInt32)
    # a stray continuation byte forms a one-byte character on its own
    u < 0xc0000000 && return reinterpret(Char, u), i + 1
    n = ncodeunits(s)
    # first continuation byte
    i += 1
    i > n && return reinterpret(Char, u), i
    @inbounds b = codeunit(s, i)
    (b & 0xc0) == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(b) << 16
    # second continuation byte
    i += 1
    if (i > n) | (u < 0xe0000000)
        return reinterpret(Char, u), i
    end
    @inbounds b = codeunit(s, i)
    (b & 0xc0) == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(b) << 8
    # third continuation byte
    i += 1
    if (i > n) | (u < 0xf0000000)
        return reinterpret(Char, u), i
    end
    @inbounds b = codeunit(s, i)
    (b & 0xc0) == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(b)
    return reinterpret(Char, u), i + 1
end
484

485
# Character starting at code unit `i`. Single-byte results are produced inline; bytes in
# 0x80:0xf7 are decoded by `getindex_continued`.
@propagate_inbounds function getindex(s::String, i::Int)
    b = codeunit(s, i)
    u = UInt32(b) << 24
    if between(b, 0x80, 0xf7)
        return getindex_continued(s, i, u)
    end
    return reinterpret(Char, u)
end
491

492
# duck-type s so that external UTF-8 string packages like StringViews can hook in
493
# Slow path of `getindex`: `u` holds the first byte of the character in its top 8 bits.
# A continuation byte at `i` is only accepted when `i` is a valid index of `s`;
# otherwise `string_index_err` is raised. Up to three continuation bytes are merged
# into `u`, stopping early exactly as `iterate_continued` does.
function getindex_continued(s, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        @inbounds valid = isvalid(s, i)
        valid && return reinterpret(Char, u)
        string_index_err(s, i)
    end
    n = ncodeunits(s)

    i += 1
    i > n && return reinterpret(Char, u)
    @inbounds b = codeunit(s, i) # cont byte 1
    (b & 0xc0) == 0x80 || return reinterpret(Char, u)
    u |= UInt32(b) << 16

    i += 1
    if (i > n) | (u < 0xe0000000)
        return reinterpret(Char, u)
    end
    @inbounds b = codeunit(s, i) # cont byte 2
    (b & 0xc0) == 0x80 || return reinterpret(Char, u)
    u |= UInt32(b) << 8

    i += 1
    if (i > n) | (u < 0xf0000000)
        return reinterpret(Char, u)
    end
    @inbounds b = codeunit(s, i) # cont byte 3
    (b & 0xc0) == 0x80 || return reinterpret(Char, u)
    u |= UInt32(b)
    return reinterpret(Char, u)
end
518

519
# Generic unit ranges are converted to a `UnitRange{Int}` and indexed through that method.
function getindex(s::String, r::AbstractUnitRange{<:Integer})
    lo = Int(first(r))
    hi = Int(last(r))
    return s[lo:hi]
end
8✔
520

521
# Substring by code-unit range. Both endpoints must be valid character indices
# (`string_index_err` otherwise); the result includes the whole character starting at
# `last(r)` and is a fresh `String`.
@inline function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    lo, hi = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        @inbounds isvalid(s, lo) || string_index_err(s, lo)
        @inbounds isvalid(s, hi) || string_index_err(s, hi)
    end
    # extend the end to the last code unit of the final character
    stop = nextind(s, hi) - 1
    nbytes = stop - lo + 1
    out = _string_n(nbytes)
    GC.@preserve s out unsafe_copyto!(pointer(out), pointer(s, lo), nbytes)
    return out
end
535

536
# nothrow because we know the start and end indices are valid
537
@assume_effects :nothrow function length(s::String)
    # start from the code-unit count; `length_continued` subtracts continuation bytes
    ncu = ncodeunits(s)
    return length_continued(s, 1, ncu, ncu)
end
106,665✔
538

539
# effects needed because @inbounds
540
# Number of characters of `s` between code-unit indices `i` and `j`. `i` may be
# `ncodeunits(s) + 1` and `j` may be `0`; anything further out throws a `BoundsError`.
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    j < i && return 0
    requested = i
    # move back to the start of the character that contains `requested`
    @inbounds i = thisind(s, requested)
    c = j - i + (i == requested)
    return @inbounds length_continued(s, i, j, c)
end
550

551
# Character-counting kernel. `c` is the caller's starting count (the `String` method
# passes the code-unit count); it is decremented once for every continuation byte that
# follows a lead byte in `i:n`, and the adjusted count is returned.
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
    i < n || return c
    b = codeunit(s, i)
    while true
        # advance until `b` is a lead byte (0xc0:0xf7); `i` already points past it
        while true
            i += 1
            i ≤ n || return c
            (0xc0 ≤ b ≤ 0xf7) && break
            b = codeunit(s, i)
        end
        lead = b
        b = codeunit(s, i) # cont byte 1
        iscont = (b & 0xc0) == 0x80
        c -= iscont
        (iscont & (lead ≥ 0xe0)) || continue

        i += 1
        i ≤ n || return c
        b = codeunit(s, i) # cont byte 2
        iscont = (b & 0xc0) == 0x80
        c -= iscont
        (iscont & (lead ≥ 0xf0)) || continue

        i += 1
        i ≤ n || return c
        b = codeunit(s, i) # cont byte 3
        c -= (b & 0xc0) == 0x80
    end
end
575

576
## overload methods for efficiency ##
577

578
# `i` is a valid index when it is in bounds and is the first code unit of its character.
function isvalid(s::String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end
100,575,740✔
579

580
# ASCII check on the code units of `s`.
function isascii(s::String)
    units = codeunits(s)
    return isascii(units)
end
19,755✔
581

582
# don't assume effects for general integers since we cannot know their implementation
583
@assume_effects :foldable function repeat(c::Char, r::BitInteger)
    # forward to the generic `Integer` method, with effects asserted for bit integers only
    return @invoke repeat(c::Char, r::Integer)
end
7,504,086✔
584

585
"""
586
    repeat(c::AbstractChar, r::Integer)::String
587

588
Repeat a character `r` times. This can equivalently be accomplished by calling
589
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
590

591
# Examples
592
```jldoctest
593
julia> repeat('A', 3)
594
"AAA"
595
```
596
"""
597
function repeat(c::AbstractChar, r::Integer)
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    r = UInt(r)::UInt
    c = Char(c)::Char
    r == 0 && return ""
    # after the byte swap the first encoded byte sits in the low-order byte of `bits`
    bits = bswap(reinterpret(UInt32, c))
    # number of bytes in the character's encoding (1 to 4)
    width = 4 - (leading_zeros(bits | 0xff) >> 3)
    out = _string_n(width*r)
    dst = pointer(out)
    GC.@preserve out if width == 1
        memset(dst, bits % UInt8, r)
    elseif width == 2
        dst16 = reinterpret(Ptr{UInt16}, dst)
        for k in 1:r
            unsafe_store!(dst16, bits % UInt16, k)
        end
    elseif width == 3
        # no 3-byte integer type exists, so the bytes are stored one at a time
        byte1 = (bits >> 0) % UInt8
        byte2 = (bits >> 8) % UInt8
        byte3 = (bits >> 16) % UInt8
        for k in 0:r-1
            unsafe_store!(dst, byte1, 3k + 1)
            unsafe_store!(dst, byte2, 3k + 2)
            unsafe_store!(dst, byte3, 3k + 3)
        end
    elseif width == 4
        dst32 = reinterpret(Ptr{UInt32}, dst)
        for k in 1:r
            unsafe_store!(dst32, bits, k)
        end
    end
    return out
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc