• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / 1516

25 Apr 2026 03:50PM UTC coverage: 78.044% (-0.02%) from 78.067%
1516

push

buildkite

web-flow
Specialize f in map(f, ::NamedTuple) (#61654)

This lack of specialization is problematic, because the result of the
map is used in the function body. Without specialization, this causes
excessive allocations.

This was discovered by @aplavin with this benchmark:
```julia
using BenchmarkTools

a = (x1=[1.0], x2=[2.0], x3=[3.0], x4=[4.0], x5=[5.0], x6=[6.0], x7=[7.0], x8=[8.0], x9=[9.0], x10=[10.0], x11=[11.0])
b = map(copy, a)

function bench(a, b, N)
    s = 0
    for _ in 1:N; s += length(map(copyto!, a, b).x1); end; s
end
bench(a, b, 1)

@showtime bench(a, b, 1_000_000)
```

Results:
* `map` on master: `bench(a, b, 1000000): 0.495330 seconds (4.00 M
allocations: 366.211 MiB, 34.93% gc time)`
* `map` on PR: `bench(a, b, 1000000): 0.050094 seconds (1 allocation: 16
bytes)`

1 of 1 new or added line in 1 file covered. (100.0%)

134 existing lines in 8 files now uncovered.

65573 of 84021 relevant lines covered (78.04%)

24583102.82 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.47
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringView{T <: AbstractVector{UInt8}} <: AbstractString
5

6
An `AbstractString` representation of any `vector` of `UInt8` data,
7
interpreted as UTF-8 encoded Unicode.
8
Similar to `String`, the underlying data may be invalid UTF-8.
9

10
`StringView(v::AbstractVector{UInt8})::StringView` does not make a copy of
11
or modify `v`. Use `codeunits` to get `v` from the `StringView`.
12
After construction, `v` may be mutated, which will be reflected in
13
the resulting `StringView`.
14

15
!!! compat "Julia 1.14"
16
    The `StringView` type requires at least Julia 1.14.
17

18
# Examples
19
```jldoctest
20
julia> arr = [0x61, 0xf0, 0x63, 0x64];
21

22
julia> s = StringView(arr)
23
"a\\xf0cd"
24

25
julia> codeunits(s) === arr
26
true
27

28
julia> arr[2] = Int('b'); s
29
"abcd"
30
```
31
"""
32
struct StringView{T <: AbstractVector{UInt8}} <: AbstractString
33
    data::T
34

35
    function StringView{T}(data::T) where {T <: AbstractVector{UInt8}}
×
36
        # For now, StringViews code assumes one-based indexing
37
        require_one_based_indexing(data)
×
38

39
        # Prevent someone constructing e.g. a `StringView{AbstractVector{UInt8}}`,
40
        # the existence of which will complicate the implementation and provide
41
        # no usability benefit.
42
        if !isconcretetype(T)
×
43
            throw(ArgumentError("StringView must be parameterized with a concrete type"))
×
44
        end
45

46
        new{T}(data)
×
47
    end
48
end
49

50

51
"""
52
    StringIndexError(str, i)
53

54
An error occurred when trying to access `str` at index `i` that is not valid.
55
"""
56
struct StringIndexError <: Exception
57
    string::AbstractString
10✔
58
    index::Int
59
end
60
# Out-of-line helper that raises a `StringIndexError` for string `s` at index `i`.
# The index is converted to `Int` before being stored in the exception.
@noinline function string_index_err(@nospecialize(s::AbstractString), i::Integer)
    throw(StringIndexError(s, Int(i)))
end
62
# Print a `StringIndexError` to `io`. When the offending index lies inside the
# code unit range of the string, the message also lists the nearest valid
# index (or indices) together with the escaped character found there.
function showerror(io::IO, exc::StringIndexError)
    str, idx = exc.string, exc.index
    print(io, "StringIndexError: ", "invalid index [$(idx)]")
    # Outside the code unit range there are no neighbouring indices to suggest.
    firstindex(str) <= idx <= ncodeunits(str) || return nothing
    before = thisind(str, idx)
    after = nextind(str, before)
    before_txt = escape_string(str[before:before])
    if after > ncodeunits(str)
        # `before` starts the last character, so only one suggestion is possible.
        print(io, ", valid nearby index [$before]=>'$before_txt'")
    else
        after_txt = escape_string(str[after:after])
        print(io, ", valid nearby indices [$before]=>'$before_txt', [$after]=>'$after_txt'")
    end
    return nothing
end
77

78
# Return `true` when `lo ≤ b ≤ hi`. The two comparisons are combined with `&`
# rather than `&&`, so both are always evaluated.
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    return (b ≥ lo) & (hi ≥ b)
end
907,501,067✔
79

80
"""
81
    String <: AbstractString
82

83
The default string type in Julia, used by e.g. string literals.
84

85
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
86
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
87
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
88
that the underlying byte sequence is valid as UTF-8.
89
"""
90
String
91

92
## constructors and conversions ##
93

94
# String constructor docstring from boot.jl, workaround for #16730
95
# and the unavailability of @doc in boot.jl context.
96
"""
97
    String(v::AbstractVector{UInt8})
98

99
Create a new `String` object using the data buffer from byte vector `v`.
100
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
101
modification of `v` cannot affect the contents of the resulting string.
102
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
103
`AbstractVector` types, `String(v)` already makes a copy.
104

105
When possible, the memory of `v` will be used without copying when the `String`
106
object is created. This is guaranteed to be the case for byte vectors returned
107
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
108
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
109
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
110
to guarantee consistent behavior.
111
"""
112
String(v::AbstractVector{UInt8}) = unsafe_takestring(copyto!(StringMemory(length(v)), v))
14,934,490✔
113

114
function String(v::Vector{UInt8})
4,089✔
115
    len = length(v)
23,080,634✔
116
    len == 0 && return ""
23,080,634✔
117
    ref = v.ref
22,972,867✔
118
    if ref.ptr_or_offset == ref.mem.ptr
22,972,869✔
119
        str = ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), ref.mem, len)
22,972,864✔
120
    else
121
        str = ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), ref, len)
5✔
122
    end
123
    # optimized empty!(v); sizehint!(v, 0) calls
124
    setfield!(v, :size, (0,))
22,972,869✔
125
    setfield!(v, :ref, memoryref(Memory{UInt8}()))
22,972,867✔
126
    return str
22,972,869✔
127
end
128

129
"""
130
    unsafe_takestring(m::Memory{UInt8})::String
131

132
Create a `String` from `m`, changing the interpretation of the contents of `m`.
133
This is done without copying, if possible. Thus, any access to `m` after
134
calling this function, either to read or to write, is undefined behavior.
135
"""
136
function unsafe_takestring(m::Memory{UInt8})
    # An empty memory maps to the empty string literal without calling the runtime.
    if isempty(m)
        return ""
    end
    # Hand the memory to the runtime, which converts it into a `String`.
    return ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), m, length(m))
end
139

140
"""
141
    takestring!(x) -> String
142

143
Create a string from the content of `x`, emptying `x`.
144

145
# Examples
146
```jldoctest
147
julia> v = [0x61, 0x62, 0x63];
148

149
julia> s = takestring!(v)
150
"abc"
151

152
julia> isempty(v)
153
true
154
```
155
"""
156
# For a byte vector, taking the string is exactly the `String(v)` constructor.
function takestring!(v::Vector{UInt8})
    return String(v)
end
×
157

158
"""
159
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
160
    unsafe_string(p::Cstring)
161

162
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
163
(The pointer can be safely freed afterwards.) If `length` is specified
164
(the length of the data in bytes), the string does not have to be NUL-terminated.
165

166
This function is labeled "unsafe" because it will crash if `p` is not
167
a valid memory address to data of the requested length.
168
"""
169
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    # Reject NULL up front instead of handing it to the runtime.
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    # Build a `String` from `len` bytes starting at `p`.
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
173
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    # Reject NULL up front instead of handing it to the runtime.
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    # No length given: the runtime reads the C string starting at `p`.
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
177

178
# This is `@assume_effects :total !:consistent @ccall jl_alloc_string(n::Csize_t)::Ref{String}`,
179
# but the macro is not available at this time in bootstrap, so we write it manually.
180
const _string_n_override = 0x04ee
181
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String},
141,916,124✔
182
    :(Core.svec(Csize_t)), 1, QuoteNode((:ccall, _string_n_override, false)), :(convert(Csize_t, n))))
183

184
"""
185
    String(s::AbstractString)
186

187
Create a new `String` from an existing `AbstractString`.
188
"""
189
String(s::AbstractString) = print_to_string(s)
1,074✔
190
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
6,377,248✔
191

192
unsafe_wrap(::Type{Memory{UInt8}}, s::String) = ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
24,798,793✔
193
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = wrap(Array, unsafe_wrap(Memory{UInt8}, s))
120,492✔
194

195
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
63,068✔
196
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
63,028✔
197
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
198

199
String(s::CodeUnits{UInt8,String}) = s.s
2✔
200

201
## low-level functions ##
202

203
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
1,758,381,031✔
204
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
861,750,831✔
205

206
ncodeunits(s::String) = Core.sizeof(s)
1,329,541,277✔
207
codeunit(s::String) = UInt8
2,139,577✔
208

209
codeunit(s::String, i::Integer) = codeunit(s, Int(i)::Int)
4✔
210
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
17,360✔
211
    @boundscheck checkbounds(s, i)
846,943,288✔
212
    b = GC.@preserve s unsafe_load(pointer(s, i))
846,943,312✔
213
    return b
841,868,238✔
214
end
215

216
## comparison ##
217

218
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
641,743✔
219

220
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
1,498,389✔
221
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
35✔
222
    GC.@preserve a b begin
1,721,542✔
223
        pa = unsafe_convert(Ptr{UInt8}, a)
1,721,542✔
224
        pb = unsafe_convert(Ptr{UInt8}, b)
1,721,542✔
225
        memcmp(pa, pb, len % Csize_t) % Int
1,721,542✔
226
    end
227
end
228

229
# Three-way comparison of two `String`s: the byte-wise comparison of the common
# prefix decides first; when it reports equality, the byte sizes break the tie.
function cmp(a::String, b::String)
    prefix_order = _memcmp(a, b)
    if prefix_order < 0
        return -1
    elseif prefix_order > 0
        return +1
    end
    return cmp(sizeof(a), sizeof(b))
end
234

235
==(a::String, b::String) = a===b
34,618,612✔
236

237
typemin(::Type{String}) = ""
×
238
typemin(::String) = typemin(String)
×
239

240
## thisind, nextind ##
241

242
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
149,117,312✔
243

244
# s should be String, StringView, or SubString{String}
245
@inline function _thisind_str(s, i::Int)
7,451✔
246
    i == 0 && return 0
75,055,711✔
247
    n = ncodeunits(s)
74,938,823✔
248
    i == n + 1 && return i
74,938,823✔
249
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
74,938,809✔
250
    @inbounds b = codeunit(s, i)
74,938,809✔
251
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
140,557,975✔
252
    (@noinline function _thisind_continued(s, i, n) # mark the rest of the function as a slow-path
8,577,349✔
253
        local b
52,054✔
254
        @inbounds b = codeunit(s, i-1)
52,054✔
255
        between(b, 0b11000000, 0b11110111) && return i-1
52,054✔
256
        (b & 0xc0 == 0x80) & (i-2 > 0) || return i
33,268✔
257
        @inbounds b = codeunit(s, i-2)
33,268✔
258
        between(b, 0b11100000, 0b11110111) && return i-2
33,268✔
259
        (b & 0xc0 == 0x80) & (i-3 > 0) || return i
2✔
260
        @inbounds b = codeunit(s, i-3)
2✔
261
        between(b, 0b11110000, 0b11110111) && return i-3
2✔
262
        return i
×
263
    end)(s, i, n)
264
end
265

266
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
93,754,020✔
267

268
# s should be String or SubString{String}
269
@inline function _nextind_str(s, i::Int)
4,103✔
270
    i == 0 && return 1
127,805,943✔
271
    n = ncodeunits(s)
127,793,425✔
272
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
127,793,425✔
273
    @inbounds l = codeunit(s, i)
127,793,425✔
274
    between(l, 0x80, 0xf7) || return i+1
255,403,339✔
275
    (@noinline function _nextind_continued(s, i, n, l) # mark the rest of the function as a slow-path
199,075✔
276
        if l < 0xc0
15,568✔
277
            # handle invalid codeunit index by scanning back to the start of this index
278
            # (which may be the same as this index)
279
            i′ = @inbounds thisind(s, i)
×
280
            i′ >= i && return i+1
×
281
            i = i′
×
282
            @inbounds l = codeunit(s, i)
×
283
            (l < 0x80) | (0xf8 ≤ l) && return i+1
×
284
            @assert l >= 0xc0 "invalid codeunit"
×
285
        end
286
        # first continuation byte
287
        (i += 1) > n && return i
15,568✔
288
        @inbounds b = codeunit(s, i)
15,568✔
289
        b & 0xc0 ≠ 0x80 && return i
15,568✔
290
        ((i += 1) > n) | (l < 0xe0) && return i
15,568✔
291
        # second continuation byte
292
        @inbounds b = codeunit(s, i)
15,518✔
293
        b & 0xc0 ≠ 0x80 && return i
15,518✔
294
        ((i += 1) > n) | (l < 0xf0) && return i
15,518✔
295
        # third continuation byte
296
        @inbounds b = codeunit(s, i)
2✔
297
        return ifelse(b & 0xc0 ≠ 0x80, i, i+1)
2✔
298
    end)(s, i, n, l)
299
end
300

301
## checking UTF-8 & ASCII validity ##
302
#=
303
    The UTF-8 Validation is performed by a shift based DFA.
304
    ┌───────────────────────────────────────────────────────────────────┐
305
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
306
    │                               ├────────3────────┐           │     │
307
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
308
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
309
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
310
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
311
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
312
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
313
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
314
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
315
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
316
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
317
    │                      │        │     ├─┤               │        │  │
318
    │                      │        └─4──►│6├─────1,9───────┘        │  │
319
    │          INVALID     │              └─┘                        │  │
320
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
321
    │          ┌▼───┴┐                                                  │
322
    │          │  2  ◄─── All undefined transitions result in state 2   │
323
    │          └─────┘                                                  │
324
    └───────────────────────────────────────────────────────────────────┘
325

326
        Validation States
327
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
328
                        If the DFA ends in this state the string is ASCII only
329
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
330
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
331
                    as seen by all 2s in that column of the table below
332
            3 -> One valid continuation byte needed to return to state 1
333
        4,5,6 -> Two valid continuation bytes needed to return to state 1
334
        7,8,9 -> Three valid continuation bytes needed to return to state 1
335

336
                        Current State
337
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
338
                0 | 0  1  2  2  2  2  2  2  2  2
339
                1 | 2  2  2  1  3  2  3  2  4  4
340
                2 | 3  3  2  2  2  2  2  2  2  2
341
                3 | 4  4  2  2  2  2  2  2  2  2
342
                4 | 6  6  2  2  2  2  2  2  2  2
343
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
344
    Class       6 | 8  8  2  2  2  2  2  2  2  2
345
                7 | 2  2  2  1  3  3  2  4  4  2
346
                8 | 2  2  2  2  2  2  2  2  2  2
347
                9 | 2  2  2  1  3  2  3  4  4  2
348
               10 | 5  5  2  2  2  2  2  2  2  2
349
               11 | 7  7  2  2  2  2  2  2  2  2
350

351
           Shifts | 0  4 10 14 18 24  8 20 12 26
352

353
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
354
    the rows the correct shift was a result.
355

356
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
357
    the current state, then masking the result with 0b11110 (0x1E), gives the shift for the new state
358

359

360
=#
361

362
#State type used by UTF-8 DFA
363
const _UTF8DFAState = UInt32
364
# Fill the table with 256 `_UTF8DFAState` (UInt32) values representing the DFA transitions for all bytes
365
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
366
    num_classes=12
367
    num_states=10
368
    bit_per_state = 6
369

370
    # These shifts were derived using a SMT solver
371
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
372

373
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
374
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
375
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
376
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
377
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
378
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
379
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
380
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
381
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
382
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
383
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
384
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
385
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
386
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
387
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
388
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
389

390
    # These are the rows discussed in comments above
391
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
392
                     2  2  2  1  3  2  3  2  4  4;
393
                     3  3  2  2  2  2  2  2  2  2;
394
                     4  4  2  2  2  2  2  2  2  2;
395
                     6  6  2  2  2  2  2  2  2  2;
396
                     9  9  2  2  2  2  2  2  2  2;
397
                     8  8  2  2  2  2  2  2  2  2;
398
                     2  2  2  1  3  3  2  4  4  2;
399
                     2  2  2  2  2  2  2  2  2  2;
400
                     2  2  2  1  3  2  3  4  4  2;
401
                     5  5  2  2  2  2  2  2  2  2;
402
                     7  7  2  2  2  2  2  2  2  2]
403

404
    #This converts the state_arrays into the shift encoded _UTF8DFAState
405
    class_row = zeros(_UTF8DFAState, num_classes)
406

407
    for i = 1:num_classes
408
        row = _UTF8DFAState(0)
409
        for j in 1:num_states
410
            #Calculate the shift required for the next state
411
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
412
            #Shift the next state into the position of the current state
413
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
414
        end
415
        class_row[i]=row
416
    end
417

418
    map(c->class_row[c+1],character_classes)
×
419
end
420

421

422
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Start state (state 0, shift 0); ending in this state means the input was ASCII-only
423
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) # Accepting state (state 1, shift 4): complete, valid characters so far, at least one of them non-ASCII
424
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # Error state (state 2, shift 10): never left once entered, so validation can stop here
425

426
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
427
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
110,816✔
428

429
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
10,496✔
430
    for i = first:last
51,884✔
431
       @inbounds state = _utf_dfa_step(state, bytes[i])
110,816✔
432
    end
169,748✔
433
    return (state)
51,884✔
434
end
435

436
@inline function  _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU}
437
    n=first
20✔
438
    while n <= last - chunk_size
80✔
439
        _isascii(cu,n,n+chunk_size-1) || return n
60✔
440
        n += chunk_size
60✔
441
    end
60✔
442
    n= last-chunk_size+1
20✔
443
    _isascii(cu,n,last) || return n
20✔
444
    return nothing
20✔
445
end
446

447
##
448

449
# Classifications of string
450
    # 0: neither valid ASCII nor UTF-8
451
    # 1: valid ASCII
452
    # 2: valid UTF-8
453
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
48✔
454

455

456
function byte_string_classify(bytes::AbstractVector{UInt8})
48✔
457
    chunk_size = 1024
41,833✔
458
    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
41,833✔
459
    n = length(bytes)
41,833✔
460
    if n > chunk_threshold
41,833✔
461
        start = _find_nonascii_chunk(chunk_size,bytes,1,n)
20✔
462
        isnothing(start) && return 1
20✔
463
    else
464
        _isascii(bytes,1,n) && return 1
41,813✔
465
        start = 1
40,620✔
466
    end
467
    return _byte_string_classify_nonascii(bytes,start,n)
40,620✔
468
end
469

470
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
40,620✔
471
    chunk_size = 256
40,620✔
472

473
    start = first
40,620✔
474
    stop = min(last,first + chunk_size - 1)
40,620✔
475
    state = _UTF8_DFA_ACCEPT
40,620✔
476
    while start <= last
50,330✔
477
        # try to process ascii chunks
478
        while state == _UTF8_DFA_ACCEPT
40,620✔
479
            _isascii(bytes,start,stop) || break
40,620✔
480
            (start = start + chunk_size) <= last || break
×
481
            stop = min(last,stop + chunk_size)
×
482
        end
×
483
        # Process non ascii chunk
484
        state = _isvalid_utf8_dfa(state,bytes,start,stop)
99,552✔
485
        state == _UTF8_DFA_INVALID && return 0
40,620✔
486

487
        start = start + chunk_size
9,710✔
488
        stop = min(last,stop + chunk_size)
9,710✔
489
    end
9,710✔
490
    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
9,710✔
491
end
492

493
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
42,073✔
494
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
48✔
495

496
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
1,555✔
497

498
# A continuation byte has its two high-order bits equal to `10`.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
809✔
499

500
## required core functionality ##
501

502
@inline function iterate(s::Union{String, StringView}, i::Int=firstindex(s))
2,839✔
503
    (i % UInt) - 1 < ncodeunits(s) || return nothing
536,764,416✔
504
    b = @inbounds codeunit(s, i)
452,431,596✔
505
    u = UInt32(b) << 24
452,431,596✔
506
    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
904,218,841✔
507
    return @noinline iterate_continued(s, i, u)
644,351✔
508
end
509

510
# duck-type s so that external UTF-8 string packages like StringViews can hook in
511
function iterate_continued(s, i::Int, u::UInt32)
267,125✔
512
    @label begin
267,125✔
513
        u < 0xc0000000 && (i += 1; break)
267,125✔
514
        n = ncodeunits(s)
252,184✔
515
        # first continuation byte
516
        (i += 1) > n && break
252,184✔
517
        @inbounds b = codeunit(s, i)
249,030✔
518
        b & 0xc0 == 0x80 || break
249,030✔
519
        u |= UInt32(b) << 16
240,193✔
520
        # second continuation byte
521
        ((i += 1) > n) | (u < 0xe0000000) && break
240,193✔
522
        @inbounds b = codeunit(s, i)
123,792✔
523
        b & 0xc0 == 0x80 || break
123,792✔
524
        u |= UInt32(b) << 8
123,792✔
525
        # third continuation byte
526
        ((i += 1) > n) | (u < 0xf0000000) && break
123,792✔
527
        @inbounds b = codeunit(s, i)
2,067✔
528
        b & 0xc0 == 0x80 || break
2,067✔
529
        u |= UInt32(b); i += 1
2,067✔
530
    end
531
    return reinterpret(Char, u), i
267,125✔
532
end
533

534
@propagate_inbounds function getindex(s::Union{String, StringView}, i::Int)
2,091✔
535
    b = codeunit(s, i)
128,382,841✔
536
    u = UInt32(b) << 24
128,382,841✔
537
    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
256,576,624✔
538
    return getindex_continued(s, i, u)
188,990✔
539
end
540

541
# duck-type s so that external UTF-8 string packages like StringViews can hook in
542
function getindex_continued(s, i::Int, u::UInt32)
7,991✔
543
    @label begin
7,991✔
544
        if u < 0xc0000000
7,991✔
545
            # called from `getindex` which checks bounds
546
            @inbounds isvalid(s, i) && break
×
547
            string_index_err(s, i)
×
548
        end
549
        n = ncodeunits(s)
7,991✔
550

551
        (i += 1) > n && break
7,991✔
552
        @inbounds b = codeunit(s, i) # cont byte 1
7,991✔
553
        b & 0xc0 == 0x80 || break
7,991✔
554
        u |= UInt32(b) << 16
7,991✔
555

556
        ((i += 1) > n) | (u < 0xe0000000) && break
7,991✔
557
        @inbounds b = codeunit(s, i) # cont byte 2
7,941✔
558
        b & 0xc0 == 0x80 || break
7,941✔
559
        u |= UInt32(b) << 8
7,941✔
560

561
        ((i += 1) > n) | (u < 0xf0000000) && break
7,941✔
UNCOV
562
        @inbounds b = codeunit(s, i) # cont byte 3
×
UNCOV
563
        b & 0xc0 == 0x80 || break
×
UNCOV
564
        u |= UInt32(b)
×
565
    end
566
    return reinterpret(Char, u)
7,991✔
567
end
568

569
function getindex(s::Union{String, StringView}, r::AbstractUnitRange{<:Integer})
8✔
570
    span = (Int(first(r))::Int):(Int(last(r)))::Int
8✔
571
    return s[span]
8✔
572
end
573

574
@inline function getindex(s::String, r::UnitRange{Int})
1,256✔
575
    isempty(r) && return ""
2,521,070✔
576
    i, j = first(r), last(r)
1,403,826✔
577
    @boundscheck begin
2,477,894✔
578
        checkbounds(s, r)
2,477,894✔
579
        @inbounds isvalid(s, i) || string_index_err(s, i)
2,477,894✔
580
        @inbounds isvalid(s, j) || string_index_err(s, j)
2,477,894✔
581
    end
582
    # Safety: The boundscheck checked r is inbounds in s,
583
    # and since we also checked r is not empty, j must be inbounds in s
584
    j = @inbounds nextind(s, j) - 1
4,954,602✔
585
    n = (j - i + 1) % UInt
2,477,894✔
586
    ss = _string_n(n)
2,477,894✔
587
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
2,477,894✔
588
    return ss
2,477,894✔
589
end
590

591
# nothrow because we know the start and end indices are valid
592
@assume_effects :nothrow function length(s::String)
105,337✔
593
    return length_continued(s, 1, ncodeunits(s), ncodeunits(s))
105,337✔
594
end
595

UNCOV
596
function length(s::StringView)
×
UNCOV
597
    return length_continued(s, 1, ncodeunits(s), ncodeunits(s))
×
598
end
599

600
# effects needed because @inbounds
601
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
602
    _length(s, i, j)
169,232✔
603
end
604

UNCOV
605
@inline function length(s::StringView, i::Int, j::Int)
×
UNCOV
606
    _length(s, i, j)
×
607
end
608

609
@inline function _length(s::Union{String, StringView}, i::Int, j::Int)
610
    @boundscheck begin
113,532✔
611
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
113,532✔
612
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
113,532✔
613
    end
614
    j < i && return 0
113,532✔
615
    @inbounds i, k = thisind(s, i), i
113,860✔
616
    c = j - i + (i == k)
56,930✔
617
    @inbounds length_continued(s, i, j, c)
56,930✔
618
end
619

620
@assume_effects :terminates_globally @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
4✔
621
    _length_continued(s, i, n, c)
162,273✔
622
end
623

UNCOV
624
@propagate_inbounds function length_continued(s::StringView, i::Int, n::Int, c::Int)
×
UNCOV
625
    _length_continued(s, i, n, c)
×
626
end
627

628

629
@propagate_inbounds function _length_continued(s::Union{String, StringView}, i::Int, n::Int, c::Int)
4✔
630
    i < n || return c
163,706✔
631
    b = codeunit(s, i)
160,828✔
632
    while true
909,768✔
633
        while true
3,409,320✔
634
            (i += 1) ≤ n || return c
18,951,816✔
635
            0xc0 ≤ b ≤ 0xf7 && break
18,632,756✔
636
            b = codeunit(s, i)
17,882,518✔
637
        end
17,882,518✔
638
        l = b
10✔
639
        b = codeunit(s, i) # cont byte 1
750,238✔
640
        c -= (x = b & 0xc0 == 0x80)
750,238✔
641
        x & (l ≥ 0xe0) || continue
750,238✔
642

643
        (i += 1) ≤ n || return c
60,470✔
644
        b = codeunit(s, i) # cont byte 2
57,874✔
645
        c -= (x = b & 0xc0 == 0x80)
57,874✔
646
        x & (l ≥ 0xf0) || continue
115,748✔
647

648
        (i += 1) ≤ n || return c
×
UNCOV
649
        b = codeunit(s, i) # cont byte 3
×
UNCOV
650
        c -= (b & 0xc0 == 0x80)
×
651
    end
748,940✔
652
end
653

654
## overload methods for efficiency ##
655

656
# An index is valid when it is in bounds and `thisind` maps it to itself.
function isvalid(s::String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end
117,145,199✔
657

658
isascii(s::String) = isascii(codeunits(s))
6,351,203✔
659

660
# don't assume effects for general integers since we cannot know their implementation
661
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
8,351,254✔
662

663
"""
664
    repeat(c::AbstractChar, r::Integer)::String
665

666
Repeat a character `r` times. This can equivalently be accomplished by calling
667
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
668

669
# Examples
670
```jldoctest
671
julia> repeat('A', 3)
672
"AAA"
673
```
674
"""
675
# Build a `String` made of `r` copies of the character `c`.
#
# Throws `ArgumentError` when `r` is negative and `OutOfMemoryError` when the
# total byte size `n*r` does not fit in a `UInt`.
function repeat(c::AbstractChar, r::Integer)
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    r = UInt(r)::UInt
    c = Char(c)::Char
    r == 0 && return ""
    # Byte-swap the raw `Char` bits so that the first encoded byte ends up in
    # the low-order byte of `u` (it is the byte written first below).
    u = bswap(reinterpret(UInt32, c))
    # Number of bytes to write per character (1 to 4): 4 minus the number of
    # zero high-order bytes of `u`.
    n = 4 - (leading_zeros(u | 0xff) >> 3)
    # `n*r` is `UInt` arithmetic and wraps silently on overflow. A wrapped size
    # would allocate a buffer smaller than what the store loops below write,
    # so reject sizes that cannot be represented before allocating.
    r > typemax(UInt) ÷ (n % UInt) && throw(OutOfMemoryError())
    s = _string_n(n*r)
    p = pointer(s)
    GC.@preserve s if n == 1
        memset(p, u % UInt8, r)
    elseif n == 2
        # Two-byte encoding: store both bytes at once as a `UInt16`.
        p16 = reinterpret(Ptr{UInt16}, p)
        for i = 1:r
            unsafe_store!(p16, u % UInt16, i)
        end
    elseif n == 3
        # Three bytes do not match a native integer width: store byte by byte.
        b1 = (u >> 0) % UInt8
        b2 = (u >> 8) % UInt8
        b3 = (u >> 16) % UInt8
        for i = 0:r-1
            unsafe_store!(p, b1, 3i + 1)
            unsafe_store!(p, b2, 3i + 2)
            unsafe_store!(p, b3, 3i + 3)
        end
    elseif n == 4
        # Four-byte encoding: store the whole `UInt32` per repetition.
        p32 = reinterpret(Ptr{UInt32}, p)
        for i = 1:r
            unsafe_store!(p32, u, i)
        end
    end
    return s
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc