• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37738

05 Apr 2024 02:12AM UTC coverage: 80.159% (-1.3%) from 81.432%
#37738

push

local

web-flow
change the variable name `linfo::MethodInstance` to `mi::MethodInstance` (#53952)

54 of 61 new or added lines in 8 files covered. (88.52%)

1142 existing lines in 53 files now uncovered.

69535 of 86746 relevant lines covered (80.16%)

14578957.25 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.97
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
    # the string that was being indexed
    string::AbstractString
    # the index that was rejected
    index::Integer
end
12
# Throw a `StringIndexError` for `s` at `i` (converted to `Int`); marked
# `@noinline` so the throw is not inlined into callers.
@noinline function string_index_err(s::AbstractString, i::Integer)
    throw(StringIndexError(s, Int(i)))
end
14
function Base.showerror(io::IO, exc::StringIndexError)
    str, idx = exc.string, exc.index
    print(io, "StringIndexError: ", "invalid index [$(idx)]")
    # Nearby-index hints are only printed when the index lies within the code units.
    (firstindex(str) <= idx <= ncodeunits(str)) || return nothing
    lo = thisind(str, idx)
    hi = nextind(str, lo)
    lo_text = escape_string(str[lo:lo])
    if hi > ncodeunits(str)
        # `lo` starts the last character, so there is no following index to show
        print(io, ", valid nearby index [$lo]=>'$lo_text'")
    else
        hi_text = escape_string(str[hi:hi])
        print(io, ", valid nearby indices [$lo]=>'$lo_text', [$hi]=>'$hi_text'")
    end
    return nothing
end
29

30
# Union of the UInt8/Int8 vector, code-unit and contiguous-subarray types listed below.
const ByteArray = Union{
    CodeUnits{UInt8,String},
    Vector{UInt8},
    Vector{Int8},
    FastContiguousSubArray{UInt8,1,CodeUnits{UInt8,String}},
    FastContiguousSubArray{UInt8,1,Vector{UInt8}},
    FastContiguousSubArray{Int8,1,Vector{Int8}},
}
31

32
# Inclusive range test `lo ≤ b ≤ hi`. The non-short-circuiting `&` is used, so
# both comparisons are always evaluated.
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    return (b ≥ lo) & (hi ≥ b)
end
318,098,868✔
33

34
"""
35
    String <: AbstractString
36

37
The default string type in Julia, used by e.g. string literals.
38

39
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
40
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
41
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
42
that the underlying byte sequence is valid as UTF-8.
43
"""
44
String
45

46
## constructors and conversions ##
47

48
# String constructor docstring from boot.jl, workaround for #16730
49
# and the unavailability of @doc in boot.jl context.
50
"""
51
    String(v::AbstractVector{UInt8})
52

53
Create a new `String` object using the data buffer from byte vector `v`.
54
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
55
modification of `v` cannot affect the contents of the resulting string.
56
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
57
`AbstractVector` types, `String(v)` already makes a copy.
58

59
When possible, the memory of `v` will be used without copying when the `String`
60
object is created. This is guaranteed to be the case for byte vectors returned
61
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
62
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
63
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
64
to guarantee consistent behavior.
65
"""
66
function String(v::AbstractVector{UInt8})
    # copy the bytes into a freshly allocated StringMemory and convert that
    buf = StringMemory(length(v))
    return String(copyto!(buf, v))
end
5,225,710✔
67
function String(v::Memory{UInt8})
    nbytes = length(v)
    if nbytes == 0
        # empty memory maps to the empty string literal without a runtime call
        return ""
    end
    return ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), v, nbytes)
end
72
function String(v::Vector{UInt8})
    nbytes = length(v)
    nbytes == 0 && return ""
    mref = v.ref
    str = if mref.ptr_or_offset == mref.mem.ptr
        # the vector's data starts at the beginning of its memory: convert the memory object
        ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), mref.mem, nbytes)
    else
        # the data starts elsewhere: build the string from the referenced bytes
        ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), mref, nbytes)
    end
    # optimized empty!(v); sizehint!(v, 0) calls
    setfield!(v, :size, (0,))
    setfield!(v, :ref, MemoryRef(Memory{UInt8}()))
    return str
end
87

88
"""
89
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
90

91
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
92
(The pointer can be safely freed afterwards.) If `length` is specified
93
(the length of the data in bytes), the string does not have to be NUL-terminated.
94

95
This function is labeled "unsafe" because it will crash if `p` is not
96
a valid memory address to data of the requested length.
97
"""
98
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    # reject NULL before handing the pointer to the runtime
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
102
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    # reject NULL before handing the pointer to the runtime
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
106

107
# This is @assume_effects :effect_free :nothrow :terminates_globally @ccall jl_alloc_string(n::Csize_t)::Ref{String},
108
# but the macro is not available at this time in bootstrap, so we write it manually.
109
# Foreign call to `jl_alloc_string`, passing `n` converted to `Csize_t` and returning `Ref{String}`.
# NOTE(review): `0x000e` presumably encodes the effect bits of the `@assume_effects`
# form this replaces, and the bytes of the result look uninitialized until written — confirm.
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String}, Expr(:call, Expr(:core, :svec), :Csize_t), 1, QuoteNode((:ccall,0x000e)), :(convert(Csize_t, n))))
160,372,767✔
110

111
"""
112
    String(s::AbstractString)
113

114
Create a new `String` from an existing `AbstractString`.
115
"""
116
function String(s::AbstractString)
    # delegates to print_to_string
    return print_to_string(s)
end
463✔
117
# Copy the symbol's name out through its `Ptr{UInt8}` conversion.
@assume_effects :total function String(s::Symbol)
    name_ptr = unsafe_convert(Ptr{UInt8}, s)
    return unsafe_string(name_ptr)
end
5,186,776✔
118

119
# Obtain a `Memory{UInt8}` for `s` from the runtime.
function unsafe_wrap(::Type{Memory{UInt8}}, s::String)
    return ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
end
# Same as above, wrapped as a `Vector{UInt8}`.
function unsafe_wrap(::Type{Vector{UInt8}}, s::String)
    mem = unsafe_wrap(Memory{UInt8}, s)
    return wrap(Array, mem)
end
53,445✔
121

122
# Copy the code units into a newly allocated vector.
function Vector{UInt8}(s::CodeUnits{UInt8,String})
    out = Vector{UInt8}(undef, length(s))
    return copyto!(out, s)
end
# Both string conversions go through the code-unit method above.
function Vector{UInt8}(s::String)
    return Vector{UInt8}(codeunits(s))
end
function Array{UInt8}(s::String)
    return Vector{UInt8}(codeunits(s))
end
×
125

126
# Return the string stored in the code-unit wrapper's `s` field.
function String(s::CodeUnits{UInt8,String})
    return s.s
end
3✔
127

128
## low-level functions ##
129

130
# Pointer to the first code unit of `s`.
function pointer(s::String)
    return unsafe_convert(Ptr{UInt8}, s)
end
# Pointer to code unit `i` of `s` (1-based); no bounds check is performed here.
function pointer(s::String, i::Integer)
    base = pointer(s)
    return base + Int(i)::Int - 1
end
462,210,599✔
132

133
# The number of code units of a String is its size in bytes.
function ncodeunits(s::String)
    return Core.sizeof(s)
end
# The code unit type of String is always UInt8.
function codeunit(s::String)
    return UInt8
end

# Convert other integer index types to `Int` and forward to the `Int` method.
function codeunit(s::String, i::Integer)
    return codeunit(s, Int(i))
end
×
137
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
    @boundscheck checkbounds(s, i)
    # keep `s` rooted while its memory is read through a raw pointer
    return GC.@preserve s unsafe_load(pointer(s, i))
end
142

143
## comparison ##
144

145
# String/String comparison: forwards to the generic method with `:total` effects asserted.
@assume_effects :total function _memcmp(a::String, b::String)
    return @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString})
end

# Compare the common prefix, i.e. the first `min(sizeof(a), sizeof(b))` bytes.
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString})
    nbytes = min(sizeof(a), sizeof(b))
    return _memcmp(a, b, nbytes)
end
# Compare the first `len` bytes of `a` and `b` with `memcmp`; the result is converted with `% Int`.
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
    result = GC.@preserve a b begin
        ptr_a = unsafe_convert(Ptr{UInt8}, a)
        ptr_b = unsafe_convert(Ptr{UInt8}, b)
        memcmp(ptr_a, ptr_b, len % Csize_t) % Int
    end
    return result
end
155

156
function cmp(a::String, b::String)
    len_a, len_b = sizeof(a), sizeof(b)
    prefix_order = _memcmp(a, b)
    if prefix_order < 0
        return -1
    elseif prefix_order > 0
        return +1
    end
    # common prefix is identical: the byte lengths decide
    return cmp(len_a, len_b)
end
161

162
# String equality is defined as egality (`===`).
function ==(a::String, b::String)
    return a === b
end
12,286,162✔
163

164
# `typemin` for String is the empty string.
function typemin(::Type{String})
    return ""
end
# Instance form forwards to the type form.
function typemin(::String)
    return typemin(String)
end
1✔
166

167
## thisind, nextind ##
168

169
# String method of `thisind`: forwards to the shared String/SubString implementation.
@propagate_inbounds function thisind(s::String, i::Int)
    return _thisind_str(s, i)
end
260,521,988✔
170

171
# s should be String or SubString{String}
172
@inline function _thisind_str(s, i::Int)
    i == 0 && return 0
    ncu = ncodeunits(s)
    i == ncu + 1 && return i
    @boundscheck between(i, 1, ncu) || throw(BoundsError(s, i))
    @inbounds unit = codeunit(s, i)
    # fast path: `i` is not a continuation byte, or there is nothing before it to scan
    (unit & 0xc0 == 0x80) & (i-1 > 0) || return i
    (@noinline function _thisind_continued(str, idx, len) # mark the rest of the function as a slow-path
        local prev
        # one byte back: a lead byte in 0xc0:0xf7 owns `idx`
        @inbounds prev = codeunit(str, idx-1)
        between(prev, 0xc0, 0xf7) && return idx-1
        (prev & 0xc0 == 0x80) & (idx-2 > 0) || return idx
        # two bytes back: only a lead byte in 0xe0:0xf7 reaches this far
        @inbounds prev = codeunit(str, idx-2)
        between(prev, 0xe0, 0xf7) && return idx-2
        (prev & 0xc0 == 0x80) & (idx-3 > 0) || return idx
        # three bytes back: only a lead byte in 0xf0:0xf7 reaches this far
        @inbounds prev = codeunit(str, idx-3)
        between(prev, 0xf0, 0xf7) && return idx-3
        return idx
    end)(s, i, ncu)
end
192

193
# String method of `nextind`: forwards to the shared String/SubString implementation.
@propagate_inbounds function nextind(s::String, i::Int)
    return _nextind_str(s, i)
end
51,765,047✔
194

195
# s should be String or SubString{String}
196
@inline function _nextind_str(s, i::Int)
    i == 0 && return 1
    ncu = ncodeunits(s)
    @boundscheck between(i, 1, ncu) || throw(BoundsError(s, i))
    @inbounds first_unit = codeunit(s, i)
    # a code unit outside 0x80:0xf7 is always followed directly by the next index
    between(first_unit, 0x80, 0xf7) || return i+1
    (@noinline function _nextind_continued(str, pos, len, lead) # mark the rest of the function as a slow-path
        if lead < 0xc0
            # handle invalid codeunit index by scanning back to the start of this index
            # (which may be the same as this index)
            start = @inbounds thisind(str, pos)
            start >= pos && return pos+1
            pos = start
            @inbounds lead = codeunit(str, pos)
            (lead < 0x80) | (0xf8 ≤ lead) && return pos+1
            @assert lead >= 0xc0
        end
        # first continuation byte
        (pos += 1) > len && return pos
        @inbounds cont = codeunit(str, pos)
        cont & 0xc0 ≠ 0x80 && return pos
        ((pos += 1) > len) | (lead < 0xe0) && return pos
        # second continuation byte
        @inbounds cont = codeunit(str, pos)
        cont & 0xc0 ≠ 0x80 && return pos
        ((pos += 1) > len) | (lead < 0xf0) && return pos
        # third continuation byte
        @inbounds cont = codeunit(str, pos)
        return ifelse(cont & 0xc0 ≠ 0x80, pos, pos+1)
    end)(s, i, ncu, first_unit)
end
227

228
## checking UTF-8 & ASCII validity ##
229
#=
230
    The UTF-8 Validation is performed by a shift based DFA.
231
    ┌───────────────────────────────────────────────────────────────────┐
232
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
233
    │                               ├────────3────────┐           │     │
234
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
235
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
236
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
237
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
238
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
239
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
240
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
241
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
242
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
243
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
244
    │                      │        │     ├─┤               │        │  │
245
    │                      │        └─4──►│6├─────1,9───────┘        │  │
246
    │          INVALID     │              └─┘                        │  │
247
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
248
    │          ┌▼───┴┐                                                  │
249
    │          │  2  ◄─── All undefined transitions result in state 2   │
250
    │          └─────┘                                                  │
251
    └───────────────────────────────────────────────────────────────────┘
252

253
        Validation States
254
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
255
                        If the DFA ends in this state the string is ASCII only
256
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
257
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
258
                    as seen by all 2s in that column of the table below
259
            3 -> One valid continuation byte needed to return to state 1
260
        4,5,6 -> Two valid continuation bytes needed to return to state 1
261
        7,8,9 -> Three valid continuation bytes needed to return to state 1
262

263
                        Current State
264
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
265
                0 | 0  1  2  2  2  2  2  2  2  2
266
                1 | 2  2  2  1  3  2  3  2  4  4
267
                2 | 3  3  2  2  2  2  2  2  2  2
268
                3 | 4  4  2  2  2  2  2  2  2  2
269
                4 | 6  6  2  2  2  2  2  2  2  2
270
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
271
    Class       6 | 8  8  2  2  2  2  2  2  2  2
272
                7 | 2  2  2  1  3  3  2  4  4  2
273
                8 | 2  2  2  2  2  2  2  2  2  2
274
                9 | 2  2  2  1  3  2  3  4  4  2
275
               10 | 5  5  2  2  2  2  2  2  2  2
276
               11 | 7  7  2  2  2  2  2  2  2  2
277

278
           Shifts | 0  4 10 14 18 24  8 20 12 26
279

280
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
281
    the rows the correct shift was a result.
282

283
    Each character class row encodes 10 states with the shifts defined above. Shifting the bits of a row by
284
    the current state's shift and then masking the result with 0b11110 (0x1E) gives the shift for the new state
285

286

287
=#
288

289
#State type used by UTF-8 DFA
290
# The largest state shift is 26 and each encoded entry is masked with 0x1E (bits 1-4),
# so a whole table row fits in 32 bits.
const _UTF8DFAState = UInt32
291
# Fill the table with 256 `_UTF8DFAState` (UInt32) values representing the DFA transitions for all bytes
292
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
    num_classes = 12
    num_states = 10

    # These shifts were derived using a SMT solver; entry k+1 is the shift that encodes state k
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]

    # Character class of each byte value 0x00:0xff, sixteen bytes per row
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]

    # Transition table: row = character class (0-11), column = current state (0-9),
    # entry = next state. These are the rows discussed in comments above.
    next_state = [ 0  1  2  2  2  2  2  2  2  2;
                   2  2  2  1  3  2  3  2  4  4;
                   3  3  2  2  2  2  2  2  2  2;
                   4  4  2  2  2  2  2  2  2  2;
                   6  6  2  2  2  2  2  2  2  2;
                   9  9  2  2  2  2  2  2  2  2;
                   8  8  2  2  2  2  2  2  2  2;
                   2  2  2  1  3  3  2  4  4  2;
                   2  2  2  2  2  2  2  2  2  2;
                   2  2  2  1  3  2  3  4  4  2;
                   5  5  2  2  2  2  2  2  2  2;
                   7  7  2  2  2  2  2  2  2  2]

    # Pack each class row into one _UTF8DFAState: the shift of the next state is
    # stored at the bit position given by the shift of the current state.
    class_rows = zeros(_UTF8DFAState, num_classes)
    for class in 1:num_classes
        encoded = _UTF8DFAState(0)
        for state in 1:num_states
            # shift that encodes the state reached from `state` on this class
            next_shift = UInt8(state_shifts[next_state[class, state] + 1])
            encoded |= _UTF8DFAState(next_shift) << state_shifts[state]
        end
        class_rows[class] = encoded
    end

    # One packed row per byte value, looked up by the byte's character class
    [class_rows[class + 1] for class in character_classes]
end
347

348

349
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # shift of state 0: the start state; the DFA stays here only while the input is ASCII
350
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) # shift of state 1: a complete valid character has been consumed after non-ASCII input was seen
351
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # shift of state 2: once reached the state never changes, so validation can stop
352

353
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
354
@inline function _utf_dfa_step(state::_UTF8DFAState, byte::UInt8)
    # the table has 256 entries and `byte + 1` is in 1:256, so the lookup is in bounds
    row = @inbounds _UTF8_DFA_TABLE[byte+1]
    # shift the current state's field down and keep the encoded next-state shift
    return (row >> state) & _UTF8DFAState(0x0000001E)
end
54,209✔
355

356
# Run the DFA over `bytes[first:last]` starting from `state` and return the final state.
# No bounds checking is done on the indices.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
    for idx in first:last
        @inbounds state = _utf_dfa_step(state, bytes[idx])
    end
    return state
end
362

363
# Scan `cu[first:last]` in windows of `chunk_size` code units and return the start of
# the first window that is not all ASCII, or `nothing` when every window is ASCII.
@inline function _find_nonascii_chunk(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    pos = first
    while pos <= last - chunk_size
        if !_isascii(cu, pos, pos + chunk_size - 1)
            return pos
        end
        pos += chunk_size
    end
    # the final window is aligned to `last`, so it may overlap the previous one
    pos = last - chunk_size + 1
    if !_isascii(cu, pos, last)
        return pos
    end
    return nothing
end
373

374
##
375

376
# Classifications of string
    # 0: neither valid ASCII nor UTF-8
    # 1: valid ASCII
    # 2: valid UTF-8
function byte_string_classify(s::AbstractString)
    return byte_string_classify(codeunits(s))
end

function byte_string_classify(bytes::AbstractVector{UInt8})
    ascii_chunk = 1024
    nbytes = length(bytes)
    if nbytes <= ascii_chunk + (ascii_chunk ÷ 2)
        # short input: one ASCII test over everything, DFA from index 1 if it fails
        _isascii(bytes, 1, nbytes) && return 1
        return _byte_string_classify_nonascii(bytes, 1, nbytes)
    end
    # long input: skip leading all-ASCII chunks and start the DFA at the first
    # chunk that contains a non-ASCII byte
    first_nonascii = _find_nonascii_chunk(ascii_chunk, bytes, 1, nbytes)
    isnothing(first_nonascii) && return 1
    return _byte_string_classify_nonascii(bytes, first_nonascii, nbytes)
end
396

397
# Classify `bytes[first:last]` in windows of 256 bytes: returns 2 when the DFA ends in
# the accept state and 0 otherwise.
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
    window = 256

    lo = first
    hi = min(last, first + window - 1)
    state = _UTF8_DFA_ACCEPT
    while lo <= last
        # try to process ascii chunks
        while state == _UTF8_DFA_ACCEPT
            _isascii(bytes, lo, hi) || break
            (lo = lo + window) <= last || break
            hi = min(last, hi + window)
        end
        # Process non ascii chunk
        state = _isvalid_utf8_dfa(state, bytes, lo, hi)
        if state == _UTF8_DFA_INVALID
            return 0
        end

        lo += window
        hi = min(last, hi + window)
    end
    return state == _UTF8_DFA_ACCEPT ? 2 : 0
end
419

420
# Valid as a String means classified as ASCII (1) or UTF-8 (2), i.e. anything but 0.
function isvalid(::Type{String}, bytes::AbstractVector{UInt8})
    return (@inline byte_string_classify(bytes)) ≠ 0
end
function isvalid(::Type{String}, s::AbstractString)
    return (@inline byte_string_classify(s)) ≠ 0
end

# Validate a string through its code units.
@inline function isvalid(s::AbstractString)
    return @inline isvalid(String, codeunits(s))
end
192✔
424

UNCOV
425
# True when the top two bits of `c` are `10`.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
×
426

427
## required core functionality ##
428

429
@inline function iterate(s::String, i::Int=firstindex(s))
    # one unsigned comparison covers both `i < 1` and `i > ncodeunits(s)`
    (i % UInt) - 1 < ncodeunits(s) || return nothing
    lead = @inbounds codeunit(s, i)
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        # lead or continuation byte: decode the rest out of line
        return iterate_continued(s, i, bits)
    end
    return reinterpret(Char, bits), i+1
end
436

437
# duck-type s so that external UTF-8 string packages like StringViews can hook in
438
function iterate_continued(s, i::Int, u::UInt32)
    # lead byte below 0xc0: the character is exactly this one code unit
    u < 0xc0000000 && (i += 1; @goto done)
    ncu = ncodeunits(s)
    # first continuation byte
    (i += 1) > ncu && @goto done
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 == 0x80 || @goto done
    u |= UInt32(cont) << 16
    # second continuation byte
    ((i += 1) > ncu) | (u < 0xe0000000) && @goto done
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 == 0x80 || @goto done
    u |= UInt32(cont) << 8
    # third continuation byte
    ((i += 1) > ncu) | (u < 0xf0000000) && @goto done
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 == 0x80 || @goto done
    u |= UInt32(cont); i += 1
@label done
    # `i` is now the index at which the next character starts
    return reinterpret(Char, u), i
end
459

460
@propagate_inbounds function getindex(s::String, i::Int)
    lead = codeunit(s, i)
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        # lead or continuation byte: decode the rest out of line
        return getindex_continued(s, i, bits)
    end
    return reinterpret(Char, bits)
end
466

467
# duck-type s so that external UTF-8 string packages like StringViews can hook in
468
function getindex_continued(s, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        @inbounds isvalid(s, i) && @goto done
        string_index_err(s, i)
    end
    ncu = ncodeunits(s)

    # first continuation byte
    (i += 1) > ncu && @goto done
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 == 0x80 || @goto done
    u |= UInt32(cont) << 16

    # second continuation byte
    ((i += 1) > ncu) | (u < 0xe0000000) && @goto done
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 == 0x80 || @goto done
    u |= UInt32(cont) << 8

    # third continuation byte
    ((i += 1) > ncu) | (u < 0xf0000000) && @goto done
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 == 0x80 || @goto done
    u |= UInt32(cont)
@label done
    return reinterpret(Char, u)
end
493

494
# Convert the endpoints to `Int` and index again with the resulting `UnitRange{Int}`.
function getindex(s::String, r::AbstractUnitRange{<:Integer})
    lo, hi = Int(first(r)), Int(last(r))
    return s[lo:hi]
end
4✔
495

496
@inline function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    lo, hi = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        # both endpoints must be valid character indices
        @inbounds isvalid(s, lo) || string_index_err(s, lo)
        @inbounds isvalid(s, hi) || string_index_err(s, hi)
    end
    # extend `hi` to the last code unit of the character that starts there
    hi = nextind(s, hi) - 1
    nbytes = hi - lo + 1
    out = _string_n(nbytes)
    GC.@preserve s out unsafe_copyto!(pointer(out), pointer(s, lo), nbytes)
    return out
end
510

511
# nothrow because we know the start and end indices are valid
512
@assume_effects :nothrow function length(s::String)
    # start the count at the number of code units; length_continued adjusts it
    ncu = ncodeunits(s)
    return length_continued(s, 1, ncu, ncu)
end
891,431✔
513

514
# effects needed because @inbounds
515
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    j < i && return 0
    requested = i
    # move back to the start of the character that contains `requested`
    @inbounds i = thisind(s, requested)
    # initial count passed to length_continued; the extra 1 applies only when
    # `requested` was already a character start
    c = j - i + (i == requested)
    return @inbounds length_continued(s, i, j, c)
end
525

526
# Starting from count `c`, scan code units `i` to `n` of `s` and subtract one for every
# continuation byte that follows a lead byte within that lead's width.
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
    i < n || return c
    unit = codeunit(s, i)
    while true
        # advance until `unit` is a lead byte (0xc0:0xf7) or the range is exhausted
        while true
            (i += 1) ≤ n || return c
            0xc0 ≤ unit ≤ 0xf7 && break
            unit = codeunit(s, i)
        end
        lead = unit
        unit = codeunit(s, i) # cont byte 1
        iscont = unit & 0xc0 == 0x80
        c -= iscont
        iscont & (lead ≥ 0xe0) || continue

        (i += 1) ≤ n || return c
        unit = codeunit(s, i) # cont byte 2
        iscont = unit & 0xc0 == 0x80
        c -= iscont
        iscont & (lead ≥ 0xf0) || continue

        (i += 1) ≤ n || return c
        unit = codeunit(s, i) # cont byte 3
        c -= (unit & 0xc0 == 0x80)
    end
end
550

551
## overload methods for efficiency ##
552

553
# `i` is a valid index when it is in bounds and is the start of the character containing it.
function isvalid(s::String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end
235,432,198✔
554

555
# Test the string's code units rather than decoding characters.
function isascii(s::String)
    return isascii(codeunits(s))
end
648,223✔
556

557
# don't assume effects for general integers since we cannot know their implementation
558
@assume_effects :foldable function repeat(c::Char, r::BitInteger)
    # same implementation as the generic `Integer` method, reached via `@invoke`
    return @invoke repeat(c::Char, r::Integer)
end
187,140✔
559

560
"""
561
    repeat(c::AbstractChar, r::Integer) -> String
562

563
Repeat a character `r` times. This can equivalently be accomplished by calling
564
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
565

566
# Examples
567
```jldoctest
568
julia> repeat('A', 3)
569
"AAA"
570
```
571
"""
572
function repeat(c::AbstractChar, r::Integer)
    c = Char(c)::Char
    r == 0 && return ""
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    # byte-swapped bits of `c`: the first encoded byte ends up in the low-order position
    u = bswap(reinterpret(UInt32, c))
    # number of bytes stored for `c` (1 to 4)
    n = 4 - (leading_zeros(u | 0xff) >> 3)
    # Checked multiply: a wrapped `n*r` would allocate a string that is too small
    # while the loops below still write `n*r` bytes. Throws `OverflowError` instead.
    s = _string_n(Base.Checked.checked_mul(n, r))
    p = pointer(s)
    GC.@preserve s if n == 1
        memset(p, u % UInt8, r)
    elseif n == 2
        # store the two bytes as one UInt16 per repetition
        p16 = reinterpret(Ptr{UInt16}, p)
        for i = 1:r
            unsafe_store!(p16, u % UInt16, i)
        end
    elseif n == 3
        # no 3-byte integer type: store byte by byte
        b1 = (u >> 0) % UInt8
        b2 = (u >> 8) % UInt8
        b3 = (u >> 16) % UInt8
        for i = 0:r-1
            unsafe_store!(p, b1, 3i + 1)
            unsafe_store!(p, b2, 3i + 2)
            unsafe_store!(p, b3, 3i + 3)
        end
    elseif n == 4
        # store all four bytes as one UInt32 per repetition
        p32 = reinterpret(Ptr{UInt32}, p)
        for i = 1:r
            unsafe_store!(p32, u, i)
        end
    end
    return s
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc