• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37997

29 Jan 2025 02:08AM UTC coverage: 17.283% (-68.7%) from 85.981%
#37997

push

local

web-flow
bpart: Start enforcing min_world for global variable definitions (#57150)

This is the analog of #57102 for global variables. Unlike for constants,
there is no automatic global backdate mechanism. The reasoning for this
is that global variables can be declared at any time, unlike constants
which can only be declared once their value is available. As a result,
code patterns using `Core.eval` to declare globals are rarer and likely
incorrect.

1 of 22 new or added lines in 3 files covered. (4.55%)

31430 existing lines in 188 files now uncovered.

7903 of 45728 relevant lines covered (17.28%)

98663.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

29.83
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
9
    string::AbstractString
10
    index::Integer
11
end
UNCOV
12
@noinline string_index_err(s::AbstractString, i::Integer) =
×
13
    throw(StringIndexError(s, Int(i)))
UNCOV
14
function Base.showerror(io::IO, exc::StringIndexError)
×
UNCOV
15
    s = exc.string
×
UNCOV
16
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
×
UNCOV
17
    if firstindex(s) <= exc.index <= ncodeunits(s)
×
UNCOV
18
        iprev = thisind(s, exc.index)
×
UNCOV
19
        inext = nextind(s, iprev)
×
UNCOV
20
        escprev = escape_string(s[iprev:iprev])
×
UNCOV
21
        if inext <= ncodeunits(s)
×
UNCOV
22
            escnext = escape_string(s[inext:inext])
×
UNCOV
23
            print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
×
24
        else
UNCOV
25
            print(io, ", valid nearby index [$iprev]=>'$escprev'")
×
26
        end
27
    end
28
end
29

30
@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo ≤ b) & (b ≤ hi)
1,474,403✔
31

32
"""
33
    String <: AbstractString
34

35
The default string type in Julia, used by e.g. string literals.
36

37
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
38
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
39
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
40
that the underlying byte sequence is valid as UTF-8.
41
"""
42
String
43

44
## constructors and conversions ##
45

46
# String constructor docstring from boot.jl, workaround for #16730
47
# and the unavailability of @doc in boot.jl context.
48
"""
49
    String(v::AbstractVector{UInt8})
50

51
Create a new `String` object using the data buffer from byte vector `v`.
52
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
53
modification of `v` cannot affect the contents of the resulting string.
54
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
55
`AbstractVector` types, `String(v)` already makes a copy.
56

57
When possible, the memory of `v` will be used without copying when the `String`
58
object is created. This is guaranteed to be the case for byte vectors returned
59
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
60
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
61
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
62
to guarantee consistent behavior.
63
"""
64
String(v::AbstractVector{UInt8}) = unsafe_takestring(copyto!(StringMemory(length(v)), v))
21✔
65
function String(v::Vector{UInt8})
66
    #return ccall(:jl_array_to_string, Ref{String}, (Any,), v)
67
    len = length(v)
560,915✔
68
    len == 0 && return ""
560,915✔
69
    ref = v.ref
303,684✔
70
    if ref.ptr_or_offset == ref.mem.ptr
308,726✔
71
        str = ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), ref.mem, len)
308,726✔
72
    else
UNCOV
73
        str = ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), ref, len)
×
74
    end
75
    # optimized empty!(v); sizehint!(v, 0) calls
76
    setfield!(v, :size, (0,))
308,726✔
77
    setfield!(v, :ref, memoryref(Memory{UInt8}()))
303,684✔
78
    return str
308,726✔
79
end
80

81
"Create a string re-using the memory, if possible.
82
Mutating or reading the memory after calling this function is undefined behaviour."
83
function unsafe_takestring(m::Memory{UInt8})
84
    isempty(m) ? "" : ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), m, length(m))
121,976✔
85
end
86

87
"""
88
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
89

90
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
91
(The pointer can be safely freed afterwards.) If `length` is specified
92
(the length of the data in bytes), the string does not have to be NUL-terminated.
93

94
This function is labeled "unsafe" because it will crash if `p` is not
95
a valid memory address to data of the requested length.
96
"""
97
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
98
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
231,068✔
99
    ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
231,068✔
100
end
101
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
102
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
586✔
103
    ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
586✔
104
end
105

106
# This is `@assume_effects :total !:consistent @ccall jl_alloc_string(n::Csize_t)::Ref{String}`,
107
# but the macro is not available at this time in bootstrap, so we write it manually.
108
const _string_n_override = 0x04ee
109
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String},
98,623✔
110
    :(Core.svec(Csize_t)), 1, QuoteNode((:ccall, _string_n_override)), :(convert(Csize_t, n))))
111

112
"""
113
    String(s::AbstractString)
114

115
Create a new `String` from an existing `AbstractString`.
116
"""
UNCOV
117
String(s::AbstractString) = print_to_string(s)
×
118
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
499✔
119

120
unsafe_wrap(::Type{Memory{UInt8}}, s::String) = ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
68,822✔
UNCOV
121
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = wrap(Array, unsafe_wrap(Memory{UInt8}, s))
×
122

123
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
16,788✔
124
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
16,788✔
UNCOV
125
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
126

UNCOV
127
String(s::CodeUnits{UInt8,String}) = s.s
×
128

129
## low-level functions ##
130

131
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
5,539,012✔
132
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
4,955,845✔
133

134
ncodeunits(s::String) = Core.sizeof(s)
5,995,485✔
135
codeunit(s::String) = UInt8
×
136

UNCOV
137
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
×
138
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
139
    @boundscheck checkbounds(s, i)
4,668,402✔
140
    b = GC.@preserve s unsafe_load(pointer(s, i))
4,668,402✔
141
    return b
4,668,402✔
142
end
143

144
## comparison ##
145

146
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
180,542✔
147

148
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
481,828✔
149
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
150
    GC.@preserve a b begin
481,828✔
151
        pa = unsafe_convert(Ptr{UInt8}, a)
481,828✔
152
        pb = unsafe_convert(Ptr{UInt8}, b)
481,828✔
153
        memcmp(pa, pb, len % Csize_t) % Int
481,828✔
154
    end
155
end
156

157
function cmp(a::String, b::String)
158
    al, bl = sizeof(a), sizeof(b)
180,542✔
159
    c = _memcmp(a, b)
180,542✔
160
    return c < 0 ? -1 : c > 0 ? +1 : cmp(al,bl)
247,600✔
161
end
162

163
==(a::String, b::String) = a===b
241,265✔
164

UNCOV
165
typemin(::Type{String}) = ""
×
UNCOV
166
typemin(::String) = typemin(String)
×
167

168
## thisind, nextind ##
169

170
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
20,650✔
171

172
# s should be String or SubString{String}
173
@inline function _thisind_str(s, i::Int)
174
    i == 0 && return 0
15,850✔
175
    n = ncodeunits(s)
15,838✔
176
    i == n + 1 && return i
15,838✔
177
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
15,838✔
178
    @inbounds b = codeunit(s, i)
15,838✔
179
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
31,169✔
UNCOV
180
    (@noinline function _thisind_continued(s, i, n) # mark the rest of the function as a slow-path
×
181
        local b
182
        @inbounds b = codeunit(s, i-1)
183
        between(b, 0b11000000, 0b11110111) && return i-1
184
        (b & 0xc0 == 0x80) & (i-2 > 0) || return i
185
        @inbounds b = codeunit(s, i-2)
186
        between(b, 0b11100000, 0b11110111) && return i-2
187
        (b & 0xc0 == 0x80) & (i-3 > 0) || return i
188
        @inbounds b = codeunit(s, i-3)
189
        between(b, 0b11110000, 0b11110111) && return i-3
190
        return i
191
    end)(s, i, n)
192
end
193

194
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
1,060✔
195

196
# s should be String or SubString{String}
197
@inline function _nextind_str(s, i::Int)
198
    i == 0 && return 1
532✔
199
    n = ncodeunits(s)
532✔
200
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
532✔
201
    @inbounds l = codeunit(s, i)
532✔
202
    between(l, 0x80, 0xf7) || return i+1
1,064✔
UNCOV
203
    (@noinline function _nextind_continued(s, i, n, l) # mark the rest of the function as a slow-path
×
204
        if l < 0xc0
205
            # handle invalid codeunit index by scanning back to the start of this index
206
            # (which may be the same as this index)
207
            i′ = @inbounds thisind(s, i)
208
            i′ >= i && return i+1
209
            i = i′
210
            @inbounds l = codeunit(s, i)
211
            (l < 0x80) | (0xf8 ≤ l) && return i+1
212
            @assert l >= 0xc0 "invalid codeunit"
213
        end
214
        # first continuation byte
215
        (i += 1) > n && return i
216
        @inbounds b = codeunit(s, i)
217
        b & 0xc0 ≠ 0x80 && return i
218
        ((i += 1) > n) | (l < 0xe0) && return i
219
        # second continuation byte
220
        @inbounds b = codeunit(s, i)
221
        b & 0xc0 ≠ 0x80 && return i
222
        ((i += 1) > n) | (l < 0xf0) && return i
223
        # third continuation byte
224
        @inbounds b = codeunit(s, i)
225
        return ifelse(b & 0xc0 ≠ 0x80, i, i+1)
226
    end)(s, i, n, l)
227
end
228

229
## checking UTF-8 & ASCII validity ##
230
#=
231
    The UTF-8 Validation is performed by a shift based DFA.
232
    ┌───────────────────────────────────────────────────────────────────┐
233
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
234
    │                               ├────────3────────┐           │     │
235
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
236
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
237
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
238
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
239
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
240
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
241
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
242
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
243
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
244
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
245
    │                      │        │     ├─┤               │        │  │
246
    │                      │        └─4──►│6├─────1,9───────┘        │  │
247
    │          INVALID     │              └─┘                        │  │
248
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
249
    │          ┌▼───┴┐                                                  │
250
    │          │  2  ◄─── All undefined transitions result in state 2   │
251
    │          └─────┘                                                  │
252
    └───────────────────────────────────────────────────────────────────┘
253

254
        Validation States
255
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
256
                        If the DFA ends in this state the string is ASCII only
257
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
258
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
259
                    as seen by all 2s in that column of table below
260
            3 -> One valid continuation byte needed to return to state 1
260
        4,5,6 -> Two valid continuation bytes needed to return to state 1
261
        7,8,9 -> Three valid continuation bytes needed to return to state 1
263

264
                        Current State
265
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
266
                0 | 0  1  2  2  2  2  2  2  2  2
267
                1 | 2  2  2  1  3  2  3  2  4  4
268
                2 | 3  3  2  2  2  2  2  2  2  2
269
                3 | 4  4  2  2  2  2  2  2  2  2
270
                4 | 6  6  2  2  2  2  2  2  2  2
271
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
272
    Class       6 | 8  8  2  2  2  2  2  2  2  2
273
                7 | 2  2  2  1  3  3  2  4  4  2
274
                8 | 2  2  2  2  2  2  2  2  2  2
275
                9 | 2  2  2  1  3  2  3  4  4  2
276
               10 | 5  5  2  2  2  2  2  2  2  2
277
               11 | 7  7  2  2  2  2  2  2  2  2
278

279
           Shifts | 0  4 10 14 18 24  8 20 12 26
280

281
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
282
    the rows the correct shift was a result.
283

284
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
285
    the current state then masking the result with 0b11110 gives the shift for the new state
286

287

288
=#
289

290
#State type used by UTF-8 DFA
291
const _UTF8DFAState = UInt32
292
# Fill the table with 256 UInt32 values (_UTF8DFAState) representing the DFA transitions for all bytes
293
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
294
    num_classes=12
295
    num_states=10
296
    bit_per_state = 6
297

298
    # These shifts were derived using a SMT solver
299
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
300

301
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
302
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
303
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
304
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
305
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
306
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
307
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
308
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
309
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
310
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
311
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
312
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
313
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
314
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
315
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
316
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
317

318
    # These are the rows discussed in comments above
319
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
320
                     2  2  2  1  3  2  3  2  4  4;
321
                     3  3  2  2  2  2  2  2  2  2;
322
                     4  4  2  2  2  2  2  2  2  2;
323
                     6  6  2  2  2  2  2  2  2  2;
324
                     9  9  2  2  2  2  2  2  2  2;
325
                     8  8  2  2  2  2  2  2  2  2;
326
                     2  2  2  1  3  3  2  4  4  2;
327
                     2  2  2  2  2  2  2  2  2  2;
328
                     2  2  2  1  3  2  3  4  4  2;
329
                     5  5  2  2  2  2  2  2  2  2;
330
                     7  7  2  2  2  2  2  2  2  2]
331

332
    #This converts the state_arrays into the shift encoded _UTF8DFAState
333
    class_row = zeros(_UTF8DFAState, num_classes)
334

335
    for i = 1:num_classes
336
        row = _UTF8DFAState(0)
337
        for j in 1:num_states
338
            #Calculate the shift required for the next state
339
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
340
            #Shift the next state into the position of the current state
341
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
342
        end
343
        class_row[i]=row
344
    end
345

346
    map(c->class_row[c+1],character_classes)
×
347
end
348

349

350
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Start state (shift 0); the DFA stays here only while every byte seen so far is ASCII
351
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) # Valid, complete-character state (shift 4), entered once a non-ASCII UTF-8 character has been seen; also the initial state used by `_byte_string_classify_nonascii`
352
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # Invalid state (shift 10); once the state machine is in this state it never leaves, so just stop
353

354
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
UNCOV
355
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
×
356

357
# Advance the UTF-8 DFA from `state` over `bytes[first:last]`, applying one
# `_utf_dfa_step` per byte, and return the resulting state.
# Despite the name, the result is a `_UTF8DFAState`, not a `Bool`; callers compare it
# against `_UTF8_DFA_ACCEPT` / `_UTF8_DFA_INVALID`.
# An empty range (`first > last`) returns `state` unchanged.
# The element access is `@inbounds`, so `first:last` must lie within `bytes`.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
UNCOV
358
    for i = first:last
×
UNCOV
359
       @inbounds state = _utf_dfa_step(state, bytes[i])
×
UNCOV
360
    end
×
UNCOV
361
    return (state)
×
362
end
363

364
# Scan `cu[first:last]` in consecutive windows of `chunk_size` elements and return the
# start index of the first window for which `_isascii` is false, or `nothing` if every
# window passes. (`_isascii` is defined elsewhere; presumably it tests that all code
# units in the given index range are ASCII — confirm against its definition.)
# The final window is anchored to end at `last`, so it may overlap the previous one.
# NOTE(review): if the range holds fewer than `chunk_size` elements, the final window
# starts before `first`; the caller visible in this file only calls this when
# `length(bytes)` exceeds 1.5 * chunk_size.
@inline function  _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU}
UNCOV
365
    n=first
×
UNCOV
366
    while n <= last - chunk_size
×
UNCOV
367
        _isascii(cu,n,n+chunk_size-1) || return n
×
UNCOV
368
        n += chunk_size
×
UNCOV
369
    end
×
UNCOV
370
    n= last-chunk_size+1 # final window, anchored so that it ends exactly at `last`
×
UNCOV
371
    _isascii(cu,n,last) || return n
×
UNCOV
372
    return nothing
×
373
end
374

375
##
376

377
# Classifications of string
378
    # 0: neither valid ASCII nor UTF-8
379
    # 1: valid ASCII
380
    # 2: valid UTF-8
UNCOV
381
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
×
382

383

384
function byte_string_classify(bytes::AbstractVector{UInt8})
UNCOV
385
    chunk_size = 1024
×
UNCOV
386
    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
×
UNCOV
387
    n = length(bytes)
×
UNCOV
388
    if n > chunk_threshold
×
UNCOV
389
        start = _find_nonascii_chunk(chunk_size,bytes,1,n)
×
UNCOV
390
        isnothing(start) && return 1
×
391
    else
UNCOV
392
        _isascii(bytes,1,n) && return 1
×
UNCOV
393
        start = 1
×
394
    end
UNCOV
395
    return _byte_string_classify_nonascii(bytes,start,n)
×
396
end
397

UNCOV
398
# Classify `bytes[first:last]` by running the UTF-8 DFA over it in 256-element chunks,
# starting from `_UTF8_DFA_ACCEPT`.
# Returns 2 if the DFA finishes in `_UTF8_DFA_ACCEPT`, and 0 if it ever reaches
# `_UTF8_DFA_INVALID` or finishes in any other state. This function never returns 1.
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
×
UNCOV
399
    chunk_size = 256
×
400

UNCOV
401
    start = first
×
UNCOV
402
    stop = min(last,first + chunk_size - 1)
×
UNCOV
403
    state = _UTF8_DFA_ACCEPT
×
UNCOV
404
    while start <= last
×
405
        # Fast path: while the DFA is in the accept state, skip whole chunks for which
        # `_isascii` holds (presumably "all bytes in start:stop are ASCII" — confirm)
UNCOV
406
        while state == _UTF8_DFA_ACCEPT
×
UNCOV
407
            _isascii(bytes,start,stop) || break
×
408
            (start = start + chunk_size) <= last || break
×
409
            stop = min(last,stop + chunk_size)
×
410
        end
×
411
        # Run the DFA over the current chunk; if the fast path advanced `start` past
        # `last`, then `start > stop` and this is an empty range that leaves `state` as is
UNCOV
412
        state = _isvalid_utf8_dfa(state,bytes,start,stop)
×
UNCOV
413
        state == _UTF8_DFA_INVALID && return 0
×
414

UNCOV
415
        start = start + chunk_size
×
UNCOV
416
        stop = min(last,stop + chunk_size)
×
UNCOV
417
    end
×
UNCOV
418
    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
×
419
end
420

UNCOV
421
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
×
UNCOV
422
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
×
423

UNCOV
424
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
×
425

UNCOV
426
is_valid_continuation(c) = c & 0xc0 == 0x80
×
427

428
## required core functionality ##
429

430
@inline function iterate(s::String, i::Int=firstindex(s))
431
    (i % UInt) - 1 < ncodeunits(s) || return nothing
1,311,678✔
432
    b = @inbounds codeunit(s, i)
1,306,820✔
433
    u = UInt32(b) << 24
1,306,820✔
434
    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
2,613,639✔
435
    return @noinline iterate_continued(s, i, u)
1✔
436
end
437

438
# duck-type s so that external UTF-8 string packages like StringViews can hook in
UNCOV
439
function iterate_continued(s, i::Int, u::UInt32)
×
UNCOV
440
    u < 0xc0000000 && (i += 1; @goto ret)
×
UNCOV
441
    n = ncodeunits(s)
×
442
    # first continuation byte
UNCOV
443
    (i += 1) > n && @goto ret
×
UNCOV
444
    @inbounds b = codeunit(s, i)
×
UNCOV
445
    b & 0xc0 == 0x80 || @goto ret
×
UNCOV
446
    u |= UInt32(b) << 16
×
447
    # second continuation byte
UNCOV
448
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
×
UNCOV
449
    @inbounds b = codeunit(s, i)
×
UNCOV
450
    b & 0xc0 == 0x80 || @goto ret
×
UNCOV
451
    u |= UInt32(b) << 8
×
452
    # third continuation byte
UNCOV
453
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
×
UNCOV
454
    @inbounds b = codeunit(s, i)
×
UNCOV
455
    b & 0xc0 == 0x80 || @goto ret
×
UNCOV
456
    u |= UInt32(b); i += 1
×
UNCOV
457
@label ret
×
UNCOV
458
    return reinterpret(Char, u), i
×
459
end
460

461
@propagate_inbounds function getindex(s::String, i::Int)
462
    b = codeunit(s, i)
150,687✔
463
    u = UInt32(b) << 24
150,687✔
464
    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
301,374✔
UNCOV
465
    return getindex_continued(s, i, u)
×
466
end
467

468
# duck-type s so that external UTF-8 string packages like StringViews can hook in
UNCOV
469
function getindex_continued(s, i::Int, u::UInt32)
×
UNCOV
470
    if u < 0xc0000000
×
471
        # called from `getindex` which checks bounds
UNCOV
472
        @inbounds isvalid(s, i) && @goto ret
×
UNCOV
473
        string_index_err(s, i)
×
474
    end
UNCOV
475
    n = ncodeunits(s)
×
476

UNCOV
477
    (i += 1) > n && @goto ret
×
UNCOV
478
    @inbounds b = codeunit(s, i) # cont byte 1
×
UNCOV
479
    b & 0xc0 == 0x80 || @goto ret
×
UNCOV
480
    u |= UInt32(b) << 16
×
481

UNCOV
482
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
×
UNCOV
483
    @inbounds b = codeunit(s, i) # cont byte 2
×
UNCOV
484
    b & 0xc0 == 0x80 || @goto ret
×
UNCOV
485
    u |= UInt32(b) << 8
×
486

UNCOV
487
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
×
UNCOV
488
    @inbounds b = codeunit(s, i) # cont byte 3
×
UNCOV
489
    b & 0xc0 == 0x80 || @goto ret
×
UNCOV
490
    u |= UInt32(b)
×
UNCOV
491
@label ret
×
UNCOV
492
    return reinterpret(Char, u)
×
493
end
494

UNCOV
495
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
×
496

497
@inline function getindex(s::String, r::UnitRange{Int})
498
    isempty(r) && return ""
510✔
499
    i, j = first(r), last(r)
508✔
500
    @boundscheck begin
510✔
501
        checkbounds(s, r)
510✔
502
        @inbounds isvalid(s, i) || string_index_err(s, i)
510✔
503
        @inbounds isvalid(s, j) || string_index_err(s, j)
510✔
504
    end
505
    j = nextind(s, j) - 1
1,020✔
506
    n = j - i + 1
510✔
507
    ss = _string_n(n)
510✔
508
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
510✔
509
    return ss
510✔
510
end
511

512
# nothrow because we know the start and end indices are valid
UNCOV
513
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
×
514

515
# effects needed because @inbounds
UNCOV
516
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
×
UNCOV
517
    @boundscheck begin
×
UNCOV
518
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
×
UNCOV
519
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
×
520
    end
UNCOV
521
    j < i && return 0
×
UNCOV
522
    @inbounds i, k = thisind(s, i), i
×
UNCOV
523
    c = j - i + (i == k)
×
UNCOV
524
    @inbounds length_continued(s, i, j, c)
×
525
end
526

UNCOV
527
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
×
UNCOV
528
    i < n || return c
×
UNCOV
529
    b = codeunit(s, i)
×
UNCOV
530
    while true
×
UNCOV
531
        while true
×
UNCOV
532
            (i += 1) ≤ n || return c
×
UNCOV
533
            0xc0 ≤ b ≤ 0xf7 && break
×
UNCOV
534
            b = codeunit(s, i)
×
UNCOV
535
        end
×
536
        l = b
×
UNCOV
537
        b = codeunit(s, i) # cont byte 1
×
UNCOV
538
        c -= (x = b & 0xc0 == 0x80)
×
UNCOV
539
        x & (l ≥ 0xe0) || continue
×
540

UNCOV
541
        (i += 1) ≤ n || return c
×
UNCOV
542
        b = codeunit(s, i) # cont byte 2
×
UNCOV
543
        c -= (x = b & 0xc0 == 0x80)
×
UNCOV
544
        x & (l ≥ 0xf0) || continue
×
545

UNCOV
546
        (i += 1) ≤ n || return c
×
UNCOV
547
        b = codeunit(s, i) # cont byte 3
×
UNCOV
548
        c -= (b & 0xc0 == 0x80)
×
UNCOV
549
    end
×
550
end
551

552
## overload methods for efficiency ##
553

554
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
9,090✔
555

556
isascii(s::String) = isascii(codeunits(s))
13✔
557

558
# don't assume effects for general integers since we cannot know their implementation
559
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
456✔
560

561
"""
562
    repeat(c::AbstractChar, r::Integer) -> String
563

564
Repeat a character `r` times. This can equivalently be accomplished by calling
565
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
566

567
# Examples
568
```jldoctest
569
julia> repeat('A', 3)
570
"AAA"
571
```
572
"""
UNCOV
573
function repeat(c::AbstractChar, r::Integer)
×
UNCOV
574
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
×
UNCOV
575
    r = UInt(r)::UInt
×
UNCOV
576
    c = Char(c)::Char
×
UNCOV
577
    r == 0 && return ""
×
UNCOV
578
    u = bswap(reinterpret(UInt32, c))
×
UNCOV
579
    n = 4 - (leading_zeros(u | 0xff) >> 3)
×
UNCOV
580
    s = _string_n(n*r)
×
UNCOV
581
    p = pointer(s)
×
UNCOV
582
    GC.@preserve s if n == 1
×
UNCOV
583
        memset(p, u % UInt8, r)
×
UNCOV
584
    elseif n == 2
×
UNCOV
585
        p16 = reinterpret(Ptr{UInt16}, p)
×
UNCOV
586
        for i = 1:r
×
UNCOV
587
            unsafe_store!(p16, u % UInt16, i)
×
UNCOV
588
        end
×
UNCOV
589
    elseif n == 3
×
UNCOV
590
        b1 = (u >> 0) % UInt8
×
UNCOV
591
        b2 = (u >> 8) % UInt8
×
UNCOV
592
        b3 = (u >> 16) % UInt8
×
UNCOV
593
        for i = 0:r-1
×
UNCOV
594
            unsafe_store!(p, b1, 3i + 1)
×
UNCOV
595
            unsafe_store!(p, b2, 3i + 2)
×
UNCOV
596
            unsafe_store!(p, b3, 3i + 3)
×
UNCOV
597
        end
×
UNCOV
598
    elseif n == 4
×
UNCOV
599
        p32 = reinterpret(Ptr{UInt32}, p)
×
UNCOV
600
        for i = 1:r
×
UNCOV
601
            unsafe_store!(p32, u, i)
×
UNCOV
602
        end
×
603
    end
UNCOV
604
    return s
×
605
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc