• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37919

29 Sep 2024 09:41AM UTC coverage: 86.232% (-0.3%) from 86.484%
#37919

push

local

web-flow
fix rawbigints OOB issues (#55917)

Fixes issues introduced in #50691 and found in #55906:
* use `@inbounds` and `@boundscheck` macros in rawbigints, for catching
OOB with `--check-bounds=yes`
* fix OOB in `truncate`

12 of 13 new or added lines in 1 file covered. (92.31%)

1287 existing lines in 41 files now uncovered.

77245 of 89578 relevant lines covered (86.23%)

15686161.83 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.34
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
9
    string::AbstractString
58✔
10
    index::Integer
11
end
12
@noinline string_index_err(s::AbstractString, i::Integer) =
54✔
13
    throw(StringIndexError(s, Int(i)))
14
function Base.showerror(io::IO, exc::StringIndexError)
4✔
15
    s = exc.string
4✔
16
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
4✔
17
    if firstindex(s) <= exc.index <= ncodeunits(s)
4✔
18
        iprev = thisind(s, exc.index)
4✔
19
        inext = nextind(s, iprev)
4✔
20
        escprev = escape_string(s[iprev:iprev])
4✔
21
        if inext <= ncodeunits(s)
4✔
22
            escnext = escape_string(s[inext:inext])
3✔
23
            print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
3✔
24
        else
25
            print(io, ", valid nearby index [$iprev]=>'$escprev'")
1✔
26
        end
27
    end
28
end
29

30
# Inclusive range test `lo ≤ b ≤ hi`; uses non-short-circuiting `&`, so both comparisons are always evaluated.
@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (b ≥ lo) & (hi ≥ b)
349,895,356✔
31

32
"""
33
    String <: AbstractString
34

35
The default string type in Julia, used by e.g. string literals.
36

37
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
38
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
39
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
40
that the underlying byte sequence is valid as UTF-8.
41
"""
42
String
43

44
## constructors and conversions ##
45

46
# String constructor docstring from boot.jl, workaround for #16730
47
# and the unavailability of @doc in boot.jl context.
48
"""
49
    String(v::AbstractVector{UInt8})
50

51
Create a new `String` object using the data buffer from byte vector `v`.
52
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
53
modification of `v` cannot affect the contents of the resulting string.
54
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
55
`AbstractVector` types, `String(v)` already makes a copy.
56

57
When possible, the memory of `v` will be used without copying when the `String`
58
object is created. This is guaranteed to be the case for byte vectors returned
59
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
60
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
61
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
62
to guarantee consistent behavior.
63
"""
64
String(v::AbstractVector{UInt8}) = String(copyto!(StringMemory(length(v)), v))
5,451,166✔
65
function String(v::Memory{UInt8})
66
    len = length(v)
6,612,566✔
67
    len == 0 && return ""
6,612,566✔
68
    return ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), v, len)
6,612,515✔
69
end
70
function String(v::Vector{UInt8})
5,278✔
71
    #return ccall(:jl_array_to_string, Ref{String}, (Any,), v)
72
    len = length(v)
11,471,362✔
73
    len == 0 && return ""
11,471,362✔
74
    ref = v.ref
5,125,658✔
75
    if ref.ptr_or_offset == ref.mem.ptr
10,125,625✔
76
        str = ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), ref.mem, len)
10,125,624✔
77
    else
78
        str = ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), ref, len)
1✔
79
    end
80
    # optimized empty!(v); sizehint!(v, 0) calls
81
    setfield!(v, :size, (0,))
10,125,625✔
82
    setfield!(v, :ref, memoryref(Memory{UInt8}()))
5,125,658✔
83
    return str
10,125,625✔
84
end
85

86
"""
87
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
88

89
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
90
(The pointer can be safely freed afterwards.) If `length` is specified
91
(the length of the data in bytes), the string does not have to be NUL-terminated.
92

93
This function is labeled "unsafe" because it will crash if `p` is not
94
a valid memory address to data of the requested length.
95
"""
96
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
89✔
97
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
1,119,206✔
98
    ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
1,119,205✔
99
end
100
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
289✔
101
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
6,692,472✔
102
    ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
6,692,471✔
103
end
104

105
# This is `@assume_effects :total !:consistent @ccall jl_alloc_string(n::Csize_t)::Ref{String}`,
106
# but the macro is not available at this time in bootstrap, so we write it manually.
107
const _string_n_override = 0x04ee
108
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String},
189,183,944✔
109
    :(Core.svec(Csize_t)), 1, QuoteNode((:ccall, _string_n_override)), :(convert(Csize_t, n))))
110

111
"""
112
    String(s::AbstractString)
113

114
Create a new `String` from an existing `AbstractString`.
115
"""
116
String(s::AbstractString) = print_to_string(s)
464✔
117
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
4,891,318✔
118

119
unsafe_wrap(::Type{Memory{UInt8}}, s::String) = ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
21,349,304✔
120
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = wrap(Array, unsafe_wrap(Memory{UInt8}, s))
60,963✔
121

122
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
39,301✔
123
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
39,268✔
124
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
125

126
String(s::CodeUnits{UInt8,String}) = s.s
11✔
127

128
## low-level functions ##
129

130
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
970,722,871✔
131
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
575,187,132✔
132

133
ncodeunits(s::String) = Core.sizeof(s)
1,078,885,027✔
134
codeunit(s::String) = UInt8
×
135

136
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
1✔
137
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
818✔
138
    @boundscheck checkbounds(s, i)
545,088,369✔
139
    b = GC.@preserve s unsafe_load(pointer(s, i))
545,088,347✔
140
    return b
545,088,347✔
141
end
142

143
## comparison ##
144

145
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
5,575,460✔
146

147
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
6,638,183✔
148
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
149
    GC.@preserve a b begin
10,837,648✔
150
        pa = unsafe_convert(Ptr{UInt8}, a)
10,184,741✔
151
        pb = unsafe_convert(Ptr{UInt8}, b)
10,837,648✔
152
        memcmp(pa, pb, len % Csize_t) % Int
10,837,648✔
153
    end
154
end
155

156
function cmp(a::String, b::String)
1✔
157
    al, bl = sizeof(a), sizeof(b)
5,575,460✔
158
    c = _memcmp(a, b)
5,575,460✔
159
    return c < 0 ? -1 : c > 0 ? +1 : cmp(al,bl)
10,519,015✔
160
end
161

162
==(a::String, b::String) = a===b
12,845,552✔
163

164
typemin(::Type{String}) = ""
1✔
165
typemin(::String) = typemin(String)
1✔
166

167
## thisind, nextind ##
168

169
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
253,245,316✔
170

171
# s should be String or SubString{String}
172
@inline function _thisind_str(s, i::Int)
173
    i == 0 && return 0
130,558,740✔
174
    n = ncodeunits(s)
130,732,475✔
175
    i == n + 1 && return i
130,732,475✔
176
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
130,709,511✔
177
    @inbounds b = codeunit(s, i)
130,709,487✔
178
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
239,327,176✔
179
    (@noinline function _thisind_continued(s, i, n) # mark the rest of the function as a slow-path
43,829,810✔
180
        local b
×
181
        @inbounds b = codeunit(s, i-1)
21,914,906✔
182
        between(b, 0b11000000, 0b11110111) && return i-1
21,914,906✔
183
        (b & 0xc0 == 0x80) & (i-2 > 0) || return i
21,316,646✔
184
        @inbounds b = codeunit(s, i-2)
8,922,696✔
185
        between(b, 0b11100000, 0b11110111) && return i-2
8,922,696✔
186
        (b & 0xc0 == 0x80) & (i-3 > 0) || return i
9,657,186✔
187
        @inbounds b = codeunit(s, i-3)
1,842,110✔
188
        between(b, 0b11110000, 0b11110111) && return i-3
1,842,110✔
189
        return i
1,056,047✔
190
    end)(s, i, n)
191
end
192

193
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
49,886,404✔
194

195
# s should be String or SubString{String}
196
@inline function _nextind_str(s, i::Int)
197
    i == 0 && return 1
33,851,094✔
198
    n = ncodeunits(s)
33,431,772✔
199
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
33,431,784✔
200
    @inbounds l = codeunit(s, i)
33,431,760✔
201
    between(l, 0x80, 0xf7) || return i+1
65,953,571✔
202
    (@noinline function _nextind_continued(s, i, n, l) # mark the rest of the function as a slow-path
1,819,842✔
203
        if l < 0xc0
909,921✔
204
            # handle invalid codeunit index by scanning back to the start of this index
205
            # (which may be the same as this index)
206
            i′ = @inbounds thisind(s, i)
69,070✔
207
            i′ >= i && return i+1
34,535✔
208
            i = i′
×
209
            @inbounds l = codeunit(s, i)
17,496✔
210
            (l < 0x80) | (0xf8 ≤ l) && return i+1
17,496✔
211
            @assert l >= 0xc0 "invalid codeunit"
17,496✔
212
        end
213
        # first continuation byte
214
        (i += 1) > n && return i
892,882✔
215
        @inbounds b = codeunit(s, i)
892,124✔
216
        b & 0xc0 ≠ 0x80 && return i
892,124✔
217
        ((i += 1) > n) | (l < 0xe0) && return i
879,477✔
218
        # second continuation byte
219
        @inbounds b = codeunit(s, i)
863,764✔
220
        b & 0xc0 ≠ 0x80 && return i
863,764✔
221
        ((i += 1) > n) | (l < 0xf0) && return i
861,944✔
222
        # third continuation byte
223
        @inbounds b = codeunit(s, i)
279,738✔
224
        return ifelse(b & 0xc0 ≠ 0x80, i, i+1)
279,738✔
225
    end)(s, i, n, l)
226
end
227

228
## checking UTF-8 & ASCII validity ##
229
#=
230
    The UTF-8 Validation is performed by a shift based DFA.
231
    ┌───────────────────────────────────────────────────────────────────┐
232
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
233
    │                               ├────────3────────┐           │     │
234
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
235
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
236
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
237
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
238
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
239
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
240
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
241
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
242
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
243
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
244
    │                      │        │     ├─┤               │        │  │
245
    │                      │        └─4──►│6├─────1,9───────┘        │  │
246
    │          INVALID     │              └─┘                        │  │
247
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
248
    │          ┌▼───┴┐                                                  │
249
    │          │  2  ◄─── All undefined transitions result in state 2   │
250
    │          └─────┘                                                  │
251
    └───────────────────────────────────────────────────────────────────┘
252

253
        Validation States
254
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
255
                        If the DFA ends in this state the string is ASCII only
256
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
257
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
258
                    as seen by all 2s in that column of table below
259
            3 -> One valid continuation byte needed to return to state 1
260
        4,5,6 -> Two valid continuation bytes needed to return to state 1
261
        7,8,9 -> Three valid continuation bytes needed to return to state 1
262

263
                        Current State
264
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
265
                0 | 0  1  2  2  2  2  2  2  2  2
266
                1 | 2  2  2  1  3  2  3  2  4  4
267
                2 | 3  3  2  2  2  2  2  2  2  2
268
                3 | 4  4  2  2  2  2  2  2  2  2
269
                4 | 6  6  2  2  2  2  2  2  2  2
270
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
271
    Class       6 | 8  8  2  2  2  2  2  2  2  2
272
                7 | 2  2  2  1  3  3  2  4  4  2
273
                8 | 2  2  2  2  2  2  2  2  2  2
274
                9 | 2  2  2  1  3  2  3  4  4  2
275
               10 | 5  5  2  2  2  2  2  2  2  2
276
               11 | 7  7  2  2  2  2  2  2  2  2
277

278
           Shifts | 0  4 10 14 18 24  8 20 12 26
279

280
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
281
    the rows the correct shift was a result.
282

283
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
284
    the current state, then masking the result with 0b11110 (0x1E), gives the shift for the new state
285

286

287
=#
288

289
#State type used by UTF-8 DFA
290
const _UTF8DFAState = UInt32
291
# Fill the table with 256 _UTF8DFAState (UInt32) entries representing the DFA transitions for all bytes
292
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
293
    num_classes=12
294
    num_states=10
295
    bit_per_state = 6
296

297
    # These shifts were derived using a SMT solver
298
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
299

300
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
301
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
302
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
303
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
304
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
305
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
306
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
307
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
308
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
309
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
310
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
311
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
312
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
313
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
314
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
315
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
316

317
    # These are the rows discussed in comments above
318
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
319
                     2  2  2  1  3  2  3  2  4  4;
320
                     3  3  2  2  2  2  2  2  2  2;
321
                     4  4  2  2  2  2  2  2  2  2;
322
                     6  6  2  2  2  2  2  2  2  2;
323
                     9  9  2  2  2  2  2  2  2  2;
324
                     8  8  2  2  2  2  2  2  2  2;
325
                     2  2  2  1  3  3  2  4  4  2;
326
                     2  2  2  2  2  2  2  2  2  2;
327
                     2  2  2  1  3  2  3  4  4  2;
328
                     5  5  2  2  2  2  2  2  2  2;
329
                     7  7  2  2  2  2  2  2  2  2]
330

331
    #This converts the state_arrays into the shift encoded _UTF8DFAState
332
    class_row = zeros(_UTF8DFAState, num_classes)
333

334
    for i = 1:num_classes
335
        row = _UTF8DFAState(0)
336
        for j in 1:num_states
337
            #Calculate the shift required for the next state
338
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
339
            #Shift the next state into the position of the current state
340
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
341
        end
342
        class_row[i]=row
343
    end
344

345
    map(c->class_row[c+1],character_classes)
×
346
end
347

348

349
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Start state; the DFA stays here only while the input seen so far is ASCII-only
350
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) #This state represents the start and end of any valid string
351
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop
352

353
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
354
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
54,209✔
355

356
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
5,248✔
357
    for i = first:last
25,821✔
358
       @inbounds state = _utf_dfa_step(state, bytes[i])
54,209✔
359
    end
82,597✔
360
    return (state)
25,821✔
361
end
362

363
@inline function  _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU}
364
    n=first
10✔
365
    while n <= last - chunk_size
40✔
366
        _isascii(cu,n,n+chunk_size-1) || return n
30✔
367
        n += chunk_size
30✔
368
    end
30✔
369
    n= last-chunk_size+1
10✔
370
    _isascii(cu,n,last) || return n
10✔
371
    return nothing
10✔
372
end
373

374
##
375

376
# Classifications of string
377
    # 0: neither valid ASCII nor UTF-8
378
    # 1: valid ASCII
379
    # 2: valid UTF-8
380
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
24✔
381

382

383
function byte_string_classify(bytes::AbstractVector{UInt8})
24✔
384
    chunk_size = 1024
20,313✔
385
    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
20,313✔
386
    n = length(bytes)
20,430✔
387
    if n > chunk_threshold
20,430✔
388
        start = _find_nonascii_chunk(chunk_size,bytes,1,n)
10✔
389
        isnothing(start) && return 1
10✔
390
    else
391
        _isascii(bytes,1,n) && return 1
20,420✔
392
        start = 1
20,166✔
393
    end
394
    return _byte_string_classify_nonascii(bytes,start,n)
20,189✔
395
end
396

397
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
20,189✔
398
    chunk_size = 256
20,166✔
399

400
    start = first
20,166✔
401
    stop = min(last,first + chunk_size - 1)
20,189✔
402
    state = _UTF8_DFA_ACCEPT
20,166✔
403
    while start <= last
24,926✔
404
        # try to process ascii chunks
405
        while state == _UTF8_DFA_ACCEPT
20,189✔
406
            _isascii(bytes,start,stop) || break
20,189✔
407
            (start = start + chunk_size) <= last || break
×
408
            stop = min(last,stop + chunk_size)
×
409
        end
×
410
        # Process non ascii chunk
411
        state = _isvalid_utf8_dfa(state,bytes,start,stop)
48,577✔
412
        state == _UTF8_DFA_INVALID && return 0
20,189✔
413

414
        start = start + chunk_size
4,737✔
415
        stop = min(last,stop + chunk_size)
4,737✔
416
    end
4,737✔
417
    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
4,737✔
418
end
419

420
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
20,452✔
421
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
24✔
422

423
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
193✔
424

UNCOV
425
# True when the top two bits of `c` are `10`, i.e. `c` has the form of a UTF-8 continuation byte.
is_valid_continuation(c) = (c & 0xc0) == 0x80
×
426

427
## required core functionality ##
428

429
@inline function iterate(s::String, i::Int=firstindex(s))
6,862✔
430
    (i % UInt) - 1 < ncodeunits(s) || return nothing
119,992,748✔
431
    b = @inbounds codeunit(s, i)
101,196,839✔
432
    u = UInt32(b) << 24
101,196,839✔
433
    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
201,320,083✔
434
    return @noinline iterate_continued(s, i, u)
1,073,595✔
435
end
436

437
# duck-type s so that external UTF-8 string packages like StringViews can hook in
438
function iterate_continued(s, i::Int, u::UInt32)
1,073,594✔
439
    u < 0xc0000000 && (i += 1; @goto ret)
1,073,594✔
440
    n = ncodeunits(s)
1,066,082✔
441
    # first continuation byte
442
    (i += 1) > n && @goto ret
1,066,082✔
443
    @inbounds b = codeunit(s, i)
1,063,618✔
444
    b & 0xc0 == 0x80 || @goto ret
1,063,618✔
445
    u |= UInt32(b) << 16
1,058,520✔
446
    # second continuation byte
447
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
1,058,520✔
448
    @inbounds b = codeunit(s, i)
921,413✔
449
    b & 0xc0 == 0x80 || @goto ret
921,524✔
450
    u |= UInt32(b) << 8
921,302✔
451
    # third continuation byte
452
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
921,302✔
453
    @inbounds b = codeunit(s, i)
276,612✔
454
    b & 0xc0 == 0x80 || @goto ret
276,617✔
455
    u |= UInt32(b); i += 1
276,607✔
456
@label ret
457
    return reinterpret(Char, u), i
1,073,594✔
458
end
459

460
@propagate_inbounds function getindex(s::String, i::Int)
1,976✔
461
    b = codeunit(s, i)
18,445,336✔
462
    u = UInt32(b) << 24
18,445,336✔
463
    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
36,045,184✔
464
    return getindex_continued(s, i, u)
845,455✔
465
end
466

467
# duck-type s so that external UTF-8 string packages like StringViews can hook in
468
function getindex_continued(s, i::Int, u::UInt32)
845,453✔
469
    if u < 0xc0000000
845,455✔
470
        # called from `getindex` which checks bounds
471
        @inbounds isvalid(s, i) && @goto ret
56✔
472
        string_index_err(s, i)
1✔
473
    end
474
    n = ncodeunits(s)
845,427✔
475

476
    (i += 1) > n && @goto ret
845,427✔
477
    @inbounds b = codeunit(s, i) # cont byte 1
845,426✔
478
    b & 0xc0 == 0x80 || @goto ret
845,426✔
479
    u |= UInt32(b) << 16
845,415✔
480

481
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
845,415✔
482
    @inbounds b = codeunit(s, i) # cont byte 2
838,852✔
483
    b & 0xc0 == 0x80 || @goto ret
838,852✔
484
    u |= UInt32(b) << 8
838,852✔
485

486
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
838,852✔
487
    @inbounds b = codeunit(s, i) # cont byte 3
262,235✔
488
    b & 0xc0 == 0x80 || @goto ret
262,235✔
489
    u |= UInt32(b)
262,235✔
490
@label ret
491
    return reinterpret(Char, u)
845,454✔
492
end
493

494
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
4✔
495

496
@inline function getindex(s::String, r::UnitRange{Int})
51,164✔
497
    isempty(r) && return ""
998,113✔
498
    i, j = first(r), last(r)
56,752✔
499
    @boundscheck begin
975,629✔
500
        checkbounds(s, r)
975,635✔
501
        @inbounds isvalid(s, i) || string_index_err(s, i)
973,534✔
502
        @inbounds isvalid(s, j) || string_index_err(s, j)
975,624✔
503
    end
504
    j = nextind(s, j) - 1
1,951,240✔
505
    n = j - i + 1
975,622✔
506
    ss = _string_n(n)
975,622✔
507
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
975,622✔
508
    return ss
975,622✔
509
end
510

511
# nothrow because we know the start and end indices are valid
512
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
861,614✔
513

514
# effects needed because @inbounds
515
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
5✔
516
    @boundscheck begin
273,458✔
517
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
273,458✔
518
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
273,462✔
519
    end
520
    j < i && return 0
273,454✔
521
    @inbounds i, k = thisind(s, i), i
474,054✔
522
    c = j - i + (i == k)
237,027✔
523
    @inbounds length_continued(s, i, j, c)
237,027✔
524
end
525

526
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
527
    i < n || return c
1,447,400✔
528
    b = codeunit(s, i)
749,882✔
529
    while true
3,385,381✔
530
        while true
47,749,163✔
531
            (i += 1) ≤ n || return c
48,498,265✔
532
            0xc0 ≤ b ≤ 0xf7 && break
47,000,061✔
533
            b = codeunit(s, i)
44,363,782✔
534
        end
44,363,782✔
535
        l = b
×
536
        b = codeunit(s, i) # cont byte 1
2,636,279✔
537
        c -= (x = b & 0xc0 == 0x80)
2,636,279✔
538
        x & (l ≥ 0xe0) || continue
2,636,279✔
539

540
        (i += 1) ≤ n || return c
2,224,063✔
541
        b = codeunit(s, i) # cont byte 2
2,222,525✔
542
        c -= (x = b & 0xc0 == 0x80)
2,222,525✔
543
        x & (l ≥ 0xf0) || continue
3,821,331✔
544

545
        (i += 1) ≤ n || return c
623,730✔
546
        b = codeunit(s, i) # cont byte 3
623,708✔
547
        c -= (b & 0xc0 == 0x80)
623,708✔
548
    end
2,635,499✔
549
end
550

551
## overload methods for efficiency ##
552

553
# `i` is a valid index iff it is within bounds and is the index `thisind` reports for it.
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) ? thisind(s, i) == i : false
231,801,073✔
554

555
isascii(s::String) = isascii(codeunits(s))
667,015✔
556

557
# don't assume effects for general integers since we cannot know their implementation
558
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
323,453✔
559

560
"""
561
    repeat(c::AbstractChar, r::Integer) -> String
562

563
Repeat a character `r` times. This can equivalently be accomplished by calling
564
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
565

566
# Examples
567
```jldoctest
568
julia> repeat('A', 3)
569
"AAA"
570
```
571
"""
572
function repeat(c::AbstractChar, r::Integer)
323,305✔
573
    c = Char(c)::Char
323,530✔
574
    r == 0 && return ""
323,530✔
575
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
247,046✔
576
    u = bswap(reinterpret(UInt32, c))
247,042✔
577
    n = 4 - (leading_zeros(u | 0xff) >> 3)
247,042✔
578
    s = _string_n(n*r)
247,044✔
579
    p = pointer(s)
247,042✔
580
    GC.@preserve s if n == 1
247,042✔
581
        memset(p, u % UInt8, r)
246,921✔
582
    elseif n == 2
121✔
583
        p16 = reinterpret(Ptr{UInt16}, p)
8✔
584
        for i = 1:r
8✔
585
            unsafe_store!(p16, u % UInt16, i)
26✔
586
        end
31✔
587
    elseif n == 3
113✔
588
        b1 = (u >> 0) % UInt8
108✔
589
        b2 = (u >> 8) % UInt8
108✔
590
        b3 = (u >> 16) % UInt8
108✔
591
        for i = 0:r-1
108✔
592
            unsafe_store!(p, b1, 3i + 1)
580✔
593
            unsafe_store!(p, b2, 3i + 2)
580✔
594
            unsafe_store!(p, b3, 3i + 3)
580✔
595
        end
580✔
596
    elseif n == 4
5✔
597
        p32 = reinterpret(Ptr{UInt32}, p)
5✔
598
        for i = 1:r
5✔
599
            unsafe_store!(p32, u, i)
11✔
600
        end
247,053✔
601
    end
602
    return s
247,040✔
603
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc