• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37639

03 Oct 2023 12:30AM UTC coverage: 86.356% (-1.1%) from 87.461%
#37639

push

local

web-flow
Bump libunwind. (#51545)

Fixes https://github.com/JuliaLang/julia/issues/51465, caused by
https://github.com/libunwind/libunwind/pull/203.
Ref https://github.com/JuliaPackaging/Yggdrasil/pull/7466

72846 of 84355 relevant lines covered (86.36%)

11563052.54 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.4
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
    # the string that was being indexed
    string::AbstractString
    # the index into `string` that is not valid
    index::Integer
end
12
# Out-of-line thrower for `StringIndexError`; the index is converted to `Int`
# before being stored in the exception.
@noinline function string_index_err(s::AbstractString, i::Integer)
    throw(StringIndexError(s, Int(i)))
end
14
# Print a `StringIndexError`. When the bad index lies within the code units of
# the string, also print the nearest valid index (or indices) and the escaped
# character found there.
function Base.showerror(io::IO, exc::StringIndexError)
    s = exc.string
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
    # No hint can be given for an index outside the code units.
    firstindex(s) <= exc.index <= ncodeunits(s) || return nothing
    iprev = thisind(s, exc.index)
    inext = nextind(s, iprev)
    escprev = escape_string(s[iprev:iprev])
    if inext > ncodeunits(s)
        # `iprev` starts the last character, so there is no following index
        print(io, ", valid nearby index [$iprev]=>'$escprev'")
    else
        escnext = escape_string(s[inext:inext])
        print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
    end
    return nothing
end
29

30
# Union of the byte containers named below: a `String`'s code units, `Vector{UInt8}`,
# `Vector{Int8}`, and fast contiguous sub-arrays of each of those.
const ByteArray = Union{CodeUnits{UInt8,String}, Vector{UInt8},Vector{Int8}, FastContiguousSubArray{UInt8,1,CodeUnits{UInt8,String}}, FastContiguousSubArray{UInt8,1,Vector{UInt8}}, FastContiguousSubArray{Int8,1,Vector{Int8}}}
31

32
# Inclusive range test `lo ≤ b ≤ hi`. Uses the non-short-circuiting `&`, so
# both comparisons are always evaluated.
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    return (lo ≤ b) & (b ≤ hi)
end
191,898,037✔
33

34
"""
35
    String <: AbstractString
36

37
The default string type in Julia, used by e.g. string literals.
38

39
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
40
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
41
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
42
that the underlying byte sequence is valid as UTF-8.
43
"""
44
String
45

46
## constructors and conversions ##
47

48
# String constructor docstring from boot.jl, workaround for #16730
49
# and the unavailability of @doc in boot.jl context.
50
"""
51
    String(v::AbstractVector{UInt8})
52

53
Create a new `String` object using the data buffer from byte vector `v`.
54
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
55
modification of `v` cannot affect the contents of the resulting string.
56
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
57
`AbstractVector` types, `String(v)` already makes a copy.
58

59
When possible, the memory of `v` will be used without copying when the `String`
60
object is created. This is guaranteed to be the case for byte vectors returned
61
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
62
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
63
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
64
to guarantee consistent behavior.
65
"""
66
# Generic fallback: copy `v` into a fresh `StringVector` and convert that buffer.
String(v::AbstractVector{UInt8}) = String(copyto!(StringVector(length(v)), v))
2,612,856✔
67
# `Vector{UInt8}` method: hands the array itself to the runtime's `jl_array_to_string`.
String(v::Vector{UInt8}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v)
33,481,810✔
68

69
"""
70
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
71

72
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
73
(The pointer can be safely freed afterwards.) If `length` is specified
74
(the length of the data in bytes), the string does not have to be NUL-terminated.
75

76
This function is labeled "unsafe" because it will crash if `p` is not
77
a valid memory address to data of the requested length.
78
"""
79
# Length-delimited variant: passes `p` and `len` to the runtime's
# `jl_pchar_to_string`. Throws `ArgumentError` for a NULL pointer.
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
83
# NUL-terminated variant: passes `p` to the runtime's `jl_cstr_to_string`.
# Throws `ArgumentError` for a NULL pointer.
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
87

88
# This is @assume_effects :effect_free :nothrow :terminates_globally @ccall jl_alloc_string(n::Csize_t)::Ref{String},
89
# but the macro is not available at this time in bootstrap, so we write it manually.
90
# Hand-built foreigncall to `jl_alloc_string`, passing `n` converted to `Csize_t`
# and returning the resulting `String`.
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String}, Expr(:call, Expr(:core, :svec), :Csize_t), 1, QuoteNode((:ccall,0xe)), :(convert(Csize_t, n))))
153,079,323✔
91

92
"""
93
    String(s::AbstractString)
94

95
Create a new `String` from an existing `AbstractString`.
96
"""
97
# Generic conversion: delegates to `print_to_string`.
String(s::AbstractString) = print_to_string(s)
406✔
98
# Symbol conversion: reads the symbol's name through its `Ptr{UInt8}` with `unsafe_string`.
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
4,845,295✔
99

100
# Obtain a `Vector{UInt8}` for `s` from the runtime's `jl_string_to_array`.
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
32,469,291✔
101
# Contiguous sub-array method: wraps the memory at `pointer(s)` with the sub-array's size.
unsafe_wrap(::Type{Vector{UInt8}}, s::FastContiguousSubArray{UInt8,1,Vector{UInt8}}) = unsafe_wrap(Vector{UInt8}, pointer(s), size(s))
×
102

103
# Copy the code units of a `String` into a newly allocated `Vector{UInt8}`.
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
20,931✔
104
# Forwards to the `CodeUnits` method via `codeunits(s)`.
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
20,872✔
105
# `Array{UInt8}` spelling of the same conversion; the result is a `Vector{UInt8}`.
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
106

107
# Unwrap: returns the `String` stored in the `CodeUnits` wrapper's `s` field.
String(s::CodeUnits{UInt8,String}) = s.s
3✔
108

109
## low-level functions ##
110

111
# Raw `Ptr{UInt8}` for `s`, obtained with `unsafe_convert`.
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
622,203,994✔
112
# Pointer to code unit `i`: `pointer(s)` advanced by `i - 1`. No bounds check is performed here.
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
341,134,490✔
113

114
# Number of code units of a `String`, taken from `Core.sizeof`.
ncodeunits(s::String) = Core.sizeof(s)
642,900,159✔
115
# Code unit type of `String`.
codeunit(s::String) = UInt8
×
116

117
# Generic-integer index: convert to `Int` and forward to the `Int` method.
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
×
118
# Load code unit `i` of `s`. Bounds are checked unless the caller elides the
# check with `@inbounds`.
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
    @boundscheck checkbounds(s, i)
    # `s` is preserved while the raw pointer load is in flight
    unit = GC.@preserve s unsafe_load(pointer(s, i))
    return unit
end
123

124
## comparison ##
125

126
# `String`/`String` specialization annotated `:total`; forwards to the generic
# two-argument method through `@invoke`.
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
4,957,619✔
127

128
# Compare only the common prefix: the length is the smaller of the two `sizeof`s.
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
5,535,188✔
129
# Compare the first `len` bytes behind `a` and `b` with `memcmp`, returning its
# result as an `Int`. Both arguments are preserved while their pointers are in use.
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
    order = GC.@preserve a b begin
        lhs = unsafe_convert(Ptr{UInt8}, a)
        rhs = unsafe_convert(Ptr{UInt8}, b)
        memcmp(lhs, rhs, len % Csize_t)
    end
    return order % Int
end
136

137
# Three-way comparison of two `String`s: byte-wise over the common prefix via
# `_memcmp`, with ties broken by comparing the sizes.
function cmp(a::String, b::String)
    c = _memcmp(a, b)
    c < 0 && return -1
    c > 0 && return +1
    return cmp(sizeof(a), sizeof(b))
end
142

143
# `String` equality is defined as `===`.
==(a::String, b::String) = a===b
11,778,742✔
144

145
# The empty string is the minimum `String`.
typemin(::Type{String}) = ""
2✔
146
# Instance form: forwards to the type method.
typemin(::String) = typemin(String)
1✔
147

148
## thisind, nextind ##
149

150
# `String` method; the implementation lives in `_thisind_str`.
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
192,312,495✔
151

152
# s should be String or SubString{String}
153
# Return the index of the first code unit of the character that code unit `i`
# belongs to. `0` and `ncodeunits(s) + 1` are returned unchanged; any other
# out-of-range `i` throws `BoundsError` (unless bounds checks are elided).
@inline function _thisind_str(s, i::Int)
    i == 0 && return 0
    n = ncodeunits(s)
    i == n + 1 && return i
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    # If byte `i` is not a continuation byte, or has nothing before it, it starts a character.
    @inbounds cu = codeunit(s, i)
    (cu & 0xc0 == 0x80) & (i-1 > 0) || return i
    # One byte back: any multi-byte lead (0xc0-0xf7) covers byte `i`.
    @inbounds cu = codeunit(s, i-1)
    between(cu, 0xc0, 0xf7) && return i-1
    (cu & 0xc0 == 0x80) & (i-2 > 0) || return i
    # Two bytes back: only leads 0xe0-0xf7 are accepted.
    @inbounds cu = codeunit(s, i-2)
    between(cu, 0xe0, 0xf7) && return i-2
    (cu & 0xc0 == 0x80) & (i-3 > 0) || return i
    # Three bytes back: only leads 0xf0-0xf7 are accepted.
    @inbounds cu = codeunit(s, i-3)
    between(cu, 0xf0, 0xf7) && return i-3
    return i
end
170

171
# `String` method; the implementation lives in `_nextind_str`.
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
15,133,651✔
172

173
# s should be String or SubString{String}
174
# Return the index following the character that code unit `i` belongs to.
# `i == 0` yields 1; any other `i` outside `1:ncodeunits(s)` throws
# `BoundsError` (unless bounds checks are elided).
@inline function _nextind_str(s, i::Int)
    i == 0 && return 1
    n = ncodeunits(s)
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds lead = codeunit(s, i)
    # ASCII bytes and bytes ≥ 0xf8 are always one code unit wide
    (lead < 0x80) | (0xf8 ≤ lead) && return i+1
    if lead < 0xc0
        # `i` is a continuation byte: step from the start of its character,
        # if it has one; otherwise it stands alone
        start = @inbounds thisind(s, i)
        return start < i ? @inbounds(nextind(s, start)) : i+1
    end
    # first continuation byte
    (i += 1) > n && return i
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 ≠ 0x80 && return i
    ((i += 1) > n) | (lead < 0xe0) && return i
    # second continuation byte
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 ≠ 0x80 && return i
    ((i += 1) > n) | (lead < 0xf0) && return i
    # third continuation byte
    @inbounds cont = codeunit(s, i)
    return ifelse(cont & 0xc0 ≠ 0x80, i, i+1)
end
197

198
## checking UTF-8 & ASCII validity ##
199
#=
200
    The UTF-8 Validation is performed by a shift based DFA.
201
    ┌───────────────────────────────────────────────────────────────────┐
202
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
203
    │                               ├────────3────────┐           │     │
204
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
205
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
206
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
207
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
208
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
209
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
210
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
211
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
212
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
213
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
214
    │                      │        │     ├─┤               │        │  │
215
    │                      │        └─4──►│6├─────1,9───────┘        │  │
216
    │          INVALID     │              └─┘                        │  │
217
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
218
    │          ┌▼───┴┐                                                  │
219
    │          │  2  ◄─── All undefined transitions result in state 2   │
220
    │          └─────┘                                                  │
221
    └───────────────────────────────────────────────────────────────────┘
222

223
        Validation States
224
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
225
                        If the DFA ends in this state the string is ASCII only
226
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
227
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
228
                    as seen by all 2s in that column of table below
229
            3 -> One valid continuation byte needed to return to state 1
230
        4,5,6 -> Two valid continuation bytes needed to return to state 1
231
        7,8,9 -> Three valid continuation bytes needed to return to state 1
232

233
                        Current State
234
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
235
                0 | 0  1  2  2  2  2  2  2  2  2
236
                1 | 2  2  2  1  3  2  3  2  4  4
237
                2 | 3  3  2  2  2  2  2  2  2  2
238
                3 | 4  4  2  2  2  2  2  2  2  2
239
                4 | 6  6  2  2  2  2  2  2  2  2
240
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
241
    Class       6 | 8  8  2  2  2  2  2  2  2  2
242
                7 | 2  2  2  1  3  3  2  4  4  2
243
                8 | 2  2  2  2  2  2  2  2  2  2
244
                9 | 2  2  2  1  3  2  3  4  4  2
245
               10 | 5  5  2  2  2  2  2  2  2  2
246
               11 | 7  7  2  2  2  2  2  2  2  2
247

248
           Shifts | 0  4 10 14 18 24  8 20 12 26
249

250
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
251
    the rows the correct shift was a result.
252

253
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
254
    the current state then masking the result with 0b11110 (0x1E) gives the shift for the new state
255

256

257
=#
258

259
#State type used by UTF-8 DFA
260
# Each table entry packs the transitions for all states of one byte into a single value of this type.
const _UTF8DFAState = UInt32
261
# Fill the table with 256 `_UTF8DFAState` (UInt32) values representing the DFA transitions for all bytes
262
# Shift-encoded UTF-8 validation table with one `_UTF8DFAState` entry per byte value.
# Each entry is the packed transition row of that byte's character class, so the
# next state for byte `b` in state `st` is read by shifting the entry for `b`
# right by `st` and masking.
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
    num_classes=12
    num_states=10

    # These shifts were derived using a SMT solver
    # state_shifts[k+1] is the bit offset that represents state k
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]

    # Character class of every byte value, 16 bytes per row (row r covers
    # bytes 16(r-1) through 16(r-1)+15)
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]

    # These are the rows discussed in comments above:
    # state_arrays[class+1, state+1] is the next state
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  2  4  4;
                     3  3  2  2  2  2  2  2  2  2;
                     4  4  2  2  2  2  2  2  2  2;
                     6  6  2  2  2  2  2  2  2  2;
                     9  9  2  2  2  2  2  2  2  2;
                     8  8  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  3  2  4  4  2;
                     2  2  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  4  4  2;
                     5  5  2  2  2  2  2  2  2  2;
                     7  7  2  2  2  2  2  2  2  2]

    #This converts the state_arrays into the shift encoded _UTF8DFAState
    class_row = zeros(_UTF8DFAState, num_classes)

    for i = 1:num_classes
        row = _UTF8DFAState(0)
        for j in 1:num_states
            #Calculate the shift required for the next state
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
            #Shift the next state into the position of the current state
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
        end
        class_row[i]=row
    end

    # Expand from one row per class to one row per byte value
    map(c->class_row[c+1],character_classes)
end
317

318

319
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Shift of state 0: the start state, kept only while every byte seen so far is ASCII
320
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) # Shift of state 1: the valid, complete-character state once a non-ASCII character has been seen
321
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # Shift of state 2: if the state machine is ever in this state just stop, it never leaves it
322

323
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
324
# One transition: look up the table row for `byte`, shift it right by the current
# state, and keep the bits selected by the mask 0x1E as the new state.
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
54,209✔
325

326
# Run the DFA over `bytes[first:last]` starting from `state` and return the
# resulting state. The indices are not bounds-checked.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
    for idx in first:last
        @inbounds state = _utf_dfa_step(state, bytes[idx])
    end
    return state
end
332

333
# Scan `cu[first:last]` in windows of `chunk_size` elements and return the start
# index of the first window that `_isascii` rejects, or `nothing` if every
# window passes.
@inline function _find_nonascii_chunk(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    pos = first
    while pos <= last - chunk_size
        if !_isascii(cu, pos, pos + chunk_size - 1)
            return pos
        end
        pos += chunk_size
    end
    # The final window is anchored at `last`, so it may overlap the previous one.
    pos = last - chunk_size + 1
    return _isascii(cu, pos, last) ? nothing : pos
end
343

344
##
345

346
# Classifications of string
347
    # 0: neither valid ASCII nor UTF-8
348
    # 1: valid ASCII
349
    # 2: valid UTF-8
350
 # String method: classify the string's code units.
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
24✔
351

352

353
# Classify `bytes`: returns 1 when the ASCII check passes for the whole input,
# otherwise the result of `_byte_string_classify_nonascii` from the first
# position that needs DFA validation.
function byte_string_classify(bytes::AbstractVector{UInt8})
    chunk_size = 1024
    nbytes = length(bytes)
    start = 1
    if nbytes > chunk_size + (chunk_size ÷ 2)
        # Long input: locate the first chunk that is not pure ASCII.
        found = _find_nonascii_chunk(chunk_size, bytes, 1, nbytes)
        isnothing(found) && return 1
        start = found
    elseif _isascii(bytes, 1, nbytes)
        return 1
    end
    return _byte_string_classify_nonascii(bytes, start, nbytes)
end
366

367
# Validate `bytes[first:last]` in 256-byte chunks. While the DFA is in the
# accept state, chunks that pass `_isascii` are skipped without running the
# DFA. Returns 2 if the DFA ends in the accept state and 0 otherwise.
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
    chunk_size = 256

    start = first
    stop = min(last, first + chunk_size - 1)
    state = _UTF8_DFA_ACCEPT
    while start <= last
        # try to process ascii chunks
        while state == _UTF8_DFA_ACCEPT
            _isascii(bytes, start, stop) || break
            start += chunk_size
            start <= last || break
            stop = min(last, stop + chunk_size)
        end
        # Process non ascii chunk
        state = _isvalid_utf8_dfa(state, bytes, start, stop)
        if state == _UTF8_DFA_INVALID
            return 0
        end

        start += chunk_size
        stop = min(last, stop + chunk_size)
    end
    return state == _UTF8_DFA_ACCEPT ? 2 : 0
end
389

390
# Valid when the classification is non-zero (1 = ASCII, 2 = UTF-8).
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
20,406✔
391
# String form of the same check.
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
24✔
392

393
# One-argument form: validates the code units of `s` as a `String`.
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
147✔
394

395
# True when the top two bits of `c` are `10`, the UTF-8 continuation-byte pattern.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
×
396

397
## required core functionality ##
398

399
# Iteration protocol for `String`: returns `(char, next_index)`, or `nothing`
# when `i` is not within `1:ncodeunits(s)`.
@inline function iterate(s::String, i::Int=firstindex(s))
    # a single unsigned comparison rejects both i < 1 and i > ncodeunits(s)
    (i % UInt) - 1 < ncodeunits(s) || return nothing
    lead = @inbounds codeunit(s, i)
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        # bytes in 0x80:0xf7 are handled by the multi-byte path
        return iterate_continued(s, i, bits)
    end
    return reinterpret(Char, bits), i+1
end
406

407
# duck-type s so that external UTF-8 string packages like StringViews can hook in
408
# Multi-byte continuation of `iterate`. `u` holds the lead byte in its top 8
# bits. Up to three continuation bytes are merged in, stopping at the first
# byte that is not a continuation byte or at the end of the string. Returns the
# assembled `Char` and the index after the bytes consumed.
function iterate_continued(s, i::Int, u::UInt32)
    if u < 0xc0000000
        # the first byte is itself a continuation byte: consume just that byte
        i += 1
        @goto ret
    end
    n = ncodeunits(s)
    # first continuation byte
    (i += 1) > n && @goto ret
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 == 0x80 || @goto ret
    u |= UInt32(cont) << 16
    # second continuation byte
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 == 0x80 || @goto ret
    u |= UInt32(cont) << 8
    # third continuation byte
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
    @inbounds cont = codeunit(s, i)
    cont & 0xc0 == 0x80 || @goto ret
    u |= UInt32(cont)
    i += 1
@label ret
    return reinterpret(Char, u), i
end
429

430
# Character starting at code unit `i`. Bytes outside 0x80:0xf7 are returned
# directly as a one-byte `Char`; everything else goes to `getindex_continued`.
@propagate_inbounds function getindex(s::String, i::Int)
    lead = codeunit(s, i)
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        return getindex_continued(s, i, bits)
    end
    return reinterpret(Char, bits)
end
436

437
# duck-type s so that external UTF-8 string packages like StringViews can hook in
438
# Multi-byte continuation of `getindex`. `u` holds the first byte in its top 8
# bits. Throws `StringIndexError` when that byte is a continuation byte and `i`
# is not a valid index; otherwise merges up to three continuation bytes and
# returns the resulting `Char`.
function getindex_continued(s, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        @inbounds isvalid(s, i) && @goto ret
        string_index_err(s, i)
    end
    n = ncodeunits(s)

    (i += 1) > n && @goto ret
    @inbounds cont = codeunit(s, i) # cont byte 1
    cont & 0xc0 == 0x80 || @goto ret
    u |= UInt32(cont) << 16

    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
    @inbounds cont = codeunit(s, i) # cont byte 2
    cont & 0xc0 == 0x80 || @goto ret
    u |= UInt32(cont) << 8

    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
    @inbounds cont = codeunit(s, i) # cont byte 3
    cont & 0xc0 == 0x80 || @goto ret
    u |= UInt32(cont)
@label ret
    return reinterpret(Char, u)
end
463

464
# Generic unit ranges: convert the endpoints to `Int` and index with a `UnitRange{Int}`.
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
4✔
465

466
# Substring by character-index range. Both endpoints must be valid character
# indices (otherwise `StringIndexError`); the result is a new `String`
# containing every code unit of the characters from `first(r)` through `last(r)`.
@inline function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    lo, hi = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        @inbounds isvalid(s, lo) || string_index_err(s, lo)
        @inbounds isvalid(s, hi) || string_index_err(s, hi)
    end
    # byte count runs up to the last code unit of the character starting at `hi`
    nbytes = nextind(s, hi) - lo
    out = _string_n(nbytes)
    GC.@preserve s out unsafe_copyto!(pointer(out), pointer(s, lo), nbytes)
    return out
end
480

481
# nothrow because we know the start and end indices are valid
482
# Whole-string length: starts from the code-unit count and lets
# `length_continued` subtract the continuation bytes it finds.
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
811,685✔
483

484
# effects needed because @inbounds
485
# Number of characters of `s` with index from `i` through `j`. Requires
# `0 < i ≤ ncodeunits(s)+1` and `0 ≤ j < ncodeunits(s)+1`, otherwise a
# `BoundsError` is thrown.
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    j < i && return 0
    requested = i
    i = @inbounds thisind(s, requested)
    # Provisional count: one per code unit from `i` through `j`, less one when
    # `requested` fell inside a character, whose start is then not counted.
    c = j - i + (i == requested)
    return @inbounds length_continued(s, i, j, c)
end
495

496
# Core of `length`: starting from the provisional count `c`, walk code units
# `i` through `n` and subtract one for every continuation byte that follows a
# lead byte in the position that lead allows. Returns the adjusted count.
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
    i < n || return c
    b = codeunit(s, i)
    while true
        # advance until `b` holds a lead byte; `i` then points just past it
        while true
            (i += 1) ≤ n || return c
            0xc0 ≤ b ≤ 0xf7 && break
            b = codeunit(s, i)
        end
        lead = b
        b = codeunit(s, i) # cont byte 1
        iscont = b & 0xc0 == 0x80
        c -= iscont
        iscont & (lead ≥ 0xe0) || continue

        (i += 1) ≤ n || return c
        b = codeunit(s, i) # cont byte 2
        iscont = b & 0xc0 == 0x80
        c -= iscont
        iscont & (lead ≥ 0xf0) || continue

        (i += 1) ≤ n || return c
        b = codeunit(s, i) # cont byte 3
        c -= (b & 0xc0 == 0x80)
    end
end
520

521
## overload methods for efficiency ##
522

523
# `i` is a valid index when it is in bounds and is the start of its own character.
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
88,513,424✔
524

525
# Delegates to the code-unit method.
isascii(s::String) = isascii(codeunits(s))
673,868✔
526

527
# don't assume effects for general integers since we cannot know their implementation
528
# `Char`/`BitInteger` specialization annotated `:foldable`; forwards to the
# generic `Integer` method through `@invoke`.
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
70,915✔
529

530
"""
531
    repeat(c::AbstractChar, r::Integer) -> String
532

533
Repeat a character `r` times. This can equivalently be accomplished by calling
534
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
535

536
# Examples
537
```jldoctest
538
julia> repeat('A', 3)
539
"AAA"
540
```
541
"""
542
# Build a string of `r` copies of `c` by writing the character's code units
# directly into a freshly allocated `String`. Returns "" for `r == 0` and
# throws `ArgumentError` for negative `r`.
function repeat(c::AbstractChar, r::Integer)
    ch = Char(c)::Char
    r == 0 && return ""
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    # byte-swapped bits of the character: its first code unit is in the low byte
    u = bswap(reinterpret(UInt32, ch))
    # code units per copy: 4 minus the number of all-zero high bytes
    n = 4 - (leading_zeros(u | 0xff) >> 3)
    s = _string_n(n*r)
    p = pointer(s)
    GC.@preserve s begin
        if n == 1
            memset(p, u % UInt8, r)
        elseif n == 2
            # store both code units with one 16-bit write per copy
            p16 = reinterpret(Ptr{UInt16}, p)
            for k = 1:r
                unsafe_store!(p16, u % UInt16, k)
            end
        elseif n == 3
            # no 3-byte store exists, so write the code units one at a time
            first_unit = (u >> 0) % UInt8
            second_unit = (u >> 8) % UInt8
            third_unit = (u >> 16) % UInt8
            for k = 0:r-1
                unsafe_store!(p, first_unit, 3k + 1)
                unsafe_store!(p, second_unit, 3k + 2)
                unsafe_store!(p, third_unit, 3k + 3)
            end
        elseif n == 4
            # store all four code units with one 32-bit write per copy
            p32 = reinterpret(Ptr{UInt32}, p)
            for k = 1:r
                unsafe_store!(p32, u, k)
            end
        end
    end
    return s
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc