• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37666

04 Nov 2023 02:27AM UTC coverage: 87.924% (+0.09%) from 87.831%
#37666

push

local

web-flow
Simplify, 16bit PDP-11 isn't going to be supported (#45763)

PDP_ENDIAN isn't used.

Co-authored-by: Viral B. Shah <ViralBShah@users.noreply.github.com>

74550 of 84789 relevant lines covered (87.92%)

15319904.67 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.15
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
9
    string::AbstractString
57✔
10
    index::Integer
11
end
12
# Throw a `StringIndexError` for string `s` at index `i` (converted to `Int`).
# Marked `@noinline` so the throwing path stays out of line in callers.
@noinline function string_index_err(s::AbstractString, i::Integer)
    idx = Int(i)
    throw(StringIndexError(s, idx))
end
14
# Print a `StringIndexError`: always the offending index, and, when that index lies
# within the code units of the string, the valid index (or two indices) next to it
# together with the escaped character found there.
function Base.showerror(io::IO, exc::StringIndexError)
    str, idx = exc.string, exc.index
    print(io, "StringIndexError: ", "invalid index [$(idx)]")
    # No neighbours to report when the index is outside the code-unit range.
    firstindex(str) <= idx <= ncodeunits(str) || return nothing
    iprev = thisind(str, idx)
    inext = nextind(str, iprev)
    escprev = escape_string(str[iprev:iprev])
    if inext > ncodeunits(str)
        # `iprev` is the last character: there is no following index to show.
        print(io, ", valid nearby index [$iprev]=>'$escprev'")
    else
        escnext = escape_string(str[inext:inext])
        print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
    end
end
29

30
const ByteArray = Union{CodeUnits{UInt8,String}, Vector{UInt8},Vector{Int8}, FastContiguousSubArray{UInt8,1,CodeUnits{UInt8,String}}, FastContiguousSubArray{UInt8,1,Vector{UInt8}}, FastContiguousSubArray{Int8,1,Vector{Int8}}}
31

32
# Inclusive range test `lo ≤ b ≤ hi` for three integers of the same type `T`.
# Both comparisons are always evaluated and combined with non-short-circuiting `&`.
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    not_below = lo ≤ b
    not_above = b ≤ hi
    return not_below & not_above
end
241,174,567✔
33

34
"""
35
    String <: AbstractString
36

37
The default string type in Julia, used by e.g. string literals.
38

39
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
40
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
41
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
42
that the underlying byte sequence is valid as UTF-8.
43
"""
44
String
45

46
## constructors and conversions ##
47

48
# String constructor docstring from boot.jl, workaround for #16730
49
# and the unavailability of @doc in boot.jl context.
50
"""
51
    String(v::AbstractVector{UInt8})
52

53
Create a new `String` object using the data buffer from byte vector `v`.
54
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
55
modification of `v` cannot affect the contents of the resulting string.
56
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
57
`AbstractVector` types, `String(v)` already makes a copy.
58

59
When possible, the memory of `v` will be used without copying when the `String`
60
object is created. This is guaranteed to be the case for byte vectors returned
61
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
62
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
63
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
64
to guarantee consistent behavior.
65
"""
66
String(v::AbstractVector{UInt8}) = String(copyto!(StringVector(length(v)), v))
2,612,900✔
67
String(v::Vector{UInt8}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v)
32,463,404✔
68

69
"""
70
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
71

72
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
73
(The pointer can be safely freed afterwards.) If `length` is specified
74
(the length of the data in bytes), the string does not have to be NUL-terminated.
75

76
This function is labeled "unsafe" because it will crash if `p` is not
77
a valid memory address to data of the requested length.
78
"""
79
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    # Reject NULL here instead of passing it on to the runtime.
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    # Copy exactly `len` bytes starting at `p` into a fresh String.
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
83
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    # Reject NULL here instead of passing it on to the runtime.
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    # Copy the NUL-terminated data at `p` into a fresh String.
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
87

88
# This is @assume_effects :effect_free :nothrow :terminates_globally @ccall jl_alloc_string(n::Csize_t)::Ref{String},
89
# but the macro is not available at this time in bootstrap, so we write it manually.
90
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String}, Expr(:call, Expr(:core, :svec), :Csize_t), 1, QuoteNode((:ccall,0xe)), :(convert(Csize_t, n))))
157,652,255✔
91

92
"""
93
    String(s::AbstractString)
94

95
Create a new `String` from an existing `AbstractString`.
96
"""
97
String(s::AbstractString) = print_to_string(s)
410✔
98
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
4,900,208✔
99

100
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
30,255,558✔
101
unsafe_wrap(::Type{Vector{UInt8}}, s::FastContiguousSubArray{UInt8,1,Vector{UInt8}}) = unsafe_wrap(Vector{UInt8}, pointer(s), size(s))
×
102

103
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
62,583✔
104
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
62,429✔
105
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
106

107
String(s::CodeUnits{UInt8,String}) = s.s
3✔
108

109
## low-level functions ##
110

111
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
693,899,593✔
112
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
405,710,094✔
113

114
ncodeunits(s::String) = Core.sizeof(s)
743,555,364✔
115
codeunit(s::String) = UInt8
×
116

117
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
×
118
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
    @boundscheck checkbounds(s, i)
    # Keep `s` rooted while its `i`-th byte is loaded through a raw pointer.
    return GC.@preserve s unsafe_load(pointer(s, i))
end
123

124
## comparison ##
125

126
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
5,329,883✔
127

128
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
6,394,320✔
129
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
138✔
130
    GC.@preserve a b begin
10,460,550✔
131
        pa = unsafe_convert(Ptr{UInt8}, a)
9,818,638✔
132
        pb = unsafe_convert(Ptr{UInt8}, b)
10,460,550✔
133
        memcmp(pa, pb, len % Csize_t) % Int
10,460,550✔
134
    end
135
end
136

137
function cmp(a::String, b::String)
    # Order by the shared byte prefix first; only a tie there falls through to
    # comparing the two byte lengths.
    prefix_order = _memcmp(a, b)
    prefix_order < 0 && return -1
    prefix_order > 0 && return +1
    return cmp(sizeof(a), sizeof(b))
end
142

143
==(a::String, b::String) = a===b
12,358,643✔
144

145
typemin(::Type{String}) = ""
2✔
146
typemin(::String) = typemin(String)
1✔
147

148
## thisind, nextind ##
149

150
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
204,884,443✔
151

152
# s should be String or SubString{String}
153
@inline function _thisind_str(s, i::Int)
    i == 0 && return 0
    n = ncodeunits(s)
    i == n + 1 && return i
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds unit = codeunit(s, i)
    # Fast path: anything other than a continuation byte located after position 1
    # is reported as the start of its own character.
    is_cont = unit & 0xc0 == 0x80
    is_cont & (i-1 > 0) || return i
    # Slow path, kept out of line via `@noinline`: look back at most three code units
    # for a lead byte whose sequence is long enough to cover index `i`.
    (@noinline function _thisind_continued(s, i, n)
        local prev
        @inbounds prev = codeunit(s, i-1)
        between(prev, 0xc0, 0xf7) && return i-1  # lead of a 2-, 3- or 4-byte sequence
        (prev & 0xc0 == 0x80) & (i-2 > 0) || return i
        @inbounds prev = codeunit(s, i-2)
        between(prev, 0xe0, 0xf7) && return i-2  # lead of a 3- or 4-byte sequence
        (prev & 0xc0 == 0x80) & (i-3 > 0) || return i
        @inbounds prev = codeunit(s, i-3)
        between(prev, 0xf0, 0xf7) && return i-3  # lead of a 4-byte sequence
        return i
    end)(s, i, n)
end
173

174
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
35,439,012✔
175

176
# s should be String or SubString{String}
177
@inline function _nextind_str(s, i::Int)
    i == 0 && return 1
    n = ncodeunits(s)
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds lead = codeunit(s, i)
    # Fast path: bytes outside 0x80:0xf7 are always exactly one code unit wide.
    if !between(lead, 0x80, 0xf7)
        return i + 1
    end
    # Slow path, kept out of line via `@noinline`.
    (@noinline function _nextind_continued(s, i, n, lead)
        if lead < 0xc0
            # `i` points at a continuation byte: rewind to the start of the character
            # it belongs to (which may turn out to be `i` itself).
            start = @inbounds thisind(s, i)
            if start >= i
                return i + 1
            end
            i = start
            @inbounds lead = codeunit(s, i)
            if (lead < 0x80) | (0xf8 ≤ lead)
                return i + 1
            end
            @assert lead >= 0xc0
        end
        # first continuation byte
        i += 1
        i > n && return i
        @inbounds cont = codeunit(s, i)
        cont & 0xc0 ≠ 0x80 && return i
        i += 1
        (i > n) | (lead < 0xe0) && return i
        # second continuation byte
        @inbounds cont = codeunit(s, i)
        cont & 0xc0 ≠ 0x80 && return i
        i += 1
        (i > n) | (lead < 0xf0) && return i
        # third continuation byte
        @inbounds cont = codeunit(s, i)
        return cont & 0xc0 ≠ 0x80 ? i : i + 1
    end)(s, i, n, lead)
end
208

209
## checking UTF-8 & ASCII validity ##
210
#=
211
    The UTF-8 Validation is performed by a shift based DFA.
212
    ┌───────────────────────────────────────────────────────────────────┐
213
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
214
    │                               ├────────3────────┐           │     │
215
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
216
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
217
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
218
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
219
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
220
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
221
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
222
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
223
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
224
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
225
    │                      │        │     ├─┤               │        │  │
226
    │                      │        └─4──►│6├─────1,9───────┘        │  │
227
    │          INVALID     │              └─┘                        │  │
228
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
229
    │          ┌▼───┴┐                                                  │
230
    │          │  2  ◄─── All undefined transitions result in state 2   │
231
    │          └─────┘                                                  │
232
    └───────────────────────────────────────────────────────────────────┘
233

234
        Validation States
235
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
236
                        If the DFA ends in this state the string is ASCII only
237
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
238
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
239
                    as seen by all 2s in that column of table below
240
            3 -> One valid continuation byte needed to return to state 1
241
        4,5,6 -> Two valid continuation bytes needed to return to state 1
242
        7,8,9 -> Three valid continuation bytes needed to return to state 1
243

244
                        Current State
245
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
246
                0 | 0  1  2  2  2  2  2  2  2  2
247
                1 | 2  2  2  1  3  2  3  2  4  4
248
                2 | 3  3  2  2  2  2  2  2  2  2
249
                3 | 4  4  2  2  2  2  2  2  2  2
250
                4 | 6  6  2  2  2  2  2  2  2  2
251
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
252
    Class       6 | 8  8  2  2  2  2  2  2  2  2
253
                7 | 2  2  2  1  3  3  2  4  4  2
254
                8 | 2  2  2  2  2  2  2  2  2  2
255
                9 | 2  2  2  1  3  2  3  4  4  2
256
               10 | 5  5  2  2  2  2  2  2  2  2
257
               11 | 7  7  2  2  2  2  2  2  2  2
258

259
           Shifts | 0  4 10 14 18 24  8 20 12 26
260

261
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
262
    the rows the correct shift was a result.
263

264
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
265
    the current state then masking the result with 0b11110 (0x1E) gives the shift for the new state
266

267

268
=#
269

270
#State type used by UTF-8 DFA
271
const _UTF8DFAState = UInt32
272
# Fill the table with 256 `_UTF8DFAState` (UInt32) values representing the DFA transitions for all bytes
273
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
274
    num_classes=12
275
    num_states=10
276
    bit_per_state = 6
277

278
    # These shifts were derived using a SMT solver
279
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
280

281
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
282
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
283
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
284
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
285
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
286
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
287
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
288
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
289
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
290
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
291
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
292
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
293
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
294
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
295
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
296
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
297

298
    # These are the rows discussed in comments above
299
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
300
                     2  2  2  1  3  2  3  2  4  4;
301
                     3  3  2  2  2  2  2  2  2  2;
302
                     4  4  2  2  2  2  2  2  2  2;
303
                     6  6  2  2  2  2  2  2  2  2;
304
                     9  9  2  2  2  2  2  2  2  2;
305
                     8  8  2  2  2  2  2  2  2  2;
306
                     2  2  2  1  3  3  2  4  4  2;
307
                     2  2  2  2  2  2  2  2  2  2;
308
                     2  2  2  1  3  2  3  4  4  2;
309
                     5  5  2  2  2  2  2  2  2  2;
310
                     7  7  2  2  2  2  2  2  2  2]
311

312
    #This converts the state_arrays into the shift encoded _UTF8DFAState
313
    class_row = zeros(_UTF8DFAState, num_classes)
314

315
    for i = 1:num_classes
316
        row = _UTF8DFAState(0)
317
        for j in 1:num_states
318
            #Calculate the shift required for the next state
319
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
320
            #Shift the next state into the position of the current state
321
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
322
        end
323
        class_row[i]=row
324
    end
325

326
    map(c->class_row[c+1],character_classes)
×
327
end
328

329

330
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Shift-encoded state 0: the start state; the DFA stays here only while every byte seen is ASCII
331
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) # Shift-encoded state 1: on a character boundary of valid UTF-8 after at least one non-ASCII character
332
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop
333

334
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
335
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
54,209✔
336

337
# Advance the DFA from `state` over `bytes[first:last]` and return the resulting state.
# An empty range (`first > last`) returns `state` unchanged.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
    for idx in first:last
        @inbounds state = _utf_dfa_step(state, bytes[idx])
    end
    return state
end
343

344
# Return the start index of the first `chunk_size`-long chunk of `cu[first:last]` for
# which `_isascii` is false, or `nothing` when every chunk passes.
@inline function _find_nonascii_chunk(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    pos = first
    # Walk the chunks in order, stopping at the first one that is not all ASCII.
    while pos <= last - chunk_size
        if !_isascii(cu, pos, pos + chunk_size - 1)
            return pos
        end
        pos += chunk_size
    end
    # The remainder is checked as one final full-size chunk ending at `last`.
    tail = last - chunk_size + 1
    return _isascii(cu, tail, last) ? nothing : tail
end
354

355
##
356

357
# Classifications of string
358
    # 0: neither valid ASCII nor UTF-8
359
    # 1: valid ASCII
360
    # 2: valid UTF-8
361
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
24✔
362

363

364
# Classify a byte vector:
#   0: neither valid ASCII nor UTF-8
#   1: valid ASCII
#   2: valid UTF-8
# Indices are taken from `firstindex`/`lastindex` rather than assumed to be `1:length`,
# so vectors whose indices do not start at 1 are scanned over their actual range.
function byte_string_classify(bytes::AbstractVector{UInt8})
    chunk_size = 1024
    chunk_threshold = chunk_size + (chunk_size ÷ 2)
    lo, hi = firstindex(bytes), lastindex(bytes)
    if length(bytes) > chunk_threshold
        # Long input: skip the leading all-ASCII chunks in bulk.
        start = _find_nonascii_chunk(chunk_size, bytes, lo, hi)
        isnothing(start) && return 1
    else
        _isascii(bytes, lo, hi) && return 1
        start = lo
    end
    # Something non-ASCII was seen at or after `start`: validate the rest as UTF-8.
    return _byte_string_classify_nonascii(bytes, start, hi)
end
377

378
# Validate `bytes[first:last]` in 256-byte chunks. Returns 2 when the DFA finishes in
# `_UTF8_DFA_ACCEPT`, and 0 when it reaches `_UTF8_DFA_INVALID` or finishes in any
# other state.
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
    chunk_size = 256

    lo = first
    hi = min(last, first + chunk_size - 1)
    state = _UTF8_DFA_ACCEPT
    while lo <= last
        # While the DFA is in the accept state, skip over chunks that are pure ASCII.
        while state == _UTF8_DFA_ACCEPT
            _isascii(bytes, lo, hi) || break
            lo += chunk_size
            lo <= last || break
            hi = min(last, hi + chunk_size)
        end
        # Run the DFA across the current chunk (an empty range when `lo > hi`).
        state = _isvalid_utf8_dfa(state, bytes, lo, hi)
        if state == _UTF8_DFA_INVALID
            return 0
        end

        lo += chunk_size
        hi = min(last, hi + chunk_size)
    end
    return state == _UTF8_DFA_ACCEPT ? 2 : 0
end
400

401
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
20,406✔
402
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
24✔
403

404
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
147✔
405

406
is_valid_continuation(c) = c & 0xc0 == 0x80
307✔
407

408
## required core functionality ##
409

410
@inline function iterate(s::String, i::Int=firstindex(s))
    # One unsigned comparison rejects both `i < 1` and `i > ncodeunits(s)`.
    in_range = (i % UInt) - 1 < ncodeunits(s)
    in_range || return nothing
    lead = @inbounds codeunit(s, i)
    # A `Char` keeps its first byte in the top 8 bits of a `UInt32`.
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        # Lead byte of a multi-byte sequence or a continuation byte: finish out of line.
        return iterate_continued(s, i, bits)
    end
    return reinterpret(Char, bits), i+1
end
417

418
# duck-type s so that external UTF-8 string packages like StringViews can hook in
419
# Finish decoding the character whose first byte sits in the top 8 bits of `u` and is
# located at index `i` of `s`. Returns the character and the index following the bytes
# consumed; decoding stops early at the end of the data or at the first byte that is
# not a continuation byte.
function iterate_continued(s, i::Int, u::UInt32)
    # First byte below 0xc0: a one-byte result, nothing more to consume.
    u < 0xc0000000 && return reinterpret(Char, u), i + 1
    n = ncodeunits(s)
    # first continuation byte
    i += 1
    i > n && return reinterpret(Char, u), i
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(b) << 16
    # second continuation byte (only when the first byte is at least 0xe0)
    i += 1
    (i > n) | (u < 0xe0000000) && return reinterpret(Char, u), i
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(b) << 8
    # third continuation byte (only when the first byte is at least 0xf0)
    i += 1
    (i > n) | (u < 0xf0000000) && return reinterpret(Char, u), i
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || return reinterpret(Char, u), i
    u |= UInt32(b)
    return reinterpret(Char, u), i + 1
end
440

441
@propagate_inbounds function getindex(s::String, i::Int)
    lead = codeunit(s, i)
    # A `Char` keeps its first byte in the top 8 bits of a `UInt32`.
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        # Lead byte of a multi-byte sequence or a continuation byte: finish out of line.
        return getindex_continued(s, i, bits)
    end
    return reinterpret(Char, bits)
end
447

448
# duck-type s so that external UTF-8 string packages like StringViews can hook in
449
# Finish decoding the character whose first byte sits in the top 8 bits of `u` and is
# located at index `i` of `s`. Throws `StringIndexError` when that byte is below 0xc0
# and `i` is not a valid index; otherwise decoding stops early at the end of the data
# or at the first byte that is not a continuation byte.
function getindex_continued(s, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        valid = @inbounds isvalid(s, i)
        valid && return reinterpret(Char, u)
        string_index_err(s, i)
    end
    n = ncodeunits(s)

    i += 1
    i > n && return reinterpret(Char, u)
    @inbounds b = codeunit(s, i) # cont byte 1
    b & 0xc0 == 0x80 || return reinterpret(Char, u)
    u |= UInt32(b) << 16

    i += 1
    (i > n) | (u < 0xe0000000) && return reinterpret(Char, u)
    @inbounds b = codeunit(s, i) # cont byte 2
    b & 0xc0 == 0x80 || return reinterpret(Char, u)
    u |= UInt32(b) << 8

    i += 1
    (i > n) | (u < 0xf0000000) && return reinterpret(Char, u)
    @inbounds b = codeunit(s, i) # cont byte 3
    b & 0xc0 == 0x80 || return reinterpret(Char, u)
    u |= UInt32(b)
    return reinterpret(Char, u)
end
474

475
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
4✔
476

477
@inline function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    lo, hi = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        # Both endpoints must be valid character indices.
        @inbounds isvalid(s, lo) || string_index_err(s, lo)
        @inbounds isvalid(s, hi) || string_index_err(s, hi)
    end
    # Extend the end to the last code unit of the character starting at `hi`.
    stop = nextind(s, hi) - 1
    nbytes = stop - lo + 1
    out = _string_n(nbytes)
    GC.@preserve s out unsafe_copyto!(pointer(out), pointer(s, lo), nbytes)
    return out
end
491

492
# nothrow because we know the start and end indices are valid
493
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
1,111,807✔
494

495
# effects needed because @inbounds
496
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    j < i && return 0
    requested = i
    # Start counting from the beginning of the character that contains `i`.
    start = @inbounds thisind(s, requested)
    # Initial count is one per code unit; `length_continued` subtracts the continuation
    # bytes. When `i` was mid-character, that character is not counted.
    c = j - start + (start == requested)
    return @inbounds length_continued(s, start, j, c)
end
506

507
# Starting from a count `c`, walk the code units `i:n` of `s` and subtract one for every
# continuation byte that follows a multi-byte lead (up to 1, 2 or 3 depending on the
# lead). Returns the adjusted count.
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
    i < n || return c
    b = codeunit(s, i)
    while true
        # Scan forward until `b`, the code unit just before `i`, is a multi-byte lead.
        while true
            i += 1
            i ≤ n || return c
            0xc0 ≤ b ≤ 0xf7 && break
            b = codeunit(s, i)
        end
        lead = b
        b = codeunit(s, i) # cont byte 1
        iscont = b & 0xc0 == 0x80
        c -= iscont
        iscont & (lead ≥ 0xe0) || continue

        i += 1
        i ≤ n || return c
        b = codeunit(s, i) # cont byte 2
        iscont = b & 0xc0 == 0x80
        c -= iscont
        iscont & (lead ≥ 0xf0) || continue

        i += 1
        i ≤ n || return c
        b = codeunit(s, i) # cont byte 3
        c -= (b & 0xc0 == 0x80)
    end
end
531

532
## overload methods for efficiency ##
533

534
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
186,211,754✔
535

536
isascii(s::String) = isascii(codeunits(s))
863,976✔
537

538
# don't assume effects for general integers since we cannot know their implementation
539
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
72,399✔
540

541
"""
542
    repeat(c::AbstractChar, r::Integer) -> String
543

544
Repeat a character `r` times. This can equivalently be accomplished by calling
545
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
546

547
# Examples
548
```jldoctest
549
julia> repeat('A', 3)
550
"AAA"
551
```
552
"""
553
function repeat(c::AbstractChar, r::Integer)
    c = Char(c)::Char
    r == 0 && return ""
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    # After the byte swap the first byte of `c`'s encoding is the low byte of `u`.
    u = bswap(reinterpret(UInt32, c))
    # Number of bytes in the encoding of `c` (1 to 4).
    n = 4 - (leading_zeros(u | 0xff) >> 3)
    # The byte count must not wrap around: a wrapped `n*r` could allocate a string
    # that is too small (even empty) for the raw stores below, which would then write
    # out of bounds. `checked_mul` throws `OverflowError` instead.
    s = _string_n(Base.checked_mul(n, r))
    p = pointer(s)
    # The multi-byte stores below write `u` with the machine's native byte order and
    # rely on it being little-endian, so the low byte of `u` lands first in memory.
    GC.@preserve s if n == 1
        memset(p, u % UInt8, r)
    elseif n == 2
        p16 = reinterpret(Ptr{UInt16}, p)
        for i = 1:r
            unsafe_store!(p16, u % UInt16, i)
        end
    elseif n == 3
        # No 3-byte integer type: store the three bytes one at a time.
        b1 = (u >> 0) % UInt8
        b2 = (u >> 8) % UInt8
        b3 = (u >> 16) % UInt8
        for i = 0:r-1
            unsafe_store!(p, b1, 3i + 1)
            unsafe_store!(p, b2, 3i + 2)
            unsafe_store!(p, b3, 3i + 3)
        end
    elseif n == 4
        p32 = reinterpret(Ptr{UInt32}, p)
        for i = 1:r
            unsafe_store!(p32, u, i)
        end
    end
    return s
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc