• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37857

01 Aug 2024 06:46AM UTC coverage: 87.6% (+0.5%) from 87.135%
#37857

push

local

web-flow
document the return types of `fieldname` and `fieldnames` (#55259)

77694 of 88692 relevant lines covered (87.6%)

16046941.43 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.35
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
    # The string that was being indexed.
    string::AbstractString
    # The offending index into `string`.
    index::Integer
end
12
# Throw a `StringIndexError` for string `s` at index `i` (converted to `Int`).
# Marked `@noinline` so the throwing path is kept out of line from callers.
@noinline string_index_err(s::AbstractString, i::Integer) =
    throw(StringIndexError(s, Int(i)))
14
# Print a `StringIndexError` to `io`.
#
# The message always names the invalid index. When that index lies within
# `firstindex(s):ncodeunits(s)`, the nearest valid index at or before it
# (`thisind`) and, if it exists, the following valid index (`nextind`) are
# appended together with the escaped character found at each.
function Base.showerror(io::IO, exc::StringIndexError)
    s = exc.string
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
    if firstindex(s) <= exc.index <= ncodeunits(s)
        iprev = thisind(s, exc.index)
        inext = nextind(s, iprev)
        escprev = escape_string(s[iprev:iprev])
        if inext <= ncodeunits(s)
            escnext = escape_string(s[inext:inext])
            print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
        else
            # `iprev` starts the last character, so there is no later index to suggest
            print(io, ", valid nearby index [$iprev]=>'$escprev'")
        end
    end
end
29

30
# Closed-interval membership test: `true` exactly when `lo ≤ b ≤ hi`.
# Both comparisons are evaluated and combined with the non-short-circuiting `&`.
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    return (b ≥ lo) & (hi ≥ b)
end
330,828,310✔
31

32
"""
33
    String <: AbstractString
34

35
The default string type in Julia, used by e.g. string literals.
36

37
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
38
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
39
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
40
that the underlying byte sequence is valid as UTF-8.
41
"""
42
String
43

44
## constructors and conversions ##
45

46
# String constructor docstring from boot.jl, workaround for #16730
47
# and the unavailability of @doc in boot.jl context.
48
"""
49
    String(v::AbstractVector{UInt8})
50

51
Create a new `String` object using the data buffer from byte vector `v`.
52
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
53
modification of `v` cannot affect the contents of the resulting string.
54
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
55
`AbstractVector` types, `String(v)` already makes a copy.
56

57
When possible, the memory of `v` will be used without copying when the `String`
58
object is created. This is guaranteed to be the case for byte vectors returned
59
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
60
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
61
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
62
to guarantee consistent behavior.
63
"""
64
String(v::AbstractVector{UInt8}) = String(copyto!(StringMemory(length(v)), v)) # generic path: copy `v` into a fresh `StringMemory` buffer, then convert that buffer
5,451,166✔
65
# Build a `String` from a `Memory{UInt8}` buffer.
#
# An empty buffer yields the literal `""`; otherwise the buffer and its length
# are passed to the runtime function `jl_genericmemory_to_string`.
# NOTE(review): presumably the runtime may reuse `v`'s storage for the string —
# confirm before mutating `v` afterwards.
function String(v::Memory{UInt8})
    len = length(v)
    len == 0 && return ""
    return ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), v, len)
end
70
# Build a `String` from a `Vector{UInt8}` and leave `v` empty afterwards.
#
# An empty vector returns `""` immediately. Otherwise:
#   * if the vector's reference points at the start of its backing memory, the
#     whole memory object is handed to `jl_genericmemory_to_string`;
#   * if not, the `len` bytes starting at the reference are passed to
#     `jl_pchar_to_string`.
# In both non-empty cases `v` is then reset to size zero with a fresh empty
# `Memory{UInt8}` behind it.
function String(v::Vector{UInt8})
    len = length(v)
    len == 0 && return ""
    ref = v.ref
    if ref.ptr_or_offset == ref.mem.ptr
        str = ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), ref.mem, len)
    else
        str = ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), ref, len)
    end
    # optimized empty!(v); sizehint!(v, 0) calls
    setfield!(v, :size, (0,))
    setfield!(v, :ref, memoryref(Memory{UInt8}()))
    return str
end
85

86
"""
87
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
88

89
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
90
(The pointer can be safely freed afterwards.) If `length` is specified
91
(the length of the data in bytes), the string does not have to be NUL-terminated.
92

93
This function is labeled "unsafe" because it will crash if `p` is not
94
a valid memory address to data of the requested length.
95
"""
96
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    # A NULL pointer is rejected up front with an `ArgumentError`.
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
    # Explicit-length form: the runtime receives the pointer and `len`.
    ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
100
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    # A NULL pointer is rejected up front with an `ArgumentError`.
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
    # No length given: the runtime's C-string routine is called with the pointer only.
    ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
104

105
# This is @assume_effects :effect_free :nothrow :terminates_globally @ccall jl_alloc_string(n::Csize_t)::Ref{String},
106
# but the macro is not available at this time in bootstrap, so we write it manually.
107
# `_string_n(n)`: foreign call to the runtime's `jl_alloc_string` with `n` converted to
# `Csize_t`, returning the resulting `String`.
# NOTE(review): `0x000e` presumably encodes the effect bits named in the comment above — confirm.
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String}, Expr(:call, Expr(:core, :svec), :Csize_t), 1, QuoteNode((:ccall,0x000e)), :(convert(Csize_t, n))))
190,806,920✔
108

109
"""
110
    String(s::AbstractString)
111

112
Create a new `String` from an existing `AbstractString`.
113
"""
114
String(s::AbstractString) = print_to_string(s) # generic path: delegates to `print_to_string`
465✔
115
# `Symbol` → `String`: obtain a `Ptr{UInt8}` for the symbol with `unsafe_convert` and pass
# it to the single-argument `unsafe_string`. The method is annotated `:total`.
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
5,177,060✔
116

117
# Call the runtime's `jl_string_to_genericmemory` on `s`, yielding a `Memory{UInt8}`.
# NOTE(review): presumably the result aliases the string's bytes rather than copying — confirm before mutating.
unsafe_wrap(::Type{Memory{UInt8}}, s::String) = ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
22,068,307✔
118
# `Vector{UInt8}` variant: `wrap` the result of the `Memory{UInt8}` method in an `Array`.
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = wrap(Array, unsafe_wrap(Memory{UInt8}, s))
67,586✔
119

120
# Copy the code units of a `String` into a newly allocated `Vector{UInt8}` of the same length.
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
39,239✔
121
# Byte-vector conversion of a `String`: forwards to the `CodeUnits` method via `codeunits(s)`.
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
39,206✔
122
# `Array{UInt8}(s)` is defined by the same expression as `Vector{UInt8}(s)`.
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
123

124
# Unwrap a `CodeUnits` wrapper: return the `String` stored in its `s` field.
String(s::CodeUnits{UInt8,String}) = s.s
3✔
125

126
## low-level functions ##
127

128
# Raw `Ptr{UInt8}` for `s`, obtained with `unsafe_convert`.
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
861,556,750✔
129
# Pointer to code unit `i` (1-based): the base pointer advanced by `i - 1`. No bounds check is done here.
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
487,904,759✔
130

131
# The number of code units of a `String` is its `Core.sizeof`.
ncodeunits(s::String) = Core.sizeof(s)
969,498,972✔
132
# The code unit type of `String` is `UInt8`.
codeunit(s::String) = UInt8
×
133

134
# Generic-integer entry point: convert the index to `Int` and dispatch to the `Int` method.
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
×
135
# Return the `i`-th code unit (a `UInt8`) of `s`.
#
# The index is bounds-checked unless the check is elided by the caller; the
# byte is then read directly through `pointer(s, i)` while `s` is kept alive
# with `GC.@preserve`.
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
    @boundscheck checkbounds(s, i)
    b = GC.@preserve s unsafe_load(pointer(s, i))
    return b
end
140

141
## comparison ##
142

143
# `String`/`String` specialization: annotated `:total` and forwarded with `@invoke` to the
# generic two-argument method.
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
5,429,666✔
144

145
# Two-argument form: compare the first `min(sizeof(a), sizeof(b))` bytes.
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
6,484,380✔
146
# Compare the first `len` bytes behind `a` and `b` with `memcmp`.
#
# Both arguments are converted to `Ptr{UInt8}` and kept alive with
# `GC.@preserve` for the duration of the call. The `memcmp` result is returned
# converted to `Int` with `%`. No length validation happens here.
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
    GC.@preserve a b begin
        pa = unsafe_convert(Ptr{UInt8}, a)
        pb = unsafe_convert(Ptr{UInt8}, b)
        memcmp(pa, pb, len % Csize_t) % Int
    end
end
153

154
# Three-way comparison of two `String`s, returning -1, 0 or +1.
#
# The common prefix (the shorter of the two byte lengths) is compared with
# `_memcmp`; if that prefix is identical, the byte lengths decide the result.
function cmp(a::String, b::String)
    al, bl = sizeof(a), sizeof(b)
    c = _memcmp(a, b)
    return c < 0 ? -1 : c > 0 ? +1 : cmp(al,bl)
end
159

160
# Equality of two `String`s is defined as `===`.
==(a::String, b::String) = a===b
13,081,571✔
161

162
# `typemin(String)` is the empty string.
typemin(::Type{String}) = ""
1✔
163
# Instance form: forwards to `typemin(String)`.
typemin(::String) = typemin(String)
1✔
164

165
## thisind, nextind ##
166

167
# `thisind` for `String`: forwards to `_thisind_str`, propagating the caller's `@inbounds` context.
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
263,165,932✔
168

169
# s should be String or SubString{String}
170
# Return the index at which the character containing code unit `i` starts.
#
# `0` and `ncodeunits(s) + 1` are returned unchanged; any other index outside
# `1:ncodeunits(s)` throws a `BoundsError` (unless bounds checks are elided).
# If the byte at `i` is not a continuation byte (`10xxxxxx`), or `i == 1`, the
# answer is `i` itself. Otherwise the out-of-line slow path looks back at most
# three bytes for a lead byte whose encoded length is large enough to cover `i`;
# if none is found, `i` is returned.
@inline function _thisind_str(s, i::Int)
    i == 0 && return 0
    n = ncodeunits(s)
    i == n + 1 && return i
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds b = codeunit(s, i)
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
    (@noinline function _thisind_continued(s, i, n) # mark the rest of the function as a slow-path
        local b
        # one byte back: any lead byte of a 2-, 3- or 4-byte sequence covers `i`
        @inbounds b = codeunit(s, i-1)
        between(b, 0b11000000, 0b11110111) && return i-1
        (b & 0xc0 == 0x80) & (i-2 > 0) || return i
        # two bytes back: only 3- and 4-byte lead bytes reach `i`
        @inbounds b = codeunit(s, i-2)
        between(b, 0b11100000, 0b11110111) && return i-2
        (b & 0xc0 == 0x80) & (i-3 > 0) || return i
        # three bytes back: only a 4-byte lead byte reaches `i`
        @inbounds b = codeunit(s, i-3)
        between(b, 0b11110000, 0b11110111) && return i-3
        return i
    end)(s, i, n)
end
190

191
# `nextind` for `String`: forwards to `_nextind_str`, propagating the caller's `@inbounds` context.
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
55,791,467✔
192

193
# s should be String or SubString{String}
194
# Return the index of the start of the character following the one at `i`.
#
# `i == 0` yields `1`; any other index outside `1:ncodeunits(s)` throws a
# `BoundsError` (unless bounds checks are elided). When the byte at `i` is
# outside `0x80:0xf7` the next index is simply `i + 1`. Otherwise the
# out-of-line slow path walks over up to three continuation bytes, as permitted
# by the lead byte, stopping at the first byte that is not a continuation byte
# or at the end of the string.
@inline function _nextind_str(s, i::Int)
    i == 0 && return 1
    n = ncodeunits(s)
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds l = codeunit(s, i)
    between(l, 0x80, 0xf7) || return i+1
    (@noinline function _nextind_continued(s, i, n, l) # mark the rest of the function as a slow-path
        if l < 0xc0
            # handle invalid codeunit index by scanning back to the start of this index
            # (which may be the same as this index)
            i′ = @inbounds thisind(s, i)
            i′ >= i && return i+1
            i = i′
            @inbounds l = codeunit(s, i)
            (l < 0x80) | (0xf8 ≤ l) && return i+1
            @assert l >= 0xc0
        end
        # first continuation byte
        (i += 1) > n && return i
        @inbounds b = codeunit(s, i)
        b & 0xc0 ≠ 0x80 && return i
        ((i += 1) > n) | (l < 0xe0) && return i
        # second continuation byte
        @inbounds b = codeunit(s, i)
        b & 0xc0 ≠ 0x80 && return i
        ((i += 1) > n) | (l < 0xf0) && return i
        # third continuation byte
        @inbounds b = codeunit(s, i)
        return ifelse(b & 0xc0 ≠ 0x80, i, i+1)
    end)(s, i, n, l)
end
225

226
## checking UTF-8 & ASCII validity ##
227
#=
228
    The UTF-8 Validation is performed by a shift based DFA.
229
    ┌───────────────────────────────────────────────────────────────────┐
230
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
231
    │                               ├────────3────────┐           │     │
232
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
233
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
234
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
235
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
236
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
237
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
238
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
239
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
240
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
241
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
242
    │                      │        │     ├─┤               │        │  │
243
    │                      │        └─4──►│6├─────1,9───────┘        │  │
244
    │          INVALID     │              └─┘                        │  │
245
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
246
    │          ┌▼───┴┐                                                  │
247
    │          │  2  ◄─── All undefined transitions result in state 2   │
248
    │          └─────┘                                                  │
249
    └───────────────────────────────────────────────────────────────────┘
250

251
        Validation States
252
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
253
                        If the DFA ends in this state the string is ASCII only
254
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
255
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
256
                    as seen by all 1s in that column of table below
257
            3 -> One valid continuation byte needed to return to state 1
258
        4,5,6 -> Two valid continuation bytes needed to return to state 1
259
        7,8,9 -> Three valid continuation bytes needed to return to state 1
260

261
                        Current State
262
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
263
                0 | 0  1  2  2  2  2  2  2  2  2
264
                1 | 2  2  2  1  3  2  3  2  4  4
265
                2 | 3  3  2  2  2  2  2  2  2  2
266
                3 | 4  4  2  2  2  2  2  2  2  2
267
                4 | 6  6  2  2  2  2  2  2  2  2
268
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
269
    Class       6 | 8  8  2  2  2  2  2  2  2  2
270
                7 | 2  2  2  1  3  3  2  4  4  2
271
                8 | 2  2  2  2  2  2  2  2  2  2
272
                9 | 2  2  2  1  3  2  3  4  4  2
273
               10 | 5  5  2  2  2  2  2  2  2  2
274
               11 | 7  7  2  2  2  2  2  2  2  2
275

276
           Shifts | 0  4 10 14 18 24  8 20 12 26
277

278
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
279
    the rows the correct shift was a result.
280

281
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
282
    the current state then masking the result with 0b11110 gives the shift for the new state
283

284

285
=#
286

287
#State type used by UTF-8 DFA
288
# A state is stored as its bit-shift amount; each table row packs one next-state shift per
# current state, and the largest shift used in this file is 26 with a 0x1E mask applied after shifting.
const _UTF8DFAState = UInt32
289
# Fill the table with 256 `_UTF8DFAState` (UInt32) entries representing the DFA transitions for all bytes
290
# Lookup table with one packed transition row per possible byte value (256 entries).
#
# Construction:
#   1. `character_classes` maps each byte 0x00..0xff to one of 12 character classes.
#   2. `state_arrays[class+1, state+1]` gives the next state number for that class/state.
#   3. Each state number is represented by a shift amount (`state_shifts`); a class row is
#      built by OR-ing, for every current state, the next state's shift amount moved left
#      by the current state's shift amount.
#   4. The final table stores, for each byte, the packed row of its class.
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
    num_classes=12
    num_states=10

    # These shifts were derived using a SMT solver
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]

    # Character class of every byte value, 16 bytes per row (row k covers 0x(k-1)0..0x(k-1)f)
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]

    # These are the rows discussed in comments above
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  2  4  4;
                     3  3  2  2  2  2  2  2  2  2;
                     4  4  2  2  2  2  2  2  2  2;
                     6  6  2  2  2  2  2  2  2  2;
                     9  9  2  2  2  2  2  2  2  2;
                     8  8  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  3  2  4  4  2;
                     2  2  2  2  2  2  2  2  2  2;
                     2  2  2  1  3  2  3  4  4  2;
                     5  5  2  2  2  2  2  2  2  2;
                     7  7  2  2  2  2  2  2  2  2]

    #This converts the state_arrays into the shift encoded _UTF8DFAState
    class_row = zeros(_UTF8DFAState, num_classes)

    for i = 1:num_classes
        row = _UTF8DFAState(0)
        for j in 1:num_states
            #Calculate the shift required for the next state
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
            #Shift the next state into the position of the current state
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
        end
        class_row[i]=row
    end

    # Expand per-class rows to a per-byte table (classes are 0-based, hence `c+1`)
    map(c->class_row[c+1],character_classes)
end
345

346

347
# Named DFA states, each stored as its shift amount (states 0, 1 and 2 use shifts 0, 4 and 10).
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Start state; the DFA stays here only while every byte seen so far is ASCII
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) #This state represents the start and end of any valid string
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop
350

351
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
352
# One DFA transition: fetch the packed row for `byte` (the table is 1-indexed, hence `byte+1`),
# shift it right by the current state, and keep the bits selected by the mask 0x1E as the next state.
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
54,209✔
353

354
# Run the UTF-8 DFA over `bytes[first:last]`, starting from `state`, and return
# the resulting state. `first`/`last` default to the whole vector. Element
# access is `@inbounds`, so the caller is responsible for passing a valid range.
# An empty range returns `state` unchanged.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
    for i = first:last
        @inbounds state = _utf_dfa_step(state, bytes[i])
    end
    return (state)
end
360

361
# Scan `cu[first:last]` in windows of `chunk_size` elements and return the start
# index of the first window for which `_isascii` is false, or `nothing` when
# every window passes.
#
# Full windows are tested from `first` onwards; the remainder is covered by one
# final window made of the last `chunk_size` elements, which may overlap
# elements already tested.
# NOTE(review): assumes the range holds at least `chunk_size` elements — the
# caller visible in this file only calls it for inputs longer than 1.5 chunks.
@inline function  _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU}
    n=first
    while n <= last - chunk_size
        _isascii(cu,n,n+chunk_size-1) || return n
        n += chunk_size
    end
    n= last-chunk_size+1
    _isascii(cu,n,last) || return n
    return nothing
end
371

372
##
373

374
# Classifications of string
375
    # 0: neither valid ASCII nor UTF-8
376
    # 1: valid ASCII
377
    # 2: valid UTF-8
378
 # String entry point: classify the code units of `s`.
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
24✔
379

380

381
# Classify a byte vector: 1 for ASCII, 2 for valid UTF-8, 0 for neither.
#
# Inputs longer than 1.5 × 1024 bytes are scanned in 1024-byte chunks for the
# first chunk containing a non-ASCII byte; shorter inputs are tested with one
# `_isascii` call. If everything is ASCII the answer is 1. Otherwise UTF-8
# validation starts at the first non-ASCII chunk (or at index 1 for short
# inputs) and runs to the end.
# NOTE(review): indices are hard-coded as `1:length(bytes)` — presumably only
# 1-based vectors are expected; confirm for offset-indexed inputs.
function byte_string_classify(bytes::AbstractVector{UInt8})
    chunk_size = 1024
    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
    n = length(bytes)
    if n > chunk_threshold
        start = _find_nonascii_chunk(chunk_size,bytes,1,n)
        isnothing(start) && return 1
    else
        _isascii(bytes,1,n) && return 1
        start = 1
    end
    return _byte_string_classify_nonascii(bytes,start,n)
end
394

395
# Validate `bytes[first:last]` as UTF-8 in 256-byte chunks.
# Returns 2 when the DFA finishes in the accept state and 0 otherwise; this
# function never returns 1.
#
# While the DFA is in the accept state (i.e. at a character boundary), whole
# chunks that pass `_isascii` are skipped without running the DFA. Any other
# chunk is fed through the DFA; reaching the invalid state returns 0 at once.
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
    chunk_size = 256

    start = first
    stop = min(last,first + chunk_size - 1)
    state = _UTF8_DFA_ACCEPT
    while start <= last
        # try to process ascii chunks
        while state == _UTF8_DFA_ACCEPT
            _isascii(bytes,start,stop) || break
            (start = start + chunk_size) <= last || break
            stop = min(last,stop + chunk_size)
        end
        # Process non ascii chunk
        # (if the ASCII fast path ran off the end, `start > stop` and this is a no-op)
        state = _isvalid_utf8_dfa(state,bytes,start,stop)
        state == _UTF8_DFA_INVALID && return 0

        start = start + chunk_size
        stop = min(last,stop + chunk_size)
    end
    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
end
417

418
# A byte vector is a valid `String` when its classification is non-zero (ASCII or UTF-8).
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
20,452✔
419
# Same test for an `AbstractString` argument, via the string method of `byte_string_classify`.
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
24✔
420

421
# One-argument form: validate the code units of `s` as a `String`.
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
193✔
422

423
# `true` when the top two bits of `c` are `10`, i.e. `c` has the form `10xxxxxx`.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
300✔
424

425
## required core functionality ##
426

427
# Iteration protocol for `String`: return `(char, next_index)` for the character
# starting at code unit `i`, or `nothing` when `i` is outside `1:ncodeunits(s)`.
#
# The lead byte is placed in the top 8 bits of a `UInt32`. Bytes outside
# `0x80:0xf7` form a complete one-byte `Char` and are returned immediately;
# everything else goes to the out-of-line `iterate_continued`.
@inline function iterate(s::String, i::Int=firstindex(s))
    # single unsigned comparison covering both `i < 1` (wraps around) and `i > ncodeunits(s)`
    (i % UInt) - 1 < ncodeunits(s) || return nothing
    b = @inbounds codeunit(s, i)
    u = UInt32(b) << 24
    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
    return @noinline iterate_continued(s, i, u)
end
434

435
# duck-type s so that external UTF-8 string packages like StringViews can hook in
436
# Slow path of `iterate`: finish decoding a character whose first byte (already
# stored in the top 8 bits of `u`) is in `0x80:0xf7`.
#
# A first byte below `0xc0` is returned as a one-byte `Char`. Otherwise up to
# three following bytes are appended to `u` while they are continuation bytes
# (`10xxxxxx`), the lead byte calls for them, and the string has not ended.
# Returns `(char, next_index)`; `next_index` is the index of the first byte
# that was not consumed.
function iterate_continued(s, i::Int, u::UInt32)
    u < 0xc0000000 && (i += 1; @goto ret)
    n = ncodeunits(s)
    # first continuation byte
    (i += 1) > n && @goto ret
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 16
    # second continuation byte
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 8
    # third continuation byte
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b); i += 1
@label ret
    return reinterpret(Char, u), i
end
457

458
# Return the `Char` that starts at code unit `i`.
#
# `codeunit` performs the bounds check (subject to the caller's `@inbounds`).
# A byte outside `0x80:0xf7` is a complete one-byte `Char`; anything else is
# decoded by `getindex_continued`, which also rejects indices that fall inside
# a character.
@propagate_inbounds function getindex(s::String, i::Int)
    b = codeunit(s, i)
    u = UInt32(b) << 24
    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
    return getindex_continued(s, i, u)
end
464

465
# duck-type s so that external UTF-8 string packages like StringViews can hook in
466
# Slow path of `getindex`: decode the character at `i` whose first byte (already
# stored in the top 8 bits of `u`) is in `0x80:0xf7`.
#
# If the first byte is below `0xc0`, the index is accepted only when
# `isvalid(s, i)` holds (the byte is then returned as a one-byte `Char`);
# otherwise a `StringIndexError` is thrown. For lead bytes, up to three
# following bytes are appended to `u` while they are continuation bytes
# (`10xxxxxx`), the lead byte calls for them, and the string has not ended.
function getindex_continued(s, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        @inbounds isvalid(s, i) && @goto ret
        string_index_err(s, i)
    end
    n = ncodeunits(s)

    (i += 1) > n && @goto ret
    @inbounds b = codeunit(s, i) # cont byte 1
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 16

    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
    @inbounds b = codeunit(s, i) # cont byte 2
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 8

    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
    @inbounds b = codeunit(s, i) # cont byte 3
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b)
@label ret
    return reinterpret(Char, u)
end
491

492
# Generic integer unit ranges: convert both endpoints to `Int` and index again with the resulting range.
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
4✔
493

494
# Return the substring of `s` covering the characters that start at
# `first(r)` through `last(r)`, as a new `String`.
#
# An empty range yields `""`. Unless bounds checks are elided, the range must
# be in bounds (`BoundsError`) and both endpoints must be character start
# indices (`StringIndexError`). The copied byte span runs from `first(r)` to
# the last byte of the character starting at `last(r)`.
@inline function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    i, j = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        @inbounds isvalid(s, i) || string_index_err(s, i)
        @inbounds isvalid(s, j) || string_index_err(s, j)
    end
    # extend `j` to the final byte of the character that starts at `j`
    j = nextind(s, j) - 1
    n = j - i + 1
    ss = _string_n(n)
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
    return ss
end
508

509
# nothrow because we know the start and end indices are valid
510
# Character count of the whole string: start from the code unit count and let `length_continued`
# subtract the continuation bytes it finds between index 1 and `ncodeunits(s)`.
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
868,683✔
511

512
# effects needed because @inbounds
513
# Number of characters of `s` whose start index lies in `i:j`.
#
# Unless bounds checks are elided, `i` must be in `1:ncodeunits(s)+1` and `j`
# in `0:ncodeunits(s)`, otherwise a `BoundsError` is thrown. An empty range
# (`j < i`) gives 0. Counting starts at the beginning of the character
# containing `i`; when `i` falls inside a character, that character is not
# counted (the initial count omits the `+1` for it).
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    j < i && return 0
    @inbounds i, k = thisind(s, i), i
    c = j - i + (i == k)
    @inbounds length_continued(s, i, j, c)
end
523

524
# Core of `length` for `String`: starting from the provisional count `c`,
# subtract one for every continuation byte that validly follows a lead byte in
# `s[i:n]`, and return the corrected count.
#
# The inner loop advances until it has seen a lead byte (`0xc0:0xf7`); the
# outer loop body then examines up to three following bytes, decrementing `c`
# for each one that is a continuation byte (`10xxxxxx`) and that the lead byte
# calls for. The function returns as soon as the index passes `n`.
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
    i < n || return c
    b = codeunit(s, i)
    while true
        while true
            (i += 1) ≤ n || return c
            0xc0 ≤ b ≤ 0xf7 && break
            b = codeunit(s, i)
        end
        l = b
        b = codeunit(s, i) # cont byte 1
        c -= (x = b & 0xc0 == 0x80)
        x & (l ≥ 0xe0) || continue

        (i += 1) ≤ n || return c
        b = codeunit(s, i) # cont byte 2
        c -= (x = b & 0xc0 == 0x80)
        x & (l ≥ 0xf0) || continue

        (i += 1) ≤ n || return c
        b = codeunit(s, i) # cont byte 3
        c -= (b & 0xc0 == 0x80)
    end
end
548

549
## overload methods for efficiency ##
550

551
# An index is valid when it is in bounds and `thisind` maps it to itself.
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
238,950,825✔
552

553
# ASCII test for `String`: forwards to `isascii` on the string's code units.
isascii(s::String) = isascii(codeunits(s))
733,444✔
554

555
# don't assume effects for general integers since we cannot know their implementation
556
# `Char` with a built-in integer count: annotated `:foldable` and forwarded with `@invoke`
# to the method taking `r::Integer`.
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
312,033✔
557

558
"""
559
    repeat(c::AbstractChar, r::Integer) -> String
560

561
Repeat a character `r` times. This can equivalently be accomplished by calling
562
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
563

564
# Examples
565
```jldoctest
566
julia> repeat('A', 3)
567
"AAA"
568
```
569
"""
570
function repeat(c::AbstractChar, r::Integer)
    c = Char(c)::Char
    r == 0 && return ""
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    # Byte-swap the `Char` bits so the first encoded byte ends up in the lowest 8 bits.
    u = bswap(reinterpret(UInt32, c))
    # Number of bytes in the encoding (1 to 4): `| 0xff` guarantees at least one byte is counted.
    n = 4 - (leading_zeros(u | 0xff) >> 3)
    s = _string_n(n*r)
    p = pointer(s)
    # Fill the new string with `r` copies of the `n`-byte pattern, one strategy per width.
    # NOTE(review): the 2- and 4-byte branches store `u` as a whole integer, which
    # presumably relies on little-endian byte order — confirm for other targets.
    GC.@preserve s if n == 1
        memset(p, u % UInt8, r)
    elseif n == 2
        p16 = reinterpret(Ptr{UInt16}, p)
        for i = 1:r
            unsafe_store!(p16, u % UInt16, i)
        end
    elseif n == 3
        # no 3-byte integer type: write the three bytes individually
        b1 = (u >> 0) % UInt8
        b2 = (u >> 8) % UInt8
        b3 = (u >> 16) % UInt8
        for i = 0:r-1
            unsafe_store!(p, b1, 3i + 1)
            unsafe_store!(p, b2, 3i + 2)
            unsafe_store!(p, b3, 3i + 3)
        end
    elseif n == 4
        p32 = reinterpret(Ptr{UInt32}, p)
        for i = 1:r
            unsafe_store!(p32, u, i)
        end
    end
    return s
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc