• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37740

07 Apr 2024 07:37AM UTC coverage: 81.426% (-0.003%) from 81.429%
#37740

push

local

web-flow
Fix typos in docstrings (#53986)

70717 of 86848 relevant lines covered (81.43%)

15725683.27 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.38
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
9
    string::AbstractString
57✔
10
    index::Integer
11
end
12
# Throw a `StringIndexError` for string `s` and index `i` (the index is converted to `Int`).
# Declared `@noinline` so this throwing helper is not inlined into its callers.
@noinline string_index_err(s::AbstractString, i::Integer) =
53✔
13
    throw(StringIndexError(s, Int(i)))
14
# Print a description of `exc` to `io`: the invalid index and, when that index lies within
# the string's code units, the nearest valid index (or two) with the escaped text found there.
function Base.showerror(io::IO, exc::StringIndexError)
4✔
15
    s = exc.string
4✔
16
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
4✔
17
    # only report neighbouring indices when the bad index falls inside the string
    if firstindex(s) <= exc.index <= ncodeunits(s)
4✔
18
        # valid index at or before the offending one, as reported by `thisind`
        iprev = thisind(s, exc.index)
4✔
19
        # index following `iprev`; may lie past the end of the string
        inext = nextind(s, iprev)
4✔
20
        escprev = escape_string(s[iprev:iprev])
4✔
21
        if inext <= ncodeunits(s)
4✔
22
            escnext = escape_string(s[inext:inext])
3✔
23
            print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
3✔
24
        else
25
            # `iprev` is the last valid index, so there is only one neighbour to show
            print(io, ", valid nearby index [$iprev]=>'$escprev'")
1✔
26
        end
27
    end
28
end
29

30
const ByteArray = Union{CodeUnits{UInt8,String}, Vector{UInt8},Vector{Int8}, FastContiguousSubArray{UInt8,1,CodeUnits{UInt8,String}}, FastContiguousSubArray{UInt8,1,Vector{UInt8}}, FastContiguousSubArray{Int8,1,Vector{Int8}}}
31

32
# Return whether `lo ≤ b ≤ hi`. The two comparisons are combined with the non-short-circuiting
# `&`, so both are always evaluated.
@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo ≤ b) & (b ≤ hi)
325,408,298✔
33

34
"""
35
    String <: AbstractString
36

37
The default string type in Julia, used by e.g. string literals.
38

39
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
40
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
41
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
42
that the underlying byte sequence is valid as UTF-8.
43
"""
44
String
45

46
## constructors and conversions ##
47

48
# String constructor docstring from boot.jl, workaround for #16730
49
# and the unavailability of @doc in boot.jl context.
50
"""
51
    String(v::AbstractVector{UInt8})
52

53
Create a new `String` object using the data buffer from byte vector `v`.
54
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
55
modification of `v` cannot affect the contents of the resulting string.
56
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
57
`AbstractVector` types, `String(v)` already makes a copy.
58

59
When possible, the memory of `v` will be used without copying when the `String`
60
object is created. This is guaranteed to be the case for byte vectors returned
61
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
62
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
63
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
64
to guarantee consistent behavior.
65
"""
66
String(v::AbstractVector{UInt8}) = String(copyto!(StringMemory(length(v)), v))
5,225,710✔
67
# Build a `String` from a `Memory{UInt8}`. An empty `v` yields `""`; otherwise `v` and its
# length are handed to the runtime function `jl_genericmemory_to_string`.
function String(v::Memory{UInt8})
17✔
68
    len = length(v)
6,168,855✔
69
    len == 0 && return ""
6,168,855✔
70
    return ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), v, len)
6,168,804✔
71
end
72
# Build a `String` from a `Vector{UInt8}` and leave `v` empty afterwards.
# An empty `v` returns `""` immediately and is left untouched.
function String(v::Vector{UInt8})
2,605✔
73
    #return ccall(:jl_array_to_string, Ref{String}, (Any,), v)
74
    len = length(v)
9,839,910✔
75
    len == 0 && return ""
9,839,910✔
76
    ref = v.ref
8,763,209✔
77
    # when the vector starts at the beginning of its backing memory, pass that memory
    # to the runtime directly; otherwise build the string from the pointer and length
    if ref.ptr_or_offset == ref.mem.ptr
8,763,212✔
78
        str = ccall(:jl_genericmemory_to_string, Ref{String}, (Any, Int), ref.mem, len)
8,763,211✔
79
    else
80
        str = ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), ref, len)
1✔
81
    end
82
    # optimized empty!(v); sizehint!(v, 0) calls
83
    # (resets the size to zero and points `v` at a fresh, empty `Memory{UInt8}`)
    setfield!(v, :size, (0,))
8,763,212✔
84
    setfield!(v, :ref, MemoryRef(Memory{UInt8}()))
8,763,209✔
85
    return str
8,763,212✔
86
end
87

88
"""
89
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
90

91
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
92
(The pointer can be safely freed afterwards.) If `length` is specified
93
(the length of the data in bytes), the string does not have to be NUL-terminated.
94

95
This function is labeled "unsafe" because it will crash if `p` is not
96
a valid memory address to data of the requested length.
97
"""
98
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
40✔
99
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
1,045,963✔
100
    ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
1,045,962✔
101
end
102
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
176✔
103
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
7,626,409✔
104
    ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
7,626,408✔
105
end
106

107
# This is @assume_effects :effect_free :nothrow :terminates_globally @ccall jl_alloc_string(n::Csize_t)::Ref{String},
108
# but the macro is not available at this time in bootstrap, so we write it manually.
109
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String}, Expr(:call, Expr(:core, :svec), :Csize_t), 1, QuoteNode((:ccall,0x000e)), :(convert(Csize_t, n))))
169,029,280✔
110

111
"""
112
    String(s::AbstractString)
113

114
Create a new `String` from an existing `AbstractString`.
115
"""
116
String(s::AbstractString) = print_to_string(s)
464✔
117
# Build a `String` from a `Symbol` by converting the symbol to a `Ptr{UInt8}` and reading it
# with the one-argument `unsafe_string` (which expects NUL-terminated data).
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
5,281,623✔
118

119
# Obtain a `Memory{UInt8}` for `s` through the runtime function `jl_string_to_genericmemory`.
# NOTE(review): presumably the result shares storage with `s` (hence "unsafe") and must not
# be mutated — confirm against the runtime implementation.
unsafe_wrap(::Type{Memory{UInt8}}, s::String) = ccall(:jl_string_to_genericmemory, Ref{Memory{UInt8}}, (Any,), s)
21,429,125✔
120
# Wrap `s` as a vector of bytes: first obtain the `Memory{UInt8}` wrapper for `s`, then
# return a view over all of its indices.
# NOTE(review): presumably no bytes are copied, so the result aliases `s` — confirm.
function unsafe_wrap(::Type{Vector{UInt8}}, s::String)
3✔
121
    mem = unsafe_wrap(Memory{UInt8}, s)
65,028✔
122
    view(mem, eachindex(mem))
129,860✔
123
end
124

125
# Copy the code units of `s` into a newly allocated `Vector{UInt8}` of the same length.
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
37,410✔
126
# Convert a `String` to a `Vector{UInt8}` by going through its `codeunits` wrapper.
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
37,377✔
127
# `Array{UInt8}(s)` produces the same result as `Vector{UInt8}(s)`.
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
128

129
# Return the string held in the wrapper's `s` field; nothing is copied here.
String(s::CodeUnits{UInt8,String}) = s.s
3✔
130

131
## low-level functions ##
132

133
# Raw `Ptr{UInt8}` for `s`, obtained via `unsafe_convert`. Users in this file wrap
# dereferences in `GC.@preserve s` so that `s` stays alive while the pointer is in use.
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
822,006,784✔
134
# Pointer to the `i`-th code unit of `s` (1-based): the base pointer advanced by `i - 1`.
# No bounds check is performed here.
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
472,563,087✔
135

136
# Number of code units in `s`, taken directly from `Core.sizeof(s)`.
ncodeunits(s::String) = Core.sizeof(s)
926,142,437✔
137
# The code unit type of `String` is `UInt8`.
codeunit(s::String) = UInt8
×
138

139
# Generic-integer entry point: convert `i` to `Int` and forward to the `Int` method.
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
×
140
# Return the `i`-th code unit (byte) of `s`. The bounds check sits in a `@boundscheck`
# block, so callers that use `@inbounds` may skip it.
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
840✔
141
    @boundscheck checkbounds(s, i)
442,270,486✔
142
    # keep `s` alive while the byte is read through a raw pointer
    b = GC.@preserve s unsafe_load(pointer(s, i))
442,270,478✔
143
    return b
442,270,478✔
144
end
145

146
## comparison ##
147

148
# `String`/`String` specialization: carries the `:total` effects assumption and forwards,
# via `@invoke`, to the generic two-argument method.
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
5,372,576✔
149

150
# Compare only the common prefix: the length passed on is the smaller of the two sizes.
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
6,346,141✔
151
# Compare the first `len` bytes of `a` and `b` with `memcmp` and return the result as an `Int`.
# Both arguments are converted to `Ptr{UInt8}`; no check is made here that `len` bytes are
# actually available behind either pointer.
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
152
    # keep `a` and `b` alive for as long as their raw pointers are in use
    GC.@preserve a b begin
10,418,225✔
153
        pa = unsafe_convert(Ptr{UInt8}, a)
9,927,662✔
154
        pb = unsafe_convert(Ptr{UInt8}, b)
10,418,225✔
155
        memcmp(pa, pb, len % Csize_t) % Int
10,418,225✔
156
    end
157
end
158

159
# Three-way comparison of two `String`s, returning -1, 0 or +1.
# The bytes of the common prefix are compared first; if they are equal, the result is
# decided by comparing the two byte sizes.
function cmp(a::String, b::String)
1✔
160
    al, bl = sizeof(a), sizeof(b)
5,372,576✔
161
    c = _memcmp(a, b)
5,372,576✔
162
    # normalize the memcmp result to -1/+1; on a tie fall back to the size comparison
    return c < 0 ? -1 : c > 0 ? +1 : cmp(al,bl)
9,514,534✔
163
end
164

165
# `String` equality delegates to `===`.
# NOTE(review): presumably `===` on `String` compares contents rather than object identity — confirm.
==(a::String, b::String) = a===b
12,416,694✔
166

167
# The empty string is the `typemin` of `String`.
typemin(::Type{String}) = ""
1✔
168
# Instance form: forwards to the type-level method.
typemin(::String) = typemin(String)
1✔
169

170
## thisind, nextind ##
171

172
# `String` method of `thisind`; the work is done by `_thisind_str`. `@propagate_inbounds`
# lets a caller's `@inbounds` reach the `@boundscheck` inside that helper.
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
264,035,270✔
173

174
# s should be String or SubString{String}
175
@inline function _thisind_str(s, i::Int)
176
    i == 0 && return 0
136,348,721✔
177
    n = ncodeunits(s)
136,526,691✔
178
    i == n + 1 && return i
136,526,691✔
179
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
136,503,727✔
180
    @inbounds b = codeunit(s, i)
136,503,703✔
181
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
250,936,027✔
182
    (@noinline function _thisind_continued(s, i, n) # mark the rest of the function as a slow-path
43,783,123✔
183
        local b
×
184
        @inbounds b = codeunit(s, i-1)
21,891,565✔
185
        between(b, 0b11000000, 0b11110111) && return i-1
21,891,565✔
186
        (b & 0xc0 == 0x80) & (i-2 > 0) || return i
21,304,524✔
187
        @inbounds b = codeunit(s, i-2)
8,910,578✔
188
        between(b, 0b11100000, 0b11110111) && return i-2
8,910,578✔
189
        (b & 0xc0 == 0x80) & (i-3 > 0) || return i
9,657,186✔
190
        @inbounds b = codeunit(s, i-3)
1,842,110✔
191
        between(b, 0b11110000, 0b11110111) && return i-3
1,842,110✔
192
        return i
1,056,047✔
193
    end)(s, i, n)
194
end
195

196
# `String` method of `nextind`; the work is done by `_nextind_str`. `@propagate_inbounds`
# lets a caller's `@inbounds` reach the `@boundscheck` inside that helper.
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
53,132,279✔
197

198
# s should be String or SubString{String}
199
@inline function _nextind_str(s, i::Int)
200
    i == 0 && return 1
36,518,969✔
201
    n = ncodeunits(s)
36,251,446✔
202
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
36,251,458✔
203
    @inbounds l = codeunit(s, i)
36,251,434✔
204
    between(l, 0x80, 0xf7) || return i+1
71,587,783✔
205
    (@noinline function _nextind_continued(s, i, n, l) # mark the rest of the function as a slow-path
1,830,138✔
206
        if l < 0xc0
915,069✔
207
            # handle invalid codeunit index by scanning back to the start of this index
208
            # (which may be the same as this index)
209
            i′ = @inbounds thisind(s, i)
69,068✔
210
            i′ >= i && return i+1
34,534✔
211
            i = i′
×
212
            @inbounds l = codeunit(s, i)
17,495✔
213
            (l < 0x80) | (0xf8 ≤ l) && return i+1
17,495✔
214
            @assert l >= 0xc0
17,495✔
215
        end
216
        # first continuation byte
217
        (i += 1) > n && return i
898,030✔
218
        @inbounds b = codeunit(s, i)
897,272✔
219
        b & 0xc0 ≠ 0x80 && return i
897,272✔
220
        ((i += 1) > n) | (l < 0xe0) && return i
884,625✔
221
        # second continuation byte
222
        @inbounds b = codeunit(s, i)
865,080✔
223
        b & 0xc0 ≠ 0x80 && return i
865,080✔
224
        ((i += 1) > n) | (l < 0xf0) && return i
863,260✔
225
        # third continuation byte
226
        @inbounds b = codeunit(s, i)
279,698✔
227
        return ifelse(b & 0xc0 ≠ 0x80, i, i+1)
279,698✔
228
    end)(s, i, n, l)
229
end
230

231
## checking UTF-8 & ASCII validity ##
232
#=
233
    The UTF-8 Validation is performed by a shift based DFA.
234
    ┌───────────────────────────────────────────────────────────────────┐
235
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
236
    │                               ├────────3────────┐           │     │
237
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
238
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
239
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
240
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
241
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
242
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
243
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
244
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
245
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
246
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
247
    │                      │        │     ├─┤               │        │  │
248
    │                      │        └─4──►│6├─────1,9───────┘        │  │
249
    │          INVALID     │              └─┘                        │  │
250
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
251
    │          ┌▼───┴┐                                                  │
252
    │          │  2  ◄─── All undefined transitions result in state 2   │
253
    │          └─────┘                                                  │
254
    └───────────────────────────────────────────────────────────────────┘
255

256
        Validation States
257
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
258
                        If the DFA ends in this state the string is ASCII only
259
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
260
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
261
                    as seen by all 2s in that column of table below
262
            3 -> One valid continuation byte needed to return to state 1 (_UTF8_DFA_ACCEPT)
263
        4,5,6 -> Two valid continuation bytes needed to return to state 1 (_UTF8_DFA_ACCEPT)
264
        7,8,9 -> Three valid continuation bytes needed to return to state 1 (_UTF8_DFA_ACCEPT)
265

266
                        Current State
267
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
268
                0 | 0  1  2  2  2  2  2  2  2  2
269
                1 | 2  2  2  1  3  2  3  2  4  4
270
                2 | 3  3  2  2  2  2  2  2  2  2
271
                3 | 4  4  2  2  2  2  2  2  2  2
272
                4 | 6  6  2  2  2  2  2  2  2  2
273
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
274
    Class       6 | 8  8  2  2  2  2  2  2  2  2
275
                7 | 2  2  2  1  3  3  2  4  4  2
276
                8 | 2  2  2  2  2  2  2  2  2  2
277
                9 | 2  2  2  1  3  2  3  4  4  2
278
               10 | 5  5  2  2  2  2  2  2  2  2
279
               11 | 7  7  2  2  2  2  2  2  2  2
280

281
           Shifts | 0  4 10 14 18 24  8 20 12 26
282

283
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
284
    the rows the correct shift was a result.
285

286
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
287
    the current state then masking the result with 0b11110 (0x1E) gives the shift for the new state
288

289

290
=#
291

292
#State type used by UTF-8 DFA
293
const _UTF8DFAState = UInt32
294
# Fill the table with 256 `_UTF8DFAState` (UInt32) entries representing the DFA transitions for all bytes
295
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
296
    num_classes=12
297
    num_states=10
298
    bit_per_state = 6
299

300
    # These shifts were derived using a SMT solver
301
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
302

303
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
304
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
305
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
306
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
307
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
308
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
309
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
310
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
311
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
312
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
313
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
314
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
315
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
316
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
317
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
318
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
319

320
    # These are the rows discussed in comments above
321
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
322
                     2  2  2  1  3  2  3  2  4  4;
323
                     3  3  2  2  2  2  2  2  2  2;
324
                     4  4  2  2  2  2  2  2  2  2;
325
                     6  6  2  2  2  2  2  2  2  2;
326
                     9  9  2  2  2  2  2  2  2  2;
327
                     8  8  2  2  2  2  2  2  2  2;
328
                     2  2  2  1  3  3  2  4  4  2;
329
                     2  2  2  2  2  2  2  2  2  2;
330
                     2  2  2  1  3  2  3  4  4  2;
331
                     5  5  2  2  2  2  2  2  2  2;
332
                     7  7  2  2  2  2  2  2  2  2]
333

334
    #This converts the state_arrays into the shift encoded _UTF8DFAState
335
    class_row = zeros(_UTF8DFAState, num_classes)
336

337
    for i = 1:num_classes
338
        row = _UTF8DFAState(0)
339
        for j in 1:num_states
340
            #Calculate the shift required for the next state
341
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
342
            #Shift the next state into the position of the current state
343
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
344
        end
345
        class_row[i]=row
346
    end
347

348
    map(c->class_row[c+1],character_classes)
×
349
end
350

351

352
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # Start state; the DFA only stays (and ends) here if every byte is ASCII
353
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) #This state represents the start and end of any valid string
354
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # If the state machine is ever in this state just stop
355

356
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
357
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
54,209✔
358

359
# Run the UTF-8 DFA over `bytes[first:last]`, starting from `state`, and return the state
# reached after the last byte. An empty range returns `state` unchanged.
# Indexing is `@inbounds`, so the caller must ensure `first:last` lies within `bytes`.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
5,248✔
360
    for i = first:last
25,821✔
361
       @inbounds state = _utf_dfa_step(state, bytes[i])
54,209✔
362
    end
82,597✔
363
    return (state)
25,821✔
364
end
365

366
# Scan `cu[first:last]` in windows of `chunk_size` elements and return the start index of the
# first window for which `_isascii` is false, or `nothing` if every window passes.
# The final window is positioned to end exactly at `last`, so it may overlap the one before it.
# NOTE(review): when the range is shorter than `chunk_size` the final window would start
# before `first`; the caller visible in this file only uses it for inputs longer than 1.5 chunks.
@inline function  _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU}
367
    n=first
10✔
368
    # full windows that end strictly before `last`
    while n <= last - chunk_size
40✔
369
        _isascii(cu,n,n+chunk_size-1) || return n
30✔
370
        n += chunk_size
30✔
371
    end
30✔
372
    # last window, aligned to the end of the range
    n= last-chunk_size+1
10✔
373
    _isascii(cu,n,last) || return n
10✔
374
    return nothing
10✔
375
end
376

377
##
378

379
# Classifications of string
380
    # 0: neither valid ASCII nor UTF-8
381
    # 1: valid ASCII
382
    # 2: valid UTF-8
383
 # String entry point: classify the code units of `s`.
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
24✔
384

385

386
# Classify `bytes`: returns 1 when `_isascii` accepts the whole input, otherwise the result
# of `_byte_string_classify_nonascii` (0 = invalid, 2 = valid UTF-8).
# Inputs longer than 1.5 × 1024 bytes are pre-scanned in 1024-byte chunks so that the DFA
# can start at the first chunk containing a non-ASCII byte.
# NOTE(review): the indices `1` and `n` assume `bytes` is 1-based — confirm for offset arrays.
function byte_string_classify(bytes::AbstractVector{UInt8})
24✔
387
    chunk_size = 1024
20,429✔
388
    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
20,429✔
389
    n = length(bytes)
20,429✔
390
    if n > chunk_threshold
20,429✔
391
        # long input: locate the first chunk that is not pure ASCII
        start = _find_nonascii_chunk(chunk_size,bytes,1,n)
10✔
392
        isnothing(start) && return 1
10✔
393
    else
394
        # short input: a single ASCII check over the whole range
        _isascii(bytes,1,n) && return 1
20,419✔
395
        start = 1
20,189✔
396
    end
397
    return _byte_string_classify_nonascii(bytes,start,n)
20,189✔
398
end
399

400
# Validate `bytes[first:last]` as UTF-8 in chunks of 256 bytes.
# Returns 0 as soon as the DFA reaches `_UTF8_DFA_INVALID`, or at the end if the DFA is not
# in `_UTF8_DFA_ACCEPT`; returns 2 otherwise. This function never returns 1, even when the
# range is entirely ASCII.
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
20,189✔
401
    chunk_size = 256
20,189✔
402

403
    start = first
20,189✔
404
    stop = min(last,first + chunk_size - 1)
20,189✔
405
    state = _UTF8_DFA_ACCEPT
20,189✔
406
    while start <= last
24,926✔
407
        # try to process ascii chunks
408
        # (only possible between characters, i.e. while the DFA is in the accept state)
        while state == _UTF8_DFA_ACCEPT
20,189✔
409
            _isascii(bytes,start,stop) || break
20,189✔
410
            # if this advances past `last`, the DFA call below sees an empty range
            (start = start + chunk_size) <= last || break
×
411
            stop = min(last,stop + chunk_size)
×
412
        end
×
413
        # Process non ascii chunk
414
        state = _isvalid_utf8_dfa(state,bytes,start,stop)
48,577✔
415
        state == _UTF8_DFA_INVALID && return 0
20,189✔
416

417
        start = start + chunk_size
4,737✔
418
        stop = min(last,stop + chunk_size)
4,737✔
419
    end
4,737✔
420
    # any final state other than accept is reported as invalid
    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
4,737✔
421
end
422

423
# `bytes` is acceptable as a `String` when `byte_string_classify` returns a non-zero class
# (1 = ASCII, 2 = UTF-8).
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
20,451✔
424
# Same check for an `AbstractString`: non-zero classification means valid.
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
24✔
425

426
# One-argument form: validate the code units of `s` as a `String`.
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
192✔
427

428
# True when the top two bits of `c` are `10`, i.e. `c` has the form of a UTF-8 continuation byte.
is_valid_continuation(c) = c & 0xc0 == 0x80
325✔
429

430
## required core functionality ##
431

432
# Iterate `s` from code unit index `i`: return `(char, next_index)`, or `nothing` when `i`
# is outside `1:ncodeunits(s)`.
@inline function iterate(s::String, i::Int=firstindex(s))
9,681✔
433
    # one unsigned comparison covers both ends: `i ≤ 0` wraps around to a huge value
    (i % UInt) - 1 < ncodeunits(s) || return nothing
76,866,200✔
434
    b = @inbounds codeunit(s, i)
64,625,778✔
435
    # the first byte occupies the top 8 bits of the 32-bit value reinterpreted as `Char`
    u = UInt32(b) << 24
64,625,778✔
436
    # bytes outside 0x80:0xf7 are returned as a one-byte `Char` straight away
    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
128,161,615✔
437
    return iterate_continued(s, i, u)
1,089,941✔
438
end
439

440
# duck-type s so that external UTF-8 string packages like StringViews can hook in
441
function iterate_continued(s, i::Int, u::UInt32)
1,089,941✔
442
    u < 0xc0000000 && (i += 1; @goto ret)
1,089,941✔
443
    n = ncodeunits(s)
1,082,588✔
444
    # first continuation byte
445
    (i += 1) > n && @goto ret
1,082,588✔
446
    @inbounds b = codeunit(s, i)
1,079,982✔
447
    b & 0xc0 == 0x80 || @goto ret
1,079,982✔
448
    u |= UInt32(b) << 16
1,075,002✔
449
    # second continuation byte
450
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
1,075,002✔
451
    @inbounds b = codeunit(s, i)
937,901✔
452
    b & 0xc0 == 0x80 || @goto ret
938,020✔
453
    u |= UInt32(b) << 8
937,782✔
454
    # third continuation byte
455
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
937,782✔
456
    @inbounds b = codeunit(s, i)
276,181✔
457
    b & 0xc0 == 0x80 || @goto ret
276,187✔
458
    u |= UInt32(b); i += 1
276,175✔
459
@label ret
460
    return reinterpret(Char, u), i
1,089,941✔
461
end
462

463
# Return the `Char` starting at code unit index `i`. Bounds checking comes from `codeunit`
# and can be elided by the caller's `@inbounds` thanks to `@propagate_inbounds`.
@propagate_inbounds function getindex(s::String, i::Int)
1,973✔
464
    b = codeunit(s, i)
19,131,087✔
465
    # the first byte occupies the top 8 bits of the 32-bit value reinterpreted as `Char`
    u = UInt32(b) << 24
19,131,087✔
466
    # bytes outside 0x80:0xf7 are returned as a one-byte `Char` straight away
    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
37,452,179✔
467
    return getindex_continued(s, i, u)
809,962✔
468
end
469

470
# duck-type s so that external UTF-8 string packages like StringViews can hook in
471
function getindex_continued(s, i::Int, u::UInt32)
809,963✔
472
    if u < 0xc0000000
809,965✔
473
        # called from `getindex` which checks bounds
474
        @inbounds isvalid(s, i) && @goto ret
54✔
475
        string_index_err(s, i)
1✔
476
    end
477
    n = ncodeunits(s)
809,938✔
478

479
    (i += 1) > n && @goto ret
809,938✔
480
    @inbounds b = codeunit(s, i) # cont byte 1
809,937✔
481
    b & 0xc0 == 0x80 || @goto ret
809,937✔
482
    u |= UInt32(b) << 16
809,926✔
483

484
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
809,926✔
485
    @inbounds b = codeunit(s, i) # cont byte 2
802,977✔
486
    b & 0xc0 == 0x80 || @goto ret
802,977✔
487
    u |= UInt32(b) << 8
802,977✔
488

489
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
802,977✔
490
    @inbounds b = codeunit(s, i) # cont byte 3
262,113✔
491
    b & 0xc0 == 0x80 || @goto ret
262,113✔
492
    u |= UInt32(b)
262,113✔
493
@label ret
494
    return reinterpret(Char, u)
809,964✔
495
end
496

497
# Generic unit ranges: convert both endpoints to `Int` and index with a `UnitRange{Int}`.
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
4✔
498

499
# Return a new `String` holding the characters of `s` from index `first(r)` through the
# whole character starting at `last(r)`. An empty range yields `""`.
# Unless bounds checks are elided, throws via `checkbounds` for an out-of-range `r` and via
# `string_index_err` when either endpoint is not a valid character index.
@inline function getindex(s::String, r::UnitRange{Int})
67,378✔
500
    isempty(r) && return ""
995,778✔
501
    i, j = first(r), last(r)
73,280✔
502
    @boundscheck begin
971,941✔
503
        checkbounds(s, r)
971,947✔
504
        @inbounds isvalid(s, i) || string_index_err(s, i)
969,467✔
505
        @inbounds isvalid(s, j) || string_index_err(s, j)
971,936✔
506
    end
507
    # extend `j` to the last code unit of the character that starts at `j`
    j = nextind(s, j) - 1
1,943,867✔
508
    # number of bytes to copy
    n = j - i + 1
971,934✔
509
    ss = _string_n(n)
971,934✔
510
    # keep both strings alive while copying through raw pointers
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
971,934✔
511
    return ss
971,934✔
512
end
513

514
# nothrow because we know the start and end indices are valid
515
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
1,004,682✔
516

517
# effects needed because @inbounds
518
# Number of characters of `s` in the code unit range `i:j`.
# Unless bounds checks are elided, throws `BoundsError` when `i` is outside
# `1:ncodeunits(s)+1` or `j` is outside `0:ncodeunits(s)`. Returns 0 when `j < i`.
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
5✔
519
    @boundscheck begin
1,506,864✔
520
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
1,506,864✔
521
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
1,506,868✔
522
    end
523
    j < i && return 0
1,506,860✔
524
    # move `i` to the index reported by `thisind`; `k` remembers the index originally requested
    @inbounds i, k = thisind(s, i), i
2,941,964✔
525
    # initial count is the code unit count of `i:j`, one less if `i` had to be moved;
    # `length_continued` then lowers it for continuation bytes
    c = j - i + (i == k)
1,470,982✔
526
    @inbounds length_continued(s, i, j, c)
1,470,982✔
527
end
528

529
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
530
    i < n || return c
2,840,770✔
531
    b = codeunit(s, i)
2,110,558✔
532
    while true
4,716,089✔
533
        while true
61,930,955✔
534
            (i += 1) ≤ n || return c
64,040,738✔
535
            0xc0 ≤ b ≤ 0xf7 && break
59,821,172✔
536
            b = codeunit(s, i)
57,214,866✔
537
        end
57,214,866✔
538
        l = b
×
539
        b = codeunit(s, i) # cont byte 1
2,606,306✔
540
        c -= (x = b & 0xc0 == 0x80)
2,606,306✔
541
        x & (l ≥ 0xe0) || continue
2,606,306✔
542

543
        (i += 1) ≤ n || return c
2,211,159✔
544
        b = codeunit(s, i) # cont byte 2
2,209,629✔
545
        c -= (x = b & 0xc0 == 0x80)
2,209,629✔
546
        x & (l ≥ 0xf0) || continue
3,795,415✔
547

548
        (i += 1) ≤ n || return c
623,853✔
549
        b = codeunit(s, i) # cont byte 3
623,833✔
550
        c -= (b & 0xc0 == 0x80)
623,833✔
551
    end
2,605,531✔
552
end
553

554
## overload methods for efficiency ##
555

556
# `i` is a valid index when it is in bounds and `thisind` maps it to itself.
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
238,168,773✔
557

558
# Check ASCII-ness on the code units of `s` rather than character by character.
isascii(s::String) = isascii(codeunits(s))
714,897✔
559

560
# don't assume effects for general integers since we cannot know their implementation
561
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
187,600✔
562

563
"""
564
    repeat(c::AbstractChar, r::Integer) -> String
565

566
Repeat a character `r` times. This can equivalently be accomplished by calling
567
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
568

569
# Examples
570
```jldoctest
571
julia> repeat('A', 3)
572
"AAA"
573
```
574
"""
575
function repeat(c::AbstractChar, r::Integer)
187,430✔
576
    c = Char(c)::Char
187,601✔
577
    r == 0 && return ""
187,601✔
578
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
111,182✔
579
    u = bswap(reinterpret(UInt32, c))
111,178✔
580
    n = 4 - (leading_zeros(u | 0xff) >> 3)
111,178✔
581
    s = _string_n(n*r)
111,237✔
582
    p = pointer(s)
111,235✔
583
    GC.@preserve s if n == 1
111,235✔
584
        memset(p, u % UInt8, r)
111,117✔
585
    elseif n == 2
118✔
586
        p16 = reinterpret(Ptr{UInt16}, p)
6✔
587
        for i = 1:r
6✔
588
            unsafe_store!(p16, u % UInt16, i)
20✔
589
        end
25✔
590
    elseif n == 3
112✔
591
        b1 = (u >> 0) % UInt8
108✔
592
        b2 = (u >> 8) % UInt8
108✔
593
        b3 = (u >> 16) % UInt8
108✔
594
        for i = 0:r-1
108✔
595
            unsafe_store!(p, b1, 3i + 1)
578✔
596
            unsafe_store!(p, b2, 3i + 2)
578✔
597
            unsafe_store!(p, b3, 3i + 3)
578✔
598
        end
578✔
599
    elseif n == 4
4✔
600
        p32 = reinterpret(Ptr{UInt32}, p)
4✔
601
        for i = 1:r
4✔
602
            unsafe_store!(p32, u, i)
8✔
603
        end
111,243✔
604
    end
605
    return s
111,176✔
606
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc