• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37596

pending completion
#37596

push

local

web-flow
🤖 [master] Bump the Pkg stdlib from 2c04d5a98 to b044bf6a2 (#50851)

Co-authored-by: Dilum Aluthge <dilum@aluthge.com>

71913 of 84418 relevant lines covered (85.19%)

32144286.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.4
/base/strings/string.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
"""
4
    StringIndexError(str, i)
5

6
An error occurred when trying to access `str` at index `i` that is not valid.
7
"""
8
struct StringIndexError <: Exception
9
    string::AbstractString
572✔
10
    index::Integer
11
end
12
@noinline string_index_err(s::AbstractString, i::Integer) =
568✔
13
    throw(StringIndexError(s, Int(i)))
14
function Base.showerror(io::IO, exc::StringIndexError)
4✔
15
    s = exc.string
4✔
16
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
4✔
17
    if firstindex(s) <= exc.index <= ncodeunits(s)
4✔
18
        iprev = thisind(s, exc.index)
4✔
19
        inext = nextind(s, iprev)
4✔
20
        escprev = escape_string(s[iprev:iprev])
4✔
21
        if inext <= ncodeunits(s)
4✔
22
            escnext = escape_string(s[inext:inext])
3✔
23
            print(io, ", valid nearby indices [$iprev]=>'$escprev', [$inext]=>'$escnext'")
3✔
24
        else
25
            print(io, ", valid nearby index [$iprev]=>'$escprev'")
1✔
26
        end
27
    end
28
end
29

30
const ByteArray = Union{CodeUnits{UInt8,String}, Vector{UInt8},Vector{Int8}, FastContiguousSubArray{UInt8,1,CodeUnits{UInt8,String}}, FastContiguousSubArray{UInt8,1,Vector{UInt8}}, FastContiguousSubArray{Int8,1,Vector{Int8}}}
31

32
# Return `true` when `b` lies in the closed interval `[lo, hi]`.
# Both comparisons are combined with the non-short-circuiting `&`.
@inline function between(b::T, lo::T, hi::T) where {T<:Integer}
    return (b ≥ lo) & (hi ≥ b)
end
173,954,023✔
33

34
"""
35
    String <: AbstractString
36

37
The default string type in Julia, used by e.g. string literals.
38

39
`String`s are immutable sequences of `Char`s. A `String` is stored internally as
40
a contiguous byte array, and while they are interpreted as being UTF-8 encoded,
41
they can be composed of any byte sequence. Use [`isvalid`](@ref) to validate
42
that the underlying byte sequence is valid as UTF-8.
43
"""
44
String
45

46
## constructors and conversions ##
47

48
# String constructor docstring from boot.jl, workaround for #16730
49
# and the unavailability of @doc in boot.jl context.
50
"""
51
    String(v::AbstractVector{UInt8})
52

53
Create a new `String` object using the data buffer from byte vector `v`.
54
If `v` is a `Vector{UInt8}` it will be truncated to zero length and future
55
modification of `v` cannot affect the contents of the resulting string.
56
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
57
`AbstractVector` types, `String(v)` already makes a copy.
58

59
When possible, the memory of `v` will be used without copying when the `String`
60
object is created. This is guaranteed to be the case for byte vectors returned
61
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
62
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
63
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
64
to guarantee consistent behavior.
65
"""
66
String(v::AbstractVector{UInt8}) = String(copyto!(StringVector(length(v)), v))
2,612,856✔
67
String(v::Vector{UInt8}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v)
32,841,271✔
68

69
"""
70
    unsafe_string(p::Ptr{UInt8}, [length::Integer])
71

72
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
73
(The pointer can be safely freed afterwards.) If `length` is specified
74
(the length of the data in bytes), the string does not have to be NUL-terminated.
75

76
This function is labeled "unsafe" because it will crash if `p` is not
77
a valid memory address to data of the requested length.
78
"""
79
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
1,097✔
80
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
666,508✔
81
    ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
666,507✔
82
end
83
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
10,208✔
84
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
3,195,907✔
85
    ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
3,195,906✔
86
end
87

88
# This is @assume_effects :effect_free :nothrow :terminates_globally @ccall jl_alloc_string(n::Csize_t)::Ref{String},
89
# but the macro is not available at this time in bootstrap, so we write it manually.
90
@eval _string_n(n::Integer) = $(Expr(:foreigncall, QuoteNode(:jl_alloc_string), Ref{String}, Expr(:call, Expr(:core, :svec), :Csize_t), 1, QuoteNode((:ccall,0xe)), :(convert(Csize_t, n))))
143,683,001✔
91

92
"""
93
    String(s::AbstractString)
94

95
Create a new `String` from an existing `AbstractString`.
96
"""
97
String(s::AbstractString) = print_to_string(s)
318✔
98
@assume_effects :total String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))
716,535✔
99

100
unsafe_wrap(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
31,684,521✔
101
unsafe_wrap(::Type{Vector{UInt8}}, s::FastContiguousSubArray{UInt8,1,Vector{UInt8}}) = unsafe_wrap(Vector{UInt8}, pointer(s), size(s))
×
102

103
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
20,925✔
104
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
20,868✔
105
Array{UInt8}(s::String)  = Vector{UInt8}(codeunits(s))
×
106

107
String(s::CodeUnits{UInt8,String}) = s.s
3✔
108

109
## low-level functions ##
110

111
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
454,295,481✔
112
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1
320,355,779✔
113

114
ncodeunits(s::String) = Core.sizeof(s)
357,988,016✔
115
codeunit(s::String) = UInt8
×
116

117
codeunit(s::String, i::Integer) = codeunit(s, Int(i))
×
118
@assume_effects :foldable @inline function codeunit(s::String, i::Int)
188,750✔
119
    @boundscheck checkbounds(s, i)
302,042,939✔
120
    b = GC.@preserve s unsafe_load(pointer(s, i))
302,042,939✔
121
    return b
302,042,939✔
122
end
123

124
## comparison ##
125

126
@assume_effects :total _memcmp(a::String, b::String) = @invoke _memcmp(a::Union{Ptr{UInt8},AbstractString},b::Union{Ptr{UInt8},AbstractString})
505,708✔
127

128
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}) = _memcmp(a, b, min(sizeof(a), sizeof(b)))
1,044,472✔
129
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len::Int)
105✔
130
    GC.@preserve a b begin
1,765,652✔
131
        pa = unsafe_convert(Ptr{UInt8}, a)
1,357,797✔
132
        pb = unsafe_convert(Ptr{UInt8}, b)
1,765,652✔
133
        memcmp(pa, pb, len % Csize_t) % Int
1,765,652✔
134
    end
135
end
136

137
function cmp(a::String, b::String)
1✔
138
    al, bl = sizeof(a), sizeof(b)
505,708✔
139
    c = _memcmp(a, b)
505,708✔
140
    return c < 0 ? -1 : c > 0 ? +1 : cmp(al,bl)
769,582✔
141
end
142

143
==(a::String, b::String) = a===b
10,529,898✔
144

145
typemin(::Type{String}) = ""
2✔
146
typemin(::String) = typemin(String)
1✔
147

148
## thisind, nextind ##
149

150
@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)
181,043,663✔
151

152
# s should be String or SubString{String}
153
@inline function _thisind_str(s, i::Int)
×
154
    i == 0 && return 0
91,226,341✔
155
    n = ncodeunits(s)
91,207,281✔
156
    i == n + 1 && return i
91,207,281✔
157
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
91,184,839✔
158
    @inbounds b = codeunit(s, i)
91,184,815✔
159
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
162,302,330✔
160
    @inbounds b = codeunit(s, i-1)
20,067,300✔
161
    between(b, 0b11000000, 0b11110111) && return i-1
20,067,300✔
162
    (b & 0xc0 == 0x80) & (i-2 > 0) || return i
15,659,965✔
163
    @inbounds b = codeunit(s, i-2)
7,170,769✔
164
    between(b, 0b11100000, 0b11110111) && return i-2
7,170,769✔
165
    (b & 0xc0 == 0x80) & (i-3 > 0) || return i
5,906,548✔
166
    @inbounds b = codeunit(s, i-3)
1,026,998✔
167
    between(b, 0b11110000, 0b11110111) && return i-3
1,026,998✔
168
    return i
241,004✔
169
end
170

171
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)
14,029,184✔
172

173
# s should be String or SubString{String}
174
@inline function _nextind_str(s, i::Int)
23,238,751✔
175
    i == 0 && return 1
23,238,751✔
176
    n = ncodeunits(s)
23,218,938✔
177
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
23,218,950✔
178
    @inbounds l = codeunit(s, i)
23,218,926✔
179
    (l < 0x80) | (0xf8 ≤ l) && return i+1
23,218,926✔
180
    if l < 0xc0
918,431✔
181
        i′ = @inbounds thisind(s, i)
51,357✔
182
        return i′ < i ? @inbounds(nextind(s, i′)) : i+1
30,403✔
183
    end
184
    # first continuation byte
185
    (i += 1) > n && return i
888,028✔
186
    @inbounds b = codeunit(s, i)
887,390✔
187
    b & 0xc0 ≠ 0x80 && return i
887,390✔
188
    ((i += 1) > n) | (l < 0xe0) && return i
880,134✔
189
    # second continuation byte
190
    @inbounds b = codeunit(s, i)
859,526✔
191
    b & 0xc0 ≠ 0x80 && return i
859,526✔
192
    ((i += 1) > n) | (l < 0xf0) && return i
855,893✔
193
    # third continuation byte
194
    @inbounds b = codeunit(s, i)
279,608✔
195
    ifelse(b & 0xc0 ≠ 0x80, i, i+1)
279,608✔
196
end
197

198
## checking UTF-8 & ASCII validity ##
199
#=
200
    The UTF-8 Validation is performed by a shift based DFA.
201
    ┌───────────────────────────────────────────────────────────────────┐
202
    │    UTF-8 DFA State Diagram    ┌──────────────2──────────────┐     │
203
    │                               ├────────3────────┐           │     │
204
    │                 ┌──────────┐  │     ┌─┐        ┌▼┐          │     │
205
    │      ASCII      │  UTF-8   │  ├─5──►│9├───1────► │          │     │
206
    │                 │          │  │     ├─┤        │ │         ┌▼┐    │
207
    │                 │  ┌─0─┐   │  ├─6──►│8├─1,7,9──►4├──1,7,9──► │    │
208
    │      ┌─0─┐      │  │   │   │  │     ├─┤        │ │         │ │    │
209
    │      │   │      │ ┌▼───┴┐  │  ├─11─►│7├──7,9───► │ ┌───────►3├─┐  │
210
    │     ┌▼───┴┐     │ │     │  ▼  │     └─┘        └─┘ │       │ │ │  │
211
    │     │  0  ├─────┘ │  1  ├─► ──┤                    │  ┌────► │ │  │
212
    │     └─────┘       │     │     │     ┌─┐            │  │    └─┘ │  │
213
    │                   └──▲──┘     ├─10─►│5├─────7──────┘  │        │  │
214
    │                      │        │     ├─┤               │        │  │
215
    │                      │        └─4──►│6├─────1,9───────┘        │  │
216
    │          INVALID     │              └─┘                        │  │
217
    │           ┌─*─┐      └──────────────────1,7,9──────────────────┘  │
218
    │          ┌▼───┴┐                                                  │
219
    │          │  2  ◄─── All undefined transitions result in state 2   │
220
    │          └─────┘                                                  │
221
    └───────────────────────────────────────────────────────────────────┘
222

223
        Validation States
224
            0 -> _UTF8_DFA_ASCII is the start state and will only stay in this state if the string is only ASCII characters
225
                        If the DFA ends in this state the string is ASCII only
226
            1 -> _UTF8_DFA_ACCEPT is the valid complete character state of the DFA once it has encountered a UTF-8 Unicode character
227
            2 -> _UTF8_DFA_INVALID is only reached by invalid bytes and once in this state it will not change
228
                    as seen by all 1s in that column of table below
229
            3 -> One valid continuation byte needed to return to state 1
230
        4,5,6 -> Two valid continuation bytes needed to return to state 1
231
        7,8,9 -> Three valid continuation bytes needed to return to state 1
232

233
                        Current State
234
                    0̲  1̲  2̲  3̲  4̲  5̲  6̲  7̲  8̲  9̲
235
                0 | 0  1  2  2  2  2  2  2  2  2
236
                1 | 2  2  2  1  3  2  3  2  4  4
237
                2 | 3  3  2  2  2  2  2  2  2  2
238
                3 | 4  4  2  2  2  2  2  2  2  2
239
                4 | 6  6  2  2  2  2  2  2  2  2
240
    Character   5 | 9  9  2  2  2  2  2  2  2  2     <- Next State
241
    Class       6 | 8  8  2  2  2  2  2  2  2  2
242
                7 | 2  2  2  1  3  3  2  4  4  2
243
                8 | 2  2  2  2  2  2  2  2  2  2
244
                9 | 2  2  2  1  3  2  3  4  4  2
245
               10 | 5  5  2  2  2  2  2  2  2  2
246
               11 | 7  7  2  2  2  2  2  2  2  2
247

248
           Shifts | 0  4 10 14 18 24  8 20 12 26
249

250
    The shifts that represent each state were derived using the SMT solver Z3, to ensure when encoded into
251
    the rows the correct shift was a result.
252

253
    Each character class row is encoding 10 states with shifts as defined above. By shifting the bits of a row by
254
    the current state and then masking the result with 0b11110 (0x1E) gives the shift for the new state
255

256

257
=#
258

259
#State type used by UTF-8 DFA
260
const _UTF8DFAState = UInt32
261
# Fill the table with 256 `_UTF8DFAState` (UInt32) entries representing the DFA transitions for all bytes
262
const _UTF8_DFA_TABLE = let # let block rather than function doesn't pollute base
263
    num_classes=12
264
    num_states=10
265
    bit_per_state = 6
266

267
    # These shifts were derived using a SMT solver
268
    state_shifts = [0, 4, 10, 14, 18, 24, 8, 20, 12, 26]
269

270
    character_classes = [   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
271
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
272
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
273
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
274
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
275
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
276
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
277
                            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
278
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
279
                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
280
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
281
                            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
282
                            8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
283
                            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
284
                            10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
285
                            11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 ]
286

287
    # These are the rows discussed in comments above
288
    state_arrays = [ 0  1  2  2  2  2  2  2  2  2;
289
                     2  2  2  1  3  2  3  2  4  4;
290
                     3  3  2  2  2  2  2  2  2  2;
291
                     4  4  2  2  2  2  2  2  2  2;
292
                     6  6  2  2  2  2  2  2  2  2;
293
                     9  9  2  2  2  2  2  2  2  2;
294
                     8  8  2  2  2  2  2  2  2  2;
295
                     2  2  2  1  3  3  2  4  4  2;
296
                     2  2  2  2  2  2  2  2  2  2;
297
                     2  2  2  1  3  2  3  4  4  2;
298
                     5  5  2  2  2  2  2  2  2  2;
299
                     7  7  2  2  2  2  2  2  2  2]
300

301
    #This converts the state_arrays into the shift encoded _UTF8DFAState
302
    class_row = zeros(_UTF8DFAState, num_classes)
303

304
    for i = 1:num_classes
305
        row = _UTF8DFAState(0)
306
        for j in 1:num_states
307
            #Calculate the shift required for the next state
308
            to_shift = UInt8((state_shifts[state_arrays[i,j]+1]) )
309
            #Shift the next state into the position of the current state
310
            row = row | (_UTF8DFAState(to_shift) << state_shifts[j])
311
        end
312
        class_row[i]=row
313
    end
314

315
    map(c->class_row[c+1],character_classes)
×
316
end
317

318

319
const _UTF8_DFA_ASCII = _UTF8DFAState(0) # State 0 (shift 0): the start state; the DFA remains here only while every byte seen is ASCII
320
const _UTF8_DFA_ACCEPT = _UTF8DFAState(4) # State 1 (shift 4): a complete, valid character has just ended after non-ASCII data was encountered
321
const _UTF8_DFA_INVALID = _UTF8DFAState(10) # State 2 (shift 10): absorbing error state; once reached the DFA never leaves it, so callers can stop
322

323
# The dfa step is broken out so that it may be used in other functions. The mask was calculated to work with state shifts above
324
@inline _utf_dfa_step(state::_UTF8DFAState, byte::UInt8) = @inbounds (_UTF8_DFA_TABLE[byte+1] >> state) & _UTF8DFAState(0x0000001E)
54,209✔
325

326
# Run the UTF-8 DFA over `bytes[first:last]`, starting from `state`, and return the
# resulting state. Despite the name, the result is the shift-encoded DFA state, not a
# Bool; it is meant to be compared against the `_UTF8_DFA_*` constants.
# An empty range (`first > last`) returns `state` unchanged.
@inline function _isvalid_utf8_dfa(state::_UTF8DFAState, bytes::AbstractVector{UInt8}, first::Int = firstindex(bytes), last::Int = lastindex(bytes))
25,752✔
327
    for i = first:last
51,258✔
328
       # Indexing is `@inbounds`: `first:last` is not bounds-checked here.
       @inbounds state = _utf_dfa_step(state, bytes[i])
54,209✔
329
    end
82,597✔
330
    return (state)
25,821✔
331
end
332

333
# Scan `cu[first:last]` in windows of `chunk_size` elements and return the start index of
# the first window for which `_isascii` is false, or `nothing` when every window passes.
# The returned index is the start of a window, not the position of the offending element.
# NOTE(review): the tail window below presumes the range holds at least `chunk_size`
# elements; the caller visible in this file only uses it for inputs longer than that.
@inline function  _find_nonascii_chunk(chunk_size,cu::AbstractVector{CU}, first,last) where {CU}
10✔
334
    n=first
10✔
335
    # Full windows [n, n+chunk_size-1] while more than `chunk_size` elements remain.
    while n <= last - chunk_size
40✔
336
        _isascii(cu,n,n+chunk_size-1) || return n
30✔
337
        n += chunk_size
30✔
338
    end
30✔
339
    # Final window is re-anchored to end exactly at `last`, so it may overlap the
    # previously checked window.
    n= last-chunk_size+1
10✔
340
    _isascii(cu,n,last) || return n
10✔
341
    return nothing
10✔
342
end
343

344
##
345

346
# Classifications of string
347
    # 0: neither valid ASCII nor UTF-8
348
    # 1: valid ASCII
349
    # 2: valid UTF-8
350
 byte_string_classify(s::AbstractString) = byte_string_classify(codeunits(s))
24✔
351

352

353
# Classify `bytes`: returns 1 when `_isascii` accepts the whole input, otherwise the
# result of `_byte_string_classify_nonascii` (2 for valid UTF-8, 0 for invalid).
# NOTE(review): indexes `bytes` as `1:length(bytes)` — presumably assumes 1-based
# indexing; confirm for vectors with offset axes.
function byte_string_classify(bytes::AbstractVector{UInt8})
20,283✔
354
    chunk_size = 1024
20,259✔
355
    # Inputs longer than 1.5 chunks (1536 bytes) take the chunked ASCII scan.
    chunk_threshold =  chunk_size + (chunk_size ÷ 2)
20,259✔
356
    n = length(bytes)
20,430✔
357
    if n > chunk_threshold
20,430✔
358
        # `start` becomes the start of the first chunk that is not pure ASCII.
        start = _find_nonascii_chunk(chunk_size,bytes,1,n)
10✔
359
        isnothing(start) && return 1
10✔
360
    else
361
        # Short input: a single ASCII check over everything.
        _isascii(bytes,1,n) && return 1
20,420✔
362
        start = 1
20,120✔
363
    end
364
    # Everything before `start` was accepted as ASCII; validate the rest with the DFA.
    return _byte_string_classify_nonascii(bytes,start,n)
20,189✔
365
end
366

367
# Validate `bytes[first:last]` as UTF-8 in chunks of 256 bytes.
# Returns 2 when the DFA finishes in `_UTF8_DFA_ACCEPT`, and 0 otherwise (an invalid byte
# was seen, or the input ends in the middle of a multi-byte sequence). Never returns 1.
function _byte_string_classify_nonascii(bytes::AbstractVector{UInt8}, first::Int, last::Int)
20,189✔
368
    chunk_size = 256
20,120✔
369

370
    # `start:stop` is the current chunk; `stop` is clamped to `last`.
    start = first
20,120✔
371
    stop = min(last,first + chunk_size - 1)
20,189✔
372
    state = _UTF8_DFA_ACCEPT
20,120✔
373
    while start <= last
24,926✔
374
        # Fast path: while the DFA sits in the ACCEPT state, skip whole chunks that
        # `_isascii` accepts, without running the DFA over them.
375
        while state == _UTF8_DFA_ACCEPT
20,189✔
376
            _isascii(bytes,start,stop) || break
20,189✔
377
            (start = start + chunk_size) <= last || break
×
378
            stop = min(last,stop + chunk_size)
×
379
        end
×
380
        # Run the DFA over the current chunk. If the skip loop above ran past `last`,
        # `start:stop` is an empty range here and `state` is left unchanged.
381
        state = _isvalid_utf8_dfa(state,bytes,start,stop)
48,577✔
382
        # INVALID is absorbing, so stop as soon as it is reached.
        state == _UTF8_DFA_INVALID && return 0
20,189✔
383

384
        start = start + chunk_size
4,737✔
385
        stop = min(last,stop + chunk_size)
4,737✔
386
    end
4,737✔
387
    # Any state other than ACCEPT at the end of input is reported as invalid (0).
    return ifelse(state == _UTF8_DFA_ACCEPT,2,0)
4,737✔
388
end
389

390
isvalid(::Type{String}, bytes::AbstractVector{UInt8}) = (@inline byte_string_classify(bytes)) ≠ 0
20,406✔
391
isvalid(::Type{String}, s::AbstractString) =  (@inline byte_string_classify(s)) ≠ 0
24✔
392

393
@inline isvalid(s::AbstractString) = @inline isvalid(String, codeunits(s))
147✔
394

395
# Return `true` when the top two bits of `c` are `10` (bit pattern `10xxxxxx`), the same
# test this file applies to continuation bytes.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
×
396

397
## required core functionality ##
398

399
@inline function iterate(s::String, i::Int=firstindex(s))
31,535✔
400
    (i % UInt) - 1 < ncodeunits(s) || return nothing
29,197,225✔
401
    b = @inbounds codeunit(s, i)
20,168,028✔
402
    u = UInt32(b) << 24
20,168,028✔
403
    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
39,411,665✔
404
    return iterate_continued(s, i, u)
924,391✔
405
end
406

407
function iterate_continued(s::String, i::Int, u::UInt32)
924,391✔
408
    u < 0xc0000000 && (i += 1; @goto ret)
925,446✔
409
    n = ncodeunits(s)
923,336✔
410
    # first continuation byte
411
    (i += 1) > n && @goto ret
923,336✔
412
    @inbounds b = codeunit(s, i)
922,461✔
413
    b & 0xc0 == 0x80 || @goto ret
922,461✔
414
    u |= UInt32(b) << 16
920,605✔
415
    # second continuation byte
416
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
920,605✔
417
    @inbounds b = codeunit(s, i)
855,203✔
418
    b & 0xc0 == 0x80 || @goto ret
855,325✔
419
    u |= UInt32(b) << 8
855,081✔
420
    # third continuation byte
421
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
855,081✔
422
    @inbounds b = codeunit(s, i)
273,720✔
423
    b & 0xc0 == 0x80 || @goto ret
273,722✔
424
    u |= UInt32(b); i += 1
547,436✔
425
@label ret
×
426
    return reinterpret(Char, u), i
924,391✔
427
end
428

429
@propagate_inbounds function getindex(s::String, i::Int)
7,750✔
430
    b = codeunit(s, i)
11,267,885✔
431
    u = UInt32(b) << 24
11,267,885✔
432
    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
21,745,623✔
433
    return getindex_continued(s, i, u)
790,147✔
434
end
435

436
function getindex_continued(s::String, i::Int, u::UInt32)
790,147✔
437
    if u < 0xc0000000
790,147✔
438
        # called from `getindex` which checks bounds
439
        @inbounds isvalid(s, i) && @goto ret
16✔
440
        string_index_err(s, i)
1✔
441
    end
442
    n = ncodeunits(s)
790,131✔
443

444
    (i += 1) > n && @goto ret
790,131✔
445
    @inbounds b = codeunit(s, i) # cont byte 1
790,130✔
446
    b & 0xc0 == 0x80 || @goto ret
790,130✔
447
    u |= UInt32(b) << 16
790,119✔
448

449
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
790,119✔
450
    @inbounds b = codeunit(s, i) # cont byte 2
785,824✔
451
    b & 0xc0 == 0x80 || @goto ret
785,824✔
452
    u |= UInt32(b) << 8
785,824✔
453

454
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
785,824✔
455
    @inbounds b = codeunit(s, i) # cont byte 3
259,330✔
456
    b & 0xc0 == 0x80 || @goto ret
259,330✔
457
    u |= UInt32(b)
259,330✔
458
@label ret
×
459
    return reinterpret(Char, u)
790,146✔
460
end
461

462
getindex(s::String, r::AbstractUnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
4✔
463

464
@inline function getindex(s::String, r::UnitRange{Int})
417,017✔
465
    isempty(r) && return ""
433,963✔
466
    i, j = first(r), last(r)
69,125✔
467
    @boundscheck begin
410,643✔
468
        checkbounds(s, r)
410,625✔
469
        @inbounds isvalid(s, i) || string_index_err(s, i)
410,613✔
470
        @inbounds isvalid(s, j) || string_index_err(s, j)
410,614✔
471
    end
472
    j = nextind(s, j) - 1
410,636✔
473
    n = j - i + 1
410,636✔
474
    ss = _string_n(n)
410,636✔
475
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
410,636✔
476
    return ss
410,636✔
477
end
478

479
# nothrow because we know the start and end indices are valid
480
@assume_effects :nothrow length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))
869,003✔
481

482
# effects needed because @inbounds
483
@assume_effects :consistent :effect_free @inline function length(s::String, i::Int, j::Int)
68,251✔
484
    @boundscheck begin
181,097✔
485
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
181,097✔
486
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
181,101✔
487
    end
488
    j < i && return 0
181,093✔
489
    @inbounds i, k = thisind(s, i), i
263,308✔
490
    c = j - i + (i == k)
131,654✔
491
    @inbounds length_continued(s, i, j, c)
131,654✔
492
end
493

494
@assume_effects :terminates_locally @inline @propagate_inbounds function length_continued(s::String, i::Int, n::Int, c::Int)
897,052✔
495
    i < n || return c
1,169,548✔
496
    b = codeunit(s, i)
831,766✔
497
    while true
4,258,767✔
498
        while true
50,617,302✔
499
            (i += 1) ≤ n || return c
51,448,194✔
500
            0xc0 ≤ b ≤ 0xf7 && break
49,786,410✔
501
            b = codeunit(s, i)
46,358,535✔
502
        end
46,358,535✔
503
        l = b
3,427,875✔
504
        b = codeunit(s, i) # cont byte 1
3,427,875✔
505
        c -= (x = b & 0xc0 == 0x80)
3,427,875✔
506
        x & (l ≥ 0xe0) || continue
3,427,875✔
507

508
        (i += 1) ≤ n || return c
3,026,547✔
509
        b = codeunit(s, i) # cont byte 2
3,024,891✔
510
        c -= (x = b & 0xc0 == 0x80)
3,024,891✔
511
        x & (l ≥ 0xf0) || continue
5,152,684✔
512

513
        (i += 1) ≤ n || return c
897,144✔
514
        b = codeunit(s, i) # cont byte 3
897,052✔
515
        c -= (b & 0xc0 == 0x80)
897,052✔
516
    end
3,427,001✔
517
end
518

519
## overload methods for efficiency ##
520

521
# An index `i` is valid for `s` when it is in bounds and `thisind` maps it to itself.
# Out-of-bounds indices yield `false` rather than throwing.
function isvalid(s::String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end
83,352,074✔
522

523
isascii(s::String) = isascii(codeunits(s))
635,113✔
524

525
# don't assume effects for general integers since we cannot know their implementation
526
@assume_effects :foldable repeat(c::Char, r::BitInteger) = @invoke repeat(c::Char, r::Integer)
25,466✔
527

528
"""
529
    repeat(c::AbstractChar, r::Integer) -> String
530

531
Repeat a character `r` times. This can equivalently be accomplished by calling
532
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).
533

534
# Examples
535
```jldoctest
536
julia> repeat('A', 3)
537
"AAA"
538
```
539
"""
540
function repeat(c::AbstractChar, r::Integer)
25,370✔
541
    c = Char(c)::Char
25,370✔
542
    r == 0 && return ""
25,370✔
543
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
21,217✔
544
    u = bswap(reinterpret(UInt32, c))
21,213✔
545
    n = 4 - (leading_zeros(u | 0xff) >> 3)
21,213✔
546
    s = _string_n(n*r)
21,323✔
547
    p = pointer(s)
21,321✔
548
    GC.@preserve s if n == 1
21,321✔
549
        memset(p, u % UInt8, r)
21,199✔
550
    elseif n == 2
122✔
551
        p16 = reinterpret(Ptr{UInt16}, p)
7✔
552
        for i = 1:r
13✔
553
            unsafe_store!(p16, u % UInt16, i)
26✔
554
        end
31✔
555
    elseif n == 3
115✔
556
        b1 = (u >> 0) % UInt8
111✔
557
        b2 = (u >> 8) % UInt8
111✔
558
        b3 = (u >> 16) % UInt8
111✔
559
        for i = 0:r-1
222✔
560
            unsafe_store!(p, b1, 3i + 1)
624✔
561
            unsafe_store!(p, b2, 3i + 2)
624✔
562
            unsafe_store!(p, b3, 3i + 3)
624✔
563
        end
624✔
564
    elseif n == 4
4✔
565
        p32 = reinterpret(Ptr{UInt32}, p)
4✔
566
        for i = 1:r
8✔
567
            unsafe_store!(p32, u, i)
8✔
568
        end
21,329✔
569
    end
570
    return s
21,211✔
571
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc