• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37658

20 Oct 2023 08:24PM UTC coverage: 87.459% (-0.5%) from 87.929%
#37658

push

local

web-flow
fix unicode indexing in parse(Complex, string) (#51758)

This fixes a string-indexing bug introduced in #24713 (Julia 0.7).
Sometimes, this would cause `parse(Complex{T}, string)` to throw a
`StringIndexError` rather than an `ArgumentError`, e.g. for
`parse(ComplexF64, "3 β+ 4im")` or `parse(ComplexF64, "3 + 4αm")`. (As
far as I can tell, it can never cause parsing to fail for valid
strings.)

The source of the error is that if `i` is the index of an ASCII
character in a string `s`, then `s[i+1]` is valid (even if the next
character is non-ASCII) but `s[i-1]` is invalid if the previous
character is non-ASCII.

5 of 5 new or added lines in 1 file covered. (100.0%)

73572 of 84122 relevant lines covered (87.46%)

11577017.06 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.88
/base/strings/substring.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

"""
    SubString(s::AbstractString, i::Integer, j::Integer=lastindex(s))
    SubString(s::AbstractString, r::UnitRange{<:Integer})

Return a view into the parent string `s` spanning the range `i:j` (or `r`). This behaves
like [`getindex`](@ref), except that the characters are not copied.

Within a block of code, the [`@views`](@ref) macro turns every string slice `s[i:j]`
into the substring `SubString(s, i, j)`.

# Examples
```jldoctest
julia> SubString("abc", 1, 2)
"ab"

julia> SubString("abc", 1:2)
"ab"

julia> SubString("abc", 2)
"bc"
```
"""
struct SubString{T<:AbstractString} <: AbstractString
    string::T
    offset::Int
    ncodeunits::Int

    # View of `s[i:j]`; an empty range (`i > j`) yields a zero-offset, zero-length view.
    function SubString{T}(s::T, i::Int, j::Int) where T<:AbstractString
        if !(i ≤ j)
            return new(s, 0, 0)
        end
        @boundscheck begin
            checkbounds(s, i:j)
            @inbounds isvalid(s, i) || string_index_err(s, i)
            @inbounds isvalid(s, j) || string_index_err(s, j)
        end
        offset = i - 1
        ncu = nextind(s, j) - i
        return new(s, offset, ncu)
    end

    # `Val(:noshift)` variant: `i` and `j` are stored directly as the `offset` and
    # `ncodeunits` fields instead of being interpreted as an index range.
    function SubString{T}(s::T, i::Int, j::Int, ::Val{:noshift}) where T<:AbstractString
        @boundscheck begin
            si = i + 1
            sj = prevind(s, j + i + 1)
            @inbounds isvalid(s, si) || string_index_err(s, si)
            @inbounds isvalid(s, sj) || string_index_err(s, sj)
        end
        return new(s, i, j)
    end
end
48

49
# Outer constructors: supply the type parameter and normalize the index arguments
# before forwarding to the inner constructors.
@propagate_inbounds function SubString(s::T, i::Int, j::Int) where {T<:AbstractString}
    return SubString{T}(s, i, j)
end
@propagate_inbounds function SubString(s::T, i::Int, j::Int, v::Val{:noshift}) where {T<:AbstractString}
    return SubString{T}(s, i, j, v)
end
@propagate_inbounds function SubString(s::AbstractString, i::Integer, j::Integer=lastindex(s))
    return SubString(s, Int(i), Int(j))
end
@propagate_inbounds function SubString(s::AbstractString, r::AbstractUnitRange{<:Integer})
    return SubString(s, first(r), last(r))
end
99,961✔
53

54
# Taking a substring of a substring: index directly into the parent string so the
# result is not a nested view.
@propagate_inbounds function SubString(s::SubString, i::Int, j::Int)
    @boundscheck begin
        if i ≤ j
            checkbounds(s, i:j)
        end
    end
    shift = s.offset
    return SubString(s.string, shift + i, shift + j)
end
58

59
# Whole-string views: cover `s` from index 1 through `lastindex(s)`.
function SubString(s::AbstractString)
    stop = lastindex(s)::Int
    return SubString(s, 1, stop)
end
function SubString{T}(s::T) where {T<:AbstractString}
    stop = lastindex(s)::Int
    return SubString{T}(s, 1, stop)
end
×
61

62
# A range "view" of a string is a `SubString`; `maybeview` with any other index
# arguments falls back to ordinary `getindex`.
@propagate_inbounds function view(s::AbstractString, r::AbstractUnitRange{<:Integer})
    return SubString(s, r)
end
@propagate_inbounds function maybeview(s::AbstractString, r::AbstractUnitRange{<:Integer})
    return view(s, r)
end
@propagate_inbounds function maybeview(s::AbstractString, args...)
    return getindex(s, args...)
end
6✔
65

66
# Converting to `SubString{S}`: convert the string to `S` first, then wrap all of it.
function convert(::Type{SubString{S}}, s::AbstractString) where {S<:AbstractString}
    converted = convert(S, s)
    return SubString(converted)::SubString{S}
end
# A substring that already has the requested type is returned unchanged.
function convert(::Type{T}, s::T) where {T<:SubString}
    return s
end
3✔
69

70
# Regex match allows only Union{String, SubString{String}}, so define conversions to
# that union: members pass through untouched, any other string becomes a `String`.
function convert(::Type{Union{String, SubString{String}}}, s::String)
    return s
end
function convert(::Type{Union{String, SubString{String}}}, s::SubString{String})
    return s
end
function convert(::Type{Union{String, SubString{String}}}, s::AbstractString)
    return convert(String, s)::String
end
×
74

75
# Materialize the view as a standalone `String` built from `s.ncodeunits` bytes of the
# parent, starting at parent position `s.offset + 1`.
function String(s::SubString{String})
    src = s.string
    # keep the parent rooted while its raw pointer is in use
    return GC.@preserve src unsafe_string(pointer(src, s.offset+1), s.ncodeunits)
end
80

81
# Size queries: the code unit count is stored in the view, the code unit type comes
# from the parent, and the character count is taken over the parent's covered range.
function ncodeunits(s::SubString)
    return s.ncodeunits
end
function codeunit(s::SubString)
    return codeunit(s.string)::CodeunitType
end
function length(s::SubString)
    lo = s.offset + 1
    hi = s.offset + s.ncodeunits
    return length(s.string, lo, hi)
end
148,859✔
84

85
# Code unit `i` of the view is code unit `s.offset + i` of the parent.
function codeunit(s::SubString, i::Integer)
    @boundscheck checkbounds(s, i)
    parent_index = s.offset + i
    @inbounds return codeunit(s.string, parent_index)
end
89

90
# Iterate by delegating to the parent string and translating the returned next index
# back into the view's own index space.
function iterate(s::SubString, i::Integer=firstindex(s))
    # one past the last code unit means the iteration is finished
    if i == ncodeunits(s) + 1
        return nothing
    end
    @boundscheck checkbounds(s, i)
    step = iterate(s.string, s.offset + i)
    if step === nothing
        return nothing
    end
    ch, parent_next = step::Tuple{AbstractChar,Int}
    return ch, parent_next - s.offset
end
98

99
# Character at view index `i`, read from the parent at `s.offset + i`.
function getindex(s::SubString, i::Integer)
    @boundscheck checkbounds(s, i)
    parent_index = s.offset + i
    @inbounds return getindex(s.string, parent_index)
end
103

104
# ASCII check performed on the view's code units rather than on its characters.
function isascii(ss::SubString{String})
    return isascii(codeunits(ss))
end
×
105

106
# `i` is a valid index of the view when it is in bounds (unless bounds checking is
# elided) and `s.offset + i` is a valid index of the parent.
function isvalid(s::SubString, i::Integer)
    inrange = true
    @boundscheck inrange = checkbounds(Bool, s, i)
    inrange || return false
    @inbounds return isvalid(s.string, s.offset + i)::Bool
end
111

112
# Index arithmetic for `SubString{String}` is delegated to the `_thisind_str` and
# `_nextind_str` helpers.
function thisind(s::SubString{String}, i::Int)
    return _thisind_str(s, i)
end
function nextind(s::SubString{String}, i::Int)
    return _nextind_str(s, i)
end
9,710,686✔
114

115
# `parent` returns the string being viewed; `parentindices` returns, as a 1-tuple,
# the range of parent indices that the view covers.
function parent(s::SubString)
    return s.string
end
function parentindices(s::SubString)
    start = s.offset + 1
    stop = thisind(s.string, s.offset + s.ncodeunits)
    return (start:stop,)
end
1✔
117

118
# Equality for `String`/`SubString{String}` pairs: the byte sizes must match and
# `_memcmp` must report no difference.
function ==(a::Union{String, SubString{String}}, b::Union{String, SubString{String}})
    if sizeof(a) != sizeof(b)
        return false
    end
    return _memcmp(a, b) == 0
end
121

122
# Three-way comparison: the sign of `_memcmp` decides, and when it reports no
# difference the byte sizes are compared instead.
function cmp(a::SubString{String}, b::SubString{String})
    c = _memcmp(a, b)
    if c < 0
        return -1
    elseif c > 0
        return +1
    else
        return cmp(sizeof(a), sizeof(b))
    end
end
126

127
# don't make unnecessary copies when passing substrings to C functions:
# `cconvert` hands back the substring itself for byte-pointer argument types
function cconvert(::Type{Ptr{UInt8}}, s::SubString{String})
    return s
end
function cconvert(::Type{Ptr{Int8}}, s::SubString{String})
    return s
end
2✔
130

131
# Raw byte pointer for a substring: the parent's pointer advanced by `s.offset`.
function unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where R<:Union{Int8, UInt8}
    base = convert(Ptr{R}, pointer(s.string))
    return base + s.offset
end
134

135
# Pointers into a substring are computed from the parent's pointer plus `x.offset`
# (and plus `i-1` for the indexed form).
function pointer(x::SubString{String})
    return pointer(x.string) + x.offset
end
function pointer(x::SubString{String}, i::Integer)
    return pointer(x.string) + x.offset + (i-1)
end
65✔
137

138
# Hash the substring's bytes with the C routine named by `memhash`, mixing in
# `memhash_seed` both before and after the call.
function hash(s::SubString{String}, h::UInt)
    seeded = h + memhash_seed
    digest = ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s, sizeof(s), seeded % UInt32)
    return digest + seeded
end
142

"""
    reverse(s::AbstractString) -> AbstractString

Reverse a string. Strictly speaking, this reverses the codepoints of the string; it is
mainly useful for reversed-order string processing, in particular reversed
regular-expression searches. See also [`reverseind`](@ref) for converting indices in `s`
to indices in `reverse(s)` and vice-versa, and `graphemes` from module `Unicode` for
operating on user-visible "characters" (graphemes) instead of codepoints.
See also [`Iterators.reverse`](@ref) for reverse-order iteration without making a copy.
Custom string types must implement `reverse` themselves and should typically return a
string of the same type and encoding. If they return a string with a different encoding,
they must also override `reverseind` for that string type so that
`s[reverseind(s,i)] == reverse(s)[i]` holds.

# Examples
```jldoctest
julia> reverse("JuliaLang")
"gnaLailuJ"
```

!!! note
    The examples below may be rendered differently on different systems.
    The comments indicate how they're supposed to be rendered.

Combining characters can lead to surprising results:

```jldoctest
julia> reverse("ax̂e") # hat is above x in the input, above e in the output
"êxa"

julia> using Unicode

julia> join(reverse(collect(graphemes("ax̂e")))) # reverses graphemes; hat is above x in both in- and output
"ex̂a"
```
"""
function reverse(s::Union{String,SubString{String}})::String
    # Walk `s` front to back while filling `buf` from its end towards its start
    nbytes = sizeof(s)
    buf = _string_n(nbytes)
    writepos = nbytes + 1
    for ch in s
        writepos -= ncodeunits(ch)
        __unsafe_string!(buf, ch, writepos)
    end
    return buf
end
189

190
# Single-argument `string` for the native string types goes through `String`.
function string(a::String)
    return String(a)
end
function string(a::SubString{String})
    return String(a)
end
81✔
192

193
# Build a `Symbol` from the substring's bytes via `jl_symbol_n`, passing the byte
# count explicitly.
function Symbol(s::SubString{String})
    nbytes = sizeof(s)
    return ccall(:jl_symbol_n, Ref{Symbol}, (Ptr{UInt8}, Int), s, nbytes)
end
196

197
# Store the code units of `c` into `out` starting at position `offs` and return how
# many were stored. `out` is a (new) String (or StringVector).
@inline function __unsafe_string!(out, c::Char, offs::Integer)
    # byte-swap the raw 32-bit representation so the low byte is the first one stored
    bits = bswap(reinterpret(UInt32, c))
    n = ncodeunits(c)
    GC.@preserve out begin
        unsafe_store!(pointer(out, offs), bits % UInt8)
        if n == 1
            return n
        end
        unsafe_store!(pointer(out, offs+1), (bits >> 8) % UInt8)
        if n == 2
            return n
        end
        unsafe_store!(pointer(out, offs+2), (bits >> 16) % UInt8)
        if n == 3
            return n
        end
        unsafe_store!(pointer(out, offs+3), (bits >> 24) % UInt8)
    end
    return n
end
214

215
# Copy all bytes of `s` into `out` starting at position `offs`; returns the byte count.
@assume_effects :nothrow @inline function __unsafe_string!(out, s::String, offs::Integer)
    nbytes = sizeof(s)
    GC.@preserve s out begin
        unsafe_copyto!(pointer(out, offs), pointer(s), nbytes)
    end
    return nbytes
end
220

221
# Copy the bytes covered by the substring `s` into `out` starting at position `offs`;
# returns the byte count.
@inline function __unsafe_string!(out, s::SubString{String}, offs::Integer)
    nbytes = sizeof(s)
    GC.@preserve s out begin
        unsafe_copyto!(pointer(out, offs), pointer(s), nbytes)
    end
    return nbytes
end
226

227
# Copy the bytes of the symbol's name into `out` starting at position `offs`; returns
# the byte count.
@assume_effects :nothrow @inline function __unsafe_string!(out, s::Symbol, offs::Integer)
    nbytes = sizeof(s)
    GC.@preserve s out begin
        unsafe_copyto!(pointer(out, offs), unsafe_convert(Ptr{UInt8},s), nbytes)
    end
    return nbytes
end
232

233
# `:nothrow` has to be asserted here: for the `for v in a` loops in `_string` the
# compiler can't prove the indexing is inbounds.
@assume_effects :foldable :nothrow function string(a::Union{Char, String, Symbol}...)
    return _string(a...)
end

# Same concatenation path, but without asserted effects once `SubString{String}`
# arguments are allowed.
function string(a::Union{Char, String, SubString{String}, Symbol}...)
    return _string(a...)
end
902,829✔
237

238
# Concatenate the arguments into one `String` in two passes: first add up the number
# of bytes required, then copy every piece into the freshly allocated result.
function _string(a::Union{Char, String, SubString{String}, Symbol}...)
    total = 0
    for v in a
        # 4 types is too many for automatic Union-splitting, so we split manually
        # and allow one specializable call site per concrete type
        if v isa Char
            total += ncodeunits(v)
        elseif v isa String
            total += sizeof(v)
        elseif v isa SubString{String}
            total += sizeof(v)
        else
            total += sizeof(v::Symbol)
        end
    end
    out = _string_n(total)
    # `pos` is the next write position in `out`; every copy returns how far to advance
    pos = 1
    for v in a
        if v isa Char
            pos += __unsafe_string!(out, v, pos)
        elseif v isa String || v isa SubString{String}
            pos += __unsafe_string!(out, v, pos)
        else
            pos += __unsafe_string!(out, v::Symbol, pos)
        end
    end
    return out
end
266

267
# Effects are only asserted for `BitInteger` counts: the implementation of a general
# integer type is unknown, so nothing can be assumed about it.
# Not `:nothrow`, because r<0 throws.
@assume_effects :foldable function repeat(s::String, r::BitInteger)
    return @invoke repeat(s::String, r::Integer)
end
982,705✔
270

271
# Specialized `repeat` for `String` and `SubString{String}`: the result is allocated
# once and filled with byte copies of `s`.
# Throws an `ArgumentError` when `r` is negative, and an `OverflowError` when the byte
# size of the result, `sizeof(s) * r`, overflows.
function repeat(s::Union{String, SubString{String}}, r::Integer)
    r < 0 && throw(ArgumentError("can't repeat a string $r times"))
    r == 0 && return ""
    r == 1 && return String(s)
    n = sizeof(s)
    # nothing to copy for an empty string; this also avoids `r` zero-byte copies below
    n == 0 && return ""
    # The size must be computed with overflow checking: a product that wraps around to
    # a small value would under-allocate `out`, and the copies below would then write
    # past its end.
    out = _string_n(Base.checked_mul(n, r))
    if n == 1 # common case: repeating a single-byte string
        @inbounds b = codeunit(s, 1)
        memset(unsafe_convert(Ptr{UInt8}, out), b, r)
    else
        for i = 0:r-1
            GC.@preserve s out unsafe_copyto!(pointer(out, i*n+1), pointer(s), n)
        end
    end
    return out
end
287

288
# Keep the characters of `s` for which `f` returns `true`. They are written into a
# byte buffer sized for all of `s`, which is then trimmed to the bytes actually used.
function filter(f, s::Union{String, SubString{String}})
    buf = StringVector(sizeof(s))
    writepos = 1
    for ch in s
        f(ch) || continue
        writepos += __unsafe_string!(buf, ch, writepos)
    end
    used = writepos - 1
    resize!(buf, used)
    sizehint!(buf, used)
    return String(buf)
end
300

301
# Generic range indexing of a string is expressed as a `SubString` of `s`.
function getindex(s::AbstractString, r::AbstractUnitRange{<:Integer})
    return SubString(s, r)
end
17,626✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc