• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

neon-sunset / U8String / 5912240727

19 Aug 2023 03:39PM UTC coverage: 22.685% (+0.3%) from 22.422%
5912240727

push

github

neon-sunset
feat: enumerators improvements

122 of 776 branches covered (15.72%)

Branch coverage included in aggregate %.

65 of 65 new or added lines in 4 files covered. (100.0%)

439 of 1697 relevant lines covered (25.87%)

30109.06 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

52.29
/src/U8String.Enumeration.cs
1
using System.Buffers;
2
using System.Collections;
3
using System.Diagnostics;
4
using System.Runtime.InteropServices;
5
using System.Text;
6
using U8Primitives.Abstractions;
7

8
using Rune = System.Text.Rune;
9

10
namespace U8Primitives;
11

12
#pragma warning disable IDE0032, IDE0057 // Use auto property and index operator. Why: Perf, struct layout, accuracy and codegen.
13
public readonly partial struct U8String
14
{
15
    /// <summary>
16
    /// Returns a collection of <see cref="char"/>s over the provided string.
17
    /// </summary>
18
    /// <remarks>
19
    /// This is a lazily-evaluated allocation-free collection.
20
    /// </remarks>
21
    public U8Chars Chars
22
    {
23
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
24
        get => new(this);
24✔
25
    }
26

27
    /// <summary>
28
    /// Returns a collection of <see cref="Rune"/>s over the provided string.
29
    /// </summary>
30
    /// <remarks>
31
    /// This is a lazily-evaluated allocation-free collection.
32
    /// </remarks>
33
    public U8Runes Runes
34
    {
35
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
36
        get => new(this);
30✔
37
    }
38

39
    /// <summary>
40
    /// Returns a collection of lines over the provided string.
41
    /// </summary>
42
    /// <remarks>
43
    /// This is a lazily-evaluated allocation-free collection.
44
    /// </remarks>
45
    public U8Lines Lines
46
    {
47
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
48
        get => new(this);
×
49
    }
50

51
    // Bad codegen still :(
52
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
53
    public Enumerator GetEnumerator() => new(this);
12✔
54

55
    IEnumerator<byte> IEnumerable<byte>.GetEnumerator() => GetEnumerator();
×
56
    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
6✔
57

58
    public struct Enumerator : IEnumerator<byte>
59
    {
60
        readonly byte[]? _value;
61
        readonly int _offset;
62
        readonly int _length;
63
        int _index;
64

65
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
66
        public Enumerator(U8String value)
67
        {
68
            _value = value._value;
12✔
69
            _offset = value.Offset;
12✔
70
            _length = value.Length;
12✔
71
            _index = -1;
12✔
72
        }
12✔
73

74
        // Still cheaper than MemoryMarshal clever variants
75
        public readonly byte Current => _value![_offset + _index];
3,138✔
76

77
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
78
        public bool MoveNext() => (uint)(++_index) < (uint)_length;
3,156✔
79
        // {
80
        //     var index = _index;
81
        //     if (++index < _length)
82
        //     {
83
        //         // Current = Unsafe.Add(
84
        //         //     ref MemoryMarshal.GetArrayDataReference(_value!),
85
        //         //     (nint)(uint)(_offset + index));
86
        //         Current = _value![_offset + index];
87
        //         _index = index;
88
        //         return true;
89
        //     }
90

91
        //     return false;
92
        // }
93

94
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
95
        public void Reset() => _index = -1;
×
96

97
        readonly object IEnumerator.Current => Current;
1,569✔
98
        readonly void IDisposable.Dispose() { }
6✔
99
    }
100
}
101

102
/// <summary>
103
/// A collection of chars in a provided <see cref="U8String"/>.
104
/// </summary>
105
public struct U8Chars : ICollection<char>, IEnumerable<char, U8Chars.Enumerator>
106
{
107
    readonly U8String _value;
108

109
    int _count;
110

111
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
112
    public U8Chars(U8String value)
113
    {
114
        _value = value;
24✔
115
        _count = value.IsEmpty ? 0 : -1;
24✔
116
    }
24✔
117

118
    /// <summary>
119
    /// The number of chars in the current <see cref="U8String"/>.
120
    /// </summary>
121
    public int Count
122
    {
123
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
124
        get
125
        {
126
            // Somehow the codegen here is underwhelming
127
            var count = _count;
23✔
128
            if (count >= 0)
23✔
129
            {
130
                return count;
8✔
131
            }
132
            return _count = Count(_value.UnsafeSpan);
15✔
133

134
            static int Count(ReadOnlySpan<byte> value)
135
            {
136
                Debug.Assert(!value.IsEmpty);
137

138
                // TODO: Is this enough?
139
                return Encoding.UTF8.GetCharCount(value);
15✔
140
            }
141
        }
142
    }
143

144
    // TODO: Wow, this seems to be terribly broken on surrogate chars and 
145
    // there is no easy way to fix it without sacrificing performance.
146
    // Perhaps it is worth just doing the transcoding iteration here and warning the users
147
    // instead of straight up producing UB or throwing exceptions???
148
    public readonly bool Contains(char item) => _value.Contains(item);
×
149

150
    public readonly void CopyTo(char[] destination, int index)
151
    {
152
        var value = _value;
×
153
        if (!value.IsEmpty)
×
154
        {
155
            Encoding.UTF8.GetChars(value.UnsafeSpan, destination.AsSpan()[index..]);
×
156
        }
157
    }
×
158

159
    public readonly void Deconstruct(out char first, out char second)
160
    {
161
        this.Deconstruct<U8Chars, Enumerator, char>(out first, out second);
×
162
    }
×
163

164
    public readonly void Deconstruct(out char first, out char second, out char third)
165
    {
166
        this.Deconstruct<U8Chars, Enumerator, char>(out first, out second, out third);
×
167
    }
×
168

169
    public char[] ToArray()
170
    {
171
        var value = _value;
6✔
172
        if (!value.IsEmpty)
6✔
173
        {
174
            var chars = new char[Count];
5✔
175
            Encoding.UTF8.GetChars(value.UnsafeSpan, chars);
5✔
176
            return chars;
5✔
177
        }
178

179
        return Array.Empty<char>();
1✔
180
    }
181

182
    public List<char> ToList()
183
    {
184
        var value = _value;
×
185
        if (!value.IsEmpty)
×
186
        {
187
            var count = Count;
×
188
            var chars = new List<char>(count);
×
189
            CollectionsMarshal.SetCount(chars, count);
×
190
            var span = CollectionsMarshal.AsSpan(chars);
×
191

192
            Encoding.UTF8.GetChars(value.UnsafeSpan, span);
×
193
            return chars;
×
194
        }
195

196
        return new List<char>();
×
197
    }
198

199
    public readonly Enumerator GetEnumerator() => new(_value);
6✔
200

201
    readonly IEnumerator<char> IEnumerable<char>.GetEnumerator() => new Enumerator(_value);
×
202
    readonly IEnumerator IEnumerable.GetEnumerator() => new Enumerator(_value);
6✔
203

204
    /// <summary>
    /// Enumerates the UTF-16 code units obtained by transcoding the UTF-8 bytes
    /// of a <see cref="U8String"/> one code point at a time.
    /// </summary>
    /// <remarks>
    /// A code point outside the BMP is yielded as two consecutive chars:
    /// the high surrogate first, then the low surrogate.
    /// </remarks>
    public struct Enumerator : IEnumerator<char>
    {
        // TODO: refactor layout
        readonly byte[]? _value;
        readonly int _offset;
        readonly int _length;

        // Index (relative to _offset) of the first byte that has not been decoded yet.
        int _nextByteIdx;

        // Low 16 bits: the char exposed by Current.
        // High 16 bits: a low surrogate that still has to be yielded; zero when nothing is pending.
        uint _currentCharPair;

        /// <summary>
        /// Creates a new char enumerator over the provided string.
        /// </summary>
        /// <param name="value">The string to enumerate over.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public Enumerator(U8String value)
        {
            _value = value._value;
            _offset = value.Offset;
            _length = value.Length;
            _nextByteIdx = 0;
            _currentCharPair = 0;
        }

        /// <summary>
        /// Returns the current UTF-16 code unit (the low 16 bits of the internal pair).
        /// </summary>
        public readonly char Current => (char)_currentCharPair;

        // TODO: This looks terrible, there must be a better way
        // to convert UTF-8 to UTF-16 with an enumerator.
        /// <summary>
        /// Advances to the next UTF-16 code unit.
        /// </summary>
        /// <returns>
        /// <see langword="true"/> if a code unit is available through <see cref="Current"/>;
        /// <see langword="false"/> once all bytes have been consumed.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public bool MoveNext()
        {
            var (offset, length, nextByteIdx, currentCharPair) =
                (_offset, _length, _nextByteIdx, _currentCharPair);

            // A value that fits in 16 bits has no pending low surrogate. The comparison is
            // inclusive because U+FFFF is a valid BMP scalar: stored as 0xFFFF it must not be
            // mistaken for a packed surrogate pair, which would yield a spurious '\0'.
            if (currentCharPair <= char.MaxValue)
            {
                if ((uint)nextByteIdx < (uint)length)
                {
                    var span = _value!.SliceUnsafe(offset + nextByteIdx, length - nextByteIdx);
                    var firstByte = MemoryMarshal.GetReference(span);
                    if (U8Info.IsAsciiByte(firstByte))
                    {
                        // Fast path because Rune.DecodeFromUtf8 won't inline
                        // making UTF-8 push us more and more towards anglocentrism.
                        _nextByteIdx = nextByteIdx + 1;
                        _currentCharPair = firstByte;
                        return true;
                    }

                    var status = Rune.DecodeFromUtf8(span, out var rune, out var bytesConsumed);
                    Debug.Assert(status is OperationStatus.Done);

                    _nextByteIdx = nextByteIdx + bytesConsumed;

                    if (rune.IsBmp)
                    {
                        _currentCharPair = (uint)rune.Value;
                        return true;
                    }

                    // Split the supplementary-plane scalar into a surrogate pair and pack both
                    // halves into one uint. This is plain integer arithmetic (shifts and adds),
                    // not a memory reinterpretation, so byte order does not come into play.
                    var runeValue = (uint)rune.Value;
                    var highSurrogate = (char)((runeValue + ((0xD800u - 0x40u) << 10)) >> 10);
                    var lowSurrogate = (char)((runeValue & 0x3FFu) + 0xDC00u);
                    _currentCharPair = highSurrogate + ((uint)lowSurrogate << 16);
                    return true;
                }

                return false;
            }

            // The high surrogate was yielded by the previous call: expose the pending low
            // surrogate and clear the high half so the next call decodes new bytes.
            _currentCharPair = currentCharPair >> 16;
            return true;
        }

        /// <summary>
        /// Rewinds the enumerator to the start of the string.
        /// </summary>
        /// <remarks>
        /// Also discards any pending low surrogate, so a reset issued between the two halves
        /// of a surrogate pair does not leak the stale half into the next enumeration.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void Reset()
        {
            _nextByteIdx = 0;
            _currentCharPair = 0;
        }

        readonly object IEnumerator.Current => Current;
        readonly void IDisposable.Dispose() { }
    }
280

281
    readonly bool ICollection<char>.IsReadOnly => true;
×
282
    readonly void ICollection<char>.Add(char item) => throw new NotSupportedException();
×
283
    readonly void ICollection<char>.Clear() => throw new NotSupportedException();
×
284
    readonly bool ICollection<char>.Remove(char item) => throw new NotSupportedException();
×
285
}
286

287
/// <summary>
288
/// A collection of Runes (unicode scalar values) in a provided <see cref="U8String"/>.
289
/// </summary>
290
public struct U8Runes : ICollection<Rune>, IEnumerable<Rune, U8Runes.Enumerator>
291
{
292
    readonly U8String _value;
293

294
    // If we bring up non-ascii counting to ascii level, we might not need this
295
    // similar to LineCollection.
296
    int _count;
297

298
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
299
    public U8Runes(U8String value)
300
    {
301
        _value = value;
30✔
302
        _count = value.IsEmpty ? 0 : -1;
30✔
303
    }
30✔
304

305
    /// <summary>
306
    /// The number of Runes (unicode scalar values) in the current <see cref="U8String"/>.
307
    /// </summary>
308
    public int Count
309
    {
310
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
311
        get
312
        {
313
            // Somehow the codegen here is underwhelming
314
            var count = _count;
24✔
315
            if (count >= 0)
24✔
316
            {
317
                return count;
9✔
318
            }
319

320
            return _count = Count(_value.UnsafeSpan);
15✔
321

322
            static int Count(ReadOnlySpan<byte> value)
323
            {
324
                Debug.Assert(!value.IsEmpty);
325

326
                // TODO: SIMD non-continuation byte counting
327
                var runeCount = (int)(nint)Polyfills.Text.Ascii.GetIndexOfFirstNonAsciiByte(value);
15✔
328
                value = value.SliceUnsafe(runeCount);
15✔
329

330
                for (var i = 0; (uint)i < (uint)value.Length; i += U8Info.RuneLength(value.AsRef(i)))
2,940✔
331
                {
332
                    runeCount++;
1,455✔
333
                }
334

335
                return runeCount;
15✔
336
            }
337
        }
338
    }
339

340
    public readonly bool Contains(Rune item) => _value.Contains(item);
642✔
341

342
    public readonly void CopyTo(Rune[] destination, int index)
343
    {
344
        // TODO: Simple SIMD widen ASCII to UTF-32 (ideally widen+validate in place instead of double traversal)
345
        // TODO: Consistency and correctness? Implement single-pass vectorized conversion?
346
        foreach (var rune in this)
×
347
        {
348
            destination[index++] = rune;
×
349
        }
350
    }
×
351

352
    public readonly void Deconstruct(out Rune first, out Rune second)
353
    {
354
        this.Deconstruct<U8Runes, Enumerator, Rune>(out first, out second);
×
355
    }
×
356

357
    public readonly void Deconstruct(out Rune first, out Rune second, out Rune third)
358
    {
359
        this.Deconstruct<U8Runes, Enumerator, Rune>(out first, out second, out third);
×
360
    }
×
361

362
    public Rune[] ToArray() => this.ToArray<U8Runes, Enumerator, Rune>();
6✔
363

364
    public List<Rune> ToList() => this.ToList<U8Runes, Enumerator, Rune>();
×
365

366
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
367
    public readonly Enumerator GetEnumerator() => new(_value);
659✔
368

369
    readonly IEnumerator<Rune> IEnumerable<Rune>.GetEnumerator() => GetEnumerator();
642✔
370
    readonly IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
6✔
371

372
    /// <summary>
    /// Enumerates the <see cref="Rune"/>s (unicode scalar values) of a <see cref="U8String"/>.
    /// </summary>
    public struct Enumerator : IEnumerator<Rune>
    {
        readonly byte[]? _value;
        readonly int _offset;
        readonly int _length;

        // Byte index (relative to _offset) of the next rune to decode; starts at 0.
        int _index;

        /// <summary>
        /// Creates a new rune enumerator over the provided string.
        /// </summary>
        /// <param name="value">The string to enumerate over.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public Enumerator(U8String value)
        {
            _value = value._value;
            _offset = value.Offset;
            _length = value.Length;
            _index = 0;
            Current = default;
        }

        /// <summary>
        /// Returns the rune decoded by the most recent successful <see cref="MoveNext"/> call.
        /// </summary>
        public Rune Current { get; private set; }

        /// <summary>
        /// Decodes the next rune and advances past its bytes.
        /// </summary>
        /// <returns>
        /// <see langword="true"/> if a rune was decoded into <see cref="Current"/>;
        /// <see langword="false"/> once the end of the string has been reached.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public bool MoveNext()
        {
            var index = _index;
            if (index < _length)
            {
                ref var ptr = ref MemoryMarshal
                    .GetArrayDataReference(_value!)
                    .Add(_offset + index);

                // Advance by the size reported by the decoder for this rune.
                Current = U8Conversions.CodepointToRune(ref ptr, out var size);
                _index = index + size;
                return true;
            }

            return false;
        }

        /// <summary>
        /// Rewinds the enumerator to the first rune.
        /// </summary>
        /// <remarks>
        /// The index is a byte offset that starts at 0, not a "before first element" sentinel:
        /// resetting it to -1 would make the next <see cref="MoveNext"/> read the byte that
        /// precedes the string.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void Reset() => _index = 0;

        readonly object IEnumerator.Current => Current;
        readonly void IDisposable.Dispose() { }
    }
413

414
    readonly bool ICollection<Rune>.IsReadOnly => true;
×
415
    // The collection is a read-only view (IsReadOnly returns true), so the mutating members
    // throw NotSupportedException, the exception ICollection<T> documents for read-only collections.
    readonly void ICollection<Rune>.Add(Rune item) => throw new NotSupportedException();
    readonly void ICollection<Rune>.Clear() => throw new NotSupportedException();
    readonly bool ICollection<Rune>.Remove(Rune item) => throw new NotSupportedException();
×
418
}
419

420
/// <summary>
421
/// A collection of lines in a provided <see cref="U8String"/>.
422
/// </summary>
423
public struct U8Lines : ICollection<U8String>, IU8Enumerable<U8Lines.Enumerator>
424
{
425
    readonly U8String _value;
426

427
    // We might not need this. Although counting is O(n), the absolute performance
428
    // is very good, and on AVX2/512 - it's basically instantaneous.
429
    int _count;
430

431
    /// <summary>
432
    /// Creates a new line enumeration over the provided string.
433
    /// </summary>
434
    /// <param name="value">The string to enumerate over.</param>
435
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
436
    public U8Lines(U8String value)
437
    {
438
        _value = value;
×
439
        _count = value.IsEmpty ? 0 : -1;
×
440
    }
×
441

442
    /// <summary>
443
    /// The number of lines in the current <see cref="U8String"/>.
444
    /// </summary>
445
    public int Count
446
    {
447
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
448
        get
449
        {
450
            var count = _count;
×
451
            if (count >= 0)
×
452
            {
453
                return count;
×
454
            }
455

456
            // Matches the behavior of string.Split('\n').Length for "hello\n"
457
            // TODO: Should we break consistency and not count the very last segment if it is empty?
458
            // (likely no - an empty line is still a line)
459
            return _count = _value.UnsafeSpan.Count((byte)'\n') + 1;
×
460
        }
461
    }
462

463
    public readonly bool Contains(U8String item)
464
    {
465
        return !item.Contains((byte)'\n') && _value.Contains(item);
×
466
    }
467

468
    public void CopyTo(U8String[] destination, int index)
469
    {
470
        this.CopyTo<U8Lines, Enumerator, U8String>(destination.AsSpan()[index..]);
×
471
    }
×
472

473
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
474
    public readonly void Deconstruct(out U8String first, out U8String second)
475
    {
476
        this.Deconstruct<U8Lines, Enumerator, U8String>(out first, out second);
×
477
    }
×
478

479
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
480
    public readonly void Deconstruct(out U8String first, out U8String second, out U8String third)
481
    {
482
        this.Deconstruct<U8Lines, Enumerator, U8String>(out first, out second, out third);
×
483
    }
×
484

485
    public U8String[] ToArray() => this.ToArray<U8Lines, Enumerator, U8String>();
×
486
    public List<U8String> ToList() => this.ToList<U8Lines, Enumerator, U8String>();
×
487

488
    /// <summary>
489
    /// Returns a <see cref="Enumerator"/> over the provided string.
490
    /// </summary>
491
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
492
    public readonly Enumerator GetEnumerator() => new(_value);
×
493

494
    readonly IEnumerator<U8String> IEnumerable<U8String>.GetEnumerator() => GetEnumerator();
×
495
    readonly IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
×
496

497
    readonly bool ICollection<U8String>.IsReadOnly => true;
×
498
    readonly void ICollection<U8String>.Add(U8String item) => throw new NotSupportedException();
×
499
    readonly void ICollection<U8String>.Clear() => throw new NotSupportedException();
×
500
    readonly bool ICollection<U8String>.Remove(U8String item) => throw new NotSupportedException();
×
501

502
    /// <summary>
503
    /// A struct that enumerates lines over a string.
504
    /// </summary>
505
    public struct Enumerator : IU8Enumerator
506
    {
507
        // TODO 1: Ensure this is aligned with Rust's .lines() implementation, or not?
508
        // private static readonly SearchValues<byte> NewLine = SearchValues.Create("\r\n"u8);
509
        // TODO 2: Consider using 'InnerOffsets'
510
        private readonly byte[]? _value;
511
        private U8Range _remaining;
512
        private U8Range _current;
513

514
        /// <summary>
515
        /// Creates a new line enumerator over the provided string.
516
        /// </summary>
517
        /// <param name="value">The string to enumerate over.</param>
518
        public Enumerator(U8String value)
519
        {
520
            _value = value._value;
×
521
            _remaining = value._inner;
×
522
        }
×
523

524
        /// <summary>
525
        /// Returns the current line.
526
        /// </summary>
527
        public readonly U8String Current => new(_value, _current.Offset, _current.Length);
×
528

529
        /// <summary>
530
        /// Advances the enumerator to the next line.
531
        /// </summary>
532
        [MethodImpl(MethodImplOptions.AggressiveInlining)] // Surprisingly smaller codegen than when not inlined
533
        public bool MoveNext()
534
        {
535
            var remaining = _remaining;
×
536
            if (remaining.Length > 0)
×
537
            {
538
                var span = _value!.SliceUnsafe(remaining.Offset, remaining.Length);
×
539
                var idx = span.IndexOf((byte)'\n');
×
540

541
                if ((uint)idx < (uint)span.Length)
×
542
                {
543
                    var cutoff = idx;
×
544
                    if (idx > 0 && span.AsRef().Add(idx - 1) is (byte)'\r')
×
545
                    {
546
                        cutoff--;
×
547
                    }
548

549
                    _current = new(remaining.Offset, cutoff);
×
550
                    _remaining = new(remaining.Offset + idx + 1, remaining.Length - idx - 1);
×
551
                }
552
                else
553
                {
554
                    // We've reached EOF, but we still need to return 'true' for this final
555
                    // iteration so that the caller can query the Current property once more.
556
                    _current = new(remaining.Offset, remaining.Length);
×
557
                    _remaining = default;
×
558
                }
559

560
                return true;
×
561
            }
562

563
            return false;
×
564
        }
565

566
        readonly object IEnumerator.Current => Current;
×
567
        readonly void IEnumerator.Reset() => throw new NotSupportedException();
×
568
        readonly void IDisposable.Dispose() { }
×
569
    }
570
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc