• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

STEllAR-GROUP / hpx / #848

07 Dec 2022 11:00PM CUT coverage: 86.456% (+0.6%) from 85.835%
#848

push

StellarBot
Merge #6096

6096: Forking Boost.Tokenizer r=hkaiser a=hkaiser

- flyby: remove more Boost headers that are not needed anymore

Working towards #3440 

Co-authored-by: Hartmut Kaiser <hartmut.kaiser@gmail.com>

525 of 525 new or added lines in 20 files covered. (100.0%)

173087 of 200202 relevant lines covered (86.46%)

1845223.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.68
/libs/core/string_util/include/hpx/string_util/token_functions.hpp
1
//  Copyright (c) 2022 Hartmut Kaiser
2
//
3
//  SPDX-License-Identifier: BSL-1.0
4
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
5
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6

7
// Copyright John R. Bandela 2001.
8

9
// See http://www.boost.org/libs/tokenizer/ for documentation.
10

11
// Revision History:
12
// 01 Oct 2004   Joaquin M Lopez Munoz
13
//      Workaround for a problem with string::assign in msvc-stlport
14
// 06 Apr 2004   John Bandela
15
//      Fixed a bug involving using char_delimiter with a true input iterator
16
// 28 Nov 2003   Robert Zeh and John Bandela
17
//      Converted into "fast" functions that avoid using += when
18
//      the supplied iterator isn't an input_iterator; based on
19
//      some work done at Archelon and a version that was checked into
20
//      the boost CVS for a short period of time.
21
// 20 Feb 2002   John Maddock
22
//      Removed using namespace std declarations and added
23
//      workaround for BOOST_NO_STDC_NAMESPACE (the library
24
//      can be safely mixed with regex).
25
// 06 Feb 2002   Jeremy Siek
26
//      Added char_separator.
27
// 02 Feb 2002   Jeremy Siek
28
//      Removed tabs and a little cleanup.
29

30
#pragma once
31

32
#include <hpx/config.hpp>
33
#include <hpx/assert.hpp>
34
#include <hpx/modules/errors.hpp>
35

36
#include <algorithm>    // for find_if
37
#include <cctype>
38
#include <cwctype>
39
#include <initializer_list>
40
#include <iterator>
41
#include <stdexcept>
42
#include <string>
43
#include <vector>
44

45
namespace hpx::string_util {
46

47
    //=========================================================================
48
    // The escaped_list_separator class, which is a model of TokenizerFunction.
49
    // An escaped list is a super-set of what is commonly known as a comma
50
    // separated value (csv) list. It is separated into fields by a comma or
51
    // other character. If the delimiting character is inside quotes, then it is
52
    // counted as a regular character. To allow for embedded quotes in a field,
53
    // there can be escape sequences using the \ much like C. The role of the
54
    // comma, the quotation mark, and the escape character (backslash \), can be
55
    // assigned to other characters.
56
    template <typename Char,
57
        typename Traits = typename std::basic_string<Char>::traits_type>
58
    class escaped_list_separator
13,888✔
59
    {
60
    private:
61
        using string_type = std::basic_string<Char, Traits>;
62

63
        struct char_eq
64
        {
65
            Char e_;
66

67
            explicit char_eq(Char e) noexcept
736,363✔
68
              : e_(e)
736,363✔
69
            {
70
            }
736,363✔
71

72
            bool operator()(Char c) noexcept
1,218,027✔
73
            {
74
                return Traits::eq(e_, c);
1,218,027✔
75
            }
76
        };
77

78
        string_type escape_;
79
        string_type c_;
80
        string_type quote_;
81
        bool last_ = false;
1,980✔
82

83
        bool is_escape(Char e)
247,656✔
84
        {
85
            char_eq f(e);
247,656✔
86
            return std::find_if(escape_.begin(), escape_.end(), f) !=
495,312✔
87
                escape_.end();
247,656✔
88
        }
89

90
        bool is_c(Char e)
247,642✔
91
        {
92
            char_eq f(e);
247,642✔
93
            return std::find_if(c_.begin(), c_.end(), f) != c_.end();
247,642✔
94
        }
95

96
        bool is_quote(Char e)
241,065✔
97
        {
98
            char_eq f(e);
241,065✔
99
            return std::find_if(quote_.begin(), quote_.end(), f) !=
482,130✔
100
                quote_.end();
241,065✔
101
        }
102

103
        template <typename iterator, typename Token>
104
        void do_escape(iterator& next, iterator end, Token& tok)
17✔
105
        {
106
            if (++next == end)
17✔
107
            {
108
                HPX_THROW_EXCEPTION(invalid_status,
×
109
                    "escaped_list_separator::do_escape",
110
                    "cannot end with escape");
111
            }
112

113
            if (Traits::eq(*next, 'n'))
17✔
114
            {
115
                tok += '\n';
×
116
                return;
×
117
            }
118
            else if (is_quote(*next) || is_c(*next) || is_escape(*next))
17✔
119
            {
120
                tok += *next;
17✔
121
                return;
17✔
122
            }
123
            else
124
            {
125
                HPX_THROW_EXCEPTION(invalid_status,
×
126
                    "escaped_list_separator::do_escape",
127
                    "unknown escape sequence");
128
            }
129
        }
17✔
130

131
    public:
132
        explicit escaped_list_separator(
4✔
133
            Char e = '\\', Char c = ',', Char q = '\"')
134
          : escape_(1, e)
4✔
135
          , c_(1, c)
4✔
136
          , quote_(1, q)
4✔
137
        {
138
        }
4✔
139

140
        escaped_list_separator(
1,976✔
141
            string_type e, string_type c, string_type q) noexcept
142
          : escape_(HPX_MOVE(e))
1,976✔
143
          , c_(HPX_MOVE(c))
1,976✔
144
          , quote_(HPX_MOVE(q))
1,976✔
145
        {
146
        }
1,976✔
147

148
        void reset() noexcept
3,957✔
149
        {
150
            last_ = false;
3,957✔
151
        }
3,957✔
152

153
        template <typename InputIterator, typename Token>
154
        bool operator()(InputIterator& next, InputIterator end, Token& tok)
10,537✔
155
        {
156
            bool bInQuote = false;
10,537✔
157
            tok = Token();
10,537✔
158

159
            if (next == end)
10,537✔
160
            {
161
                if (last_)
2,010✔
162
                {
163
                    last_ = false;
30✔
164
                    return true;
30✔
165
                }
166
                else
167
                {
168
                    return false;
1,980✔
169
                }
170
            }
171

172
            last_ = false;
8,527✔
173
            for (; next != end; ++next)
249,602✔
174
            {
175
                if (is_escape(*next))
247,652✔
176
                {
177
                    do_escape(next, end, tok);
17✔
178
                }
17✔
179
                else if (is_c(*next))
247,635✔
180
                {
181
                    if (!bInQuote)
6,587✔
182
                    {
183
                        // If we are not in quote, then we are done
184
                        ++next;
6,577✔
185

186
                        // The last character was a c, that means there is 1
187
                        // more blank field
188
                        last_ = true;
6,577✔
189
                        return true;
6,577✔
190
                    }
191
                    else
192
                    {
193
                        tok += *next;
10✔
194
                    }
195
                }
10✔
196
                else if (is_quote(*next))
241,048✔
197
                {
198
                    bInQuote = !bInQuote;
20✔
199
                }
20✔
200
                else
201
                {
202
                    tok += *next;
241,028✔
203
                }
204
            }
241,075✔
205
            return true;
1,950✔
206
        }
10,537✔
207
    };
208

209
    //=========================================================================
210
    // The classes here are used by offset_separator and char_separator to
211
    // implement faster assigning of tokens using assign instead of +=
212

213
    namespace detail {
214

215
        //=====================================================================
216
        // Tokenizer was broken for wide character separators, at least on
217
        // Windows, since CRT functions isspace etc only expect values in [0,
218
        // 0xFF]. Debug build asserts if higher values are passed in. The traits
219
        // extension class should take care of this. Assuming that the
220
        // conditional will always get optimized out in the function
221
        // implementations, argument types are not a problem since both forms of
222
        // character classifiers expect an int.
223
        // Character classification for character types that are wider than
        // one byte (N is sizeof(char_type)): forwards to the wide character
        // classification functions.
        template <typename Traits, int N>
        struct traits_extension_details : public Traits
        {
            using char_type = typename Traits::char_type;

            // Returns true if 'c' is classified as whitespace by
            // std::iswspace.
            static bool isspace(char_type c) noexcept
            {
                return std::iswspace(c) != 0;
            }

            // Returns true if 'c' is classified as punctuation by
            // std::iswpunct.
            static bool ispunct(char_type c) noexcept
            {
                return std::iswpunct(c) != 0;
            }
        };

        // Character classification for one-byte character types: forwards to
        // the narrow character classification functions.
        //
        // The <cctype> functions require their argument to be representable
        // as unsigned char (or to be EOF). Plain char may be signed, so the
        // value is converted to unsigned char first; passing a negative value
        // directly would be undefined behavior.
        template <typename Traits>
        struct traits_extension_details<Traits, 1> : public Traits
        {
            using char_type = typename Traits::char_type;

            // Returns true if 'c' is classified as whitespace by
            // std::isspace.
            static bool isspace(char_type c) noexcept
            {
                return std::isspace(static_cast<unsigned char>(c)) != 0;
            }

            // Returns true if 'c' is classified as punctuation by
            // std::ispunct.
            static bool ispunct(char_type c) noexcept
            {
                return std::ispunct(static_cast<unsigned char>(c)) != 0;
            }
        };
254

255
        // In case there is no cwctype header, we implement the checks manually.
256
        // We make use of the fact that the tested categories should fit in
257
        // ASCII.
258
        template <typename Traits>
259
        struct traits_extension : public Traits
260
        {
261
            using char_type = typename Traits::char_type;
262

263
            static bool isspace(char_type c) noexcept
98✔
264
            {
265
                return traits_extension_details<Traits,
98✔
266
                    sizeof(char_type)>::isspace(c);
98✔
267
            }
268

269
            static bool ispunct(char_type c) noexcept
92✔
270
            {
271
                return traits_extension_details<Traits,
92✔
272
                    sizeof(char_type)>::ispunct(c);
92✔
273
            }
274
        };
275

276
        // The assign_or_plus_equal struct contains functions that implement
277
        // assign, +=, and clearing based on the iterator type. The generic case
278
        // does nothing for plus_equal and clearing, while passing through the
279
        // call for assign.
280
        //
281
        // When an input iterator is being used, the situation is reversed. The
282
        // assign method does nothing, plus_equal invokes operator +=, and the
283
        // clearing method sets the supplied token to the default token
284
        // constructor's result.
285
        // Generic case (any iterator category that has no dedicated
        // specialization): a token is produced by a single call to assign()
        // once its end is known, plus_equal() and clear() do nothing.
        template <typename IteratorTag>
        struct assign_or_plus_equal
        {
            // Replace the contents of 'token' with the range [first, last).
            template <typename Iterator, typename Token>
            static constexpr void assign(
                Iterator first, Iterator last, Token& token)
            {
                token.assign(first, last);
            }

            // Nothing to do, the characters are picked up by assign().
            template <typename Token, typename Value>
            static constexpr void plus_equal(Token&, Value&&) noexcept
            {
            }

            // Nothing to do, assign() replaces the whole token anyway.
            template <typename Token>
            static constexpr void clear(Token&) noexcept
            {
            }
        };
305

306
        template <>
307
        struct assign_or_plus_equal<std::input_iterator_tag>
308
        {
309
            template <class Iterator, class Token>
310
            static constexpr void assign(Iterator, Iterator, Token&) noexcept
311
            {
312
            }
313

314
            template <class Token, class Value>
315
            static constexpr void plus_equal(Token& t, Value&& v)
316
            {
317
                t += HPX_FORWARD(Value, v);
318
            }
319

320
            template <class Token>
321
            static constexpr void clear(Token& t)
322
            {
323
                t = Token();
324
            }
325
        };
326

327
        // Exposes the iterator category that a class type iterator declares
        // as its nested iterator_category type.
        template <typename Iterator>
        struct class_iterator_category
        {
            using type = typename Iterator::iterator_category;
        };

        // Raw pointers have no nested types; they are treated as random
        // access iterators.
        template <typename Iterator>
        struct pointer_iterator_category
        {
            using type = std::random_access_iterator_tag;
        };

        // Determines the iterator category of 'Iterator', which is either a
        // raw pointer or a class type with a nested iterator_category.
        //
        // std::conditional_t only selects one of the two wrappers above, the
        // nested 'type' is accessed on the selected wrapper only. Naming
        // class_iterator_category<Iterator>::type directly as an argument of
        // std::conditional_t would instantiate it for pointers as well, which
        // does not compile.
        template <typename Iterator>
        struct get_iterator_category
        {
            using iterator_category =
                typename std::conditional_t<std::is_pointer_v<Iterator>,
                    pointer_iterator_category<Iterator>,
                    class_iterator_category<Iterator>>::type;
        };
343
    }    // namespace detail
344

345
    //===========================================================================
346
    // The offset_separator class, which is a model of TokenizerFunction. Offset
347
    // breaks a string into tokens based on a range of offsets
348
    class offset_separator
63✔
349
    {
350
    private:
351
        std::vector<int> offsets_;
352
        unsigned int current_offset_ = 0;
8✔
353
        bool wrap_offsets_ = true;
3✔
354
        bool return_partial_last_ = true;
3✔
355

356
    public:
357
        template <typename Iter>
358
        offset_separator(Iter begin, Iter end, bool wrap_offsets = true,
4✔
359
            bool return_partial_last = true)
360
          : offsets_(begin, end)
4✔
361
          , wrap_offsets_(wrap_offsets)
4✔
362
          , return_partial_last_(return_partial_last)
4✔
363
        {
364
        }
4✔
365

366
        offset_separator(std::initializer_list<int> init,
1✔
367
            bool wrap_offsets = true, bool return_partial_last = true)
368
          : offsets_(HPX_MOVE(init))
1✔
369
          , wrap_offsets_(wrap_offsets)
1✔
370
          , return_partial_last_(return_partial_last)
1✔
371
        {
372
        }
1✔
373

374
        offset_separator()
3✔
375
          : offsets_(1, 1)
3✔
376
        {
377
        }
3✔
378

379
        void reset()
10✔
380
        {
381
            current_offset_ = 0;
10✔
382
        }
10✔
383

384
        template <typename InputIterator, typename Token>
385
        bool operator()(InputIterator& next, InputIterator end, Token& tok)
20✔
386
        {
387
            using assigner = detail::assign_or_plus_equal<typename detail::
388
                    get_iterator_category<InputIterator>::iterator_category>;
389

390
            HPX_ASSERT(!offsets_.empty());
20✔
391

392
            assigner::clear(tok);
20✔
393
            InputIterator start(next);
20✔
394

395
            if (next == end)
20✔
396
            {
397
                return false;
4✔
398
            }
399

400
            if (current_offset_ == offsets_.size())
16✔
401
            {
402
                if (wrap_offsets_)
×
403
                {
404
                    current_offset_ = 0;
×
405
                }
×
406
                else
407
                {
408
                    return false;
×
409
                }
410
            }
×
411

412
            int c = offsets_[current_offset_];
16✔
413
            int i = 0;
16✔
414
            for (; i < c; ++i)
55✔
415
            {
416
                if (next == end)
39✔
417
                {
418
                    break;
×
419
                }
420
                assigner::plus_equal(tok, *next++);
39✔
421
            }
39✔
422
            assigner::assign(start, next, tok);
16✔
423

424
            if (!return_partial_last_)
16✔
425
            {
426
                if (i < (c - 1))
×
427
                {
428
                    return false;
×
429
                }
430
            }
×
431

432
            ++current_offset_;
16✔
433
            return true;
16✔
434
        }
20✔
435
    };
436

437
    //=========================================================================
438
    // The char_separator class breaks a sequence of characters into tokens
439
    // based on the character delimiters (very much like bad old strtok). A
440
    // delimiter character can either be kept or dropped. A kept delimiter shows
441
    // up as an output token, whereas a dropped delimiter does not.
442

443
    // This class replaces the char_delimiters_separator class. The constructor
444
    // for the char_delimiters_separator class was too confusing and needed to
445
    // be deprecated. However, because of the default arguments to the
446
    // constructor, adding the new constructor would cause ambiguity, so instead
447
    // I deprecated the whole class. The implementation of the class was also
448
    // simplified considerably.
449
    // Selects what char_separator does with empty tokens.
    enum class empty_token_policy
    {
        drop = 0,    // empty tokens are skipped
        keep = 1     // empty tokens are reported
    };
454

455
    // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
456
    template <typename Char,
457
        typename Tr = typename std::basic_string<Char>::traits_type>
458
    class char_separator
425,588✔
459
    {
460
        using Traits = detail::traits_extension<Tr>;
461
        using string_type = std::basic_string<Char, Tr>;
462

463
    public:
464
        explicit char_separator(Char const* dropped_delims,
44,518✔
465
            Char const* kept_delims = nullptr,
466
            empty_token_policy empty_tokens = empty_token_policy::drop)
467
          : m_dropped_delims(dropped_delims)
44,518✔
468
          , m_use_ispunct(false)
44,518✔
469
          , m_use_isspace(false)
44,518✔
470
          , m_empty_tokens(empty_tokens)
44,518✔
471
        {
472
            // Borland workaround
473
            if (kept_delims)
44,518✔
474
                m_kept_delims = kept_delims;
97✔
475
        }
44,518✔
476

477
        // use ispunct() for kept delimiters and isspace for dropped.
478
        char_separator() = default;
2✔
479

480
        void reset() noexcept {}
138,770✔
481

482
        template <typename InputIterator, typename Token>
483
        bool operator()(InputIterator& next, InputIterator end, Token& tok)
101,007✔
484
        {
485
            using assigner = detail::assign_or_plus_equal<typename detail::
486
                    get_iterator_category<InputIterator>::iterator_category>;
487

488
            assigner::clear(tok);
101,007✔
489

490
            // skip past all dropped_delims
491
            if (m_empty_tokens == empty_token_policy::drop)
101,007✔
492
            {
493
                for (; next != end && is_dropped(*next); ++next)
126,314✔
494
                {
495
                }
25,525✔
496
            }
100,789✔
497

498
            InputIterator start(next);
101,007✔
499

500
            if (m_empty_tokens == empty_token_policy::drop)
101,007✔
501
            {
502
                if (next == end)
100,789✔
503
                    return false;
33,985✔
504

505
                // if we are on a kept_delims move past it and stop
506
                if (is_kept(*next))
66,804✔
507
                {
508
                    assigner::plus_equal(tok, *next);
2✔
509
                    ++next;
2✔
510
                }
2✔
511
                else
512
                {
513
                    // append all the non delim characters
514
                    for (; next != end && !is_dropped(*next) && !is_kept(*next);
847,436✔
515
                         ++next)
780,634✔
516
                    {
517
                        assigner::plus_equal(tok, *next);
780,634✔
518
                    }
780,634✔
519
                }
520
            }
66,804✔
521
            else
522
            {    // m_empty_tokens == empty_token_policy::keep
523

524
                // Handle empty token at the end
525
                if (next == end)
218✔
526
                {
527
                    if (m_output_done == false)
98✔
528
                    {
529
                        m_output_done = true;
1✔
530
                        assigner::assign(start, next, tok);
1✔
531
                        return true;
1✔
532
                    }
533
                    else
534
                    {
535
                        return false;
97✔
536
                    }
537
                }
538

539
                if (is_kept(*next))
120✔
540
                {
541
                    if (m_output_done == false)
5✔
542
                    {
543
                        m_output_done = true;
1✔
544
                    }
1✔
545
                    else
546
                    {
547
                        assigner::plus_equal(tok, *next);
4✔
548
                        ++next;
4✔
549
                        m_output_done = false;
4✔
550
                    }
551
                }
5✔
552
                else if (m_output_done == false && is_dropped(*next))
115✔
553
                {
554
                    m_output_done = true;
2✔
555
                }
2✔
556
                else
557
                {
558
                    if (is_dropped(*next))
113✔
559
                    {
560
                        start = ++next;
16✔
561
                    }
16✔
562

563
                    for (; next != end && !is_dropped(*next) && !is_kept(*next);
11,109✔
564
                         ++next)
10,996✔
565
                    {
566
                        assigner::plus_equal(tok, *next);
10,996✔
567
                    }
10,996✔
568

569
                    m_output_done = true;
113✔
570
                }
571
            }
572

573
            assigner::assign(start, next, tok);
66,924✔
574
            return true;
66,924✔
575
        }
101,007✔
576

577
    private:
578
        string_type m_kept_delims;
579
        string_type m_dropped_delims;
580
        bool m_use_ispunct = true;
2✔
581
        bool m_use_isspace = true;
2✔
582
        empty_token_policy m_empty_tokens = empty_token_policy::drop;
2✔
583
        bool m_output_done = false;
44,520✔
584

585
        bool is_kept(Char E) const
858,559✔
586
        {
587
            if (m_kept_delims.length())
858,559✔
588
                return m_kept_delims.find(E) != string_type::npos;
40✔
589
            else if (m_use_ispunct)
858,519✔
590
            {
591
                return Traits::ispunct(E) != 0;
34✔
592
            }
593
            else
594
            {
595
                return false;
858,485✔
596
            }
597
        }
858,559✔
598

599
        bool is_dropped(Char E) const
907,458✔
600
        {
601
            if (m_dropped_delims.length())
907,458✔
602
                return m_dropped_delims.find(E) != string_type::npos;
907,412✔
603
            else if (m_use_isspace)
46✔
604
            {
605
                return Traits::isspace(E) != 0;
46✔
606
            }
607
            else
608
            {
609
                return false;
×
610
            }
611
        }
907,458✔
612
    };
613

614
    //===========================================================================
615
    // The char_delimiters_separator class, which is a model of
616
    // TokenizerFunction. char_delimiters_separator breaks a string into tokens
617
    // based on character delimiters. There are 2 types of delimiters.
618
    // Returnable delimiters can be returned as tokens. These are often
619
    // punctuation. Nonreturnable delimiters cannot be returned as tokens. These
620
    // are often whitespace
621

622
    // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
623
    template <typename Char,
624
        typename Tr = typename std::basic_string<Char>::traits_type>
625
    class char_delimiters_separator
56✔
626
    {
627
    private:
628
        using Traits = detail::traits_extension<Tr>;
629
        using string_type = std::basic_string<Char, Tr>;
630

631
        string_type returnable_;
632
        string_type nonreturnable_;
633
        bool return_delims_;
634
        bool no_ispunct_;
635
        bool no_isspace_;
636

637
        bool is_ret(Char E) const noexcept
86✔
638
        {
639
            if (returnable_.length())
86✔
640
            {
641
                return returnable_.find(E) != string_type::npos;
28✔
642
            }
643
            else
644
            {
645
                if (no_ispunct_)
58✔
646
                {
647
                    return false;
×
648
                }
649
                else
650
                {
651
                    int r = Traits::ispunct(E);
58✔
652
                    return r != 0;
58✔
653
                }
654
            }
655
        }
86✔
656

657
        bool is_nonret(Char E) const noexcept
75✔
658
        {
659
            if (nonreturnable_.length())
75✔
660
            {
661
                return nonreturnable_.find(E) != string_type::npos;
×
662
            }
663
            else
664
            {
665
                if (no_isspace_)
75✔
666
                {
667
                    return false;
23✔
668
                }
669
                else
670
                {
671
                    int r = Traits::isspace(E);
52✔
672
                    return r != 0;
52✔
673
                }
674
            }
675
        }
75✔
676

677
    public:
678
        explicit char_delimiters_separator(bool return_delims = false,
4✔
679
            Char const* returnable = nullptr,
680
            Char const* nonreturnable = nullptr)
681
          : returnable_(returnable ? returnable : string_type().c_str())
4✔
682
          , nonreturnable_(
8✔
683
                nonreturnable ? nonreturnable : string_type().c_str())
4✔
684
          , return_delims_(return_delims)
4✔
685
          , no_ispunct_(returnable != nullptr)
4✔
686
          , no_isspace_(nonreturnable != nullptr)
4✔
687
        {
688
        }
4✔
689

690
        void reset() noexcept {}
5✔
691

692
    public:
693
        template <typename InputIterator, typename Token>
694
        bool operator()(InputIterator& next, InputIterator end, Token& tok)
16✔
695
        {
696
            tok = Token();
16✔
697

698
            // skip past all nonreturnable delims
699
            // skip past the returnable only if we are not returning delims
700
            for (; next != end &&
57✔
701
                 (is_nonret(*next) || (is_ret(*next) && !return_delims_));
27✔
702
                 ++next)
14✔
703
            {
704
            }
14✔
705

706
            if (next == end)
16✔
707
            {
708
                return false;
3✔
709
            }
710

711
            // if we are to return delims and we are one a returnable one move
712
            // past it and stop
713
            if (is_ret(*next) && return_delims_)
13✔
714
            {
715
                tok += *next;
2✔
716
                ++next;
2✔
717
            }
2✔
718
            else
719
            {
720
                // append all the non delim characters
721
                for (; next != end && !is_nonret(*next) && !is_ret(*next);
49✔
722
                     ++next)
38✔
723
                {
724
                    tok += *next;
38✔
725
                }
38✔
726
            }
727

728
            return true;
13✔
729
        }
16✔
730
    };
731
}    // namespace hpx::string_util
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc