• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

GothenburgBitFactory / taskwarrior / 12343201393

15 Dec 2024 11:30PM UTC coverage: 84.419% (-1.1%) from 85.522%
12343201393

Pull #3724

github

web-flow
Merge 532931b9f into ddae5c4ba
Pull Request #3724: Support importing Taskwarrior v2.x data files

15 of 145 new or added lines in 4 files covered. (10.34%)

183 existing lines in 48 files now uncovered.

19289 of 22849 relevant lines covered (84.42%)

23168.82 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.24
/src/Lexer.cpp
1
////////////////////////////////////////////////////////////////////////////////
2
//
3
// Copyright 2013 - 2021, Tomas Babej, Paul Beckingham, Federico Hernandez.
4
//
5
// Permission is hereby granted, free of charge, to any person obtaining a copy
6
// of this software and associated documentation files (the "Software"), to deal
7
// in the Software without restriction, including without limitation the rights
8
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
// copies of the Software, and to permit persons to whom the Software is
10
// furnished to do so, subject to the following conditions:
11
//
12
// The above copyright notice and this permission notice shall be included
13
// in all copies or substantial portions of the Software.
14
//
15
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
// SOFTWARE.
22
//
23
// https://www.opensource.org/licenses/mit-license.php
24
//
25
////////////////////////////////////////////////////////////////////////////////
26

27
#include <cmake.h>
28
// cmake.h include header must come first
29

30
#include <Datetime.h>
31
#include <Duration.h>
32
#include <Lexer.h>
33
#include <ctype.h>
34
#include <unicode.h>
35
#include <utf8.h>
36

37
#include <algorithm>
38

39
static const std::string uuid_pattern = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";
40
static const unsigned int uuid_min_length = 8;
41

42
std::string Lexer::dateFormat = "";
43
std::string::size_type Lexer::minimumMatchLength = 3;
44
std::map<std::string, std::string> Lexer::attributes;
45

46
////////////////////////////////////////////////////////////////////////////////
47
// Builds a lexer over `text`, with the cursor at offset zero and the
// end-of-stream offset cached as the byte length of `text`.
Lexer::Lexer(const std::string& text)
    : _text(text),
      _cursor(0),
      _eos(text.size()) {
}
35,121✔
48

49
////////////////////////////////////////////////////////////////////////////////
50
// When a Lexer object is constructed with a string, this method walks through
51
// the stream of low-level tokens.
52
bool Lexer::token(std::string& token, Lexer::Type& type) {
25,054✔
53
  // Eat white space.
54
  while (unicodeWhitespace(_text[_cursor])) utf8_next_char(_text, _cursor);
25,898✔
55

56
  // Terminate at EOS.
57
  if (isEOS()) return false;
25,054✔
58

59
  // The sequence is specific, and must follow these rules:
60
  //   - date < duration < uuid < identifier
61
  //   - dom < uuid
62
  //   - uuid < hex < number
63
  //   - url < pair < identifier
64
  //   - hex < number
65
  //   - separator < tag < operator
66
  //   - path < substitution < pattern
67
  //   - set < number
68
  //   - word last
69
  if (isString(token, type, "'\"") || isDate(token, type) || isDuration(token, type) ||
87,347✔
70
      isURL(token, type) || isPair(token, type) || isUUID(token, type, true) ||
21,019✔
71
      isSet(token, type) || isDOM(token, type) || isHexNumber(token, type) ||
16,983✔
72
      isNumber(token, type) || isSeparator(token, type) || isTag(token, type) ||
16,401✔
73
      isPath(token, type) || isSubstitution(token, type) || isPattern(token, type) ||
12,886✔
74
      isOperator(token, type) || isIdentifier(token, type) || isWord(token, type))
65,499✔
75
    return true;
21,848✔
76

77
  return false;
×
78
}
79

80
////////////////////////////////////////////////////////////////////////////////
81
// This static method tokenizes the input, but discards the type information.
82
std::vector<std::string> Lexer::split(const std::string& text) {
2✔
83
  std::vector<std::string> all;
2✔
84
  std::string token;
2✔
85
  Lexer::Type ignored;
86
  Lexer l(text);
2✔
87
  while (l.token(token, ignored)) all.push_back(token);
15✔
88

89
  return all;
4✔
90
}
2✔
91

92
////////////////////////////////////////////////////////////////////////////////
93
// No L10N - these are for internal purposes.
94
const std::string Lexer::typeName(const Lexer::Type& type) {
57✔
95
  switch (type) {
57✔
96
    case Lexer::Type::uuid:
1✔
97
      return "uuid";
2✔
98
    case Lexer::Type::number:
16✔
99
      return "number";
32✔
100
    case Lexer::Type::hex:
3✔
101
      return "hex";
6✔
102
    case Lexer::Type::string:
5✔
103
      return "string";
10✔
104
    case Lexer::Type::url:
1✔
105
      return "url";
2✔
106
    case Lexer::Type::pair:
1✔
107
      return "pair";
2✔
108
    case Lexer::Type::set:
1✔
109
      return "set";
2✔
110
    case Lexer::Type::separator:
1✔
111
      return "separator";
2✔
112
    case Lexer::Type::tag:
1✔
113
      return "tag";
2✔
114
    case Lexer::Type::path:
1✔
115
      return "path";
2✔
116
    case Lexer::Type::substitution:
1✔
117
      return "substitution";
2✔
118
    case Lexer::Type::pattern:
1✔
119
      return "pattern";
2✔
120
    case Lexer::Type::op:
15✔
121
      return "op";
30✔
122
    case Lexer::Type::dom:
1✔
123
      return "dom";
2✔
124
    case Lexer::Type::identifier:
5✔
125
      return "identifier";
10✔
126
    case Lexer::Type::word:
1✔
127
      return "word";
2✔
128
    case Lexer::Type::date:
1✔
129
      return "date";
2✔
130
    case Lexer::Type::duration:
1✔
131
      return "duration";
2✔
132
  }
133

134
  return "unknown";
×
135
}
136

137
////////////////////////////////////////////////////////////////////////////////
138
bool Lexer::isIdentifierStart(int c) {
33,493✔
139
  return c &&  // Include null character check.
33,463✔
140
         !unicodeWhitespace(c) && !unicodeLatinDigit(c) && !isSingleCharOperator(c) &&
60,480✔
141
         !isPunctuation(c);
60,480✔
142
}
143

144
////////////////////////////////////////////////////////////////////////////////
145
bool Lexer::isIdentifierNext(int c) {
156,170✔
146
  return c &&         // Include null character check.
133,879✔
147
         c != ':' &&  // Used in isPair.
130,154✔
148
         c != '=' &&  // Used in isPair.
129,994✔
149
         !unicodeWhitespace(c) && !isSingleCharOperator(c);
290,049✔
150
}
151

152
////////////////////////////////////////////////////////////////////////////////
153
// Reports whether `c` is one of the single-character operators.
bool Lexer::isSingleCharOperator(int c) {
  switch (c) {
    case '+':  // Addition
    case '-':  // Subtraction or unary minus (ambiguous)
    case '*':  // Multiplication
    case '/':  // Division
    case '(':  // Precedence open parenthesis
    case ')':  // Precedence close parenthesis
    case '<':  // Less than
    case '>':  // Greater than
    case '^':  // Exponent
    case '!':  // Unary not
    case '%':  // Modulus
    case '=':  // Partial match
    case '~':  // Pattern match
      return true;

    default:
      return false;
  }
}
168

169
////////////////////////////////////////////////////////////////////////////////
170
bool Lexer::isDoubleCharOperator(int c0, int c1, int c2) {
29,291✔
171
  return (c0 == '=' && c1 == '=') || (c0 == '!' && c1 == '=') || (c0 == '<' && c1 == '=') ||
29,291✔
172
         (c0 == '>' && c1 == '=') || (c0 == 'o' && c1 == 'r' && isBoundary(c1, c2)) ||
29,255✔
173
         (c0 == '|' && c1 == '|') || (c0 == '&' && c1 == '&') || (c0 == '!' && c1 == '~');
58,582✔
174
}
175

176
////////////////////////////////////////////////////////////////////////////////
177
bool Lexer::isTripleCharOperator(int c0, int c1, int c2, int c3) {
27,446✔
178
  return (c0 == 'a' && c1 == 'n' && c2 == 'd' && isBoundary(c2, c3)) ||
27,446✔
179
         (c0 == 'x' && c1 == 'o' && c2 == 'r' && isBoundary(c2, c3)) ||
54,904✔
180
         (c0 == '!' && c1 == '=' && c2 == '=');
27,458✔
181
}
182

183
////////////////////////////////////////////////////////////////////////////////
184
bool Lexer::isBoundary(int left, int right) {
170✔
185
  // EOS
186
  if (right == '\0') return true;
170✔
187

188
  // XOR
189
  if (unicodeLatinAlpha(left) != unicodeLatinAlpha(right)) return true;
62✔
190
  if (unicodeLatinDigit(left) != unicodeLatinDigit(right)) return true;
12✔
191
  if (unicodeWhitespace(left) != unicodeWhitespace(right)) return true;
12✔
192

193
  // OR
194
  if (isPunctuation(left) || isPunctuation(right)) return true;
10✔
195

196
  return false;
9✔
197
}
198

199
////////////////////////////////////////////////////////////////////////////////
200
// Reports whether a hard boundary lies between `left` and `right`: either the
// end of the stream, or a parenthesis on either side (FILTER operators that do
// not need to be surrounded by white space).
bool Lexer::isHardBoundary(int left, int right) {
  if (right == '\0') return true;

  const auto isParenthesis = [](int c) { return c == '(' || c == ')'; };

  return isParenthesis(left) || isParenthesis(right);
}
209

210
////////////////////////////////////////////////////////////////////////////////
211
bool Lexer::isPunctuation(int c) {
27,006✔
212
  return isprint(c) && c != ' ' && c != '@' && c != '#' && c != '$' && c != '_' &&
26,908✔
213
         !unicodeLatinDigit(c) && !unicodeLatinAlpha(c);
53,914✔
214
}
215

216
////////////////////////////////////////////////////////////////////////////////
217
// Assumes that quotes is a string containing a non-trivial set of quote
218
// characters.
219
void Lexer::dequote(std::string& input, const std::string& quotes) {
27,439✔
220
  int quote = input[0];
27,439✔
221
  if (quotes.find(quote) != std::string::npos) {
27,439✔
222
    size_t len = input.length();
2,847✔
223
    if (quote == input[len - 1]) input = input.substr(1, len - 2);
2,847✔
224
  }
225
}
27,439✔
226

227
////////////////////////////////////////////////////////////////////////////////
228
// Detects characters in an input string that indicate quotes were required, or
229
// escapes, to get them past the shell.
230
bool Lexer::wasQuoted(const std::string& input) {
16,799✔
231
  if (input.find_first_of(" \t()<>&~") != std::string::npos) return true;
16,799✔
232

233
  return false;
15,184✔
234
}
235

236
////////////////////////////////////////////////////////////////////////////////
237
// True once the cursor has reached or passed the end-of-stream offset.
bool Lexer::isEOS() const {
  return !(_cursor < _eos);
}
41,721✔
238

239
////////////////////////////////////////////////////////////////////////////////
240
// Converts '0'     -> 0
241
//          '9'     -> 9
242
//          'a'/'A' -> 10
243
//          'f'/'F' -> 15
244
int Lexer::hexToInt(int c) {
36✔
245
  if (c >= '0' && c <= '9')
36✔
246
    return (c - '0');
22✔
247
  else if (c >= 'a' && c <= 'f')
14✔
248
    return (c - 'a' + 10);
2✔
249
  else
250
    return (c - 'A' + 10);
12✔
251
}
252

253
////////////////////////////////////////////////////////////////////////////////
254
// Combines two hexadecimal digit characters (most significant first) into one
// value.
int Lexer::hexToInt(int c0, int c1) {
  const int high = hexToInt(c0);
  const int low = hexToInt(c1);

  return (high << 4) + low;
}
×
255

256
////////////////////////////////////////////////////////////////////////////////
257
int Lexer::hexToInt(int c0, int c1, int c2, int c3) {
9✔
258
  return (hexToInt(c0) << 12) + (hexToInt(c1) << 8) + (hexToInt(c2) << 4) + hexToInt(c3);
9✔
259
}
260

261
////////////////////////////////////////////////////////////////////////////////
262
// Compares two strings, and returns the number bytes in common.
263
//
264
// left:   wonderful
265
// right:  wonderbread
266
// returns:     ^ 6
267
std::string::size_type Lexer::commonLength(const std::string& left, const std::string& right) {
7✔
268
  std::string::size_type l = 0;
7✔
269
  std::string::size_type r = 0;
7✔
270
  while (left[l] == right[r] && utf8_next_char(left, l) && utf8_next_char(right, r));
19✔
271

272
  return l;
7✔
273
}
274

275
////////////////////////////////////////////////////////////////////////////////
276
// Compares two strings with offsets, and returns the number bytes in common.
277
//
278
// left:   wonderful
279
// l:      ^
280
// right:  prowonderbread
281
// r:         ^
282
// returns:        ^ 6
283
std::string::size_type Lexer::commonLength(const std::string& left, std::string::size_type l,
974,101✔
284
                                           const std::string& right, std::string::size_type r) {
285
  while (left[l] == right[r] && utf8_next_char(left, l) && utf8_next_char(right, r));
1,034,234✔
286

287
  return l;
974,101✔
288
}
289

290
////////////////////////////////////////////////////////////////////////////////
291
std::string Lexer::commify(const std::string& data) {
25✔
292
  // First scan for decimal point and end of digits.
293
  int decimalPoint = -1;
25✔
294
  int end = -1;
25✔
295

296
  int i;
297
  for (int i = 0; i < (int)data.length(); ++i) {
184✔
298
    if (unicodeLatinDigit(data[i])) end = i;
159✔
299

300
    if (data[i] == '.') decimalPoint = i;
159✔
301
  }
302

303
  std::string result;
25✔
304
  if (decimalPoint != -1) {
25✔
305
    // In reverse order, transfer all digits up to, and including the decimal
306
    // point.
307
    for (i = (int)data.length() - 1; i >= decimalPoint; --i) result += data[i];
30✔
308

309
    int consecutiveDigits = 0;
10✔
310
    for (; i >= 0; --i) {
65✔
311
      if (unicodeLatinDigit(data[i])) {
55✔
312
        result += data[i];
55✔
313

314
        if (++consecutiveDigits == 3 && i && unicodeLatinDigit(data[i - 1])) {
55✔
315
          result += ',';
12✔
316
          consecutiveDigits = 0;
12✔
317
        }
318
      } else
319
        result += data[i];
×
320
    }
321
  } else {
322
    // In reverse order, transfer all digits up to, but not including the last
323
    // digit.
324
    for (i = (int)data.length() - 1; i > end; --i) result += data[i];
26✔
325

326
    int consecutiveDigits = 0;
15✔
327
    for (; i >= 0; --i) {
88✔
328
      if (unicodeLatinDigit(data[i])) {
73✔
329
        result += data[i];
67✔
330

331
        if (++consecutiveDigits == 3 && i && unicodeLatinDigit(data[i - 1])) {
67✔
332
          result += ',';
15✔
333
          consecutiveDigits = 0;
15✔
334
        }
335
      } else
336
        result += data[i];
6✔
337
    }
338
  }
339

340
  // reverse result into data.
341
  std::string done;
25✔
342
  for (int i = (int)result.length() - 1; i >= 0; --i) done += result[i];
211✔
343

344
  return done;
50✔
345
}
25✔
346

347
////////////////////////////////////////////////////////////////////////////////
348
// Returns a copy of `input` with every byte passed through tolower().
//
// Each byte is converted to unsigned char first: calling tolower() with a
// negative value other than EOF (as happens for the bytes of multi-byte UTF-8
// sequences when char is signed) is undefined behaviour.
std::string Lexer::lowerCase(const std::string& input) {
  std::string output = input;
  std::transform(output.begin(), output.end(), output.begin(),
                 [](unsigned char c) { return static_cast<char>(tolower(c)); });
  return output;
}
353

354
////////////////////////////////////////////////////////////////////////////////
355
// Returns a copy of `input` with its first byte passed through toupper().
// An empty string is returned unchanged.
//
// The byte is converted to unsigned char first: calling toupper() with a
// negative value other than EOF (e.g. the lead byte of a UTF-8 sequence when
// char is signed) is undefined behaviour.
std::string Lexer::ucFirst(const std::string& input) {
  std::string output = input;

  if (!output.empty()) {
    output[0] = static_cast<char>(toupper(static_cast<unsigned char>(output[0])));
  }

  return output;
}
×
362

363
////////////////////////////////////////////////////////////////////////////////
364
// Returns a copy of `in` with every leading character that appears in `t`
// removed. If `in` consists solely of such characters (or is empty), the
// result is empty.
std::string Lexer::trimLeft(const std::string& in, const std::string& t /*= " "*/) {
  const std::string::size_type firstKept = in.find_first_not_of(t);

  // Nothing to strip.
  if (firstKept == 0) return in;

  // firstKept may be npos here, in which case erase() removes everything.
  std::string trimmed{in};
  trimmed.erase(0, firstKept);
  return trimmed;
}
373

374
////////////////////////////////////////////////////////////////////////////////
375
// Returns a copy of `in` with every trailing character that appears in `t`
// removed.
std::string Lexer::trimRight(const std::string& in, const std::string& t /*= " "*/) {
  // When no character survives, find_last_not_of yields npos and the +1 wraps
  // to zero, so nothing is kept.
  const std::string::size_type keep = in.find_last_not_of(t) + 1;

  return in.substr(0, keep);
}
17,487✔
379

380
////////////////////////////////////////////////////////////////////////////////
381
std::string Lexer::trim(const std::string& in, const std::string& t /*= " "*/) {
17,480✔
382
  return trimLeft(trimRight(in, t), t);
17,480✔
383
}
384

385
////////////////////////////////////////////////////////////////////////////////
386
// Lexer::Type::string
387
//   '|"
388
//   [ U+XXXX | \uXXXX | \" | \' | \\ | \/ | \b | \f | \n | \r | \t | . ]
389
//   '|"
390
bool Lexer::isString(std::string& token, Lexer::Type& type, const std::string& quotes) {
21,848✔
391
  std::size_t marker = _cursor;
21,848✔
392
  if (readWord(_text, quotes, marker, token)) {
21,848✔
393
    type = Lexer::Type::string;
45✔
394
    _cursor = marker;
45✔
395
    return true;
45✔
396
  }
397

398
  return false;
21,803✔
399
}
400

401
////////////////////////////////////////////////////////////////////////////////
402
// Lexer::Type::date
403
//   <Datetime>
404
bool Lexer::isDate(std::string& token, Lexer::Type& type) {
21,803✔
405
  // Try an ISO date parse.
406
  std::size_t iso_i = 0;
21,803✔
407
  Datetime iso;
21,803✔
408
  if (iso.parse(_text.substr(_cursor), iso_i, Lexer::dateFormat)) {
21,803✔
409
    type = Lexer::Type::date;
537✔
410
    token = _text.substr(_cursor, iso_i);
537✔
411
    _cursor += iso_i;
537✔
412
    return true;
537✔
413
  }
414

415
  return false;
21,266✔
416
}
417

418
////////////////////////////////////////////////////////////////////////////////
419
// Lexer::Type::duration
420
//   <Duration>
421
bool Lexer::isDuration(std::string& token, Lexer::Type& type) {
21,266✔
422
  std::size_t marker = _cursor;
21,266✔
423

424
  std::string extractedToken;
21,266✔
425
  Lexer::Type extractedType;
426
  if (isOperator(extractedToken, extractedType)) {
21,266✔
427
    _cursor = marker;
4,299✔
428
    return false;
4,299✔
429
  }
430

431
  marker = 0;
16,967✔
432
  Duration iso;
16,967✔
433
  if (iso.parse(_text.substr(_cursor), marker)) {
16,967✔
434
    type = Lexer::Type::duration;
247✔
435
    token = _text.substr(_cursor, marker);
247✔
436
    _cursor += marker;
247✔
437
    return true;
247✔
438
  }
439

440
  return false;
16,720✔
441
}
21,266✔
442

443
////////////////////////////////////////////////////////////////////////////////
444
// Lexer::Type::uuid
445
//   XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
446
//   XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXX
447
//   XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXX
448
//   XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXX
449
//   ...
450
//   XXXXXXXX-XX
451
//   XXXXXXXX-X
452
//   XXXXXXXX-
453
//   XXXXXXXX
454
//   Followed only by EOS, whitespace, or single character operator.
455
bool Lexer::isUUID(std::string& token, Lexer::Type& type, bool endBoundary) {
36,501✔
456
  std::size_t marker = _cursor;
36,501✔
457

458
  // Greedy.
459
  std::size_t i = 0;
36,501✔
460
  for (; i < 36 && marker + i < _eos; i++) {
153,611✔
461
    if (uuid_pattern[i] == 'x') {
144,970✔
462
      if (!unicodeHexDigit(_text[marker + i])) break;
134,444✔
463
    } else if (uuid_pattern[i] != _text[marker + i])
10,526✔
464
      break;
26✔
465
  }
466

467
  if (i >= uuid_min_length &&
39,179✔
468
      (!endBoundary || !_text[marker + i] || unicodeWhitespace(_text[marker + i]) ||
2,706✔
469
       isSingleCharOperator(_text[marker + i]))) {
28✔
470
    token = _text.substr(_cursor, i);
2,650✔
471
    type = Lexer::Type::uuid;
2,650✔
472
    _cursor += i;
2,650✔
473
    return true;
2,650✔
474
  }
475

476
  return false;
33,851✔
477
}
478

479
////////////////////////////////////////////////////////////////////////////////
480
// Lexer::Type::hex
481
//   0xX+
482
bool Lexer::isHexNumber(std::string& token, Lexer::Type& type) {
16,404✔
483
  std::size_t marker = _cursor;
16,404✔
484

485
  if (_eos - marker >= 3 && _text[marker + 0] == '0' && _text[marker + 1] == 'x') {
16,404✔
486
    marker += 2;
3✔
487

488
    while (unicodeHexDigit(_text[marker])) ++marker;
9✔
489

490
    if (marker - _cursor > 2) {
3✔
491
      token = _text.substr(_cursor, marker - _cursor);
3✔
492
      type = Lexer::Type::hex;
3✔
493
      _cursor = marker;
3✔
494
      return true;
3✔
495
    }
496
  }
497

498
  return false;
16,401✔
499
}
500

501
////////////////////////////////////////////////////////////////////////////////
502
// Lexer::Type::number
503
//   0
504
//   [1-9]\d*
505
//   [ . \d+ ]
506
//   [ e|E [ +|- ] \d+ [ . \d+ ] ]
507
//   not followed by non-operator.
508
bool Lexer::isNumber(std::string& token, Lexer::Type& type) {
16,401✔
509
  std::size_t marker = _cursor;
16,401✔
510

511
  bool leading_zero = (_text[marker] == '0');
16,401✔
512

513
  if (unicodeLatinDigit(_text[marker])) {
16,401✔
514
    ++marker;
1,517✔
515

516
    // Two (or more) digit number with a leading zero are not allowed
517
    if (leading_zero && unicodeLatinDigit(_text[marker])) return false;
1,517✔
518

519
    while (unicodeLatinDigit(_text[marker])) utf8_next_char(_text, marker);
2,073✔
520

521
    if (_text[marker] == '.') {
1,505✔
522
      ++marker;
36✔
523
      if (unicodeLatinDigit(_text[marker])) {
36✔
524
        ++marker;
32✔
525
        while (unicodeLatinDigit(_text[marker])) utf8_next_char(_text, marker);
43✔
526
      }
527
    }
528

529
    if (_text[marker] == 'e' || _text[marker] == 'E') {
1,505✔
530
      ++marker;
9✔
531

532
      if (_text[marker] == '+' || _text[marker] == '-') ++marker;
9✔
533

534
      if (unicodeLatinDigit(_text[marker])) {
9✔
535
        ++marker;
6✔
536
        while (unicodeLatinDigit(_text[marker])) utf8_next_char(_text, marker);
8✔
537

538
        if (_text[marker] == '.') {
6✔
539
          ++marker;
3✔
540
          if (unicodeLatinDigit(_text[marker])) {
3✔
541
            ++marker;
3✔
542
            while (unicodeLatinDigit(_text[marker])) utf8_next_char(_text, marker);
3✔
543
          }
544
        }
545
      }
546
    }
547

548
    // Lookahead: !<unicodeWhitespace> | !<isSingleCharOperator>
549
    // If there is an immediately consecutive character, that is not an operator, fail.
550
    if (_eos > marker && !unicodeWhitespace(_text[marker]) && !isSingleCharOperator(_text[marker]))
1,505✔
551
      return false;
293✔
552

553
    token = _text.substr(_cursor, marker - _cursor);
1,212✔
554
    type = Lexer::Type::number;
1,212✔
555
    _cursor = marker;
1,212✔
556
    return true;
1,212✔
557
  }
558

559
  return false;
14,884✔
560
}
561

562
////////////////////////////////////////////////////////////////////////////////
563
// Lexer::Type::number
564
//   0
565
//   [1-9]\d*
566
//   Integers do not start with a leading 0, unless they are zero.
567
bool Lexer::isInteger(std::string& token, Lexer::Type& type) {
34,002✔
568
  std::size_t marker = _cursor;
34,002✔
569

570
  bool leading_zero = (_text[marker] == '0');
34,002✔
571

572
  if (unicodeLatinDigit(_text[marker])) {
34,002✔
573
    ++marker;
3,850✔
574
    while (unicodeLatinDigit(_text[marker])) utf8_next_char(_text, marker);
5,351✔
575

576
    // Leading zero is only allowed in the case of number 0
577
    if (leading_zero and marker - _cursor > 1) return false;
3,850✔
578

579
    token = _text.substr(_cursor, marker - _cursor);
3,810✔
580
    type = Lexer::Type::number;
3,810✔
581
    _cursor = marker;
3,810✔
582
    return true;
3,810✔
583
  }
584

585
  return false;
30,152✔
586
}
587

588
////////////////////////////////////////////////////////////////////////////////
589
// Lexer::Type::separator
590
//   --
591
bool Lexer::isSeparator(std::string& token, Lexer::Type& type) {
15,189✔
592
  if (_eos - _cursor >= 2 && _text[_cursor] == '-' && _text[_cursor + 1] == '-') {
15,189✔
593
    _cursor += 2;
776✔
594
    type = Lexer::Type::separator;
776✔
595
    token = "--";
776✔
596
    return true;
776✔
597
  }
598

599
  return false;
14,413✔
600
}
601

602
////////////////////////////////////////////////////////////////////////////////
603
// Lexer::Type::url
604
//   http [s] :// ...
605
bool Lexer::isURL(std::string& token, Lexer::Type& type) {
21,019✔
606
  std::size_t marker = _cursor;
21,019✔
607

608
  if (_eos - _cursor > 9 &&  // length 'https://*'
47,040✔
609
      (_text[marker + 0] == 'h' || _text[marker + 0] == 'H') &&
5,002✔
610
      (_text[marker + 1] == 't' || _text[marker + 1] == 'T') &&
25✔
611
      (_text[marker + 2] == 't' || _text[marker + 2] == 'T') &&
26,028✔
612
      (_text[marker + 3] == 'p' || _text[marker + 3] == 'P')) {
7✔
613
    marker += 4;
7✔
614
    if (_text[marker + 0] == 's' || _text[marker + 0] == 'S') ++marker;
7✔
615

616
    if (_text[marker + 0] == ':' && _text[marker + 1] == '/' && _text[marker + 2] == '/') {
7✔
617
      marker += 3;
7✔
618

619
      while (marker < _eos && !unicodeWhitespace(_text[marker])) utf8_next_char(_text, marker);
104✔
620

621
      token = _text.substr(_cursor, marker - _cursor);
7✔
622
      type = Lexer::Type::url;
7✔
623
      _cursor = marker;
7✔
624
      return true;
7✔
625
    }
626
  }
627

628
  return false;
21,012✔
629
}
630

631
////////////////////////////////////////////////////////////////////////////////
632
// Lexer::Type::pair
633
//   <identifier> <separator> [ <string> | <word> ]
634
//   separator '::' | ':=' | ':' | '='
635
bool Lexer::isPair(std::string& token, Lexer::Type& type) {
21,012✔
636
  std::size_t marker = _cursor;
21,012✔
637

638
  std::string ignoredToken;
21,012✔
639
  Lexer::Type ignoredType;
640
  if (isIdentifier(ignoredToken, ignoredType)) {
21,012✔
641
    // Look for a valid separator.
642
    std::string separator = _text.substr(_cursor, 2);
14,856✔
643
    if (separator == "::" || separator == ":=")
14,856✔
644
      _cursor += 2;
6✔
645
    else if (separator[0] == ':' || separator[0] == '=')
14,850✔
646
      _cursor++;
3,879✔
647
    else {
648
      _cursor = marker;
10,971✔
649
      return false;
10,971✔
650
    }
651

652
    // String, word or nothing are all valid.
653
    if (readWord(_text, "'\"", _cursor, ignoredToken) || readWord(_text, _cursor, ignoredToken) ||
15,571✔
654
        isEOS() || unicodeWhitespace(_text[_cursor])) {
11,686✔
655
      token = _text.substr(marker, _cursor - marker);
3,885✔
656
      type = Lexer::Type::pair;
3,885✔
657
      return true;
3,885✔
658
    }
659
  }
14,856✔
660

661
  _cursor = marker;
6,156✔
662
  return false;
6,156✔
663
}
21,012✔
664

665
////////////////////////////////////////////////////////////////////////////////
666
// Lexer::Type::set
667
//   a single number:      1
668
//   a list of numbers:    1,3,5
669
//   a range:              5-10
670
//   or a combination:     1,3,5-10
671
//
672
//   <id> [ - <id> ] [ , <id> [ - <id> ] ] ...
673
bool Lexer::isSet(std::string& token, Lexer::Type& type) {
16,983✔
674
  std::size_t marker = _cursor;
16,983✔
675
  int count = 0;
16,983✔
676
  std::string dummyToken;
16,983✔
677
  Lexer::Type dummyType;
678

679
  do {
680
    if (isInteger(dummyToken, dummyType)) {
17,008✔
681
      ++count;
1,907✔
682
      if (isLiteral("-", false, false)) {
3,814✔
683
        if (isInteger(dummyToken, dummyType))
104✔
684
          ++count;
66✔
685
        else {
686
          _cursor = marker;
38✔
687
          return false;
38✔
688
        }
689
      }
690
    } else {
691
      _cursor = marker;
15,101✔
692
      return false;
15,101✔
693
    }
694
  } while (isLiteral(",", false, false));
3,738✔
695

696
  // Success is multiple numbers, matching the pattern.
697
  if (count > 1 && (isEOS() || unicodeWhitespace(_text[_cursor]) ||
1,851✔
698
                    isHardBoundary(_text[_cursor], _text[_cursor + 1]))) {
7✔
699
    token = _text.substr(marker, _cursor - marker);
65✔
700
    type = Lexer::Type::set;
65✔
701
    return true;
65✔
702
  }
703

704
  _cursor = marker;
1,779✔
705
  return false;
1,779✔
706
}
16,983✔
707

708
////////////////////////////////////////////////////////////////////////////////
709
// Lexer::Type::tag
710
//   ^ | '(' | ')' | <unicodeWhitespace>
711
//     [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]*
712
bool Lexer::isTag(std::string& token, Lexer::Type& type) {
14,413✔
713
  std::size_t marker = _cursor;
14,413✔
714

715
  // Lookbehind: Assert ^ or preceded by whitespace, (, or ).
716
  if (marker > 0 && !unicodeWhitespace(_text[marker - 1]) && _text[marker - 1] != '(' &&
14,760✔
717
      _text[marker - 1] != ')')
347✔
718
    return false;
347✔
719

720
  if (_text[marker] == '+' || _text[marker] == '-') {
14,066✔
721
    ++marker;
1,591✔
722

723
    if (isIdentifierStart(_text[marker])) {
1,591✔
724
      utf8_next_char(_text, marker);
1,527✔
725

726
      while (isIdentifierNext(_text[marker])) utf8_next_char(_text, marker);
9,106✔
727

728
      token = _text.substr(_cursor, marker - _cursor);
1,527✔
729
      type = Lexer::Type::tag;
1,527✔
730
      _cursor = marker;
1,527✔
731
      return true;
1,527✔
732
    }
733
  }
734

735
  return false;
12,539✔
736
}
737

738
////////////////////////////////////////////////////////////////////////////////
739
// Lexer::Type::path
740
//   ( / <non-slash, non-whitespace> )+
741
bool Lexer::isPath(std::string& token, Lexer::Type& type) {
12,886✔
742
  std::size_t marker = _cursor;
12,886✔
743
  int slashCount = 0;
12,886✔
744

745
  while (true) {
746
    if (_text[marker] == '/') {
13,112✔
747
      ++marker;
385✔
748
      ++slashCount;
385✔
749
    } else
750
      break;
12,727✔
751

752
    if (_text[marker] && !unicodeWhitespace(_text[marker]) && _text[marker] != '/') {
385✔
753
      utf8_next_char(_text, marker);
226✔
754
      while (_text[marker] && !unicodeWhitespace(_text[marker]) && _text[marker] != '/')
920✔
755
        utf8_next_char(_text, marker);
694✔
756
    } else
757
      break;
159✔
758
  }
759

760
  if (marker > _cursor && slashCount > 3) {
12,886✔
761
    type = Lexer::Type::path;
4✔
762
    token = _text.substr(_cursor, marker - _cursor);
4✔
763
    _cursor = marker;
4✔
764
    return true;
4✔
765
  }
766

767
  return false;
12,882✔
768
}
769

770
////////////////////////////////////////////////////////////////////////////////
771
// Lexer::Type::substitution
772
//   / <unquoted-string> / <unquoted-string> / [g]  <EOS> | <unicodeWhitespace>
773
bool Lexer::isSubstitution(std::string& token, Lexer::Type& type) {
12,882✔
774
  std::size_t marker = _cursor;
12,882✔
775

776
  std::string word;
12,882✔
777
  if (readWord(_text, "/", _cursor, word)) {
38,646✔
778
    --_cursor;  // Step backwards over the '/'.
165✔
779

780
    if (readWord(_text, "/", _cursor, word)) {
495✔
781
      if (_text[_cursor] == 'g') ++_cursor;
44✔
782

783
      // Lookahread: <EOS> | <unicodeWhitespace>
784
      if (_text[_cursor] == '\0' || unicodeWhitespace(_text[_cursor])) {
44✔
785
        token = _text.substr(marker, _cursor - marker);
41✔
786
        type = Lexer::Type::substitution;
41✔
787
        return true;
41✔
788
      }
789
    }
790
  }
791

792
  _cursor = marker;
12,841✔
793
  return false;
12,841✔
794
}
12,882✔
795

796
////////////////////////////////////////////////////////////////////////////////
797
// Lexer::Type::pattern
798
//   / <unquoted-string> /  <EOS> | <unicodeWhitespace>
799
bool Lexer::isPattern(std::string& token, Lexer::Type& type) {
12,841✔
800
  std::size_t marker = _cursor;
12,841✔
801

802
  std::string word;
12,841✔
803
  if (readWord(_text, "/", _cursor, word) && (isEOS() || unicodeWhitespace(_text[_cursor]))) {
38,523✔
804
    token = _text.substr(marker, _cursor - marker);
118✔
805
    type = Lexer::Type::pattern;
118✔
806
    return true;
118✔
807
  }
808

809
  _cursor = marker;
12,723✔
810
  return false;
12,723✔
811
}
12,841✔
812

813
////////////////////////////////////////////////////////////////////////////////
814
// Lexer::Type::op
815
//   _hastag_ | _notag | _neg_ | _pos_ |
816
//   <isTripleCharOperator> |
817
//   <isDoubleCharOperator> |
818
//   <isSingleCharOperator>
819
bool Lexer::isOperator(std::string& token, Lexer::Type& type) {
33,989✔
820
  std::size_t marker = _cursor;
33,989✔
821

822
  if (_eos - marker >= 8 && _text.substr(marker, 8) == "_hastag_") {
33,989✔
823
    marker += 8;
4✔
824
    type = Lexer::Type::op;
4✔
825
    token = _text.substr(_cursor, marker - _cursor);
4✔
826
    _cursor = marker;
4✔
827
    return true;
4✔
828
  }
829

830
  else if (_eos - marker >= 7 && _text.substr(marker, 7) == "_notag_") {
33,985✔
831
    marker += 7;
4✔
832
    type = Lexer::Type::op;
4✔
833
    token = _text.substr(_cursor, marker - _cursor);
4✔
834
    _cursor = marker;
4✔
835
    return true;
4✔
836
  }
837

838
  else if (_eos - marker >= 5 && _text.substr(marker, 5) == "_neg_") {
33,981✔
839
    marker += 5;
4✔
840
    type = Lexer::Type::op;
4✔
841
    token = _text.substr(_cursor, marker - _cursor);
4✔
842
    _cursor = marker;
4✔
843
    return true;
4✔
844
  }
845

846
  else if (_eos - marker >= 5 && _text.substr(marker, 5) == "_pos_") {
33,977✔
847
    marker += 5;
4✔
848
    type = Lexer::Type::op;
4✔
849
    token = _text.substr(_cursor, marker - _cursor);
4✔
850
    _cursor = marker;
4✔
851
    return true;
4✔
852
  }
853

854
  else if (_eos - marker >= 3 && isTripleCharOperator(_text[marker], _text[marker + 1],
61,419✔
855
                                                      _text[marker + 2], _text[marker + 3])) {
27,446✔
856
    marker += 3;
90✔
857
    type = Lexer::Type::op;
90✔
858
    token = _text.substr(_cursor, marker - _cursor);
90✔
859
    _cursor = marker;
90✔
860
    return true;
90✔
861
  }
862

863
  else if (_eos - marker >= 2 &&
63,174✔
864
           isDoubleCharOperator(_text[marker], _text[marker + 1], _text[marker + 2])) {
29,291✔
865
    marker += 2;
138✔
866
    type = Lexer::Type::op;
138✔
867
    token = _text.substr(_cursor, marker - _cursor);
138✔
868
    _cursor = marker;
138✔
869
    return true;
138✔
870
  }
871

872
  else if (isSingleCharOperator(_text[marker])) {
33,745✔
873
    token = _text[marker];
5,888✔
874
    type = Lexer::Type::op;
5,888✔
875
    _cursor = ++marker;
5,888✔
876
    return true;
5,888✔
877
  }
878

879
  return false;
27,857✔
880
}
881

882
////////////////////////////////////////////////////////////////////////////////
883
// Lexer::Type::dom
884
//   [ <isUUID> | <isDigit>+ . ] <isIdentifier> [ . <isIdentifier> ]*
885
//
886
// Configuration:
887
//   rc.<name>
888
//
889
// System:
890
//   tw.syncneeded
891
//   tw.program
892
//   tw.args
893
//   tw.width
894
//   tw.height
895
//   context.program      // 2017-02-25 Deprecated in 2.6.0
896
//   context.args         // 2017-02-25 Deprecated in 2.6.0
897
//   context.width        // 2017-02-25 Deprecated in 2.6.0
898
//   context.height       // 2017-02-25 Deprecated in 2.6.0
899
//   system.version
900
//   system.os
901
//
902
// Relative or absolute attribute:
903
//   <attribute>
904
//   <id>.<attribute>
905
//   <uuid>.<attribute>
906
//
907
// Single tag:
908
//   tags.<word>
909
//
910
// Date type:
911
//   <date>.year
912
//   <date>.month
913
//   <date>.day
914
//   <date>.week
915
//   <date>.weekday
916
//   <date>.julian
917
//   <date>.hour
918
//   <date>.minute
919
//   <date>.second
920
//
921
// Annotations (entry is a date):
922
//   annotations.count
923
//   annotations.<N>.entry
924
//   annotations.<N>.description
925
//
926
bool Lexer::isDOM(std::string& token, Lexer::Type& type) {
16,918✔
927
  std::size_t marker = _cursor;
16,918✔
928

929
  // rc. ...
930
  std::string partialToken;
16,918✔
931
  Lexer::Type partialType;
932
  if (isLiteral("rc.", false, false) && isWord(partialToken, partialType)) {
50,754✔
933
    token = _text.substr(marker, _cursor - marker);
8✔
934
    type = Lexer::Type::dom;
8✔
935
    return true;
8✔
936
  } else
937
    _cursor = marker;
16,910✔
938

939
  // Literals
940
  if (isOneOf({"tw.syncneeded", "tw.program", "tw.args", "tw.width", "tw.height", "tw.version",
50,730✔
941
               "context.program", "context.args", "context.width", "context.height",
942
               "system.version", "system.os"},
943
              false, true)) {
944
    token = _text.substr(marker, _cursor - marker);
21✔
945
    type = Lexer::Type::dom;
21✔
946
    return true;
21✔
947
  }
948

949
  // Optional:
950
  //   <uuid>.
951
  //   <id>.
952
  std::string extractedToken;
16,889✔
953
  Lexer::Type extractedType;
954
  if (isUUID(extractedToken, extractedType, false) || isInteger(extractedToken, extractedType)) {
16,889✔
955
    if (!isLiteral(".", false, false)) {
3,672✔
956
      _cursor = marker;
1,478✔
957
      return false;
1,478✔
958
    }
959
  }
960

961
  // Any failure after this line should rollback to the checkpoint.
962
  std::size_t checkpoint = _cursor;
15,411✔
963

964
  // [prefix]tags.<word>
965
  if (isLiteral("tags", false, false) && isLiteral(".", false, false) &&
46,335✔
966
      isWord(partialToken, partialType)) {
18✔
967
    token = _text.substr(marker, _cursor - marker);
18✔
968
    type = Lexer::Type::dom;
18✔
969
    return true;
18✔
970
  } else
971
    _cursor = checkpoint;
15,393✔
972

973
  // [prefix]attribute (bounded)
974
  if (isOneOf(attributes, false, true)) {
15,393✔
975
    token = _text.substr(marker, _cursor - marker);
419✔
976
    type = Lexer::Type::dom;
419✔
977
    return true;
419✔
978
  }
979

980
  // [prefix]attribute. (unbounded)
981
  if (isOneOf(attributes, false, false)) {
14,974✔
982
    if (isLiteral(".", false, false)) {
334✔
983
      std::string attribute = _text.substr(checkpoint, _cursor - checkpoint - 1);
49✔
984

985
      // if attribute type is 'date', then it has sub-elements.
986
      if (attributes[attribute] == "date" &&
66✔
987
          isOneOf({"year", "month", "day", "week", "weekday", "julian", "hour", "minute", "second"},
100✔
988
                  false, true)) {
989
        token = _text.substr(marker, _cursor - marker);
17✔
990
        type = Lexer::Type::dom;
17✔
991
        return true;
17✔
992
      }
993

994
      _cursor = checkpoint;
32✔
995
    }
49✔
996

997
    // Lookahead: !<alpha>
998
    else if (!unicodeLatinAlpha(_text[marker])) {
118✔
999
      token = _text.substr(marker, _cursor - marker);
×
1000
      type = Lexer::Type::dom;
×
1001
      return true;
×
1002
    }
1003

1004
    _cursor = checkpoint;
150✔
1005
  }
1006

1007
  // [prefix]annotations.
1008
  if (isLiteral("annotations", true, false) && isLiteral(".", false, false)) {
45,185✔
1009
    if (isLiteral("count", false, false)) {
62✔
1010
      token = _text.substr(marker, _cursor - marker);
4✔
1011
      type = Lexer::Type::dom;
4✔
1012
      return true;
31✔
1013
    }
1014

1015
    std::string extractedToken;
27✔
1016
    Lexer::Type extractedType;
1017
    if (isInteger(extractedToken, extractedType)) {
27✔
1018
      if (isLiteral(".", false, false)) {
54✔
1019
        if (isLiteral("description", false, true)) {
54✔
1020
          token = _text.substr(marker, _cursor - marker);
17✔
1021
          type = Lexer::Type::dom;
17✔
1022
          return true;
17✔
1023
        } else if (isLiteral("entry", false, true)) {
20✔
1024
          token = _text.substr(marker, _cursor - marker);
7✔
1025
          type = Lexer::Type::dom;
7✔
1026
          return true;
7✔
1027
        } else if (isLiteral("entry", false, false) && isLiteral(".", false, false) &&
21✔
1028
                   isOneOf({"year", "month", "day", "week", "weekday", "julian", "hour", "minute",
12✔
1029
                            "second"},
1030
                           false, true)) {
1031
          token = _text.substr(marker, _cursor - marker);
3✔
1032
          type = Lexer::Type::dom;
3✔
1033
          return true;
3✔
1034
        }
1035
      }
1036
    } else
1037
      _cursor = checkpoint;
×
1038
  }
27✔
1039

1040
  _cursor = marker;
14,926✔
1041
  return false;
14,926✔
1042
}
16,918✔
1043

1044
////////////////////////////////////////////////////////////////////////////////
1045
// Lexer::Type::identifier
1046
//   <isIdentifierStart> [ <isIdentifierNext> ]*
1047
bool Lexer::isIdentifier(std::string& token, Lexer::Type& type) {
31,902✔
1048
  std::size_t marker = _cursor;
31,902✔
1049

1050
  if (isIdentifierStart(_text[marker])) {
31,902✔
1051
    utf8_next_char(_text, marker);
25,426✔
1052

1053
    while (isIdentifierNext(_text[marker])) utf8_next_char(_text, marker);
147,064✔
1054

1055
    token = _text.substr(_cursor, marker - _cursor);
25,426✔
1056
    type = Lexer::Type::identifier;
25,426✔
1057
    _cursor = marker;
25,426✔
1058
    return true;
25,426✔
1059
  }
1060

1061
  return false;
6,476✔
1062
}
1063

1064
////////////////////////////////////////////////////////////////////////////////
1065
// Lexer::Type::word
1066
//   [^\s]+   (also ends at any <isSingleCharOperator>)
1067
bool Lexer::isWord(std::string& token, Lexer::Type& type) {
346✔
1068
  std::size_t marker = _cursor;
346✔
1069

1070
  while (_text[marker] && !unicodeWhitespace(_text[marker]) && !isSingleCharOperator(_text[marker]))
2,417✔
1071
    utf8_next_char(_text, marker);
2,071✔
1072

1073
  if (marker > _cursor) {
346✔
1074
    token = _text.substr(_cursor, marker - _cursor);
346✔
1075
    type = Lexer::Type::word;
346✔
1076
    _cursor = marker;
346✔
1077
    return true;
346✔
1078
  }
1079

1080
  return false;
×
1081
}
1082

1083
////////////////////////////////////////////////////////////////////////////////
1084
bool Lexer::isLiteral(const std::string& literal, bool allowAbbreviations, bool endBoundary) {
974,100✔
1085
  auto common = commonLength(literal, 0, _text, _cursor);
974,100✔
1086

1087
  // Without abbreviations, common must equal literal length.
1088
  if (!allowAbbreviations && common < literal.length()) return false;
974,100✔
1089

1090
  // Abbreviations must meet the minimum size.
1091
  if (allowAbbreviations && common < minimumMatchLength) return false;
16,457✔
1092

1093
  // End boundary conditions must be met.
1094
  if (endBoundary && _text[_cursor + common] && !unicodeWhitespace(_text[_cursor + common]) &&
1,861✔
1095
      !Lexer::isSingleCharOperator(_text[_cursor + common]))
204✔
1096
    return false;
172✔
1097

1098
  _cursor += common;
1,485✔
1099
  return true;
1,485✔
1100
}
1101

1102
////////////////////////////////////////////////////////////////////////////////
1103
bool Lexer::isOneOf(const std::vector<std::string>& options, bool allowAbbreviations,
16,932✔
1104
                    bool endBoundary) {
1105
  for (auto& item : options)
219,793✔
1106
    if (isLiteral(item, allowAbbreviations, endBoundary)) return true;
202,903✔
1107

1108
  return false;
16,890✔
1109
}
1110

1111
////////////////////////////////////////////////////////////////////////////////
1112
bool Lexer::isOneOf(const std::map<std::string, std::string>& options, bool allowAbbreviations,
30,367✔
1113
                    bool endBoundary) {
1114
  for (auto& item : options)
747,607✔
1115
    if (isLiteral(item.first, allowAbbreviations, endBoundary)) return true;
717,826✔
1116

1117
  return false;
29,781✔
1118
}
1119

1120
////////////////////////////////////////////////////////////////////////////////
1121
// Static
1122
std::string Lexer::typeToString(Lexer::Type type) {
  // Returns the name of 'type' wrapped in terminal escape sequences: a
  // per-type prefix, the name, then the "\033[0m" reset. Types without an
  // entry below are rendered as "unknown".
  const char* prefix = "\033[37;41m";
  const char* label = "unknown";

  switch (type) {
    case Lexer::Type::string:
      prefix = "\033[38;5;7m\033[48;5;3m";
      label = "string";
      break;
    case Lexer::Type::uuid:
      prefix = "\033[38;5;7m\033[48;5;10m";
      label = "uuid";
      break;
    case Lexer::Type::hex:
      prefix = "\033[38;5;7m\033[48;5;14m";
      label = "hex";
      break;
    case Lexer::Type::number:
      prefix = "\033[38;5;7m\033[48;5;6m";
      label = "number";
      break;
    case Lexer::Type::separator:
      prefix = "\033[38;5;7m\033[48;5;4m";
      label = "separator";
      break;
    case Lexer::Type::url:
      prefix = "\033[38;5;7m\033[48;5;4m";
      label = "url";
      break;
    case Lexer::Type::pair:
      prefix = "\033[38;5;7m\033[48;5;1m";
      label = "pair";
      break;
    case Lexer::Type::set:
      prefix = "\033[38;5;15m\033[48;5;208m";
      label = "set";
      break;
    case Lexer::Type::tag:
      prefix = "\033[37;45m";
      label = "tag";
      break;
    case Lexer::Type::path:
      prefix = "\033[37;102m";
      label = "path";
      break;
    case Lexer::Type::substitution:
      prefix = "\033[37;102m";
      label = "substitution";
      break;
    case Lexer::Type::pattern:
      prefix = "\033[37;42m";
      label = "pattern";
      break;
    case Lexer::Type::op:
      prefix = "\033[38;5;7m\033[48;5;203m";
      label = "op";
      break;
    case Lexer::Type::dom:
      prefix = "\033[38;5;15m\033[48;5;244m";
      label = "dom";
      break;
    case Lexer::Type::identifier:
      prefix = "\033[38;5;15m\033[48;5;244m";
      label = "identifier";
      break;
    case Lexer::Type::word:
      prefix = "\033[38;5;15m\033[48;5;236m";
      label = "word";
      break;
    case Lexer::Type::date:
      prefix = "\033[38;5;15m\033[48;5;34m";
      label = "date";
      break;
    case Lexer::Type::duration:
      prefix = "\033[38;5;15m\033[48;5;34m";
      label = "duration";
      break;
    default:
      break;
  }

  return std::string(prefix) + label + "\033[0m";
}
1162

1163
////////////////////////////////////////////////////////////////////////////////
1164
bool Lexer::isAllDigits(const std::string& text) {
  // True when 'text' is non-empty and consists only of the characters 0-9.
  if (text.empty()) return false;

  return text.find_first_not_of("0123456789") == std::string::npos;
}
1167

1168
////////////////////////////////////////////////////////////////////////////////
1169
// This is intentionally looking for a single token.
1170
bool Lexer::isDOM(const std::string& text) {
728✔
1171
  Lexer lex(text);
728✔
1172
  int count = 0;
728✔
1173
  std::string token;
728✔
1174
  Lexer::Type type;
1175
  while (lex.token(token, type)) ++count;
1,484✔
1176

1177
  return count == 1 && type == Lexer::Type::dom;
1,456✔
1178
}
728✔
1179

1180
////////////////////////////////////////////////////////////////////////////////
1181
// Full implementation of a quoted word.  Includes:
1182
//   '\''
1183
//   '"'
1184
//   "'"
1185
//   "\""
1186
//   'one two'
1187
// Result includes the quotes.
1188
bool Lexer::readWord(const std::string& text, const std::string& quotes,
52,119✔
1189
                     std::string::size_type& cursor, std::string& word) {
1190
  if (quotes.find(text[cursor]) == std::string::npos) return false;
52,119✔
1191

1192
  std::string::size_type eos = text.length();
1,020✔
1193
  int quote = text[cursor++];
1,020✔
1194
  word = quote;
1,020✔
1195

1196
  int c;
1197
  while ((c = text[cursor])) {
6,388✔
1198
    // Quoted word ends on a quote.
1199
    if (quote && quote == c) {
6,255✔
1200
      word += utf8_character(utf8_next_char(text, cursor));
887✔
1201
      break;
887✔
1202
    }
1203

1204
    // Unicode U+XXXX or \uXXXX codepoint.
1205
    else if (eos - cursor >= 6 &&
13,558✔
1206
             ((text[cursor + 0] == 'U' && text[cursor + 1] == '+') ||
2,822✔
1207
              (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
2,819✔
1208
             unicodeHexDigit(text[cursor + 2]) && unicodeHexDigit(text[cursor + 3]) &&
7✔
1209
             unicodeHexDigit(text[cursor + 4]) && unicodeHexDigit(text[cursor + 5])) {
8,190✔
1210
      word += utf8_character(
14✔
1211
          hexToInt(text[cursor + 2], text[cursor + 3], text[cursor + 4], text[cursor + 5]));
14✔
1212
      cursor += 6;
7✔
1213
    }
1214

1215
    // An escaped thing.
1216
    else if (c == '\\') {
5,361✔
1217
      c = text[++cursor];
23✔
1218

1219
      switch (c) {
23✔
1220
        case '"':
3✔
1221
          word += (char)0x22;
3✔
1222
          ++cursor;
3✔
1223
          break;
3✔
1224
        case '\'':
6✔
1225
          word += (char)0x27;
6✔
1226
          ++cursor;
6✔
1227
          break;
6✔
1228
        case '\\':
×
1229
          word += (char)0x5C;
×
1230
          ++cursor;
×
1231
          break;
×
1232
        case 'b':
×
1233
          word += (char)0x08;
×
1234
          ++cursor;
×
1235
          break;
×
1236
        case 'f':
×
1237
          word += (char)0x0C;
×
1238
          ++cursor;
×
1239
          break;
×
1240
        case 'n':
×
1241
          word += (char)0x0A;
×
1242
          ++cursor;
×
1243
          break;
×
1244
        case 'r':
×
1245
          word += (char)0x0D;
×
1246
          ++cursor;
×
1247
          break;
×
1248
        case 't':
×
1249
          word += (char)0x09;
×
1250
          ++cursor;
×
1251
          break;
×
1252
        case 'v':
×
1253
          word += (char)0x0B;
×
1254
          ++cursor;
×
1255
          break;
×
1256

1257
        // This pass-through default case means that anything can be escaped
1258
        // harmlessly. In particular 'quote' is included, if it not one of the
1259
        // above characters.
1260
        default:
14✔
1261
          word += (char)c;
14✔
1262
          ++cursor;
14✔
1263
          break;
14✔
1264
      }
1265
    }
1266

1267
    // Ordinary character.
1268
    else
1269
      word += utf8_character(utf8_next_char(text, cursor));
5,338✔
1270
  }
1271

1272
  // Verify termination.
1273
  return word[0] == quote && word[word.length() - 1] == quote && word.length() >= 2;
1,020✔
1274
}
1275

1276
////////////////////////////////////////////////////////////////////////////////
1277
// Full implementation of an unquoted word.  Includes:
1278
//   one\ two
1279
//   abcU+0020def
1280
//   abc\u0020def
1281
//   a\tb
1282
//
1283
// Ends at:
1284
//   Lexer::isEOS
1285
//   unicodeWhitespace
1286
//   Lexer::isHardBoundary
1287
bool Lexer::readWord(const std::string& text, std::string::size_type& cursor, std::string& word) {
3,881✔
1288
  std::string::size_type eos = text.length();
3,881✔
1289

1290
  word = "";
3,881✔
1291
  int c;
1292
  int prev = 0;
3,881✔
1293
  while ((c = text[cursor]))  // Handles EOS.
27,578✔
1294
  {
1295
    // Unquoted word ends on white space.
1296
    if (unicodeWhitespace(c)) break;
24,258✔
1297

1298
    // Parentheses mostly.
1299
    if (prev && Lexer::isHardBoundary(prev, c)) break;
23,717✔
1300

1301
    // Unicode U+XXXX or \uXXXX codepoint.
1302
    else if (eos - cursor >= 6 &&
58,365✔
1303
             ((text[cursor + 0] == 'U' && text[cursor + 1] == '+') ||
10,971✔
1304
              (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
10,970✔
1305
             unicodeHexDigit(text[cursor + 2]) && unicodeHexDigit(text[cursor + 3]) &&
2✔
1306
             unicodeHexDigit(text[cursor + 4]) && unicodeHexDigit(text[cursor + 5])) {
34,668✔
1307
      word += utf8_character(
4✔
1308
          hexToInt(text[cursor + 2], text[cursor + 3], text[cursor + 4], text[cursor + 5]));
4✔
1309
      cursor += 6;
2✔
1310
    }
1311

1312
    // An escaped thing.
1313
    else if (c == '\\') {
23,695✔
1314
      c = text[++cursor];
5✔
1315

1316
      switch (c) {
5✔
1317
        case '"':
×
1318
          word += (char)0x22;
×
1319
          ++cursor;
×
1320
          break;
×
1321
        case '\'':
×
1322
          word += (char)0x27;
×
1323
          ++cursor;
×
1324
          break;
×
1325
        case '\\':
×
1326
          word += (char)0x5C;
×
1327
          ++cursor;
×
1328
          break;
×
1329
        case 'b':
×
1330
          word += (char)0x08;
×
1331
          ++cursor;
×
1332
          break;
×
1333
        case 'f':
×
1334
          word += (char)0x0C;
×
1335
          ++cursor;
×
1336
          break;
×
1337
        case 'n':
×
1338
          word += (char)0x0A;
×
1339
          ++cursor;
×
1340
          break;
×
1341
        case 'r':
×
1342
          word += (char)0x0D;
×
1343
          ++cursor;
×
1344
          break;
×
1345
        case 't':
×
1346
          word += (char)0x09;
×
1347
          ++cursor;
×
1348
          break;
×
1349
        case 'v':
×
1350
          word += (char)0x0B;
×
1351
          ++cursor;
×
1352
          break;
×
1353

1354
        // This pass-through default case means that anything can be escaped
1355
        // harmlessly. In particular 'quote' is included, if it not one of the
1356
        // above characters.
1357
        default:
5✔
1358
          word += (char)c;
5✔
1359
          ++cursor;
5✔
1360
          break;
5✔
1361
      }
1362
    }
1363

1364
    // Ordinary character.
1365
    else
1366
      word += utf8_character(utf8_next_char(text, cursor));
23,690✔
1367

1368
    prev = c;
23,697✔
1369
  }
1370

1371
  return word.length() > 0 ? true : false;
3,881✔
1372
}
1373

1374
////////////////////////////////////////////////////////////////////////////////
1375
// <name> [. <modifier>] <: | = | :: | :=> [<value>]
1376
bool Lexer::decomposePair(const std::string& text, std::string& name, std::string& modifier,
                          std::string& separator, std::string& value) {
  // Splits 'text' of the form <name>[.<modifier>]<separator>[<value>], where
  // the separator is one of ':', '=', '::' or ':='.
  //   name, modifier, separator, value  out: the parts found; modifier is
  //           empty when there is no '.' before the separator.
  // Returns true when a separator was found and the name is not empty. When
  // no separator exists the outputs are left untouched; when the name turns
  // out empty the outputs have already been written.

  // The dominant separator is the one seen first, so it starts at the first
  // ':' or '=' in the text.
  const std::string::size_type sep = text.find_first_of(":=");
  if (sep == std::string::npos) return false;

  // A ':' immediately followed by ':' or '=' forms a two-character separator;
  // everything else is a single character.
  std::string::size_type width = 1;
  if (text[sep] == ':' && sep + 1 < text.length() &&
      (text[sep + 1] == ':' || text[sep + 1] == '='))
    width = 2;

  // The only remaining unknown is whether there is a modifier, which requires
  // a '.' ahead of the separator.
  const std::string::size_type dot = text.find('.');
  if (dot != std::string::npos && dot < sep) {
    name = text.substr(0, dot);
    modifier = text.substr(dot + 1, sep - dot - 1);
  } else {
    name = text.substr(0, sep);
    modifier = "";
  }

  separator = text.substr(sep, width);
  value = text.substr(sep + width);

  // An empty name is an error.
  return name.length() > 0;
}
1434

1435
////////////////////////////////////////////////////////////////////////////////
1436
// / <from> / <to> / [<flags>]
1437
bool Lexer::decomposeSubstitution(const std::string& text, std::string& from, std::string& to,
37✔
1438
                                  std::string& flags) {
1439
  std::string parsed_from;
37✔
1440
  std::string::size_type cursor = 0;
37✔
1441
  if (readWord(text, "/", cursor, parsed_from) && parsed_from.length()) {
111✔
1442
    --cursor;
37✔
1443
    std::string parsed_to;
37✔
1444
    if (readWord(text, "/", cursor, parsed_to)) {
74✔
1445
      std::string parsed_flags = text.substr(cursor);
37✔
1446
      if (parsed_flags.find('/') == std::string::npos) {
37✔
1447
        dequote(parsed_from, "/");
74✔
1448
        dequote(parsed_to, "/");
37✔
1449

1450
        from = parsed_from;
37✔
1451
        to = parsed_to;
37✔
1452
        flags = parsed_flags;
37✔
1453
        return true;
37✔
1454
      }
1455
    }
37✔
1456
  }
37✔
1457

1458
  return false;
×
1459
}
37✔
1460

1461
////////////////////////////////////////////////////////////////////////////////
1462
// / <pattern> / [<flags>]
1463
bool Lexer::decomposePattern(const std::string& text, std::string& pattern, std::string& flags) {
110✔
1464
  std::string ignored;
110✔
1465
  std::string::size_type cursor = 0;
110✔
1466
  if (readWord(text, "/", cursor, ignored) && ignored.length()) {
330✔
1467
    auto parsed_flags = text.substr(cursor);
110✔
1468
    if (parsed_flags.find('/') == std::string::npos) {
110✔
1469
      flags = parsed_flags;
110✔
1470
      pattern = text.substr(1, cursor - 2 - flags.length());
110✔
1471
      return true;
110✔
1472
    }
1473
  }
110✔
1474

1475
  return false;
×
1476
}
110✔
1477

1478
////////////////////////////////////////////////////////////////////////////////
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc