• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

earwig / mwparserfromhell / 10014163542

19 Jul 2024 08:49PM CUT coverage: 99.201% (-0.002%) from 99.203%
10014163542

Pull #326

github

web-flow
Merge 8c23031f1 into 4e73af2fa
Pull Request #326: Make fallthrough explicit in tok_parse.c

2979 of 3003 relevant lines covered (99.2%)

9.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.03
/src/mwparserfromhell/parser/tokenizer.py
1
# Copyright (C) 2012-2021 Ben Kurtovic <ben.kurtovic@gmail.com>
2
#
3
# Permission is hereby granted, free of charge, to any person obtaining a copy
4
# of this software and associated documentation files (the "Software"), to deal
5
# in the Software without restriction, including without limitation the rights
6
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
# copies of the Software, and to permit persons to whom the Software is
8
# furnished to do so, subject to the following conditions:
9
#
10
# The above copyright notice and this permission notice shall be included in
11
# all copies or substantial portions of the Software.
12
#
13
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
# SOFTWARE.
20

21
import html.entities as htmlentities
10✔
22
from math import log
10✔
23
import re
10✔
24

25
from . import contexts, tokens
10✔
26
from .errors import ParserError
10✔
27
from ..definitions import (
10✔
28
    get_html_tag,
29
    is_parsable,
30
    is_single,
31
    is_single_only,
32
    is_scheme,
33
)
34

35
__all__ = ["Tokenizer"]
10✔
36

37

38
class BadRoute(Exception):
    """Internal signal that the tokenization route being tried is invalid.

    The *context* passed at construction time (``0`` by default) is kept on
    the instance as :attr:`context` so that handlers can inspect it.
    """

    def __init__(self, context=0):
        # No message is forwarded to Exception; only the context is recorded.
        super().__init__()
        self.context = context
44

45

46
class _TagOpenData:
10✔
47
    """Stores data about an HTML open tag, like ``<ref name="foo">``."""
48

49
    CX_NAME = 1 << 0
10✔
50
    CX_ATTR_READY = 1 << 1
10✔
51
    CX_ATTR_NAME = 1 << 2
10✔
52
    CX_ATTR_VALUE = 1 << 3
10✔
53
    CX_QUOTED = 1 << 4
10✔
54
    CX_NOTE_SPACE = 1 << 5
10✔
55
    CX_NOTE_EQUALS = 1 << 6
10✔
56
    CX_NOTE_QUOTE = 1 << 7
10✔
57

58
    def __init__(self):
10✔
59
        self.context = self.CX_NAME
10✔
60
        self.padding_buffer = {"first": "", "before_eq": "", "after_eq": ""}
10✔
61
        self.quoter = None
10✔
62
        self.reset = 0
10✔
63

64

65
class Tokenizer:
10✔
66
    """Creates a list of tokens from a string of wikicode."""
67

68
    USES_C = False

    # Sentinels returned by _read() for positions before the start of the
    # text and past its end, respectively.
    START = object()
    END = object()

    # Single-character markers, plus the two sentinels above.
    MARKERS = [
        "{", "}", "[", "]", "<", ">", "|", "=", "&", "'", '"',
        "#", "*", ";", ":", "/", "-", "!", "\n",
        START, END,
    ]

    # Characters accepted while reading a URI scheme.
    URISCHEME = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"

    # Upper bound on nested stacks; see _can_recurse().
    MAX_DEPTH = 100

    # NOTE(review): presumably used to split the input text on marker
    # characters; the call site is not visible in this part of the file.
    regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)

    # Splits tag text on runs of whitespace, quotes and backslashes.
    tag_splitter = re.compile(r"([\s\"\'\\]+)")
98

99
    def __init__(self):
10✔
100
        self._text = None
10✔
101
        self._head = 0
10✔
102
        self._stacks = []
10✔
103
        self._global = 0
10✔
104
        self._depth = 0
10✔
105
        self._bad_routes = set()
10✔
106
        self._skip_style_tags = False
10✔
107

108
    @property
10✔
109
    def _stack(self):
10✔
110
        """The current token stack."""
111
        return self._stacks[-1][0]
10✔
112

113
    @property
10✔
114
    def _context(self):
10✔
115
        """The current token context."""
116
        return self._stacks[-1][1]
10✔
117

118
    @_context.setter
10✔
119
    def _context(self, value):
10✔
120
        self._stacks[-1][1] = value
10✔
121

122
    @property
10✔
123
    def _textbuffer(self):
10✔
124
        """The current textbuffer."""
125
        return self._stacks[-1][2]
10✔
126

127
    @_textbuffer.setter
10✔
128
    def _textbuffer(self, value):
10✔
129
        self._stacks[-1][2] = value
10✔
130

131
    @property
10✔
132
    def _stack_ident(self):
10✔
133
        """An identifier for the current stack.
134

135
        This is based on the starting head position and context. Stacks with
136
        the same identifier are always parsed in the same way. This can be used
137
        to cache intermediate parsing info.
138
        """
139
        return self._stacks[-1][3]
10✔
140

141
    def _push(self, context=0):
10✔
142
        """Add a new token stack, context, and textbuffer to the list."""
143
        new_ident = (self._head, context)
10✔
144
        if new_ident in self._bad_routes:
10✔
145
            raise BadRoute(context)
10✔
146

147
        self._stacks.append([[], context, [], new_ident])
10✔
148
        self._depth += 1
10✔
149

150
    def _push_textbuffer(self):
10✔
151
        """Push the textbuffer onto the stack as a Text node and clear it."""
152
        if self._textbuffer:
10✔
153
            self._stack.append(tokens.Text(text="".join(self._textbuffer)))
10✔
154
            self._textbuffer = []
10✔
155

156
    def _pop(self, keep_context=False):
10✔
157
        """Pop the current stack/context/textbuffer, returning the stack.
158

159
        If *keep_context* is ``True``, then we will replace the underlying
160
        stack's context with the current stack's.
161
        """
162
        self._push_textbuffer()
10✔
163
        self._depth -= 1
10✔
164
        if keep_context:
10✔
165
            context = self._context
10✔
166
            stack = self._stacks.pop()[0]
10✔
167
            self._context = context
10✔
168
            return stack
10✔
169
        return self._stacks.pop()[0]
10✔
170

171
    def _can_recurse(self):
10✔
172
        """Return whether or not our max recursion depth has been exceeded."""
173
        return self._depth < self.MAX_DEPTH
10✔
174

175
    def _memoize_bad_route(self):
10✔
176
        """Remember that the current route (head + context at push) is invalid.
177

178
        This will be noticed when calling _push with the same head and context,
179
        and the route will be failed immediately.
180
        """
181
        self._bad_routes.add(self._stack_ident)
10✔
182

183
    def _fail_route(self):
        """Abandon the current tokenization route.

        The route is memoized as bad, the current frame is discarded, and
        :exc:`.BadRoute` is raised carrying the context the frame had.
        """
        failed_context = self._context
        self._memoize_bad_route()
        self._pop()
        raise BadRoute(failed_context)
193

194
    def _emit(self, token):
10✔
195
        """Write a token to the end of the current token stack."""
196
        self._push_textbuffer()
10✔
197
        self._stack.append(token)
10✔
198

199
    def _emit_first(self, token):
10✔
200
        """Write a token to the beginning of the current token stack."""
201
        self._push_textbuffer()
10✔
202
        self._stack.insert(0, token)
10✔
203

204
    def _emit_text(self, text):
10✔
205
        """Write text to the current textbuffer."""
206
        self._textbuffer.append(text)
10✔
207

208
    def _emit_all(self, tokenlist):
10✔
209
        """Write a series of tokens to the current stack at once."""
210
        if tokenlist and isinstance(tokenlist[0], tokens.Text):
10✔
211
            self._emit_text(tokenlist.pop(0).text)
10✔
212
        self._push_textbuffer()
10✔
213
        self._stack.extend(tokenlist)
10✔
214

215
    def _emit_text_then_stack(self, text):
10✔
216
        """Pop the current stack, write *text*, and then write the stack."""
217
        stack = self._pop()
10✔
218
        self._emit_text(text)
10✔
219
        if stack:
10✔
220
            self._emit_all(stack)
10✔
221
        self._head -= 1
10✔
222

223
    def _read(self, delta=0, wrap=False, strict=False):
10✔
224
        """Read the value at a relative point in the wikicode.
225

226
        The value is read from :attr:`self._head <_head>` plus the value of
227
        *delta* (which can be negative). If *wrap* is ``False``, we will not
228
        allow attempts to read from the end of the string if ``self._head +
229
        delta`` is negative. If *strict* is ``True``, the route will be failed
230
        (with :meth:`_fail_route`) if we try to read from past the end of the
231
        string; otherwise, :attr:`self.END <END>` is returned. If we try to
232
        read from before the start of the string, :attr:`self.START <START>` is
233
        returned.
234
        """
235
        index = self._head + delta
10✔
236
        if index < 0 and (not wrap or abs(index) > len(self._text)):
10✔
237
            return self.START
10✔
238
        try:
10✔
239
            return self._text[index]
10✔
240
        except IndexError:
10✔
241
            if strict:
10✔
242
                self._fail_route()
10✔
243
            return self.END
10✔
244

245
    def _parse_template(self, has_content):
        """Parse a template starting at the head of the wikicode string.

        *has_content* adds the ``HAS_TEMPLATE`` flag to the parsing context.
        On :exc:`.BadRoute` the head is restored and the exception re-raised.
        """
        start = self._head
        if has_content:
            context = contexts.TEMPLATE_NAME | contexts.HAS_TEMPLATE
        else:
            context = contexts.TEMPLATE_NAME
        try:
            body = self._parse(context)
        except BadRoute:
            self._head = start
            raise
        # The opening token goes in front of whatever the stack already holds.
        self._emit_first(tokens.TemplateOpen())
        self._emit_all(body)
        self._emit(tokens.TemplateClose())
259

260
    def _parse_argument(self):
        """Parse an argument starting at the head of the wikicode string.

        On :exc:`.BadRoute` the head is restored and the exception re-raised.
        """
        start = self._head
        try:
            body = self._parse(contexts.ARGUMENT_NAME)
        except BadRoute:
            self._head = start
            raise
        # The opening token goes in front of whatever the stack already holds.
        self._emit_first(tokens.ArgumentOpen())
        self._emit_all(body)
        self._emit(tokens.ArgumentClose())
271

272
    def _parse_template_or_argument(self):
        """Parse a template or an argument at the head of the wikicode string.

        Opening braces are counted first. Then they are consumed three at a
        time as arguments (falling back to two at a time as templates) until
        none are left. Braces that cannot be matched are emitted as text.
        """
        self._head += 2
        remaining = 2
        while self._read() == "{":
            remaining += 1
            self._head += 1
        self._push()
        has_content = False

        while remaining:
            if remaining == 1:
                # A lone brace cannot open anything.
                return self._emit_text_then_stack("{")
            if remaining == 2:
                try:
                    self._parse_template(has_content)
                except BadRoute:
                    return self._emit_text_then_stack("{{")
                break
            # Three or more braces: an argument is tried before a template.
            try:
                self._parse_argument()
            except BadRoute:
                try:
                    self._parse_template(has_content)
                except BadRoute:
                    return self._emit_text_then_stack("{" * remaining)
                else:
                    remaining -= 2
            else:
                remaining -= 3
            if remaining:
                has_content = True
                self._head += 1

        self._emit_all(self._pop())
        if self._context & contexts.FAIL_NEXT:
            self._context ^= contexts.FAIL_NEXT
307

308
    def _handle_template_param(self):
        """Handle a template parameter separator at the head of the string.

        Leaves the name or value state (or closes the pending key frame),
        switches to the parameter-key state, emits the separator token and
        pushes a new frame with the resulting context.
        """
        ctx = self._context
        if ctx & contexts.TEMPLATE_NAME:
            named = ctx & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE)
            if not named:
                # A template name with neither text nor a template is invalid.
                self._fail_route()
            self._context = ctx ^ contexts.TEMPLATE_NAME
        elif ctx & contexts.TEMPLATE_PARAM_VALUE:
            self._context = ctx ^ contexts.TEMPLATE_PARAM_VALUE
        else:
            self._emit_all(self._pop())
        self._context = self._context | contexts.TEMPLATE_PARAM_KEY
        self._emit(tokens.TemplateParamSeparator())
        self._push(self._context)
321

322
    def _handle_template_param_value(self):
        """Handle the start of a template parameter's value.

        Closes the key frame, swaps the key flag for the value flag in the
        context, and emits the equals token.
        """
        key_tokens = self._pop()
        self._emit_all(key_tokens)
        without_key = self._context ^ contexts.TEMPLATE_PARAM_KEY
        self._context = without_key | contexts.TEMPLATE_PARAM_VALUE
        self._emit(tokens.TemplateParamEquals())
328

329
    def _handle_template_end(self):
        """Handle the end of a template and return its token stack.

        Fails the route if the template name has neither text nor a template.
        A pending parameter-key frame is merged into its parent first.
        """
        ctx = self._context
        if ctx & contexts.TEMPLATE_NAME:
            if not ctx & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE):
                self._fail_route()
        elif ctx & contexts.TEMPLATE_PARAM_KEY:
            self._emit_all(self._pop())
        self._head += 1
        return self._pop()
338

339
    def _handle_argument_separator(self):
        """Handle the separator between an argument's name and its default.

        Swaps the name flag for the default flag in the context and emits
        the separator token.
        """
        without_name = self._context ^ contexts.ARGUMENT_NAME
        self._context = without_name | contexts.ARGUMENT_DEFAULT
        self._emit(tokens.ArgumentSeparator())
344

345
    def _handle_argument_end(self):
10✔
346
        """Handle the end of an argument at the head of the string."""
347
        self._head += 2
10✔
348
        return self._pop()
10✔
349

350
    def _parse_wikilink(self):
        """Parse an internal wikilink at the head of the wikicode string.

        The content is first tried as a bracketed external link. If that
        route fails it is parsed as a real wikilink, and if that fails too
        the opening ``[[`` is emitted as plain text.
        """
        rewind = self._head + 1
        self._head = rewind + 1
        try:
            # If the wikilink looks like an external link, parse it as such:
            link, _unused = self._really_parse_external_link(True)
        except BadRoute:
            self._head = rewind + 1
            try:
                # Otherwise, actually parse it as a wikilink:
                wikilink = self._parse(contexts.WIKILINK_TITLE)
            except BadRoute:
                self._head = rewind
                self._emit_text("[[")
                return
            self._emit(tokens.WikilinkOpen())
            self._emit_all(wikilink)
            self._emit(tokens.WikilinkClose())
            return

        if self._context & contexts.EXT_LINK_TITLE:
            # In this exceptional case, an external link that looks like a
            # wikilink inside of an external link is parsed as text:
            self._head = rewind
            self._emit_text("[[")
            return
        # The outer bracket stays as text; the inner one opens the link.
        self._emit_text("[")
        self._emit(tokens.ExternalLinkOpen(brackets=True))
        self._emit_all(link)
        self._emit(tokens.ExternalLinkClose())
380

381
    def _handle_wikilink_separator(self):
        """Handle the separator between a wikilink's title and its text.

        Swaps the title flag for the text flag in the context and emits the
        separator token.
        """
        without_title = self._context ^ contexts.WIKILINK_TITLE
        self._context = without_title | contexts.WIKILINK_TEXT
        self._emit(tokens.WikilinkSeparator())
386

387
    def _handle_wikilink_end(self):
10✔
388
        """Handle the end of a wikilink at the head of the string."""
389
        self._head += 1
10✔
390
        return self._pop()
10✔
391

392
    def _parse_bracketed_uri_scheme(self):
        """Parse the URI scheme of a bracket-enclosed external link.

        Pushes a new frame in the ``EXT_LINK_URI`` context. A link that
        starts with ``//`` has no scheme to check. Otherwise the scheme is
        read up to the ``:`` and validated with :func:`is_scheme`; the route
        is failed when the colon is missing or the scheme is rejected.
        """
        self._push(contexts.EXT_LINK_URI)
        if self._read() == self._read(1) == "/":
            self._emit_text("//")
            self._head += 2
            return

        pieces = []
        while True:
            segment = self._read()
            if segment is self.END:
                break
            if not all(char in self.URISCHEME for char in segment):
                break
            pieces.append(segment)
            self._emit_text(segment)
            self._head += 1
        scheme = "".join(pieces)

        if self._read() != ":":
            self._fail_route()
        self._emit_text(":")
        self._head += 1

        slashes = self._read() == self._read(1) == "/"
        if slashes:
            self._emit_text("//")
            self._head += 2
        if not is_scheme(scheme, slashes):
            self._fail_route()
415

416
    def _parse_free_uri_scheme(self):
        """Parse the URI scheme of a free (no brackets) external link.

        The scheme was already read as plain text, so it is recovered by
        walking the textbuffer backwards. Raises :exc:`.BadRoute` when a
        character outside :attr:`URISCHEME` is met or when :func:`is_scheme`
        rejects the result. On success a new frame is pushed and the scheme,
        the colon and any ``//`` are written to it.
        """
        collected = []
        hit_boundary = False
        # We have to backtrack through the textbuffer looking for our
        # scheme since it was just parsed as text:
        for chunk in reversed(self._textbuffer):
            for char in reversed(chunk):
                # Stop at the first non-word character
                if re.fullmatch(r"\W", char):
                    hit_boundary = True
                    break
                if char not in self.URISCHEME:
                    raise BadRoute()
                collected.append(char)
            if hit_boundary:
                break
        collected.reverse()
        scheme = "".join(collected)

        slashes = self._read() == self._read(1) == "/"
        if not is_scheme(scheme, slashes):
            raise BadRoute()

        self._push(self._context | contexts.EXT_LINK_URI)
        self._emit_text(scheme)
        self._emit_text(":")
        if slashes:
            self._emit_text("//")
            self._head += 2
442

443
    def _handle_free_link_text(self, punct, tail, this):
10✔
444
        """Handle text in a free ext link, including trailing punctuation."""
445
        if "(" in this and ")" in punct:
10✔
446
            punct = punct[:-1]  # ')' is not longer valid punctuation
10✔
447
        if this.endswith(punct):
10✔
448
            for i in range(len(this) - 1, 0, -1):
10✔
449
                if this[i - 1] not in punct:
10✔
450
                    break
10✔
451
            else:
452
                i = 0
10✔
453
            stripped = this[:i]
10✔
454
            if stripped and tail:
10✔
455
                self._emit_text(tail)
10✔
456
                tail = ""
10✔
457
            tail += this[i:]
10✔
458
            this = stripped
10✔
459
        elif tail:
10✔
460
            self._emit_text(tail)
10✔
461
            tail = ""
10✔
462
        self._emit_text(this)
10✔
463
        return punct, tail
10✔
464

465
    def _is_uri_end(self, this, nxt):
10✔
466
        """Return whether the current head is the end of a URI."""
467
        # Built from _parse()'s end sentinels:
468
        after, ctx = self._read(2), self._context
10✔
469
        return (
10✔
470
            this in (self.END, "\n", "[", "]", "<", ">", '"')
471
            or " " in this
472
            or this == nxt == "'"
473
            or (this == "|" and ctx & contexts.TEMPLATE)
474
            or (this == "=" and ctx & (contexts.TEMPLATE_PARAM_KEY | contexts.HEADING))
475
            or (this == nxt == "}" and ctx & contexts.TEMPLATE)
476
            or (this == nxt == after == "}" and ctx & contexts.ARGUMENT)
477
        )
478

479
    def _really_parse_external_link(self, brackets):
        """Really parse an external link.

        *brackets* tells whether the link is bracket-enclosed or free. The
        scheme is parsed first, then the URI is read piece by piece, handing
        entities, comments and templates/arguments to their own parsers.
        Returns a ``(tokens, tail)`` pair. The tail is ``None`` for bracketed
        links, and for free links it is the trailing text that was held back
        from the link. Fails the route when nothing valid follows the scheme,
        or when a bracketed link hits a newline or the end of the text.
        """
        if brackets:
            self._parse_bracketed_uri_scheme()
            invalid, punct = ("\n", " ", "]"), ()
        else:
            self._parse_free_uri_scheme()
            invalid, punct = ("\n", " ", "[", "]"), tuple(",;\\.:!?)")

        first = self._read()
        if first is self.END or first[0] in invalid:
            self._fail_route()

        tail = ""
        while True:
            this, nxt = self._read(), self._read(1)

            # Pick the nested construct (if any) that starts at the head.
            if this == "&":
                nested = self._parse_entity
            elif this == "<" and nxt == "!" and self._read(2) == self._read(3) == "-":
                nested = self._parse_comment
            elif this == nxt == "{" and self._can_recurse():
                nested = self._parse_template_or_argument
            else:
                nested = None

            if nested is not None:
                # Held-back punctuation belongs to the link once more follows.
                if tail:
                    self._emit_text(tail)
                    tail = ""
                nested()
            elif brackets:
                if this is self.END or this == "\n":
                    self._fail_route()
                if this == "]":
                    return self._pop(), None
                if self._is_uri_end(this, nxt):
                    if " " in this:
                        before, _space, after = this.partition(" ")
                        self._emit_text(before)
                        self._emit(tokens.ExternalLinkSeparator())
                        if after:
                            self._emit_text(after)
                        self._head += 1
                    else:
                        separator = tokens.ExternalLinkSeparator()
                        separator.suppress_space = True
                        self._emit(separator)
                    # Switch from reading the URI to reading the link title.
                    self._context ^= contexts.EXT_LINK_URI
                    self._context |= contexts.EXT_LINK_TITLE
                    return self._parse(push=False), None
                self._emit_text(this)
            else:
                if self._is_uri_end(this, nxt):
                    if this is not self.END and " " in this:
                        before, _space, after = this.partition(" ")
                        punct, tail = self._handle_free_link_text(punct, tail, before)
                        tail += " " + after
                    else:
                        self._head -= 1
                    return self._pop(), tail
                punct, tail = self._handle_free_link_text(punct, tail, this)
            self._head += 1
541

542
    def _remove_uri_scheme_from_textbuffer(self, scheme):
10✔
543
        """Remove the URI scheme of a new external link from the textbuffer."""
544
        length = len(scheme)
10✔
545
        while length:
10✔
546
            if length < len(self._textbuffer[-1]):
10✔
547
                self._textbuffer[-1] = self._textbuffer[-1][:-length]
10✔
548
                break
10✔
549
            length -= len(self._textbuffer[-1])
10✔
550
            self._textbuffer.pop()
10✔
551

552
    def _parse_external_link(self, brackets):
        """Parse an external link at the head of the wikicode string.

        When links are not allowed in the current context, recursion is
        exhausted, or the link turns out to be invalid, the value at the head
        is emitted as text (or handed to :meth:`_handle_dl_term` for a free
        link inside a ``DL_TERM`` context).
        """

        def emit_as_plain():
            # Shared fallback for "this is not an external link after all".
            if not brackets and self._context & contexts.DL_TERM:
                self._handle_dl_term()
            else:
                self._emit_text(self._read())

        if self._context & contexts.NO_EXT_LINKS or not self._can_recurse():
            emit_as_plain()
            return

        start = self._head
        self._head += 1
        try:
            link, extra = self._really_parse_external_link(brackets)
        except BadRoute:
            self._head = start
            emit_as_plain()
            return

        if not brackets:
            # A free link's scheme was already emitted as text; take it back.
            scheme = link[0].text.partition(":")[0]
            self._remove_uri_scheme_from_textbuffer(scheme)
        self._emit(tokens.ExternalLinkOpen(brackets=brackets))
        self._emit_all(link)
        self._emit(tokens.ExternalLinkClose())
        if extra:
            self._emit_text(extra)
580

581
    def _parse_heading(self):
        """Parse a section heading at the head of the wikicode string.

        The run of ``=`` signs is counted to choose the heading context
        (capped at the sixth level). If the heading route fails, the signs are
        emitted as text. The global ``GL_HEADING`` flag is set on entry and
        toggled back once parsing finishes either way.
        """
        self._global |= contexts.GL_HEADING
        start = self._head
        self._head += 1
        equals = 1
        while self._read() == "=":
            equals += 1
            self._head += 1
        shift = min(equals - 1, 5)
        context = contexts.HEADING_LEVEL_1 << shift

        try:
            title, level = self._parse(context)
        except BadRoute:
            self._head = start + equals - 1
            self._emit_text("=" * equals)
        else:
            self._emit(tokens.HeadingStart(level=level))
            surplus = equals - level
            if surplus > 0:
                # Extra signs beyond the heading's level are part of the title.
                self._emit_text("=" * surplus)
            self._emit_all(title)
            self._emit(tokens.HeadingEnd())
        finally:
            self._global ^= contexts.GL_HEADING
605

606
    def _handle_heading_end(self):
        """Handle the end of a section heading at the head of the string.

        Returns a ``(stack, level)`` pair. If another heading closure is
        found further on, this run of ``=`` signs becomes text and the later
        closure's level is returned instead.
        """
        start = self._head
        self._head += 1
        equals = 1
        while self._read() == "=":
            equals += 1
            self._head += 1
        # Level implied by the heading context this frame was parsed with.
        current = int(log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
        level = min(current, equals, 6)

        try:  # Try to check for a heading closure after this one
            after, after_level = self._parse(self._context)
        except BadRoute:
            surplus = equals - level
            if surplus > 0:
                self._emit_text("=" * surplus)
            self._head = start + equals - 1
            return self._pop(), level

        # Found another closure
        self._emit_text("=" * equals)
        self._emit_all(after)
        return self._pop(), after_level
628

629
    def _really_parse_entity(self):
        """Actually parse an HTML entity and ensure that it is valid.

        Emits the entity tokens onto the current stack. The route is failed
        (see :meth:`_fail_route`) when the text ends early, the entity body
        contains invalid characters, the closing ``;`` is missing, a numeric
        entity is outside ``1..0x10FFFF``, or a named entity is unknown.
        """
        self._emit(tokens.HTMLEntityStart())
        self._head += 1

        this = self._read(strict=True)
        numeric = hexadecimal = False
        if this == "#":
            numeric = True
            self._emit(tokens.HTMLEntityNumeric())
            self._head += 1
            this = self._read(strict=True)
            if this[0].lower() == "x":
                hexadecimal = True
                self._emit(tokens.HTMLEntityHex(char=this[0]))
                this = this[1:]
                if not this:
                    self._fail_route()

        if hexadecimal:
            valid = "0123456789abcdefABCDEF"
        elif numeric:
            valid = "0123456789"
        else:
            valid = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
        if not all(char in valid for char in this):
            self._fail_route()

        self._head += 1
        if self._read() != ";":
            self._fail_route()
        if numeric:
            # Bound the digit count before converting: int() raises
            # ValueError on very long decimal strings on newer Pythons, and
            # any value with more than 7 significant digits (in either base)
            # is above 0x10FFFF anyway. An all-zero body means the value 0,
            # which is also invalid.
            significant = this.lstrip("0")
            if not significant or len(significant) > 7:
                self._fail_route()
            value = int(significant, 16 if hexadecimal else 10)
            if value > 0x10FFFF:
                self._fail_route()
        else:
            if this not in htmlentities.entitydefs:
                self._fail_route()

        self._emit(tokens.Text(text=this))
        self._emit(tokens.HTMLEntityEnd())
670

671
    def _parse_entity(self):
        """Parse an HTML entity at the head of the wikicode string.

        When the entity is invalid, the head is restored and the value at the
        head is emitted as plain text instead.
        """
        start = self._head
        try:
            self._push(contexts.HTML_ENTITY)
            self._really_parse_entity()
        except BadRoute:
            self._head = start
            self._emit_text(self._read())
            return
        self._emit_all(self._pop())
682

683
    def _parse_comment(self):
        """Parse an HTML comment at the head of the wikicode string.

        If the text ends before ``-->`` is found, the frame is discarded and
        the opening ``<!--`` is emitted as plain text.
        """
        rewind = self._head + 3
        self._head += 4
        self._push()
        while True:
            this = self._read()
            if this == self.END:
                # Unterminated comment: treat the opener as ordinary text.
                self._pop()
                self._head = rewind
                self._emit_text("<!--")
                return
            closes = this == self._read(1) == "-" and self._read(2) == ">"
            if not closes:
                self._emit_text(this)
                self._head += 1
                continue
            self._emit_first(tokens.CommentStart())
            self._emit(tokens.CommentEnd())
            self._emit_all(self._pop())
            self._head += 2
            if self._context & contexts.FAIL_NEXT:
                # _verify_safe() sets this flag while parsing a template
                # or link when it encounters what might be a comment -- we
                # must unset it to let _verify_safe() know it was correct:
                self._context ^= contexts.FAIL_NEXT
            return
708

709
    def _push_tag_buffer(self, data):
        """Write a pending tag attribute from *data* to the stack.

        A quoted value gets its quote token and is merged into the parent
        frame first. The attribute-start token carries the collected padding,
        and every padding slot is reset to ``""`` afterwards.
        """
        if data.context & data.CX_QUOTED:
            self._emit_first(tokens.TagAttrQuote(char=data.quoter))
            self._emit_all(self._pop())
        padding = data.padding_buffer
        attr_start = tokens.TagAttrStart(
            pad_first=padding["first"],
            pad_before_eq=padding["before_eq"],
            pad_after_eq=padding["after_eq"],
        )
        self._emit_first(attr_start)
        self._emit_all(self._pop())
        for slot in data.padding_buffer:
            data.padding_buffer[slot] = ""
725

726
    def _handle_tag_space(self, data, text):
        """Process a run of whitespace (*text*) found inside an HTML open tag.

        The first half updates *data*'s context based on the context as it was
        on entry; the second half decides where the whitespace itself goes
        (emitted as text, or stored as padding) using the updated context.
        """
        before = data.context
        quoted = before & data.CX_QUOTED
        expecting_space = before & data.CX_NOTE_SPACE
        unquoted_value_done = before & data.CX_ATTR_VALUE and not before & (
            data.CX_QUOTED | data.CX_NOTE_QUOTE
        )

        if unquoted_value_done or (quoted and expecting_space):
            # The attribute is complete; flush it and wait for the next one.
            self._push_tag_buffer(data)
            data.context = data.CX_ATTR_READY
        elif expecting_space:
            data.context = data.CX_ATTR_READY
        elif before & data.CX_ATTR_NAME:
            # Whitespace after an attribute name: an "=" may still follow.
            data.context |= data.CX_NOTE_EQUALS
            data.padding_buffer["before_eq"] += text

        if quoted and not expecting_space:
            # Whitespace inside an open quote is part of the value.
            self._emit_text(text)
        elif data.context & data.CX_ATTR_READY:
            data.padding_buffer["first"] += text
        elif data.context & data.CX_ATTR_VALUE:
            data.padding_buffer["after_eq"] += text
746

747
    def _handle_tag_text(self, text):
        """Emit or parse a non-whitespace chunk (*text*) within an HTML open tag.

        When recursion is still allowed and *text* is a marker, ``{{`` starts
        a template/argument parse, ``[[`` a wikilink parse and ``<`` a nested
        tag parse.  Everything else is written out as plain text.
        """
        following = self._read(1)
        if not (self._can_recurse() and text in self.MARKERS):
            self._emit_text(text)
            return
        if text == "{" and following == "{":
            self._parse_template_or_argument()
        elif text == "[" and following == "[":
            self._parse_wikilink()
        elif text == "<":
            self._parse_tag()
        else:
            self._emit_text(text)
760

761
    def _handle_tag_data(self, data, text):
10✔
762
        """Handle all sorts of *text* data inside of an HTML open tag.

        *text* is split with ``self.tag_splitter`` and each non-empty chunk is
        dispatched according to the state flags in ``data.context``: tag name,
        whitespace, attribute name, ``=``, or (possibly quoted) attribute
        value.  Chunks that are not consumed by the state machine fall through
        to ``_handle_tag_text()``.  The method may return early after rewinding
        ``self._head`` when a quoted value has to be re-read as unquoted.
        """
763
        for chunk in self.tag_splitter.split(text):
10✔
764
            if not chunk:
10✔
765
                continue
10✔
766
            if data.context & data.CX_NAME:
10✔
767
                if chunk in self.MARKERS or chunk.isspace():
10✔
768
                    self._fail_route()  # Tags must start with text, not spaces
10✔
769
                # The name has started; whitespace may now separate it from
                # the first attribute:
                data.context = data.CX_NOTE_SPACE
10✔
770
            elif chunk.isspace():
10✔
771
                self._handle_tag_space(data, chunk)
10✔
772
                continue
10✔
773
            elif data.context & data.CX_NOTE_SPACE:
10✔
774
                # Non-whitespace arrived where whitespace was expected.
                if data.context & data.CX_QUOTED:
10✔
775
                    # Text directly after a closing quote: give up on the
                    # quoted reading, drop its stack, and rewind to where the
                    # quote began so the value is re-read as unquoted.
                    data.context = data.CX_ATTR_VALUE
10✔
776
                    self._memoize_bad_route()
10✔
777
                    self._pop()
10✔
778
                    self._head = data.reset - 1  # Will be auto-incremented
10✔
779
                    return  # Break early
10✔
780
                self._fail_route()
10✔
781
            elif data.context & data.CX_ATTR_READY:
10✔
782
                # First chunk of a new attribute: open a stack for it.
                data.context = data.CX_ATTR_NAME
10✔
783
                self._push(contexts.TAG_ATTR)
10✔
784
            elif data.context & data.CX_ATTR_NAME:
10✔
785
                if chunk == "=":
10✔
786
                    data.context = data.CX_ATTR_VALUE | data.CX_NOTE_QUOTE
10✔
787
                    self._emit(tokens.TagAttrEquals())
10✔
788
                    continue
10✔
789
                if data.context & data.CX_NOTE_EQUALS:
10✔
790
                    # Whitespace followed the previous name and no "=" came:
                    # that attribute is finished and this chunk starts a new one.
                    self._push_tag_buffer(data)
10✔
791
                    data.context = data.CX_ATTR_NAME
10✔
792
                    self._push(contexts.TAG_ATTR)
10✔
793
            else:  # data.context & data.CX_ATTR_VALUE assured
794
                # A quote preceded by a single (not doubled) backslash does
                # not count as a quote:
                escaped = self._read(-1) == "\\" and self._read(-2) != "\\"
10✔
795
                if data.context & data.CX_NOTE_QUOTE:
10✔
796
                    # First chunk of the value: the "quote may follow" flag
                    # is cleared whether or not a quote is actually found.
                    data.context ^= data.CX_NOTE_QUOTE
10✔
797
                    if chunk in "'\"" and not escaped:
10✔
798
                        data.context |= data.CX_QUOTED
10✔
799
                        data.quoter = chunk
10✔
800
                        # Remember where the quote started in case we need to
                        # rewind and treat the value as unquoted:
                        data.reset = self._head
10✔
801
                        try:
10✔
802
                            self._push(self._context)
10✔
803
                        except BadRoute:
×
804
                            # Already failed to parse this as a quoted string
805
                            data.context = data.CX_ATTR_VALUE
×
806
                            self._head -= 1
×
807
                            return
×
808
                        continue
6✔
809
                elif data.context & data.CX_QUOTED:
10✔
810
                    if chunk == data.quoter and not escaped:
10✔
811
                        # Closing quote: only whitespace may follow it.
                        data.context |= data.CX_NOTE_SPACE
10✔
812
                        continue
10✔
813
            # Anything not consumed above is ordinary tag text:
            self._handle_tag_text(chunk)
10✔
814

815
    def _handle_tag_close_open(self, data, token):
        """Finish an open tag (the ``>`` of ``<foo>``).

        Any attribute still being built is flushed first.  *token* is the
        token class to instantiate; it is emitted with the remaining "first"
        padding, and the head is advanced by one.
        """
        attribute_pending = data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE)
        if attribute_pending:
            self._push_tag_buffer(data)
        trailing_padding = data.padding_buffer["first"]
        self._emit(token(padding=trailing_padding))
        self._head += 1
821

822
    def _handle_tag_open_close(self):
        """Begin a closing tag (the ``</`` of ``</foo>``).

        Emits a ``TagOpenClose`` token, pushes the ``TAG_CLOSE`` context, and
        advances the head by one.
        """
        open_close = tokens.TagOpenClose()
        self._emit(open_close)
        self._push(contexts.TAG_CLOSE)
        self._head += 1
827

828
    def _handle_tag_close_close(self):
        """Finish a closing tag (the ``>`` of ``</foo>``).

        The closing tag's stack must consist of exactly one ``Text`` token
        whose text matches the token at index 1 of the enclosing stack,
        ignoring trailing whitespace and case; otherwise the route fails.
        On success the closing tokens and a ``TagCloseClose`` are emitted and
        the enclosing stack is popped and returned.
        """

        def normalized(tok):
            return tok.text.rstrip().lower()

        closing = self._pop()
        names_match = (
            len(closing) == 1
            and isinstance(closing[0], tokens.Text)
            and normalized(closing[0]) == normalized(self._stack[1])
        )
        if not names_match:
            self._fail_route()
        self._emit_all(closing)
        self._emit(tokens.TagCloseClose())
        return self._pop()
840

841
    def _handle_blacklisted_tag(self):
        """Consume the body of an HTML tag that is parser-blacklisted.

        Only HTML entities and the matching closing tag are recognised inside
        the body; everything else is emitted as text.  Reaching the end of the
        input fails the route.  Returns the popped stack once the closing tag
        has been emitted.
        """

        def normalized(text):
            return text.rstrip().lower()

        while True:
            current = self._read()
            upcoming = self._read(1)
            if current is self.END:
                self._fail_route()
            elif current == "<" and upcoming == "/":
                # Jump to where the ">" of a well-formed closing tag would be;
                # the segment just before it is then the candidate tag name.
                self._head += 3
                closes_this_tag = self._read() == ">" and (
                    normalized(self._read(-1)) == normalized(self._stack[1].text)
                )
                if not closes_this_tag:
                    self._head -= 1
                    self._emit_text("</")
                    continue
                self._emit(tokens.TagOpenClose())
                self._emit_text(self._read(-1))
                self._emit(tokens.TagCloseClose())
                return self._pop()
            elif current == "&":
                self._parse_entity()
            else:
                self._emit_text(current)
            self._head += 1
865

866
    def _handle_single_only_tag_end(self):
        """Close a single-only HTML tag implicitly.

        The last token on the stack is removed and replaced by an implicit
        ``TagCloseSelfclose`` carrying the same padding; the popped stack is
        returned.
        """
        replaced = self._stack.pop()
        implicit_close = tokens.TagCloseSelfclose(
            padding=replaced.padding, implicit=True
        )
        self._emit(implicit_close)
        self._head -= 1  # Undo the advance made by _handle_tag_close_open
        return self._pop()
872

873
    def _handle_single_tag_end(self):
        """Close a single-supporting HTML tag when the input ends inside it.

        Locates the ``TagCloseOpen`` token that belongs to the tag opened at
        the start of the stack, swaps it for an implicit ``TagCloseSelfclose``
        with the same padding, and returns the popped stack.  Raises
        ``ParserError`` if no such token is found or if a
        ``TagCloseSelfclose`` turns up in its place.
        """
        stack = self._stack
        # Walk forward from index 2, tracking tag nesting, until the open tag
        # whose TagOpenOpen sits at index 0 is balanced by its TagCloseOpen:
        nesting = 1
        found_at = None
        for position in range(2, len(stack)):
            candidate = stack[position]
            if isinstance(candidate, tokens.TagOpenOpen):
                nesting += 1
                continue
            if isinstance(candidate, tokens.TagCloseOpen):
                nesting -= 1
                if not nesting:
                    found_at = position
                    break
            elif isinstance(candidate, tokens.TagCloseSelfclose):
                nesting -= 1
                if not nesting:  # pragma: no cover (untestable/exceptional)
                    raise ParserError(
                        "_handle_single_tag_end() got an unexpected TagCloseSelfclose"
                    )
        if found_at is None:  # pragma: no cover (untestable/exceptional case)
            raise ParserError("_handle_single_tag_end() missed a TagCloseOpen")
        old_padding = stack[found_at].padding
        stack[found_at] = tokens.TagCloseSelfclose(padding=old_padding, implicit=True)
        return self._pop()
897

898
    def _really_parse_tag(self):
        """Parse an HTML tag for real, beginning with its open part (``<foo>``).

        Pushes a ``TAG_OPEN`` stack, then feeds the input piece by piece into
        ``_handle_tag_data()`` until the open tag ends.  A ``>`` hands over to
        the appropriate body handler (single-only, parsable, or blacklisted)
        and returns its result; ``/>`` returns the popped stack of a
        self-closing tag.  Hitting the end of the input fails the route,
        except that an unclosed attribute quote is first retried as unquoted.
        """
        data = _TagOpenData()
        self._push(contexts.TAG_OPEN)
        self._emit(tokens.TagOpenOpen())
        while True:
            current = self._read()
            upcoming = self._read(1)
            # The tag may only end when we are neither in its name nor inside
            # an open quote, or when a quote has just been closed:
            may_close = (
                not data.context & (data.CX_QUOTED | data.CX_NAME)
                or data.context & data.CX_NOTE_SPACE
            )
            if current is self.END:
                if self._context & contexts.TAG_ATTR:
                    if data.context & data.CX_QUOTED:
                        # Unclosed attribute quote: reset, don't die
                        data.context = data.CX_ATTR_VALUE
                        self._memoize_bad_route()
                        self._pop()
                        self._head = data.reset
                        continue
                    self._pop()
                self._fail_route()
            elif current == ">" and may_close:
                self._handle_tag_close_open(data, tokens.TagCloseOpen)
                self._context = contexts.TAG_BODY
                tag_name = self._stack[1].text
                if is_single_only(tag_name):
                    return self._handle_single_only_tag_end()
                if is_parsable(tag_name):
                    return self._parse(push=False)
                return self._handle_blacklisted_tag()
            elif current == "/" and upcoming == ">" and may_close:
                self._handle_tag_close_open(data, tokens.TagCloseSelfclose)
                return self._pop()
            else:
                self._handle_tag_data(data, current)
            self._head += 1
934

935
    def _handle_invalid_tag_start(self):
        """Handle ``</`` outside a tag body, which may start a single-only tag.

        If the name after ``</`` is a single-only tag and parses as a tag, the
        result is emitted with its first token flagged ``invalid``.  Otherwise
        the head is rewound and ``</`` is emitted as plain text.
        """
        fallback = self._head + 1
        self._head += 2
        try:
            tag_name = self.tag_splitter.split(self._read())[0]
            if not is_single_only(tag_name):
                raise BadRoute()
            tag = self._really_parse_tag()
        except BadRoute:
            self._head = fallback
            self._emit_text("</")
            return
        tag[0].invalid = True  # Flag the TagOpenOpen token
        self._emit_all(tag)
949

950
    def _parse_tag(self):
        """Try to parse an HTML tag starting at the head of the wikicode string.

        On success the tag's tokens are emitted; if the route fails, the head
        is restored and a literal ``<`` is emitted instead.
        """
        start = self._head
        self._head += 1
        try:
            tag_tokens = self._really_parse_tag()
        except BadRoute:
            self._head = start
            self._emit_text("<")
            return
        self._emit_all(tag_tokens)
961

962
    def _emit_style_tag(self, tag, markup, body):
        """Emit *body* wrapped in the tokens of a wiki-markup style tag.

        *tag* is the tag name written in both the opening and closing parts;
        *markup* is recorded as the wiki markup of the opening token.
        """
        # Opening part:
        opening = tokens.TagOpenOpen(wiki_markup=markup)
        self._emit(opening)
        self._emit_text(tag)
        self._emit(tokens.TagCloseOpen())
        # Contents:
        self._emit_all(body)
        # Closing part:
        self._emit(tokens.TagOpenClose())
        self._emit_text(tag)
        self._emit(tokens.TagCloseClose())
971

972
    def _parse_italics(self):
        """Parse wiki-style italics (``''``).

        If the first attempt fails with ``STYLE_PASS_AGAIN`` set on the failed
        route's context, the parse is retried once with
        ``STYLE_SECOND_PASS``.  When no attempt succeeds, the head is restored
        and the two ticks are emitted as text.
        """
        start = self._head
        try:
            body = self._parse(contexts.STYLE_ITALICS)
        except BadRoute as failure:
            self._head = start
            if not failure.context & contexts.STYLE_PASS_AGAIN:
                self._emit_text("''")
                return
            second_pass = contexts.STYLE_ITALICS | contexts.STYLE_SECOND_PASS
            try:
                body = self._parse(second_pass)
            except BadRoute:
                self._head = start
                self._emit_text("''")
                return
        self._emit_style_tag("i", "''", body)
991

992
    def _parse_bold(self):
        """Parse wiki-style bold (``'''``).

        Returns ``True`` only when the parse fails while the current context
        has ``STYLE_SECOND_PASS`` set (after emitting a single tick);
        otherwise returns ``False``.
        """
        start = self._head
        try:
            body = self._parse(contexts.STYLE_BOLD)
        except BadRoute:
            self._head = start
            if self._context & contexts.STYLE_SECOND_PASS:
                self._emit_text("'")
                return True
            if not self._context & contexts.STYLE_ITALICS:
                # Treat the first tick as text and the other two as italics.
                self._emit_text("'")
                self._parse_italics()
            else:
                # Inside italics: emit the ticks as text and flag the
                # context with STYLE_PASS_AGAIN.
                self._context |= contexts.STYLE_PASS_AGAIN
                self._emit_text("'''")
            return False
        self._emit_style_tag("b", "'''", body)
        return False
1011

1012
    def _parse_italics_and_bold(self):
        """Parse five ticks, i.e. wiki-style italics and bold opened together.

        Bold is tried first, then italics.  Whichever parses first becomes the
        inner tag, and the remainder is then tried as the other style, which
        wraps it.  If neither style parses, the five ticks are emitted as text.
        """

        def finish_pair(inner_tag, inner_ticks, inner_body, outer_ctx, outer_tag, outer_ticks):
            # Having parsed the inner style, try the rest as the outer style.
            resume = self._head
            try:
                remainder = self._parse(outer_ctx)
            except BadRoute:
                # No outer style: its ticks become text before the inner tag.
                self._head = resume
                self._emit_text(outer_ticks)
                self._emit_style_tag(inner_tag, inner_ticks, inner_body)
            else:
                # Build the outer tag's body on a scratch stack.
                self._push()
                self._emit_style_tag(inner_tag, inner_ticks, inner_body)
                self._emit_all(remainder)
                self._emit_style_tag(outer_tag, outer_ticks, self._pop())

        start = self._head
        try:
            bold_body = self._parse(contexts.STYLE_BOLD)
        except BadRoute:
            self._head = start
            try:
                italic_body = self._parse(contexts.STYLE_ITALICS)
            except BadRoute:
                self._head = start
                self._emit_text("'''''")
            else:
                finish_pair("i", "''", italic_body, contexts.STYLE_BOLD, "b", "'''")
        else:
            finish_pair("b", "'''", bold_body, contexts.STYLE_ITALICS, "i", "''")
1050

1051
    def _parse_style(self):
        """Parse wiki-style formatting (``''``/``'''`` for italics/bold).

        Counts the run of ticks at the head and normalises it to two, three or
        five (surplus ticks are emitted as text).  If the run closes the style
        currently being parsed, the popped stack is returned; otherwise a new
        italics/bold parse is started, or the ticks are emitted as text when
        recursion is not possible.
        """
        self._head += 2
        tick_count = 2
        while self._read() == "'":
            self._head += 1
            tick_count += 1
        in_italics = self._context & contexts.STYLE_ITALICS
        in_bold = self._context & contexts.STYLE_BOLD

        # More than five ticks, or exactly four: the surplus is literal text.
        if tick_count > 5:
            self._emit_text("'" * (tick_count - 5))
            tick_count = 5
        elif tick_count == 4:
            self._emit_text("'")
            tick_count = 3

        ends_italics = in_italics and tick_count in (2, 5)
        ends_bold = in_bold and tick_count in (3, 5)
        if ends_italics or ends_bold:
            if tick_count == 5:
                # Step back over the ticks that do not belong to this style.
                rewind = 3 if in_italics else 2
                self._head -= rewind
            return self._pop()

        if not self._can_recurse():
            if tick_count == 3:
                if self._context & contexts.STYLE_SECOND_PASS:
                    self._emit_text("'")
                    return self._pop()
                if self._context & contexts.STYLE_ITALICS:
                    self._context |= contexts.STYLE_PASS_AGAIN
            self._emit_text("'" * tick_count)
        elif tick_count == 2:
            self._parse_italics()
        elif tick_count == 3:
            if self._parse_bold():
                return self._pop()
        else:  # tick_count == 5
            self._parse_italics_and_bold()
        self._head -= 1
1088

1089
    def _handle_list_marker(self):
        """Emit the tag tokens for one list marker (``#``, ``*``, ``;``, ``:``).

        The marker at the head becomes a self-closing tag whose name comes
        from ``get_html_tag()``.  A ``;`` marker also sets ``DL_TERM`` on the
        current context.
        """
        marker = self._read()
        if marker == ";":
            self._context |= contexts.DL_TERM
        self._emit(tokens.TagOpenOpen(wiki_markup=marker))
        html_name = get_html_tag(marker)
        self._emit_text(html_name)
        self._emit(tokens.TagCloseSelfclose())
1097

1098
    def _handle_list(self):
        """Handle a run of wiki-style list markers (``#``, ``*``, ``;``, ``:``).

        The marker at the head is handled first, then every list marker that
        directly follows it.
        """
        list_markers = ("#", "*", ";", ":")
        self._handle_list_marker()
        while self._read(1) in list_markers:
            self._head += 1
            self._handle_list_marker()
1104

1105
    def _handle_hr(self):
        """Handle a wiki-style horizontal rule (``----``) in the string.

        Any dashes beyond the first four are consumed too, and the full run
        is recorded as the wiki markup of the emitted ``hr`` tag.
        """
        extra_dashes = 0
        self._head += 3
        while self._read(1) == "-":
            extra_dashes += 1
            self._head += 1
        rule_markup = "-" * (4 + extra_dashes)
        self._emit(tokens.TagOpenOpen(wiki_markup=rule_markup))
        self._emit_text("hr")
        self._emit(tokens.TagCloseSelfclose())
1115

1116
    def _handle_dl_term(self):
        """Finish the term of a description list (``foo`` in ``;foo:bar``).

        Toggles the ``DL_TERM`` flag, then treats a ``:`` at the head as a
        list marker; anything else results in a newline being emitted.
        """
        self._context ^= contexts.DL_TERM
        if self._read() != ":":
            self._emit_text("\n")
        else:
            self._handle_list_marker()
1123

1124
    def _emit_table_tag(
        self,
        open_open_markup,
        tag,
        style,
        padding,
        close_open_markup,
        contents,
        open_close_markup,
    ):
        """Emit the complete token sequence for one table-related tag.

        *style* and *contents* are token lists that are emitted only when
        non-empty.  *close_open_markup* is recorded on the ``TagCloseOpen``
        token only when truthy; *padding* is always recorded on it.
        """
        # Opening part, with any style attributes:
        self._emit(tokens.TagOpenOpen(wiki_markup=open_open_markup))
        self._emit_text(tag)
        if style:
            self._emit_all(style)

        # Keyword order is kept as (wiki_markup, padding) when both are given:
        close_open_kwargs = {}
        if close_open_markup:
            close_open_kwargs["wiki_markup"] = close_open_markup
        close_open_kwargs["padding"] = padding
        self._emit(tokens.TagCloseOpen(**close_open_kwargs))

        if contents:
            self._emit_all(contents)

        # Closing part:
        self._emit(tokens.TagOpenClose(wiki_markup=open_close_markup))
        self._emit_text(tag)
        self._emit(tokens.TagCloseClose())
1150

1151
    def _handle_table_style(self, end_token):
        """Parse table style attributes up to ``end_token``.

        Input is fed through ``_handle_tag_data()`` starting in the
        "attribute ready" state.  Returns the accumulated "first" padding
        string once ``end_token`` is reached outside of an open quote.
        Reaching the end of the input, or ``end_token`` inside an open quote,
        fails the route, except that an unclosed attribute quote is first
        retried as unquoted.
        """
        data = _TagOpenData()
        data.context = _TagOpenData.CX_ATTR_READY
        while True:
            current = self._read()
            # Style may end anywhere except inside an open quote; a quote that
            # has just been closed is fine:
            may_close = (
                not data.context & data.CX_QUOTED or data.context & data.CX_NOTE_SPACE
            )
            if current == end_token and may_close:
                if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
                    self._push_tag_buffer(data)
                if current.isspace():
                    data.padding_buffer["first"] += current
                return data.padding_buffer["first"]
            if current is self.END or current == end_token:
                if self._context & contexts.TAG_ATTR:
                    if data.context & data.CX_QUOTED:
                        # Unclosed attribute quote: reset, don't die
                        data.context = data.CX_ATTR_VALUE
                        self._memoize_bad_route()
                        self._pop()
                        self._head = data.reset
                        continue
                    self._pop()
                self._fail_route()
            else:
                self._handle_tag_data(data, current)
            self._head += 1
1180

1181
    def _parse_table(self):
        """Parse a wikicode table, starting with its first line.

        The first line is parsed as style attributes and the rest as table
        contents.  If either step fails, the head is restored and a literal
        ``{`` is emitted instead.
        """
        origin = self._head

        def emit_brace_instead():
            # Not a table after all: rewind and write the brace as text.
            self._head = origin
            self._emit_text("{")

        self._head += 2
        try:
            self._push(contexts.TABLE_OPEN)
            padding = self._handle_table_style("\n")
        except BadRoute:
            emit_brace_instead()
            return
        style = self._pop()

        self._head += 1
        stack_marker = self._stack_ident
        try:
            table = self._parse(contexts.TABLE_OPEN)
        except BadRoute:
            # Unwind any stacks left open by the failed parse:
            while self._stack_ident != stack_marker:
                self._memoize_bad_route()
                self._pop()
            emit_brace_instead()
            return

        self._emit_table_tag(
            open_open_markup="{|",
            tag="table",
            style=style,
            padding=padding,
            close_open_markup=None,
            contents=table,
            open_close_markup="|}",
        )
        # Offset displacement done by _parse():
        self._head -= 1
1209

1210
    def _handle_table_row(self):
        """Parse a table row: style up to the end of the line, then contents.

        When recursion is not possible, ``|-`` is emitted as plain text
        instead.
        """
        self._head += 2
        if not self._can_recurse():
            self._emit_text("|-")
            self._head -= 1
            return

        row_context = contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN
        self._push(row_context)
        padding = self._handle_table_style("\n")
        style = self._pop()

        # Don't parse the style separator:
        self._head += 1
        row = self._parse(row_context)

        self._emit_table_tag(
            open_open_markup="|-",
            tag="tr",
            style=style,
            padding=padding,
            close_open_markup=None,
            contents=row,
            open_close_markup="",
        )
        # Offset displacement done by _parse():
        self._head -= 1
1229

1230
    def _handle_table_cell(self, markup, tag, line_context):
        """Parse a table cell introduced by *markup* and emit it as *tag*.

        The cell is first parsed as normal syntax.  If that parse ends with
        ``TABLE_CELL_STYLE`` set on the context, the head is reset, the text
        up to ``|`` is parsed as style attributes, and the remainder is parsed
        again as normal syntax.  When recursion is not possible, *markup* is
        emitted as plain text instead.
        """
        saved_context = self._context
        padding, style = "", None
        self._head += len(markup)
        cell_start = self._head
        if not self._can_recurse():
            self._emit_text(markup)
            self._head -= 1
            return

        base_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context
        cell = self._parse(base_context | contexts.TABLE_CELL_STYLE)
        cell_context = self._context
        self._context = saved_context

        separator = None
        if cell_context & contexts.TABLE_CELL_STYLE:
            separator = "|"
            self._head = cell_start
            self._push(base_context)
            padding = self._handle_table_style("|")
            style = self._pop()
            # Don't parse the style separator:
            self._head += 1
            cell = self._parse(base_context)
            cell_context = self._context
            self._context = saved_context

        self._emit_table_tag(markup, tag, style, padding, separator, cell, "")
        # Keep header/cell line contexts:
        line_flags = contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE
        self._context |= cell_context & line_flags
        # Offset displacement done by _parse():
        self._head -= 1
1272

1273
    def _handle_table_cell_end(self, reset_for_style=False):
        """End the current table cell.

        Sets ``TABLE_CELL_STYLE`` on the current context when
        *reset_for_style* is true and clears it otherwise, then returns the
        result of ``_pop(keep_context=True)``.
        """
        if not reset_for_style:
            self._context &= ~contexts.TABLE_CELL_STYLE
        else:
            self._context |= contexts.TABLE_CELL_STYLE
        return self._pop(keep_context=True)
1281

1282
    def _handle_table_row_end(self):
        """End the current table row by popping and returning its stack."""
        row_stack = self._pop()
        return row_stack
1285

1286
    def _handle_table_end(self):
        """End the current table.

        Advances the head by two, then pops and returns the table's stack.
        """
        self._head = self._head + 2
        table_stack = self._pop()
        return table_stack
1290

1291
    def _handle_end(self):
        """Handle reaching the end of the stream of wikitext.

        Outside of a ``FAIL`` context the current stack is simply popped and
        returned.  Inside one, an open tag body that supports the single form
        is closed implicitly; otherwise the extra stacks belonging to an open
        table cell or a ``DOUBLE`` context are popped and the route fails.
        """
        if not self._context & contexts.FAIL:
            return self._pop()
        if self._context & contexts.TAG_BODY and is_single(self._stack[1].text):
            return self._handle_single_tag_end()
        # The context is re-read before each check, as a pop may change it:
        if self._context & contexts.TABLE_CELL_OPEN:
            self._pop()
        if self._context & contexts.DOUBLE:
            self._pop()
        self._fail_route()
        return self._pop()
1303

1304
    def _verify_safe(self, this):
10✔
1305
        """Make sure we are not trying to write an invalid character.

        *this* is the piece of input at the head.  Returns ``False`` when it
        is not allowed in the current context and ``True`` otherwise.  As a
        side effect, several ``FAIL_*``/``HAS_*`` flags may be set or toggled
        on ``self._context`` so that later calls can reject what follows.
        """
1306
        # Decisions are based on the context as it was on entry; flag updates
        # are written to self._context for subsequent calls.
        context = self._context
10✔
1307
        if context & contexts.FAIL_NEXT:
10✔
1308
            return False
10✔
1309
        if context & contexts.WIKILINK_TITLE:
10✔
1310
            if this in ("]", "{"):
10✔
1311
                # Allowed for now; the next call fails unless FAIL_NEXT is
                # cleared in the meantime.
                self._context |= contexts.FAIL_NEXT
10✔
1312
            elif this in ("\n", "[", "}", ">"):
10✔
1313
                return False
10✔
1314
            elif this == "<":
10✔
1315
                # "<!" might begin a comment, so it is tolerated for now.
                if self._read(1) == "!":
10✔
1316
                    self._context |= contexts.FAIL_NEXT
10✔
1317
                else:
1318
                    return False
10✔
1319
            return True
10✔
1320
        if context & contexts.EXT_LINK_TITLE:
10✔
1321
            return this != "\n"
10✔
1322
        if context & contexts.TEMPLATE_NAME:
10✔
1323
            if this == "{":
10✔
1324
                self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT
10✔
1325
                return True
10✔
1326
            if this == "}" or (this == "<" and self._read(1) == "!"):
10✔
1327
                self._context |= contexts.FAIL_NEXT
10✔
1328
                return True
10✔
1329
            if this in ("[", "]", "<", ">"):
10✔
1330
                return False
10✔
1331
            if this == "|":
10✔
1332
                return True
10✔
1333
            # HAS_TEXT is set once non-whitespace has been seen; FAIL_ON_TEXT
            # is set once a newline follows it.  After that, only whitespace
            # is accepted.
            if context & contexts.HAS_TEXT:
10✔
1334
                if context & contexts.FAIL_ON_TEXT:
10✔
1335
                    if this is self.END or not this.isspace():
10✔
1336
                        return False
10✔
1337
                elif this == "\n":
10✔
1338
                    self._context |= contexts.FAIL_ON_TEXT
10✔
1339
            elif this is self.END or not this.isspace():
10✔
1340
                self._context |= contexts.HAS_TEXT
10✔
1341
            return True
10✔
1342
        if context & contexts.TAG_CLOSE:
10✔
1343
            return this != "<"
10✔
1344
        # Remaining contexts: track braces via the FAIL_ON_* flags.
        if context & contexts.FAIL_ON_EQUALS:
10✔
1345
            if this == "=":
10✔
1346
                return False
10✔
1347
        elif context & contexts.FAIL_ON_LBRACE:
10✔
1348
            # Set by the previous call when it saw "{".
            if this == "{" or (self._read(-1) == self._read(-2) == "{"):
10✔
1349
                if context & contexts.TEMPLATE:
10✔
1350
                    self._context |= contexts.FAIL_ON_EQUALS
10✔
1351
                else:
1352
                    self._context |= contexts.FAIL_NEXT
10✔
1353
                return True
10✔
1354
            # No second brace followed: clear the flag again.
            self._context ^= contexts.FAIL_ON_LBRACE
10✔
1355
        elif context & contexts.FAIL_ON_RBRACE:
10✔
1356
            # Set by the previous call when it saw "}".
            if this == "}":
10✔
1357
                self._context |= contexts.FAIL_NEXT
10✔
1358
                return True
10✔
1359
            self._context ^= contexts.FAIL_ON_RBRACE
10✔
1360
        elif this == "{":
10✔
1361
            self._context |= contexts.FAIL_ON_LBRACE
10✔
1362
        elif this == "}":
10✔
1363
            self._context |= contexts.FAIL_ON_RBRACE
10✔
1364
        return True
10✔
1365

1366
    def _parse(self, context=0, push=True):
10✔
1367
        """Parse the wikicode string, using *context* for when to stop."""
1368
        if push:
10✔
1369
            self._push(context)
10✔
1370
        while True:
6✔
1371
            this = self._read()
10✔
1372
            if self._context & contexts.UNSAFE:
10✔
1373
                if not self._verify_safe(this):
10✔
1374
                    if self._context & contexts.DOUBLE:
10✔
1375
                        self._pop()
10✔
1376
                    self._fail_route()
10✔
1377
            if this not in self.MARKERS:
10✔
1378
                self._emit_text(this)
10✔
1379
                self._head += 1
10✔
1380
                continue
10✔
1381
            if this is self.END:
10✔
1382
                return self._handle_end()
10✔
1383
            nxt = self._read(1)
10✔
1384
            if this == nxt == "{":
10✔
1385
                if self._can_recurse():
10✔
1386
                    self._parse_template_or_argument()
10✔
1387
                else:
1388
                    self._emit_text("{")
10✔
1389
            elif this == "|" and self._context & contexts.TEMPLATE:
10✔
1390
                self._handle_template_param()
10✔
1391
            elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
10✔
1392
                if (
10✔
1393
                    not self._global & contexts.GL_HEADING
1394
                    and self._read(-1) in ("\n", self.START)
1395
                    and nxt == "="
1396
                ):
1397
                    self._parse_heading()
10✔
1398
                else:
1399
                    self._handle_template_param_value()
10✔
1400
            elif this == nxt == "}" and self._context & contexts.TEMPLATE:
10✔
1401
                return self._handle_template_end()
10✔
1402
            elif this == "|" and self._context & contexts.ARGUMENT_NAME:
10✔
1403
                self._handle_argument_separator()
10✔
1404
            elif this == nxt == "}" and self._context & contexts.ARGUMENT:
10✔
1405
                if self._read(2) == "}":
10✔
1406
                    return self._handle_argument_end()
10✔
1407
                self._emit_text("}")
10✔
1408
            elif this == nxt == "[" and self._can_recurse():
10✔
1409
                # TODO: Only do this if not in a file context:
1410
                # if self._context & contexts.WIKILINK_TEXT:
1411
                #     self._fail_route()
1412
                if not self._context & contexts.NO_WIKILINKS:
10✔
1413
                    self._parse_wikilink()
10✔
1414
                else:
1415
                    self._emit_text("[")
×
1416
            elif this == "|" and self._context & contexts.WIKILINK_TITLE:
10✔
1417
                self._handle_wikilink_separator()
10✔
1418
            elif this == nxt == "]" and self._context & contexts.WIKILINK:
10✔
1419
                return self._handle_wikilink_end()
10✔
1420
            elif this == "[":
10✔
1421
                self._parse_external_link(True)
10✔
1422
            elif this == ":" and self._read(-1) not in self.MARKERS:
10✔
1423
                self._parse_external_link(False)
10✔
1424
            elif this == "]" and self._context & contexts.EXT_LINK_TITLE:
10✔
1425
                return self._pop()
10✔
1426
            elif (
10✔
1427
                this == "="
1428
                and not self._global & contexts.GL_HEADING
1429
                and not self._context & contexts.TEMPLATE
1430
            ):
1431
                if self._read(-1) in ("\n", self.START):
10✔
1432
                    self._parse_heading()
10✔
1433
                else:
1434
                    self._emit_text("=")
10✔
1435
            elif this == "=" and self._context & contexts.HEADING:
10✔
1436
                return self._handle_heading_end()
10✔
1437
            elif this == "\n" and self._context & contexts.HEADING:
10✔
1438
                self._fail_route()
10✔
1439
            elif this == "&":
10✔
1440
                self._parse_entity()
10✔
1441
            elif this == "<" and nxt == "!":
10✔
1442
                if self._read(2) == self._read(3) == "-":
10✔
1443
                    self._parse_comment()
10✔
1444
                else:
1445
                    self._emit_text(this)
10✔
1446
            elif this == "<" and nxt == "/" and self._read(2) is not self.END:
10✔
1447
                if self._context & contexts.TAG_BODY:
10✔
1448
                    self._handle_tag_open_close()
10✔
1449
                else:
1450
                    self._handle_invalid_tag_start()
10✔
1451
            elif this == "<" and not self._context & contexts.TAG_CLOSE:
10✔
1452
                if self._can_recurse():
10✔
1453
                    self._parse_tag()
10✔
1454
                else:
1455
                    self._emit_text("<")
×
1456
            elif this == ">" and self._context & contexts.TAG_CLOSE:
10✔
1457
                return self._handle_tag_close_close()
10✔
1458
            elif this == nxt == "'" and not self._skip_style_tags:
10✔
1459
                result = self._parse_style()
10✔
1460
                if result is not None:
10✔
1461
                    return result
10✔
1462
            elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"):
10✔
1463
                self._handle_list()
10✔
1464
            elif self._read(-1) in ("\n", self.START) and (
10✔
1465
                this == nxt == self._read(2) == self._read(3) == "-"
1466
            ):
1467
                self._handle_hr()
10✔
1468
            elif this in ("\n", ":") and self._context & contexts.DL_TERM:
10✔
1469
                self._handle_dl_term()
10✔
1470
                if this == "\n":
10✔
1471
                    # Kill potential table contexts
1472
                    self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
10✔
1473
            # Start of table parsing
1474
            elif (
10✔
1475
                this == "{"
1476
                and nxt == "|"
1477
                and (
1478
                    self._read(-1) in ("\n", self.START)
1479
                    or (
1480
                        self._read(-2) in ("\n", self.START)
1481
                        and self._read(-1).isspace()
1482
                    )
1483
                )
1484
            ):
1485
                if self._can_recurse():
10✔
1486
                    self._parse_table()
10✔
1487
                else:
1488
                    self._emit_text("{")
10✔
1489
            elif self._context & contexts.TABLE_OPEN:
10✔
1490
                if this == nxt == "|" and self._context & contexts.TABLE_TD_LINE:
10✔
1491
                    if self._context & contexts.TABLE_CELL_OPEN:
10✔
1492
                        return self._handle_table_cell_end()
10✔
1493
                    self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE)
10✔
1494
                elif this == nxt == "|" and self._context & contexts.TABLE_TH_LINE:
10✔
1495
                    if self._context & contexts.TABLE_CELL_OPEN:
10✔
1496
                        return self._handle_table_cell_end()
10✔
1497
                    self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE)
10✔
1498
                elif this == nxt == "!" and self._context & contexts.TABLE_TH_LINE:
10✔
1499
                    if self._context & contexts.TABLE_CELL_OPEN:
10✔
1500
                        return self._handle_table_cell_end()
10✔
1501
                    self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE)
10✔
1502
                elif this == "|" and self._context & contexts.TABLE_CELL_STYLE:
10✔
1503
                    return self._handle_table_cell_end(reset_for_style=True)
10✔
1504
                # on newline, clear out cell line contexts
1505
                elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS:
10✔
1506
                    self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
10✔
1507
                    self._emit_text(this)
10✔
1508
                elif self._read(-1) in ("\n", self.START) or (
10✔
1509
                    self._read(-2) in ("\n", self.START) and self._read(-1).isspace()
1510
                ):
1511
                    if this == "|" and nxt == "}":
10✔
1512
                        if self._context & contexts.TABLE_CELL_OPEN:
10✔
1513
                            return self._handle_table_cell_end()
10✔
1514
                        if self._context & contexts.TABLE_ROW_OPEN:
10✔
1515
                            return self._handle_table_row_end()
10✔
1516
                        return self._handle_table_end()
10✔
1517
                    if this == "|" and nxt == "-":
10✔
1518
                        if self._context & contexts.TABLE_CELL_OPEN:
10✔
1519
                            return self._handle_table_cell_end()
10✔
1520
                        if self._context & contexts.TABLE_ROW_OPEN:
10✔
1521
                            return self._handle_table_row_end()
10✔
1522
                        self._handle_table_row()
10✔
1523
                    elif this == "|":
10✔
1524
                        if self._context & contexts.TABLE_CELL_OPEN:
10✔
1525
                            return self._handle_table_cell_end()
10✔
1526
                        self._handle_table_cell("|", "td", contexts.TABLE_TD_LINE)
10✔
1527
                    elif this == "!":
10✔
1528
                        if self._context & contexts.TABLE_CELL_OPEN:
10✔
1529
                            return self._handle_table_cell_end()
10✔
1530
                        self._handle_table_cell("!", "th", contexts.TABLE_TH_LINE)
10✔
1531
                    else:
1532
                        self._emit_text(this)
10✔
1533
                else:
1534
                    self._emit_text(this)
10✔
1535

1536
            else:
1537
                self._emit_text(this)
10✔
1538
            self._head += 1
10✔
1539

1540
    def tokenize(self, text, context=0, skip_style_tags=False):
        """Tokenize a string of wikicode and return the parse result.

        *text* is split with ``self.regex`` and the non-empty pieces become
        the working segment list. The per-run state (head position, global
        context, recursion depth, known bad routes) is reset before parsing,
        so each call starts from a clean position. *context* is passed
        straight through to ``self._parse()``, and *skip_style_tags* is
        stored on the instance for the duration of the run.

        Raises ``ParserError`` if a ``BadRoute`` escapes from the top-level
        parse, or if the token stack is not empty once parsing has finished.
        """
        # regex.split() can yield empty (falsy) pieces; keep only the
        # segments that actually carry text.
        self._text = list(filter(None, self.regex.split(text)))

        # Reset all per-run state.
        self._head = 0
        self._global = 0
        self._depth = 0
        self._bad_routes = set()
        self._skip_style_tags = skip_style_tags

        try:
            parsed = self._parse(context)
        except BadRoute as exc:  # pragma: no cover (untestable/exceptional case)
            raise ParserError("Python tokenizer exited with BadRoute") from exc

        if not self._stacks:
            return parsed

        # A leftover stack means the parse did not unwind completely.
        err = "Python tokenizer exited with non-empty token stack"
        raise ParserError(err)  # pragma: no cover (untestable/exceptional case)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc