• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

earwig / mwparserfromhell / 15949141982

28 Jun 2025 11:18PM UTC coverage: 98.886% (-0.3%) from 99.204%
15949141982

push

github

earwig
Fix a failing test

3106 of 3141 relevant lines covered (98.89%)

9.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.05
/src/mwparserfromhell/parser/tokenizer.py
1
# Copyright (C) 2012-2025 Ben Kurtovic <ben.kurtovic@gmail.com>
2
#
3
# Permission is hereby granted, free of charge, to any person obtaining a copy
4
# of this software and associated documentation files (the "Software"), to deal
5
# in the Software without restriction, including without limitation the rights
6
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
# copies of the Software, and to permit persons to whom the Software is
8
# furnished to do so, subject to the following conditions:
9
#
10
# The above copyright notice and this permission notice shall be included in
11
# all copies or substantial portions of the Software.
12
#
13
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
# SOFTWARE.
20

21
from __future__ import annotations
10✔
22

23
import html.entities
10✔
24
import math
10✔
25
import re
10✔
26
from enum import Enum
10✔
27
from typing import Literal, cast, overload
10✔
28

29
from ..definitions import (
10✔
30
    get_html_tag,
31
    is_parsable,
32
    is_scheme,
33
    is_single,
34
    is_single_only,
35
)
36
from . import contexts, tokens
10✔
37
from .errors import ParserError
10✔
38

39
__all__ = ["Tokenizer"]
10✔
40

41

42
class BadRoute(Exception):
    """Signal, inside the tokenizer, that the route being tried is invalid.

    The exception carries no message; the only payload is *context*, which is
    stored on the instance as :attr:`context` (``0`` when not given).
    """

    def __init__(self, context: int = 0) -> None:
        # No positional args are forwarded, so ``self.args`` stays empty.
        super().__init__()
        self.context = context
10✔
48

49

50
class _TagOpenData:
    """Scratch state kept while reading an HTML open tag (``<ref name="foo">``).

    The ``CX_*`` class attributes are single-bit flags (bits 0 through 7) that
    are combined in :attr:`context`.
    """

    (
        CX_NAME,
        CX_ATTR_READY,
        CX_ATTR_NAME,
        CX_ATTR_VALUE,
        CX_QUOTED,
        CX_NOTE_SPACE,
        CX_NOTE_EQUALS,
        CX_NOTE_QUOTE,
    ) = (1 << shift for shift in range(8))

    def __init__(self):
        # A fresh tag starts out in the "name" state:
        self.context = self.CX_NAME
        # Whitespace collected around the parts of the pending attribute:
        self.padding_buffer = dict.fromkeys(("first", "before_eq", "after_eq"), "")
        self.quoter = None
        self.reset = 0
10✔
67

68

69
class Sentinel(Enum):
    """Non-string markers for positions outside the text being tokenized."""

    START = 0  # a position before the first segment
    END = 1  # a position after the last segment


# Module-level shorthands for the two sentinel members:
START = Sentinel.START
END = Sentinel.END
10✔
76

77

78
class Tokenizer:
10✔
79
    """Creates a list of tokens from a string of wikicode."""
80

81
    USES_C = False
10✔
82
    MARKERS: list[str | Sentinel] = [
10✔
83
        "{",
84
        "}",
85
        "[",
86
        "]",
87
        "<",
88
        ">",
89
        "|",
90
        "=",
91
        "&",
92
        "'",
93
        '"',
94
        "#",
95
        "*",
96
        ";",
97
        ":",
98
        "/",
99
        "-",
100
        "!",
101
        "\n",
102
        START,
103
        END,
104
    ]
105
    URISCHEME = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"
10✔
106
    MAX_DEPTH = 100
10✔
107
    regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
10✔
108
    tag_splitter = re.compile(r"([\s\"\'\\]+)")
10✔
109

110
    def __init__(self):
        """Create an idle tokenizer: no text, no open stacks, head at zero."""
        # The input segments and the cursor into them:
        self._text: list[str] = []
        self._head: int = 0
        # Open stacks (see _push() for the layout of each entry) and how many
        # of them are currently open:
        self._stacks = []
        self._depth = 0
        # State that is not tied to any single stack:
        self._global = 0
        self._bad_routes = set()
        self._skip_style_tags = False
10✔
118

119
    @property
    def _stack(self):
        """The token list of the innermost (most recently pushed) stack."""
        innermost = self._stacks[-1]
        return innermost[0]
10✔
123

124
    @property
    def _context(self):
        """The context value of the innermost stack."""
        innermost = self._stacks[-1]
        return innermost[1]

    @_context.setter
    def _context(self, value):
        innermost = self._stacks[-1]
        innermost[1] = value
10✔
132

133
    @property
    def _textbuffer(self):
        """The pending-text buffer of the innermost stack."""
        innermost = self._stacks[-1]
        return innermost[2]

    @_textbuffer.setter
    def _textbuffer(self, value):
        innermost = self._stacks[-1]
        innermost[2] = value
10✔
141

142
    @property
    def _stack_ident(self):
        """The identifier of the innermost stack.

        The identifier is the ``(head, context)`` pair recorded by
        :meth:`_push` when the stack was opened. Stacks that share an
        identifier are always parsed the same way, so it can serve as a key
        for caching intermediate parsing results.
        """
        innermost = self._stacks[-1]
        return innermost[3]
10✔
151

152
    def _push(self, context=0):
        """Open a new stack (tokens, context, textbuffer) at the current head.

        Raises :exc:`BadRoute` straight away, without opening anything, if
        this head position and *context* were previously memoized as invalid.
        """
        ident = (self._head, context)
        if ident in self._bad_routes:
            raise BadRoute(context)

        # Entry layout: [token list, context, textbuffer, identifier]
        frame = [[], context, [], ident]
        self._stacks.append(frame)
        self._depth += 1
10✔
160

161
    def _push_textbuffer(self):
        """Flush buffered text onto the stack as one Text token, if any."""
        pending = self._textbuffer
        if not pending:
            return
        self._stack.append(tokens.Text(text="".join(pending)))
        self._textbuffer = []
10✔
166

167
    def _pop(self, keep_context=False):
        """Close the innermost stack and return its token list.

        Any buffered text is flushed into the token list first. When
        *keep_context* is ``True``, the closed stack's context overwrites the
        context of the stack underneath it.
        """
        self._push_textbuffer()
        self._depth -= 1
        closed = self._stacks.pop()
        if keep_context:
            # self._context now refers to the stack that was underneath.
            self._context = closed[1]
        return closed[0]
10✔
181

182
    def _can_recurse(self):
        """Return ``True`` while the stack depth is still below MAX_DEPTH."""
        return self.MAX_DEPTH > self._depth
10✔
185

186
    def _memoize_bad_route(self):
        """Record the current stack's identifier as a known-bad route.

        A later :meth:`_push` with the same head and context finds the
        identifier in the set and fails the route immediately.
        """
        ident = self._stack_ident
        self._bad_routes.add(ident)
10✔
193

194
    def _fail_route(self):
        """Abandon the current tokenization route.

        The route is memoized as bad, the current stack/context/textbuffer is
        discarded, and :exc:`.BadRoute` is raised carrying the context the
        discarded stack had.
        """
        failed_context = self._context
        self._memoize_bad_route()
        self._pop()
        raise BadRoute(failed_context)
10✔
204

205
    def _emit(self, token):
        """Append *token* to the current token stack."""
        # Buffered text precedes the token, so flush it first.
        self._push_textbuffer()
        current = self._stack
        current.append(token)
10✔
209

210
    def _emit_first(self, token):
        """Insert *token* at the front of the current token stack."""
        self._push_textbuffer()
        current = self._stack
        current.insert(0, token)
10✔
214

215
    def _emit_text(self, text):
        """Queue *text* on the current textbuffer (no token is created yet)."""
        buffer = self._textbuffer
        buffer.append(text)
10✔
218

219
    def _emit_all(self, tokenlist):
        """Append every token in *tokenlist* to the current stack.

        If the list starts with a Text token, that token is removed from
        *tokenlist* (the list is modified in place) and its text goes to the
        textbuffer instead, so it joins any text that is already buffered.
        """
        if tokenlist:
            leading = tokenlist[0]
            if isinstance(leading, tokens.Text):
                del tokenlist[0]
                self._emit_text(leading.text)
        self._push_textbuffer()
        self._stack.extend(tokenlist)
10✔
225

226
    def _emit_text_then_stack(self, text):
        """Pop the current stack, emit *text*, then re-emit the popped tokens.

        The head is moved back by one position afterwards.
        """
        popped = self._pop()
        self._emit_text(text)
        if popped:
            self._emit_all(popped)
        self._head -= 1
10✔
233

234
    @overload
    def _read(
        self, *, strict: Literal[False] = False
    ) -> str | Literal[Sentinel.END]: ...

    @overload
    def _read(self, *, strict: Literal[True]) -> str: ...

    @overload
    def _read(
        self, delta: int = 0, *, strict: Literal[False] = False
    ) -> str | Literal[Sentinel.START, Sentinel.END]: ...

    @overload
    def _read(
        self, delta: int = 0, *, strict: Literal[True]
    ) -> str | Literal[Sentinel.START]: ...

    def _read(
        self, delta: int = 0, *, strict: bool = False
    ) -> str | Literal[Sentinel.START, Sentinel.END]:
        """Return the segment *delta* positions away from the head.

        The position looked up is :attr:`self._head <_head>` plus *delta*,
        which may be negative. A position before the start of the text yields
        ``START``. A position past the end yields ``END``, unless *strict* is
        ``True``, in which case the route is failed with :meth:`_fail_route`.
        """
        position = self._head + delta
        if position < 0:
            # Checked explicitly: a negative index would otherwise wrap around.
            return START
        try:
            segment = self._text[position]
        except IndexError:
            if strict:
                self._fail_route()
            return END
        return segment
10✔
272

273
    def _parse_template(self, has_content):
        """Parse a template at the head of the wikicode string.

        On :exc:`BadRoute` the head is restored to where it was on entry and
        the exception is re-raised for the caller to handle.
        """
        start = self._head
        extra = contexts.HAS_TEMPLATE if has_content else 0
        try:
            template = self._parse(contexts.TEMPLATE_NAME | extra)
        except BadRoute:
            self._head = start
            raise
        # The open token goes in front of whatever the stack already holds:
        self._emit_first(tokens.TemplateOpen())
        self._emit_all(template)
        self._emit(tokens.TemplateClose())
10✔
287

288
    def _parse_argument(self):
        """Parse an argument at the head of the wikicode string.

        On :exc:`BadRoute` the head is restored to where it was on entry and
        the exception is re-raised for the caller to handle.
        """
        start = self._head
        try:
            argument = self._parse(contexts.ARGUMENT_NAME)
        except BadRoute:
            self._head = start
            raise
        # The open token goes in front of whatever the stack already holds:
        self._emit_first(tokens.ArgumentOpen())
        self._emit_all(argument)
        self._emit(tokens.ArgumentClose())
10✔
299

300
    def _parse_template_or_argument(self):
        """Parse a template or argument at the head of the wikicode string.

        The run of opening braces is counted first. Braces are then consumed
        three at a time (argument) or two at a time (template) until none are
        left; braces that cannot be matched are emitted as plain text.
        """
        # Step over the first two braces, then count any that follow:
        self._head += 2
        pending = 2
        while self._read() == "{":
            self._head += 1
            pending += 1
        has_content = False
        self._push()

        while pending:
            if pending == 1:
                return self._emit_text_then_stack("{")
            if pending == 2:
                try:
                    self._parse_template(has_content)
                except BadRoute:
                    return self._emit_text_then_stack("{{")
                break
            # Three or more braces left: try an argument before a template.
            try:
                self._parse_argument()
            except BadRoute:
                try:
                    self._parse_template(has_content)
                except BadRoute:
                    return self._emit_text_then_stack("{" * pending)
                else:
                    pending -= 2
            else:
                pending -= 3
            if pending:
                has_content = True
                self._head += 1

        self._emit_all(self._pop())
        if self._context & contexts.FAIL_NEXT:
            self._context ^= contexts.FAIL_NEXT
10✔
335

336
    def _handle_template_param(self):
        """Handle a template parameter at the head of the string.

        Leaves the name or value state (or closes the stack of the previous
        parameter key), emits the separator token, and opens a new stack in
        the parameter-key state.
        """
        ctx = self._context
        if ctx & contexts.TEMPLATE_NAME:
            # A name needs some text or a nested template to be valid.
            if not ctx & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE):
                self._fail_route()
            self._context = ctx ^ contexts.TEMPLATE_NAME
        elif ctx & contexts.TEMPLATE_PARAM_VALUE:
            self._context = ctx ^ contexts.TEMPLATE_PARAM_VALUE
        else:
            self._emit_all(self._pop())
        self._context |= contexts.TEMPLATE_PARAM_KEY
        self._emit(tokens.TemplateParamSeparator())
        self._push(self._context)
10✔
349

350
    def _handle_template_param_value(self):
        """Handle a template parameter's value at the head of the string.

        Closes the key's stack, switches the context from the key state to
        the value state, and emits the equals token.
        """
        self._emit_all(self._pop())
        switched = self._context ^ contexts.TEMPLATE_PARAM_KEY
        self._context = switched | contexts.TEMPLATE_PARAM_VALUE
        self._emit(tokens.TemplateParamEquals())
10✔
356

357
    def _handle_template_end(self):
        """Handle the end of a template at the head of the string.

        Returns the popped token list after advancing the head by one.
        """
        ctx = self._context
        if ctx & contexts.TEMPLATE_NAME:
            # A name needs some text or a nested template to be valid.
            if not ctx & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE):
                self._fail_route()
        elif ctx & contexts.TEMPLATE_PARAM_KEY:
            # Fold the still-open key stack into the one below it.
            self._emit_all(self._pop())
        self._head += 1
        return self._pop()
10✔
366

367
    def _handle_argument_separator(self):
        """Handle the separator between an argument's name and default.

        Switches the context from the name state to the default state and
        emits the separator token.
        """
        switched = self._context ^ contexts.ARGUMENT_NAME
        self._context = switched | contexts.ARGUMENT_DEFAULT
        self._emit(tokens.ArgumentSeparator())
10✔
372

373
    def _handle_argument_end(self):
        """Handle the end of an argument at the head of the string.

        Advances the head by two, then closes and returns the current stack.
        """
        self._head += 2
        argument = self._pop()
        return argument
10✔
377

378
    def _parse_wikilink(self):
        """Parse an internal wikilink at the head of the wikicode string.

        The content is first tried as a bracketed external link. Only if that
        fails is it parsed as a wikilink. If both fail, ``[[`` is emitted as
        plain text.
        """
        start = self._head
        self._head = start + 2
        try:
            # If the wikilink looks like an external link, parse it as such:
            link, _extra = self._really_parse_external_link(True)
        except BadRoute:
            looks_external = False
        else:
            looks_external = True

        if looks_external:
            if self._context & contexts.EXT_LINK_TITLE:
                # In this exceptional case, an external link that looks like a
                # wikilink inside of an external link is parsed as text:
                self._head = start + 1
                self._emit_text("[[")
                return
            self._emit_text("[")
            self._emit(tokens.ExternalLinkOpen(brackets=True))
            self._emit_all(link)
            self._emit(tokens.ExternalLinkClose())
            return

        # Otherwise, actually parse it as a wikilink:
        self._head = start + 2
        try:
            wikilink = self._parse(contexts.WIKILINK_TITLE)
        except BadRoute:
            self._head = start + 1
            self._emit_text("[[")
            return
        self._emit(tokens.WikilinkOpen())
        self._emit_all(wikilink)
        self._emit(tokens.WikilinkClose())
10✔
408

409
    def _handle_wikilink_separator(self):
        """Handle the separator between a wikilink's title and its text.

        Switches the context from the title state to the text state and
        emits the separator token.
        """
        switched = self._context ^ contexts.WIKILINK_TITLE
        self._context = switched | contexts.WIKILINK_TEXT
        self._emit(tokens.WikilinkSeparator())
10✔
414

415
    def _handle_wikilink_end(self):
        """Handle the end of a wikilink at the head of the string.

        Advances the head by one, then closes and returns the current stack.
        """
        self._head += 1
        wikilink = self._pop()
        return wikilink
10✔
419

420
    def _parse_bracketed_uri_scheme(self):
        """Parse the URI scheme of a bracket-enclosed external link.

        A new stack is opened for the link. Either a bare ``//`` or a
        ``scheme:`` (optionally followed by ``//``) is accepted; anything
        else fails the route.
        """
        self._push(contexts.EXT_LINK_URI)
        if self._read() == self._read(1) == "/":
            self._emit_text("//")
            self._head += 2
            return

        allowed = self.URISCHEME
        pieces = []
        while True:
            this = self._read()
            if this is END or not all(char in allowed for char in this):
                break
            pieces.append(this)
            self._emit_text(this)
            self._head += 1
        scheme = "".join(pieces)

        if self._read() != ":":
            self._fail_route()
        self._emit_text(":")
        self._head += 1

        slashes = self._read() == self._read(1) == "/"
        if slashes:
            self._emit_text("//")
            self._head += 2
        if not is_scheme(scheme, slashes):
            self._fail_route()
10✔
446

447
    def _parse_free_uri_scheme(self):
        """Parse the URI scheme of a free (no brackets) external link.

        Raises :exc:`BadRoute` directly, before any stack is opened, when no
        valid scheme precedes the head.
        """
        # The scheme was just parsed as text, so walk the textbuffer backwards
        # to recover it, stopping at the first non-word character:
        collected = []
        at_boundary = False
        for chunk in reversed(self._textbuffer):
            for char in reversed(chunk):
                if re.fullmatch(r"\W", char):
                    at_boundary = True
                    break
                if char not in self.URISCHEME:
                    raise BadRoute()
                collected.append(char)
            if at_boundary:
                break

        # Characters were gathered last-to-first; put them back in order.
        scheme = "".join(collected[::-1])
        slashes = self._read() == self._read(1) == "/"
        if not is_scheme(scheme, slashes):
            raise BadRoute()

        self._push(self._context | contexts.EXT_LINK_URI)
        self._emit_text(scheme)
        self._emit_text(":")
        if slashes:
            self._emit_text("//")
            self._head += 2
10✔
473

474
    def _handle_free_link_text(self, punct, tail, this):
        """Handle text in a free ext link, including trailing punctuation.

        Trailing punctuation of *this* is held back in *tail* rather than
        emitted, and a previously held *tail* is emitted once more text
        follows it. Returns the updated ``(punct, tail)`` pair.
        """
        if "(" in this and ")" in punct:
            punct = punct[:-1]  # ')' is no longer valid punctuation

        if this.endswith(punct):
            # Walk left to find where the trailing run of punctuation starts:
            cut = len(this) - 1
            while cut > 0 and this[cut - 1] in punct:
                cut -= 1
            kept, trailing = this[:cut], this[cut:]
            if kept and tail:
                self._emit_text(tail)
                tail = ""
            tail += trailing
            this = kept
        elif tail:
            self._emit_text(tail)
            tail = ""

        self._emit_text(this)
        return punct, tail
10✔
495

496
    def _is_uri_end(self, this, nxt):
        """Return whether the current head is the end of a URI.

        The result is only meant to be tested for truth; it may be a context
        bitmask rather than a ``bool``.
        """
        # Built from _parse()'s end sentinels:
        terminators = (END, "\n", "[", "]", "<", ">", '"')
        third = self._read(2)
        ctx = self._context
        in_template = ctx & contexts.TEMPLATE
        return (
            this in terminators
            or " " in this
            or this == nxt == "'"
            or (this == "|" and in_template)
            or (this == "=" and ctx & (contexts.TEMPLATE_PARAM_KEY | contexts.HEADING))
            or (this == nxt == "}" and in_template)
            or (this == nxt == third == "}" and ctx & contexts.ARGUMENT)
        )
509

510
    def _really_parse_external_link(self, brackets):
        """Really parse an external link.

        Returns a ``(tokens, tail)`` pair. For bracketed links *tail* is
        always ``None``; for free links it is the trailing text (punctuation
        and anything after a space) that was held back from the link.
        """
        if brackets:
            self._parse_bracketed_uri_scheme()
            invalid = ("\n", " ", "]")
            punct = ()
        else:
            self._parse_free_uri_scheme()
            invalid = ("\n", " ", "[", "]")
            punct = tuple(",;\\.:!?)")

        # Something valid must follow the scheme.
        first = self._read()
        if first is END or first[0] in invalid:
            self._fail_route()

        tail = ""
        while True:
            this, nxt = self._read(), self._read(1)

            # Entities, comments and templates/arguments are parsed by their
            # own routines regardless of the kind of link:
            if this == "&":
                nested = self._parse_entity
            elif this == "<" and nxt == "!" and self._read(2) == self._read(3) == "-":
                nested = self._parse_comment
            elif this == nxt == "{" and self._can_recurse():
                nested = self._parse_template_or_argument
            else:
                nested = None

            if nested is not None:
                if tail:
                    self._emit_text(tail)
                    tail = ""
                nested()
            elif brackets:
                if this is END or this == "\n":
                    self._fail_route()
                if this == "]":
                    return self._pop(), None
                if self._is_uri_end(this, nxt):
                    if " " in this:
                        before, _space, after = this.partition(" ")
                        self._emit_text(before)
                        self._emit(tokens.ExternalLinkSeparator())
                        if after:
                            self._emit_text(after)
                        self._head += 1
                    else:
                        separator = tokens.ExternalLinkSeparator()
                        separator.suppress_space = True
                        self._emit(separator)
                    # Switch from the URI state to the title state and let
                    # _parse() continue on the same stack:
                    self._context ^= contexts.EXT_LINK_URI
                    self._context |= contexts.EXT_LINK_TITLE
                    return self._parse(push=False), None
                self._emit_text(this)
            else:
                if self._is_uri_end(this, nxt):
                    if this is not END and " " in this:
                        before, _space, after = this.partition(" ")
                        punct, tail = self._handle_free_link_text(punct, tail, before)
                        tail += " " + after
                    else:
                        self._head -= 1
                    return self._pop(), tail
                punct, tail = self._handle_free_link_text(punct, tail, this)
            self._head += 1
10✔
572

573
    def _remove_uri_scheme_from_textbuffer(self, scheme):
        """Remove the URI scheme of a new external link from the textbuffer.

        ``len(scheme)`` characters are taken off the end of the buffer,
        dropping whole chunks where needed and trimming the last one touched.
        """
        remaining = len(scheme)
        buffer = self._textbuffer
        while remaining:
            last = buffer[-1]
            if remaining < len(last):
                buffer[-1] = last[:-remaining]
                break
            remaining -= len(last)
            buffer.pop()
10✔
582

583
    def _parse_external_link(self, brackets):
        """Parse an external link at the head of the wikicode string.

        When links are not allowed in the current context, recursion is
        exhausted, or the link turns out to be invalid, the segment at the
        head is handled as ordinary text (or as a definition-list term for a
        free link in a DL_TERM context).
        """

        def emit_unlinked():
            # Fallback shared by the "not allowed" and "bad route" cases.
            if not brackets and self._context & contexts.DL_TERM:
                self._handle_dl_term()
            else:
                self._emit_text(self._read())

        if self._context & contexts.NO_EXT_LINKS or not self._can_recurse():
            emit_unlinked()
            return

        start = self._head
        self._head += 1
        try:
            link, extra = self._really_parse_external_link(brackets)
        except BadRoute:
            self._head = start
            emit_unlinked()
            return

        if not brackets:
            # The scheme of a free link was already emitted as text; take it
            # back out of the textbuffer before emitting the link itself.
            scheme = link[0].text.split(":", 1)[0]
            self._remove_uri_scheme_from_textbuffer(scheme)
        self._emit(tokens.ExternalLinkOpen(brackets=brackets))
        self._emit_all(link)
        self._emit(tokens.ExternalLinkClose())
        if extra:
            self._emit_text(extra)
10✔
611

612
    def _parse_heading(self):
        """Parse a section heading at the head of the wikicode string.

        The GL_HEADING global flag is set for the duration of the call. If
        the heading turns out to be invalid, its equals signs are emitted as
        plain text.
        """
        self._global |= contexts.GL_HEADING
        start = self._head
        self._head += 1
        count = 1
        while self._read() == "=":
            count += 1
            self._head += 1
        # The shift is capped at 5, so longer runs map to the sixth level.
        context = contexts.HEADING_LEVEL_1 << min(count - 1, 5)

        try:
            title, level = self._parse(context)
        except BadRoute:
            self._head = start + count - 1
            self._emit_text("=" * count)
        else:
            self._emit(tokens.HeadingStart(level=level))
            if level < count:
                # Equals signs beyond the heading level are part of the title.
                self._emit_text("=" * (count - level))
            self._emit_all(title)
            self._emit(tokens.HeadingEnd())
        finally:
            self._global ^= contexts.GL_HEADING
10✔
636

637
    def _handle_heading_end(self):
        """Handle the end of a section heading at the head of the string.

        Returns a ``(tokens, level)`` pair for the heading being closed.
        """
        start = self._head
        self._head += 1
        count = 1
        while self._read() == "=":
            count += 1
            self._head += 1
        # Recover the level the heading was opened with from the context:
        opened = int(math.log(self._context / contexts.HEADING_LEVEL_1, 2)) + 1
        level = min(opened, count, 6)

        try:  # Try to check for a heading closure after this one
            after, after_level = self._parse(self._context)
        except BadRoute:
            if level < count:
                self._emit_text("=" * (count - level))
            self._head = start + count - 1
            return self._pop(), level
        # Found another closure, so this run of equals signs is just text:
        self._emit_text("=" * count)
        self._emit_all(after)
        return self._pop(), after_level
10✔
659

660
    def _really_parse_entity(self):
        """Actually parse an HTML entity and ensure that it is valid.

        Handles named (``&nbsp;``), decimal (``&#160;``) and hexadecimal
        (``&#xA0;``) entities. The route is failed with :meth:`_fail_route`
        when the text runs out, the body contains characters that are not
        allowed for its kind, the closing ``;`` is missing, a numeric value
        lies outside ``1..0x10FFFF``, or a name is not in
        :data:`html.entities.entitydefs`.
        """
        self._emit(tokens.HTMLEntityStart())
        self._head += 1

        this = self._read(strict=True)
        if this == "#":
            numeric = True
            self._emit(tokens.HTMLEntityNumeric())
            self._head += 1
            this = self._read(strict=True)
            if this[0].lower() == "x":
                hexadecimal = True
                self._emit(tokens.HTMLEntityHex(char=this[0]))
                this = this[1:]
                if not this:
                    self._fail_route()
            else:
                hexadecimal = False
        else:
            numeric = hexadecimal = False

        valid = "0123456789abcdefABCDEF" if hexadecimal else "0123456789"
        if not numeric and not hexadecimal:
            valid += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
        if not all(char in valid for char in this):
            self._fail_route()

        self._head += 1
        if self._read() != ";":
            self._fail_route()
        if numeric:
            # Bound the digit count before converting: int() raises
            # ValueError on very long decimal strings, which would escape the
            # tokenizer instead of failing the route. 0x10FFFF has 6 hex
            # digits and 7 decimal digits; leading zeros do not count.
            significant = this.lstrip("0")
            max_digits = 6 if hexadecimal else 7
            if not significant or len(significant) > max_digits:
                # An all-zero value is 0, which is below the valid range.
                self._fail_route()
            test = int(significant, 16 if hexadecimal else 10)
            if test > 0x10FFFF:
                self._fail_route()
        else:
            if this not in html.entities.entitydefs:
                self._fail_route()

        self._emit(tokens.Text(text=this))
        self._emit(tokens.HTMLEntityEnd())
10✔
701

702
    def _parse_entity(self):
        """Parse an HTML entity at the head of the wikicode string.

        If the entity is invalid, the head is restored and the segment at the
        head is emitted as plain text instead.
        """
        start = self._head
        try:
            self._push(contexts.HTML_ENTITY)
            self._really_parse_entity()
        except BadRoute:
            self._head = start
            self._emit_text(self._read())
            return
        self._emit_all(self._pop())
10✔
713

714
    def _parse_comment(self):
        """Parse an HTML comment at the head of the wikicode string.

        If the end of the text is reached before ``-->`` is found, the
        partial comment is discarded and ``<!--`` is emitted as plain text.
        """
        self._head += 4
        restart = self._head - 1
        self._push()
        while True:
            this = self._read()
            if this == END:
                # Unterminated: throw the partial comment away.
                self._pop()
                self._head = restart
                self._emit_text("<!--")
                return
            if this == self._read(1) == "-" and self._read(2) == ">":
                self._emit_first(tokens.CommentStart())
                self._emit(tokens.CommentEnd())
                self._emit_all(self._pop())
                self._head += 2
                if self._context & contexts.FAIL_NEXT:
                    # _verify_safe() sets this flag while parsing a template
                    # or link when it encounters what might be a comment -- we
                    # must unset it to let _verify_safe() know it was correct:
                    self._context ^= contexts.FAIL_NEXT
                return
            self._emit_text(this)
            self._head += 1
10✔
739

740
    def _push_tag_buffer(self, data):
        """Write a pending tag attribute from *data* to the stack.

        Every entry of ``data.padding_buffer`` is reset to an empty string
        afterwards.
        """
        if data.context & data.CX_QUOTED:
            # A quoted value sits on its own stack; close it first.
            self._emit_first(tokens.TagAttrQuote(char=data.quoter))
            self._emit_all(self._pop())

        padding = data.padding_buffer
        attr_start = tokens.TagAttrStart(
            pad_first=padding["first"],
            pad_before_eq=padding["before_eq"],
            pad_after_eq=padding["after_eq"],
        )
        self._emit_first(attr_start)
        self._emit_all(self._pop())

        for key in data.padding_buffer:
            data.padding_buffer[key] = ""
10✔
756

757
    def _handle_tag_space(self, data, text):
        """Handle whitespace (*text*) inside of an HTML open tag.

        The first half updates ``data.context`` from the state on entry. The
        second half files *text*: as literal text inside a quoted value, or
        into the padding slot that matches the resulting state.
        """
        before = data.context
        quoted = before & data.CX_QUOTED
        note_space = before & data.CX_NOTE_SPACE
        unquoted_value_ended = before & data.CX_ATTR_VALUE and not before & (
            data.CX_QUOTED | data.CX_NOTE_QUOTE
        )

        if unquoted_value_ended or (quoted and note_space):
            self._push_tag_buffer(data)
            data.context = data.CX_ATTR_READY
        elif note_space:
            data.context = data.CX_ATTR_READY
        elif before & data.CX_ATTR_NAME:
            data.context |= data.CX_NOTE_EQUALS
            data.padding_buffer["before_eq"] += text

        # The first test uses the state on entry; the others use the new one.
        if quoted and not note_space:
            self._emit_text(text)
        elif data.context & data.CX_ATTR_READY:
            data.padding_buffer["first"] += text
        elif data.context & data.CX_ATTR_VALUE:
            data.padding_buffer["after_eq"] += text
10✔
777

778
    def _handle_tag_text(self, text):
        """Handle regular *text* inside of an HTML open tag.

        Templates/arguments (``{{``), wikilinks (``[[``) and nested tags
        (``<``) are parsed recursively when recursion is still allowed;
        everything else is emitted as plain text.
        """
        following = self._read(1)
        if self._can_recurse() and text in self.MARKERS:
            if text == following == "{":
                self._parse_template_or_argument()
                return
            if text == following == "[":
                self._parse_wikilink()
                return
            if text == "<":
                self._parse_tag()
                return
        # Not markup we handle here (or recursion is exhausted): plain text.
        self._emit_text(text)
791

792
    def _handle_tag_data(self, data, text):
        """Handle all sorts of *text* data inside of an HTML open tag.

        *text* is split with ``self.tag_splitter`` and each non-empty chunk is
        run through a small state machine whose state lives in
        ``data.context``. Chunks that are not consumed as syntax (``=``,
        quotes, whitespace) are handed to ``_handle_tag_text()``.
        """
        for chunk in self.tag_splitter.split(text):
            if not chunk:
                continue
            if data.context & data.CX_NAME:
                # Still expecting the tag name itself.
                if chunk in self.MARKERS or chunk.isspace():
                    self._fail_route()  # Tags must start with text, not spaces
                # Name seen; whitespace must come before anything else.
                data.context = data.CX_NOTE_SPACE
            elif chunk.isspace():
                self._handle_tag_space(data, chunk)
                continue
            elif data.context & data.CX_NOTE_SPACE:
                # Whitespace was required here but something else showed up.
                if data.context & data.CX_QUOTED:
                    # Text directly after a closing quote: the value is not
                    # treated as quoted after all. Remember this route as
                    # bad, drop the quoted stack, and rescan from the quote
                    # as an unquoted value.
                    data.context = data.CX_ATTR_VALUE
                    self._memoize_bad_route()
                    self._pop()
                    self._head = data.reset - 1  # Will be auto-incremented
                    return  # Break early
                self._fail_route()
            elif data.context & data.CX_ATTR_READY:
                # First chunk of a new attribute: collect it on its own stack.
                data.context = data.CX_ATTR_NAME
                self._push(contexts.TAG_ATTR)
            elif data.context & data.CX_ATTR_NAME:
                if chunk == "=":
                    # Switch to reading the value; the next chunk decides
                    # whether that value is quoted.
                    data.context = data.CX_ATTR_VALUE | data.CX_NOTE_QUOTE
                    self._emit(tokens.TagAttrEquals())
                    continue
                if data.context & data.CX_NOTE_EQUALS:
                    # Whitespace followed the name and no "=" came: the
                    # previous attribute had no value, so flush it and start
                    # a new attribute with this chunk.
                    self._push_tag_buffer(data)
                    data.context = data.CX_ATTR_NAME
                    self._push(contexts.TAG_ATTR)
            else:  # data.context & data.CX_ATTR_VALUE assured
                # A quote preceded by a single (unescaped) backslash does not
                # count as a quote:
                escaped = self._read(-1) == "\\" and self._read(-2) != "\\"
                if data.context & data.CX_NOTE_QUOTE:
                    # First chunk of the value: only here may a quote open.
                    data.context ^= data.CX_NOTE_QUOTE
                    if chunk in "'\"" and not escaped:
                        data.context |= data.CX_QUOTED
                        data.quoter = chunk
                        # Where to rewind to if the quote turns out invalid:
                        data.reset = self._head
                        try:
                            self._push(self._context)
                        except BadRoute:
                            # Already failed to parse this as a quoted string
                            data.context = data.CX_ATTR_VALUE
                            self._head -= 1
                            return
                        continue
                elif data.context & data.CX_QUOTED:
                    if chunk == data.quoter and not escaped:
                        # Closing quote: whitespace (or the tag end) must
                        # come next.
                        data.context |= data.CX_NOTE_SPACE
                        continue
            self._handle_tag_text(chunk)
845

846
    def _handle_tag_close_open(self, data, token):
        """Handle the closing of an open tag (``<foo>``).

        Any attribute still being read is flushed first; then *token* (a
        token class) is instantiated with the remaining leading padding and
        emitted, and the head advances by one.
        """
        attribute_pending = data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE)
        if attribute_pending:
            self._push_tag_buffer(data)
        # Read the padding only now: flushing the attribute clears the buffer.
        leading_pad = data.padding_buffer["first"]
        self._emit(token(padding=leading_pad))
        self._head += 1
852

853
    def _handle_tag_open_close(self):
        """Handle the opening of a closing tag (``</foo>``).

        Emits ``TagOpenClose``, pushes a fresh ``TAG_CLOSE`` stack for the
        closing tag's name, and advances the head by one.
        """
        open_close = tokens.TagOpenClose()
        self._emit(open_close)
        self._push(contexts.TAG_CLOSE)
        self._head += 1
858

859
    def _handle_tag_close_close(self):
        """Handle the ending of a closing tag (``</foo>``).

        The popped closing stack must consist of exactly one ``Text`` token
        whose text equals the opening tag's name (ignoring case and trailing
        whitespace); otherwise the route fails. On success the closing name
        and a ``TagCloseClose`` token are emitted and the tag's stack is
        popped and returned.
        """
        closing = self._pop()
        names_match = False
        if len(closing) == 1 and isinstance(closing[0], tokens.Text):
            closed_name = closing[0].text.rstrip().lower()
            opened_name = self._stack[1].text.rstrip().lower()
            names_match = closed_name == opened_name
        if not names_match:
            self._fail_route()
        self._emit_all(closing)
        self._emit(tokens.TagCloseClose())
        return self._pop()
874

875
    def _handle_blacklisted_tag(self):
        """Handle the body of an HTML tag that is parser-blacklisted.

        The body is copied as text (HTML entities are still parsed) until the
        tag's own closing tag is found, at which point the closing tokens are
        emitted and the popped stack is returned. Reaching the end of the
        input first fails the route.
        """

        def strip(text):
            # Tag names are compared ignoring case and trailing whitespace.
            return text.rstrip().lower()

        while True:
            this, nxt = self._read(), self._read(1)
            if this is END:
                self._fail_route()
            elif this == "<" and nxt == "/":
                # Jump to where the ">" of a closing tag would be; the chunk
                # just before it is then the candidate tag name.
                self._head += 3
                if self._read() != ">" or (
                    strip(self._read(-1)) != strip(self._stack[1].text)
                ):
                    # Not our closing tag: keep "</" as text and resume with
                    # the chunk that follows it.
                    self._head -= 1
                    self._emit_text("</")
                    continue
                self._emit(tokens.TagOpenClose())
                self._emit_text(self._read(-1))
                self._emit(tokens.TagCloseClose())
                return self._pop()
            elif this == "&":
                self._parse_entity()
            else:
                self._emit_text(this)
            self._head += 1
902

903
    def _handle_single_only_tag_end(self):
        """Handle the end of an implicitly closing single-only HTML tag.

        The token on top of the stack is replaced by an implicit
        ``TagCloseSelfclose`` with the same padding, and the popped stack is
        returned.
        """
        replaced = self._stack.pop()
        selfclose = tokens.TagCloseSelfclose(
            padding=replaced.padding, implicit=True
        )
        self._emit(selfclose)
        # Offset displacement done by _handle_tag_close_open:
        self._head -= 1
        return self._pop()
909

910
    def _handle_single_tag_end(self):
        """Handle the stream end when inside a single-supporting HTML tag.

        The ``TagCloseOpen`` token that ended the open tag is replaced with an
        implicit ``TagCloseSelfclose`` carrying the same padding, and the
        popped stack is returned.

        Raises ``ParserError`` if the matching ``TagCloseOpen`` cannot be
        found or a ``TagCloseSelfclose`` is found in its place.
        """
        stack = self._stack
        # We need to find the index of the TagCloseOpen token corresponding to
        # the TagOpenOpen token located at index 0:
        depth = 1
        # Scanning starts at index 2, after the opening token and the token
        # that follows it.
        for index, token in enumerate(stack[2:], 2):
            if isinstance(token, tokens.TagOpenOpen):
                depth += 1
            elif isinstance(token, tokens.TagCloseOpen):
                depth -= 1
                if depth == 0:
                    break
            elif isinstance(token, tokens.TagCloseSelfclose):
                depth -= 1
                if depth == 0:  # pragma: no cover (untestable/exceptional)
                    raise ParserError(
                        "_handle_single_tag_end() got an unexpected TagCloseSelfclose"
                    )
        else:  # pragma: no cover (untestable/exceptional case)
            raise ParserError("_handle_single_tag_end() missed a TagCloseOpen")
        padding = stack[index].padding
        stack[index] = tokens.TagCloseSelfclose(padding=padding, implicit=True)
        return self._pop()
934

935
    def _really_parse_tag(self):
        """Actually parse an HTML tag, starting with the open (``<foo>``).

        Pushes a ``TAG_OPEN`` stack and reads the open tag chunk by chunk.
        When the open tag ends with ``>``, the body is handled according to
        the tag name (single-only, parsable, or blacklisted) and that
        handler's result is returned; when it ends with ``/>``, the popped
        stack is returned directly. Reaching the end of the input inside the
        open tag fails the route.
        """
        data = _TagOpenData()
        self._push(contexts.TAG_OPEN)
        self._emit(tokens.TagOpenOpen())
        while True:
            this, nxt = self._read(), self._read(1)
            # The tag may only end once its name has been read, and not in
            # the middle of a quoted value (CX_NOTE_SPACE is set once the
            # closing quote has been seen):
            can_exit = (
                not data.context & (data.CX_QUOTED | data.CX_NAME)
                or data.context & data.CX_NOTE_SPACE
            )
            if this is END:
                if self._context & contexts.TAG_ATTR:
                    if data.context & data.CX_QUOTED:
                        # Unclosed attribute quote: reset, don't die
                        data.context = data.CX_ATTR_VALUE
                        self._memoize_bad_route()
                        self._pop()
                        # Rescan from the quote; "continue" skips the
                        # increment at the bottom of the loop.
                        self._head = data.reset
                        continue
                    # Drop the unfinished attribute's stack before failing.
                    self._pop()
                self._fail_route()
            elif this == ">" and can_exit:
                self._handle_tag_close_open(data, tokens.TagCloseOpen)
                self._context = contexts.TAG_BODY
                # The token at index 1 of the stack holds the tag name:
                if is_single_only(self._stack[1].text):
                    return self._handle_single_only_tag_end()
                if is_parsable(self._stack[1].text):
                    # Parse the body on the current stack rather than a new
                    # one.
                    return self._parse(push=False)
                return self._handle_blacklisted_tag()
            elif this == "/" and nxt == ">" and can_exit:
                self._handle_tag_close_open(data, tokens.TagCloseSelfclose)
                return self._pop()
            else:
                self._handle_tag_data(data, this)
            self._head += 1
971

972
    def _handle_invalid_tag_start(self):
        """Handle the (possible) start of an implicitly closing single tag.

        The chunk two positions past the head is taken as the candidate tag
        name. If it names a single-only tag, the tag is parsed and emitted
        with its ``TagOpenOpen`` token flagged as invalid. Otherwise, or if
        parsing the tag fails, the head moves to one past its starting
        position and ``"</"`` is emitted as plain text.
        """
        reset = self._head + 1
        self._head += 2
        try:
            # Bind ``this`` in its own statement: asserts are stripped under
            # ``python -O``, so an assignment inside the assert would leave
            # the name unbound on the next line.
            this = self._read()
            # NOTE(review): assumes callers only get here with input left
            # after "</" -- confirm against the call site.
            assert this is not END
            if not is_single_only(self.tag_splitter.split(this)[0]):
                raise BadRoute()
            tag = self._really_parse_tag()
        except BadRoute:
            self._head = reset
            self._emit_text("</")
        else:
            tag[0].invalid = True  # Set flag of TagOpenOpen
            self._emit_all(tag)
987

988
    def _parse_tag(self):
        """Parse an HTML tag at the head of the wikicode string.

        If the tag cannot be parsed, the head is restored and ``"<"`` is
        emitted as plain text instead.
        """
        start = self._head
        self._head = start + 1
        try:
            tag = self._really_parse_tag()
        except BadRoute:
            # Not a valid tag after all.
            self._head = start
            self._emit_text("<")
            return
        self._emit_all(tag)
999

1000
    def _emit_style_tag(self, tag, markup, body):
        """Write the body of a tag and the tokens that should surround it.

        *tag* is the tag name, *markup* the wiki markup recorded on the
        opening token, and *body* the tokens placed between the opening and
        closing halves.
        """
        # Opening half:
        opening = tokens.TagOpenOpen(wiki_markup=markup)
        self._emit(opening)
        self._emit_text(tag)
        self._emit(tokens.TagCloseOpen())

        self._emit_all(body)

        # Closing half:
        self._emit(tokens.TagOpenClose())
        self._emit_text(tag)
        self._emit(tokens.TagCloseClose())
1009

1010
    def _parse_italics(self):
        """Parse wiki-style italics.

        If the first attempt fails with ``STYLE_PASS_AGAIN`` set on the failed
        route, the text is parsed a second time with ``STYLE_SECOND_PASS``
        added. When no attempt succeeds, ``''`` is emitted as plain text.
        """
        start = self._head
        try:
            body = self._parse(contexts.STYLE_ITALICS)
        except BadRoute as failure:
            self._head = start
            if not failure.context & contexts.STYLE_PASS_AGAIN:
                self._emit_text("''")
                return
            second_pass = contexts.STYLE_ITALICS | contexts.STYLE_SECOND_PASS
            try:
                body = self._parse(second_pass)
            except BadRoute:
                self._head = start
                self._emit_text("''")
                return
        self._emit_style_tag("i", "''", body)
1029

1030
    def _parse_bold(self):
        """Parse wiki-style bold.

        Returns ``True`` only when parsing failed during a second italics
        pass (after emitting a single ``'``); returns ``False`` in every
        other case.
        """
        start = self._head
        try:
            body = self._parse(contexts.STYLE_BOLD)
        except BadRoute:
            self._head = start
            current = self._context
            if current & contexts.STYLE_SECOND_PASS:
                self._emit_text("'")
                return True
            if not current & contexts.STYLE_ITALICS:
                # One tick is text; the other two may open italics.
                self._emit_text("'")
                self._parse_italics()
                return False
            # Inside italics: flag the context so the italics can be parsed
            # again, and keep the ticks as text for now.
            self._context |= contexts.STYLE_PASS_AGAIN
            self._emit_text("'''")
            return False
        self._emit_style_tag("b", "'''", body)
        return False
1049

1050
    def _parse_italics_and_bold(self):
        """Parse wiki-style italics and bold together (i.e., five ticks).

        Bold is tried first and italics second; whichever succeeds becomes
        the inner tag, and the other style is then tried for the text that
        follows it. Ticks that cannot be matched are emitted as plain text.
        """
        reset = self._head
        try:
            stack = self._parse(contexts.STYLE_BOLD)
        except BadRoute:
            # Bold failed; try italics from the same position.
            self._head = reset
            try:
                stack = self._parse(contexts.STYLE_ITALICS)
            except BadRoute:
                # Neither style parses: all five ticks are text.
                self._head = reset
                self._emit_text("'''''")
            else:
                # Italics closed; now try bold for what follows.
                reset = self._head
                try:
                    stack2 = self._parse(contexts.STYLE_BOLD)
                except BadRoute:
                    # Only italics worked: the three bold ticks become text
                    # in front of the italic tag.
                    self._head = reset
                    self._emit_text("'''")
                    self._emit_style_tag("i", "''", stack)
                else:
                    # Both worked: build the italic tag plus the following
                    # tokens on a temporary stack, then wrap it all in bold.
                    self._push()
                    self._emit_style_tag("i", "''", stack)
                    self._emit_all(stack2)
                    self._emit_style_tag("b", "'''", self._pop())
        else:
            # Bold closed; now try italics for what follows.
            reset = self._head
            try:
                stack2 = self._parse(contexts.STYLE_ITALICS)
            except BadRoute:
                # Only bold worked: the two italic ticks become text in front
                # of the bold tag.
                self._head = reset
                self._emit_text("''")
                self._emit_style_tag("b", "'''", stack)
            else:
                # Both worked: build the bold tag plus the following tokens
                # on a temporary stack, then wrap it all in italics.
                self._push()
                self._emit_style_tag("b", "'''", stack)
                self._emit_all(stack2)
                self._emit_style_tag("i", "''", self._pop())
1088

1089
    def _parse_style(self):
        """Parse wiki-style formatting (``''``/``'''`` for italics/bold).

        Counts the run of ticks at the head and either closes the style that
        is currently open (returning the popped stack), opens a new style, or
        emits the ticks as text. Returns ``None`` unless a stack is popped.
        """
        self._head += 2
        ticks = 2
        # Count any further ticks in the same run:
        while self._read() == "'":
            self._head += 1
            ticks += 1
        italics = self._context & contexts.STYLE_ITALICS
        bold = self._context & contexts.STYLE_BOLD

        # Only runs of 2, 3 or 5 ticks are markup; surplus ticks at the
        # front are plain text.
        if ticks > 5:
            self._emit_text("'" * (ticks - 5))
            ticks = 5
        elif ticks == 4:
            self._emit_text("'")
            ticks = 3

        if (italics and ticks in (2, 5)) or (bold and ticks in (3, 5)):
            # These ticks close the style we are currently inside.
            if ticks == 5:
                # Only part of the run closes this style; rewind so the
                # remaining ticks are read again afterwards.
                self._head -= 3 if italics else 2
            return self._pop()
        if not self._can_recurse():
            # Cannot open a new style here: the ticks become text. The
            # three-tick cases mirror the failure handling in _parse_bold().
            if ticks == 3:
                if self._context & contexts.STYLE_SECOND_PASS:
                    self._emit_text("'")
                    return self._pop()
                if self._context & contexts.STYLE_ITALICS:
                    self._context |= contexts.STYLE_PASS_AGAIN
            self._emit_text("'" * ticks)
        elif ticks == 2:
            self._parse_italics()
        elif ticks == 3:
            if self._parse_bold():
                return self._pop()
        else:  # ticks == 5
            self._parse_italics_and_bold()
        # NOTE(review): presumably offsets the head increment the caller
        # performs after this method returns -- confirm against _parse().
        self._head -= 1
1126

1127
    def _handle_list_marker(self):
        """Handle a list marker at the head (``#``, ``*``, ``;``, ``:``).

        Emits a self-closing tag for the marker; a ``;`` additionally sets
        the ``DL_TERM`` flag on the current context.
        """
        marker = self._read()
        assert marker is not END
        if marker == ";":
            self._context |= contexts.DL_TERM
        opening = tokens.TagOpenOpen(wiki_markup=marker)
        self._emit(opening)
        self._emit_text(get_html_tag(marker))
        self._emit(tokens.TagCloseSelfclose())
1136

1137
    def _handle_list(self):
        """Handle a wiki-style list (``#``, ``*``, ``;``, ``:``).

        Handles the marker at the head and then every list marker that
        follows it directly.
        """
        list_markers = ("#", "*", ";", ":")
        self._handle_list_marker()
        while True:
            if self._read(1) not in list_markers:
                break
            self._head += 1
            self._handle_list_marker()
1143

1144
    def _handle_hr(self):
        """Handle a wiki-style horizontal rule (``----``) in the string.

        Any additional dashes directly after the first four are consumed and
        recorded in the tag's wiki markup.
        """
        self._head += 3
        dashes = "----"
        while self._read(1) == "-":
            self._head += 1
            dashes += "-"
        self._emit(tokens.TagOpenOpen(wiki_markup=dashes))
        self._emit_text("hr")
        self._emit(tokens.TagCloseSelfclose())
1154

1155
    def _handle_dl_term(self):
        """Handle the term in a description list (``foo`` in ``;foo:bar``).

        Toggles the ``DL_TERM`` flag, then treats a ``:`` at the head as a
        list marker and anything else as a newline to emit as text.
        """
        self._context ^= contexts.DL_TERM
        at_definition = self._read() == ":"
        if at_definition:
            self._handle_list_marker()
            return
        self._emit_text("\n")
1162

1163
    def _emit_table_tag(
        self,
        open_open_markup,
        tag,
        style,
        padding,
        close_open_markup,
        contents,
        open_close_markup,
    ):
        """Emit a table tag.

        *style* and *contents* are token lists that are skipped when empty;
        the ``*_markup`` arguments are recorded as wiki markup on the matching
        tokens, with *close_open_markup* left off when it is falsy.
        """
        self._emit(tokens.TagOpenOpen(wiki_markup=open_open_markup))
        self._emit_text(tag)
        if style:
            self._emit_all(style)

        # Only pass wiki_markup to TagCloseOpen when there is some:
        close_open_kwargs = {}
        if close_open_markup:
            close_open_kwargs["wiki_markup"] = close_open_markup
        close_open_kwargs["padding"] = padding
        self._emit(tokens.TagCloseOpen(**close_open_kwargs))

        if contents:
            self._emit_all(contents)
        self._emit(tokens.TagOpenClose(wiki_markup=open_close_markup))
        self._emit_text(tag)
        self._emit(tokens.TagCloseClose())
1189

1190
    def _handle_table_style(self, end_token: str):
        """Handle style attributes for a table until ``end_token``.

        The attributes are read with the same state machine used for HTML
        open tags (``_handle_tag_data()``), starting in the state where an
        attribute may begin. Returns the padding collected in front of
        ``end_token``. Reaching the end of the input first fails the route.
        """
        data = _TagOpenData()
        data.context = _TagOpenData.CX_ATTR_READY
        while True:
            this = self._read()
            # We may stop anywhere except inside a quoted value whose closing
            # quote has not been seen yet:
            can_exit = (
                not data.context & data.CX_QUOTED or data.context & data.CX_NOTE_SPACE
            )
            if this == end_token and can_exit:
                assert this is not END
                # Flush an attribute that was still being read:
                if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
                    self._push_tag_buffer(data)
                # A whitespace end token counts as padding itself:
                if this.isspace():
                    data.padding_buffer["first"] += this
                return data.padding_buffer["first"]
            if this is END or this == end_token:
                # Either the input ended, or the end token showed up inside
                # an open quote.
                if self._context & contexts.TAG_ATTR:
                    if data.context & data.CX_QUOTED:
                        # Unclosed attribute quote: reset, don't die
                        data.context = data.CX_ATTR_VALUE
                        self._memoize_bad_route()
                        self._pop()
                        # Rescan from the quote; "continue" skips the
                        # increment at the bottom of the loop.
                        self._head = data.reset
                        continue
                    # Drop the unfinished attribute's stack before failing.
                    self._pop()
                self._fail_route()
            else:
                self._handle_tag_data(data, this)
            self._head += 1
1220

1221
    def _parse_table(self):
        """Parse a wikicode table by starting with the first line.

        The first line is read as style attributes and the rest as the table
        contents. If either part fails, the head is restored and ``"{"`` is
        emitted as plain text.
        """
        start = self._head
        self._head += 2
        try:
            self._push(contexts.TABLE_OPEN)
            padding = self._handle_table_style("\n")
        except BadRoute:
            self._head = start
            self._emit_text("{")
            return
        style = self._pop()

        self._head += 1
        stack_marker = self._stack_ident
        try:
            table = self._parse(contexts.TABLE_OPEN)
        except BadRoute:
            # Unwind any stacks left above ours, remembering each one as a
            # bad route:
            while self._stack_ident != stack_marker:
                self._memoize_bad_route()
                self._pop()
            self._head = start
            self._emit_text("{")
            return

        self._emit_table_tag("{|", "table", style, padding, None, table, "|}")
        # Offset displacement done by _parse():
        self._head -= 1
1249

1250
    def _handle_table_row(self):
        """Parse as style until end of the line, then continue.

        When recursion is no longer allowed, ``"|-"`` is emitted as plain
        text instead of a row tag.
        """
        self._head += 2
        if self._can_recurse():
            row_context = contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN
            self._push(row_context)
            padding = self._handle_table_style("\n")
            style = self._pop()

            # Don't parse the style separator:
            self._head += 1
            row = self._parse(row_context)

            self._emit_table_tag("|-", "tr", style, padding, None, row, "")
        else:
            self._emit_text("|-")
        # Step back one position on both paths (after a full parse this
        # offsets the displacement done by _parse()):
        self._head -= 1
1269

1270
    def _handle_table_cell(self, markup, tag, line_context):
        """Parse as normal syntax unless we hit a style marker, then parse
        style as HTML attributes and the remainder as normal syntax.

        *markup* is the wiki markup that opened the cell, *tag* the tag name
        to emit, and *line_context* the extra context flag(s) to parse the
        cell with.
        """
        old_context = self._context
        padding, style = "", None
        self._head += len(markup)
        reset = self._head
        if not self._can_recurse():
            # Cannot open a cell here: keep the markup as text.
            self._emit_text(markup)
            self._head -= 1
            return

        # First pass: parse with TABLE_CELL_STYLE set.
        cell = self._parse(
            contexts.TABLE_OPEN
            | contexts.TABLE_CELL_OPEN
            | line_context
            | contexts.TABLE_CELL_STYLE
        )
        # NOTE(review): presumably self._context still holds the cell's
        # context here because the cell's stack was popped with
        # keep_context=True (see _handle_table_cell_end()) -- confirm.
        cell_context = self._context
        self._context = old_context
        reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
        if reset_for_style:
            # The cell asked for a style re-parse: go back to its start, read
            # the attributes up to the "|" separator, then parse the
            # contents again without the style flag.
            self._head = reset
            self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context)
            padding = self._handle_table_style("|")
            style = self._pop()
            # Don't parse the style separator:
            self._head += 1
            cell = self._parse(
                contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context
            )
            cell_context = self._context
            self._context = old_context

        close_open_markup = "|" if reset_for_style else None
        self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "")
        # Keep header/cell line contexts:
        self._context |= cell_context & (
            contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE
        )
        # Offset displacement done by parse():
        self._head -= 1
1312

1313
    def _handle_table_cell_end(self, reset_for_style=False):
        """Finish a table cell and pop its stack, keeping the context.

        The ``TABLE_CELL_STYLE`` flag is set on the current context when
        *reset_for_style* is true (meaning the cell must be reset and parsed
        for style attributes) and cleared otherwise. Returns the result of
        ``_pop(keep_context=True)``.
        """
        style_flag = contexts.TABLE_CELL_STYLE
        if not reset_for_style:
            self._context &= ~style_flag
        else:
            self._context |= style_flag
        return self._pop(keep_context=True)
1321

1322
    def _handle_table_row_end(self):
        """Pop and return the current stack to end the table row."""
        row = self._pop()
        return row
1325

1326
    def _handle_table_end(self):
        """Advance the head by two, then pop and return the table's stack."""
        self._head += 2
        table = self._pop()
        return table
1330

1331
    def _handle_end(self):
        """Handle the end of the stream of wikitext.

        When the current context is not one of the ``FAIL`` contexts, the
        stack is simply popped and returned. Otherwise an unclosed
        single-supporting tag is closed implicitly; in any other case the
        extra stacks are dropped and the route fails.
        """
        if not self._context & contexts.FAIL:
            return self._pop()
        if self._context & contexts.TAG_BODY and is_single(self._stack[1].text):
            return self._handle_single_tag_end()
        # The context is read again after each pop, since popping changes
        # which stack is current.
        if self._context & contexts.TABLE_CELL_OPEN:
            self._pop()
        if self._context & contexts.DOUBLE:
            self._pop()
        self._fail_route()
        return self._pop()
1343

1344
    def _verify_safe(self, this):
        """Make sure we are not trying to write an invalid character.

        Returns ``False`` when *this* is not allowed in the current context
        and ``True`` otherwise. As a side effect, various ``FAIL_*`` and
        ``HAS_*`` flags may be set or cleared on the current context so that
        a later call can reject the character that follows.
        """
        context = self._context
        if context & contexts.FAIL_NEXT:
            # An earlier call flagged that this character must fail.
            return False
        if context & contexts.WIKILINK_TITLE:
            if this in ("]", "{"):
                # Allowed for now, but the next character fails.
                self._context |= contexts.FAIL_NEXT
            elif this in ("\n", "[", "}", ">"):
                return False
            elif this == "<":
                if self._read(1) == "!":
                    # Might be a comment; the comment parser unsets the flag
                    # again if it really is one.
                    self._context |= contexts.FAIL_NEXT
                else:
                    return False
            return True
        if context & contexts.EXT_LINK_TITLE:
            return this != "\n"
        if context & contexts.TEMPLATE_NAME:
            if this == "{":
                self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT
                return True
            if this == "}" or (this == "<" and self._read(1) == "!"):
                self._context |= contexts.FAIL_NEXT
                return True
            if this in ("[", "]", "<", ">"):
                return False
            if this == "|":
                return True
            if context & contexts.HAS_TEXT:
                # Once the name has text, a newline arms FAIL_ON_TEXT, after
                # which only whitespace may follow.
                if context & contexts.FAIL_ON_TEXT:
                    if this is END or not this.isspace():
                        return False
                elif this == "\n":
                    self._context |= contexts.FAIL_ON_TEXT
            elif this is END or not this.isspace():
                self._context |= contexts.HAS_TEXT
            return True
        if context & contexts.TAG_CLOSE:
            return this != "<"
        # Remaining contexts: track braces across consecutive calls.
        if context & contexts.FAIL_ON_EQUALS:
            if this == "=":
                return False
        elif context & contexts.FAIL_ON_LBRACE:
            # The previous character was "{".
            if this == "{" or (self._read(-1) == self._read(-2) == "{"):
                if context & contexts.TEMPLATE:
                    self._context |= contexts.FAIL_ON_EQUALS
                else:
                    self._context |= contexts.FAIL_NEXT
                return True
            self._context ^= contexts.FAIL_ON_LBRACE
        elif context & contexts.FAIL_ON_RBRACE:
            # The previous character was "}".
            if this == "}":
                self._context |= contexts.FAIL_NEXT
                return True
            self._context ^= contexts.FAIL_ON_RBRACE
        elif this == "{":
            self._context |= contexts.FAIL_ON_LBRACE
        elif this == "}":
            self._context |= contexts.FAIL_ON_RBRACE
        return True
1405

1406
    def _parse(self, context=0, push=True):
10✔
1407
        """Parse the wikicode string, using *context* for when to stop."""
1408
        if push:
10✔
1409
            self._push(context)
10✔
1410
        while True:
8✔
1411
            this = self._read()
10✔
1412
            if self._context & contexts.UNSAFE:
10✔
1413
                if not self._verify_safe(this):
10✔
1414
                    if self._context & contexts.DOUBLE:
10✔
1415
                        self._pop()
10✔
1416
                    self._fail_route()
10✔
1417
            if this not in self.MARKERS:
10✔
1418
                self._emit_text(this)
10✔
1419
                self._head += 1
10✔
1420
                continue
10✔
1421
            if this is END:
10✔
1422
                return self._handle_end()
10✔
1423
            nxt = self._read(1)
10✔
1424
            if this == nxt == "{":
10✔
1425
                if self._can_recurse():
10✔
1426
                    self._parse_template_or_argument()
10✔
1427
                else:
1428
                    self._emit_text("{")
10✔
1429
            elif this == "|" and self._context & contexts.TEMPLATE:
10✔
1430
                self._handle_template_param()
10✔
1431
            elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
10✔
1432
                if (
10✔
1433
                    not self._global & contexts.GL_HEADING
1434
                    and self._read(-1) in ("\n", START)
1435
                    and nxt == "="
1436
                ):
1437
                    self._parse_heading()
10✔
1438
                else:
1439
                    self._handle_template_param_value()
10✔
1440
            elif this == nxt == "}" and self._context & contexts.TEMPLATE:
10✔
1441
                return self._handle_template_end()
10✔
1442
            elif this == "|" and self._context & contexts.ARGUMENT_NAME:
10✔
1443
                self._handle_argument_separator()
10✔
1444
            elif this == nxt == "}" and self._context & contexts.ARGUMENT:
10✔
1445
                if self._read(2) == "}":
10✔
1446
                    return self._handle_argument_end()
10✔
1447
                self._emit_text("}")
10✔
1448
            elif this == nxt == "[" and self._can_recurse():
10✔
1449
                # TODO: Only do this if not in a file context:
1450
                # if self._context & contexts.WIKILINK_TEXT:
1451
                #     self._fail_route()
1452
                if not self._context & contexts.NO_WIKILINKS:
10✔
1453
                    self._parse_wikilink()
10✔
1454
                else:
1455
                    self._emit_text("[")
×
1456
            elif this == "|" and self._context & contexts.WIKILINK_TITLE:
10✔
1457
                self._handle_wikilink_separator()
10✔
1458
            elif this == nxt == "]" and self._context & contexts.WIKILINK:
10✔
1459
                return self._handle_wikilink_end()
10✔
1460
            elif this == "[":
10✔
1461
                self._parse_external_link(True)
10✔
1462
            elif this == ":" and self._read(-1) not in self.MARKERS:
10✔
1463
                self._parse_external_link(False)
10✔
1464
            elif this == "]" and self._context & contexts.EXT_LINK_TITLE:
10✔
1465
                return self._pop()
10✔
1466
            elif (
10✔
1467
                this == "="
1468
                and not self._global & contexts.GL_HEADING
1469
                and not self._context & contexts.TEMPLATE
1470
            ):
1471
                if self._read(-1) in ("\n", START):
10✔
1472
                    self._parse_heading()
10✔
1473
                else:
1474
                    self._emit_text("=")
10✔
1475
            elif this == "=" and self._context & contexts.HEADING:
10✔
1476
                return self._handle_heading_end()
10✔
1477
            elif this == "\n" and self._context & contexts.HEADING:
10✔
1478
                self._fail_route()
10✔
1479
            elif this == "&":
10✔
1480
                self._parse_entity()
10✔
1481
            elif this == "<" and nxt == "!":
10✔
1482
                if self._read(2) == self._read(3) == "-":
10✔
1483
                    self._parse_comment()
10✔
1484
                else:
1485
                    self._emit_text(this)
10✔
1486
            elif this == "<" and nxt == "/" and self._read(2) is not END:
10✔
1487
                if self._context & contexts.TAG_BODY:
10✔
1488
                    self._handle_tag_open_close()
10✔
1489
                else:
1490
                    self._handle_invalid_tag_start()
10✔
1491
            elif this == "<" and not self._context & contexts.TAG_CLOSE:
10✔
1492
                if self._can_recurse():
10✔
1493
                    self._parse_tag()
10✔
1494
                else:
1495
                    self._emit_text("<")
×
1496
            elif this == ">" and self._context & contexts.TAG_CLOSE:
10✔
1497
                return self._handle_tag_close_close()
10✔
1498
            elif this == nxt == "'" and not self._skip_style_tags:
10✔
1499
                result = self._parse_style()
10✔
1500
                if result is not None:
10✔
1501
                    return result
10✔
1502
            elif self._read(-1) in ("\n", START) and this in ("#", "*", ";", ":"):
10✔
1503
                self._handle_list()
10✔
1504
            elif self._read(-1) in ("\n", START) and (
10✔
1505
                this == nxt == self._read(2) == self._read(3) == "-"
1506
            ):
1507
                self._handle_hr()
10✔
1508
            elif this in ("\n", ":") and self._context & contexts.DL_TERM:
10✔
1509
                self._handle_dl_term()
10✔
1510
                if this == "\n":
10✔
1511
                    # Kill potential table contexts
1512
                    self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
10✔
1513
            # Start of table parsing
1514
            elif (
10✔
1515
                this == "{"
1516
                and nxt == "|"
1517
                and (
1518
                    self._read(-1) in ("\n", START)
1519
                    or (
1520
                        self._read(-2) in ("\n", START)
1521
                        and cast(str, self._read(-1)).isspace()
1522
                    )
1523
                )
1524
            ):
1525
                if self._can_recurse():
10✔
1526
                    self._parse_table()
10✔
1527
                else:
1528
                    self._emit_text("{")
10✔
1529
            elif self._context & contexts.TABLE_OPEN:
10✔
1530
                if this == nxt == "|" and self._context & contexts.TABLE_TD_LINE:
10✔
1531
                    if self._context & contexts.TABLE_CELL_OPEN:
10✔
1532
                        return self._handle_table_cell_end()
10✔
1533
                    self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE)
10✔
1534
                elif this == nxt == "|" and self._context & contexts.TABLE_TH_LINE:
10✔
1535
                    if self._context & contexts.TABLE_CELL_OPEN:
10✔
1536
                        return self._handle_table_cell_end()
10✔
1537
                    self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE)
10✔
1538
                elif this == nxt == "!" and self._context & contexts.TABLE_TH_LINE:
10✔
1539
                    if self._context & contexts.TABLE_CELL_OPEN:
10✔
1540
                        return self._handle_table_cell_end()
10✔
1541
                    self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE)
10✔
1542
                elif this == "|" and self._context & contexts.TABLE_CELL_STYLE:
10✔
1543
                    return self._handle_table_cell_end(reset_for_style=True)
10✔
1544
                # on newline, clear out cell line contexts
1545
                elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS:
10✔
1546
                    self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
10✔
1547
                    self._emit_text(this)
10✔
1548
                elif self._read(-1) in ("\n", START) or (
10✔
1549
                    self._read(-2) in ("\n", START)
1550
                    and cast(str, self._read(-1)).isspace()
1551
                ):
1552
                    if this == "|" and nxt == "}":
10✔
1553
                        if self._context & contexts.TABLE_CELL_OPEN:
10✔
1554
                            return self._handle_table_cell_end()
10✔
1555
                        if self._context & contexts.TABLE_ROW_OPEN:
10✔
1556
                            return self._handle_table_row_end()
10✔
1557
                        return self._handle_table_end()
10✔
1558
                    if this == "|" and nxt == "-":
10✔
1559
                        if self._context & contexts.TABLE_CELL_OPEN:
10✔
1560
                            return self._handle_table_cell_end()
10✔
1561
                        if self._context & contexts.TABLE_ROW_OPEN:
10✔
1562
                            return self._handle_table_row_end()
10✔
1563
                        self._handle_table_row()
10✔
1564
                    elif this == "|":
10✔
1565
                        if self._context & contexts.TABLE_CELL_OPEN:
10✔
1566
                            return self._handle_table_cell_end()
10✔
1567
                        self._handle_table_cell("|", "td", contexts.TABLE_TD_LINE)
10✔
1568
                    elif this == "!":
10✔
1569
                        if self._context & contexts.TABLE_CELL_OPEN:
10✔
1570
                            return self._handle_table_cell_end()
10✔
1571
                        self._handle_table_cell("!", "th", contexts.TABLE_TH_LINE)
10✔
1572
                    else:
1573
                        self._emit_text(this)
10✔
1574
                else:
1575
                    self._emit_text(this)
10✔
1576

1577
            else:
1578
                self._emit_text(this)
10✔
1579
            self._head += 1
10✔
1580

1581
    def tokenize(self, text: str, context=0, skip_style_tags=False):
        """Tokenize the wikicode string *text* and return the parser's result.

        *context* is handed to the parser as its starting context, and
        *skip_style_tags* is stored on the instance, where the parser checks
        it before treating ``''`` as style markup.

        Raises :exc:`ParserError` if a ``BadRoute`` escapes the parser or if
        the token stack is not empty once parsing has finished.
        """
        # Split the input with the tokenizer regex and discard empty segments.
        self._text = list(filter(None, self.regex.split(text)))

        # Reset the per-run state before parsing.
        self._head = 0
        self._global = 0
        self._depth = 0
        self._bad_routes = set()
        self._skip_style_tags = skip_style_tags

        try:
            tokens = self._parse(context)
        except BadRoute as exc:  # pragma: no cover (untestable/exceptional case)
            raise ParserError("Python tokenizer exited with BadRoute") from exc
        if self._stacks:  # pragma: no cover (untestable/exceptional case)
            raise ParserError("Python tokenizer exited with non-empty token stack")
        return tokens
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc