• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MyIntervals / emogrifier / 25810750037

13 May 2026 03:59PM UTC coverage: 98.246% (+0.6%) from 97.669%
25810750037

Pull #1620

github

web-flow
Merge ac6ce8566 into 635eb2818
Pull Request #1620: [CLEANUP] Rely on PHP to detect access to uninitialized properties

8 of 8 new or added lines in 4 files covered. (100.0%)

5 existing lines in 1 file now uncovered.

840 of 855 relevant lines covered (98.25%)

259.61 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.28
/src/HtmlProcessor/AbstractHtmlProcessor.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace Pelago\Emogrifier\HtmlProcessor;
6

7
use function Safe\preg_match;
8
use function Safe\preg_replace;
9

10
/**
11
 * Base class for HTML processors that can, e.g., remove, add or modify nodes or attributes.
12
 *
13
 * The "vanilla" subclass is the HtmlNormalizer.
14
 */
15
abstract class AbstractHtmlProcessor
16
{
17
    /**
     * Document type declaration that `ensureDocumentType()` prepends to HTML containing no `<!DOCTYPE`.
     */
    protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';
18
    /**
     * Meta element that `addContentTypeMetaTag()` inserts when no `Content-Type` meta element is found in the head.
     */
    protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';
19

20
    /**
21
     * Regular expression part to match tag names that PHP's DOMDocument implementation is not
22
     * aware are self-closing. These are mostly HTML5 elements, but for completeness `<command>` (obsolete) and
23
     * `<keygen>` (deprecated) are also included.
24
     *
     * The value is an undelimited non-capturing group, so it has to be embedded in a complete pattern.
     *
25
     * @see https://bugs.php.net/bug.php?id=73175
26
     */
27
    protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';
28

29
    /**
30
     * Regular expression part to match tag names that may appear before the start of the `<body>` element.  A start tag
31
     * for any other element would implicitly start the `<body>` element due to tag omission rules.
     *
     * The value is an undelimited non-capturing group, so it has to be embedded in a complete pattern.
32
     */
33
    protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
34
        = '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';
35

36
    /**
37
     * Regular expression pattern to match an HTML comment, including delimiters and modifiers.
     *
     * A comment that is never closed with `-->` is matched through to the end of the subject.
38
     */
39
    protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';
40

41
    /**
42
     * Regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers.
     *
     * Matching is case-insensitive, and an element without a `</template>` end tag is matched through to the end of
     * the subject.
43
     */
44
    protected const HTML_TEMPLATE_ELEMENT_PATTERN
45
        = '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';
46

47
    /**
     * The DOM document this processor works on; assigned in `setDomDocument()`.
     *
48
     * @var \DOMDocument
49
     */
50
    private $domDocument;
51

52
    /**
     * XPath object created for `$domDocument`; assigned together with it in `setDomDocument()`.
     *
53
     * @var \DOMXPath
54
     */
55
    protected $xPath;
56

57
    /**
58
     * The constructor.
59
     *
     * It is private and empty, so instances can only be created through the static factory methods, which also assign
     * the DOM document.
     *
60
     * Please use `::fromHtml` or `::fromDomDocument` instead.
61
     */
62
    final private function __construct() {}
63

64
    /**
     * Creates a new instance that processes the given HTML string.
     *
     * @param non-empty-string $unprocessedHtml raw HTML, must be UTF-8-encoded
     *
     * @return static
     *
     * @throws \InvalidArgumentException if $unprocessedHtml is the empty string
     */
    public static function fromHtml(string $unprocessedHtml): self
    {
        if ('' === $unprocessedHtml) {
            throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);
        }

        $processor = new static();
        $processor->setHtml($unprocessedHtml);
        // Parsing the HTML must have left the instance with a usable XPath object.
        \assert($processor->xPath instanceof \DOMXPath);

        return $processor;
    }
85

86
    /**
     * Creates a new instance that operates directly on an existing DOM document.
     *
     * @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
     *
     * @return static
     */
    public static function fromDomDocument(\DOMDocument $document): self
    {
        $processor = new static();
        $processor->setDomDocument($document);
        // Storing the document also creates the XPath object for it.
        \assert($processor->xPath instanceof \DOMXPath);

        return $processor;
    }
101

102
    /**
     * Parses the given HTML and makes the resulting document the one this instance works on.
     *
     * @param string $html the HTML to process, must be UTF-8-encoded
     */
    private function setHtml(string $html): void
    {
        // Parsing and normalization are delegated entirely; the document is stored as part of that call.
        $this->createUnifiedDomDocument($html);
    }
111

112
    /**
     * Provides access to the internal DOMDocument representation of the HTML in its current state.
     *
     * The document is assigned by `setDomDocument()`, which both factory methods reach before they return an instance.
     * This method performs no initialization check of its own and does not throw any exception itself.
     *
     * @return \DOMDocument the document this instance works on (the very object, not a copy)
     */
    public function getDomDocument(): \DOMDocument
    {
        return $this->domDocument;
    }
121

122
    /**
     * Stores the given DOM document and creates the XPath object used to query it.
     *
     * @param \DOMDocument $domDocument the document this instance shall work on from now on
     */
    private function setDomDocument(\DOMDocument $domDocument): void
    {
        $this->domDocument = $domDocument;
        // The XPath object is always created for exactly the document that has just been stored.
        $this->xPath = new \DOMXPath($domDocument);
    }
127

128
    /**
     * Serializes the whole document, in its current normalized and processed state, to an HTML string.
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    public function render(): string
    {
        // Without a node argument, the complete document is serialized.
        $completeHtml = $this->getHtml();

        return $completeHtml;
    }
137

138
    /**
     * Serializes only what is inside the BODY element of the normalized and processed HTML.
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    public function renderBodyContent(): string
    {
        $bodyElement = $this->getBodyElement();
        $serializedBody = $this->getHtml($bodyElement);

        // The serialization contains the element's own start and end tags; strip them to keep the content only.
        return preg_replace('%</?+body(?:\\s[^>]*+)?+>%', '', $serializedBody);
    }
149

150
    /**
     * Serializes the document, or a single node of it, and removes end tags of void elements from the result.
     *
     * @param ?\DOMNode $node optional parameter to output a subset of the document
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    private function getHtml(?\DOMNode $node = null): string
    {
        $serialized = $this->getDomDocument()->saveHTML($node);
        if (\is_string($serialized)) {
            return $this->removeSelfClosingTagsClosingTags($serialized);
        }

        // Anything other than a string means that serialization did not succeed.
        throw new \RuntimeException('`DOMDocument::saveHTML()` failed.', 1773018082);
    }
164

165
    /**
     * Strips end tags of the void elements listed in `PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER` from the given HTML.
     * Such end tags are invalid, as void elements have none.
     */
    private function removeSelfClosingTagsClosingTags(string $html): string
    {
        $endTagPattern = '%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%';

        return preg_replace($endTagPattern, '', $html);
    }
172

173
    /**
     * Returns the first `html` element of the document.
     *
     * The document is expected to always contain one; its absence is reported with an exception.
     *
     * @throws \UnexpectedValueException if the document has no `html` element
     */
    protected function getHtmlElement(): \DOMElement
    {
        $firstHtmlNode = $this->getDomDocument()->getElementsByTagName('html')->item(0);
        if ($firstHtmlNode instanceof \DOMElement) {
            return $firstHtmlNode;
        }

        throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);
    }
189

190
    /**
     * Returns the first `body` element of the document.
     *
     * The document is expected to always contain one; its absence is reported with an exception.
     *
     * @throws \RuntimeException if the document has no `body` element
     */
    private function getBodyElement(): \DOMElement
    {
        $firstBodyNode = $this->getDomDocument()->getElementsByTagName('body')->item(0);
        if ($firstBodyNode instanceof \DOMElement) {
            return $firstBodyNode;
        }

        throw new \RuntimeException('There is no body element.', 1617922607);
    }
206

207
    /**
     * Builds the DOM document for the given HTML and stores it in `$this->domDocument`.
     *
     * The DOM document will always have a BODY element and a document type.
     */
    private function createUnifiedDomDocument(string $html): void
    {
        // First parse the HTML into a document and store it ...
        $this->createRawDomDocument($html);
        // ... then add a BODY element to the stored document if parsing did not yield one.
        $this->ensureExistenceOfBodyElement();
    }
217

218
    /**
     * Creates a DOMDocument instance from the given HTML and stores it in `$this->domDocument`.
     *
     * Parse errors reported by libxml are collected internally and discarded instead of being emitted as warnings.
     * The libxml error-handling mode that was active before the call is restored afterwards, even if loading the HTML
     * throws.
     *
     * @param string $html the HTML to parse
     */
    private function createRawDomDocument(string $html): void
    {
        // Prepare the markup before changing libxml's global error handling, so that a failure during preparation
        // cannot leave that global setting modified.
        $preparedHtml = $this->prepareHtmlForDomConversion($html);

        $domDocument = new \DOMDocument();
        $domDocument->strictErrorChecking = false;
        $domDocument->formatOutput = false;

        $libXmlState = \libxml_use_internal_errors(true);
        try {
            $domDocument->loadHTML($preparedHtml, LIBXML_PARSEHUGE);
        } finally {
            // Drop the collected parse errors and put the previous error-handling mode back in place.
            \libxml_clear_errors();
            \libxml_use_internal_errors($libXmlState);
        }

        $this->setDomDocument($domDocument);
    }
233

234
    /**
     * Makes the given HTML suitable for creating a DOM document from it.
     *
     * Three steps are applied in this order: self-closing slashes are added to void elements that need them, a
     * document type is ensured, and a Content-Type meta tag is added if needed.
     */
    private function prepareHtmlForDomConversion(string $html): string
    {
        return $this->addContentTypeMetaTag(
            $this->ensureDocumentType(
                $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html),
            ),
        );
    }
245

246
    /**
     * Guarantees that the returned HTML has a document type, with lowercase "html".
     *
     * HTML containing `<!DOCTYPE` (in any letter case) has its document type normalized; any other HTML gets the
     * default document type prepended.
     *
     * @return non-empty-string HTML with document type
     */
    private function ensureDocumentType(string $html): string
    {
        if (\stripos($html, '<!DOCTYPE') === false) {
            return self::DEFAULT_DOCUMENT_TYPE . $html;
        }

        return $this->normalizeDocumentType($html);
    }
260

261
    /**
     * Rewrites the document type in the passed HTML so that it reads `<!DOCTYPE html` with lowercase `html`.
     *
     * @param non-empty-string $html
     *
     * @return non-empty-string HTML with normalized document type
     */
    private function normalizeDocumentType(string $html): string
    {
        $pattern = '/<!DOCTYPE\\s++html(?=[\\s>])/i';
        $replacement = '<!DOCTYPE html';

        // Only the first occurrence is replaced, both to save work and because a later occurrence might merely be an
        // example appearing as unescaped text.
        $normalizedHtml = preg_replace($pattern, $replacement, $html, 1);
        \assert($normalizedHtml !== '');

        return $normalizedHtml;
    }
281

282
    /**
     * Inserts a Content-Type meta tag declaring the charset, unless the head already has one.
     *
     * The tag is placed right after the `<head>` start tag if there is one.  Otherwise, if there is an `<html>` start
     * tag, a complete `<head>` element containing the tag is placed right after it.  If there is neither, the tag is
     * prepended to the HTML.
     *
     * @param non-empty-string $html
     *
     * @return non-empty-string
     */
    private function addContentTypeMetaTag(string $html): string
    {
        if ($this->hasContentTypeMetaTagInHead($html)) {
            return $html;
        }

        // The meta tag has to go to the right spot in the markup: simply prepending it in all cases would lose the
        // attributes set on the HTML tag.
        $metaTag = self::CONTENT_TYPE_META_TAG;

        if (preg_match('/<head[\\s>]/i', $html) !== 0) {
            $htmlWithMetaTag = preg_replace('/<head(?=[\\s>])([^>]*+)>/i', '<head$1>' . $metaTag, $html);
        } elseif (\stripos($html, '<html') !== false) {
            $htmlWithMetaTag = preg_replace('/<html(.*?)>/is', '<html$1><head>' . $metaTag . '</head>', $html);
        } else {
            $htmlWithMetaTag = $metaTag . $html;
        }
        \assert($htmlWithMetaTag !== '');

        return $htmlWithMetaTag;
    }
321

322
    /**
     * Tells whether the given HTML contains a `Content-Type` meta element that is still inside the `<head>` element.
     *
     * The HTML preceding the first such meta element is captured and checked for the end of the `<head>` element.
     * Due to tag omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element
     * upon encountering a start tag for any element which is permitted only within the `<body>`, so a meta element
     * found after that point does not count.
     */
    private function hasContentTypeMetaTagInHead(string $html): bool
    {
        preg_match(
            '%
                (?(DEFINE)
                    # the target `http-equiv` attribute match
                    (?<target_attribute>
                        http-equiv=(["\']?+)Content-Type\\g{-1}
                        # must be followed by one of these characters
                        [\\s/>]
                    )
                    # the target `meta` element match without the opening `<`
                    (?<target>
                        meta(?=\\s)
                        # one or other of these
                        (?:
                            # one or more characters other than `>` or space
                            [^>\\s]++
                            |
                            # space not followed by the target `http-equiv` attribute
                            \\s(?!(?&target_attribute))
                        )
                        # any number of times (including zero)
                        *+
                        \\s(?&target_attribute)
                    )
                )
                # start of `subject`
                ^
                # one or other of these
                (?:
                    # one or more characters other than `<`
                    [^<]++
                    |
                    # `<` not followed by `target`
                    <(?!(?&target))
                )
                # any number of times (including zero)
                *+
                # followed by the target, not captured
                (?=<(?&target))
            %isx',
            $html,
            $matches,
        );

        // No match means there is no `Content-Type` meta element at all.
        if (!isset($matches[0])) {
            return false;
        }

        // The whole match is the HTML that precedes the meta element.
        $htmlBeforeMetaTag = $matches[0];
        try {
            return !$this->hasEndOfHeadElement($htmlBeforeMetaTag);
        } catch (\RuntimeException $exception) {
            // If something unexpected occurs, report it and assume the `Content-Type` that was found is valid.
            \trigger_error($exception->getMessage());

            return true;
        }
    }
387

388
    /**
     * Tells whether the `<head>` element ends within the given HTML, either through an explicit `</head>` or
     * implicitly.  Due to tag omission rules, HTML parsers are expected to end the `<head>` element and start the
     * `<body>` element upon encountering a start tag for any element which is permitted only within the `<body>`.
     *
     * @throws \RuntimeException
     */
    private function hasEndOfHeadElement(string $html): bool
    {
        $endOfHeadPattern = '%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head>%i';
        if (preg_match($endOfHeadPattern, $html) === 0) {
            return false;
        }

        // Content within comments or `<template>` elements does not end the `<head>`.  As an optimization, these are
        // only stripped once a potential end of the `<head>` has been found.
        $strippedHtml = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));
        if ($strippedHtml === $html) {
            // Nothing was stripped, so the potential end that was found is a real one.
            return true;
        }

        // Something was stripped: check again on what is left.
        return $this->hasEndOfHeadElement($strippedHtml);
    }
409

410
    /**
     * Strips all comments from the given HTML.  For a comment that is not terminated, everything up to the end of the
     * string is removed.
     */
    private function removeHtmlComments(string $html): string
    {
        $htmlWithoutComments = preg_replace(self::HTML_COMMENT_PATTERN, '', $html);

        return $htmlWithoutComments;
    }
418

419
    /**
     * Strips all `<template>` elements from the given HTML.  For an element that has no end tag, everything up to the
     * end of the string is removed.
     */
    private function removeHtmlTemplateElements(string $html): string
    {
        $htmlWithoutTemplates = preg_replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);

        return $htmlWithoutTemplates;
    }
427

428
    /**
     * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
     * self-closing slash.
     *
     * Tag names are matched case-insensitively, as HTML tag names are not case-sensitive: `<WBR>` is handled the same
     * way as `<wbr>`.  Tags that already end in `/>` are left unchanged.
     *
     * @param string $html the HTML to adjust
     *
     * @return string the HTML with `/` inserted before the closing `>` of each affected tag
     */
    private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string
    {
        // The match covers the tag up to, but not including, its closing `>` (which is only looked ahead for), and
        // requires that the last character before it is not already a slash.  `$0/` then appends the slash.
        return preg_replace(
            '%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%i',
            '$0/',
            $html,
        );
    }
440

441
    /**
     * Appends a BODY element to the HTML element of `$this->domDocument` unless the document already has one.
     */
    private function ensureExistenceOfBodyElement(): void
    {
        $document = $this->getDomDocument();
        $existingBody = $document->getElementsByTagName('body')->item(0);

        if (!$existingBody instanceof \DOMElement) {
            $this->getHtmlElement()->appendChild($document->createElement('body'));
        }
    }
452
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc