• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MyIntervals / emogrifier / 22834755616

09 Mar 2026 01:34AM UTC coverage: 96.211% (-0.09%) from 96.305%
22834755616

Pull #1588

github

web-flow
Merge 4163c161f into c9e5ad70d
Pull Request #1588: [BUGFIX] Throw exception if `DOMDocument::saveHTML` fails

6 of 7 new or added lines in 1 file covered. (85.71%)

4 existing lines in 1 file now uncovered.

838 of 871 relevant lines covered (96.21%)

259.45 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.37
/src/HtmlProcessor/AbstractHtmlProcessor.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace Pelago\Emogrifier\HtmlProcessor;
6

7
use function Safe\preg_match;
8
use function Safe\preg_replace;
9

10
/**
11
 * Base class for HTML processors that can, e.g., remove, add or modify nodes or attributes.
12
 *
13
 * The "vanilla" subclass is the HtmlNormalizer.
14
 */
15
abstract class AbstractHtmlProcessor
{
    /**
     * document type that is prepended to HTML which does not declare one
     */
    protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';

    /**
     * `Content-Type` meta tag that is inserted into HTML which does not have one in its `<head>`
     */
    protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';

    /**
     * Regular expression part to match tag names that PHP's DOMDocument implementation is not
     * aware are self-closing. These are mostly HTML5 elements, but for completeness `<command>` (obsolete) and
     * `<keygen>` (deprecated) are also included.
     *
     * @see https://bugs.php.net/bug.php?id=73175
     */
    protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

    /**
     * Regular expression part to match tag names that may appear before the start of the `<body>` element.  A start tag
     * for any other element would implicitly start the `<body>` element due to tag omission rules.
     */
    protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
        = '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';

    /**
     * regular expression pattern to match an HTML comment, including delimiters and modifiers
     */
    protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';

    /**
     * regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers
     */
    protected const HTML_TEMPLATE_ELEMENT_PATTERN
        = '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';

    /**
     * the document being processed; `null` until `setDomDocument()` has been called
     *
     * @var \DOMDocument|null
     */
    protected $domDocument = null;

    /**
     * XPath helper bound to `$domDocument`; `null` until `setDomDocument()` has been called
     *
     * @var \DOMXPath|null
     */
    private $xPath = null;

    /**
     * The constructor.
     *
     * Please use `::fromHtml` or `::fromDomDocument` instead.
     */
    private function __construct() {}

    /**
     * Builds a new instance from the given HTML.
     *
     * @param non-empty-string $unprocessedHtml raw HTML, must be UTF-8-encoded
     *
     * @return static
     *
     * @throws \InvalidArgumentException if $unprocessedHtml is anything other than a non-empty string
     */
    public static function fromHtml(string $unprocessedHtml): self
    {
        // @phpstan-ignore-next-line argument.type We're checking for a contract violation here.
        if ($unprocessedHtml === '') {
            throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);
        }

        $instance = new static();
        $instance->setHtml($unprocessedHtml);

        return $instance;
    }

    /**
     * Builds a new instance from the given DOM document.
     *
     * @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
     *
     * @return static
     */
    public static function fromDomDocument(\DOMDocument $document): self
    {
        $instance = new static();
        $instance->setDomDocument($document);

        return $instance;
    }

    /**
     * Sets the HTML to process.
     *
     * @param string $html the HTML to process, must be UTF-8-encoded
     */
    private function setHtml(string $html): void
    {
        $this->createUnifiedDomDocument($html);
    }

    /**
     * Provides access to the internal DOMDocument representation of the HTML in its current state.
     *
     * @throws \UnexpectedValueException if no DOM document has been set yet
     */
    public function getDomDocument(): \DOMDocument
    {
        if (!$this->domDocument instanceof \DOMDocument) {
            $message = self::class . '::setDomDocument() has not yet been called on ' . static::class;
            throw new \UnexpectedValueException($message, 1570472239);
        }

        return $this->domDocument;
    }

    /**
     * Stores the given DOM document and creates the matching XPath helper for it.
     */
    private function setDomDocument(\DOMDocument $domDocument): void
    {
        $this->domDocument = $domDocument;
        $this->xPath = new \DOMXPath($this->domDocument);
    }

    /**
     * Provides the XPath helper that belongs to the current DOM document.
     *
     * @throws \UnexpectedValueException if no DOM document has been set yet
     */
    protected function getXPath(): \DOMXPath
    {
        if (!$this->xPath instanceof \DOMXPath) {
            $message = self::class . '::setDomDocument() has not yet been called on ' . static::class;
            throw new \UnexpectedValueException($message, 1617819086);
        }

        return $this->xPath;
    }

    /**
     * Renders the normalized and processed HTML.
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    public function render(): string
    {
        $htmlWithPossibleErroneousClosingTags = $this->getHtml();

        return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);
    }

    /**
     * Renders the content of the BODY element of the normalized and processed HTML.
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    public function renderBodyContent(): string
    {
        $htmlWithPossibleErroneousClosingTags = $this->getHtml($this->getBodyElement());
        $bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);

        // Only the content is wanted, so the `<body>` start and end tags themselves are stripped.
        return preg_replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml);
    }

    /**
     * Serializes the document, or the given node of it, to HTML.
     *
     * @param ?\DOMNode $node optional parameter to output a subset of the document
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    private function getHtml(?\DOMNode $node = null): string
    {
        $html = $this->getDomDocument()->saveHTML($node);

        if (!\is_string($html)) {
            throw new \RuntimeException('`DOMDocument::saveHTML()` failed.', 1773018082);
        }

        return $html;
    }

    /**
     * Eliminates any invalid closing tags for void elements from the given HTML.
     */
    private function removeSelfClosingTagsClosingTags(string $html): string
    {
        return preg_replace('%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%', '', $html);
    }

    /**
     * Returns the HTML element.
     *
     * This method assumes that there always is an HTML element, throwing an exception otherwise.
     *
     * @throws \UnexpectedValueException
     */
    protected function getHtmlElement(): \DOMElement
    {
        $htmlElement = $this->getDomDocument()->getElementsByTagName('html')->item(0);
        if (!$htmlElement instanceof \DOMElement) {
            throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);
        }

        return $htmlElement;
    }

    /**
     * Returns the BODY element.
     *
     * This method assumes that there always is a BODY element.
     *
     * @throws \RuntimeException
     */
    private function getBodyElement(): \DOMElement
    {
        $node = $this->getDomDocument()->getElementsByTagName('body')->item(0);
        if (!$node instanceof \DOMElement) {
            throw new \RuntimeException('There is no body element.', 1617922607);
        }

        return $node;
    }

    /**
     * Creates a DOM document from the given HTML and stores it in $this->domDocument.
     *
     * The DOM document will always have a BODY element and a document type.
     */
    private function createUnifiedDomDocument(string $html): void
    {
        $this->createRawDomDocument($html);
        $this->ensureExistenceOfBodyElement();
    }

    /**
     * Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument.
     *
     * While the HTML is parsed, libxml errors are collected internally instead of being emitted.  The collected
     * errors are discarded and the previous libxml error handling mode is restored afterwards, even if preparing or
     * loading the HTML throws.
     */
    private function createRawDomDocument(string $html): void
    {
        $domDocument = new \DOMDocument();
        $domDocument->strictErrorChecking = false;
        $domDocument->formatOutput = false;

        $libXmlState = \libxml_use_internal_errors(true);
        try {
            $domDocument->loadHTML($this->prepareHtmlForDomConversion($html), LIBXML_PARSEHUGE);
        } finally {
            // The libxml error mode is process-wide state, so it must never be left changed for the caller.
            \libxml_clear_errors();
            \libxml_use_internal_errors($libXmlState);
        }

        $this->setDomDocument($domDocument);
    }

    /**
     * Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
     * ensuring that the HTML will be good for creating a DOM document from it.
     */
    private function prepareHtmlForDomConversion(string $html): string
    {
        $htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);
        $htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);

        return $this->addContentTypeMetaTag($htmlWithDocumentType);
    }

    /**
     * Makes sure that the passed HTML has a document type, with lowercase "html".
     *
     * @return non-empty-string HTML with document type
     */
    private function ensureDocumentType(string $html): string
    {
        $hasDocumentType = \stripos($html, '<!DOCTYPE') !== false;
        if ($hasDocumentType) {
            return $this->normalizeDocumentType($html);
        }

        return self::DEFAULT_DOCUMENT_TYPE . $html;
    }

    /**
     * Makes sure the document type in the passed HTML has lowercase `html`.
     *
     * @param non-empty-string $html
     *
     * @return non-empty-string HTML with normalized document type
     */
    private function normalizeDocumentType(string $html): string
    {
        // Limit to replacing the first occurrence: as an optimization; and in case an example exists as unescaped text.
        $result = preg_replace(
            '/<!DOCTYPE\\s++html(?=[\\s>])/i',
            '<!DOCTYPE html',
            $html,
            1
        );
        \assert($result !== '');

        return $result;
    }

    /**
     * Adds a Content-Type meta tag for the charset.
     *
     * This method also ensures that there is a HEAD element.
     *
     * @param non-empty-string $html
     *
     * @return non-empty-string
     */
    private function addContentTypeMetaTag(string $html): string
    {
        if ($this->hasContentTypeMetaTagInHead($html)) {
            return $html;
        }

        // We are trying to insert the meta tag to the right spot in the DOM.
        // If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
        $hasHeadTag = preg_match('/<head[\\s>]/i', $html) !== 0;
        $hasHtmlTag = \stripos($html, '<html') !== false;

        if ($hasHeadTag) {
            // Put the meta tag directly after the `<head>` start tag, keeping that tag's attributes.
            $reworkedHtml = preg_replace(
                '/<head(?=[\\s>])([^>]*+)>/i',
                '<head$1>' . self::CONTENT_TYPE_META_TAG,
                $html
            );
        } elseif ($hasHtmlTag) {
            // There is no `<head>`, so one is created directly after the `<html>` start tag.
            $reworkedHtml = preg_replace(
                '/<html(.*?)>/is',
                '<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',
                $html
            );
        } else {
            $reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;
        }
        \assert($reworkedHtml !== '');

        return $reworkedHtml;
    }

    /**
     * Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element.  Due to tag
     * omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
     * encountering a start tag for any element which is permitted only within the `<body>`.
     */
    private function hasContentTypeMetaTagInHead(string $html): bool
    {
        // On success, the match consists of all HTML before the first `Content-Type` meta tag.
        preg_match(
            '%
                (?(DEFINE)
                    # the target `http-equiv` attribute match
                    (?<target_attribute>
                        http-equiv=(["\']?+)Content-Type\\g{-1}
                        # must be followed by one of these characters
                        [\\s/>]
                    )
                    # the target `meta` element match without the opening `<`
                    (?<target>
                        meta(?=\\s)
                        # one or other of these
                        (?:
                            # one or more characters other than `>` or space
                            [^>\\s]++
                            |
                            # space not followed by the target `http-equiv` attribute
                            \\s(?!(?&target_attribute))
                        )
                        # any number of times (including zero)
                        *+
                        \\s(?&target_attribute)
                    )
                )
                # start of `subject`
                ^
                # one or other of these
                (?:
                    # one or more characters other than `<`
                    [^<]++
                    |
                    # `<` not followed by `target`
                    <(?!(?&target))
                )
                # any number of times (including zero)
                *+
                # followed by the target, not captured
                (?=<(?&target))
            %isx',
            $html,
            $matches
        );
        if (isset($matches[0])) {
            $htmlBefore = $matches[0];
            try {
                // The meta tag only counts if the `<head>` has not already ended before it.
                $hasContentTypeMetaTagInHead = !$this->hasEndOfHeadElement($htmlBefore);
            } catch (\RuntimeException $exception) {
                // If something unexpected occurs, assume the `Content-Type` that was found is valid.
                \trigger_error($exception->getMessage());
                $hasContentTypeMetaTagInHead = true;
            }
        } else {
            $hasContentTypeMetaTagInHead = false;
        }

        return $hasContentTypeMetaTagInHead;
    }

    /**
     * Tests whether the `<head>` element ends within the given HTML.  Due to tag omission rules, HTML parsers are
     * expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
     * which is permitted only within the `<body>`.
     *
     * @throws \RuntimeException
     */
    private function hasEndOfHeadElement(string $html): bool
    {
        if (preg_match('%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head>%i', $html) !== 0) {
            // An exception to the implicit end of the `<head>` is any content within a `<template>` element, as well in
            // comments.  As an optimization, this is only checked for if a potential `<head>` end tag is found.
            $htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));
            // If nothing was removed, the match was real.  Otherwise the check is repeated on the stripped HTML.
            $hasEndOfHeadElement = $htmlWithoutCommentsOrTemplates === $html
                || $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);
        } else {
            $hasEndOfHeadElement = false;
        }

        return $hasEndOfHeadElement;
    }

    /**
     * Removes comments from the given HTML, including any which are unterminated, for which the remainder of the string
     * is removed.
     */
    private function removeHtmlComments(string $html): string
    {
        return preg_replace(self::HTML_COMMENT_PATTERN, '', $html);
    }

    /**
     * Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
     * the string is removed.
     */
    private function removeHtmlTemplateElements(string $html): string
    {
        return preg_replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);
    }

    /**
     * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
     * self-closing slash.
     *
     * Tag names are matched case-insensitively, as HTML tag names are not case-sensitive.  The tag name must be
     * followed by whitespace, `/` or `>`, so that elements whose names merely start with a void tag name
     * (e.g. a custom element `<source-list>`) are left alone.
     */
    private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string
    {
        return preg_replace(
            '%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '(?=[\\s/>])[^>]*+(?<!/)(?=>)%i',
            '$0/',
            $html
        );
    }

    /**
     * Checks that $this->domDocument has a BODY element and adds it if it is missing.
     *
     * @throws \UnexpectedValueException
     */
    private function ensureExistenceOfBodyElement(): void
    {
        if ($this->getDomDocument()->getElementsByTagName('body')->item(0) instanceof \DOMElement) {
            return;
        }

        $this->getHtmlElement()->appendChild($this->getDomDocument()->createElement('body'));
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc