• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MyIntervals / emogrifier / 16949732221

13 Aug 2025 09:27PM UTC coverage: 96.966% (+0.007%) from 96.959%
16949732221

Pull #1446

github

web-flow
Merge 93ecf9ef2 into dc89c2084
Pull Request #1446: [CLEANUP] Tighten types and comments in `AbstractHtmlProcessor`

5 of 5 new or added lines in 1 file covered. (100.0%)

3 existing lines in 1 file now uncovered.

863 of 890 relevant lines covered (96.97%)

252.48 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.63
/src/HtmlProcessor/AbstractHtmlProcessor.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace Pelago\Emogrifier\HtmlProcessor;
6

7
use Pelago\Emogrifier\Utilities\Preg;
8

9
/**
 * Base class for HTML processors that can, e.g., remove, add or modify nodes or attributes.
 *
 * The "vanilla" subclass is the HtmlNormalizer.
 *
 * Instances are created via `::fromHtml()` or `::fromDomDocument()`; the constructor is private.
 */
abstract class AbstractHtmlProcessor
{
    /**
     * Document type that is prepended to HTML which does not contain a `<!DOCTYPE` declaration.
     *
     * @var non-empty-string
     */
    protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';

    /**
     * `Content-Type` meta tag that is inserted if the HTML does not have one within its `<head>`.
     *
     * @var non-empty-string
     */
    protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';

    /**
     * @var non-empty-string Regular expression part to match tag names that PHP's DOMDocument implementation is not
     *      aware are self-closing. These are mostly HTML5 elements, but for completeness `<command>` (obsolete) and
     *      `<keygen>` (deprecated) are also included.
     *
     * @see https://bugs.php.net/bug.php?id=73175
     */
    protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

    /**
     * Regular expression part to match tag names that may appear before the start of the `<body>` element.  A start tag
     * for any other element would implicitly start the `<body>` element due to tag omission rules.
     *
     * @var non-empty-string
     */
    protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
        = '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';

    /**
     * Regular expression pattern to match an HTML comment, including delimiters and modifiers.  An unterminated
     * comment is matched up to the end of the string.
     *
     * @var non-empty-string
     */
    protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';

    /**
     * Regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers.  An
     * element without an end tag is matched up to the end of the string.
     *
     * @var non-empty-string
     */
    protected const HTML_TEMPLATE_ELEMENT_PATTERN
        = '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';

    /**
     * The document being processed; `null` until `setDomDocument()` has been called.
     *
     * @var \DOMDocument|null
     */
    protected $domDocument = null;

    /**
     * XPath instance bound to `$domDocument`; `null` until `setDomDocument()` has been called.
     *
     * @var \DOMXPath|null
     */
    private $xPath = null;

    /**
     * The constructor.
     *
     * Please use `::fromHtml` or `::fromDomDocument` instead.
     */
    private function __construct() {}

    /**
     * Builds a new instance from the given HTML.
     *
     * @param non-empty-string $unprocessedHtml raw HTML, must be UTF-8-encoded
     *
     * @return static
     *
     * @throws \InvalidArgumentException if $unprocessedHtml is anything other than a non-empty string
     */
    public static function fromHtml(string $unprocessedHtml): self
    {
        // @phpstan-ignore-next-line argument.type We're checking for a contract violation here.
        if ($unprocessedHtml === '') {
            throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);
        }

        $instance = new static();
        $instance->setHtml($unprocessedHtml);

        return $instance;
    }

    /**
     * Builds a new instance from the given DOM document.
     *
     * @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
     *
     * @return static
     */
    public static function fromDomDocument(\DOMDocument $document): self
    {
        $instance = new static();
        $instance->setDomDocument($document);

        return $instance;
    }

    /**
     * Sets the HTML to process.
     *
     * @param string $html the HTML to process, must be UTF-8-encoded
     */
    private function setHtml(string $html): void
    {
        $this->createUnifiedDomDocument($html);
    }

    /**
     * Provides access to the internal DOMDocument representation of the HTML in its current state.
     *
     * @throws \UnexpectedValueException if no DOM document has been set on this instance yet
     */
    public function getDomDocument(): \DOMDocument
    {
        if (!$this->domDocument instanceof \DOMDocument) {
            $message = self::class . '::setDomDocument() has not yet been called on ' . static::class;
            throw new \UnexpectedValueException($message, 1570472239);
        }

        return $this->domDocument;
    }

    /**
     * Stores the given DOM document and creates the XPath instance bound to it.
     */
    private function setDomDocument(\DOMDocument $domDocument): void
    {
        $this->domDocument = $domDocument;
        $this->xPath = new \DOMXPath($this->domDocument);
    }

    /**
     * Provides the XPath instance for the current DOM document.
     *
     * @throws \UnexpectedValueException if no DOM document has been set on this instance yet
     */
    protected function getXPath(): \DOMXPath
    {
        if (!$this->xPath instanceof \DOMXPath) {
            $message = self::class . '::setDomDocument() has not yet been called on ' . static::class;
            throw new \UnexpectedValueException($message, 1617819086);
        }

        return $this->xPath;
    }

    /**
     * Renders the normalized and processed HTML.
     */
    public function render(): string
    {
        $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML();

        return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);
    }

    /**
     * Renders the content of the BODY element of the normalized and processed HTML.
     */
    public function renderBodyContent(): string
    {
        $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML($this->getBodyElement());
        $bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);

        // The BODY node is serialized including its own tags; strip them so that only the content remains.
        return (new Preg())->replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml);
    }

    /**
     * Eliminates any invalid closing tags for void elements from the given HTML.
     */
    private function removeSelfClosingTagsClosingTags(string $html): string
    {
        return (new Preg())->replace('%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%', '', $html);
    }

    /**
     * Returns the HTML element.
     *
     * This method assumes that there always is an HTML element, throwing an exception otherwise.
     *
     * @throws \UnexpectedValueException
     */
    protected function getHtmlElement(): \DOMElement
    {
        $htmlElement = $this->getDomDocument()->getElementsByTagName('html')->item(0);
        if (!$htmlElement instanceof \DOMElement) {
            throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);
        }

        return $htmlElement;
    }

    /**
     * Returns the BODY element.
     *
     * This method assumes that there always is a BODY element.
     *
     * @throws \RuntimeException
     */
    private function getBodyElement(): \DOMElement
    {
        $node = $this->getDomDocument()->getElementsByTagName('body')->item(0);
        if (!$node instanceof \DOMElement) {
            throw new \RuntimeException('There is no body element.', 1617922607);
        }

        return $node;
    }

    /**
     * Creates a DOM document from the given HTML and stores it in $this->domDocument.
     *
     * The DOM document will always have a BODY element and a document type.
     */
    private function createUnifiedDomDocument(string $html): void
    {
        $this->createRawDomDocument($html);
        $this->ensureExistenceOfBodyElement();
    }

    /**
     * Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument.
     *
     * libxml errors raised while parsing are collected internally and then discarded.  The previous libxml error
     * handling setting is restored afterwards, also if preparing or loading the HTML throws.
     */
    private function createRawDomDocument(string $html): void
    {
        $domDocument = new \DOMDocument();
        $domDocument->strictErrorChecking = false;
        $domDocument->formatOutput = false;

        $libXmlState = \libxml_use_internal_errors(true);
        try {
            $domDocument->loadHTML($this->prepareHtmlForDomConversion($html));
        } finally {
            // The libxml setting is process-wide, so it must not stay modified if an exception propagates from here.
            \libxml_clear_errors();
            \libxml_use_internal_errors($libXmlState);
        }

        $this->setDomDocument($domDocument);
    }

    /**
     * Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
     * ensuring that the HTML will be good for creating a DOM document from it.
     */
    private function prepareHtmlForDomConversion(string $html): string
    {
        $htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);
        $htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);

        return $this->addContentTypeMetaTag($htmlWithDocumentType);
    }

    /**
     * Makes sure that the passed HTML has a document type, with lowercase "html".
     *
     * @return non-empty-string HTML with document type
     */
    private function ensureDocumentType(string $html): string
    {
        $hasDocumentType = \stripos($html, '<!DOCTYPE') !== false;
        if ($hasDocumentType) {
            return $this->normalizeDocumentType($html);
        }

        return self::DEFAULT_DOCUMENT_TYPE . $html;
    }

    /**
     * Makes sure the document type in the passed HTML has lowercase `html`.
     *
     * @return ($html is non-empty-string ? non-empty-string : string) HTML with normalized document type
     */
    private function normalizeDocumentType(string $html): string
    {
        // Limit to replacing the first occurrence: as an optimization; and in case an example exists as unescaped text.
        return (new Preg())->replace(
            '/<!DOCTYPE\\s++html(?=[\\s>])/i',
            '<!DOCTYPE html',
            $html,
            1
        );
    }

    /**
     * Adds a Content-Type meta tag for the charset.
     *
     * This method also ensures that there is a HEAD element.
     *
     * @return non-empty-string
     */
    private function addContentTypeMetaTag(string $html): string
    {
        if ($this->hasContentTypeMetaTagInHead($html)) {
            \assert($html !== '');
            return $html;
        }

        // We are trying to insert the meta tag to the right spot in the DOM.
        // If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
        $hasHeadTag = (new Preg())->match('/<head[\\s>]/i', $html) !== 0;
        $hasHtmlTag = \stripos($html, '<html') !== false;

        // NOTE(review): unlike normalizeDocumentType(), the two replacements below are not limited to the first
        // occurrence, so every matching `<head>`/`<html>` start tag in the string receives the meta tag - confirm
        // whether that is intended.
        if ($hasHeadTag) {
            $reworkedHtml = (new Preg())->replace(
                '/<head(?=[\\s>])([^>]*+)>/i',
                '<head$1>' . self::CONTENT_TYPE_META_TAG,
                $html
            );
        } elseif ($hasHtmlTag) {
            $reworkedHtml = (new Preg())->replace(
                '/<html(.*?)>/is',
                '<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',
                $html
            );
        } else {
            $reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;
        }
        \assert($reworkedHtml !== '');

        return $reworkedHtml;
    }

    /**
     * Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element.  Due to tag
     * omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
     * encountering a start tag for any element which is permitted only within the `<body>`.
     */
    private function hasContentTypeMetaTagInHead(string $html): bool
    {
        // The lookahead leaves the meta tag itself out of the match, so $matches[0] is only the HTML preceding it.
        (new Preg())->match(
            '%^.*?(?=<meta(?=\\s)[^>]*\\shttp-equiv=(["\']?+)Content-Type\\g{-1}[\\s/>])%is',
            $html,
            $matches
        );
        if (isset($matches[0])) {
            $htmlBefore = $matches[0];
            try {
                $hasContentTypeMetaTagInHead = !$this->hasEndOfHeadElement($htmlBefore);
            } catch (\RuntimeException $exception) {
                // If something unexpected occurs, assume the `Content-Type` that was found is valid.
                \trigger_error($exception->getMessage());
                $hasContentTypeMetaTagInHead = true;
            }
        } else {
            $hasContentTypeMetaTagInHead = false;
        }

        return $hasContentTypeMetaTagInHead;
    }

    /**
     * Tests whether the `<head>` element ends within the given HTML.  Due to tag omission rules, HTML parsers are
     * expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
     * which is permitted only within the `<body>`.
     *
     * @throws \RuntimeException
     */
    private function hasEndOfHeadElement(string $html): bool
    {
        if (
            (new Preg())->match('%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head>%i', $html)
            !== 0
        ) {
            // An exception to the implicit end of the `<head>` is any content within a `<template>` element, as well in
            // comments.  As an optimization, this is only checked for if a potential `<head>` end tag is found.
            $htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));
            // If nothing was removed, the match found above is genuine; otherwise re-check the stripped HTML.
            $hasEndOfHeadElement = $htmlWithoutCommentsOrTemplates === $html
                || $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);
        } else {
            $hasEndOfHeadElement = false;
        }

        return $hasEndOfHeadElement;
    }

    /**
     * Removes comments from the given HTML, including any which are unterminated, for which the remainder of the string
     * is removed.
     *
     * @throws \RuntimeException
     */
    private function removeHtmlComments(string $html): string
    {
        return (new Preg())->throwExceptions(true)->replace(self::HTML_COMMENT_PATTERN, '', $html);
    }

    /**
     * Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
     * the string is removed.
     *
     * @throws \RuntimeException
     */
    private function removeHtmlTemplateElements(string $html): string
    {
        return (new Preg())->throwExceptions(true)->replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);
    }

    /**
     * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
     * self-closing slash.
     */
    private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string
    {
        // NOTE(review): the pattern has no `i` modifier, so only lowercase tag names are matched - confirm whether
        // uppercase void tags (e.g. `<WBR>`) need to be handled as well.
        return (new Preg())->replace(
            '%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%',
            '$0/',
            $html
        );
    }

    /**
     * Checks that $this->domDocument has a BODY element and adds it if it is missing.
     *
     * @throws \UnexpectedValueException
     */
    private function ensureExistenceOfBodyElement(): void
    {
        if ($this->getDomDocument()->getElementsByTagName('body')->item(0) instanceof \DOMElement) {
            return;
        }

        $this->getHtmlElement()->appendChild($this->getDomDocument()->createElement('body'));
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc