22834755616

Committed 09 Mar 2026 01:34AM UTC coverage: 96.211% (-0.09%) from 96.305%

Build # 22834755616

Build Type

Pull #1588

github

Committed by

web-flow

Commit Message

Merge 4163c161f into c9e5ad70d

Pull Request #1588: [BUGFIX] Throw exception if `DOMDocument::saveHTML` fails

Coverage Stats

6 of 7 new or added lines in 1 file covered. (85.71%)

4 existing lines in 1 file now uncovered.

838 of 871 relevant lines covered (96.21%)

259.45 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.37

/src/HtmlProcessor/AbstractHtmlProcessor.php

<?php

declare(strict_types=1);

namespace Pelago\Emogrifier\HtmlProcessor;

use function Safe\preg_match;
use function Safe\preg_replace;

/**
 * Base class for HTML processor that e.g., can remove, add or modify nodes or attributes.
 *
 * The "vanilla" subclass is the HtmlNormalizer.
 */
abstract class AbstractHtmlProcessor
{
    /**
     * Document type that is prepended to HTML which does not contain a document type yet.
     */
    protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';

    /**
     * `Content-Type` meta tag (declaring UTF-8) that is inserted into HTML which does not yet have such a tag in its
     * `<head>`.
     */
    protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';

    /**
     * Regular expression part to match tag names that PHP's DOMDocument implementation is not
     * aware are self-closing. These are mostly HTML5 elements, but for completeness `<command>` (obsolete) and
     * `<keygen>` (deprecated) are also included.
     *
     * This is only a fragment (a non-capturing group without delimiters) that gets embedded into complete patterns.
     *
     * @see https://bugs.php.net/bug.php?id=73175
     */
    protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

    /**
     * Regular expression part to match tag names that may appear before the start of the `<body>` element.  A start tag
     * for any other element would implicitly start the `<body>` element due to tag omission rules.
     *
     * This is only a fragment (a non-capturing group without delimiters) that gets embedded into complete patterns.
     */
    protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
        = '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';

    /**
     * Regular expression pattern to match an HTML comment, including delimiters and modifiers.
     *
     * A comment that is not terminated is matched up to the end of the subject.
     */
    protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';

    /**
     * Regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers.
     *
     * An element without an end tag is matched up to the end of the subject.
     */
    protected const HTML_TEMPLATE_ELEMENT_PATTERN
        = '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';

    /**
     * The DOM document being processed; `null` until `setDomDocument()` has been called.
     *
     * @var \DOMDocument|null
     */
    protected $domDocument = null;

    /**
     * XPath instance for `$domDocument`; it is created whenever the DOM document is set.
     *
     * @var \DOMXPath|null
     */
    private $xPath = null;

    /**
     * The constructor.
     *
     * It is private and does nothing; the named constructors create the instance and then provide it with its DOM
     * document.
     *
     * Please use `::fromHtml` or `::fromDomDocument` instead.
     */
    private function __construct() {}

    /**
     * Creates a new processor instance for the given HTML source.
     *
     * The HTML is parsed into a DOM document right away.
     *
     * @param non-empty-string $unprocessedHtml raw HTML, must be UTF-8-encoded
     *
     * @return static
     *
     * @throws \InvalidArgumentException if $unprocessedHtml is an empty string
     */
    public static function fromHtml(string $unprocessedHtml): self
    {
        // @phpstan-ignore-next-line argument.type We're checking for a contract violation here.
        if ($unprocessedHtml === '') {
            throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);
        }

        $processor = new static();
        $processor->setHtml($unprocessedHtml);

        return $processor;
    }

    /**
     * Creates a new processor instance that works on an already existing DOM document.
     *
     * The document is not copied: the new instance operates on the very object passed in.
     *
     * @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
     *
     * @return static
     */
    public static function fromDomDocument(\DOMDocument $document): self
    {
        $processor = new static();
        $processor->setDomDocument($document);

        return $processor;
    }

    /**
     * Sets the HTML to process.
     *
     * This parses the HTML into a DOM document (which replaces any previously set document) via
     * `createUnifiedDomDocument()`.
     *
     * @param string $html the HTML to process, must be UTF-8-encoded
     */
    private function setHtml(string $html): void
    {
        $this->createUnifiedDomDocument($html);
    }

    /**
     * Returns the DOM document that represents the HTML in its current state.
     *
     * The document is returned as is (not as a copy).
     *
     * @throws \UnexpectedValueException if no DOM document has been set yet
     */
    public function getDomDocument(): \DOMDocument
    {
        if ($this->domDocument instanceof \DOMDocument) {
            return $this->domDocument;
        }

        throw new \UnexpectedValueException(
            self::class . '::setDomDocument() has not yet been called on ' . static::class,
            1570472239
        );
    }

    /**
     * Stores the given DOM document and creates the XPath instance that belongs to it.
     */
    private function setDomDocument(\DOMDocument $domDocument): void
    {
        $this->domDocument = $domDocument;
        $this->xPath = new \DOMXPath($domDocument);
    }

    /**
     * Returns the XPath instance for the current DOM document.
     *
     * @throws \UnexpectedValueException if no DOM document (and hence no XPath instance) has been set yet
     */
    protected function getXPath(): \DOMXPath
    {
        if ($this->xPath instanceof \DOMXPath) {
            return $this->xPath;
        }

        throw new \UnexpectedValueException(
            self::class . '::setDomDocument() has not yet been called on ' . static::class,
            1617819086
        );
    }

    /**
     * Renders the complete document as normalized and processed HTML.
     *
     * The serialized DOM may contain end tags for some void elements; these are stripped from the result.
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    public function render(): string
    {
        return $this->removeSelfClosingTagsClosingTags($this->getHtml());
    }

    /**
     * Renders only what is inside the BODY element of the normalized and processed HTML.
     *
     * The BODY element is serialized, end tags for void elements are stripped, and finally the `<body>` start and end
     * tags themselves are removed.
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    public function renderBodyContent(): string
    {
        $bodyElement = $this->getBodyElement();
        $serializedBody = $this->removeSelfClosingTagsClosingTags($this->getHtml($bodyElement));
        $bodyTagPattern = '%</?+body(?:\\s[^>]*+)?+>%';

        return preg_replace($bodyTagPattern, '', $serializedBody);
    }

    /**
     * Serializes the DOM document, or a single node of it, to HTML.
     *
     * @param ?\DOMNode $node optional parameter to output a subset of the document
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    private function getHtml(?\DOMNode $node = null): string
    {
        $serialized = $this->getDomDocument()->saveHTML($node);
        if (\is_string($serialized)) {
            return $serialized;
        }

        // `saveHTML()` signals failure with a non-string return value.
        throw new \RuntimeException('`DOMDocument::saveHTML()` failed.', 1773018082);
    }

    /**
     * Strips the (invalid) end tags of those void elements which `PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER` lists from
     * the given HTML.
     */
    private function removeSelfClosingTagsClosingTags(string $html): string
    {
        $voidElementEndTagPattern = '%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%';

        return preg_replace($voidElementEndTagPattern, '', $html);
    }

    /**
     * Returns the first HTML element of the DOM document.
     *
     * The document is expected to always have one; its absence is reported with an exception.
     *
     * @throws \UnexpectedValueException if there is no HTML element, or no DOM document has been set yet
     */
    protected function getHtmlElement(): \DOMElement
    {
        $firstHtmlNode = $this->getDomDocument()->getElementsByTagName('html')->item(0);
        if ($firstHtmlNode instanceof \DOMElement) {
            return $firstHtmlNode;
        }

        throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);
    }

    /**
     * Returns the first BODY element of the DOM document.
     *
     * The document is expected to always have one; its absence is reported with an exception.
     *
     * @throws \RuntimeException if there is no BODY element
     */
    private function getBodyElement(): \DOMElement
    {
        $firstBodyNode = $this->getDomDocument()->getElementsByTagName('body')->item(0);
        if ($firstBodyNode instanceof \DOMElement) {
            return $firstBodyNode;
        }

        throw new \RuntimeException('There is no body element.', 1617922607);
    }

    /**
     * Creates a DOM document from the given HTML and stores it in $this->domDocument.
     *
     * The DOM document will always have a BODY element and a document type: the document type is taken care of
     * while the HTML is prepared for parsing, and a missing BODY element is added after parsing.
     *
     * @param string $html the HTML to parse
     */
    private function createUnifiedDomDocument(string $html): void
    {
        $this->createRawDomDocument($html);
        // This needs the DOM document to be set already, so it must come second.
        $this->ensureExistenceOfBodyElement();
    }

    /**
     * Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument.
     *
     * libxml's internal error handling is switched on only while the HTML is parsed, so that problems in the markup
     * are collected (and then discarded) by libxml.  The previous setting is always restored afterwards, even if
     * parsing throws.
     *
     * @param string $html the HTML to parse
     */
    private function createRawDomDocument(string $html): void
    {
        // Prepare the HTML before touching the global libxml setting: the preparation runs `Safe` regular expression
        // functions, and an exception from those must not leave the setting changed.
        $preparedHtml = $this->prepareHtmlForDomConversion($html);

        $domDocument = new \DOMDocument();
        $domDocument->strictErrorChecking = false;
        $domDocument->formatOutput = false;

        $previousLibXmlState = \libxml_use_internal_errors(true);
        try {
            $domDocument->loadHTML($preparedHtml, \LIBXML_PARSEHUGE);
        } finally {
            // Discard whatever the parser has collected, and hand the global setting back as it was found.
            \libxml_clear_errors();
            \libxml_use_internal_errors($previousLibXmlState);
        }

        $this->setDomDocument($domDocument);
    }

    /**
     * Prepares the given HTML for being parsed into a DOM document.
     *
     * Three steps are applied in this order: self-closing slashes are added to void elements which PHP would not
     * recognize as such, a document type is ensured, and a Content-Type meta tag is added if needed.
     */
    private function prepareHtmlForDomConversion(string $html): string
    {
        return $this->addContentTypeMetaTag(
            $this->ensureDocumentType(
                $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html)
            )
        );
    }

    /**
     * Returns the given HTML with a document type: either its own one (with `html` in lowercase), or the default
     * one prepended if `<!DOCTYPE` (in any letter case) occurs nowhere in the HTML.
     *
     * @return non-empty-string HTML with document type
     */
    private function ensureDocumentType(string $html): string
    {
        if (\stripos($html, '<!DOCTYPE') === false) {
            return self::DEFAULT_DOCUMENT_TYPE . $html;
        }

        return $this->normalizeDocumentType($html);
    }

    /**
     * Rewrites the start of the document type in the given HTML to exactly `<!DOCTYPE html`, i.e., with uppercase
     * `DOCTYPE`, a single space, and lowercase `html`.
     *
     * @param non-empty-string $html
     *
     * @return non-empty-string HTML with normalized document type
     */
    private function normalizeDocumentType(string $html): string
    {
        $documentTypeStartPattern = '/<!DOCTYPE\\s++html(?=[\\s>])/i';
        $normalizedDocumentTypeStart = '<!DOCTYPE html';

        // Only the first occurrence is replaced: this saves work, and any further occurrence might merely be an
        // example within unescaped text.
        $normalizedHtml = preg_replace($documentTypeStartPattern, $normalizedDocumentTypeStart, $html, 1);
        \assert($normalizedHtml !== '');

        return $normalizedHtml;
    }

    /**
     * Inserts the Content-Type meta tag for the charset, unless the HTML already has one in its HEAD.
     *
     * Where the tag goes depends on the HTML: right after a `<head>` start tag if there is one; else, wrapped in a
     * new HEAD element, right after an `<html>` start tag if there is one; else, in front of everything.
     *
     * @param non-empty-string $html
     *
     * @return non-empty-string
     */
    private function addContentTypeMetaTag(string $html): string
    {
        if ($this->hasContentTypeMetaTagInHead($html)) {
            return $html;
        }

        $metaTag = self::CONTENT_TYPE_META_TAG;

        // The meta tag needs to go to the right spot in the DOM: simply prepending it to the HTML would make
        // attributes set on the HTML tag get lost.
        if (preg_match('/<head[\\s>]/i', $html) !== 0) {
            $htmlWithMetaTag = preg_replace('/<head(?=[\\s>])([^>]*+)>/i', '<head$1>' . $metaTag, $html);
        } elseif (\stripos($html, '<html') !== false) {
            $htmlWithMetaTag = preg_replace('/<html(.*?)>/is', '<html$1><head>' . $metaTag . '</head>', $html);
        } else {
            $htmlWithMetaTag = $metaTag . $html;
        }
        \assert($htmlWithMetaTag !== '');

        return $htmlWithMetaTag;
    }

    /**
     * Checks whether the given HTML contains a `Content-Type` metadata element that is located within the `<head>`
     * element.
     *
     * The regular expression captures everything in front of the first `<meta>` start tag that has an
     * `http-equiv="Content-Type"` attribute.  The tag counts as being in the `<head>` if the `<head>` element does not
     * end within that preceding HTML.  Due to tag omission rules, HTML parsers are expected to end the `<head>`
     * element and start the `<body>` element upon encountering a start tag for any element which is permitted only
     * within the `<body>`.
     */
    private function hasContentTypeMetaTagInHead(string $html): bool
    {
        preg_match(
            '%
                (?(DEFINE)
                    # the target `http-equiv` attribute match
                    (?<target_attribute>
                        http-equiv=(["\']?+)Content-Type\\g{-1}
                        # must be followed by one of these characters
                        [\\s/>]
                    )
                    # the target `meta` element match without the opening `<`
                    (?<target>
                        meta(?=\\s)
                        # one or other of these
                        (?:
                            # one or more characters other than `>` or space
                            [^>\\s]++
                            |
                            # space not followed by the target `http-equiv` attribute
                            \\s(?!(?&target_attribute))
                        )
                        # any number of times (including zero)
                        *+
                        \\s(?&target_attribute)
                    )
                )
                # start of `subject`
                ^
                # one or other of these
                (?:
                    # one or more characters other than `<`
                    [^<]++
                    |
                    # `<` not followed by `target`
                    <(?!(?&target))
                )
                # any number of times (including zero)
                *+
                # followed by the target, not captured
                (?=<(?&target))
            %isx',
            $html,
            $matches
        );

        // Without a match, there is no such meta tag anywhere in the HTML.
        if (!isset($matches[0])) {
            return false;
        }

        try {
            // The whole match is the HTML in front of the meta tag that was found.
            return !$this->hasEndOfHeadElement($matches[0]);
        } catch (\RuntimeException $exception) {
            // If something unexpected occurs, assume the `Content-Type` that was found is valid.
            // NOTE(review): please confirm that the exceptions the `Safe` functions throw are `\RuntimeException`s;
            // otherwise this fallback is never reached.
            \trigger_error($exception->getMessage());

            return true;
        }
    }

    /**
     * Checks whether the `<head>` element ends within the given HTML, either explicitly with a `</head>` end tag, or
     * implicitly: due to tag omission rules, HTML parsers are expected to end the `<head>` element and start the
     * `<body>` element upon encountering a start tag for any element which is permitted only within the `<body>`.
     *
     * Anything within comments or `<template>` elements is not taken into account.
     *
     * @throws \RuntimeException
     */
    private function hasEndOfHeadElement(string $html): bool
    {
        $endOfHeadPattern = '%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head>%i';
        if (preg_match($endOfHeadPattern, $html) === 0) {
            return false;
        }

        // A potential end of the `<head>` was found, but it does not count if it is within a comment or within a
        // `<template>` element.  As an optimization, those are only looked for (and removed) at this point.
        $htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));
        if ($htmlWithoutCommentsOrTemplates === $html) {
            // Nothing was removed, so what was found is real.
            return true;
        }

        return $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);
    }

    /**
     * Removes comments from the given HTML, including any which are unterminated, for which the remainder of the string
     * is removed.
     *
     * Every match of `HTML_COMMENT_PATTERN` is replaced with an empty string.
     *
     * @param string $html the HTML to remove the comments from
     *
     * @return string the HTML without comments
     */
    private function removeHtmlComments(string $html): string
    {
        return preg_replace(self::HTML_COMMENT_PATTERN, '', $html);
    }

    /**
     * Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
     * the string is removed.
     *
     * Every match of `HTML_TEMPLATE_ELEMENT_PATTERN` is replaced with an empty string.
     *
     * @param string $html the HTML to remove the `<template>` elements from
     *
     * @return string the HTML without `<template>` elements
     */
    private function removeHtmlTemplateElements(string $html): string
    {
        return preg_replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);
    }

    /**
     * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
     * self-closing slash.
     *
     * The tag names are matched case-insensitively, as HTML tag names may be written in any letter case (e.g.,
     * `<WBR>`).  The letter case of the tag itself is left unchanged.
     *
     * @param string $html the HTML to process
     *
     * @return string the HTML in which each affected start tag ends with `/>`
     */
    private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string
    {
        // The pattern matches an affected start tag up to (but not including) its closing `>`, unless there is
        // already a slash right in front of that.  The slash is then appended to what was matched.
        return preg_replace(
            '%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%i',
            '$0/',
            $html
        );
    }

    /**
     * Appends an empty BODY element to the HTML element of $this->domDocument, unless the document already has a
     * BODY element.
     *
     * @throws \UnexpectedValueException
     */
    private function ensureExistenceOfBodyElement(): void
    {
        $document = $this->getDomDocument();
        $existingBody = $document->getElementsByTagName('body')->item(0);

        if (!$existingBody instanceof \DOMElement) {
            $this->getHtmlElement()->appendChild($document->createElement('body'));
        }
    }
}

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Pelago\Emogrifier\HtmlProcessor;
6
7	use function Safe\preg_match;
8	use function Safe\preg_replace;
9
10	/**
11	* Base class for HTML processor that e.g., can remove, add or modify nodes or attributes.
12	*
13	* The "vanilla" subclass is the HtmlNormalizer.
14	*/
15	abstract class AbstractHtmlProcessor
16	{
17	protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';
18	protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';
19
20	/**
21	* Regular expression part to match tag names that PHP's DOMDocument implementation is not
22	* aware are self-closing. These are mostly HTML5 elements, but for completeness `<command>` (obsolete) and
23	* `<keygen>` (deprecated) are also included.
24	*
25	* @see https://bugs.php.net/bug.php?id=73175
26	*/
27	protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command\|embed\|keygen\|source\|track\|wbr)';
28
29	/**
30	* Regular expression part to match tag names that may appear before the start of the `<body>` element. A start tag
31	* for any other element would implicitly start the `<body>` element due to tag omission rules.
32	*/
33	protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
34	= '(?:html\|head\|base\|command\|link\|meta\|noscript\|script\|style\|template\|title)';
35
36	/**
37	* regular expression pattern to match an HTML comment, including delimiters and modifiers
38	*/
39	protected const HTML_COMMENT_PATTERN = '/<!--[^-]+(?:-(?!->)[^-]+)*+(?:-->\|$)/';
40
41	/**
42	* regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers
43	*/
44	protected const HTML_TEMPLATE_ELEMENT_PATTERN
45	= '%<template[\\s>][^<]+(?:<(?!/template>)[^<]+)*+(?:</template>\|$)%i';
46
47	/**
48	* @var \DOMDocument\|null
49	*/
50	protected $domDocument = null;
51
52	/**
53	* @var \DOMXPath\|null
54	*/
55	private $xPath = null;
56
57	/**
58	* The constructor.
59	*
60	* Please use `::fromHtml` or `::fromDomDocument` instead.
61	*/
62	private function __construct() {}
63
64	/**
65	* Builds a new instance from the given HTML.
66	*
67	* @param non-empty-string $unprocessedHtml raw HTML, must be UTF-encoded
68	*
69	* @return static
70	*
71	* @throws \InvalidArgumentException if $unprocessedHtml is anything other than a non-empty string
72	*/
73	public static function fromHtml(string $unprocessedHtml): self	612✔
74	{
75	// @phpstan-ignore-next-line argument.type We're checking for a contract violation here.
76	if ($unprocessedHtml === '') {	612✔
77	throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);	1✔
78	}
79
80	$instance = new static();	611✔
81	$instance->setHtml($unprocessedHtml);	611✔
82
83	return $instance;	611✔
84	}
85
86	/**
87	* Builds a new instance from the given DOM document.
88	*
89	* @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
90	*
91	* @return static
92	*/
93	public static function fromDomDocument(\DOMDocument $document): self	4✔
94	{
95	$instance = new static();	4✔
96	$instance->setDomDocument($document);	4✔
97
98	return $instance;	4✔
99	}
100
101	/**
102	* Sets the HTML to process.
103	*
104	* @param string $html the HTML to process, must be UTF-8-encoded
105	*/
106	private function setHtml(string $html): void	611✔
107	{
108	$this->createUnifiedDomDocument($html);	611✔
109	}
110
111	/**
112	* Provides access to the internal DOMDocument representation of the HTML in its current state.
113	*
114	* @throws \UnexpectedValueException
115	*/
116	public function getDomDocument(): \DOMDocument	613✔
117	{
118	if (!$this->domDocument instanceof \DOMDocument) {	613✔
119	$message = self::class . '::setDomDocument() has not yet been called on ' . static::class;	×
120	throw new \UnexpectedValueException($message, 1570472239);	×
121	}
122
123	return $this->domDocument;	613✔
124	}
125
126	private function setDomDocument(\DOMDocument $domDocument): void	615✔
127	{
128	$this->domDocument = $domDocument;	615✔
129	$this->xPath = new \DOMXPath($this->domDocument);	615✔
130	}
131
132	/**
133	* @throws \UnexpectedValueException
134	*/
135	protected function getXPath(): \DOMXPath	×
136	{
137	if (!$this->xPath instanceof \DOMXPath) {	×
138	$message = self::class . '::setDomDocument() has not yet been called on ' . static::class;	×
139	throw new \UnexpectedValueException($message, 1617819086);	×
140	}
141
142	return $this->xPath;	×
143	}
144
145	/**
146	* Renders the normalized and processed HTML.
147	*
148	* @throws \RuntimeException if there is an internal error with `DOMDocument`
149	*/
150	public function render(): string	212✔
151	{
152	$htmlWithPossibleErroneousClosingTags = $this->getHtml();	212✔
153
154	return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);	212✔
155	}
156
157	/**
158	* Renders the content of the BODY element of the normalized and processed HTML.
159	*
160	* @throws \RuntimeException if there is an internal error with `DOMDocument`
161	*/
162	public function renderBodyContent(): string	12✔
163	{
164	$htmlWithPossibleErroneousClosingTags = $this->getHtml($this->getBodyElement());	12✔
165	$bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);	12✔
166
167	return preg_replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml);	12✔
168	}
169
170	/**
171	* @param ?\DOMNode $node optional parameter to output a subset of the document
172	*
173	* @throws \RuntimeException if there is an internal error with `DOMDocument`
174	*/
175	private function getHtml(?\DOMNode $node = null): string	224✔
176	{
177	$html = $this->getDomDocument()->saveHTML($node);	224✔
178
179	if (!\is_string($html)) {	224✔
NEW 180	throw new \RuntimeException('`DOMDocument::saveHTML()` failed.', 1773018082);	×
181	}
182	return $html;	224✔
183	}
184
185	/**
186	* Eliminates any invalid closing tags for void elements from the given HTML.
187	*/
188	private function removeSelfClosingTagsClosingTags(string $html): string	224✔
189	{
190	return preg_replace('%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%', '', $html);	224✔
191	}
192
193	/**
194	* Returns the HTML element.
195	*
196	* This method assumes that there always is an HTML element, throwing an exception otherwise.
197	*
198	* @throws \UnexpectedValueException
199	*/
200	protected function getHtmlElement(): \DOMElement	414✔
201	{
202	$htmlElement = $this->getDomDocument()->getElementsByTagName('html')->item(0);	414✔
203	if (!$htmlElement instanceof \DOMElement) {	414✔
UNCOV 204	throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);	×
205	}
206
207	return $htmlElement;	414✔
208	}
209
210	/**
211	* Returns the BODY element.
212	*
213	* This method assumes that there always is a BODY element.
214	*
215	* @throws \RuntimeException
216	*/
217	private function getBodyElement(): \DOMElement	12✔
218	{
219	$node = $this->getDomDocument()->getElementsByTagName('body')->item(0);	12✔
220	if (!$node instanceof \DOMElement) {	12✔
UNCOV 221	throw new \RuntimeException('There is no body element.', 1617922607);	×
222	}
223
224	return $node;	12✔
225	}
226
227	/**
228	* Creates a DOM document from the given HTML and stores it in $this->domDocument.
229	*
230	* The DOM document will always have a BODY element and a document type.
231	*/
232	private function createUnifiedDomDocument(string $html): void	611✔
233	{
234	$this->createRawDomDocument($html);	611✔
235	$this->ensureExistenceOfBodyElement();	611✔
236	}
237
238	/**
239	* Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument.
240	*/
241	private function createRawDomDocument(string $html): void	611✔
242	{
243	$domDocument = new \DOMDocument();	611✔
244	$domDocument->strictErrorChecking = false;	611✔
245	$domDocument->formatOutput = false;	611✔
246	$libXmlState = \libxml_use_internal_errors(true);	611✔
247	$domDocument->loadHTML($this->prepareHtmlForDomConversion($html), LIBXML_PARSEHUGE);	611✔
248	\libxml_clear_errors();	611✔
249	\libxml_use_internal_errors($libXmlState);	611✔
250
251	$this->setDomDocument($domDocument);	611✔
252	}
253
254	/**
255	* Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
256	* ensuring that the HTML will be good for creating a DOM document from it.
257	*/
258	private function prepareHtmlForDomConversion(string $html): string	611✔
259	{
260	$htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);	611✔
261	$htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);	611✔
262
263	return $this->addContentTypeMetaTag($htmlWithDocumentType);	611✔
264	}
265
266	/**
267	* Makes sure that the passed HTML has a document type, with lowercase "html".
268	*
269	* @return non-empty-string HTML with document type
270	*/
271	private function ensureDocumentType(string $html): string	611✔
272	{
273	$hasDocumentType = \stripos($html, '<!DOCTYPE') !== false;	611✔
274	if ($hasDocumentType) {	611✔
275	return $this->normalizeDocumentType($html);	39✔
276	}
277
278	return self::DEFAULT_DOCUMENT_TYPE . $html;	572✔
279	}
280
281	/**
282	* Makes sure the document type in the passed HTML has lowercase `html`.
283	*
284	* @param non-empty-string $html
285	*
286	* @return non-empty-string HTML with normalized document type
287	*/
288	private function normalizeDocumentType(string $html): string	39✔
289	{
290	// Limit to replacing the first occurrence: as an optimization; and in case an example exists as unescaped text.
291	$result = preg_replace(	39✔
292	'/<!DOCTYPE\\s++html(?=[\\s>])/i',	39✔
293	'<!DOCTYPE html',	39✔
294	$html,	39✔
295	1	39✔
296	);	39✔
297	\assert($result !== '');	39✔
298
299	return $result;	39✔
300	}
301
302	/**
303	* Adds a Content-Type meta tag for the charset.
304	*
305	* This method also ensures that there is a HEAD element.
306	*
307	* @param non-empty-string $html
308	*
309	* @return non-empty-string
310	*/
311	private function addContentTypeMetaTag(string $html): string	611✔
312	{
313	if ($this->hasContentTypeMetaTagInHead($html)) {	611✔
314	return $html;	374✔
315	}
316
317	// We are trying to insert the meta tag to the right spot in the DOM.
318	// If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
319	$hasHeadTag = preg_match('/<head[\\s>]/i', $html) !== 0;	237✔
320	$hasHtmlTag = \stripos($html, '<html') !== false;	237✔
321
322	if ($hasHeadTag) {	237✔
323	$reworkedHtml = preg_replace(	42✔
324	'/<head(?=[\\s>])([^>]*+)>/i',	42✔
325	'<head$1>' . self::CONTENT_TYPE_META_TAG,	42✔
326	$html	42✔
327	);	42✔
328	} elseif ($hasHtmlTag) {	195✔
329	$reworkedHtml = preg_replace(	83✔
330	'/<html(.*?)>/is',	83✔
331	'<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',	83✔
332	$html	83✔
333	);	83✔
334	} else {
335	$reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;	112✔
336	}
337	\assert($reworkedHtml !== '');	237✔
338
339	return $reworkedHtml;	237✔
340	}
341
342	/**
343	* Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element. Due to tag
344	* omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
345	* encountering a start tag for any element which is permitted only within the `<body>`.
346	*/
347	private function hasContentTypeMetaTagInHead(string $html): bool	611✔
348	{
349	preg_match(	611✔
350	'%	611✔
351	(?(DEFINE)
352	# the target `http-equiv` attribute match
353	(?<target_attribute>
354	http-equiv=(["\']?+)Content-Type\\g{-1}
355	# must be followed by one of these characters
356	[\\s/>]
357	)
358	# the target `meta` element match without the opening `<`
359	(?<target>
360	meta(?=\\s)
361	# one or other of these
362	(?:
363	# one or more characters other than `>` or space
364	[^>\\s]++
365	\|
366	# space not followed by the target `http-equiv` attribute
367	\\s(?!(?&target_attribute))
368	)
369	# any number of times (including zero)
370	*+
371	\\s(?&target_attribute)
372	)
373	)
374	# start of `subject`
375	^
376	# one or other of these
377	(?:
378	# one or more characters other than `<`
379	[^<]++
380	\|
381	# `<` not followed by `target`
382	<(?!(?&target))
383	)
384	# any number of times (including zero)
385	*+
386	# followed by the target, not captured
387	(?=<(?&target))
388	%isx',	611✔
389	$html,	611✔
390	$matches	611✔
391	);	611✔
392	if (isset($matches[0])) {	611✔
393	$htmlBefore = $matches[0];	396✔
394	try {
395	$hasContentTypeMetaTagInHead = !$this->hasEndOfHeadElement($htmlBefore);	396✔
396	} catch (\RuntimeException $exception) {	×
397	// If something unexpected occurs, assume the `Content-Type` that was found is valid.
UNCOV 398	\trigger_error($exception->getMessage());	×
UNCOV 399	$hasContentTypeMetaTagInHead = true;	×
400	}
401	} else {
402	$hasContentTypeMetaTagInHead = false;	215✔
403	}
404
405	return $hasContentTypeMetaTagInHead;	611✔
406	}
407
408	/**
409	* Tests whether the `<head>` element ends within the given HTML. Due to tag omission rules, HTML parsers are
410	* expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
411	* which is permitted only within the `<body>`.
412	*
413	* @throws \RuntimeException
414	*/
415	private function hasEndOfHeadElement(string $html): bool	396✔
416	{
417	if (preg_match('%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w\|</head>%i', $html) !== 0) {	396✔
418	// An exception to the implicit end of the `<head>` is any content within a `<template>` element, as well in
419	// comments. As an optimization, this is only checked for if a potential `<head>` end tag is found.
420	$htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));	70✔
421	$hasEndOfHeadElement = $htmlWithoutCommentsOrTemplates === $html	70✔
422	\|\| $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);	70✔
423	} else {
424	$hasEndOfHeadElement = false;	374✔
425	}
426
427	return $hasEndOfHeadElement;	396✔
428	}
429
430	/**
431	* Removes comments from the given HTML, including any which are unterminated, for which the remainder of the string
432	* is removed.
433	*/
434	private function removeHtmlComments(string $html): string	70✔
435	{
436	return preg_replace(self::HTML_COMMENT_PATTERN, '', $html);	70✔
437	}
438
439	/**
440	* Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
441	* the string is removed.
442	*/
443	private function removeHtmlTemplateElements(string $html): string	70✔
444	{
445	return preg_replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);	70✔
446	}
447
448	/**
449	* Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
450	* self-closing slash.
451	*/
452	private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string	611✔
453	{
454	return preg_replace(	611✔
455	'%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%',	611✔
456	'$0/',	611✔
457	$html	611✔
458	);	611✔
459	}
460
461	/**
462	* Checks that $this->domDocument has a BODY element and adds it if it is missing.
463	*
464	* @throws \UnexpectedValueException
465	*/
466	private function ensureExistenceOfBodyElement(): void	611✔
467	{
468	if ($this->getDomDocument()->getElementsByTagName('body')->item(0) instanceof \DOMElement) {	611✔
469	return;	198✔
470	}
471
472	$this->getHtmlElement()->appendChild($this->getDomDocument()->createElement('body'));	413✔
473	}
474	}

MyIntervals / emogrifier / 22834755616

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous