25810750037

Committed 13 May 2026 03:59PM UTC coverage: 98.246% (+0.6%) from 97.669%

Build # 25810750037

Build Type

Pull #1620

github

Committed by

web-flow

Commit Message

Merge ac6ce8566 into 635eb2818

Pull Request #1620: [CLEANUP] Rely on PHP to detect access to uninitialized properties

Coverage Stats

8 of 8 new or added lines in 4 files covered. (100.0%)

5 existing lines in 1 file now uncovered.

840 of 855 relevant lines covered (98.25%)

259.61 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.28

/src/HtmlProcessor/AbstractHtmlProcessor.php

<?php

declare(strict_types=1);

namespace Pelago\Emogrifier\HtmlProcessor;

use function Safe\preg_match;
use function Safe\preg_replace;

/**
 * Base class for HTML processor that e.g., can remove, add or modify nodes or attributes.
 *
 * Instances are created via the named constructors `fromHtml()` or `fromDomDocument()`; both leave the instance with
 * a DOM document and a matching XPath object set.
 *
 * The "vanilla" subclass is the HtmlNormalizer.
 */
abstract class AbstractHtmlProcessor
{
    protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';
    protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';

    /**
     * Regular expression part to match tag names that PHP's DOMDocument implementation is not
     * aware are self-closing. These are mostly HTML5 elements, but for completeness `<command>` (obsolete) and
     * `<keygen>` (deprecated) are also included.
     *
     * @see https://bugs.php.net/bug.php?id=73175
     */
    protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

    /**
     * Regular expression part to match tag names that may appear before the start of the `<body>` element. A start tag
     * for any other element would implicitly start the `<body>` element due to tag omission rules.
     */
    protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
        = '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';

    /**
     * regular expression pattern to match an HTML comment, including delimiters and modifiers
     */
    protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';

    /**
     * regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers
     */
    protected const HTML_TEMPLATE_ELEMENT_PATTERN
        = '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';

    /**
     * the DOM document being processed; set together with `$xPath` in `setDomDocument()`
     *
     * @var \DOMDocument
     */
    private $domDocument;

    /**
     * XPath object bound to `$domDocument`; set together with it in `setDomDocument()`
     *
     * @var \DOMXPath
     */
    protected $xPath;

    /**
     * The constructor.
     *
     * Please use `::fromHtml` or `::fromDomDocument` instead.
     */
    final private function __construct() {}

    /**
     * Builds a new instance from the given HTML.
     *
     * @param non-empty-string $unprocessedHtml raw HTML, must be UTF-8-encoded
     *
     * @return static
     *
     * @throws \InvalidArgumentException if $unprocessedHtml is anything other than a non-empty string
     */
    public static function fromHtml(string $unprocessedHtml): self
    {
        if ($unprocessedHtml === '') {
            throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);
        }

        $instance = new static();
        $instance->setHtml($unprocessedHtml);
        \assert($instance->xPath instanceof \DOMXPath);

        return $instance;
    }

    /**
     * Builds a new instance from the given DOM document.
     *
     * @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
     *
     * @return static
     */
    public static function fromDomDocument(\DOMDocument $document): self
    {
        $instance = new static();
        $instance->setDomDocument($document);
        \assert($instance->xPath instanceof \DOMXPath);

        return $instance;
    }

    /**
     * Sets the HTML to process.
     *
     * @param string $html the HTML to process, must be UTF-8-encoded
     */
    private function setHtml(string $html): void
    {
        $this->createUnifiedDomDocument($html);
    }

    /**
     * Provides access to the internal DOMDocument representation of the HTML in its current state.
     *
     * The returned object is the instance held by this processor, not a copy.
     */
    public function getDomDocument(): \DOMDocument
    {
        return $this->domDocument;
    }

    /**
     * Stores the given DOM document and creates the XPath object for it, keeping both properties in sync.
     */
    private function setDomDocument(\DOMDocument $domDocument): void
    {
        $this->domDocument = $domDocument;
        $this->xPath = new \DOMXPath($domDocument);
    }

    /**
     * Renders the normalized and processed HTML.
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    public function render(): string
    {
        return $this->getHtml();
    }

    /**
     * Renders the content of the BODY element of the normalized and processed HTML.
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    public function renderBodyContent(): string
    {
        $bodyNodeHtml = $this->getHtml($this->getBodyElement());

        // Strip the `<body …>` start tag and the `</body>` end tag, leaving only the content.
        return preg_replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml);
    }

    /**
     * Serializes the document (or the given node) to HTML and removes closing tags of void elements from the result.
     *
     * @param ?\DOMNode $node optional parameter to output a subset of the document
     *
     * @throws \RuntimeException if there is an internal error with `DOMDocument`
     */
    private function getHtml(?\DOMNode $node = null): string
    {
        $html = $this->getDomDocument()->saveHTML($node);

        if (!\is_string($html)) {
            throw new \RuntimeException('`DOMDocument::saveHTML()` failed.', 1773018082);
        }

        return $this->removeSelfClosingTagsClosingTags($html);
    }

    /**
     * Eliminates any invalid closing tags for void elements from the given HTML.
     */
    private function removeSelfClosingTagsClosingTags(string $html): string
    {
        return preg_replace('%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%', '', $html);
    }

    /**
     * Returns the HTML element.
     *
     * This method assumes that there always is an HTML element, throwing an exception otherwise.
     *
     * @throws \UnexpectedValueException
     */
    protected function getHtmlElement(): \DOMElement
    {
        $htmlElement = $this->getDomDocument()->getElementsByTagName('html')->item(0);
        if (!$htmlElement instanceof \DOMElement) {
            throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);
        }

        return $htmlElement;
    }

    /**
     * Returns the BODY element.
     *
     * This method assumes that there always is a BODY element.
     *
     * @throws \RuntimeException
     */
    private function getBodyElement(): \DOMElement
    {
        $node = $this->getDomDocument()->getElementsByTagName('body')->item(0);
        if (!$node instanceof \DOMElement) {
            throw new \RuntimeException('There is no body element.', 1617922607);
        }

        return $node;
    }

    /**
     * Creates a DOM document from the given HTML and stores it in `$this->domDocument`.
     *
     * The DOM document will always have a BODY element and a document type.
     */
    private function createUnifiedDomDocument(string $html): void
    {
        $this->createRawDomDocument($html);
        $this->ensureExistenceOfBodyElement();
    }

    /**
     * Creates a DOMDocument instance from the given HTML and stores it in `$this->domDocument`.
     *
     * libxml errors raised while parsing are collected internally and discarded. The previous libxml error-handling
     * mode is restored afterwards, also if loading fails with an exception.
     */
    private function createRawDomDocument(string $html): void
    {
        // Prepare the HTML first so that a failure in the preparation cannot leave the libxml state modified.
        $preparedHtml = $this->prepareHtmlForDomConversion($html);

        $domDocument = new \DOMDocument();
        $domDocument->strictErrorChecking = false;
        $domDocument->formatOutput = false;

        $libXmlState = \libxml_use_internal_errors(true);
        try {
            $domDocument->loadHTML($preparedHtml, \LIBXML_PARSEHUGE);
        } finally {
            // The libxml error mode is not scoped to this object, so it must be reset on every exit path.
            \libxml_clear_errors();
            \libxml_use_internal_errors($libXmlState);
        }

        $this->setDomDocument($domDocument);
    }

    /**
     * Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
     * ensuring that the HTML will be good for creating a DOM document from it.
     */
    private function prepareHtmlForDomConversion(string $html): string
    {
        $htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);
        $htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);

        return $this->addContentTypeMetaTag($htmlWithDocumentType);
    }

    /**
     * Makes sure that the passed HTML has a document type, with lowercase "html".
     *
     * @return non-empty-string HTML with document type
     */
    private function ensureDocumentType(string $html): string
    {
        $hasDocumentType = \stripos($html, '<!DOCTYPE') !== false;
        if ($hasDocumentType) {
            return $this->normalizeDocumentType($html);
        }

        return self::DEFAULT_DOCUMENT_TYPE . $html;
    }

    /**
     * Makes sure the document type in the passed HTML has lowercase `html`.
     *
     * @param non-empty-string $html
     *
     * @return non-empty-string HTML with normalized document type
     */
    private function normalizeDocumentType(string $html): string
    {
        // Limit to replacing the first occurrence: as an optimization; and in case an example exists as unescaped text.
        $result = preg_replace(
            '/<!DOCTYPE\\s++html(?=[\\s>])/i',
            '<!DOCTYPE html',
            $html,
            1,
        );
        \assert($result !== '');

        return $result;
    }

    /**
     * Adds a Content-Type meta tag for the charset.
     *
     * This method also ensures that there is a HEAD element.
     *
     * @param non-empty-string $html
     *
     * @return non-empty-string
     */
    private function addContentTypeMetaTag(string $html): string
    {
        if ($this->hasContentTypeMetaTagInHead($html)) {
            return $html;
        }

        // We are trying to insert the meta tag to the right spot in the DOM.
        // If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
        $hasHeadTag = preg_match('/<head[\\s>]/i', $html) !== 0;
        $hasHtmlTag = \stripos($html, '<html') !== false;

        if ($hasHeadTag) {
            // Insert the meta tag directly after the `<head>` start tag, keeping the tag's attributes.
            $reworkedHtml = preg_replace(
                '/<head(?=[\\s>])([^>]*+)>/i',
                '<head$1>' . self::CONTENT_TYPE_META_TAG,
                $html,
            );
        } elseif ($hasHtmlTag) {
            // No `<head>`: add one, containing the meta tag, directly after the `<html>` start tag.
            $reworkedHtml = preg_replace(
                '/<html(.*?)>/is',
                '<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',
                $html,
            );
        } else {
            // Neither `<head>` nor `<html>`: there are no tag attributes that could get lost by prepending.
            $reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;
        }
        \assert($reworkedHtml !== '');

        return $reworkedHtml;
    }

    /**
     * Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element. Due to tag
     * omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
     * encountering a start tag for any element which is permitted only within the `<body>`.
     */
    private function hasContentTypeMetaTagInHead(string $html): bool
    {
        // The overall match is everything preceding the first `Content-Type` meta element (which itself is only
        // matched by a lookahead), i.e. the part of the HTML that must not contain the end of the `<head>`.
        preg_match(
            '%
                (?(DEFINE)
                    # the target `http-equiv` attribute match
                    (?<target_attribute>
                        http-equiv=(["\']?+)Content-Type\\g{-1}
                        # must be followed by one of these characters
                        [\\s/>]
                    )
                    # the target `meta` element match without the opening `<`
                    (?<target>
                        meta(?=\\s)
                        # one or other of these
                        (?:
                            # one or more characters other than `>` or space
                            [^>\\s]++
                            |
                            # space not followed by the target `http-equiv` attribute
                            \\s(?!(?&target_attribute))
                        )
                        # any number of times (including zero)
                        *+
                        \\s(?&target_attribute)
                    )
                )
                # start of `subject`
                ^
                # one or other of these
                (?:
                    # one or more characters other than `<`
                    [^<]++
                    |
                    # `<` not followed by `target`
                    <(?!(?&target))
                )
                # any number of times (including zero)
                *+
                # followed by the target, not captured
                (?=<(?&target))
            %isx',
            $html,
            $matches,
        );
        if (isset($matches[0])) {
            $htmlBefore = $matches[0];
            try {
                $hasContentTypeMetaTagInHead = !$this->hasEndOfHeadElement($htmlBefore);
            } catch (\RuntimeException $exception) {
                // If something unexpected occurs, assume the `Content-Type` that was found is valid.
                \trigger_error($exception->getMessage());
                $hasContentTypeMetaTagInHead = true;
            }
        } else {
            $hasContentTypeMetaTagInHead = false;
        }

        return $hasContentTypeMetaTagInHead;
    }

    /**
     * Tests whether the `<head>` element ends within the given HTML. Due to tag omission rules, HTML parsers are
     * expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
     * which is permitted only within the `<body>`.
     *
     * @throws \RuntimeException
     */
    private function hasEndOfHeadElement(string $html): bool
    {
        if (preg_match('%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head>%i', $html) !== 0) {
            // An exception to the implicit end of the `<head>` is any content within a `<template>` element, as well in
            // comments. As an optimization, this is only checked for if a potential `<head>` end tag is found.
            $htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));
            // If nothing was removed, the match found above is genuine; otherwise re-check the stripped HTML.
            $hasEndOfHeadElement = $htmlWithoutCommentsOrTemplates === $html
                || $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);
        } else {
            $hasEndOfHeadElement = false;
        }

        return $hasEndOfHeadElement;
    }

    /**
     * Removes comments from the given HTML, including any which are unterminated, for which the remainder of the string
     * is removed.
     */
    private function removeHtmlComments(string $html): string
    {
        return preg_replace(self::HTML_COMMENT_PATTERN, '', $html);
    }

    /**
     * Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
     * the string is removed.
     */
    private function removeHtmlTemplateElements(string $html): string
    {
        return preg_replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);
    }

    /**
     * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
     * self-closing slash.
     */
    private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string
    {
        // `$0/` appends a slash to the matched tag (which excludes the final `>`), unless it already ends with one.
        return preg_replace(
            '%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%',
            '$0/',
            $html,
        );
    }

    /**
     * Checks that `$this->domDocument` has a BODY element and adds it if it is missing.
     */
    private function ensureExistenceOfBodyElement(): void
    {
        if ($this->getDomDocument()->getElementsByTagName('body')->item(0) instanceof \DOMElement) {
            return;
        }

        $this->getHtmlElement()->appendChild($this->getDomDocument()->createElement('body'));
    }
}

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Pelago\Emogrifier\HtmlProcessor;
6
7	use function Safe\preg_match;
8	use function Safe\preg_replace;
9
10	/**
11	* Base class for HTML processor that e.g., can remove, add or modify nodes or attributes.
12	*
13	* The "vanilla" subclass is the HtmlNormalizer.
14	*/
15	abstract class AbstractHtmlProcessor
16	{
17	protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';
18	protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';
19
20	/**
21	* Regular expression part to match tag names that PHP's DOMDocument implementation is not
22	* aware are self-closing. These are mostly HTML5 elements, but for completeness `<command>` (obsolete) and
23	* `<keygen>` (deprecated) are also included.
24	*
25	* @see https://bugs.php.net/bug.php?id=73175
26	*/
27	protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command\|embed\|keygen\|source\|track\|wbr)';
28
29	/**
30	* Regular expression part to match tag names that may appear before the start of the `<body>` element. A start tag
31	* for any other element would implicitly start the `<body>` element due to tag omission rules.
32	*/
33	protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
34	= '(?:html\|head\|base\|command\|link\|meta\|noscript\|script\|style\|template\|title)';
35
36	/**
37	* regular expression pattern to match an HTML comment, including delimiters and modifiers
38	*/
39	protected const HTML_COMMENT_PATTERN = '/<!--[^-]+(?:-(?!->)[^-]+)*+(?:-->\|$)/';
40
41	/**
42	* regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers
43	*/
44	protected const HTML_TEMPLATE_ELEMENT_PATTERN
45	= '%<template[\\s>][^<]+(?:<(?!/template>)[^<]+)*+(?:</template>\|$)%i';
46
47	/**
48	* @var \DOMDocument
49	*/
50	private $domDocument;
51
52	/**
53	* @var \DOMXPath
54	*/
55	protected $xPath;
56
57	/**
58	* The constructor.
59	*
60	* Please use `::fromHtml` or `::fromDomDocument` instead.
61	*/
62	final private function __construct() {}
63
64	/**
65	* Builds a new instance from the given HTML.
66	*
67	* @param non-empty-string $unprocessedHtml raw HTML, must be UTF-encoded
68	*
69	* @return static
70	*
71	* @throws \InvalidArgumentException if $unprocessedHtml is anything other than a non-empty string
72	*/
73	public static function fromHtml(string $unprocessedHtml): self	612✔
74	{
75	if ($unprocessedHtml === '') {	612✔
76	throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);	1✔
77	}
78
79	$instance = new static();	611✔
80	$instance->setHtml($unprocessedHtml);	611✔
81	\assert($instance->xPath instanceof \DOMXPath);	611✔
82
83	return $instance;	611✔
84	}
85
86	/**
87	* Builds a new instance from the given DOM document.
88	*
89	* @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
90	*
91	* @return static
92	*/
93	public static function fromDomDocument(\DOMDocument $document): self	4✔
94	{
95	$instance = new static();	4✔
96	$instance->setDomDocument($document);	4✔
97	\assert($instance->xPath instanceof \DOMXPath);	4✔
98
99	return $instance;	4✔
100	}
101
102	/**
103	* Sets the HTML to process.
104	*
105	* @param string $html the HTML to process, must be UTF-8-encoded
106	*/
107	private function setHtml(string $html): void	611✔
108	{
109	$this->createUnifiedDomDocument($html);	611✔
110	}
111
112	/**
113	* Provides access to the internal DOMDocument representation of the HTML in its current state.
114	*
115	* @throws \UnexpectedValueException
116	*/
117	public function getDomDocument(): \DOMDocument	613✔
118	{
119	return $this->domDocument;	613✔
120	}
121
122	private function setDomDocument(\DOMDocument $domDocument): void	615✔
123	{
124	$this->domDocument = $domDocument;	615✔
125	$this->xPath = new \DOMXPath($domDocument);	615✔
126	}
127
128	/**
129	* Renders the normalized and processed HTML.
130	*
131	* @throws \RuntimeException if there is an internal error with `DOMDocument`
132	*/
133	public function render(): string	212✔
134	{
135	return $this->getHtml();	212✔
136	}
137
138	/**
139	* Renders the content of the BODY element of the normalized and processed HTML.
140	*
141	* @throws \RuntimeException if there is an internal error with `DOMDocument`
142	*/
143	public function renderBodyContent(): string	12✔
144	{
145	$bodyNodeHtml = $this->getHtml($this->getBodyElement());	12✔
146
147	return preg_replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml);	12✔
148	}
149
150	/**
151	* @param ?\DOMNode $node optional parameter to output a subset of the document
152	*
153	* @throws \RuntimeException if there is an internal error with `DOMDocument`
154	*/
155	private function getHtml(?\DOMNode $node = null): string	224✔
156	{
157	$html = $this->getDomDocument()->saveHTML($node);	224✔
158
159	if (!\is_string($html)) {	224✔
UNCOV 160	throw new \RuntimeException('`DOMDocument::saveHTML()` failed.', 1773018082);	×
161	}
162	return $this->removeSelfClosingTagsClosingTags($html);	224✔
163	}
164
165	/**
166	* Eliminates any invalid closing tags for void elements from the given HTML.
167	*/
168	private function removeSelfClosingTagsClosingTags(string $html): string	224✔
169	{
170	return preg_replace('%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%', '', $html);	224✔
171	}
172
173	/**
174	* Returns the HTML element.
175	*
176	* This method assumes that there always is an HTML element, throwing an exception otherwise.
177	*
178	* @throws \UnexpectedValueException
179	*/
180	protected function getHtmlElement(): \DOMElement	414✔
181	{
182	$htmlElement = $this->getDomDocument()->getElementsByTagName('html')->item(0);	414✔
183	if (!$htmlElement instanceof \DOMElement) {	414✔
UNCOV 184	throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);	×
185	}
186
187	return $htmlElement;	414✔
188	}
189
190	/**
191	* Returns the BODY element.
192	*
193	* This method assumes that there always is a BODY element.
194	*
195	* @throws \RuntimeException
196	*/
197	private function getBodyElement(): \DOMElement	12✔
198	{
199	$node = $this->getDomDocument()->getElementsByTagName('body')->item(0);	12✔
200	if (!$node instanceof \DOMElement) {	12✔
UNCOV 201	throw new \RuntimeException('There is no body element.', 1617922607);	×
202	}
203
204	return $node;	12✔
205	}
206
207	/**
208	* Creates a DOM document from the given HTML and stores it in `$this->domDocument`.
209	*
210	* The DOM document will always have a BODY element and a document type.
211	*/
212	private function createUnifiedDomDocument(string $html): void	611✔
213	{
214	$this->createRawDomDocument($html);	611✔
215	$this->ensureExistenceOfBodyElement();	611✔
216	}
217
218	/**
219	* Creates a DOMDocument instance from the given HTML and stores it in `$this->domDocument`.
220	*/
221	private function createRawDomDocument(string $html): void	611✔
222	{
223	$domDocument = new \DOMDocument();	611✔
224	$domDocument->strictErrorChecking = false;	611✔
225	$domDocument->formatOutput = false;	611✔
226	$libXmlState = \libxml_use_internal_errors(true);	611✔
227	$domDocument->loadHTML($this->prepareHtmlForDomConversion($html), LIBXML_PARSEHUGE);	611✔
228	\libxml_clear_errors();	611✔
229	\libxml_use_internal_errors($libXmlState);	611✔
230
231	$this->setDomDocument($domDocument);	611✔
232	}
233
234	/**
235	* Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
236	* ensuring that the HTML will be good for creating a DOM document from it.
237	*/
238	private function prepareHtmlForDomConversion(string $html): string	611✔
239	{
240	$htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);	611✔
241	$htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);	611✔
242
243	return $this->addContentTypeMetaTag($htmlWithDocumentType);	611✔
244	}
245
246	/**
247	* Makes sure that the passed HTML has a document type, with lowercase "html".
248	*
249	* @return non-empty-string HTML with document type
250	*/
251	private function ensureDocumentType(string $html): string	611✔
252	{
253	$hasDocumentType = \stripos($html, '<!DOCTYPE') !== false;	611✔
254	if ($hasDocumentType) {	611✔
255	return $this->normalizeDocumentType($html);	39✔
256	}
257
258	return self::DEFAULT_DOCUMENT_TYPE . $html;	572✔
259	}
260
261	/**
262	* Makes sure the document type in the passed HTML has lowercase `html`.
263	*
264	* @param non-empty-string $html
265	*
266	* @return non-empty-string HTML with normalized document type
267	*/
268	private function normalizeDocumentType(string $html): string	39✔
269	{
270	// Limit to replacing the first occurrence: as an optimization; and in case an example exists as unescaped text.
271	$result = preg_replace(	39✔
272	'/<!DOCTYPE\\s++html(?=[\\s>])/i',	39✔
273	'<!DOCTYPE html',	39✔
274	$html,	39✔
275	1,	39✔
276	);	39✔
277	\assert($result !== '');	39✔
278
279	return $result;	39✔
280	}
281
282	/**
283	* Adds a Content-Type meta tag for the charset.
284	*
285	* This method also ensures that there is a HEAD element.
286	*
287	* @param non-empty-string $html
288	*
289	* @return non-empty-string
290	*/
291	private function addContentTypeMetaTag(string $html): string	611✔
292	{
293	if ($this->hasContentTypeMetaTagInHead($html)) {	611✔
294	return $html;	374✔
295	}
296
297	// We are trying to insert the meta tag to the right spot in the DOM.
298	// If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
299	$hasHeadTag = preg_match('/<head[\\s>]/i', $html) !== 0;	237✔
300	$hasHtmlTag = \stripos($html, '<html') !== false;	237✔
301
302	if ($hasHeadTag) {	237✔
303	$reworkedHtml = preg_replace(	42✔
304	'/<head(?=[\\s>])([^>]*+)>/i',	42✔
305	'<head$1>' . self::CONTENT_TYPE_META_TAG,	42✔
306	$html,	42✔
307	);	42✔
308	} elseif ($hasHtmlTag) {	195✔
309	$reworkedHtml = preg_replace(	83✔
310	'/<html(.*?)>/is',	83✔
311	'<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',	83✔
312	$html,	83✔
313	);	83✔
314	} else {
315	$reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;	112✔
316	}
317	\assert($reworkedHtml !== '');	237✔
318
319	return $reworkedHtml;	237✔
320	}
321
322	/**
323	* Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element. Due to tag
324	* omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
325	* encountering a start tag for any element which is permitted only within the `<body>`.
326	*/
327	private function hasContentTypeMetaTagInHead(string $html): bool	611✔
328	{
329	preg_match(	611✔
330	'%	611✔
331	(?(DEFINE)
332	# the target `http-equiv` attribute match
333	(?<target_attribute>
334	http-equiv=(["\']?+)Content-Type\\g{-1}
335	# must be followed by one of these characters
336	[\\s/>]
337	)
338	# the target `meta` element match without the opening `<`
339	(?<target>
340	meta(?=\\s)
341	# one or other of these
342	(?:
343	# one or more characters other than `>` or space
344	[^>\\s]++
345	\|
346	# space not followed by the target `http-equiv` attribute
347	\\s(?!(?&target_attribute))
348	)
349	# any number of times (including zero)
350	*+
351	\\s(?&target_attribute)
352	)
353	)
354	# start of `subject`
355	^
356	# one or other of these
357	(?:
358	# one or more characters other than `<`
359	[^<]++
360	\|
361	# `<` not followed by `target`
362	<(?!(?&target))
363	)
364	# any number of times (including zero)
365	*+
366	# followed by the target, not captured
367	(?=<(?&target))
368	%isx',	611✔
369	$html,	611✔
370	$matches,	611✔
371	);	611✔
372	if (isset($matches[0])) {	611✔
373	$htmlBefore = $matches[0];	396✔
374	try {
375	$hasContentTypeMetaTagInHead = !$this->hasEndOfHeadElement($htmlBefore);	396✔
376	} catch (\RuntimeException $exception) {	×
377	// If something unexpected occurs, assume the `Content-Type` that was found is valid.
UNCOV 378	\trigger_error($exception->getMessage());	×
UNCOV 379	$hasContentTypeMetaTagInHead = true;	×
380	}
381	} else {
382	$hasContentTypeMetaTagInHead = false;	215✔
383	}
384
385	return $hasContentTypeMetaTagInHead;	611✔
386	}
387
388	/**
389	* Tests whether the `<head>` element ends within the given HTML. Due to tag omission rules, HTML parsers are
390	* expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
391	* which is permitted only within the `<body>`.
392	*
393	* @throws \RuntimeException
394	*/
395	private function hasEndOfHeadElement(string $html): bool	396✔
396	{
397	if (preg_match('%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w\|</head>%i', $html) !== 0) {	396✔
398	// An exception to the implicit end of the `<head>` is any content within a `<template>` element, as well in
399	// comments. As an optimization, this is only checked for if a potential `<head>` end tag is found.
400	$htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));	70✔
401	$hasEndOfHeadElement = $htmlWithoutCommentsOrTemplates === $html	70✔
402	\|\| $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);	70✔
403	} else {
404	$hasEndOfHeadElement = false;	374✔
405	}
406
407	return $hasEndOfHeadElement;	396✔
408	}
409
410	/**
411	* Removes comments from the given HTML, including any which are unterminated, for which the remainder of the string
412	* is removed.
413	*/
414	private function removeHtmlComments(string $html): string	70✔
415	{
416	return preg_replace(self::HTML_COMMENT_PATTERN, '', $html);	70✔
417	}
418
419	/**
420	* Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
421	* the string is removed.
422	*/
423	private function removeHtmlTemplateElements(string $html): string	70✔
424	{
425	return preg_replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);	70✔
426	}
427
428	/**
429	* Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
430	* self-closing slash.
431	*/
432	private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string	611✔
433	{
434	return preg_replace(	611✔
435	'%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%',	611✔
436	'$0/',	611✔
437	$html,	611✔
438	);	611✔
439	}
440
441	/**
442	* Checks that `$this->domDocument` has a BODY element and adds it if it is missing.
443	*/
444	private function ensureExistenceOfBodyElement(): void	611✔
445	{
446	if ($this->getDomDocument()->getElementsByTagName('body')->item(0) instanceof \DOMElement) {	611✔
447	return;	198✔
448	}
449
450	$this->getHtmlElement()->appendChild($this->getDomDocument()->createElement('body'));	413✔
451	}
452	}

MyIntervals / emogrifier / 25810750037

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous