16949732221

Committed 13 Aug 2025 09:27PM UTC coverage: 96.966% (+0.007%) from 96.959%

Build # 16949732221

Build Type

Pull #1446

github

Committed by

web-flow

Commit Message

Merge 93ecf9ef2 into dc89c2084

Pull Request #1446: [CLEANUP] Tighten types and comments in `AbstractHtmlProcessor`

Run Details

5 of 5 new or added lines in 1 file covered. (100.0%)

3 existing lines in 1 file now uncovered.

863 of 890 relevant lines covered (96.97%)

252.48 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.63

/src/HtmlProcessor/AbstractHtmlProcessor.php

<?php

declare(strict_types=1);

namespace Pelago\Emogrifier\HtmlProcessor;

use Pelago\Emogrifier\Utilities\Preg;

/**
 * Base class for HTML processors that can, e.g., remove, add or modify nodes or attributes.
 *
 * The "vanilla" subclass is the HtmlNormalizer.
 */
abstract class AbstractHtmlProcessor
{
    /**
     * Document type declaration that is prepended to HTML which does not contain one.
     *
     * @var non-empty-string
     */
    protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';

    /**
     * Meta element declaring UTF-8 as the charset.  It is inserted into HTML that does not already have a
     * `Content-Type` meta tag within its `<head>` element.
     *
     * @var non-empty-string
     */
    protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';

    /**
     * Regular expression part to match tag names that PHP's DOMDocument implementation is not aware are self-closing.
     * These are mostly HTML5 elements, but for completeness `<command>` (obsolete) and `<keygen>` (deprecated) are
     * also included.
     *
     * @var non-empty-string
     *
     * @see https://bugs.php.net/bug.php?id=73175
     */
    protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

    /**
     * Regular expression part to match tag names that may appear before the start of the `<body>` element.  A start tag
     * for any other element would implicitly start the `<body>` element due to tag omission rules.
     *
     * @var non-empty-string
     */
    protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
        = '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';

    /**
     * Complete regular expression pattern (including delimiters and modifiers) to match an HTML comment.  An
     * unterminated comment is matched up to the end of the string.
     *
     * @var non-empty-string
     */
    protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';

    /**
     * Complete regular expression pattern (including delimiters and modifiers) to match an HTML `<template>` element.
     * An element without an end tag is matched up to the end of the string.
     *
     * @var non-empty-string
     */
    protected const HTML_TEMPLATE_ELEMENT_PATTERN
        = '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';

    /**
     * The DOM document being processed; `null` until `setDomDocument()` has been called.
     *
     * @var \DOMDocument|null
     */
    protected $domDocument = null;

    /**
     * XPath evaluator for `$domDocument`; it is created together with it in `setDomDocument()`.
     *
     * @var \DOMXPath|null
     */
    private $xPath = null;

    /**
     * The constructor.
     *
     * It is private and does nothing: instances are created exclusively through the named constructors, which also
     * populate the DOM document.  Please use `::fromHtml` or `::fromDomDocument` instead.
     */
    private function __construct() {}

    /**
     * Creates a processor instance for the given HTML markup.
     *
     * @param non-empty-string $unprocessedHtml raw HTML, must be UTF-8-encoded
     *
     * @return static
     *
     * @throws \InvalidArgumentException if $unprocessedHtml is the empty string
     */
    public static function fromHtml(string $unprocessedHtml): self
    {
        // @phpstan-ignore-next-line argument.type We're checking for a contract violation here.
        if ($unprocessedHtml === '') {
            throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);
        }

        // `new static()` makes sure that the concrete subclass is instantiated.
        $processor = new static();
        $processor->setHtml($unprocessedHtml);

        return $processor;
    }

    /**
     * Creates a processor instance that works on the given DOM document.
     *
     * The document is stored as is, i.e., it is neither copied nor normalized.
     *
     * @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
     *
     * @return static
     */
    public static function fromDomDocument(\DOMDocument $document): self
    {
        // `new static()` makes sure that the concrete subclass is instantiated.
        $processor = new static();
        $processor->setDomDocument($document);

        return $processor;
    }

    /**
     * Sets the HTML to process.
     *
     * The HTML is converted to a DOM document via `createUnifiedDomDocument()`, which stores the result in this
     * instance.
     *
     * @param string $html the HTML to process, must be UTF-8-encoded
     */
    private function setHtml(string $html): void
    {
        $this->createUnifiedDomDocument($html);
    }

    /**
     * Returns the internal DOMDocument that represents the HTML in its current state.
     *
     * @throws \UnexpectedValueException if no DOM document has been set yet
     */
    public function getDomDocument(): \DOMDocument
    {
        $document = $this->domDocument;
        if ($document instanceof \DOMDocument) {
            return $document;
        }

        throw new \UnexpectedValueException(
            self::class . '::setDomDocument() has not yet been called on ' . static::class,
            1570472239
        );
    }

    /**
     * Stores the given DOM document and creates the XPath evaluator that operates on it.
     */
    private function setDomDocument(\DOMDocument $domDocument): void
    {
        $this->domDocument = $domDocument;
        // The XPath instance is bound to the very same document that has just been stored.
        $this->xPath = new \DOMXPath($domDocument);
    }

    /**
     * Returns the XPath evaluator for the current DOM document.
     *
     * @throws \UnexpectedValueException if no DOM document (and hence no XPath evaluator) has been set yet
     */
    protected function getXPath(): \DOMXPath
    {
        $xPath = $this->xPath;
        if ($xPath instanceof \DOMXPath) {
            return $xPath;
        }

        throw new \UnexpectedValueException(
            self::class . '::setDomDocument() has not yet been called on ' . static::class,
            1617819086
        );
    }

    /**
     * Renders the normalized and processed HTML.
     *
     * @throws \UnexpectedValueException if no DOM document has been set, or if the document cannot be serialized
     */
    public function render(): string
    {
        $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML();
        // `DOMDocument::saveHTML()` returns `false` on failure.  Passing that on to a `string` parameter would raise
        // an unhelpful `TypeError` under `strict_types`, so the failure is reported explicitly instead.
        if (!\is_string($htmlWithPossibleErroneousClosingTags)) {
            throw new \UnexpectedValueException('The DOM document could not be serialized to HTML.', 1755120001);
        }

        return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);
    }

    /**
     * Renders the content of the BODY element of the normalized and processed HTML.
     *
     * The `<body>` start and end tags themselves are stripped from the result.
     *
     * @throws \RuntimeException if there is no BODY element
     * @throws \UnexpectedValueException if no DOM document has been set, or if the BODY element cannot be serialized
     */
    public function renderBodyContent(): string
    {
        $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML($this->getBodyElement());
        // `DOMDocument::saveHTML()` returns `false` on failure.  Passing that on to a `string` parameter would raise
        // an unhelpful `TypeError` under `strict_types`, so the failure is reported explicitly instead.
        if (!\is_string($htmlWithPossibleErroneousClosingTags)) {
            throw new \UnexpectedValueException('The BODY element could not be serialized to HTML.', 1755120002);
        }
        $bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);

        return (new Preg())->replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml);
    }

    /**
     * Strips the end tags of those void elements which PHP's DOMDocument does not recognize as void (and hence
     * renders with an invalid end tag) from the given HTML.
     */
    private function removeSelfClosingTagsClosingTags(string $html): string
    {
        $closingTagPattern = '%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%';

        return (new Preg())->replace($closingTagPattern, '', $html);
    }

    /**
     * Returns the first HTML element of the DOM document.
     *
     * The document is expected to always have an HTML element; its absence is reported with an exception.
     *
     * @throws \UnexpectedValueException if there is no HTML element, or if no DOM document has been set
     */
    protected function getHtmlElement(): \DOMElement
    {
        $candidate = $this->getDomDocument()->getElementsByTagName('html')->item(0);
        if ($candidate instanceof \DOMElement) {
            return $candidate;
        }

        throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);
    }

    /**
     * Returns the first BODY element of the DOM document.
     *
     * The document is expected to always have a BODY element; its absence is reported with an exception.
     *
     * @throws \RuntimeException if there is no BODY element
     */
    private function getBodyElement(): \DOMElement
    {
        $candidate = $this->getDomDocument()->getElementsByTagName('body')->item(0);
        if ($candidate instanceof \DOMElement) {
            return $candidate;
        }

        throw new \RuntimeException('There is no body element.', 1617922607);
    }

    /**
     * Creates a DOM document from the given HTML and stores it in $this->domDocument.
     *
     * The DOM document will always have a BODY element and a document type.
     */
    private function createUnifiedDomDocument(string $html): void
    {
        // The document must be created (and stored) first, as the BODY check operates on the stored document.
        $this->createRawDomDocument($html);
        $this->ensureExistenceOfBodyElement();
    }

    /**
     * Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument.
     *
     * While the HTML is loaded, libxml errors are collected internally (and then discarded) instead of being emitted
     * as PHP warnings.  The previous libxml error-handling mode is restored afterwards, even if preparing or loading
     * the HTML throws.
     */
    private function createRawDomDocument(string $html): void
    {
        $domDocument = new \DOMDocument();
        $domDocument->strictErrorChecking = false;
        $domDocument->formatOutput = false;

        $libXmlState = \libxml_use_internal_errors(true);
        try {
            $domDocument->loadHTML($this->prepareHtmlForDomConversion($html));
        } finally {
            // Without the `finally`, an exception (e.g., from an error handler that converts PHP errors to
            // exceptions) would leave libxml's internal error handling enabled for all subsequent code.
            \libxml_clear_errors();
            \libxml_use_internal_errors($libXmlState);
        }

        $this->setDomDocument($domDocument);
    }

    /**
     * Prepares the given HTML for being loaded into a DOM document.
     *
     * In this order, the HTML gets self-closing slashes for void elements unknown to PHP, a document type, and a
     * Content-Type meta tag, each only if needed.
     */
    private function prepareHtmlForDomConversion(string $html): string
    {
        return $this->addContentTypeMetaTag(
            $this->ensureDocumentType(
                $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html)
            )
        );
    }

    /**
     * Returns the passed HTML with a document type that has lowercase "html".
     *
     * If the HTML contains `<!DOCTYPE` anywhere (compared case-insensitively), the existing document type is
     * normalized; otherwise, the default document type is prepended.
     *
     * @return non-empty-string HTML with document type
     */
    private function ensureDocumentType(string $html): string
    {
        if (\stripos($html, '<!DOCTYPE') === false) {
            return self::DEFAULT_DOCUMENT_TYPE . $html;
        }

        return $this->normalizeDocumentType($html);
    }

    /**
     * Rewrites the document type in the passed HTML to `<!DOCTYPE html`, i.e., with lowercase `html` and a single
     * space before it.
     *
     * @return ($html is non-empty-string ? non-empty-string : string) HTML with normalized document type
     */
    private function normalizeDocumentType(string $html): string
    {
        $documentTypePattern = '/<!DOCTYPE\\s++html(?=[\\s>])/i';
        $normalizedDocumentType = '<!DOCTYPE html';

        // Only the first occurrence is replaced: this is an optimization, and it also leaves alone any example that
        // exists as unescaped text further down in the HTML.
        return (new Preg())->replace($documentTypePattern, $normalizedDocumentType, $html, 1);
    }

    /**
     * Adds a Content-Type meta tag for the charset, unless the HTML already has one within its HEAD element.
     *
     * The tag is inserted right after the `<head>` start tag if there is one.  Otherwise, a new HEAD element with the
     * tag is inserted right after the `<html>` start tag if there is one, and failing that, the tag is prepended to
     * the HTML.
     *
     * @return non-empty-string
     */
    private function addContentTypeMetaTag(string $html): string
    {
        if ($this->hasContentTypeMetaTagInHead($html)) {
            \assert($html !== '');
            return $html;
        }

        // The meta tag needs to go to the right spot in the DOM: if it were just prepended to the HTML, attributes
        // set to the HTML tag would get lost.
        if ((new Preg())->match('/<head[\\s>]/i', $html) !== 0) {
            $htmlWithMetaTag = (new Preg())->replace(
                '/<head(?=[\\s>])([^>]*+)>/i',
                '<head$1>' . self::CONTENT_TYPE_META_TAG,
                $html
            );
        } elseif (\stripos($html, '<html') !== false) {
            $htmlWithMetaTag = (new Preg())->replace(
                '/<html(.*?)>/is',
                '<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',
                $html
            );
        } else {
            $htmlWithMetaTag = self::CONTENT_TYPE_META_TAG . $html;
        }
        \assert($htmlWithMetaTag !== '');

        return $htmlWithMetaTag;
    }

    /**
     * Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element.  Due to tag
     * omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
     * encountering a start tag for any element which is permitted only within the `<body>`.
     *
     * If checking the HTML preceding the meta tag fails with an exception, a PHP notice is triggered and the meta
     * tag is assumed to be valid.
     */
    private function hasContentTypeMetaTagInHead(string $html): bool
    {
        // The match consists of everything that precedes the first `Content-Type` meta tag.
        (new Preg())->match(
            '%^.*?(?=<meta(?=\\s)[^>]*\\shttp-equiv=(["\']?+)Content-Type\\g{-1}[\\s/>])%is',
            $html,
            $matches
        );
        if (!isset($matches[0])) {
            return false;
        }

        try {
            // The meta tag is within the HEAD element only if the HEAD element has not ended before it.
            return !$this->hasEndOfHeadElement($matches[0]);
        } catch (\RuntimeException $exception) {
            // If something unexpected occurs, assume the `Content-Type` that was found is valid.
            \trigger_error($exception->getMessage());

            return true;
        }
    }

    /**
     * Tests whether the `<head>` element ends within the given HTML.  Due to tag omission rules, HTML parsers are
     * expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
     * which is permitted only within the `<body>`.
     *
     * @throws \RuntimeException
     */
    private function hasEndOfHeadElement(string $html): bool
    {
        $endOfHeadPattern = '%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head>%i';
        if ((new Preg())->match($endOfHeadPattern, $html) === 0) {
            return false;
        }

        // Content within a `<template>` element or within comments does not end the `<head>`.  As an optimization,
        // such content is only stripped once a potential end of the `<head>` has been found.
        $htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));
        if ($htmlWithoutCommentsOrTemplates === $html) {
            // Nothing was stripped, so the potential end that has been found is a real one.
            return true;
        }

        return $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);
    }

    /**
     * Strips all comments from the given HTML.  For an unterminated comment, everything from its start to the end of
     * the string is stripped.
     *
     * @throws \RuntimeException
     */
    private function removeHtmlComments(string $html): string
    {
        $throwingPreg = (new Preg())->throwExceptions(true);

        return $throwingPreg->replace(self::HTML_COMMENT_PATTERN, '', $html);
    }

    /**
     * Strips all `<template>` elements from the given HTML.  For an element without an end tag, everything from its
     * start tag to the end of the string is stripped.
     *
     * @throws \RuntimeException
     */
    private function removeHtmlTemplateElements(string $html): string
    {
        $throwingPreg = (new Preg())->throwExceptions(true);

        return $throwingPreg->replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);
    }

    /**
     * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
     * self-closing slash.
     *
     * Tag names are matched case-insensitively, as HTML tag names are not case-sensitive (e.g., `<WBR>` is the same
     * element as `<wbr>`).
     */
    private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string
    {
        // The pattern matches a start tag up to (but excluding) its closing `>`, provided that the tag does not
        // already end with a slash; the replacement then appends the slash.
        return (new Preg())->replace(
            '%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%i',
            '$0/',
            $html
        );
    }

    /**
     * Appends an empty BODY element to the HTML element of $this->domDocument, unless the document already has a
     * BODY element.
     *
     * @throws \UnexpectedValueException if there is no DOM document or no HTML element
     */
    private function ensureExistenceOfBodyElement(): void
    {
        $document = $this->getDomDocument();
        $hasBodyElement = $document->getElementsByTagName('body')->item(0) instanceof \DOMElement;

        if (!$hasBodyElement) {
            $this->getHtmlElement()->appendChild($document->createElement('body'));
        }
    }
}

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Pelago\Emogrifier\HtmlProcessor;
6
7	use Pelago\Emogrifier\Utilities\Preg;
8
9	/**
10	* Base class for HTML processor that e.g., can remove, add or modify nodes or attributes.
11	*
12	* The "vanilla" subclass is the HtmlNormalizer.
13	*/
14	abstract class AbstractHtmlProcessor
15	{
16	/**
17	* @var non-empty-string
18	*/
19	protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';
20
21	/**
22	* @var non-empty-string
23	*/
24	protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';
25
26	/**
27	* @var non-empty-string Regular expression part to match tag names that PHP's DOMDocument implementation is not
28	* aware are self-closing. These are mostly HTML5 elements, but for completeness `<command>` (obsolete) and
29	* `<keygen>` (deprecated) are also included.
30	*
31	* @see https://bugs.php.net/bug.php?id=73175
32	*/
33	protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command\|embed\|keygen\|source\|track\|wbr)';
34
35	/**
36	* Regular expression part to match tag names that may appear before the start of the `<body>` element. A start tag
37	* for any other element would implicitly start the `<body>` element due to tag omission rules.
38	*
39	* @var non-empty-string
40	*/
41	protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
42	= '(?:html\|head\|base\|command\|link\|meta\|noscript\|script\|style\|template\|title)';
43
44	/**
45	* regular expression pattern to match an HTML comment, including delimiters and modifiers
46	*
47	* @var non-empty-string
48	*/
49	protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';
50
51	/**
52	* regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers
53	*
54	* @var non-empty-string
55	*/
56	protected const HTML_TEMPLATE_ELEMENT_PATTERN
57	= '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';
58
59	/**
60	* @var \DOMDocument\|null
61	*/
62	protected $domDocument = null;
63
64	/**
65	* @var \DOMXPath\|null
66	*/
67	private $xPath = null;
68
69	/**
70	* The constructor.
71	*
72	* Please use `::fromHtml` or `::fromDomDocument` instead.
73	*/
74	private function __construct() {}
75
76	/**
77	* Builds a new instance from the given HTML.
78	*
79	* @param non-empty-string $unprocessedHtml raw HTML, must be UTF-encoded
80	*
81	* @return static
82	*
83	* @throws \InvalidArgumentException if $unprocessedHtml is anything other than a non-empty string
84	*/
85	public static function fromHtml(string $unprocessedHtml): self	595✔
86	{
87	// @phpstan-ignore-next-line argument.type We're checking for a contract violation here.
88	if ($unprocessedHtml === '') {	595✔
89	throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);	1✔
90	}
91
92	$instance = new static();	594✔
93	$instance->setHtml($unprocessedHtml);	594✔
94
95	return $instance;	594✔
96	}
97
98	/**
99	* Builds a new instance from the given DOM document.
100	*
101	* @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
102	*
103	* @return static
104	*/
105	public static function fromDomDocument(\DOMDocument $document): self	4✔
106	{
107	$instance = new static();	4✔
108	$instance->setDomDocument($document);	4✔
109
110	return $instance;	4✔
111	}
112
113	/**
114	* Sets the HTML to process.
115	*
116	* @param string $html the HTML to process, must be UTF-8-encoded
117	*/
118	private function setHtml(string $html): void	594✔
119	{
120	$this->createUnifiedDomDocument($html);	594✔
121	}
122
123	/**
124	* Provides access to the internal DOMDocument representation of the HTML in its current state.
125	*
126	* @throws \UnexpectedValueException
127	*/
128	public function getDomDocument(): \DOMDocument	596✔
129	{
130	if (!$this->domDocument instanceof \DOMDocument) {	596✔
131	$message = self::class . '::setDomDocument() has not yet been called on ' . static::class;	×
132	throw new \UnexpectedValueException($message, 1570472239);	×
133	}
134
135	return $this->domDocument;	596✔
136	}
137
138	private function setDomDocument(\DOMDocument $domDocument): void	598✔
139	{
140	$this->domDocument = $domDocument;	598✔
141	$this->xPath = new \DOMXPath($this->domDocument);	598✔
142	}
143
144	/**
145	* @throws \UnexpectedValueException
146	*/
147	protected function getXPath(): \DOMXPath	×
148	{
149	if (!$this->xPath instanceof \DOMXPath) {	×
150	$message = self::class . '::setDomDocument() has not yet been called on ' . static::class;	×
151	throw new \UnexpectedValueException($message, 1617819086);	×
152	}
153
154	return $this->xPath;	×
155	}
156
157	/**
158	* Renders the normalized and processed HTML.
159	*/
160	public function render(): string	205✔
161	{
162	$htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML();	205✔
163
164	return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);	205✔
165	}
166
167	/**
168	* Renders the content of the BODY element of the normalized and processed HTML.
169	*/
170	public function renderBodyContent(): string	12✔
171	{
172	$htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML($this->getBodyElement());	12✔
173	$bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);	12✔
174
175	return (new Preg())->replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml);	12✔
176	}
177
178	/**
179	* Eliminates any invalid closing tags for void elements from the given HTML.
180	*/
181	private function removeSelfClosingTagsClosingTags(string $html): string	217✔
182	{
183	return (new Preg())->replace('%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%', '', $html);	217✔
184	}
185
186	/**
187	* Returns the HTML element.
188	*
189	* This method assumes that there always is an HTML element, throwing an exception otherwise.
190	*
191	* @throws \UnexpectedValueException
192	*/
193	protected function getHtmlElement(): \DOMElement	414✔
194	{
195	$htmlElement = $this->getDomDocument()->getElementsByTagName('html')->item(0);	414✔
196	if (!$htmlElement instanceof \DOMElement) {	414✔
197	throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);	×
198	}
199
200	return $htmlElement;	414✔
201	}
202
203	/**
204	* Returns the BODY element.
205	*
206	* This method assumes that there always is a BODY element.
207	*
208	* @throws \RuntimeException
209	*/
210	private function getBodyElement(): \DOMElement	12✔
211	{
212	$node = $this->getDomDocument()->getElementsByTagName('body')->item(0);	12✔
213	if (!$node instanceof \DOMElement) {	12✔
214	throw new \RuntimeException('There is no body element.', 1617922607);	×
215	}
216
217	return $node;	12✔
218	}
219
220	/**
221	* Creates a DOM document from the given HTML and stores it in $this->domDocument.
222	*
223	* The DOM document will always have a BODY element and a document type.
224	*/
225	private function createUnifiedDomDocument(string $html): void	594✔
226	{
227	$this->createRawDomDocument($html);	594✔
228	$this->ensureExistenceOfBodyElement();	594✔
229	}
230
231	/**
232	* Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument.
233	*/
234	private function createRawDomDocument(string $html): void	594✔
235	{
236	$domDocument = new \DOMDocument();	594✔
237	$domDocument->strictErrorChecking = false;	594✔
238	$domDocument->formatOutput = false;	594✔
239	$libXmlState = \libxml_use_internal_errors(true);	594✔
240	$domDocument->loadHTML($this->prepareHtmlForDomConversion($html));	594✔
241	\libxml_clear_errors();	594✔
242	\libxml_use_internal_errors($libXmlState);	594✔
243
244	$this->setDomDocument($domDocument);	594✔
245	}
246
247	/**
248	* Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
249	* ensuring that the HTML will be good for creating a DOM document from it.
250	*/
251	private function prepareHtmlForDomConversion(string $html): string	594✔
252	{
253	$htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);	594✔
254	$htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);	594✔
255
256	return $this->addContentTypeMetaTag($htmlWithDocumentType);	594✔
257	}
258
259	/**
260	* Makes sure that the passed HTML has a document type, with lowercase "html".
261	*
262	* @return non-empty-string HTML with document type
263	*/
264	private function ensureDocumentType(string $html): string	594✔
265	{
266	$hasDocumentType = \stripos($html, '<!DOCTYPE') !== false;	594✔
267	if ($hasDocumentType) {	594✔
268	return $this->normalizeDocumentType($html);	37✔
269	}
270
271	return self::DEFAULT_DOCUMENT_TYPE . $html;	557✔
272	}
273
274	/**
275	* Makes sure the document type in the passed HTML has lowercase `html`.
276	*
277	* @return ($html is non-empty-string ? non-empty-string : string) HTML with normalized document type
278	*/
279	private function normalizeDocumentType(string $html): string	37✔
280	{
281	// Limit to replacing the first occurrence: as an optimization; and in case an example exists as unescaped text.
282	return (new Preg())->replace(	37✔
283	'/<!DOCTYPE\\s++html(?=[\\s>])/i',	37✔
284	'<!DOCTYPE html',	37✔
285	$html,	37✔
286	1	37✔
287	);	37✔
288	}
289
290	/**
291	* Adds a Content-Type meta tag for the charset.
292	*
293	* This method also ensures that there is a HEAD element.
294	*
295	* @return non-empty-string
296	*/
297	private function addContentTypeMetaTag(string $html): string	594✔
298	{
299	if ($this->hasContentTypeMetaTagInHead($html)) {	594✔
300	\assert($html !== '');	373✔
301	return $html;	373✔
302	}
303
304	// We are trying to insert the meta tag to the right spot in the DOM.
305	// If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
306	$hasHeadTag = (new Preg())->match('/<head[\\s>]/i', $html) !== 0;	221✔
307	$hasHtmlTag = \stripos($html, '<html') !== false;	221✔
308
309	if ($hasHeadTag) {	221✔
310	$reworkedHtml = (new Preg())->replace(	36✔
311	'/<head(?=[\\s>])([^>]*+)>/i',	36✔
312	'<head$1>' . self::CONTENT_TYPE_META_TAG,	36✔
313	$html	36✔
314	);	36✔
315	} elseif ($hasHtmlTag) {	185✔
316	$reworkedHtml = (new Preg())->replace(	82✔
317	'/<html(.*?)>/is',	82✔
318	'<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',	82✔
319	$html	82✔
320	);	82✔
321	} else {
322	$reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;	103✔
323	}
324	\assert($reworkedHtml !== '');	221✔
325
326	return $reworkedHtml;	221✔
327	}
328
329	/**
330	* Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element. Due to tag
331	* omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
332	* encountering a start tag for any element which is permitted only within the `<body>`.
333	*/
334	private function hasContentTypeMetaTagInHead(string $html): bool	594✔
335	{
336	(new Preg())->match(	594✔
337	'%^.*?(?=<meta(?=\\s)[^>]*\\shttp-equiv=(["\']?+)Content-Type\\g{-1}[\\s/>])%is',	594✔
338	$html,	594✔
339	$matches	594✔
340	);	594✔
341	if (isset($matches[0])) {	594✔
342	$htmlBefore = $matches[0];	395✔
343	try {
344	$hasContentTypeMetaTagInHead = !$this->hasEndOfHeadElement($htmlBefore);	395✔
UNCOV 345	} catch (\RuntimeException $exception) {	×
346	// If something unexpected occurs, assume the `Content-Type` that was found is valid.
UNCOV 347	\trigger_error($exception->getMessage());	×
UNCOV 348	$hasContentTypeMetaTagInHead = true;	×
349	}
350	} else {
351	$hasContentTypeMetaTagInHead = false;	199✔
352	}
353
354	return $hasContentTypeMetaTagInHead;	594✔
355	}
356
357	/**
358	* Tests whether the `<head>` element ends within the given HTML. Due to tag omission rules, HTML parsers are
359	* expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
360	* which is permitted only within the `<body>`.
361	*
362	* @throws \RuntimeException
363	*/
364	private function hasEndOfHeadElement(string $html): bool	395✔
365	{
366	if (
367	(new Preg())->match('%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w\|</head>%i', $html)	395✔
368	!== 0
369	) {
370	// An exception to the implicit end of the `<head>` is any content within a `<template>` element, as well in
371	// comments. As an optimization, this is only checked for if a potential `<head>` end tag is found.
372	$htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));	70✔
373	$hasEndOfHeadElement = $htmlWithoutCommentsOrTemplates === $html	70✔
374	\|\| $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);	70✔
375	} else {
376	$hasEndOfHeadElement = false;	373✔
377	}
378
379	return $hasEndOfHeadElement;	395✔
380	}
381
382	/**
383	* Removes comments from the given HTML, including any which are unterminated, for which the remainder of the string
384	* is removed.
385	*
386	* @throws \RuntimeException
387	*/
388	private function removeHtmlComments(string $html): string	70✔
389	{
390	return (new Preg())->throwExceptions(true)->replace(self::HTML_COMMENT_PATTERN, '', $html);	70✔
391	}
392
393	/**
394	* Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
395	* the string is removed.
396	*
397	* @throws \RuntimeException
398	*/
399	private function removeHtmlTemplateElements(string $html): string	70✔
400	{
401	return (new Preg())->throwExceptions(true)->replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);	70✔
402	}
403
404	/**
405	* Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
406	* self-closing slash.
407	*/
408	private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string	594✔
409	{
410	return (new Preg())->replace(	594✔
411	'%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%',	594✔
412	'$0/',	594✔
413	$html	594✔
414	);	594✔
415	}
416
417	/**
418	* Checks that $this->domDocument has a BODY element and adds it if it is missing.
419	*
420	* @throws \UnexpectedValueException
421	*/
422	private function ensureExistenceOfBodyElement(): void	594✔
423	{
424	if ($this->getDomDocument()->getElementsByTagName('body')->item(0) instanceof \DOMElement) {	594✔
425	return;	181✔
426	}
427
428	$this->getHtmlElement()->appendChild($this->getDomDocument()->createElement('body'));	413✔
429	}
430	}

MyIntervals / emogrifier / 16949732221

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous