• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

voku / simple_html_dom / 25623212060

10 May 2026 07:44AM UTC coverage: 96.766% (-0.03%) from 96.791%
25623212060

Pull #145

github

web-flow
Merge 477330a45 into 7d12f7c70
Pull Request #145: Add Infection + PHPStan CI check on PHP 8.3 and fix PHP 7.x template serialization

8 of 39 new or added lines in 1 file covered. (20.51%)

3 existing lines in 2 files now uncovered.

2184 of 2257 relevant lines covered (96.77%)

301.85 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.46
/src/voku/helper/HtmlDomParser.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace voku\helper;
6

7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var callable|null
38
     *
39
     * @phpstan-var null|callable(string $cssSelectorString, string $xPathString, \DOMXPath, \voku\helper\HtmlDomParser): string
40
     */
41
    private $callbackXPathBeforeQuery;
42

43
    /**
44
     * @var callable|null
45
     *
46
     * @phpstan-var null|callable(string $htmlString, \voku\helper\HtmlDomParser): string
47
     */
48
    private $callbackBeforeCreateDom;
49

50
    /**
51
     * @var string[]
52
     */
53
    protected static $functionAliases = [
54
        'outertext' => 'html',
55
        'outerhtml' => 'html',
56
        'innertext' => 'innerHtml',
57
        'innerhtml' => 'innerHtml',
58
        'load'      => 'loadHtml',
59
        'load_file' => 'loadHtmlFile',
60
    ];
61

62
    /**
63
     * @var string[]
64
     */
65
    protected $templateLogicSyntaxInSpecialScriptTags = [
66
        '+',
67
        '<%',
68
        '{%',
69
        '{{',
70
    ];
71

72
    /**
73
     * Script "type" values that mark a <script> tag as a special (template) script tag.
74
     *
75
     * ```php
76
     * protected $specialScriptTags = [
77
     *     'text/html',
78
     *     'text/template',
79
     *     'text/x-custom-template',
80
     *     'text/x-handlebars-template'
81
     * ]
82
     * ```
83
     *
84
     * @var string[]
85
     */
86
    protected $specialScriptTags = [
87
        'text/html',
88
        'text/template',
89
        'text/x-custom-template',
90
        'text/x-handlebars-template',
91
    ];
92

93
    /**
94
     * @var string[]
95
     */
96
    protected $selfClosingTags = [
97
        'area',
98
        'base',
99
        'br',
100
        'col',
101
        'command',
102
        'embed',
103
        'hr',
104
        'img',
105
        'input',
106
        'keygen',
107
        'link',
108
        'meta',
109
        'param',
110
        'source',
111
        'track',
112
        'wbr',
113
    ];
114

115
    /**
116
     * @var bool
117
     */
118
    protected $isDOMDocumentCreatedWithoutHtml = false;
119

120
    /**
121
     * @var bool
122
     */
123
    protected $isDOMDocumentCreatedWithoutWrapper = false;
124

125
    /**
126
     * @var bool
127
     */
128
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
129

130
    /**
131
     * @var bool
132
     */
133
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
134

135
    /**
136
     * @var bool
137
     */
138
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
139

140
    /**
141
     * @var bool
142
     */
143
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
144

145
    /**
146
     * @var bool
147
     */
148
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
149

150
    /**
151
     * @var bool
152
     */
153
    protected $isDOMDocumentCreatedWithMultiRoot = false;
154

155
    /**
156
     * @var bool
157
     */
158
    protected $isDOMDocumentCreatedWithEdgeWhitespace = false;
159

160
    /**
161
     * @var bool
162
     */
163
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
164

165
    /**
166
     * @var bool
167
     */
168
    protected $createdFromNode = false;
169

170
    /**
171
     * @var bool
172
     */
173
    protected $keepBrokenHtml = false;
174

175
    /**
176
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
177
     */
178
    public function __construct($element = null)
179
    {
180
        $this->document = new \DOMDocument('1.0', $this->getEncoding());
2,382✔
181

182
        // DOMDocument settings
183
        $this->document->preserveWhiteSpace = true;
2,382✔
184
        $this->document->formatOutput = false;
2,382✔
185

186
        if ($element instanceof SimpleHtmlDomInterface) {
2,382✔
187
            $element = $element->getNode();
833✔
188
        }
189

190
        if ($element instanceof \DOMDocument) {
2,382✔
191
            $html = $element->saveHTML();
7✔
192
            if ($html !== false) {
7✔
193
                $this->loadHtml($html);
7✔
194
            }
195

196
            return;
7✔
197
        }
198

199
        if ($element instanceof \DOMNode) {
2,382✔
200
            $this->createdFromNode = true;
896✔
201

202
            $domNode = $this->document->importNode($element, true);
896✔
203

204
            // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
205
            if ($domNode instanceof \DOMNode) {
889✔
206
                $this->document->appendChild($domNode);
889✔
207
            }
208

209
            return;
889✔
210
        }
211

212
        if ($element !== null) {
2,375✔
213
            $this->loadHtml($element);
756✔
214
        }
215
    }
216

217
    /**
     * Dispatch legacy instance-method aliases (see self::$functionAliases).
     *
     * The method name is matched case-insensitively; the lower-cased name is
     * also what appears in the exception message.
     *
     * @param string       $name
     * @param array<mixed> $arguments
     *
     * @throws \BadMethodCallException if no alias is registered for $name
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = self::$functionAliases[$alias];

        return $this->{$target}(...$arguments);
    }
235

236
    /**
     * Dispatch the legacy static factories "str_get_html" and "file_get_html".
     *
     * The method name is matched case-insensitively, consistent with __call().
     * A missing first argument defaults to an empty string and a missing
     * second argument (extra libxml options) defaults to null.
     *
     * @param string       $name
     * @param array<mixed> $arguments
     *
     * @throws \BadMethodCallException if $name is not one of the supported factories
     * @throws \RuntimeException
     *
     * @return static
     */
    public static function __callStatic($name, $arguments)
    {
        $arguments0 = $arguments[0] ?? '';

        $arguments1 = $arguments[1] ?? null;

        // PHP method names are case-insensitive, so normalize before matching.
        $method = \strtolower($name);

        if ($method === 'str_get_html') {
            $parser = self::createStaticParser();

            return $parser->loadHtml($arguments0, $arguments1);
        }

        if ($method === 'file_get_html') {
            $parser = self::createStaticParser();

            return $parser->loadHtmlFile($arguments0, $arguments1);
        }

        // Include the requested name so the failing call can be identified.
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
265

266
    /**
     * Instantiate the called class (late static binding) without any input.
     *
     * @return static
     */
    private static function createStaticParser()
    {
        // @phpstan-ignore new.static (factory methods intentionally preserve late static binding)
        $parser = new static();

        return $parser;
    }
274

275
    /** @noinspection MagicMethodsValidityInspection */
276

277
    /**
     * Resolve the legacy read-only pseudo properties (case-insensitive).
     *
     * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" to
     * innerHtml(), "innerhtmlkeep" to innerHtml(false, false) and
     * "text"/"plaintext" to text(). Any other name yields null.
     *
     * @param string $name
     *
     * @return string|null
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'innerhtmlkeep') {
            return $this->innerHtml(false, false);
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
302

303
    /**
     * String conversion returns the same value as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
310

311
    /**
     * No-op kept only for API compatibility; always reports success.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        $cleared = true;

        return $cleared;
    }
322

323
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order:
     * - reset the dynamic helper state and run the optional "before create DOM" callback,
     * - strip any non-whitespace content that precedes the DOCTYPE,
     * - record which wrappers (html / head / body / p, comment, edge whitespace,
     *   fake end-script) the input came with, so they can be considered later,
     * - protect special script tags and svg tags, normalize self-closing tags,
     * - try a strict parse via simplexml_load_string() and fall back to
     *   DOMDocument::loadHTML() (with the "xml encoding" hack) when that fails
     *   or reports libxml errors,
     * - restore the libxml error / entity-loader settings that were changed.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions      extra libxml option bits, OR-ed into the options in use
     * @param bool     $useDefaultLibXMLOptions when true, start from LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET
     *                                          plus LIBXML_BIGLINES, LIBXML_COMPACT and LIBXML_HTML_NODEFDTD when defined
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): \DOMDocument
    {
        $this->resetDynamicDomHelpers();

        if ($this->callbackBeforeCreateDom) {
            $html = \call_user_func($this->callbackBeforeCreateDom, $html, $this);
        }

        // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
        $isDOMDocumentCreatedWithDoctype = false;
        if (\stripos($html, '<!DOCTYPE') !== false) {
            $isDOMDocumentCreatedWithDoctype = true;
            if (
                \preg_match('/(^.*?)<!DOCTYPE(?: [^>]*)?>/sui', $html, $matches_before_doctype)
                &&
                \trim($matches_before_doctype[1])
            ) {
                // Group 1 is anchored at the start of the input, so cut exactly
                // that prefix. A str_replace() here would also delete every later
                // occurrence of the same text inside the document.
                $html = (string) \substr($html, \strlen($matches_before_doctype[1]));
            }
        }

        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            // plain text, no markup at all
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            // markup that starts with text instead of a tag
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        // A fragment with leading / trailing whitespace whose first element is
        // properly closed and which contains at least two closing tags.
        if (
            $this->isDOMDocumentCreatedWithoutHtmlWrapper
            &&
            $this->isDOMDocumentCreatedWithoutBodyWrapper
            &&
            \trim($html) !== $html
            &&
            \substr_count($html, '</') >= 2
            &&
            \preg_match('#^\s*<([a-zA-Z][^\\s>/]*)>.*?</\\1>#su', $html) === 1
        ) {
            $this->isDOMDocumentCreatedWithEdgeWhitespace = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \stripos($html, '<p ') === false
            &&
            \stripos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // Only an escaped end-script ("<\/script>") is present, no real one.
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\stripos($html, '</html>') !== false) {
            // Move non-whitespace content that follows "</html>" in front of it.
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (
                \preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
                &&
                \trim($matches_after_html[1])
            ) {
                $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
            }
        }

        if (\strpos($html, '<script') !== false) {
            // keepSpecialScriptTags must run before html5FallbackForScriptTags so
            // that special-type scripts (type="text/html", etc.) are converted to
            // the simplevokuspecialscript placeholder element before the script-tag
            // regex runs.  On PHP < 8.0 the regex uses hash placeholders; if it
            // ran first the special-script content would be hashed and
            // keepSpecialScriptTags would only see the hash, losing the ability to
            // pass the real HTML content to the DOM for error-recovery parsing.
            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);

                    break;
                }
            }

            $this->html5FallbackForScriptTags($html);
        }

        if (\strpos($html, '<svg') !== false) {
            $this->keepSpecialSvgTags($html);
        }

        // Rewrite bare void tags ("<br>") into their self-closed form ("<br/>").
        $html = \str_replace(
            \array_map(static function ($e) {
                return '<' . $e . '>';
            }, $this->selfClosingTags),
            \array_map(static function ($e) {
                return '<' . $e . '/>';
            }, $this->selfClosingTags),
            $html
        );

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        $optionsXml = 0;
        if ($useDefaultLibXMLOptions) {
            $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

            if (\defined('LIBXML_BIGLINES')) {
                $optionsXml |= \LIBXML_BIGLINES;
            }

            if (\defined('LIBXML_COMPACT')) {
                $optionsXml |= \LIBXML_COMPACT;
            }

            if (\defined('LIBXML_HTML_NODEFDTD')) {
                $optionsXml |= \LIBXML_HTML_NODEFDTD;
            }
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutHtmlWrapper
            &&
            $this->isDOMDocumentCreatedWithoutBodyWrapper
        ) {
            $this->isDOMDocumentCreatedWithMultiRoot = $this->hasMultipleTopLevelNodes($html, $optionsXml);
        }

        // Wrap the input in the helper element whenever it has no single,
        // well-defined root of its own.
        if (
            $this->isDOMDocumentCreatedWithMultiRoot
            ||
            $this->isDOMDocumentCreatedWithEdgeWhitespace
            ||
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            (
                !$isDOMDocumentCreatedWithDoctype
                &&
                $this->keepBrokenHtml
            )
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // First attempt: strict parse; only accepted when libxml reports no errors.
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp->ownerDocument instanceof \DOMDocument) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            //
            // Prepend the encoding declaration unless the (non-empty) input
            // already starts with an XML declaration. Haystack is $html, needle
            // is '<?xml'; an empty input stays empty so loadHTML() is skipped.
            $xmlHackUsed = false;
            if ($html !== '' && \stripos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            if ($html !== '') {
                $this->document->loadHTML($html, $optionsXml);
            }

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        $this->markSyntheticParagraphWrapper();

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        // @phpstan-ignore isset.variable (only defined on PHP < 8 paths where it is used)
        if (\PHP_VERSION_ID < 80000 && isset($disableEntityLoader)) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
571

572
    /**
     * Find list of nodes with a CSS selector, searching the whole document.
     *
     * Delegates to findInNodeContext() without a context node.
     *
     * @param string   $selector
     * @param int|null $idx      null for all matches, otherwise the (possibly negative) position of one match
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $contextNode = null;

        return $this->findInNodeContext($selector, $contextNode, $idx);
    }
584

585
    /**
     * Find list of nodes with a CSS selector within an optional DOM context.
     *
     * Forwards this parser's document, its XPath callback and the parser
     * itself to findInDocumentContext().
     *
     * @param string        $selector
     * @param \DOMNode|null $contextNode
     * @param int|null      $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *
     * @internal Used by wrapped SimpleHtmlDom instances to preserve parser
     *           callback state when scoping queries to an existing DOM node.
     */
    public function findInNodeContext(string $selector, ?\DOMNode $contextNode = null, $idx = null)
    {
        $document = $this->document;
        $beforeQueryCallback = $this->callbackXPathBeforeQuery;

        return self::findInDocumentContext($selector, $document, $contextNode, $idx, $beforeQueryCallback, $this);
    }
608

609
    /**
     * Run a CSS selector against a DOMDocument, optionally limited to a
     * context node, and wrap the matches.
     *
     * The selector is converted to XPath first. The callback, when given
     * together with a parser instance, may rewrite the XPath before it is
     * executed. With a context node the query is re-scoped so absolute paths
     * stay inside that node.
     *
     * @param string        $selector
     * @param \DOMDocument  $document
     * @param \DOMNode|null $contextNode
     * @param int|null      $idx
     * @param callable|null $callbackXPathBeforeQuery
     * @param self|null     $queryHtmlDomParser
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *
     * @phpstan-param null|callable(string, string, \DOMXPath, self): string $callbackXPathBeforeQuery
     *
     * @internal Used by wrapped SimpleHtmlDom instances to keep queries scoped
     *           to an existing DOMDocument while preserving parser callback
     *           behavior.
     */
    public static function findInDocumentContext(
        string $selector,
        \DOMDocument $document,
        ?\DOMNode $contextNode = null,
        $idx = null,
        ?callable $callbackXPathBeforeQuery = null,
        ?self $queryHtmlDomParser = null
    ) {
        $query = SelectorConverter::toXPath($selector);
        $xPath = new \DOMXPath($document);

        // The callback is only applied when a parser instance can be passed to it.
        $applyCallback = $callbackXPathBeforeQuery !== null && $queryHtmlDomParser !== null;
        if ($applyCallback) {
            $query = \call_user_func($callbackXPathBeforeQuery, $selector, $query, $xPath, $queryHtmlDomParser);
        }

        if ($contextNode !== null) {
            $query = self::scopeXPathQueryToContextNode($query);
        }

        return self::createFindResultFromNodeList(
            $xPath->query($query, $contextNode),
            $idx,
            $queryHtmlDomParser
        );
    }
652

653
    /**
     * Make absolute XPath branches relative to the context node.
     *
     * Every top-level union branch (split on "|" outside of brackets,
     * parentheses and quoted strings) that starts with "/" gets a "." in
     * front of it. All other characters are copied unchanged.
     *
     * @param string $xPathQuery
     *
     * @return string
     */
    private static function scopeXPathQueryToContextNode(string $xPathQuery): string
    {
        if ($xPathQuery === '') {
            return '';
        }

        $result = '';
        $openQuote = null;
        $squareDepth = 0;
        $roundDepth = 0;
        $branchStart = true;

        foreach (\str_split($xPathQuery) as $char) {
            // Inside a quoted literal: copy verbatim until the matching quote.
            if ($openQuote !== null) {
                $result .= $char;
                $openQuote = $char === $openQuote ? null : $openQuote;

                continue;
            }

            if ($char === '"' || $char === "'") {
                $result .= $char;
                $openQuote = $char;

                continue;
            }

            if ($branchStart) {
                // Leading whitespace does not end the "start of branch" state.
                if (\trim($char) === '') {
                    $result .= $char;

                    continue;
                }

                $branchStart = false;

                if ($char === '/') {
                    $result .= '.';
                }
            }

            switch ($char) {
                case '[':
                    ++$squareDepth;

                    break;
                case ']':
                    if ($squareDepth > 0) {
                        --$squareDepth;
                    }

                    break;
                case '(':
                    ++$roundDepth;

                    break;
                case ')':
                    if ($roundDepth > 0) {
                        --$roundDepth;
                    }

                    break;
            }

            $result .= $char;

            // A top-level "|" opens the next union branch.
            $branchStart = $char === '|' && $squareDepth === 0 && $roundDepth === 0;
        }

        return $result;
    }
723

724
    /**
     * Wrap the nodes of an XPath result and return all of them or a single one.
     *
     * With $idx === null the full list is returned (a SimpleHtmlDomNodeBlank
     * when nothing matched). Otherwise the element at $idx is returned, where
     * a negative $idx counts from the end; a SimpleHtmlDomBlank is returned
     * when that position does not exist.
     *
     * @param \DOMNodeList<\DOMNameSpaceNode|\DOMNode>|false $nodesList
     * @param int|null                                       $idx
     * @param self|null                                      $queryHtmlDomParser passed on to every created SimpleHtmlDom
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    private static function createFindResultFromNodeList($nodesList, $idx, ?self $queryHtmlDomParser = null)
    {
        $matches = new SimpleHtmlDomNode();

        // A failed query (false) is treated like an empty result.
        foreach ($nodesList ?: [] as $node) {
            if ($node instanceof \DOMNode) {
                $matches[] = new SimpleHtmlDom($node, $queryHtmlDomParser);
            }
        }

        if ($idx === null) {
            return \count($matches) === 0 ? new SimpleHtmlDomNodeBlank() : $matches;
        }

        // Negative positions are resolved relative to the end of the list.
        $position = $idx < 0 ? \count($matches) + $idx : $idx;

        return $matches[$position] ?? new SimpleHtmlDomBlank();
    }
761

762
    /**
     * Find all nodes matching a CSS selector.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        /** @var SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface> $nodes */
        $nodes = $this->find($selector, null);

        return $nodes;
    }
776

777
    /**
     * Find all nodes matching a CSS selector; false when nothing matched.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        /** @var SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface> $nodes */
        $nodes = $this->find($selector, null);

        return $nodes instanceof SimpleHtmlDomNodeBlank ? false : $nodes;
    }
795

796
    /**
     * Find all nodes matching a CSS selector; null when nothing matched.
     *
     * @param string $selector
     *
     * @return null|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrNull(string $selector)
    {
        /** @var SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface> $nodes */
        $nodes = $this->find($selector, null);

        return $nodes instanceof SimpleHtmlDomNodeBlank ? null : $nodes;
    }
814

815
    /**
     * Find the first node matching a CSS selector.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        /** @var SimpleHtmlDomInterface $node */
        $node = $this->find($selector, 0);

        return $node;
    }
829

830
    /**
     * Find the first node matching a CSS selector; false when nothing matched.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        /** @var SimpleHtmlDomInterface $node */
        $node = $this->find($selector, 0);

        return $node instanceof SimpleHtmlDomBlank ? false : $node;
    }
848

849
    /**
     * Find the first node matching a CSS selector; null when nothing matched.
     *
     * @param string $selector
     *
     * @return null|SimpleHtmlDomInterface
     */
    public function findOneOrNull(string $selector)
    {
        /** @var SimpleHtmlDomInterface $node */
        $node = $this->find($selector, 0);

        return $node instanceof SimpleHtmlDomBlank ? null : $node;
    }
867

868
    /**
     * Post-process serialized markup before it is handed to the caller.
     *
     * Steps, in order: strip the wrapper tags selected by the parser's
     * "created without ..." flags, strip closing tags of the configured
     * self-closing elements, drop the internal helper tags, trim the result,
     * decode entities and finally restore the preserved placeholders.
     *
     * @param string $content                  serialized markup
     * @param bool   $multiDecodeNewHtmlEntity forwarded to decodeHtmlEntity()
     * @param bool   $putBrokenReplacedBack    forwarded to putReplacedBackToPreserveHtmlEntities()
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false,
        bool $putBrokenReplacedBack = true
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // Collect the literal wrapper tags to strip; str_replace() processes
        // the needles one after another, in the order they are listed here.
        $syntheticTags = [];

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            $syntheticTags[] = '<html>';
            $syntheticTags[] = '</html>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            $syntheticTags[] = '<head>';
            $syntheticTags[] = '</head>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $syntheticTags[] = '<body>';
            $syntheticTags[] = '</body>';
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $syntheticTags[] = '</script>';
        }

        if ($syntheticTags !== []) {
            $content = \str_replace($syntheticTags, '', $content);
        }

        // Must run after the wrapper tags are gone, because the first pattern
        // is anchored to the start of the string.
        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // https://bugs.php.net/bug.php?id=73175
        $strayClosingTags = [];
        foreach ($this->selfClosingTags as $selfClosingTag) {
            $strayClosingTags[] = '</' . $selfClosingTag . '>';
        }
        $content = \str_replace($strayClosingTags, '', $content);

        // Internal helper tags are dropped, doubled head tags are collapsed.
        $helperTagMap = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
        ];
        $content = \trim(
            \str_replace(\array_keys($helperTagMap), \array_values($helperTagMap), $content)
        );

        return self::putReplacedBackToPreserveHtmlEntities(
            $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity),
            $putBrokenReplacedBack
        );
    }
975

976
    /**
     * Collect the elements carrying the given class name.
     *
     * The class name is turned into the CSS selector ".<class>" and passed to
     * findMulti().
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        $selector = '.' . $class;

        return $this->findMulti($selector);
    }
987

988
    /**
     * Fetch a single element by its id.
     *
     * The id is turned into the CSS selector "#<id>" and passed to findOne().
     *
     * @param string $id id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        $selector = '#' . $id;

        return $this->findOne($selector);
    }
999

1000
    /**
     * Fetch the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface the wrapped first match, or a SimpleHtmlDomBlank
     *                                when the document holds no such element
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode, $this);
    }
1017

1018
    /**
     * Look up elements by id.
     *
     * The id is turned into the CSS selector "#<id>" and passed, together with
     * the optional index, to find().
     *
     * @param string   $id  id without the leading hash
     * @param int|null $idx forwarded unchanged to find()
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        $selector = '#' . $id;

        return $this->find($selector, $idx);
    }
1030

1031
    /**
     * Returns elements by tag name.
     *
     * Without an index the whole result list is returned (a
     * SimpleHtmlDomNodeBlank when nothing matched). With an index a single
     * element is returned; negative indexes count from the end of the list.
     * A missing single element is reported as SimpleHtmlDomBlank, i.e. the
     * blank *element* placeholder that getElementByTagName() also returns,
     * not the blank *collection* placeholder.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $nodesList = $this->document->getElementsByTagName($name);

        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node, $this);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element: fall back to the blank element placeholder so
        // single-element lookups always yield an element-like object
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
1066

1067
    /**
     * Get dom node's outer html.
     *
     * When a static callback is registered it is invoked first, receiving an
     * array that contains this parser. The document is then serialized with
     * the strategy matching how it was created, and the result is passed
     * through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput()
     * @param bool $putBrokenReplacedBack    forwarded to fixHtmlOutput()
     *
     * @return string empty string when serialization fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        $callback = static::$callback;
        if ($callback !== null) {
            \call_user_func($callback, [$this]);
        }

        // The order of these checks matters: the first matching strategy wins.
        if ($this->shouldUseWholeDocumentSerializationForHtmlOnPhpLt8()) {
            $serialized = $this->document->saveHTML();
        } elseif ($this->usesInternalWrapperDocument()) {
            $serialized = $this->serializeInternalWrapperContent();
        } elseif ($this->createdFromNode) {
            $serialized = \PHP_VERSION_ID < 80000
                ? $this->serializeCreatedFromNodeForPhpLt8()
                : $this->serializeChildNodes($this->document);
        } elseif ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            $serialized = $this->document->saveHTML($this->document->documentElement);
        } else {
            $serialized = $this->document->saveHTML();
        }

        return $serialized === false
            ? ''
            : $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
    }
1105

1106
    /**
     * Rename a parser-generated <p> wrapper to the "simpleHtmlDomP"
     * placeholder tag, which fixHtmlOutput() removes from serialized output.
     * This way only the synthetic wrapper disappears instead of every
     * paragraph tag.
     *
     * Nothing happens unless the "created without p-tag wrapper" flag is set,
     * the root element is <html>, a <body> exists, and the body's only
     * significant child (whitespace-only text nodes are ignored) is a <p>.
     *
     * @return void
     */
    private function markSyntheticParagraphWrapper(): void
    {
        if (!$this->isDOMDocumentCreatedWithoutPTagWrapper) {
            return;
        }

        $root = $this->document->documentElement;
        $rootIsHtml = $root instanceof \DOMElement && \strtolower($root->tagName) === 'html';
        if (!$rootIsHtml) {
            return;
        }

        $body = $this->document->getElementsByTagName('body')->item(0);
        if (!$body instanceof \DOMElement) {
            return;
        }

        $paragraph = null;
        foreach ($body->childNodes as $bodyChild) {
            $isBlankText = $bodyChild instanceof \DOMText && \trim($bodyChild->nodeValue ?? '') === '';
            if ($isBlankText) {
                continue;
            }

            // A second significant child, a non-element node or an element
            // other than <p> means there is no lone synthetic wrapper.
            if (
                $paragraph !== null
                || !$bodyChild instanceof \DOMElement
                || \strtolower($bodyChild->tagName) !== 'p'
            ) {
                return;
            }

            $paragraph = $bodyChild;
        }

        if ($paragraph === null || $paragraph->parentNode === null) {
            return;
        }

        $placeholder = $this->document->createElement('simpleHtmlDomP');

        // Move (not copy) every child of the wrapper into the placeholder.
        while (($movedChild = $paragraph->firstChild) !== null) {
            $placeholder->appendChild($movedChild);
        }

        $paragraph->parentNode->replaceChild($placeholder, $paragraph);
    }
1167

1168
    /**
     * Serialize a single DOM node to HTML.
     *
     * The node is imported into a fresh DOMDocument so the serialization
     * context does not depend on the internal wrapper tag name (older libxml
     * HTML serializers treat unknown hyphenated tags as block-level and inject
     * formatting newlines into the wrapper's children when the full document
     * is saved).
     *
     * On PHP < 8.0, DOMElement instances are routed through
     * serializeElementNodeForPhpLt8() so older libxml cannot inject formatting
     * newlines when saveHTML($node) is used on detached block-level elements.
     * Text and other non-element nodes use the fresh-document approach
     * directly because they do not need the extra wrapper stripping.
     *
     * @param \DOMNode $node
     *
     * @return string empty string when import or serialization fails
     */
    private function serializeNode(\DOMNode $node): string
    {
        $isLegacyPhp = \PHP_VERSION_ID < 80000;
        if ($isLegacyPhp && $node instanceof \DOMElement) {
            return $this->serializeElementNodeForPhpLt8($node);
        }

        $scratchDocument = new \DOMDocument('1.0', $this->getEncoding());
        $scratchDocument->preserveWhiteSpace = true;
        $scratchDocument->formatOutput = false;

        $clone = $scratchDocument->importNode($node, true);
        // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
        if (!$clone instanceof \DOMNode) {
            return '';
        }

        $scratchDocument->appendChild($clone);

        $html = $scratchDocument->saveHTML($clone);

        return $html === false ? '' : $html;
    }
1211

1212
    /**
     * Serialize one element on PHP < 8.0 without libxml's formatting newlines.
     *
     * saveHTML($node) injects formatting newlines for detached block-level
     * elements on those versions, so the element is imported into a temporary
     * document, the whole document is saved, and the wrappers libxml adds are
     * stripped again. A single trailing "\n" is removed from the result.
     *
     * @param \DOMElement $node
     *
     * @return string empty string when import or serialization fails
     */
    private function serializeElementNodeForPhpLt8(\DOMElement $node): string
    {
        $scratchDocument = new \DOMDocument('1.0', $this->getEncoding());
        $scratchDocument->preserveWhiteSpace = true;
        $scratchDocument->formatOutput = false;

        $clone = $scratchDocument->importNode($node, true);
        // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
        if (!$clone instanceof \DOMElement) {
            return '';
        }

        $scratchDocument->appendChild($clone);

        $html = $scratchDocument->saveHTML();
        if ($html === false) {
            return '';
        }

        $html = $this->stripLibxmlDocumentWrappers($html, \strtolower($clone->tagName));

        return \substr($html, -1) === "\n"
            ? \substr($html, 0, -1)
            : $html;
    }
1247

1248
    /**
     * Serialize the element imported via the node-backed constructor, for
     * PHP < 8.0.
     *
     * On PHP < 8, saveHTML($node) with a node argument always injects
     * formatting newlines between block-level child elements and a trailing
     * "\n" after raw-text elements (script, style), even with formatOutput
     * set to false. saveHTML() without a node argument respects
     * formatOutput=false and does not inject those newlines.
     *
     * Therefore the whole constructor document is saved and the DOCTYPE plus
     * the structural wrappers (html, body) that libxml may add around the
     * element are stripped afterwards, with trimming enabled.
     *
     * @return string empty string when serialization fails
     */
    private function serializeCreatedFromNodeForPhpLt8(): string
    {
        $serialized = $this->document->saveHTML();
        if ($serialized === false) {
            return '';
        }

        $rootTag = '';
        $root = $this->document->documentElement;
        if ($root instanceof \DOMElement) {
            $rootTag = \strtolower($root->tagName);
        }

        return $this->stripLibxmlDocumentWrappers($serialized, $rootTag, true);
    }
1281

1282
    /**
     * Strip the synthetic wrappers libxml adds when serializing a whole
     * document around a non-root HTML element on PHP < 8.
     *
     * A leading DOCTYPE is always removed. The outer <html> pair is removed
     * unless the serialized element itself is "html"; the outer <body> pair
     * (and any literal "<body></body>") is removed unless the element is
     * "html" or "body".
     *
     * @param string $content serialized whole-document HTML
     * @param string $tagName lower-case tag name of the element being serialized
     * @param bool   $trim    trim surrounding whitespace after each stripping stage
     *
     * @return string
     */
    private function stripLibxmlDocumentWrappers(string $content, string $tagName, bool $trim = false): string
    {
        $content = (string) \preg_replace('/^<!DOCTYPE[^>]+>\s*/i', '', $content);
        if ($trim) {
            $content = \trim($content);
        }

        if ($tagName === 'html') {
            return $content;
        }

        // patterns given as an array are applied one after another
        $content = (string) \preg_replace(['/^<html[^>]*>/i', '/<\/html>\s*$/i'], '', $content);
        if ($trim) {
            $content = \trim($content);
        }

        if ($tagName === 'body') {
            return $content;
        }

        $content = (string) \preg_replace(['/^<body[^>]*>/i', '/<\/body>\s*$/i'], '', $content);
        $content = \str_replace('<body></body>', '', $content);

        return $trim ? \trim($content) : $content;
    }
1312

1313
    /**
     * Serialize every direct child of a node via serializeNode() and join the
     * pieces in document order.
     *
     * @param \DOMNode $parentNode
     *
     * @return string empty string when the node has no children
     */
    private function serializeChildNodes(\DOMNode $parentNode): string
    {
        $chunks = [];

        foreach ($parentNode->childNodes as $childNode) {
            $chunks[] = $this->serializeNode($childNode);
        }

        return \implode('', $chunks);
    }
1328

1329
    /**
     * Tell whether the document's root element is the internal wrapper tag
     * (self::$domHtmlWrapperHelper). The comparison is case-sensitive.
     *
     * @return bool
     */
    private function usesInternalWrapperDocument(): bool
    {
        $root = $this->document->documentElement;
        if (!$root instanceof \DOMElement) {
            return false;
        }

        return $root->tagName === self::$domHtmlWrapperHelper;
    }
1337

1338
    /**
     * Tell whether the document is an <html> root whose <head> is missing or
     * empty while its <body> has at least one child node.
     *
     * Older libxml preserves body-only fragments more faithfully when the whole
     * temporary document is serialized and fixHtmlOutput() removes the wrappers
     * afterwards. Head-only fragments still need root-element serialization, or
     * <meta charset=...> can trigger output re-encoding (e.g. utf-7).
     *
     * @return bool
     */
    private function isBodyOnlyHtmlFragmentDocument(): bool
    {
        $root = $this->document->documentElement;
        if (!$root instanceof \DOMElement) {
            return false;
        }

        if (\strtolower($root->tagName) !== 'html') {
            return false;
        }

        $head = $root->getElementsByTagName('head')->item(0);
        if ($head instanceof \DOMElement && $head->childNodes->length > 0) {
            // head carries content, so this is not a body-only fragment
            return false;
        }

        $body = $root->getElementsByTagName('body')->item(0);

        return $body instanceof \DOMElement && $body->childNodes->length > 0;
    }
1359

1360
    /**
     * Decide whether html() should serialize the whole document.
     *
     * Always false on PHP >= 8.0. On older versions it is true for internal
     * wrapper documents, and for documents created without an html wrapper
     * whose root element is either not <html> or is a body-only fragment.
     *
     * @return bool
     */
    private function shouldUseWholeDocumentSerializationForHtmlOnPhpLt8(): bool
    {
        if (\PHP_VERSION_ID >= 80000) {
            return false;
        }

        if ($this->usesInternalWrapperDocument()) {
            return true;
        }

        if (!$this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            return false;
        }

        $root = $this->document->documentElement;
        if (!$root instanceof \DOMElement) {
            return false;
        }

        if (\strtolower($root->tagName) !== 'html') {
            return true;
        }

        return $this->isBodyOnlyHtmlFragmentDocument();
    }
1382

1383
    /**
     * Decide whether innerHtml() should serialize the whole document.
     *
     * Always false on PHP >= 8.0; on older versions true for internal wrapper
     * documents and for body-only html fragments.
     *
     * @return bool
     */
    private function shouldUseWholeDocumentSerializationForInnerHtmlOnPhpLt8(): bool
    {
        if (\PHP_VERSION_ID >= 80000) {
            return false;
        }

        return $this->usesInternalWrapperDocument()
            || $this->isBodyOnlyHtmlFragmentDocument();
    }
1391

1392
    /**
     * Serialize the root element's children one by one and re-wrap them in
     * the internal wrapper tag.
     *
     * The wrapper markers are kept around the detached child serialization so
     * fixHtmlOutput() does not trim leading/trailing fragment whitespace.
     *
     * @return string empty string when the document has no root element
     */
    private function serializeInternalWrapperContent(): string
    {
        $root = $this->document->documentElement;
        if ($root === null) {
            return '';
        }

        $tag = self::$domHtmlWrapperHelper;
        $opening = '<' . $tag . '>';
        $closing = '</' . $tag . '>';

        return $opening . $this->serializeChildNodes($root) . $closing;
    }
1410

1411
    /**
     * Check whether a fragment has more than one significant top-level node.
     *
     * The fragment is wrapped in the internal wrapper tag and parsed as XML;
     * the wrapper's significant direct children are then counted. This is
     * more reliable than regex for fragments whose top-level elements have
     * attributes or nested markup.
     *
     * libxml's internal error handling is switched on for the duration of the
     * probe and restored afterwards, with the error buffer cleared both times.
     *
     * @param string $html
     * @param int    $optionsXml libxml options passed to simplexml_load_string()
     *
     * @return bool false as well when the probe does not parse without errors
     */
    private function hasMultipleTopLevelNodes(string $html, int $optionsXml): bool
    {
        $previousErrorMode = \libxml_use_internal_errors(true);
        try {
            \libxml_clear_errors();

            $tag = self::$domHtmlWrapperHelper;
            $probe = '<' . $tag . '>' . self::replaceToPreserveHtmlEntities($html) . '</' . $tag . '>';

            $parsed = \simplexml_load_string($probe, \SimpleXMLElement::class, $optionsXml);
            $parsedCleanly = $parsed !== false && \count(\libxml_get_errors()) === 0;
            if (!$parsedCleanly) {
                return false;
            }

            $wrapperNode = \dom_import_simplexml($parsed);

            return $wrapperNode instanceof \DOMElement
                && $this->countSignificantChildNodes($wrapperNode) > 1;
        } finally {
            \libxml_clear_errors();
            \libxml_use_internal_errors($previousErrorMode);
        }
    }
1447

1448
    /**
     * Count the direct children of a node, ignoring whitespace-only text
     * nodes. Counting stops as soon as a second significant child is seen,
     * so the result is 0, 1 or 2.
     *
     * @param \DOMNode $node
     *
     * @return int
     */
    private function countSignificantChildNodes(\DOMNode $node): int
    {
        $significant = 0;

        foreach ($node->childNodes as $child) {
            $isBlankText = $child->nodeType === \XML_TEXT_NODE
                && \trim($child->textContent) === '';
            if ($isBlankText) {
                continue;
            }

            // callers only need to know "more than one", so stop early
            if (++$significant > 1) {
                break;
            }
        }

        return $significant;
    }
1474

1475
    /**
     * {@inheritdoc}
     *
     * The serialization strategy depends on the PHP version and on whether
     * the root element is the internal wrapper; the result is passed through
     * fixHtmlOutput(). A document without a root element, or a failed
     * serialization, is treated as an empty string.
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        $serialized = '';

        $root = $this->document->documentElement;
        if ($root !== null) {
            if ($this->shouldUseWholeDocumentSerializationForInnerHtmlOnPhpLt8()) {
                $serialized = $this->document->saveHTML();
            } elseif ($this->usesInternalWrapperDocument()) {
                $serialized = $this->serializeInternalWrapperContent();
            } else {
                $serialized = $this->serializeChildNodes($root);
            }
        }

        return $this->fixHtmlOutput(
            $serialized === false ? '' : $serialized,
            $multiDecodeNewHtmlEntity,
            $putBrokenReplacedBack
        );
    }
1500

1501
    /**
     * Get dom node's plain text.
     *
     * All text nodes are concatenated in document order, except those located
     * inside <script>, <style> or the internal special-script helper element;
     * other text such as <title> content is kept. The joined text is passed
     * through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $query = \sprintf(
            '//text()[not(ancestor::script or ancestor::style or ancestor::%s)]',
            self::$domHtmlSpecialScriptHelper
        );

        $textNodes = (new \DOMXPath($this->document))->query($query);

        $plain = '';
        if ($textNodes !== false) {
            foreach ($textNodes as $textNode) {
                $plain .= $textNode->nodeValue;
            }
        }

        return $this->fixHtmlOutput($plain, $multiDecodeNewHtmlEntity);
    }
1532

1533
    /**
     * Load HTML from string.
     *
     * The markup is handed to createDOMDocument() and the resulting document
     * replaces the parser's current one.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions      forwarded to createDOMDocument()
     * @param bool     $useDefaultLibXMLOptions forwarded to createDOMDocument()
     *
     * @return $this
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
    {
        $document = $this->createDOMDocument($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
        $this->document = $document;

        return $this;
    }
1548

1549
    /**
     * Load HTML from file.
     *
     * Paths that do not start with "http://" or "https://" must point to an
     * existing regular file. The content is read with
     * \voku\helper\UTF8::file_get_contents() when that class is available and
     * with file_get_contents() otherwise, then passed to loadHtml().
     *
     * @param string   $filePath                local path or http(s) URL
     * @param int|null $libXMLExtraOptions      forwarded to loadHtml()
     * @param bool     $useDefaultLibXMLOptions forwarded to loadHtml()
     *
     * @throws \RuntimeException when the file is missing, is not a regular
     *                           file, or cannot be read; an exception caught
     *                           while reading is attached as "previous"
     *
     * @return $this
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
    {
        if (!\preg_match("/^https?:\/\//i", $filePath)) {
            if (!\file_exists($filePath)) {
                throw new \RuntimeException('File ' . $filePath . ' not found');
            }

            if (!\is_file($filePath)) {
                throw new \RuntimeException('Could not load file ' . $filePath);
            }
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                $html = \voku\helper\UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // keep the root cause reachable via getPrevious()
            throw new \RuntimeException('Could not load file ' . $filePath, 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException('Could not load file ' . $filePath);
        }

        return $this->loadHtml($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
    }
1588

1589
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * The document is saved with saveXML(). Optionally the "<?xml ...?>"
     * header is removed and leading whitespace trimmed. With $htmlToXml the
     * result goes through fixHtmlOutput(); otherwise entities are decoded and
     * the preserved placeholders are restored.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to the entity decoding step
     * @param bool $htmlToXml                post-process via fixHtmlOutput()
     * @param bool $removeXmlHeader          strip the xml header from the output
     * @param int  $options                  libxml options passed to saveXML()
     *
     * @return string empty string when saveXML() fails
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim($withoutHeader);
        }

        if (!$htmlToXml) {
            $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
1624

1625
    /**
     * Allow the parser object to be called like a function; the call is
     * forwarded unchanged to find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
1635

1636
    /**
     * Report the "created without head wrapper" flag. While it is true,
     * fixHtmlOutput() strips "<head>" and "</head>" from its output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
1643

1644
    /**
     * Report the "created without p-tag wrapper" flag. The synthetic
     * paragraph wrapper is only marked for removal while it is true.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
1651

1652
    /**
     * Report the "created without html" flag. While it is true,
     * fixHtmlOutput() strips the HTML 4.0 Transitional DOCTYPE declaration
     * from its output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
1659

1660
    /**
     * Report the "created without body wrapper" flag. While it is true,
     * fixHtmlOutput() strips "<body>" and "</body>" from its output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
1667

1668
    /**
     * Report the stored "created with multi root" flag.
     *
     * NOTE(review): presumably set when the loaded fragment had several
     * top-level nodes; confirm against the code that assigns the property.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithMultiRoot(): bool
    {
        return $this->isDOMDocumentCreatedWithMultiRoot;
    }
1675

1676
    /**
     * Report the "created without html wrapper" flag. While it is true,
     * fixHtmlOutput() strips "<html>" and "</html>" from its output; html()
     * also consults it when choosing a serialization strategy.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
1683

1684
    /**
     * Report the "created without wrapper" flag. While it is true,
     * fixHtmlOutput() removes a leading "<p>" and every "</p>" from its
     * output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
1691

1692
    /**
     * Report the "created with fake end script" flag. While it is true,
     * fixHtmlOutput() strips "</script>" from its output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
1699

1700
    /**
     * Protect broken markup (closing tags without a matching opening tag)
     * from the DOM parser.
     *
     * Pass 1 repeatedly masks the angle brackets of properly paired elements
     * with placeholder tokens until nothing changes. Pass 2 repeatedly
     * replaces each remaining run of closing tags with a hash token and
     * registers the original text via registerDynamicDomBrokenReplaceHelper().
     * Finally the placeholder tokens are turned back into angle brackets.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $maskedBrackets = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $realBrackets = ['</', '<', '>'];

        // pass 1: mask every well-formed open/close pair
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($pair) {
                    $maskedOpenTag = '°lt_simple_html_dom__voku_°' . $pair['element_start'] . $pair['element_start_addon'] . '°gt_simple_html_dom__voku_°';
                    $maskedCloseTag = '°lt/_simple_html_dom__voku_°' . $pair['element_end'] . '°gt_simple_html_dom__voku_°';

                    return $pair['start'] . $maskedOpenTag . $pair['value'] . $maskedCloseTag . $pair['end'];
                },
                $html
            );
        } while ($before !== $html);

        // pass 2: whatever closing tags are still unmasked are "broken"
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:<\/\w+(?:\s+\w+=\"[^"]+\")*+[^<]+>)+)(?<end>.*)/u',
                function ($found) use ($maskedBrackets, $realBrackets) {
                    $brokenMarkup = \str_replace($maskedBrackets, $realBrackets, $found['broken']);

                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($brokenMarkup);
                    $this->registerDynamicDomBrokenReplaceHelper($brokenMarkup, $token);

                    return $found['start'] . $token . $found['end'];
                },
                $html
            );
        } while ($before !== $html);

        return \str_replace($maskedBrackets, $realBrackets, $html);
    }
1750

1751
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Inline "<svg ...>...</svg>" markup embedded in a parenthesised
     * "data:image/svg" value is swapped for a hash token, and the original
     * markup is registered via registerDynamicDomBrokenReplaceHelper(). The
     * input is left untouched when the regex replacement fails.
     *
     * @param string $html modified in place
     *
     * @return void
     */
    protected function keepSpecialSvgTags(string &$html)
    {
        // regEx for e.g.: [mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg">...</svg>')]
        /** @noinspection HtmlDeprecatedTag */
        $pattern = '/\((["\'])?(?<start>data:image\/svg.*)<svg(?<attr>[^>]*?)>(?<content>.*)<\/svg>\1\)/isU';

        $replaced = \preg_replace_callback(
            $pattern,
            function ($match) {
                $svgMarkup = '<svg' . $match['attr'] . '>' . $match['content'] . '</svg>';
                $token = self::$domHtmlBrokenHtmlHelper . \crc32($svgMarkup);
                $this->registerDynamicDomBrokenReplaceHelper($svgMarkup, $token);

                // $match[1] is the optional quote character around the value
                $quote = $match[1];

                return '(' . $quote . $match['start'] . $token . $quote . ')';
            },
            $html
        );

        if ($replaced === null) {
            return;
        }

        $html = $replaced;
    }
1779

1780
    /**
     * Protect the content of "special" script tags (script tags whose "type" attribute
     * contains one of $this->specialScriptTags) before the html is parsed.
     *
     * - If the script content contains one of the configured template-logic markers
     *   ($this->templateLogicSyntaxInSpecialScriptTags), the whole inner content is replaced
     *   by a placeholder (broken-html helper prefix + CRC32 of the content) and the pair is
     *   registered via registerDynamicDomBrokenReplaceHelper().
     * - Otherwise the "<script" / "</script>" wrapper is renamed to the special-script helper
     *   tag name, leaving the inner content in place.
     *
     * In both cases the html5 fallback escape "<\/" is turned back into "</".
     *
     * @param string $html <p>Modified in place; left untouched if the regular expression fails.</p>
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $tags = \implode('|', \array_map(
            static function ($value) {
                return \preg_quote($value, '/');
            },
            $this->specialScriptTags
        ));
        $htmlTmp = \preg_replace_callback(
            '/(?<start>(<script [^>]*type=["\']?(?:' . $tags . ')+[^>]*>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {
                // Check for logic in special script tags containing EJS/ERB-style template syntax
                // (e.g. <% ... %> blocks), because often this looks like non-valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                    if (\strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) !== false) {
                        // remove the html5 fallback
                        $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                        // hide the template content behind a placeholder that can be swapped back later
                        $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);
                        $this->registerDynamicDomBrokenReplaceHelper($matches['innerContent'], $matchesHash);

                        return $matches['start'] . $matchesHash . $matches['end'];
                    }
                }

                // remove the html5 fallback
                $matches[0] = \str_replace('<\/', '</', $matches[0]);

                // rename the opening "<script" to the special-script helper tag ...
                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                // ... and the closing "</script>" as well
                return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        // preg_replace_callback() returns null on a PCRE error (e.g. backtrack limit reached on
        // very large input); casting that to string would silently wipe the whole document,
        // so keep the original html in that case (same handling as keepSpecialSvgTags()).
        if ($htmlTmp !== null) {
            $html = $htmlTmp;
        }
    }
1821

1822
    /**
     * Switch the "keep broken html" mode on or off for this parser instance.
     *
     * Only stores the flag; how it influences parsing is decided where the
     * property is read.
     *
     * @param bool $keepBrokenHtml <p>New value for the flag.</p>
     *
     * @return $this <p>The same instance, for method chaining.</p>
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1833

1834
    /**
     * Replace the list of template-logic markers that are searched for inside
     * special script tags.
     *
     * Every entry is validated first; the stored list is only replaced when all
     * entries are strings.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags <p>New list of markers.</p>
     *
     * @throws \InvalidArgumentException <p>If any entry is not a string.</p>
     *
     * @return $this <p>The same instance, for method chaining.</p>
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        foreach ($templateLogicSyntaxInSpecialScriptTags as $syntax) {
            // @phpstan-ignore function.alreadyNarrowedType (runtime guard kept for public API validation)
            if (\is_string($syntax)) {
                continue;
            }

            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1852

1853
    /**
     * Replace the list of script "type" values that mark a script tag as "special".
     *
     * Every entry is validated first; the stored list is only replaced when all
     * entries are strings.
     *
     * @param string[] $specialScriptTags <p>New list of special script types.</p>
     *
     * @throws \InvalidArgumentException <p>If any entry is not a string.</p>
     *
     * @return $this <p>The same instance, for method chaining.</p>
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        foreach ($specialScriptTags as $scriptType) {
            // @phpstan-ignore function.alreadyNarrowedType (runtime guard kept for public API validation)
            if (\is_string($scriptType)) {
                continue;
            }

            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1871

1872
    /**
     * Register a callback on this instance under "callbackXPathBeforeQuery".
     *
     * According to the declared signature the callback receives the css selector
     * string, the xPath string, the \DOMXPath object and this parser, and returns
     * a string (presumably the xPath expression to run; confirm at the call site).
     *
     * @param callable $callbackXPathBeforeQuery
     *
     * @phpstan-param callable(string $cssSelectorString, string $xPathString,\DOMXPath,\voku\helper\HtmlDomParser): string $callbackXPathBeforeQuery
     *
     * @return $this <p>The same instance, for method chaining.</p>
     */
    public function setCallbackXPathBeforeQuery(callable $callbackXPathBeforeQuery): self
    {
        $this->callbackXPathBeforeQuery = $callbackXPathBeforeQuery;

        return $this;
    }
1885

1886
    /**
     * Register a callback on this instance under "callbackBeforeCreateDom".
     *
     * According to the declared signature the callback receives the html string
     * and this parser, and returns a string (presumably the html to build the
     * DOM from; confirm at the call site).
     *
     * @param callable $callbackBeforeCreateDom
     *
     * @phpstan-param callable(string $htmlString, \voku\helper\HtmlDomParser): string $callbackBeforeCreateDom
     *
     * @return $this <p>The same instance, for method chaining.</p>
     */
    public function setCallbackBeforeCreateDom(callable $callbackBeforeCreateDom): self
    {
        $this->callbackBeforeCreateDom = $callbackBeforeCreateDom;

        return $this;
    }
1899
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc