• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In
No new info detected.

voku / simple_html_dom / 24713207173

21 Apr 2026 08:51AM UTC coverage: 96.791% (+0.8%) from 96.034%
24713207173

push

github

voku
[+]: add more tests

2172 of 2244 relevant lines covered (96.79%)

287.34 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.57
/src/voku/helper/HtmlDomParser.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace voku\helper;
6

7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var callable|null
38
     *
39
     * @phpstan-var null|callable(string $cssSelectorString, string $xPathString, \DOMXPath, \voku\helper\HtmlDomParser): string
40
     */
41
    private $callbackXPathBeforeQuery;
42

43
    /**
44
     * @var callable|null
45
     *
46
     * @phpstan-var null|callable(string $htmlString, \voku\helper\HtmlDomParser): string
47
     */
48
    private $callbackBeforeCreateDom;
49

50
    /**
51
     * @var string[]
52
     */
53
    protected static $functionAliases = [
54
        'outertext' => 'html',
55
        'outerhtml' => 'html',
56
        'innertext' => 'innerHtml',
57
        'innerhtml' => 'innerHtml',
58
        'load'      => 'loadHtml',
59
        'load_file' => 'loadHtmlFile',
60
    ];
61

62
    /**
63
     * @var string[]
64
     */
65
    protected $templateLogicSyntaxInSpecialScriptTags = [
66
        '+',
67
        '<%',
68
        '{%',
69
        '{{',
70
    ];
71

72
    /**
73
     * The properties specified for each special script tag is an array.
74
     *
75
     * ```php
76
     * protected $specialScriptTags = [
77
     *     'text/html',
78
     *     'text/template',
79
     *     'text/x-custom-template',
80
     *     'text/x-handlebars-template'
81
     * ]
82
     * ```
83
     *
84
     * @var string[]
85
     */
86
    protected $specialScriptTags = [
87
        'text/html',
88
        'text/template',
89
        'text/x-custom-template',
90
        'text/x-handlebars-template',
91
    ];
92

93
    /**
94
     * @var string[]
95
     */
96
    protected $selfClosingTags = [
97
        'area',
98
        'base',
99
        'br',
100
        'col',
101
        'command',
102
        'embed',
103
        'hr',
104
        'img',
105
        'input',
106
        'keygen',
107
        'link',
108
        'meta',
109
        'param',
110
        'source',
111
        'track',
112
        'wbr',
113
    ];
114

115
    /**
116
     * @var bool
117
     */
118
    protected $isDOMDocumentCreatedWithoutHtml = false;
119

120
    /**
121
     * @var bool
122
     */
123
    protected $isDOMDocumentCreatedWithoutWrapper = false;
124

125
    /**
126
     * @var bool
127
     */
128
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
129

130
    /**
131
     * @var bool
132
     */
133
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
134

135
    /**
136
     * @var bool
137
     */
138
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
139

140
    /**
141
     * @var bool
142
     */
143
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
144

145
    /**
146
     * @var bool
147
     */
148
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
149

150
    /**
151
     * @var bool
152
     */
153
    protected $isDOMDocumentCreatedWithMultiRoot = false;
154

155
    /**
156
     * @var bool
157
     */
158
    protected $isDOMDocumentCreatedWithEdgeWhitespace = false;
159

160
    /**
161
     * @var bool
162
     */
163
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
164

165
    /**
166
     * @var bool
167
     */
168
    protected $createdFromNode = false;
169

170
    /**
171
     * @var bool
172
     */
173
    protected $keepBrokenHtml = false;
174

175
    /**
     * Set up an empty DOMDocument and, when a source is given, populate it.
     *
     * A SimpleHtmlDomInterface is first unwrapped to its node via getNode().
     * A \DOMDocument is serialized with saveHTML() and re-parsed through
     * loadHtml(); any other \DOMNode is deep-imported and appended to the
     * fresh document (and the parser is flagged as "created from node");
     * every other non-null value is handed to loadHtml() as-is.
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $document->preserveWhiteSpace = true;
        $document->formatOutput = false;

        $this->document = $document;

        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if ($source === null) {
            // nothing to load, keep the empty document
            return;
        }

        if ($source instanceof \DOMDocument) {
            $serialized = $source->saveHTML();
            if ($serialized !== false) {
                $this->loadHtml($serialized);
            }
        } elseif ($source instanceof \DOMNode) {
            $this->createdFromNode = true;

            $imported = $this->document->importNode($source, true);

            // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
            if ($imported instanceof \DOMNode) {
                $this->document->appendChild($imported);
            }
        } else {
            $this->loadHtml($source);
        }
    }
216

217
    /**
     * Forward calls to legacy method names to their current implementation.
     *
     * The called name is lower-cased and looked up in self::$functionAliases;
     * the mapped method is then invoked on this instance with the given
     * arguments.
     *
     * @param string       $name
     * @param array<mixed> $arguments
     *
     * @throws \BadMethodCallException if the lower-cased name is not a known alias
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = self::$functionAliases[$alias];

        return $this->{$target}(...$arguments);
    }
235

236
    /**
     * Static entry points "str_get_html" and "file_get_html".
     *
     * Both create a new parser through createStaticParser() and delegate to
     * loadHtml() / loadHtmlFile(). The first argument defaults to an empty
     * string and the second (extra libxml options) to null when omitted.
     *
     * @param string       $name
     * @param array<mixed> $arguments
     *
     * @throws \BadMethodCallException for any other method name
     * @throws \RuntimeException
     *
     * @return static
     */
    public static function __callStatic($name, $arguments)
    {
        $isStringLoader = $name === 'str_get_html';
        $isFileLoader = $name === 'file_get_html';

        if (!$isStringLoader && !$isFileLoader) {
            throw new \BadMethodCallException('Method does not exist');
        }

        $source = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        $parser = self::createStaticParser();

        if ($isStringLoader) {
            return $parser->loadHtml($source, $libXMLExtraOptions);
        }

        return $parser->loadHtmlFile($source, $libXMLExtraOptions);
    }
265

266
    /**
     * Instantiate the class the static call was made on (late static binding),
     * without constructor arguments.
     *
     * @return static
     */
    private static function createStaticParser()
    {
        // @phpstan-ignore new.static (factory methods intentionally preserve late static binding)
        $parser = new static();

        return $parser;
    }
274

275
    /** @noinspection MagicMethodsValidityInspection */

    /**
     * Resolve the virtual, case-insensitive read-only properties.
     *
     * - "outerhtml" / "outertext"  => html()
     * - "innerhtml" / "innertext"  => innerHtml()
     * - "innerhtmlkeep"            => innerHtml(false, false)
     * - "text" / "plaintext"       => text()
     *
     * @param string $name
     *
     * @return string|null null for every other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'innerhtmlkeep') {
            return $this->innerHtml(false, false);
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
302

303
    /**
     * String conversion yields the same value as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
310

311
    /**
     * No-op that is only kept for API compatibility.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        return true;
    }
322

323
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order:
     *  1. call resetDynamicDomHelpers() and apply the optional
     *     "before create dom" callback to the input,
     *  2. drop any content in front of a "<!DOCTYPE ...>" declaration,
     *  3. optionally run keepBrokenHtml() on the trimmed input,
     *  4. record which wrappers (html / head / body / p / comment / ...) the
     *     input already contains by setting the isDOMDocumentCreatedWith* flags,
     *  5. protect special script tags and svg tags, normalize the known
     *     self-closing tags from "<tag>" to "<tag/>",
     *  6. wrap the input in the internal helper element when it would
     *     otherwise not have a single usable root,
     *  7. try a strict parse via simplexml_load_string() and fall back to
     *     \DOMDocument::loadHTML() (with the "<?xml encoding=...?>" prefix
     *     hack) when that fails or reports libxml errors,
     *  8. restore the libxml error / entity-loader settings.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions      extra LIBXML_* flags, OR-ed into the options
     * @param bool     $useDefaultLibXMLOptions when true, start from LIBXML_DTDLOAD | LIBXML_DTDATTR |
     *                                          LIBXML_NONET (plus LIBXML_BIGLINES, LIBXML_COMPACT and
     *                                          LIBXML_HTML_NODEFDTD when those constants are defined)
     *
     * @return \DOMDocument the document stored in $this->document
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): \DOMDocument
    {
        $this->resetDynamicDomHelpers();

        // Let a registered callback rewrite the raw input first.
        if ($this->callbackBeforeCreateDom) {
            $html = \call_user_func($this->callbackBeforeCreateDom, $html, $this);
        }

        // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
        $isDOMDocumentCreatedWithDoctype = false;
        if (\stripos($html, '<!DOCTYPE') !== false) {
            $isDOMDocumentCreatedWithDoctype = true;
            if (
                \preg_match('/(^.*?)<!DOCTYPE(?: [^>]*)?>/sui', $html, $matches_before_doctype)
                &&
                \trim($matches_before_doctype[1])
            ) {
                // The captured group is anchored at the start of the input, so it
                // is always a prefix: cut exactly that prefix off. (A plain
                // str_replace() would also delete every later occurrence of the
                // same text inside the document.)
                $html = (string) \substr($html, \strlen($matches_before_doctype[1]));
            }
        }

        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        // No tag at all vs. text in front of the first tag.
        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        // Fragment with leading / trailing whitespace around a properly closed
        // first element and at least two closing tags.
        if (
            $this->isDOMDocumentCreatedWithoutHtmlWrapper
            &&
            $this->isDOMDocumentCreatedWithoutBodyWrapper
            &&
            \trim($html) !== $html
            &&
            \substr_count($html, '</') >= 2
            &&
            \preg_match('#^\s*<([a-zA-Z][^\\s>/]*)>.*?</\\1>#su', $html) === 1
        ) {
            $this->isDOMDocumentCreatedWithEdgeWhitespace = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \stripos($html, '<p ') === false
            &&
            \stripos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // Only escaped "<\/script>" end tags are present.
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        // Move non-whitespace content that follows "</html>" in front of it.
        if (\stripos($html, '</html>') !== false) {
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (
                \preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
                &&
                \trim($matches_after_html[1])
            ) {
                $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
            }
        }

        if (\strpos($html, '<script') !== false) {
            // keepSpecialScriptTags must run before html5FallbackForScriptTags so
            // that special-type scripts (type="text/html", etc.) are converted to
            // the simplevokuspecialscript placeholder element before the script-tag
            // regex runs.  On PHP < 8.0 the regex uses hash placeholders; if it
            // ran first the special-script content would be hashed and
            // keepSpecialScriptTags would only see the hash, losing the ability to
            // pass the real HTML content to the DOM for error-recovery parsing.
            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);

                    break;
                }
            }

            $this->html5FallbackForScriptTags($html);
        }

        if (\strpos($html, '<svg') !== false) {
            $this->keepSpecialSvgTags($html);
        }

        // Rewrite attribute-less self-closing tags: "<br>" => "<br/>", ...
        $html = \str_replace(
            \array_map(static function ($e) {
                return '<' . $e . '>';
            }, $this->selfClosingTags),
            \array_map(static function ($e) {
                return '<' . $e . '/>';
            }, $this->selfClosingTags),
            $html
        );

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        $optionsXml = 0;
        if ($useDefaultLibXMLOptions) {
            $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

            if (\defined('LIBXML_BIGLINES')) {
                $optionsXml |= \LIBXML_BIGLINES;
            }

            if (\defined('LIBXML_COMPACT')) {
                $optionsXml |= \LIBXML_COMPACT;
            }

            if (\defined('LIBXML_HTML_NODEFDTD')) {
                $optionsXml |= \LIBXML_HTML_NODEFDTD;
            }
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutHtmlWrapper
            &&
            $this->isDOMDocumentCreatedWithoutBodyWrapper
        ) {
            $this->isDOMDocumentCreatedWithMultiRoot = $this->hasMultipleTopLevelNodes($html, $optionsXml);
        }

        // Wrap the input into the internal helper element when needed.
        if (
            $this->isDOMDocumentCreatedWithMultiRoot
            ||
            $this->isDOMDocumentCreatedWithEdgeWhitespace
            ||
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            (
                !$isDOMDocumentCreatedWithDoctype
                &&
                $this->keepBrokenHtml
            )
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // First attempt: strict parsing; only accepted when libxml reported no errors.
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp->ownerDocument instanceof \DOMDocument) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            //
            // Prepend the declaration unless the input is empty or already
            // starts with "<?xml". stripos() takes the haystack first; with the
            // arguments swapped the "already starts with <?xml" case was never
            // detected.
            $xmlHackUsed = false;
            if ($html !== '' && \stripos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            if ($html !== '') {
                $this->document->loadHTML($html, $optionsXml);
            }

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        $this->markSyntheticParagraphWrapper();

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        // @phpstan-ignore isset.variable (only defined on PHP < 8 paths where it is used)
        if (\PHP_VERSION_ID < 80000 && isset($disableEntityLoader)) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
571

572
    /**
     * Find list of nodes with a CSS selector.
     *
     * Shortcut for findInNodeContext() without a context node, i.e. the
     * query is not scoped to a specific node.
     *
     * @param string   $selector
     * @param int|null $idx      null for all matches, otherwise the (possibly negative) index of one match
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $contextNode = null;

        return $this->findInNodeContext($selector, $contextNode, $idx);
    }
584

585
    /**
     * Find list of nodes with a CSS selector within an optional DOM context.
     *
     * Delegates to findInDocumentContext() using this parser's document,
     * its "before XPath query" callback and the parser instance itself.
     *
     * @param string        $selector
     * @param \DOMNode|null $contextNode
     * @param int|null      $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *
     * @internal Used by wrapped SimpleHtmlDom instances to preserve parser
     *           callback state when scoping queries to an existing DOM node.
     */
    public function findInNodeContext(string $selector, ?\DOMNode $contextNode = null, $idx = null)
    {
        $document = $this->document;
        $callback = $this->callbackXPathBeforeQuery;

        return self::findInDocumentContext($selector, $document, $contextNode, $idx, $callback, $this);
    }
608

609
    /**
     * Find list of nodes with a CSS selector within an optional DOMDocument
     * context, optionally applying the parser callback before the XPath query.
     *
     * The selector is converted with SelectorConverter::toXPath(). The callback
     * is only invoked when both the callback and the parser are given; its
     * return value replaces the XPath expression. With a context node the
     * expression is additionally passed through scopeXPathQueryToContextNode().
     *
     * @param string        $selector
     * @param \DOMDocument  $document
     * @param \DOMNode|null $contextNode
     * @param int|null      $idx
     * @param callable|null $callbackXPathBeforeQuery
     * @param self|null     $queryHtmlDomParser
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *
     * @phpstan-param null|callable(string, string, \DOMXPath, self): string $callbackXPathBeforeQuery
     *
     * @internal Used by wrapped SimpleHtmlDom instances to keep queries scoped
     *           to an existing DOMDocument while preserving parser callback
     *           behavior.
     */
    public static function findInDocumentContext(
        string $selector,
        \DOMDocument $document,
        ?\DOMNode $contextNode = null,
        $idx = null,
        ?callable $callbackXPathBeforeQuery = null,
        ?self $queryHtmlDomParser = null
    ) {
        $expression = SelectorConverter::toXPath($selector);
        $xPath = new \DOMXPath($document);

        $shouldRunCallback = $callbackXPathBeforeQuery !== null && $queryHtmlDomParser !== null;
        if ($shouldRunCallback) {
            $expression = \call_user_func(
                $callbackXPathBeforeQuery,
                $selector,
                $expression,
                $xPath,
                $queryHtmlDomParser
            );
        }

        if ($contextNode !== null) {
            $expression = self::scopeXPathQueryToContextNode($expression);
        }

        return self::createFindResultFromNodeList(
            $xPath->query($expression, $contextNode),
            $idx,
            $queryHtmlDomParser
        );
    }
652

653
    /**
     * Make absolute XPath branches relative to the context node.
     *
     * The expression is scanned byte by byte. A branch starts at the beginning
     * of the expression and after every "|" that is neither inside quotes nor
     * inside "[...]" / "(...)". If the first non-whitespace character of a
     * branch is "/", a "." is inserted in front of it; everything else is
     * copied unchanged.
     *
     * @param string $xPathQuery
     *
     * @return string
     */
    private static function scopeXPathQueryToContextNode(string $xPathQuery): string
    {
        if ($xPathQuery === '') {
            return '';
        }

        $result = '';
        $openQuote = null;
        $squareDepth = 0;
        $roundDepth = 0;
        $expectBranchStart = true;

        foreach (\str_split($xPathQuery) as $char) {
            // inside a string literal: copy verbatim until the matching quote
            if ($openQuote !== null) {
                $result .= $char;

                if ($char === $openQuote) {
                    $openQuote = null;
                }

                continue;
            }

            // opening quote of a string literal
            if ($char === '"' || $char === "'") {
                $openQuote = $char;
                $result .= $char;

                continue;
            }

            if ($expectBranchStart) {
                // whitespace in front of a branch is copied, the branch has not started yet
                if (\trim($char) === '') {
                    $result .= $char;

                    continue;
                }

                $expectBranchStart = false;

                if ($char === '/') {
                    $result .= '.';
                }
            }

            switch ($char) {
                case '[':
                    ++$squareDepth;

                    break;
                case ']':
                    if ($squareDepth > 0) {
                        --$squareDepth;
                    }

                    break;
                case '(':
                    ++$roundDepth;

                    break;
                case ')':
                    if ($roundDepth > 0) {
                        --$roundDepth;
                    }

                    break;
                case '|':
                    // only a top-level union operator opens a new branch
                    if ($squareDepth === 0 && $roundDepth === 0) {
                        $expectBranchStart = true;
                    }

                    break;
            }

            $result .= $char;
        }

        return $result;
    }
723

724
    /**
     * Turn the result of an XPath query into the public "find" result.
     *
     * Every \DOMNode of the list is wrapped into a SimpleHtmlDom. Without an
     * index the whole collection is returned (a SimpleHtmlDomNodeBlank when it
     * is empty). With an index the element at that position is returned, where
     * a negative index is counted from the end; a SimpleHtmlDomBlank is
     * returned when no element exists at that position.
     *
     * @param \DOMNodeList<\DOMNameSpaceNode|\DOMNode>|false $nodesList
     * @param int|null                                       $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    private static function createFindResultFromNodeList($nodesList, $idx, ?self $queryHtmlDomParser = null)
    {
        $matches = new SimpleHtmlDomNode();

        if ($nodesList) {
            foreach ($nodesList as $node) {
                if ($node instanceof \DOMNode) {
                    $matches[] = new SimpleHtmlDom($node, $queryHtmlDomParser);
                }
            }
        }

        // return one element
        if ($idx !== null) {
            // handle negative values
            $position = $idx < 0 ? \count($matches) + $idx : $idx;

            return $matches[$position] ?? new SimpleHtmlDomBlank();
        }

        // return all elements
        return \count($matches) === 0 ? new SimpleHtmlDomNodeBlank() : $matches;
    }
761

762
    /**
     * Find all nodes matching a CSS selector (find() without an index).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        /** @var SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface> $nodes */
        $nodes = $this->find($selector, null);

        return $nodes;
    }
776

777
    /**
     * Find all nodes matching a CSS selector; false when find() yields the
     * blank collection (SimpleHtmlDomNodeBlank).
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        /** @var SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface> $nodes */
        $nodes = $this->find($selector, null);

        return $nodes instanceof SimpleHtmlDomNodeBlank ? false : $nodes;
    }
795

796
    /**
     * Find all nodes matching a CSS selector; null when find() yields the
     * blank collection (SimpleHtmlDomNodeBlank).
     *
     * @param string $selector
     *
     * @return null|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrNull(string $selector)
    {
        /** @var SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface> $nodes */
        $nodes = $this->find($selector, null);

        return $nodes instanceof SimpleHtmlDomNodeBlank ? null : $nodes;
    }
814

815
    /**
     * Find the first node matching a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        /** @var SimpleHtmlDomInterface $node */
        $node = $this->find($selector, 0);

        return $node;
    }
829

830
    /**
     * Find the first node matching a CSS selector; false when find() yields
     * the blank element (SimpleHtmlDomBlank).
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        /** @var SimpleHtmlDomInterface $node */
        $node = $this->find($selector, 0);

        return $node instanceof SimpleHtmlDomBlank ? false : $node;
    }
848

849
    /**
     * Find the first node matching a CSS selector; null when find() yields
     * the blank element (SimpleHtmlDomBlank).
     *
     * @param string $selector
     *
     * @return null|SimpleHtmlDomInterface
     */
    public function findOneOrNull(string $selector)
    {
        /** @var SimpleHtmlDomInterface $node */
        $node = $this->find($selector, 0);

        return $node instanceof SimpleHtmlDomBlank ? null : $node;
    }
867

868
    /**
     * Clean up serialized markup before it is handed back to the caller.
     *
     * Depending on the "created without ..." flags of this parser, wrapper
     * tags that were not part of the original input (html, head, body, p,
     * DOCTYPE, a fake closing script tag) are stripped again. Afterwards the
     * closing tags of self-closing elements and the internal helper tags are
     * removed, the result is trimmed, entities are decoded and preserved
     * placeholders are put back.
     *
     * @param string $content                  <p>Serialized markup to clean up.</p>
     * @param bool   $multiDecodeNewHtmlEntity <p>Forwarded to decodeHtmlEntity().</p>
     * @param bool   $putBrokenReplacedBack    <p>Forwarded to putReplacedBackToPreserveHtmlEntities().</p>
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false,
        bool $putBrokenReplacedBack = true
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // Patterns are applied one after the other: first a leading "<p>",
            // then every "</p>".
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // https://bugs.php.net/bug.php?id=73175
        $selfClosingEndTags = [];
        foreach ($this->selfClosingTags as $selfClosingTag) {
            $selfClosingEndTags[] = '</' . $selfClosingTag . '>';
        }
        $content = \str_replace($selfClosingEndTags, '', $content);

        // Drop the internal helper tags and collapse a doubled <head> wrapper.
        /** @noinspection HtmlRequiredTitleElement */
        $helperCleanup = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
        ];
        $content = \trim(
            \str_replace(\array_keys($helperCleanup), \array_values($helperCleanup), $content)
        );

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded, $putBrokenReplacedBack);
    }
975

976
    /**
     * Find all elements that carry the given class (selector ".class").
     *
     * @param string $class <p>Class name without the leading dot.</p>
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        $classSelector = '.' . $class;

        return $this->findMulti($classSelector);
    }
987

988
    /**
     * Find a single element by its id (selector "#id").
     *
     * @param string $id <p>Id value without the leading hash.</p>
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        $idSelector = '#' . $id;

        return $this->findOne($idSelector);
    }
999

1000
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name <p>Tag name to look up in the document.</p>
     *
     * @return SimpleHtmlDomInterface
     *                                <p>The wrapped first match, or a SimpleHtmlDomBlank when nothing matches.</p>
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode, $this);
    }
1017

1018
    /**
     * Find elements by id (selector "#id"), optionally picking one by index.
     *
     * @param string   $id  <p>Id value without the leading hash.</p>
     * @param int|null $idx <p>Passed through to find() unchanged.</p>
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        $idSelector = '#' . $id;

        return $this->find($idSelector, $idx);
    }
1030

1031
    /**
     * Collect the elements with the given tag name.
     *
     * @param string   $name <p>Tag name to look up in the document.</p>
     * @param int|null $idx  <p>null returns all matches; otherwise the match at that
     *                       position, where negative values count from the end.</p>
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $matches = new SimpleHtmlDomNode();
        foreach ($this->document->getElementsByTagName($name) as $domNode) {
            $matches[] = new SimpleHtmlDom($domNode, $this);
        }

        // No index requested: hand back the whole collection (or a blank one).
        if ($idx === null) {
            return \count($matches) === 0 ? new SimpleHtmlDomNodeBlank() : $matches;
        }

        // Negative indexes are resolved relative to the end of the collection.
        if ($idx < 0) {
            $idx += \count($matches);
        }

        // NOTE(review): the fallback for a missing single element is a
        // SimpleHtmlDomNodeBlank, whereas getElementByTagName() falls back to
        // SimpleHtmlDomBlank - confirm whether this difference is intended.
        return $matches[$idx] ?? new SimpleHtmlDomNodeBlank();
    }
1066

1067
    /**
     * Get dom node's outer html.
     *
     * Invokes the static callback (if one is set), picks the serialization
     * strategy that fits the document and the running PHP version and passes
     * the result through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Forwarded to fixHtmlOutput().</p>
     * @param bool $putBrokenReplacedBack    <p>Forwarded to fixHtmlOutput().</p>
     *
     * @return string <p>Empty string when serialization fails.</p>
     */
    public function html(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        if (static::$callback !== null) {
            // The callback receives a single argument: an array holding this parser.
            \call_user_func(static::$callback, [$this]);
        }

        if ($this->shouldUseWholeDocumentSerializationForHtmlOnPhpLt8()) {
            $serialized = $this->document->saveHTML();
        } elseif ($this->usesInternalWrapperDocument()) {
            $serialized = $this->serializeInternalWrapperContent();
        } elseif ($this->createdFromNode) {
            $serialized = \PHP_VERSION_ID < 80000
                ? $this->serializeCreatedFromNodeForPhpLt8()
                : $this->serializeChildNodes($this->document);
        } elseif ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            $serialized = $this->document->saveHTML($this->document->documentElement);
        } else {
            $serialized = $this->document->saveHTML();
        }

        return $serialized === false
            ? ''
            : $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
    }
1105

1106
    /**
     * Rename a parser-generated <p> wrapper to the "simpleHtmlDomP" placeholder
     * tag, which fixHtmlOutput() strips from serialized output. This way only
     * the synthetic wrapper disappears instead of every paragraph tag.
     *
     * Nothing happens unless the document was flagged as created without a
     * <p> wrapper, the root element is <html>, a <body> exists and its only
     * significant child (ignoring whitespace-only text nodes) is one <p>.
     *
     * @return void
     */
    private function markSyntheticParagraphWrapper(): void
    {
        if (!$this->isDOMDocumentCreatedWithoutPTagWrapper) {
            return;
        }

        $root = $this->document->documentElement;
        if (!$root instanceof \DOMElement || \strtolower($root->tagName) !== 'html') {
            return;
        }

        $body = $this->document->getElementsByTagName('body')->item(0);
        if (!$body instanceof \DOMElement) {
            return;
        }

        $wrapper = null;
        foreach ($body->childNodes as $bodyChild) {
            // Whitespace-only text nodes do not count as content.
            if ($bodyChild instanceof \DOMText && \trim($bodyChild->nodeValue ?? '') === '') {
                continue;
            }

            // Bail out on a second significant child, or on anything that is not a <p> element.
            if (
                $wrapper !== null
                ||
                !$bodyChild instanceof \DOMElement
                ||
                \strtolower($bodyChild->tagName) !== 'p'
            ) {
                return;
            }

            $wrapper = $bodyChild;
        }

        $wrapperParent = $wrapper instanceof \DOMElement ? $wrapper->parentNode : null;
        if ($wrapperParent === null) {
            return;
        }

        // Move all children over to the placeholder element, then swap it in.
        $placeholder = $this->document->createElement('simpleHtmlDomP');
        while (($movedChild = $wrapper->firstChild) !== null) {
            $placeholder->appendChild($movedChild);
        }

        $wrapperParent->replaceChild($placeholder, $wrapper);
    }
1167

1168
    /**
     * Serialize a single DOM node to HTML.
     *
     * By default the node is imported into a fresh, detached DOMDocument and
     * serialized from there, so the result does not depend on the internal
     * wrapper tag of the owning document (older libxml HTML serializers treat
     * unknown hyphenated tags as block-level and inject formatting newlines
     * into the wrapper's children when saving the full document).
     *
     * Exception: on PHP < 8.0, script and style elements are serialized from
     * their owner document instead, and one trailing "\n" that older libxml
     * appends after raw-text elements is removed.
     *
     * @param \DOMNode $node
     *
     * @return string <p>Empty string when the node cannot be imported or serialized.</p>
     */
    private function serializeNode(\DOMNode $node): string
    {
        $isRawTextElementOnLegacyPhp = \PHP_VERSION_ID < 80000
            && $node instanceof \DOMElement
            && \in_array(\strtolower($node->tagName), ['script', 'style'], true);

        if ($isRawTextElementOnLegacyPhp) {
            // PHP < 8.0 script/style: serialize from the original document.
            $ownerDocument = $node->ownerDocument;
            $html = $ownerDocument === null ? false : $ownerDocument->saveHTML($node);

            // Remove only one trailing "\n" so that real user-provided trailing
            // newlines in the content stay intact.
            if ($html !== false && \substr($html, -1) === "\n") {
                $html = \substr($html, 0, -1);
            }
        } else {
            $detachedDocument = new \DOMDocument('1.0', $this->getEncoding());
            $detachedDocument->preserveWhiteSpace = true;
            $detachedDocument->formatOutput = false;

            $importedNode = $detachedDocument->importNode($node, true);
            // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
            if (!$importedNode instanceof \DOMNode) {
                return '';
            }

            $detachedDocument->appendChild($importedNode);

            $html = $detachedDocument->saveHTML($importedNode);
        }

        return $html === false ? '' : $html;
    }
1229

1230
    /**
     * Serialize the single element that was imported via the node-backed
     * constructor, for PHP < 8.0.
     *
     * On PHP < 8, saveHTML($node) with a node argument always injects
     * formatting newlines between block-level child elements and a trailing
     * "\n" after raw-text elements (script, style), even with formatOutput
     * set to false.  saveHTML() called without a node argument respects
     * formatOutput=false and does not inject those newlines.
     *
     * We call saveHTML() on the constructor document (which already has the
     * imported element as its only child / documentElement) and strip the
     * leading DOCTYPE and the structural wrappers (html, body) that libxml may
     * add around elements that are not recognised HTML root elements.
     *
     * @return string <p>Empty string when serialization fails.</p>
     */
    private function serializeCreatedFromNodeForPhpLt8(): string
    {
        $full = $this->document->saveHTML();
        if ($full === false) {
            return '';
        }

        // Strip only the DOCTYPE declaration that libxml prepends. The pattern
        // is anchored to the start and limited to one replacement so that
        // DOCTYPE-looking text inside raw-text content (script, style,
        // comments) is left untouched.
        $full = (string) \preg_replace('/^\s*<!DOCTYPE[^>]+>/i', '', $full, 1);
        $full = \trim($full);

        $documentElement = $this->document->documentElement;
        $tagName = $documentElement instanceof \DOMElement
            ? \strtolower($documentElement->tagName)
            : '';

        // Strip the <html>...</html> wrapper added by libxml when the root
        // element is not the HTML element itself.
        if ($tagName !== 'html') {
            $full = (string) \preg_replace('/^<html[^>]*>/i', '', $full);
            $full = (string) \preg_replace('/<\/html>$/i', '', $full);
            $full = \trim($full);

            // Strip the <body>...</body> wrapper added for non-body elements.
            if ($tagName !== 'body') {
                $full = (string) \preg_replace('/^<body[^>]*>/i', '', $full);
                $full = (string) \preg_replace('/<\/body>$/i', '', $full);
                // Remove a trailing empty <body> libxml may add for <head> roots.
                $full = \str_replace('<body></body>', '', $full);
                $full = \trim($full);
            }
        }

        return $full;
    }
1282

1283
    /**
     * Serialize every direct child of the given node via serializeNode() and
     * concatenate the results in document order.
     *
     * @param \DOMNode $parentNode
     *
     * @return string
     */
    private function serializeChildNodes(\DOMNode $parentNode): string
    {
        $fragments = [];

        foreach ($parentNode->childNodes as $child) {
            $fragments[] = $this->serializeNode($child);
        }

        return \implode('', $fragments);
    }
1298

1299
    /**
     * Tell whether the root element of the document is the internal wrapper
     * helper tag (self::$domHtmlWrapperHelper).
     *
     * @return bool
     */
    private function usesInternalWrapperDocument(): bool
    {
        $root = $this->document->documentElement;

        return $root instanceof \DOMElement
            && $root->tagName === self::$domHtmlWrapperHelper;
    }
1307

1308
    /**
     * Detect an <html> document whose <body> has child nodes while its <head>
     * is missing or empty.
     *
     * Older libxml preserves body-only fragments more faithfully when the whole
     * temporary document is serialized and fixHtmlOutput() removes the wrappers
     * afterwards. Head-only fragments still need root-element serialization, or
     * <meta charset=...> can trigger output re-encoding (e.g. utf-7).
     *
     * @return bool
     */
    private function isBodyOnlyHtmlFragmentDocument(): bool
    {
        $root = $this->document->documentElement;
        if (!$root instanceof \DOMElement) {
            return false;
        }

        if (\strtolower($root->tagName) !== 'html') {
            return false;
        }

        $head = $root->getElementsByTagName('head')->item(0);
        $body = $root->getElementsByTagName('body')->item(0);

        // Any content inside <head> disqualifies the document.
        if ($head instanceof \DOMElement && $head->childNodes->length > 0) {
            return false;
        }

        return $body instanceof \DOMElement && $body->childNodes->length > 0;
    }
1329

1330
    /**
     * Decide whether html() should serialize the whole document via saveHTML()
     * without a node argument. This can only be true on PHP < 8.0.
     *
     * @return bool
     */
    private function shouldUseWholeDocumentSerializationForHtmlOnPhpLt8(): bool
    {
        if (\PHP_VERSION_ID >= 80000) {
            return false;
        }

        // Documents rooted at the internal wrapper always use the whole-document path.
        if ($this->usesInternalWrapperDocument()) {
            return true;
        }

        if (!$this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            return false;
        }

        $root = $this->document->documentElement;
        if (!$root instanceof \DOMElement) {
            return false;
        }

        if (\strtolower($root->tagName) !== 'html') {
            return true;
        }

        return $this->isBodyOnlyHtmlFragmentDocument();
    }
1352

1353
    /**
     * Decide whether innerHtml() should serialize the whole document via
     * saveHTML(). This can only be true on PHP < 8.0, for documents rooted at
     * the internal wrapper or for body-only html fragments.
     *
     * @return bool
     */
    private function shouldUseWholeDocumentSerializationForInnerHtmlOnPhpLt8(): bool
    {
        if (\PHP_VERSION_ID >= 80000) {
            return false;
        }

        return $this->usesInternalWrapperDocument()
            || $this->isBodyOnlyHtmlFragmentDocument();
    }
1361

1362
    /**
     * Serialize the children of the root element and put the helper wrapper
     * tag (self::$domHtmlWrapperHelper) around the result again.
     *
     * Keeping the wrapper markers around the detached child serialization
     * prevents fixHtmlOutput() from trimming leading/trailing fragment
     * whitespace.
     *
     * @return string <p>Empty string when the document has no root element.</p>
     */
    private function serializeInternalWrapperContent(): string
    {
        $root = $this->document->documentElement;
        if ($root === null) {
            return '';
        }

        return \sprintf(
            '<%1$s>%2$s</%1$s>',
            self::$domHtmlWrapperHelper,
            $this->serializeChildNodes($root)
        );
    }
1380

1381
    /**
     * Check whether the given fragment has more than one significant
     * top-level node.
     *
     * The fragment is wrapped in the internal wrapper tag and parsed as XML;
     * the significant direct children of the wrapper are then counted. This is
     * more reliable than regex for fragments whose top-level elements have
     * attributes or nested markup. The previous libxml internal-error setting
     * is restored afterwards.
     *
     * @param string $html
     * @param int    $optionsXml <p>libxml options passed to simplexml_load_string().</p>
     *
     * @return bool <p>false as well when the probe does not parse cleanly as XML.</p>
     */
    private function hasMultipleTopLevelNodes(string $html, int $optionsXml): bool
    {
        $previousErrorSetting = \libxml_use_internal_errors(true);
        try {
            \libxml_clear_errors();

            $wrapperTag = self::$domHtmlWrapperHelper;
            $probe = '<' . $wrapperTag . '>'
                . self::replaceToPreserveHtmlEntities($html)
                . '</' . $wrapperTag . '>';

            $parsed = \simplexml_load_string($probe, \SimpleXMLElement::class, $optionsXml);
            if ($parsed === false || \libxml_get_errors() !== []) {
                return false;
            }

            $wrapperElement = \dom_import_simplexml($parsed);

            return $wrapperElement instanceof \DOMElement
                && $this->countSignificantChildNodes($wrapperElement) > 1;
        } finally {
            \libxml_clear_errors();
            \libxml_use_internal_errors($previousErrorSetting);
        }
    }
1417

1418
    /**
     * Count the direct children of a node, ignoring whitespace-only text
     * nodes. Counting stops as soon as a second significant child is seen,
     * so the result is 0, 1 or 2.
     *
     * @param \DOMNode $node
     *
     * @return int
     */
    private function countSignificantChildNodes(\DOMNode $node): int
    {
        $significant = 0;

        foreach ($node->childNodes as $child) {
            $isBlankText = $child->nodeType === \XML_TEXT_NODE
                && \trim($child->textContent) === '';
            if ($isBlankText) {
                continue;
            }

            // Callers only need "more than one", so stop at two.
            if (++$significant > 1) {
                break;
            }
        }

        return $significant;
    }
1444

1445
    /**
     * {@inheritdoc}
     *
     * Serializes the content below the root element (or the whole document on
     * PHP < 8.0 where required) and passes it through fixHtmlOutput().
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        $serialized = '';

        $root = $this->document->documentElement;
        if ($root) {
            if ($this->shouldUseWholeDocumentSerializationForInnerHtmlOnPhpLt8()) {
                $serialized = $this->document->saveHTML();
            } elseif ($this->usesInternalWrapperDocument()) {
                $serialized = $this->serializeInternalWrapperContent();
            } else {
                $serialized = $this->serializeChildNodes($root);
            }
        }

        // saveHTML() signals failure with false.
        if ($serialized === false) {
            $serialized = '';
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
    }
1470

1471
    /**
     * Get dom node's plain text.
     *
     * All text nodes of the document are concatenated in document order,
     * except those inside <script>, <style> or the special script helper
     * element (self::$domHtmlSpecialScriptHelper). Other text such as the
     * <title> content is kept. The result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Forwarded to fixHtmlOutput().</p>
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $query = \sprintf(
            '//text()[not(ancestor::script or ancestor::style or ancestor::%s)]',
            self::$domHtmlSpecialScriptHelper
        );

        $textNodes = (new \DOMXPath($this->document))->query($query);

        $plainText = '';
        if ($textNodes !== false) {
            foreach ($textNodes as $textNode) {
                $plainText .= $textNode->nodeValue;
            }
        }

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
1502

1503
    /**
     * Load HTML from string.
     *
     * Replaces the current document with the one createDOMDocument() builds
     * from the given markup.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions      <p>Forwarded to createDOMDocument().</p>
     * @param bool     $useDefaultLibXMLOptions <p>Forwarded to createDOMDocument().</p>
     *
     * @return $this
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
    {
        $newDocument = $this->createDOMDocument($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
        $this->document = $newDocument;

        return $this;
    }
1518

1519
    /**
     * Load HTML from file.
     *
     * Paths that do not start with "http://" or "https://" must point to an
     * existing regular file. The content is read via
     * \voku\helper\UTF8::file_get_contents() when that class is available,
     * otherwise via \file_get_contents(), and is then passed to loadHtml().
     *
     * @param string   $filePath                <p>Local path or http(s) URL.</p>
     * @param int|null $libXMLExtraOptions      <p>Forwarded to loadHtml().</p>
     * @param bool     $useDefaultLibXMLOptions <p>Forwarded to loadHtml().</p>
     *
     * @throws \RuntimeException <p>If the file is missing, is not a regular file or cannot be read.</p>
     *
     * @return $this
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
    {
        if (!\preg_match("/^https?:\/\//i", $filePath)) {
            if (!\file_exists($filePath)) {
                throw new \RuntimeException('File ' . $filePath . ' not found');
            }

            if (!\is_file($filePath)) {
                throw new \RuntimeException('Could not load file ' . $filePath);
            }
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                $html = \voku\helper\UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause is not lost.
            throw new \RuntimeException('Could not load file ' . $filePath, 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException('Could not load file ' . $filePath);
        }

        return $this->loadHtml($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
    }
1558

1559
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * @param bool $multiDecodeNewHtmlEntity <p>Forwarded to the entity decoding step.</p>
     * @param bool $htmlToXml                <p>true: clean the XML via fixHtmlOutput(); false: only decode
     *                                       entities and put preserved placeholders back.</p>
     * @param bool $removeXmlHeader          <p>Remove the "<?xml ... ?>" header from the output.</p>
     * @param int  $options                  <p>Options passed to DOMDocument::saveXML().</p>
     *
     * @return string <p>Empty string when serialization fails.</p>
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            // NOTE(review): the pattern is not anchored to the start of the output, so
            // any "<?xml ... ?>" sequence on a single line is removed - confirm intended.
            $serialized = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $serialized));
        }

        if (!$htmlToXml) {
            $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
1594

1595
    /**
     * Allow the parser object to be called like a function; the call is
     * delegated to find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
1605

1606
    /**
     * Expose the "created without head wrapper" flag. When it is true,
     * fixHtmlOutput() strips "<head>" and "</head>" from the output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
1613

1614
    /**
     * Expose the "created without p-tag wrapper" flag. The synthetic <p>
     * wrapper is only marked by markSyntheticParagraphWrapper() when the
     * flag is true.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
1621

1622
    /**
     * Expose the "created without html" flag. When it is true,
     * fixHtmlOutput() removes the HTML 4.0 Transitional DOCTYPE from the
     * output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
1629

1630
    /**
     * Expose the "created without body wrapper" flag. When it is true,
     * fixHtmlOutput() strips "<body>" and "</body>" from the output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
1637

1638
    /**
     * Expose the "created with multi root" flag of this parser.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithMultiRoot(): bool
    {
        return $this->isDOMDocumentCreatedWithMultiRoot;
    }
1645

1646
    /**
     * Expose the "created without html wrapper" flag. When it is true,
     * fixHtmlOutput() strips "<html>" and "</html>" from the output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
1653

1654
    /**
     * Expose the "created without wrapper" flag. When it is true,
     * fixHtmlOutput() removes a leading "<p>" and the "</p>" tags from the
     * output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
1661

1662
    /**
     * Expose the "created with fake end script" flag. When it is true,
     * fixHtmlOutput() strips "</script>" from the output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
1669

1670
    /**
     * Protect broken (unbalanced) closing tags from the DOM parser.
     *
     * Pass 1 masks the angle brackets of every properly paired element with
     * placeholder tokens, repeating until nothing changes. Pass 2 replaces the
     * remaining runs of closing tags by a hash token and registers the
     * original text (with the masks undone) through
     * registerDynamicDomBrokenReplaceHelper(). Finally all remaining masks are
     * turned back into real angle brackets.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $maskedTokens = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $realTokens = ['</', '<', '>'];

        // Pass 1: mask the brackets of well-formed element pairs.
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($pair) {
                    $maskedOpenTag = '°lt_simple_html_dom__voku_°'
                        . $pair['element_start'] . $pair['element_start_addon']
                        . '°gt_simple_html_dom__voku_°';
                    $maskedCloseTag = '°lt/_simple_html_dom__voku_°'
                        . $pair['element_end']
                        . '°gt_simple_html_dom__voku_°';

                    return $pair['start'] . $maskedOpenTag . $pair['value'] . $maskedCloseTag . $pair['end'];
                },
                $html
            );
        } while ($before !== $html);

        // Pass 2: swap the remaining closing-tag runs for registered hash tokens.
        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:<\/\w+(?:\s+\w+=\"[^"]+\")*+[^<]+>)+)(?<end>.*)/u',
                function ($found) use ($maskedTokens, $realTokens) {
                    $brokenMarkup = \str_replace($maskedTokens, $realTokens, $found['broken']);

                    $hashToken = self::$domHtmlBrokenHtmlHelper . \crc32($brokenMarkup);
                    $this->registerDynamicDomBrokenReplaceHelper($brokenMarkup, $hashToken);

                    return $found['start'] . $hashToken . $found['end'];
                },
                $html
            );
        } while ($before !== $html);

        return \str_replace($maskedTokens, $realTokens, $html);
    }
1720

1721
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Inline "<svg ...>...</svg>" markup inside a "(data:image/svg...)"
     * expression is replaced by a hash token; the original markup is
     * registered through registerDynamicDomBrokenReplaceHelper(). $html is
     * modified in place and left untouched when the regex engine reports an
     * error.
     *
     * @param string $html
     *
     * @return void
     */
    protected function keepSpecialSvgTags(string &$html)
    {
        // regEx for e.g.: [mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg">...</svg>')]
        /** @noinspection HtmlDeprecatedTag */
        $replaced = \preg_replace_callback(
            '/\((["\'])?(?<start>data:image\/svg.*)<svg(?<attr>[^>]*?)>(?<content>.*)<\/svg>\1\)/isU',
            function (array $match) {
                $svgMarkup = '<svg' . $match['attr'] . '>' . $match['content'] . '</svg>';
                $hashToken = self::$domHtmlBrokenHtmlHelper . \crc32($svgMarkup);
                $this->registerDynamicDomBrokenReplaceHelper($svgMarkup, $hashToken);

                // $match[1] is the optional quote character around the data URI.
                return '(' . $match[1] . $match['start'] . $hashToken . $match[1] . ')';
            },
            $html
        );

        // preg_replace_callback() returns null on failure; keep the input then.
        if ($replaced !== null) {
            $html = $replaced;
        }
    }
1749

1750
    /**
1751
     * @param string $html
1752
     *
1753
     * @return void
1754
     */
1755
    protected function keepSpecialScriptTags(string &$html)
    {
        // Rewrites "<script type=...>" blocks whose type is one of the configured special
        // script types, so that their content is not handed to the DOM parser as-is.
        // The HTML is modified in place through the by-reference parameter.

        // Build the regex alternation; every type is quoted so it is matched literally.
        $tags = \implode('|', \array_map(
            static function ($value) {
                return \preg_quote($value, '/');
            },
            $this->specialScriptTags
        ));

        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $htmlTmp = \preg_replace_callback(
            '/(?<start>(<script [^>]*type=["\']?(?:' . $tags . ')+[^>]*>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {
                // Check for logic in special script tags containing EJS/ERB-style template syntax
                // (e.g. <% ... %> blocks), because often this looks like non-valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                    if (\strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) !== false) {
                        // remove the html5 fallback
                        $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                        // Swap the whole inner content for a placeholder hash and register the
                        // original content under that hash.
                        $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);
                        $this->registerDynamicDomBrokenReplaceHelper($matches['innerContent'], $matchesHash);

                        return $matches['start'] . $matchesHash . $matches['end'];
                    }
                }

                // remove the html5 fallback
                $matches[0] = \str_replace('<\/', '</', $matches[0]);

                // No template logic found: rename the element itself by replacing the leading
                // "<script" and the trailing "</script>" with the special helper tag name.
                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        // preg_replace_callback() returns null on a PCRE error (e.g. when the backtrack limit
        // is hit). Casting that null to string would silently replace the whole document with
        // an empty string, so the input is kept unchanged instead.
        if ($htmlTmp !== null) {
            $html = $htmlTmp;
        }
    }
1791

1792
    /**
1793
     * @param bool $keepBrokenHtml
1794
     *
1795
     * @return $this
1796
     */
1797
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        // Remember the flag on this instance.
        $this->keepBrokenHtml = $keepBrokenHtml;

        // Return the same instance so that calls can be chained.
        return $this;
    }
1803

1804
    /**
1805
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
1806
     *
1807
     * @return $this
1808
     */
1809
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        // Validate the complete list first: the property is only assigned once every entry
        // has passed, so a rejected list leaves the current configuration unchanged.
        foreach ($templateLogicSyntaxInSpecialScriptTags as $syntaxMarker) {
            // @phpstan-ignore function.alreadyNarrowedType (runtime guard kept for public API validation)
            if (!\is_string($syntaxMarker)) {
                throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
            }
        }

        // Replace (not merge) the previously configured markers.
        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1822

1823
    /**
1824
     * @param string[] $specialScriptTags
1825
     *
1826
     * @return $this
1827
     */
1828
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        // Reject the whole list as soon as one entry is not a string; the property is only
        // assigned after all entries have been checked.
        foreach ($specialScriptTags as $scriptType) {
            // @phpstan-ignore function.alreadyNarrowedType (runtime guard kept for public API validation)
            if (!\is_string($scriptType)) {
                throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
            }
        }

        // Replace (not merge) the previously configured script types.
        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1841

1842
    /**
1843
     * @param callable $callbackXPathBeforeQuery
1844
     *
1845
     * @phpstan-param callable(string $cssSelectorString, string $xPathString,\DOMXPath,\voku\helper\HtmlDomParser): string $callbackXPathBeforeQuery
1846
     *
1847
     * @return $this
1848
     */
1849
    public function setCallbackXPathBeforeQuery(callable $callbackXPathBeforeQuery): self
    {
        // Store the callable on this instance; it is not invoked here.
        $this->callbackXPathBeforeQuery = $callbackXPathBeforeQuery;

        // Return the same instance so that calls can be chained.
        return $this;
    }
1855

1856
    /**
1857
     * @param callable $callbackBeforeCreateDom
1858
     *
1859
     * @phpstan-param callable(string $htmlString, \voku\helper\HtmlDomParser): string $callbackBeforeCreateDom
1860
     *
1861
     * @return $this
1862
     */
1863
    public function setCallbackBeforeCreateDom(callable $callbackBeforeCreateDom): self
    {
        // Store the callable on this instance; it is not invoked here.
        $this->callbackBeforeCreateDom = $callbackBeforeCreateDom;

        // Return the same instance so that calls can be chained.
        return $this;
    }
1869
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc