• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

voku / simple_html_dom / 25253018666

02 May 2026 01:30PM UTC coverage: 96.77% (-0.02%) from 96.791%
25253018666

Pull #145

github

web-flow
Merge 17f0ed49c into 7d12f7c70
Pull Request #145: Add Infection + PHPStan CI check on PHP 8.3 and fix PHP 7.x template serialization

28 of 32 new or added lines in 1 file covered. (87.5%)

23 existing lines in 1 file now uncovered.

2187 of 2260 relevant lines covered (96.77%)

292.76 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.47
/src/voku/helper/HtmlDomParser.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace voku\helper;
6

7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var callable|null
38
     *
39
     * @phpstan-var null|callable(string $cssSelectorString, string $xPathString, \DOMXPath, \voku\helper\HtmlDomParser): string
40
     */
41
    private $callbackXPathBeforeQuery;
42

43
    /**
44
     * @var callable|null
45
     *
46
     * @phpstan-var null|callable(string $htmlString, \voku\helper\HtmlDomParser): string
47
     */
48
    private $callbackBeforeCreateDom;
49

50
    /**
51
     * @var string[]
52
     */
53
    protected static $functionAliases = [
54
        'outertext' => 'html',
55
        'outerhtml' => 'html',
56
        'innertext' => 'innerHtml',
57
        'innerhtml' => 'innerHtml',
58
        'load'      => 'loadHtml',
59
        'load_file' => 'loadHtmlFile',
60
    ];
61

62
    /**
63
     * @var string[]
64
     */
65
    protected $templateLogicSyntaxInSpecialScriptTags = [
66
        '+',
67
        '<%',
68
        '{%',
69
        '{{',
70
    ];
71

72
    /**
73
     * List of script "type" attribute values that are treated as special script tags.
74
     *
75
     * ```php
76
     * protected $specialScriptTags = [
77
     *     'text/html',
78
     *     'text/template',
79
     *     'text/x-custom-template',
80
     *     'text/x-handlebars-template'
81
     * ]
82
     * ```
83
     *
84
     * @var string[]
85
     */
86
    protected $specialScriptTags = [
87
        'text/html',
88
        'text/template',
89
        'text/x-custom-template',
90
        'text/x-handlebars-template',
91
    ];
92

93
    /**
94
     * @var string[]
95
     */
96
    protected $selfClosingTags = [
97
        'area',
98
        'base',
99
        'br',
100
        'col',
101
        'command',
102
        'embed',
103
        'hr',
104
        'img',
105
        'input',
106
        'keygen',
107
        'link',
108
        'meta',
109
        'param',
110
        'source',
111
        'track',
112
        'wbr',
113
    ];
114

115
    /**
116
     * @var bool
117
     */
118
    protected $isDOMDocumentCreatedWithoutHtml = false;
119

120
    /**
121
     * @var bool
122
     */
123
    protected $isDOMDocumentCreatedWithoutWrapper = false;
124

125
    /**
126
     * @var bool
127
     */
128
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
129

130
    /**
131
     * @var bool
132
     */
133
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
134

135
    /**
136
     * @var bool
137
     */
138
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
139

140
    /**
141
     * @var bool
142
     */
143
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
144

145
    /**
146
     * @var bool
147
     */
148
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
149

150
    /**
151
     * @var bool
152
     */
153
    protected $isDOMDocumentCreatedWithMultiRoot = false;
154

155
    /**
156
     * @var bool
157
     */
158
    protected $isDOMDocumentCreatedWithEdgeWhitespace = false;
159

160
    /**
161
     * @var bool
162
     */
163
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
164

165
    /**
166
     * @var bool
167
     */
168
    protected $createdFromNode = false;
169

170
    /**
171
     * @var bool
172
     */
173
    protected $keepBrokenHtml = false;
174

175
    /**
     * Create a parser, optionally seeded with content.
     *
     * A fresh DOMDocument (version "1.0", encoding from getEncoding()) is
     * always created first; whitespace is preserved and output formatting is
     * disabled. The seed is then handled by type:
     *
     * - SimpleHtmlDomInterface: unwrapped via getNode() and treated as a node.
     * - \DOMDocument: serialized with saveHTML() and re-loaded via loadHtml().
     * - any other \DOMNode: deep-imported into the new document and appended;
     *   the parser is flagged as created from a node.
     * - any other non-null value: passed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $freshDocument = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $freshDocument->preserveWhiteSpace = true;
        $freshDocument->formatOutput = false;
        $this->document = $freshDocument;

        $source = $element instanceof SimpleHtmlDomInterface
            ? $element->getNode()
            : $element;

        if ($source instanceof \DOMDocument) {
            // Must be tested before the generic \DOMNode branch.
            $serialized = $source->saveHTML();
            if ($serialized !== false) {
                $this->loadHtml($serialized);
            }
        } elseif ($source instanceof \DOMNode) {
            $this->createdFromNode = true;

            $importedNode = $this->document->importNode($source, true);

            // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
            if ($importedNode instanceof \DOMNode) {
                $this->document->appendChild($importedNode);
            }
        } elseif ($source !== null) {
            $this->loadHtml($source);
        }
    }
216

217
    /**
     * Dispatch legacy instance-method aliases (see self::$functionAliases).
     *
     * The called name is lower-cased before lookup, so aliases are matched
     * case-insensitively. Arguments are forwarded unchanged to the aliased
     * method.
     *
     * @param string       $name
     * @param array<mixed> $arguments
     *
     * @throws \BadMethodCallException if no alias exists for the given name
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = self::$functionAliases[$alias];

        return $this->{$target}(...$arguments);
    }
235

236
    /**
     * Dispatch the legacy static factories "str_get_html" and "file_get_html".
     *
     * The first argument (default: empty string) is the HTML string or file
     * path, the second (default: null) is forwarded as extra libxml options.
     * The name is matched exactly (case-sensitive).
     *
     * @param string       $name
     * @param array<mixed> $arguments
     *
     * @throws \BadMethodCallException if the name is not one of the two factories
     * @throws \RuntimeException       NOTE(review): presumably propagated from the load methods; not visible here
     *
     * @return static
     */
    public static function __callStatic($name, $arguments)
    {
        $loadsFromString = $name === 'str_get_html';
        $loadsFromFile = $name === 'file_get_html';

        if (!$loadsFromString && !$loadsFromFile) {
            throw new \BadMethodCallException('Method does not exist');
        }

        $source = $arguments[0] ?? '';
        $extraOptions = $arguments[1] ?? null;

        $parser = self::createStaticParser();

        if ($loadsFromString) {
            return $parser->loadHtml($source, $extraOptions);
        }

        return $parser->loadHtmlFile($source, $extraOptions);
    }
265

266
    /**
     * Instantiate the class the static factory was called on (late static
     * binding), without any initial content.
     *
     * @return static
     */
    private static function createStaticParser()
    {
        // @phpstan-ignore new.static (factory methods intentionally preserve late static binding)
        $parser = new static();

        return $parser;
    }
274

275
    /** @noinspection MagicMethodsValidityInspection */
276

277
    /**
     * Resolve the magic read-only properties (matched case-insensitively).
     *
     * - "outerhtml" / "outertext"  => html()
     * - "innerhtml" / "innertext"  => innerHtml()
     * - "innerhtmlkeep"            => innerHtml(false, false)
     * - "text" / "plaintext"       => text()
     *
     * @param string $name
     *
     * @return string|null null for any unknown property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'innerhtmlkeep') {
            return $this->innerHtml(false, false);
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
302

303
    /**
     * String conversion: identical to calling html().
     *
     * @return string
     */
    public function __toString()
    {
        return $this->html();
    }
310

311
    /**
     * No-op kept only for API compatibility; nothing is released or reset.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        return true;
    }
322

323
    /**
324
     * Create DOMDocument from HTML.
325
     *
326
     * @param string   $html
327
     * @param int|null $libXMLExtraOptions
328
     * @param bool     $useDefaultLibXMLOptions
329
     *
330
     * @return \DOMDocument
331
     */
332
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): \DOMDocument
333
    {
334
        $this->resetDynamicDomHelpers();
2,177✔
335

336
        if ($this->callbackBeforeCreateDom) {
2,177✔
337
            $html = \call_user_func($this->callbackBeforeCreateDom, $html, $this);
7✔
338
        }
339

340
        // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
341
        $isDOMDocumentCreatedWithDoctype = false;
2,177✔
342
        if (\stripos($html, '<!DOCTYPE') !== false) {
2,177✔
343
            $isDOMDocumentCreatedWithDoctype = true;
427✔
344
            if (
345
                \preg_match('/(^.*?)<!DOCTYPE(?: [^>]*)?>/sui', $html, $matches_before_doctype)
427✔
346
                &&
347
                \trim($matches_before_doctype[1])
427✔
348
            ) {
349
                $html = \str_replace($matches_before_doctype[1], '', $html);
14✔
350
            }
351
        }
352

353
        if ($this->keepBrokenHtml) {
2,177✔
354
            $html = $this->keepBrokenHtml(\trim($html));
35✔
355
        }
356

357
        if (\strpos($html, '<') === false) {
2,177✔
358
            $this->isDOMDocumentCreatedWithoutHtml = true;
98✔
359
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
2,163✔
360
            $this->isDOMDocumentCreatedWithoutWrapper = true;
70✔
361
        }
362

363
        if (\strpos(\ltrim($html), '<!--') === 0) {
2,177✔
364
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
84✔
365
        }
366

367
        /** @noinspection HtmlRequiredLangAttribute */
368
        if (
369
            \strpos($html, '<html ') === false
2,177✔
370
            &&
371
            \strpos($html, '<html>') === false
2,177✔
372
        ) {
373
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
1,393✔
374
        }
375

376
        if (
377
            \strpos($html, '<body ') === false
2,177✔
378
            &&
379
            \strpos($html, '<body>') === false
2,177✔
380
        ) {
381
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
1,407✔
382
        }
383

384
        if (
385
            $this->isDOMDocumentCreatedWithoutHtmlWrapper
2,177✔
386
            &&
387
            $this->isDOMDocumentCreatedWithoutBodyWrapper
2,177✔
388
            &&
389
            \trim($html) !== $html
2,177✔
390
            &&
391
            \substr_count($html, '</') >= 2
2,177✔
392
            &&
393
            \preg_match('#^\s*<([a-zA-Z][^\\s>/]*)>.*?</\\1>#su', $html) === 1
2,177✔
394
        ) {
395
            $this->isDOMDocumentCreatedWithEdgeWhitespace = true;
28✔
396
        }
397

398
        /** @noinspection HtmlRequiredTitleElement */
399
        if (
400
            \strpos($html, '<head ') === false
2,177✔
401
            &&
402
            \strpos($html, '<head>') === false
2,177✔
403
        ) {
404
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
1,778✔
405
        }
406

407
        if (
408
            \stripos($html, '<p ') === false
2,177✔
409
            &&
410
            \stripos($html, '<p>') === false
2,177✔
411
        ) {
412
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
1,316✔
413
        }
414

415
        if (
416
            \strpos($html, '</script>') === false
2,177✔
417
            &&
418
            \strpos($html, '<\/script>') !== false
2,177✔
419
        ) {
420
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
7✔
421
        }
422

423
        if (\stripos($html, '</html>') !== false) {
2,177✔
424
            /** @noinspection NestedPositiveIfStatementsInspection */
425
            if (
426
                \preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
875✔
427
                &&
428
                \trim($matches_after_html[1])
875✔
429
            ) {
430
                $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
28✔
431
            }
432
        }
433

434
        if (\strpos($html, '<script') !== false) {
2,177✔
435
            // keepSpecialScriptTags must run before html5FallbackForScriptTags so
436
            // that special-type scripts (type="text/html", etc.) are converted to
437
            // the simplevokuspecialscript placeholder element before the script-tag
438
            // regex runs.  On PHP < 8.0 the regex uses hash placeholders; if it
439
            // ran first the special-script content would be hashed and
440
            // keepSpecialScriptTags would only see the hash, losing the ability to
441
            // pass the real HTML content to the DOM for error-recovery parsing.
442
            foreach ($this->specialScriptTags as $tag) {
168✔
443
                if (\strpos($html, $tag) !== false) {
168✔
444
                    $this->keepSpecialScriptTags($html);
49✔
445
                    break;
49✔
446
                }
447
            }
448

449
            $this->html5FallbackForScriptTags($html);
168✔
450
        }
451

452
        if (\strpos($html, '<svg') !== false) {
2,177✔
453
            $this->keepSpecialSvgTags($html);
329✔
454
        }
455

456
        $html = \str_replace(
2,177✔
457
            \array_map(static function ($e) {
2,177✔
458
                return '<' . $e . '>';
2,177✔
459
            }, $this->selfClosingTags),
2,177✔
460
            \array_map(static function ($e) {
2,177✔
461
                return '<' . $e . '/>';
2,177✔
462
            }, $this->selfClosingTags),
2,177✔
463
            $html
2,177✔
464
        );
2,177✔
465

466
        // set error level
467
        $internalErrors = \libxml_use_internal_errors(true);
2,177✔
468
        if (\PHP_VERSION_ID < 80000) {
2,177✔
469
            $disableEntityLoader = \libxml_disable_entity_loader(true);
622✔
470
        }
471
        \libxml_clear_errors();
2,177✔
472

473
        $optionsXml = 0;
2,177✔
474
        if ($useDefaultLibXMLOptions) {
2,177✔
475
            $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;
2,177✔
476

477
            if (\defined('LIBXML_BIGLINES')) {
2,177✔
478
                $optionsXml |= \LIBXML_BIGLINES;
2,177✔
479
            }
480

481
            if (\defined('LIBXML_COMPACT')) {
2,177✔
482
                $optionsXml |= \LIBXML_COMPACT;
2,177✔
483
            }
484

485
            if (\defined('LIBXML_HTML_NODEFDTD')) {
2,177✔
486
                $optionsXml |= \LIBXML_HTML_NODEFDTD;
2,177✔
487
            }
488
        }
489

490
        if ($libXMLExtraOptions !== null) {
2,177✔
491
            $optionsXml |= $libXMLExtraOptions;
63✔
492
        }
493

494
        if (
495
            $this->isDOMDocumentCreatedWithoutHtmlWrapper
2,177✔
496
            &&
497
            $this->isDOMDocumentCreatedWithoutBodyWrapper
2,177✔
498
        ) {
499
            $this->isDOMDocumentCreatedWithMultiRoot = $this->hasMultipleTopLevelNodes($html, $optionsXml);
1,365✔
500
        }
501

502
        if (
503
            $this->isDOMDocumentCreatedWithMultiRoot
2,177✔
504
            ||
505
            $this->isDOMDocumentCreatedWithEdgeWhitespace
2,033✔
506
            ||
507
            $this->isDOMDocumentCreatedWithoutWrapper
2,021✔
508
            ||
509
            $this->isDOMDocumentCreatedWithCommentWrapper
1,972✔
510
            ||
511
            (
512
                !$isDOMDocumentCreatedWithDoctype
2,177✔
513
                &&
2,177✔
514
                $this->keepBrokenHtml
2,177✔
515
            )
516
        ) {
517
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
366✔
518
        }
519

520
        $html = self::replaceToPreserveHtmlEntities($html);
2,177✔
521

522
        $documentFound = false;
2,177✔
523
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
2,177✔
524
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
2,177✔
525
            $domElementTmp = \dom_import_simplexml($sxe);
1,375✔
526
            if ($domElementTmp->ownerDocument instanceof \DOMDocument) {
1,375✔
527
                $documentFound = true;
1,375✔
528
                $this->document = $domElementTmp->ownerDocument;
1,375✔
529
            }
530
        }
531

532
        if ($documentFound === false) {
2,177✔
533
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
534
            $xmlHackUsed = false;
944✔
535
            if (\stripos('<?xml', $html) !== 0) {
944✔
536
                $xmlHackUsed = true;
939✔
537
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
939✔
538
            }
539

540
            if ($html !== '') {
944✔
541
                $this->document->loadHTML($html, $optionsXml);
939✔
542
            }
543

544
            // remove the "xml-encoding" hack
545
            if ($xmlHackUsed) {
944✔
546
                foreach ($this->document->childNodes as $child) {
939✔
547
                    if ($child->nodeType === \XML_PI_NODE) {
939✔
548
                        $this->document->removeChild($child);
939✔
549

550
                        break;
939✔
551
                    }
552
                }
553
            }
554
        }
555

556
        $this->markSyntheticParagraphWrapper();
2,177✔
557

558
        // set encoding
559
        $this->document->encoding = $this->getEncoding();
2,177✔
560

561
        // restore lib-xml settings
562
        \libxml_clear_errors();
2,177✔
563
        \libxml_use_internal_errors($internalErrors);
2,177✔
564
        // @phpstan-ignore isset.variable (only defined on PHP < 8 paths where it is used)
565
        if (\PHP_VERSION_ID < 80000 && isset($disableEntityLoader)) {
2,177✔
566
            \libxml_disable_entity_loader($disableEntityLoader);
622✔
567
        }
568

569
        return $this->document;
2,177✔
570
    }
571

572
    /**
     * Find list of nodes with a CSS selector, searching the whole document
     * (delegates to findInNodeContext() without a context node).
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        // No context node: the query is not scoped to a sub-tree.
        $contextNode = null;

        return $this->findInNodeContext($selector, $contextNode, $idx);
    }
584

585
    /**
     * Find list of nodes with a CSS selector within an optional DOM context.
     *
     * Runs the query against this parser's document and passes along this
     * parser's XPath callback and the parser itself.
     *
     * @param string        $selector
     * @param \DOMNode|null $contextNode
     * @param int|null      $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *
     * @internal Used by wrapped SimpleHtmlDom instances to preserve parser
     *           callback state when scoping queries to an existing DOM node.
     */
    public function findInNodeContext(string $selector, ?\DOMNode $contextNode = null, $idx = null)
    {
        $document = $this->document;
        $beforeQueryCallback = $this->callbackXPathBeforeQuery;

        return self::findInDocumentContext($selector, $document, $contextNode, $idx, $beforeQueryCallback, $this);
    }
608

609
    /**
     * Find list of nodes with a CSS selector within an optional DOMDocument
     * context, optionally applying the parser callback before the XPath query.
     *
     * The selector is converted to XPath first. The callback is only invoked
     * when both the callback and the parser are given; its return value
     * replaces the XPath query. When a context node is given, the query is
     * rewritten so absolute paths stay inside that node.
     *
     * @param string        $selector
     * @param \DOMDocument  $document
     * @param \DOMNode|null $contextNode
     * @param int|null      $idx
     * @param callable|null $callbackXPathBeforeQuery
     * @param self|null     $queryHtmlDomParser
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *
     * @phpstan-param null|callable(string, string, \DOMXPath, self): string $callbackXPathBeforeQuery
     *
     * @internal Used by wrapped SimpleHtmlDom instances to keep queries scoped
     *           to an existing DOMDocument while preserving parser callback
     *           behavior.
     */
    public static function findInDocumentContext(
        string $selector,
        \DOMDocument $document,
        ?\DOMNode $contextNode = null,
        $idx = null,
        ?callable $callbackXPathBeforeQuery = null,
        ?self $queryHtmlDomParser = null
    ) {
        $query = SelectorConverter::toXPath($selector);
        $xPath = new \DOMXPath($document);

        $shouldRunCallback = $callbackXPathBeforeQuery !== null && $queryHtmlDomParser !== null;
        if ($shouldRunCallback) {
            $query = \call_user_func($callbackXPathBeforeQuery, $selector, $query, $xPath, $queryHtmlDomParser);
        }

        if ($contextNode !== null) {
            $query = self::scopeXPathQueryToContextNode($query);
        }

        return self::createFindResultFromNodeList(
            $xPath->query($query, $contextNode),
            $idx,
            $queryHtmlDomParser
        );
    }
652

653
    /**
     * Make absolute XPath branches relative to the context node.
     *
     * The query is scanned character by character. Whenever the first
     * non-whitespace character of a branch is "/", a "." is inserted in front
     * of it ("//a" => ".//a"). A new branch starts at the beginning of the
     * query and after every "|" that is outside of quotes, "[...]" and "(...)".
     * Quoted string literals are copied unchanged.
     *
     * @param string $xPathQuery
     *
     * @return string
     */
    private static function scopeXPathQueryToContextNode(string $xPathQuery): string
    {
        $scoped = '';
        $openQuote = null;
        $squareDepth = 0;
        $roundDepth = 0;
        $awaitingBranchStart = true;

        for ($offset = 0, $end = \strlen($xPathQuery); $offset < $end; ++$offset) {
            $char = $xPathQuery[$offset];

            // Inside a string literal: copy verbatim until the matching quote.
            if ($openQuote !== null) {
                $scoped .= $char;

                if ($char === $openQuote) {
                    $openQuote = null;
                }

                continue;
            }

            // Opening quote of a string literal.
            if ($char === '"' || $char === "'") {
                $openQuote = $char;
                $scoped .= $char;

                continue;
            }

            if ($awaitingBranchStart) {
                // Leading whitespace does not start the branch yet.
                if (\trim($char) === '') {
                    $scoped .= $char;

                    continue;
                }

                $awaitingBranchStart = false;

                // Absolute path: anchor it at the context node.
                if ($char === '/') {
                    $scoped .= '.';
                }
            }

            // Track nesting so that "|" inside predicates / groups is ignored.
            switch ($char) {
                case '[':
                    ++$squareDepth;

                    break;
                case ']':
                    if ($squareDepth > 0) {
                        --$squareDepth;
                    }

                    break;
                case '(':
                    ++$roundDepth;

                    break;
                case ')':
                    if ($roundDepth > 0) {
                        --$roundDepth;
                    }

                    break;
            }

            $scoped .= $char;

            // A top-level union operator begins the next branch.
            if ($char === '|' && $squareDepth === 0 && $roundDepth === 0) {
                $awaitingBranchStart = true;
            }
        }

        return $scoped;
    }
723

724
    /**
     * Wrap the result of an XPath query into the library's result objects.
     *
     * Without an index, all wrapped nodes are returned (or a
     * SimpleHtmlDomNodeBlank when there are none). With an index, the single
     * element at that position is returned (or a SimpleHtmlDomBlank when it
     * does not exist); negative indexes count from the end of the list.
     *
     * @param \DOMNodeList<\DOMNameSpaceNode|\DOMNode>|false $nodesList
     * @param int|null                                       $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    private static function createFindResultFromNodeList($nodesList, $idx, ?self $queryHtmlDomParser = null)
    {
        $wrappedNodes = new SimpleHtmlDomNode();

        if ($nodesList) {
            foreach ($nodesList as $candidate) {
                if ($candidate instanceof \DOMNode) {
                    $wrappedNodes[] = new SimpleHtmlDom($candidate, $queryHtmlDomParser);
                }
            }
        }

        // return all elements
        if ($idx === null) {
            return \count($wrappedNodes) === 0
                ? new SimpleHtmlDomNodeBlank()
                : $wrappedNodes;
        }

        // handle negative values
        $position = $idx < 0
            ? \count($wrappedNodes) + $idx
            : $idx;

        // return one element
        return $wrappedNodes[$position] ?? new SimpleHtmlDomBlank();
    }
761

762
    /**
     * Find all nodes matching a CSS selector (find() without an index).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        /** @var SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface> $matches */
        $matches = $this->find($selector, null);

        return $matches;
    }
776

777
    /**
     * Find all nodes matching a CSS selector; false when find() yields a
     * SimpleHtmlDomNodeBlank (i.e. nothing matched).
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        /** @var SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface> $matches */
        $matches = $this->find($selector, null);

        return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
    }
795

796
    /**
     * Find all nodes matching a CSS selector; null when find() yields a
     * SimpleHtmlDomNodeBlank (i.e. nothing matched).
     *
     * @param string $selector
     *
     * @return null|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrNull(string $selector)
    {
        /** @var SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface> $matches */
        $matches = $this->find($selector, null);

        return $matches instanceof SimpleHtmlDomNodeBlank ? null : $matches;
    }
814

815
    /**
     * Find the first node matching a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        /** @var SimpleHtmlDomInterface $firstMatch */
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
829

830
    /**
     * Find the first node matching a CSS selector; false when find() yields a
     * SimpleHtmlDomBlank (i.e. nothing matched).
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        /** @var SimpleHtmlDomInterface $firstMatch */
        $firstMatch = $this->find($selector, 0);

        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
848

849
    /**
     * Find the first node matching a CSS selector; null when find() yields a
     * SimpleHtmlDomBlank (i.e. nothing matched).
     *
     * @param string $selector
     *
     * @return null|SimpleHtmlDomInterface
     */
    public function findOneOrNull(string $selector)
    {
        /** @var SimpleHtmlDomInterface $firstMatch */
        $firstMatch = $this->find($selector, 0);

        return $firstMatch instanceof SimpleHtmlDomBlank ? null : $firstMatch;
    }
867

868
    /**
     * Clean up serialized markup before it is handed back to the caller.
     *
     * Depending on the parser's creation flags this removes wrapper tags, the
     * default DOCTYPE and the closing tags of self-closing elements. It then
     * strips the internal placeholder tags, trims the result, decodes entities
     * and restores the preserved replacements.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     * @param bool   $putBrokenReplacedBack
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false,
        bool $putBrokenReplacedBack = true
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        // str_replace() works through its needles in list order, so collecting
        // them first and replacing once equals one call per wrapper.
        $wrapperTags = [];

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $wrapperTags[] = '<html>';
            $wrapperTags[] = '</html>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $wrapperTags[] = '<head>';
            $wrapperTags[] = '</head>';
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $wrapperTags[] = '<body>';
            $wrapperTags[] = '</body>';
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $wrapperTags[] = '</script>';
        }

        $content = \str_replace($wrapperTags, '', $content);

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // Only a leading "<p>" is dropped, but every "</p>" is removed.
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
        }

        $leftovers = [];

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $leftovers[] = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">';
        }

        // https://bugs.php.net/bug.php?id=73175
        foreach ($this->selfClosingTags as $selfClosingTag) {
            $leftovers[] = '</' . $selfClosingTag . '>';
        }

        $content = \str_replace($leftovers, '', $content);

        // Internal placeholder tags vanish; doubled head tags collapse to one.
        /** @noinspection HtmlRequiredTitleElement */
        $placeholderMap = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
        ];

        $content = \trim(
            \str_replace(
                \array_keys($placeholderMap),
                \array_values($placeholderMap),
                $content
            )
        );

        $content = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($content, $putBrokenReplacedBack);
    }
975

976
    /**
     * Find all elements carrying the given class name.
     *
     * Builds the CSS selector ".<class>" and passes it to findMulti().
     *
     * @param string $class
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        $selector = '.' . $class;

        return $this->findMulti($selector);
    }
987

988
    /**
     * Find the element with the given id.
     *
     * Builds the CSS selector "#<id>" and passes it to findOne().
     *
     * @param string $id
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        $selector = '#' . $id;

        return $this->findOne($selector);
    }
999

1000
    /**
     * Return the first element with the given tag name.
     *
     * When the document contains no such element a SimpleHtmlDomBlank is
     * returned instead.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode, $this);
    }
1017

1018
    /**
     * Find elements by id via the CSS selector "#<id>".
     *
     * The optional index is forwarded unchanged to find().
     *
     * @param string   $id
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        $selector = '#' . $id;

        return $this->find($selector, $idx);
    }
1030

1031
    /**
     * Collect the elements with the given tag name.
     *
     * Without an index the whole list is returned (a SimpleHtmlDomNodeBlank
     * when nothing matched). With an index only that element is returned;
     * negative indexes count from the end of the list.
     *
     * @param string   $name
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $matches = new SimpleHtmlDomNode();

        foreach ($this->document->getElementsByTagName($name) as $domNode) {
            $matches[] = new SimpleHtmlDom($domNode, $this);
        }

        if ($idx !== null) {
            // a negative index is an offset from the end of the list
            $position = $idx < 0 ? \count($matches) + $idx : $idx;

            // NOTE(review): a miss yields the list-type blank here, whereas
            // getElementByTagName() yields SimpleHtmlDomBlank - confirm intended.
            return $matches[$position] ?? new SimpleHtmlDomNodeBlank();
        }

        return \count($matches) === 0 ? new SimpleHtmlDomNodeBlank() : $matches;
    }
1066

1067
    /**
     * Get dom node's outer html.
     *
     * Invokes the registered static callback (if any), picks a serialization
     * strategy based on how the document was created and on the PHP version,
     * and passes the result through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $putBrokenReplacedBack
     *
     * @return string empty string when serialization fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        // The order of these checks matters: the first matching strategy wins.
        if ($this->shouldUseWholeDocumentSerializationForHtmlOnPhpLt8()) {
            $serialized = $this->document->saveHTML();
        } elseif ($this->usesInternalWrapperDocument()) {
            $serialized = $this->serializeInternalWrapperContent();
        } elseif ($this->createdFromNode) {
            $serialized = \PHP_VERSION_ID < 80000
                ? $this->serializeCreatedFromNodeForPhpLt8()
                : $this->serializeChildNodes($this->document);
        } elseif ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            $serialized = $this->document->saveHTML($this->document->documentElement);
        } else {
            $serialized = $this->document->saveHTML();
        }

        return $serialized === false
            ? ''
            : $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
    }
1105

1106
    /**
     * Rename a parser-generated <p>-wrapper to the "simpleHtmlDomP" placeholder
     * element, which fixHtmlOutput() strips from serialized output.
     *
     * Nothing is changed unless the p-tag-wrapper flag is set, the document
     * root is <html>, a <body> exists, and the body's only significant child
     * (ignoring whitespace-only text nodes) is a single <p> element.
     *
     * @return void
     */
    private function markSyntheticParagraphWrapper(): void
    {
        if (!$this->isDOMDocumentCreatedWithoutPTagWrapper) {
            return;
        }

        $root = $this->document->documentElement;
        $rootIsHtml = $root instanceof \DOMElement && \strtolower($root->tagName) === 'html';
        if (!$rootIsHtml) {
            return;
        }

        $body = $this->document->getElementsByTagName('body')->item(0);
        if (!$body instanceof \DOMElement) {
            return;
        }

        $wrapper = null;
        foreach ($body->childNodes as $child) {
            $isBlankText = $child instanceof \DOMText && \trim($child->nodeValue ?? '') === '';
            if ($isBlankText) {
                continue;
            }

            // Bail out on a second significant child, a non-element node,
            // or an element that is not a paragraph.
            if (
                $wrapper !== null
                ||
                !$child instanceof \DOMElement
                ||
                \strtolower($child->tagName) !== 'p'
            ) {
                return;
            }

            $wrapper = $child;
        }

        if (!$wrapper instanceof \DOMElement || $wrapper->parentNode === null) {
            return;
        }

        $placeholder = $this->document->createElement('simpleHtmlDomP');

        // appendChild() moves the node, so firstChild advances on every pass.
        while (($movedChild = $wrapper->firstChild) !== null) {
            $placeholder->appendChild($movedChild);
        }

        $wrapper->parentNode->replaceChild($placeholder, $wrapper);
    }
1167

1168
    /**
     * Serialize a single DOM node to HTML.
     *
     * On PHP < 8.0 element nodes are handed to serializeElementNodeForPhpLt8().
     * Every other case imports a deep copy of the node into a fresh, detached
     * DOMDocument (preserveWhiteSpace on, formatOutput off) and serializes the
     * copy with saveHTML($node), so the result does not depend on the node's
     * owner document.
     *
     * @param \DOMNode $node
     *
     * @return string empty string when the import or the serialization fails
     */
    private function serializeNode(\DOMNode $node): string
    {
        if ($node instanceof \DOMElement && \PHP_VERSION_ID < 80000) {
            return $this->serializeElementNodeForPhpLt8($node);
        }

        $scratch = new \DOMDocument('1.0', $this->getEncoding());
        $scratch->preserveWhiteSpace = true;
        $scratch->formatOutput = false;

        $copy = $scratch->importNode($node, true);
        // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
        if (!$copy instanceof \DOMNode) {
            return '';
        }

        $scratch->appendChild($copy);

        $markup = $scratch->saveHTML($copy);

        return $markup === false ? '' : $markup;
    }
1214

1215
    /**
     * Serialize an element through a temporary whole-document saveHTML() call.
     *
     * On PHP < 8.0, saveHTML($node) injects formatting newlines for detached
     * block-level elements. The element is therefore copied into a scratch
     * document, the whole document is serialized, and the leading DOCTYPE, the
     * html/body wrappers (unless the element itself is html/body) and one
     * trailing newline are removed again.
     *
     * @param \DOMElement $node
     *
     * @return string empty string when the import or the serialization fails
     */
    private function serializeElementNodeForPhpLt8(\DOMElement $node): string
    {
        $scratch = new \DOMDocument('1.0', $this->getEncoding());
        $scratch->preserveWhiteSpace = true;
        $scratch->formatOutput = false;

        $copy = $scratch->importNode($node, true);
        // @phpstan-ignore instanceof.alwaysTrue (importNode() returns DOMNode here)
        if (!$copy instanceof \DOMElement) {
            return '';
        }

        $scratch->appendChild($copy);

        $markup = $scratch->saveHTML();
        if ($markup === false) {
            return '';
        }

        $markup = (string) \preg_replace('/^<!DOCTYPE[^>]+>\s*/i', '', $markup);

        $tagName = \strtolower($copy->tagName);
        $isHtmlRoot = $tagName === 'html';

        if (!$isHtmlRoot) {
            // patterns are applied one after the other, in list order
            $markup = (string) \preg_replace(['/^<html[^>]*>/i', '/<\/html>\s*$/i'], '', $markup);
        }

        if (!$isHtmlRoot && $tagName !== 'body') {
            $markup = (string) \preg_replace(['/^<body[^>]*>/i', '/<\/body>\s*$/i'], '', $markup);
            $markup = \str_replace('<body></body>', '', $markup);
        }

        // drop exactly one trailing newline, never more
        if (\substr($markup, -1) === "\n") {
            $markup = \substr($markup, 0, -1);
        }

        return $markup;
    }
1262

1263
    /**
     * Serialize the single element that was imported via the node-backed
     * constructor, for PHP < 8.0.
     *
     * On PHP < 8, saveHTML($node) with a node argument always injects
     * formatting newlines between block-level child elements and a trailing
     * "\n" after raw-text elements (script, style), even with formatOutput
     * set to false. saveHTML() called without a node argument respects
     * formatOutput=false and does not inject those newlines.
     *
     * The whole document is therefore serialized with saveHTML(), and the
     * leading DOCTYPE plus the structural wrappers (html, body) are stripped
     * again unless the root element itself is html or body.
     *
     * @return string empty string when the serialization fails
     */
    private function serializeCreatedFromNodeForPhpLt8(): string
    {
        $full = $this->document->saveHTML();
        if ($full === false) {
            return '';
        }

        // Strip the DOCTYPE declaration at the start of the output. The
        // pattern is anchored so that DOCTYPE-looking text inside the content
        // (e.g. within a script body) is left untouched.
        $full = (string) \preg_replace('/^\s*<!DOCTYPE[^>]+>/i', '', $full);
        $full = \trim($full);

        $documentElement = $this->document->documentElement;
        $tagName = $documentElement instanceof \DOMElement
            ? \strtolower($documentElement->tagName)
            : '';

        // Strip the <html>...</html> wrapper when the root element is not the
        // HTML element itself.
        if ($tagName !== 'html') {
            $full = (string) \preg_replace('/^<html[^>]*>/i', '', $full);
            $full = (string) \preg_replace('/<\/html>$/i', '', $full);
            $full = \trim($full);

            // Strip the <body>...</body> wrapper for non-body elements.
            if ($tagName !== 'body') {
                $full = (string) \preg_replace('/^<body[^>]*>/i', '', $full);
                $full = (string) \preg_replace('/<\/body>$/i', '', $full);
                // Remove an empty <body> pair (intended for <head> roots).
                $full = \str_replace('<body></body>', '', $full);
                $full = \trim($full);
            }
        }

        return $full;
    }
1315

1316
    /**
     * Serialize every direct child of a node and join the results.
     *
     * Each child goes through serializeNode(); the pieces are concatenated in
     * document order without a separator.
     *
     * @param \DOMNode $parentNode
     *
     * @return string
     */
    private function serializeChildNodes(\DOMNode $parentNode): string
    {
        $pieces = [];

        foreach ($parentNode->childNodes as $child) {
            $pieces[] = $this->serializeNode($child);
        }

        return \implode('', $pieces);
    }
1331

1332
    /**
     * Tell whether the document root is the internal wrapper element, i.e.
     * its tag name equals self::$domHtmlWrapperHelper.
     *
     * @return bool
     */
    private function usesInternalWrapperDocument(): bool
    {
        $root = $this->document->documentElement;

        if (!$root instanceof \DOMElement) {
            return false;
        }

        return $root->tagName === self::$domHtmlWrapperHelper;
    }
1340

1341
    /**
     * Tell whether the document is an <html> root whose <head> is missing or
     * empty while its <body> has at least one child node.
     *
     * Older libxml preserves body-only fragments more faithfully when the whole
     * temporary document is serialized and fixHtmlOutput() removes the wrappers
     * afterwards. Head-only fragments still need root-element serialization, or
     * <meta charset=...> can trigger output re-encoding (e.g. utf-7).
     *
     * @return bool
     */
    private function isBodyOnlyHtmlFragmentDocument(): bool
    {
        $root = $this->document->documentElement;
        if (!$root instanceof \DOMElement) {
            return false;
        }

        if (\strtolower($root->tagName) !== 'html') {
            return false;
        }

        $head = $root->getElementsByTagName('head')->item(0);
        $body = $root->getElementsByTagName('body')->item(0);

        // any head content disqualifies the document
        if ($head instanceof \DOMElement && $head->childNodes->length > 0) {
            return false;
        }

        return $body instanceof \DOMElement && $body->childNodes->length > 0;
    }
1362

1363
    /**
     * Decide whether html() should serialize the whole document with a plain
     * saveHTML() call.
     *
     * Only ever true on PHP < 8.0, and then when the document uses the
     * internal wrapper root, or when it was created without an html wrapper
     * and its root is either not <html> or a body-only fragment.
     *
     * @return bool
     */
    private function shouldUseWholeDocumentSerializationForHtmlOnPhpLt8(): bool
    {
        $isLegacyPhp = \PHP_VERSION_ID < 80000;
        if (!$isLegacyPhp) {
            return false;
        }

        if ($this->usesInternalWrapperDocument()) {
            return true;
        }

        if (!$this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            return false;
        }

        $root = $this->document->documentElement;
        if (!$root instanceof \DOMElement) {
            return false;
        }

        if (\strtolower($root->tagName) !== 'html') {
            return true;
        }

        return $this->isBodyOnlyHtmlFragmentDocument();
    }
1385

1386
    /**
     * Decide whether innerHtml() should serialize the whole document with a
     * plain saveHTML() call.
     *
     * Only ever true on PHP < 8.0, and then when the document uses the
     * internal wrapper root or is a body-only html fragment.
     *
     * @return bool
     */
    private function shouldUseWholeDocumentSerializationForInnerHtmlOnPhpLt8(): bool
    {
        if (\PHP_VERSION_ID >= 80000) {
            return false;
        }

        if ($this->usesInternalWrapperDocument()) {
            return true;
        }

        return $this->isBodyOnlyHtmlFragmentDocument();
    }
1394

1395
    /**
     * Serialize the children of the document root and put the result between
     * an opening and a closing internal wrapper tag.
     *
     * Keeping the helper wrapper markers around the detached child
     * serialization stops fixHtmlOutput() from trimming leading/trailing
     * fragment whitespace.
     *
     * @return string empty string when the document has no root element
     */
    private function serializeInternalWrapperContent(): string
    {
        $root = $this->document->documentElement;
        if ($root === null) {
            return '';
        }

        $openingTag = '<' . self::$domHtmlWrapperHelper . '>';
        $closingTag = '</' . self::$domHtmlWrapperHelper . '>';

        return $openingTag . $this->serializeChildNodes($root) . $closingTag;
    }
1413

1414
    /**
     * Tell whether a fragment has more than one significant top-level node.
     *
     * The fragment is wrapped in the internal wrapper tag and parsed as XML;
     * the wrapper's significant direct children are then counted. This is more
     * reliable than regex for fragments whose top-level elements have
     * attributes or nested markup. A fragment that does not parse without
     * libxml errors yields false.
     *
     * The caller's libxml internal-error setting is restored afterwards.
     *
     * @param string $html
     * @param int    $optionsXml
     *
     * @return bool
     */
    private function hasMultipleTopLevelNodes(string $html, int $optionsXml): bool
    {
        $previousErrorSetting = \libxml_use_internal_errors(true);

        try {
            \libxml_clear_errors();

            $wrapperTag = self::$domHtmlWrapperHelper;
            $probe = '<' . $wrapperTag . '>'
                . self::replaceToPreserveHtmlEntities($html)
                . '</' . $wrapperTag . '>';

            $parsed = \simplexml_load_string($probe, \SimpleXMLElement::class, $optionsXml);

            $parsedCleanly = $parsed !== false && \count(\libxml_get_errors()) === 0;
            if (!$parsedCleanly) {
                return false;
            }

            $wrapper = \dom_import_simplexml($parsed);

            return $wrapper instanceof \DOMElement
                && $this->countSignificantChildNodes($wrapper) > 1;
        } finally {
            \libxml_clear_errors();
            \libxml_use_internal_errors($previousErrorSetting);
        }
    }
1450

1451
    /**
     * Count the direct children of a node, ignoring whitespace-only text nodes.
     *
     * Counting stops as soon as a second significant child is found, so the
     * result is 0, 1 or 2.
     *
     * @param \DOMNode $node
     *
     * @return int
     */
    private function countSignificantChildNodes(\DOMNode $node): int
    {
        $significant = 0;

        foreach ($node->childNodes as $child) {
            $isBlankText = $child->nodeType === \XML_TEXT_NODE
                && \trim($child->textContent) === '';

            if ($isBlankText) {
                continue;
            }

            // callers only need to know whether there is more than one
            if (++$significant > 1) {
                break;
            }
        }

        return $significant;
    }
1477

1478
    /**
     * {@inheritdoc}
     *
     * Serializes the content below the document root (strategy depends on the
     * PHP version and on whether the internal wrapper root is in use) and
     * passes it through fixHtmlOutput(). Without a root element the empty
     * string is passed on.
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        $markup = '';

        $root = $this->document->documentElement;
        if ($root) {
            if ($this->shouldUseWholeDocumentSerializationForInnerHtmlOnPhpLt8()) {
                $markup = $this->document->saveHTML();
            } elseif ($this->usesInternalWrapperDocument()) {
                $markup = $this->serializeInternalWrapperContent();
            } else {
                $markup = $this->serializeChildNodes($root);
            }
        }

        // saveHTML() reports failure as false
        return $this->fixHtmlOutput(
            $markup === false ? '' : $markup,
            $multiDecodeNewHtmlEntity,
            $putBrokenReplacedBack
        );
    }
1503

1504
    /**
     * Get dom node's plain text.
     *
     * Concatenates, in document order, every text node that has no script,
     * style or special-script-helper ancestor, so raw-text container contents
     * are left out while other text (e.g. <title> content) is kept. The result
     * is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $query = \sprintf(
            '//text()[not(ancestor::script or ancestor::style or ancestor::%s)]',
            self::$domHtmlSpecialScriptHelper
        );

        $textNodes = (new \DOMXPath($this->document))->query($query);

        $plain = '';
        if ($textNodes !== false) {
            foreach ($textNodes as $textNode) {
                $plain .= $textNode->nodeValue;
            }
        }

        return $this->fixHtmlOutput($plain, $multiDecodeNewHtmlEntity);
    }
1535

1536
    /**
     * Load HTML from string.
     *
     * Replaces the parser's current document with the one that
     * createDOMDocument() builds from the given markup.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     * @param bool     $useDefaultLibXMLOptions
     *
     * @return $this
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
    {
        $freshDocument = $this->createDOMDocument($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
        $this->document = $freshDocument;

        return $this;
    }
1551

1552
    /**
     * Load HTML from file.
     *
     * Paths that do not start with "http://" or "https://" must point to an
     * existing regular file. The content is read with
     * \voku\helper\UTF8::file_get_contents() when that class is available,
     * otherwise with \file_get_contents(), and then passed to loadHtml().
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     * @param bool     $useDefaultLibXMLOptions
     *
     * @throws \RuntimeException when the file is missing, is not a regular
     *                           file or cannot be read; an exception raised
     *                           while reading is attached as "previous"
     *
     * @return $this
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
    {
        // The filesystem checks are skipped for http(s) URLs.
        if (!\preg_match("/^https?:\/\//i", $filePath)) {
            if (!\file_exists($filePath)) {
                throw new \RuntimeException('File ' . $filePath . ' not found');
            }

            if (!\is_file($filePath)) {
                throw new \RuntimeException('Could not load file ' . $filePath);
            }
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                $html = \voku\helper\UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause stays reachable
            // via getPrevious(); message and type are unchanged.
            throw new \RuntimeException('Could not load file ' . $filePath, 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException('Could not load file ' . $filePath);
        }

        return $this->loadHtml($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
    }
1591

1592
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * The document is serialized with saveXML(). Optionally the XML
     * declaration is removed. With $htmlToXml the result goes through
     * fixHtmlOutput(); otherwise entities are decoded and the preserved
     * replacements are put back.
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $htmlToXml
     * @param bool $removeXmlHeader
     * @param int  $options
     *
     * @return string empty string when the serialization fails
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $xml);
            $xml = \ltrim($withoutHeader);
        }

        if (!$htmlToXml) {
            $decoded = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
    }
1627

1628
    /**
     * Allow the parser object to be called like a function; the call is
     * forwarded to find().
     *
     * @param string   $selector
     * @param int|null $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        $found = $this->find($selector, $idx);

        return $found;
    }
1638

1639
    /**
     * Accessor for the "created without head wrapper" flag.
     *
     * When true, fixHtmlOutput() removes "<head>" and "</head>" from the
     * serialized output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
1646

1647
    /**
     * Accessor for the "created without p-tag wrapper" flag.
     *
     * markSyntheticParagraphWrapper() returns early while the underlying
     * property is false.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
1654

1655
    /**
     * Accessor for the "created without html" flag.
     *
     * When true, fixHtmlOutput() removes the HTML 4.0 Transitional DOCTYPE
     * declaration from the serialized output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
1662

1663
    /**
     * Accessor for the "created without body wrapper" flag.
     *
     * When true, fixHtmlOutput() removes "<body>" and "</body>" from the
     * serialized output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
1670

1671
    /**
     * Accessor for the "created with multi root" flag.
     *
     * @return bool current value of isDOMDocumentCreatedWithMultiRoot
     */
    public function getIsDOMDocumentCreatedWithMultiRoot(): bool
    {
        return $this->isDOMDocumentCreatedWithMultiRoot;
    }
1678

1679
    /**
     * Accessor for the "created without html wrapper" flag.
     *
     * When true, fixHtmlOutput() removes "<html>" and "</html>" from the
     * serialized output; html() also consults it when choosing how to
     * serialize the document.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
1686

1687
    /**
     * Accessor for the "created without wrapper" flag.
     *
     * When true, fixHtmlOutput() removes a leading "<p>" and every "</p>"
     * from the serialized output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
1694

1695
    /**
     * Accessor for the "created with fake end script" flag.
     *
     * When true, fixHtmlOutput() removes every "</script>" from the
     * serialized output.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
1702

1703
    /**
     * Replace stray closing tags with hash tokens so they can be kept.
     *
     * Pass 1 masks the angle brackets of every element that has a matching
     * closing tag with placeholder strings, repeating until nothing changes.
     * Pass 2 finds the remaining runs of closing tags, registers each run via
     * registerDynamicDomBrokenReplaceHelper() under a token built from
     * self::$domHtmlBrokenHtmlHelper and the run's crc32, and puts the token
     * in its place. Finally the placeholders are turned back into brackets.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $placeholders = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // Pass 1: mask well-formed elements until the markup stops changing.
        while (true) {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($m) {
                    $maskedOpen = '°lt_simple_html_dom__voku_°' . $m['element_start'] . $m['element_start_addon'] . '°gt_simple_html_dom__voku_°';
                    $maskedClose = '°lt/_simple_html_dom__voku_°' . $m['element_end'] . '°gt_simple_html_dom__voku_°';

                    return $m['start'] . $maskedOpen . $m['value'] . $maskedClose . $m['end'];
                },
                $html
            );

            if ($before === $html) {
                break;
            }
        }

        // Pass 2: swap the leftover closing-tag runs for registered tokens.
        while (true) {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:<\/\w+(?:\s+\w+=\"[^"]+\")*+[^<]+>)+)(?<end>.*)/u',
                function ($m) use ($placeholders, $brackets) {
                    // the run may contain masked elements from pass 1
                    $brokenMarkup = \str_replace($placeholders, $brackets, $m['broken']);

                    $token = self::$domHtmlBrokenHtmlHelper . \crc32($brokenMarkup);
                    $this->registerDynamicDomBrokenReplaceHelper($brokenMarkup, $token);

                    return $m['start'] . $token . $m['end'];
                },
                $html
            );

            if ($before === $html) {
                break;
            }
        }

        return \str_replace($placeholders, $brackets, $html);
    }
1753

1754
    /**
     * Protect inline SVG markup inside "(data:image/svg...)" values from the DOM parser.
     *
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Every matched "<svg ...>...</svg>" fragment is replaced by a placeholder token
     * (self::$domHtmlBrokenHtmlHelper followed by the crc32 of the fragment) and the
     * fragment/token pair is handed to registerDynamicDomBrokenReplaceHelper().
     *
     * @param string $html <p>The html, passed by reference and modified in place.</p>
     *
     * @return void
     */
    protected function keepSpecialSvgTags(string &$html)
    {
        // regEx for e.g.: [mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg">...</svg>')]
        //
        // The optional quote is captured as (["\']?) and not as (["\'])? on purpose:
        // PCRE fails a back reference (\1) to a group that did not take part in the match,
        // so with the quantifier outside of the group an unquoted "(data:image/svg...)"
        // value could never match.
        /** @noinspection HtmlDeprecatedTag */
        $regExSpecialSvg = '/\((["\']?)(?<start>data:image\/svg.*)<svg(?<attr>[^>]*?)>(?<content>.*)<\/svg>\1\)/isU';
        $htmlTmp = \preg_replace_callback(
            $regExSpecialSvg,
            function ($svgs) {
                // rebuild the svg fragment and replace it with a hash based placeholder
                $content = '<svg' . $svgs['attr'] . '>' . $svgs['content'] . '</svg>';
                $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($content);
                $this->registerDynamicDomBrokenReplaceHelper($content, $matchesHash);

                // $svgs[1] is the (possibly empty) quote character around the value
                return '(' . $svgs[1] . $svgs['start'] . $matchesHash . $svgs[1] . ')';
            },
            $html
        );

        // preg_replace_callback() returns null on a PCRE error: keep the input untouched then
        if ($htmlTmp !== null) {
            $html = $htmlTmp;
        }
    }
1782

1783
    /**
     * Protect the content of "special" script tags (e.g. type="text/html" templates)
     * from the DOM parser.
     *
     * Script tags whose "type" attribute matches one of $this->specialScriptTags are handled
     * in one of two ways:
     * - if the inner content contains one of $this->templateLogicSyntaxInSpecialScriptTags,
     *   the inner content is replaced by a placeholder token registered via
     *   registerDynamicDomBrokenReplaceHelper()
     * - otherwise the "script" tag name is replaced by self::$domHtmlSpecialScriptHelper
     *
     * @param string $html <p>The html, passed by reference and modified in place.</p>
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // without any configured type the alternation in the regEx below would be empty
        if ($this->specialScriptTags === []) {
            return;
        }

        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $tags = \implode('|', \array_map(
            static function ($value) {
                return \preg_quote($value, '/');
            },
            $this->specialScriptTags
        ));
        $htmlTmp = \preg_replace_callback(
            '/(?<start>(<script [^>]*type=["\']?(?:' . $tags . ')+[^>]*>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {
                // Check for logic in special script tags containing EJS/ERB-style template syntax
                // (e.g. <% ... %> blocks), because often this looks like non-valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                    if (\strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) !== false) {
                        // remove the html5 fallback
                        $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                        $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);
                        $this->registerDynamicDomBrokenReplaceHelper($matches['innerContent'], $matchesHash);

                        return $matches['start'] . $matchesHash . $matches['end'];
                    }
                }

                // remove the html5 fallback
                $matches[0] = \str_replace('<\/', '</', $matches[0]);

                // swap the opening "<script" for the helper tag name ...
                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                // ... and the closing "</script>" as well
                return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        // preg_replace_callback() returns null on a PCRE error (e.g. backtrack limit):
        // keep the input untouched instead of replacing the whole html with an empty string
        if ($htmlTmp !== null) {
            $html = $htmlTmp;
        }
    }
1824

1825
    /**
     * Store the "keep broken html" flag on this parser instance.
     *
     * @param bool $keepBrokenHtml <p>The new value of the flag.</p>
     *
     * @return $this <p>The same instance, for method chaining.</p>
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1836

1837
    /**
     * Replace the list of template-logic markers that are searched for inside special script tags.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags <p>The new list of markers.</p>
     *
     * @throws \InvalidArgumentException <p>If one of the given values is not a string.</p>
     *
     * @return $this <p>The same instance, for method chaining.</p>
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        // validate first, so that the property is only replaced by a fully valid list
        foreach ($templateLogicSyntaxInSpecialScriptTags as $syntaxMarker) {
            // @phpstan-ignore function.alreadyNarrowedType (runtime guard kept for public API validation)
            if (\is_string($syntaxMarker)) {
                continue;
            }

            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1855

1856
    /**
     * Replace the list of script "type" values that are treated as special script tags.
     *
     * @param string[] $specialScriptTags <p>The new list of script types.</p>
     *
     * @throws \InvalidArgumentException <p>If one of the given values is not a string.</p>
     *
     * @return $this <p>The same instance, for method chaining.</p>
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        // validate first, so that the property is only replaced by a fully valid list
        foreach ($specialScriptTags as $scriptType) {
            // @phpstan-ignore function.alreadyNarrowedType (runtime guard kept for public API validation)
            if (\is_string($scriptType)) {
                continue;
            }

            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1874

1875
    /**
     * Store a callback on this instance in the "callbackXPathBeforeQuery" property.
     *
     * @param callable $callbackXPathBeforeQuery <p>The callback, see the phpstan signature below.</p>
     *
     * @phpstan-param callable(string $cssSelectorString, string $xPathString,\DOMXPath,\voku\helper\HtmlDomParser): string $callbackXPathBeforeQuery
     *
     * @return $this <p>The same instance, for method chaining.</p>
     */
    public function setCallbackXPathBeforeQuery(callable $callbackXPathBeforeQuery): self
    {
        $this->callbackXPathBeforeQuery = $callbackXPathBeforeQuery;

        return $this;
    }
1888

1889
    /**
     * Store a callback on this instance in the "callbackBeforeCreateDom" property.
     *
     * @param callable $callbackBeforeCreateDom <p>The callback, see the phpstan signature below.</p>
     *
     * @phpstan-param callable(string $htmlString, \voku\helper\HtmlDomParser): string $callbackBeforeCreateDom
     *
     * @return $this <p>The same instance, for method chaining.</p>
     */
    public function setCallbackBeforeCreateDom(callable $callbackBeforeCreateDom): self
    {
        $this->callbackBeforeCreateDom = $callbackBeforeCreateDom;

        return $this;
    }
1902
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc