• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

voku / simple_html_dom / 24632839975

19 Apr 2026 03:42PM UTC coverage: 77.11% (+6.3%) from 70.769%
24632839975

push

github

web-flow
Merge pull request #135 from voku/copilot/fix-html-parsing-newline-issue

Preserve node HTML formatting when serializing nested elements

4 of 24 new or added lines in 1 file covered. (16.67%)

51 existing lines in 6 files now uncovered.

1654 of 2145 relevant lines covered (77.11%)

262.05 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.83
/src/voku/helper/HtmlDomParser.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace voku\helper;
6

7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $filePath)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
37
     * @var callable|null
38
     *
39
     * @phpstan-var null|callable(string $cssSelectorString, string $xPathString, \DOMXPath, \voku\helper\HtmlDomParser): string
40
     */
41
    private $callbackXPathBeforeQuery;
42

43
    /**
44
     * @var callable|null
45
     *
46
     * @phpstan-var null|callable(string $htmlString, \voku\helper\HtmlDomParser): string
47
     */
48
    private $callbackBeforeCreateDom;
49

50
    /**
51
     * @var string[]
52
     */
53
    protected static $functionAliases = [
54
        'outertext' => 'html',
55
        'outerhtml' => 'html',
56
        'innertext' => 'innerHtml',
57
        'innerhtml' => 'innerHtml',
58
        'load'      => 'loadHtml',
59
        'load_file' => 'loadHtmlFile',
60
    ];
61

62
    /**
63
     * @var string[]
64
     */
65
    protected $templateLogicSyntaxInSpecialScriptTags = [
66
        '+',
67
        '<%',
68
        '{%',
69
        '{{',
70
    ];
71

72
    /**
73
     * The properties specified for each special script tag is an array.
74
     *
75
     * ```php
76
     * protected $specialScriptTags = [
77
     *     'text/html',
78
     *     'text/template',
79
     *     'text/x-custom-template',
80
     *     'text/x-handlebars-template'
81
     * ]
82
     * ```
83
     *
84
     * @var string[]
85
     */
86
    protected $specialScriptTags = [
87
        'text/html',
88
        'text/template',
89
        'text/x-custom-template',
90
        'text/x-handlebars-template',
91
    ];
92

93
    /**
94
     * @var string[]
95
     */
96
    protected $selfClosingTags = [
97
        'area',
98
        'base',
99
        'br',
100
        'col',
101
        'command',
102
        'embed',
103
        'hr',
104
        'img',
105
        'input',
106
        'keygen',
107
        'link',
108
        'meta',
109
        'param',
110
        'source',
111
        'track',
112
        'wbr',
113
    ];
114

115
    /**
116
     * @var bool
117
     */
118
    protected $isDOMDocumentCreatedWithoutHtml = false;
119

120
    /**
121
     * @var bool
122
     */
123
    protected $isDOMDocumentCreatedWithoutWrapper = false;
124

125
    /**
126
     * @var bool
127
     */
128
    protected $isDOMDocumentCreatedWithCommentWrapper = false;
129

130
    /**
131
     * @var bool
132
     */
133
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
134

135
    /**
136
     * @var bool
137
     */
138
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;
139

140
    /**
141
     * @var bool
142
     */
143
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
144

145
    /**
146
     * @var bool
147
     */
148
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;
149

150
    /**
151
     * @var bool
152
     */
153
    protected $isDOMDocumentCreatedWithMultiRoot = false;
154

155
    /**
156
     * @var bool
157
     */
158
    protected $isDOMDocumentCreatedWithEdgeWhitespace = false;
159

160
    /**
161
     * @var bool
162
     */
163
    protected $isDOMDocumentCreatedWithFakeEndScript = false;
164

165
    /**
166
     * @var bool
167
     */
168
    protected $createdFromNode = false;
169

170
    /**
171
     * @var bool
172
     */
173
    protected $keepBrokenHtml = false;
174

175
    /**
     * Create a parser, optionally seeded with markup or an existing node.
     *
     * A fresh DOMDocument (version "1.0", encoding from getEncoding()) is
     * always created first. What happens next depends on $element:
     * - SimpleHtmlDomInterface: unwrapped via getNode() and handled as a \DOMNode
     * - \DOMNode: deep-imported into the new document and appended to it
     * - any other non-null value: passed to loadHtml()
     * - null: the document is left empty
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings: keep whitespace as given, never pretty-print
        $document->preserveWhiteSpace = true;
        $document->formatOutput = false;
        $this->document = $document;

        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if (!$source instanceof \DOMNode) {
            // string input (or nothing at all)
            if ($source !== null) {
                $this->loadHtml($source);
            }

            return;
        }

        $this->createdFromNode = true;

        // deep copy, so the node becomes owned by this parser's document
        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            $this->document->appendChild($imported);
        }
    }
206

207
    /**
     * Dispatch legacy method aliases (see self::$functionAliases) to the real
     * instance methods. The lookup is case-insensitive.
     *
     * @param string $name      called method name
     * @param array  $arguments arguments forwarded unchanged to the target method
     *
     * @throws \BadMethodCallException if the lower-cased name is not a known alias
     *
     * @return bool|mixed whatever the aliased method returns
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);
        $target = self::$functionAliases[$alias] ?? null;

        if ($target === null) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        return \call_user_func_array([$this, $target], $arguments);
    }
223

224
    /**
     * Static entry points kept for API compatibility:
     * - str_get_html($html, $libXMLExtraOptions = null)      -> loadHtml()
     * - file_get_html($filePath, $libXMLExtraOptions = null) -> loadHtmlFile()
     *
     * Missing arguments default to '' (first) and null (second).
     *
     * @param string $name      called static method name (matched case-sensitively)
     * @param array  $arguments positional arguments of the static call
     *
     * @throws \BadMethodCallException if $name is neither "str_get_html" nor "file_get_html"
     * @throws \RuntimeException
     *
     * @return static
     */
    public static function __callStatic($name, $arguments)
    {
        $input = $arguments[0] ?? '';
        $libXMLExtraOptions = $arguments[1] ?? null;

        if ($name === 'str_get_html') {
            return (new static())->loadHtml($input, $libXMLExtraOptions);
        }

        if ($name === 'file_get_html') {
            return (new static())->loadHtmlFile($input, $libXMLExtraOptions);
        }

        // include the offending name, consistent with the message built by __call()
        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }
253

254
    /** @noinspection MagicMethodsValidityInspection */
255

256
    /**
     * Magic read access for the legacy property names (case-insensitive):
     * - outerHtml / outerText -> html()
     * - innerHtml / innerText -> innerHtml()
     * - innerHtmlKeep         -> innerHtml(false, false)
     * - text / plaintext      -> text()
     *
     * @param string $name
     *
     * @return string|null null for any other property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if (\in_array($property, ['outerhtml', 'outertext'], true)) {
            return $this->html();
        }

        if (\in_array($property, ['innerhtml', 'innertext'], true)) {
            return $this->innerHtml();
        }

        if ($property === 'innerhtmlkeep') {
            return $this->innerHtml(false, false);
        }

        if (\in_array($property, ['text', 'plaintext'], true)) {
            return $this->text();
        }

        return null;
    }
281

282
    /**
     * String conversion yields the same result as html().
     *
     * @return string
     */
    public function __toString()
    {
        $markup = $this->html();

        return $markup;
    }
289

290
    /**
     * No-op that exists only for API compatibility; it touches no state.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        return true;
    }
301

302
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order:
     * 1. reset the per-document helper state and run the optional
     *    "before create DOM" callback on the input
     * 2. strip any content in front of a DOCTYPE declaration
     * 3. record which structural parts (html/head/body/p wrappers, comments,
     *    edge whitespace, escaped script end tags, ...) the input has, in the
     *    $isDOMDocumentCreated* flags
     * 4. pre-process script, svg and void tags
     * 5. parse: first via simplexml_load_string() (used only when it reports
     *    no libxml errors), otherwise via DOMDocument::loadHTML() with an
     *    XML-encoding prefix that is removed again afterwards
     *
     * libxml's error handling (and on PHP < 8 the entity loader) is switched
     * for the duration of the parse and restored afterwards, also when an
     * exception is thrown.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions      extra LIBXML_* flags, OR-ed into the options
     * @param bool     $useDefaultLibXMLOptions whether to start from the default option set
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): \DOMDocument
    {
        $this->resetDynamicDomHelpers();

        if ($this->callbackBeforeCreateDom) {
            $html = \call_user_func($this->callbackBeforeCreateDom, $html, $this);
        }

        // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
        $isDOMDocumentCreatedWithDoctype = false;
        if (\stripos($html, '<!DOCTYPE') !== false) {
            $isDOMDocumentCreatedWithDoctype = true;
            if (
                \preg_match('/(^.*?)<!DOCTYPE(?: [^>]*)?>/sui', $html, $matches_before_doctype)
                &&
                \trim($matches_before_doctype[1]) !== ''
            ) {
                // Group 1 is anchored at the start of the input, so cut exactly
                // that prefix. (A str_replace() would also delete every later
                // occurrence of the same text inside the document.)
                $html = (string) \substr($html, \strlen($matches_before_doctype[1]));
            }
        }

        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            // no markup at all
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            // markup exists, but the input starts with plain text
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        // Fragment with leading/trailing whitespace, at least two closing tags
        // and a leading attribute-less element that is closed again.
        if (
            $this->isDOMDocumentCreatedWithoutHtmlWrapper
            &&
            $this->isDOMDocumentCreatedWithoutBodyWrapper
            &&
            \trim($html) !== $html
            &&
            \substr_count($html, '</') >= 2
            &&
            \preg_match('#^\s*<([a-zA-Z][^\\s>/]*)>.*?</\\1>#su', $html) === 1
        ) {
            $this->isDOMDocumentCreatedWithEdgeWhitespace = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \stripos($html, '<p ') === false
            &&
            \stripos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // only escaped script end tags ("<\/script>") and no real one
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        if (\stripos($html, '</html>') !== false) {
            // Move non-whitespace content that follows "</html>" in front of the closing tag.
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (
                \preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
                &&
                \trim($matches_after_html[1]) !== ''
            ) {
                $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
            }
        }

        if (\strpos($html, '<script') !== false) {
            // keepSpecialScriptTags must run before html5FallbackForScriptTags so
            // that special-type scripts (type="text/html", etc.) are converted to
            // the simplevokuspecialscript placeholder element before the script-tag
            // regex runs.  On PHP < 8.0 the regex uses hash placeholders; if it
            // ran first the special-script content would be hashed and
            // keepSpecialScriptTags would only see the hash, losing the ability to
            // pass the real HTML content to the DOM for error-recovery parsing.
            //
            // NOTE(review): the return values of these helpers are ignored, so they
            // presumably rewrite $html by reference - confirm against their signatures.
            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);

                    break;
                }
            }

            $this->html5FallbackForScriptTags($html);
        }

        if (\strpos($html, '<svg') !== false) {
            $this->keepSpecialSvgTags($html);
        }

        // rewrite attribute-less void tags ("<br>") into their self-closed form ("<br/>")
        $html = \str_replace(
            \array_map(static function ($e) {
                return '<' . $e . '>';
            }, $this->selfClosingTags),
            \array_map(static function ($e) {
                return '<' . $e . '/>';
            }, $this->selfClosingTags),
            $html
        );

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        try {
            $optionsXml = 0;
            if ($useDefaultLibXMLOptions) {
                $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

                if (\defined('LIBXML_BIGLINES')) {
                    $optionsXml |= \LIBXML_BIGLINES;
                }

                if (\defined('LIBXML_COMPACT')) {
                    $optionsXml |= \LIBXML_COMPACT;
                }

                if (\defined('LIBXML_HTML_NODEFDTD')) {
                    $optionsXml |= \LIBXML_HTML_NODEFDTD;
                }
            }

            if ($libXMLExtraOptions !== null) {
                $optionsXml |= $libXMLExtraOptions;
            }

            if (
                $this->isDOMDocumentCreatedWithoutHtmlWrapper
                &&
                $this->isDOMDocumentCreatedWithoutBodyWrapper
            ) {
                $this->isDOMDocumentCreatedWithMultiRoot = $this->hasMultipleTopLevelNodes($html, $optionsXml);
            }

            // wrap the input in the helper element for the cases flagged above
            if (
                $this->isDOMDocumentCreatedWithMultiRoot
                ||
                $this->isDOMDocumentCreatedWithEdgeWhitespace
                ||
                $this->isDOMDocumentCreatedWithoutWrapper
                ||
                $this->isDOMDocumentCreatedWithCommentWrapper
                ||
                (
                    !$isDOMDocumentCreatedWithDoctype
                    &&
                    $this->keepBrokenHtml
                )
            ) {
                $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
            }

            $html = self::replaceToPreserveHtmlEntities($html);

            // first attempt: XML parser, accepted only if it raised no libxml errors
            $documentFound = false;
            $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
            if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
                $domElementTmp = \dom_import_simplexml($sxe);
                if ($domElementTmp->ownerDocument instanceof \DOMDocument) {
                    $documentFound = true;
                    $this->document = $domElementTmp->ownerDocument;
                }
            }

            if ($documentFound === false) {
                // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
                $xmlHackUsed = false;
                // NOTE(review): haystack and needle look swapped here (this searches for
                // $html inside the literal "<?xml"), so the prefix is also added to input
                // that already starts with "<?xml". Kept unchanged on purpose: correcting
                // the order changes which inputs receive the prefix and needs test coverage.
                if (\stripos('<?xml', $html) !== 0) {
                    $xmlHackUsed = true;
                    $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
                }

                if ($html !== '') {
                    $this->document->loadHTML($html, $optionsXml);
                }

                // remove the "xml-encoding" hack
                if ($xmlHackUsed) {
                    foreach ($this->document->childNodes as $child) {
                        if ($child->nodeType === \XML_PI_NODE) {
                            $this->document->removeChild($child);

                            break;
                        }
                    }
                }
            }

            $this->markSyntheticParagraphWrapper();

            // set encoding
            $this->document->encoding = $this->getEncoding();
        } finally {
            // restore lib-xml settings, even if parsing threw
            \libxml_clear_errors();
            \libxml_use_internal_errors($internalErrors);
            if (\PHP_VERSION_ID < 80000 && isset($disableEntityLoader)) {
                \libxml_disable_entity_loader($disableEntityLoader);
            }
        }

        return $this->document;
    }
549

550
    /**
     * Find list of nodes with a CSS selector, searching the whole document
     * (delegates to findInNodeContext() without a context node).
     *
     * @param string   $selector
     * @param int|null $idx null for all matches, otherwise the (possibly negative) match index
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $wholeDocument = null;

        return $this->findInNodeContext($selector, $wholeDocument, $idx);
    }
562

563
    /**
     * Find list of nodes with a CSS selector within an optional DOM context.
     *
     * Thin instance wrapper around findInDocumentContext() that supplies this
     * parser's document, its XPath callback and the parser itself.
     *
     * @param string        $selector
     * @param \DOMNode|null $contextNode node to search below, or null for the whole document
     * @param int|null      $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findInNodeContext(string $selector, ?\DOMNode $contextNode = null, $idx = null)
    {
        $document = $this->document;
        $beforeQueryCallback = $this->callbackXPathBeforeQuery;

        return self::findInDocumentContext($selector, $document, $contextNode, $idx, $beforeQueryCallback, $this);
    }
583

584
    /**
     * Run a CSS selector against a DOMDocument, optionally scoped to a context
     * node, and wrap the matches.
     *
     * The selector is converted to XPath first. The callback is applied to that
     * query only when both the callback and a parser instance are given. With a
     * context node, the query is additionally rewritten via
     * scopeXPathQueryToContextNode() before it is executed.
     *
     * @param string        $selector
     * @param \DOMDocument  $document
     * @param \DOMNode|null $contextNode
     * @param int|null      $idx
     * @param callable|null $callbackXPathBeforeQuery
     * @param self|null     $queryHtmlDomParser
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *
     * @phpstan-param null|callable(string, string, \DOMXPath, self): string $callbackXPathBeforeQuery
     */
    public static function findInDocumentContext(
        string $selector,
        \DOMDocument $document,
        ?\DOMNode $contextNode = null,
        $idx = null,
        ?callable $callbackXPathBeforeQuery = null,
        ?self $queryHtmlDomParser = null
    ) {
        $query = SelectorConverter::toXPath($selector);
        $xPath = new \DOMXPath($document);

        $useCallback = $callbackXPathBeforeQuery !== null && $queryHtmlDomParser !== null;
        if ($useCallback) {
            // the callback receives the CSS selector and the converted XPath and returns the query to run
            $query = \call_user_func($callbackXPathBeforeQuery, $selector, $query, $xPath, $queryHtmlDomParser);
        }

        if ($contextNode !== null) {
            $query = self::scopeXPathQueryToContextNode($query);
        }

        $matches = $xPath->query($query, $contextNode);

        return self::createFindResultFromNodeList($matches, $idx, $queryHtmlDomParser);
    }
623

624
    /**
     * Make an XPath query relative to a context node: every top-level union
     * branch that begins with "/" gets a "." put in front of it.
     *
     * A branch begins at the start of the query and after each "|" that is
     * outside of quotes, "[...]" and "(...)". Whitespace at the beginning of a
     * branch is copied through unchanged; quoted string literals are copied
     * verbatim and never inspected.
     *
     * @param string $xPathQuery
     *
     * @return string
     */
    public static function scopeXPathQueryToContextNode(string $xPathQuery): string
    {
        $result = '';
        $openQuote = null;
        $squareDepth = 0;
        $roundDepth = 0;
        $branchStart = true;

        foreach (\str_split($xPathQuery) as $char) {
            // inside a string literal: copy until the matching quote closes it
            if ($openQuote !== null) {
                if ($char === $openQuote) {
                    $openQuote = null;
                }
                $result .= $char;

                continue;
            }

            // a string literal opens
            if ($char === '"' || $char === "'") {
                $openQuote = $char;
                $result .= $char;

                continue;
            }

            if ($branchStart) {
                // leading whitespace does not end the "branch start" state
                if (\trim($char) === '') {
                    $result .= $char;

                    continue;
                }

                $branchStart = false;
                if ($char === '/') {
                    $result .= '.';
                }
            }

            switch ($char) {
                case '[':
                    ++$squareDepth;

                    break;
                case ']':
                    if ($squareDepth > 0) {
                        --$squareDepth;
                    }

                    break;
                case '(':
                    ++$roundDepth;

                    break;
                case ')':
                    if ($roundDepth > 0) {
                        --$roundDepth;
                    }

                    break;
                case '|':
                    // only a union operator outside of predicates/groups starts a new branch
                    $branchStart = $squareDepth === 0 && $roundDepth === 0;

                    break;
            }

            $result .= $char;
        }

        return $result;
    }
694

695
    /**
     * Wrap the nodes of an XPath result and pick what find() should return.
     *
     * - $idx === null: all matches, or a SimpleHtmlDomNodeBlank if there are none
     * - otherwise:     the match at $idx (a negative $idx counts from the end),
     *                  or a SimpleHtmlDomBlank if that position does not exist
     *
     * @param \DOMNodeList<\DOMNode>|false $nodesList a falsy value is treated as "no matches"
     * @param int|null                     $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    private static function createFindResultFromNodeList($nodesList, $idx, ?self $queryHtmlDomParser = null)
    {
        $found = new SimpleHtmlDomNode();
        foreach ($nodesList ?: [] as $domNode) {
            $found[] = new SimpleHtmlDom($domNode, $queryHtmlDomParser);
        }

        // return one element
        if ($idx !== null) {
            // handle negative values
            $position = $idx < 0 ? \count($found) + $idx : $idx;

            return $found[$position] ?? new SimpleHtmlDomBlank();
        }

        // return all elements
        return \count($found) === 0 ? new SimpleHtmlDomNodeBlank() : $found;
    }
728

729
    /**
     * Find all nodes matching a CSS selector (find() without an index).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $matches = $this->find($selector, null);

        return $matches;
    }
740

741
    /**
     * Find all nodes matching a CSS selector; false when find() reports no
     * match (i.e. returns a SimpleHtmlDomNodeBlank).
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $matches = $this->find($selector, null);

        return $matches instanceof SimpleHtmlDomNodeBlank ? false : $matches;
    }
758

759
    /**
     * Find all nodes matching a CSS selector; null when find() reports no
     * match (i.e. returns a SimpleHtmlDomNodeBlank).
     *
     * @param string $selector
     *
     * @return null|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrNull(string $selector)
    {
        $matches = $this->find($selector, null);

        return $matches instanceof SimpleHtmlDomNodeBlank ? null : $matches;
    }
776

777
    /**
     * Find the first node matching a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $first = $this->find($selector, 0);

        return $first;
    }
788

789
    /**
     * Find the first node matching a CSS selector; false when find() reports
     * no match (i.e. returns a SimpleHtmlDomBlank).
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $first = $this->find($selector, 0);

        return $first instanceof SimpleHtmlDomBlank ? false : $first;
    }
806

807
    /**
     * Find the first node matching a CSS selector; null when find() reports
     * no match (i.e. returns a SimpleHtmlDomBlank).
     *
     * @param string $selector
     *
     * @return null|SimpleHtmlDomInterface
     */
    public function findOneOrNull(string $selector)
    {
        $first = $this->find($selector, 0);

        return $first instanceof SimpleHtmlDomBlank ? null : $first;
    }
824

825
    /**
     * Clean up serialized markup so that wrapper elements which were not part
     * of the original input do not show up in the output.
     *
     * Which wrappers are stripped is decided by the getIsDOMDocumentCreated*()
     * flags. Afterwards closing tags of void elements and the internal helper
     * elements are removed, the result is trimmed, HTML entities are decoded
     * via decodeHtmlEntity() and the preserved entities are put back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity passed through to decodeHtmlEntity()
     * @param bool   $putBrokenReplacedBack    passed through to putReplacedBackToPreserveHtmlEntities()
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false,
        bool $putBrokenReplacedBack = true
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // drop a leading "<p>" and every "</p>"
            $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], ['', ''], $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // https://bugs.php.net/bug.php?id=73175
        $voidClosingTags = [];
        foreach ($this->selfClosingTags as $tagName) {
            $voidClosingTags[] = '</' . $tagName . '>';
        }
        $content = \str_replace($voidClosingTags, '', $content);

        // strip the internal helper elements and collapse doubled head tags
        /** @noinspection HtmlRequiredTitleElement */
        $helperReplacements = [
            '<simpleHtmlDomHtml>'  => '',
            '</simpleHtmlDomHtml>' => '',
            '<simpleHtmlDomP>'     => '',
            '</simpleHtmlDomP>'    => '',
            '<head><head>'         => '<head>',
            '</head></head>'       => '</head>',
        ];
        $content = \str_replace(\array_keys($helperReplacements), \array_values($helperReplacements), $content);
        $content = \trim($content);

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded, $putBrokenReplacedBack);
    }
932

933
    /**
     * Find every element that carries the given CSS class.
     *
     * The class name is prefixed with "." and the resulting selector is
     * passed to findMulti().
     *
     * @param string $class class name without the leading "."
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        $selector = '.' . $class;

        return $this->findMulti($selector);
    }
944

945
    /**
     * Look up a single element by its id.
     *
     * The id is prefixed with "#" and the resulting selector is passed to
     * findOne().
     *
     * @param string $id id value without the leading "#"
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        $selector = '#' . $id;

        return $this->findOne($selector);
    }
956

957
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name tag name handed to DOMDocument::getElementsByTagName()
     *
     * @return SimpleHtmlDomInterface the wrapped first match, or a
     *                                SimpleHtmlDomBlank when nothing matches
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstMatch = $this->document->getElementsByTagName($name)->item(0);

        return $firstMatch === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstMatch, $this);
    }
974

975
    /**
     * Find elements by id, optionally selecting one of them by index.
     *
     * The id is prefixed with "#" and the selector is passed to find()
     * together with the index.
     *
     * @param string   $id  id value without the leading "#"
     * @param int|null $idx index forwarded unchanged to find()
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        $selector = '#' . $id;

        return $this->find($selector, $idx);
    }
987

988
    /**
     * Find elements by tag name, optionally selecting one of them by index.
     *
     * Without an index, all matches are returned (or a SimpleHtmlDomNodeBlank
     * when there are none). With an index, the element at that position is
     * returned; a negative index counts from the end of the match list. An
     * index that does not exist yields a SimpleHtmlDomNodeBlank.
     *
     * @param string   $name tag name handed to DOMDocument::getElementsByTagName()
     * @param int|null $idx  position of the wanted element, or null for all
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $matches = new SimpleHtmlDomNode();
        foreach ($this->document->getElementsByTagName($name) as $domNode) {
            $matches[] = new SimpleHtmlDom($domNode, $this);
        }

        if ($idx !== null) {
            // Negative positions are resolved relative to the end of the list.
            $position = $idx < 0 ? \count($matches) + $idx : $idx;

            return $matches[$position] ?? new SimpleHtmlDomNodeBlank();
        }

        return \count($matches) === 0
            ? new SimpleHtmlDomNodeBlank()
            : $matches;
    }
1023

1024
    /**
     * Serialize the whole document to HTML.
     *
     * If a static callback is registered it is invoked first, with an array
     * containing this parser as its single argument. The serialization
     * strategy is then picked in this order:
     *  1. whole-document saveHTML() when the PHP < 8 helper asks for it,
     *  2. wrapper-preserving child serialization for internal wrapper documents,
     *  3. node-backed documents (dedicated PHP < 8 path, otherwise child
     *     serialization of the document),
     *  4. saveHTML() of the document element when no html wrapper was given,
     *  5. plain saveHTML() otherwise.
     * The result is finally passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput()
     * @param bool $putBrokenReplacedBack    forwarded to fixHtmlOutput()
     *
     * @return string the cleaned-up HTML, or '' when serialization failed
     */
    public function html(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        if ($this->shouldUseWholeDocumentSerializationForHtmlOnPhpLt8()) {
            $serialized = $this->document->saveHTML();
        } elseif ($this->usesInternalWrapperDocument()) {
            $serialized = $this->serializeInternalWrapperContent();
        } elseif ($this->createdFromNode) {
            $serialized = \PHP_VERSION_ID < 80000
                ? $this->serializeCreatedFromNodeForPhpLt8()
                : $this->serializeChildNodes($this->document);
        } elseif ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            $serialized = $this->document->saveHTML($this->document->documentElement);
        } else {
            $serialized = $this->document->saveHTML();
        }

        // saveHTML() signals failure with false.
        if ($serialized === false) {
            return '';
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
    }
1062

1063
    /**
     * Rename a parser-generated <p> wrapper to the "simpleHtmlDomP"
     * placeholder element.
     *
     * Nothing happens unless the "created without p-tag wrapper" flag is set,
     * the document element is <html>, a <body> exists, and the body's only
     * significant child (ignoring whitespace-only text nodes) is a single <p>
     * element. In that case the <p> is replaced by a freshly created
     * <simpleHtmlDomP> element that receives all of the paragraph's children.
     *
     * @return void
     */
    private function markSyntheticParagraphWrapper(): void
    {
        if (!$this->isDOMDocumentCreatedWithoutPTagWrapper) {
            return;
        }

        $root = $this->document->documentElement;
        $rootIsHtml = $root instanceof \DOMElement && \strtolower($root->tagName) === 'html';
        if (!$rootIsHtml) {
            return;
        }

        $body = $this->document->getElementsByTagName('body')->item(0);
        if (!$body instanceof \DOMElement) {
            return;
        }

        $soleParagraph = null;
        foreach ($body->childNodes as $bodyChild) {
            $isBlankText = $bodyChild instanceof \DOMText
                && \trim($bodyChild->nodeValue ?? '') === '';
            if ($isBlankText) {
                continue;
            }

            // Bail out on a second significant child, a non-element child
            // or an element that is not a paragraph.
            $isFirstParagraph = $soleParagraph === null
                && $bodyChild instanceof \DOMElement
                && \strtolower($bodyChild->tagName) === 'p';
            if (!$isFirstParagraph) {
                return;
            }

            $soleParagraph = $bodyChild;
        }

        if ($soleParagraph === null || $soleParagraph->parentNode === null) {
            return;
        }

        $placeholder = $this->document->createElement('simpleHtmlDomP');

        // appendChild() moves the node, so firstChild advances on every pass.
        while (($movedChild = $soleParagraph->firstChild) !== null) {
            $placeholder->appendChild($movedChild);
        }

        $soleParagraph->parentNode->replaceChild($placeholder, $soleParagraph);
    }
1124

1125
    /**
     * Serialize one DOM node to an HTML string.
     *
     * Two strategies are used:
     *  - On PHP < 8.0, <script> and <style> elements are serialized through
     *    their owner document; if the result ends with "\n", exactly one
     *    trailing "\n" is removed.
     *  - Every other node is deep-imported into a fresh DOMDocument (created
     *    with this parser's encoding, preserveWhiteSpace on, formatOutput
     *    off) and serialized from there.
     *
     * @param \DOMNode $node node to serialize
     *
     * @return string the node's HTML, or '' when the node could not be
     *                imported or serialized
     */
    private function serializeNode(\DOMNode $node): string
    {
        $isRawTextElementOnLegacyPhp = \PHP_VERSION_ID < 80000
            && $node instanceof \DOMElement
            && \in_array(\strtolower($node->tagName), ['script', 'style'], true);

        if ($isRawTextElementOnLegacyPhp) {
            $owner = $node->ownerDocument;
            $html = $owner === null ? false : $owner->saveHTML($node);

            // Drop a single trailing newline only; any further trailing
            // newlines are kept as they are.
            if ($html !== false && \substr($html, -1) === "\n") {
                $html = \substr($html, 0, -1);
            }
        } else {
            $detached = new \DOMDocument('1.0', $this->getEncoding());
            $detached->preserveWhiteSpace = true;
            $detached->formatOutput = false;

            $copy = $detached->importNode($node, true);
            if (!$copy instanceof \DOMNode) {
                return '';
            }

            $detached->appendChild($copy);

            $html = $detached->saveHTML($copy);
        }

        return $html === false ? '' : $html;
    }
1185

1186
    /**
     * Serialize a node-backed document on PHP < 8.0.
     *
     * The whole document is saved with saveHTML() (no node argument) and the
     * scaffolding around the root element is removed afterwards:
     *  - every DOCTYPE declaration,
     *  - a leading <html ...> / trailing </html> unless the root is <html>,
     *  - a leading <body ...> / trailing </body> and any "<body></body>"
     *    unless the root is <html> or <body>.
     * The string is trimmed after each stripping step.
     *
     * @return string the stripped HTML, or '' when saveHTML() failed
     */
    private function serializeCreatedFromNodeForPhpLt8(): string
    {
        $html = $this->document->saveHTML();
        if ($html === false) {
            return '';
        }

        $html = \trim((string) \preg_replace('/<!DOCTYPE[^>]+>/i', '', $html));

        $root = $this->document->documentElement;
        $rootTag = $root instanceof \DOMElement ? \strtolower($root->tagName) : '';

        if ($rootTag === 'html') {
            return $html;
        }

        // Patterns in an array are applied one after another.
        $html = \trim((string) \preg_replace(['/^<html[^>]*>/i', '/<\/html>$/i'], '', $html));

        if ($rootTag === 'body') {
            return $html;
        }

        $html = (string) \preg_replace(['/^<body[^>]*>/i', '/<\/body>$/i'], '', $html);
        // An empty body element may still be present, e.g. for <head> roots.
        $html = \str_replace('<body></body>', '', $html);

        return \trim($html);
    }
1238

1239
    /**
     * Serialize all direct children of a node and join the results.
     *
     * Each child is serialized on its own via serializeNode(); the pieces are
     * concatenated in document order without any separator.
     *
     * @param \DOMNode $parentNode node whose children are serialized
     *
     * @return string concatenated HTML of the children ('' when there are none)
     */
    private function serializeChildNodes(\DOMNode $parentNode): string
    {
        $pieces = [];

        foreach ($parentNode->childNodes as $child) {
            $pieces[] = $this->serializeNode($child);
        }

        return \implode('', $pieces);
    }
1254

1255
    /**
     * Tell whether the document root is the internal wrapper element.
     *
     * @return bool true when a document element exists and its tag name
     *              equals self::$domHtmlWrapperHelper (exact comparison)
     */
    private function usesInternalWrapperDocument(): bool
    {
        $root = $this->document->documentElement;
        if (!$root instanceof \DOMElement) {
            return false;
        }

        return $root->tagName === self::$domHtmlWrapperHelper;
    }
1263

1264
    /**
     * Tell whether the document is an <html> document whose content lives
     * only in its body.
     *
     * The check is true when the root element is <html>, the first <head>
     * below it is missing or has no child nodes, and the first <body> below
     * it exists and has at least one child node.
     *
     * Per the original author's note, older libxml keeps body-only fragments
     * more faithful when the whole temporary document is serialized, whereas
     * head-only fragments still need root-element serialization because a
     * <meta charset=...> can otherwise trigger output re-encoding.
     *
     * @return bool
     */
    private function isBodyOnlyHtmlFragmentDocument(): bool
    {
        $root = $this->document->documentElement;
        $rootIsHtml = $root instanceof \DOMElement && \strtolower($root->tagName) === 'html';
        if (!$rootIsHtml) {
            return false;
        }

        $head = $root->getElementsByTagName('head')->item(0);
        if ($head instanceof \DOMElement && $head->childNodes->length > 0) {
            return false;
        }

        $body = $root->getElementsByTagName('body')->item(0);

        return $body instanceof \DOMElement && $body->childNodes->length > 0;
    }
1285

1286
    /**
     * Decide whether html() should serialize the whole document via
     * saveHTML() instead of using one of the node-based strategies.
     *
     * Always false on PHP >= 8.0. On older versions it is true for internal
     * wrapper documents, and for documents created without an html wrapper
     * whose root element is either not <html> or is a body-only <html>
     * fragment.
     *
     * @return bool
     */
    private function shouldUseWholeDocumentSerializationForHtmlOnPhpLt8(): bool
    {
        if (\PHP_VERSION_ID >= 80000) {
            return false;
        }

        if ($this->usesInternalWrapperDocument()) {
            return true;
        }

        $root = $this->document->documentElement;

        return $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            && $root instanceof \DOMElement
            && (
                \strtolower($root->tagName) !== 'html'
                || $this->isBodyOnlyHtmlFragmentDocument()
            );
    }
1308

1309
    /**
     * Decide whether innerHtml() should serialize the whole document via
     * saveHTML().
     *
     * Always false on PHP >= 8.0. On older versions it is true for internal
     * wrapper documents and for body-only <html> fragment documents.
     *
     * @return bool
     */
    private function shouldUseWholeDocumentSerializationForInnerHtmlOnPhpLt8(): bool
    {
        if (\PHP_VERSION_ID >= 80000) {
            return false;
        }

        if ($this->usesInternalWrapperDocument()) {
            return true;
        }

        return $this->isBodyOnlyHtmlFragmentDocument();
    }
1317

1318
    /**
     * Serialize the children of the document element and re-wrap the result
     * in the internal wrapper tag.
     *
     * Per the original author's note, the wrapper markers are kept around the
     * detached child serialization so that fixHtmlOutput() does not trim
     * leading/trailing whitespace of the fragment.
     *
     * @return string "<wrapper>" + serialized children + "</wrapper>", where
     *                "wrapper" is self::$domHtmlWrapperHelper
     */
    private function serializeInternalWrapperContent(): string
    {
        $tag = self::$domHtmlWrapperHelper;

        $opening = '<' . $tag . '>';
        $inner = $this->serializeChildNodes($this->document->documentElement);
        $closing = '</' . $tag . '>';

        return $opening . $inner . $closing;
    }
1332

1333
    /**
     * Check whether an HTML fragment has more than one significant top-level
     * node.
     *
     * The fragment (after replaceToPreserveHtmlEntities()) is wrapped in the
     * internal wrapper tag and parsed as XML with SimpleXML. If parsing fails
     * or libxml reports any error, the answer is false. Otherwise the direct
     * children of the wrapper are counted via countSignificantChildNodes().
     *
     * libxml's internal-error mode is switched on for the duration of the
     * probe and restored to its previous value afterwards; the libxml error
     * buffer is cleared before and after.
     *
     * @param string $html       fragment to inspect
     * @param int    $optionsXml libxml options passed to simplexml_load_string()
     *
     * @return bool true only when the probe parsed cleanly and has more than
     *              one significant direct child
     */
    private function hasMultipleTopLevelNodes(string $html, int $optionsXml): bool
    {
        $previousErrorMode = \libxml_use_internal_errors(true);
        try {
            \libxml_clear_errors();

            $tag = self::$domHtmlWrapperHelper;
            $probe = '<' . $tag . '>'
                . self::replaceToPreserveHtmlEntities($html)
                . '</' . $tag . '>';

            $parsed = \simplexml_load_string($probe, \SimpleXMLElement::class, $optionsXml);
            $parsedCleanly = $parsed !== false && \count(\libxml_get_errors()) === 0;
            if (!$parsedCleanly) {
                return false;
            }

            $wrapperElement = \dom_import_simplexml($parsed);

            return $wrapperElement instanceof \DOMElement
                && $this->countSignificantChildNodes($wrapperElement) > 1;
        } finally {
            \libxml_clear_errors();
            \libxml_use_internal_errors($previousErrorMode);
        }
    }
1369

1370
    /**
     * Count the significant direct children of a node, stopping at two.
     *
     * Text nodes whose content is empty after trim() are ignored. Counting
     * stops as soon as a second significant child is seen, so the result is
     * 0, 1 or 2.
     *
     * @param \DOMNode $node node whose direct children are inspected
     *
     * @return int number of significant children, capped at 2
     */
    private function countSignificantChildNodes(\DOMNode $node): int
    {
        $significant = 0;

        foreach ($node->childNodes as $child) {
            $isBlankText = $child->nodeType === \XML_TEXT_NODE
                && \trim($child->textContent) === '';

            if (!$isBlankText && ++$significant > 1) {
                break;
            }
        }

        return $significant;
    }
1396

1397
    /**
     * {@inheritdoc}
     *
     * Serializes the content of the document element. On PHP < 8.0 the whole
     * document is saved when the legacy helper asks for it; internal wrapper
     * documents keep their wrapper markers; everything else is serialized
     * child by child. The result is passed through fixHtmlOutput(). When
     * there is no document element, an empty string is passed instead.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput()
     * @param bool $putBrokenReplacedBack    forwarded to fixHtmlOutput()
     *
     * @return string the cleaned-up inner HTML, or '' when serialization failed
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        $text = '';

        if ($this->document->documentElement) {
            if ($this->shouldUseWholeDocumentSerializationForInnerHtmlOnPhpLt8()) {
                $text = $this->document->saveHTML();
            } elseif ($this->usesInternalWrapperDocument()) {
                $text = $this->serializeInternalWrapperContent();
            } else {
                $text = $this->serializeChildNodes($this->document->documentElement);
            }
        }

        // DOMDocument::saveHTML() reports failure with false. html() maps
        // that to an empty string; do the same here instead of handing a
        // non-string value on to fixHtmlOutput().
        if ($text === false) {
            return '';
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
    }
1418

1419
    /**
     * Get the document's plain text.
     *
     * All text nodes are collected in document order via XPath, except those
     * that have a <script> or <style> ancestor or an ancestor named after
     * self::$domHtmlSpecialScriptHelper. Their values are concatenated
     * without a separator and passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $query = \sprintf(
            '//text()[not(ancestor::script or ancestor::style or ancestor::%s)]',
            self::$domHtmlSpecialScriptHelper
        );

        $textNodes = (new \DOMXPath($this->document))->query($query);

        $plain = '';
        // query() yields false for an invalid expression; treat it as "no text".
        if ($textNodes !== false) {
            foreach ($textNodes as $textNode) {
                $plain .= $textNode->nodeValue;
            }
        }

        return $this->fixHtmlOutput($plain, $multiDecodeNewHtmlEntity);
    }
1450

1451
    /**
     * Load HTML from a string.
     *
     * The string is turned into a DOMDocument by createDOMDocument() and the
     * result replaces the parser's current document.
     *
     * @param string   $html                    markup to parse
     * @param int|null $libXMLExtraOptions      forwarded to createDOMDocument()
     * @param bool     $useDefaultLibXMLOptions forwarded to createDOMDocument()
     *
     * @return $this
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
    {
        $parsedDocument = $this->createDOMDocument($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
        $this->document = $parsedDocument;

        return $this;
    }
1466

1467
    /**
     * Load HTML from a file or an http(s) URL.
     *
     * Paths that do not start with "http://" or "https://" (case-insensitive)
     * must exist on disk. The content is read with
     * \voku\helper\UTF8::file_get_contents() when that class is available,
     * otherwise with PHP's file_get_contents(), and is then passed to
     * loadHtml().
     *
     * @param string   $filePath                local path or http(s) URL
     * @param int|null $libXMLExtraOptions      forwarded to loadHtml()
     * @param bool     $useDefaultLibXMLOptions forwarded to loadHtml()
     *
     * @throws \RuntimeException when a local file does not exist or the
     *                           content cannot be read; if reading threw an
     *                           exception, it is available via getPrevious()
     *
     * @return $this
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
    {
        // URLs skip the existence check; only local paths are verified.
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException('File ' . $filePath . ' not found');
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                $html = \voku\helper\UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause is not lost.
            throw new \RuntimeException('Could not load file ' . $filePath, 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException('Could not load file ' . $filePath);
        }

        return $this->loadHtml($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
    }
1504

1505
    /**
     * Serialize the document as XML.
     *
     * The document is saved with saveXML(). Optionally the "<?xml ...?>"
     * header is removed (followed by a left-trim). With $htmlToXml enabled
     * the result goes through fixHtmlOutput(); otherwise only HTML entities
     * are decoded and the preserved-entity replacements are put back.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to the entity decoding step
     * @param bool $htmlToXml                true to post-process via fixHtmlOutput()
     * @param bool $removeXmlHeader          true to strip the XML declaration
     * @param int  $options                  libxml options for saveXML()
     *
     * @return string the XML, or '' when saveXML() failed
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim($withoutHeader);
        }

        if (!$htmlToXml) {
            $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
1540

1541
    /**
     * Allow the parser object to be called like a function; the call is
     * forwarded to find().
     *
     * @param string   $selector selector passed to find()
     * @param int|null $idx      index passed to find()
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        $result = $this->find($selector, $idx);

        return $result;
    }
1551

1552
    /**
     * Expose the parser's "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
1559

1560
    /**
     * Expose the parser's "isDOMDocumentCreatedWithoutPTagWrapper" flag.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
1567

1568
    /**
     * Expose the parser's "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
1575

1576
    /**
     * Expose the parser's "isDOMDocumentCreatedWithoutBodyWrapper" flag.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
1583

1584
    /**
     * Expose the parser's "isDOMDocumentCreatedWithMultiRoot" flag.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithMultiRoot(): bool
    {
        return $this->isDOMDocumentCreatedWithMultiRoot;
    }
1591

1592
    /**
     * Expose the parser's "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
1599

1600
    /**
     * Expose the parser's "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
1607

1608
    /**
     * Expose the parser's "isDOMDocumentCreatedWithFakeEndScript" flag.
     *
     * @return bool current value of the flag
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
1615

1616
    /**
     * Replace tag sequences that look like stray closing tags with
     * placeholders registered for later restoration.
     *
     * Works in three steps:
     *  1. Repeatedly mask one opening tag together with a closing tag of the
     *     same name (case-insensitive) by swapping their angle brackets for
     *     marker strings, until nothing changes any more.
     *  2. Repeatedly take one remaining run of "</...>" sequences, un-mask
     *     any markers inside it, register it with
     *     registerDynamicDomBrokenReplaceHelper() under a crc32-based
     *     placeholder and substitute that placeholder, until nothing changes.
     *  3. Turn all remaining markers back into real angle brackets.
     *
     * @param string $html markup to process
     *
     * @return string markup with those closing-tag runs replaced by placeholders
     */
    protected function keepBrokenHtml(string $html): string
    {
        $ltMarker = '°lt_simple_html_dom__voku_°';
        $ltSlashMarker = '°lt/_simple_html_dom__voku_°';
        $gtMarker = '°gt_simple_html_dom__voku_°';

        $markers = [$ltSlashMarker, $ltMarker, $gtMarker];
        $brackets = ['</', '<', '>'];

        $maskBalancedPair = static function ($m) use ($ltMarker, $ltSlashMarker, $gtMarker) {
            $maskedOpen = $ltMarker . $m['element_start'] . $m['element_start_addon'] . $gtMarker;
            $maskedClose = $ltSlashMarker . $m['element_end'] . $gtMarker;

            return $m['start'] . $maskedOpen . $m['value'] . $maskedClose . $m['end'];
        };

        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                $maskBalancedPair,
                $before
            );
        } while ($before !== $html);

        $stashBrokenRun = function ($m) use ($markers, $brackets) {
            // Restore real brackets inside the run before it is registered.
            $broken = \str_replace($markers, $brackets, $m['broken']);

            $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($broken);
            $this->registerDynamicDomBrokenReplaceHelper($broken, $placeholder);

            return $m['start'] . $placeholder . $m['end'];
        };

        do {
            $before = $html;

            $html = (string) \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:<\/\w+(?:\s+\w+=\"[^"]+\")*+[^<]+>)+)(?<end>.*)/u',
                $stashBrokenRun,
                $before
            );
        } while ($before !== $html);

        return \str_replace($markers, $brackets, $html);
    }
1666

1667
    /**
     * Replace inline <svg>...</svg> markup inside "(data:image/svg...)"
     * expressions with placeholders.
     *
     * Workaround for https://bugs.php.net/bug.php?id=74628.
     *
     * Example of a matched expression:
     * [mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg">...</svg>')]
     *
     * Each matched <svg> element is registered with
     * registerDynamicDomBrokenReplaceHelper() under a crc32-based placeholder
     * that takes its place in the markup. The argument is modified in place;
     * it is left untouched when the regex engine reports an error.
     *
     * @param string $html markup to process (passed by reference)
     *
     * @return void
     */
    protected function keepSpecialSvgTags(string &$html)
    {
        /** @noinspection HtmlDeprecatedTag */
        $pattern = '/\((["\'])?(?<start>data:image\/svg.*)<svg(?<attr>[^>]*?)>(?<content>.*)<\/svg>\1\)/isU';

        $stashSvg = function ($match) {
            // Group 1 is the optional quote character around the data URI.
            $quote = $match[1];

            $svgMarkup = '<svg' . $match['attr'] . '>' . $match['content'] . '</svg>';
            $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($svgMarkup);
            $this->registerDynamicDomBrokenReplaceHelper($svgMarkup, $placeholder);

            return '(' . $quote . $match['start'] . $placeholder . $quote . ')';
        };

        $replaced = \preg_replace_callback($pattern, $stashSvg, $html);
        if ($replaced === null) {
            return;
        }

        $html = $replaced;
    }
1695

1696
    /**
     * Protect the content of "special" script tags, e.g.
     * [<script id="elements-image-1" type="text/html">...</script>].
     *
     * A script tag is matched when its type attribute is one of
     * $this->specialScriptTags. For every match:
     *  - if the inner content contains one of the configured template logic
     *    markers, the content (with "<\/" turned back into "</") is
     *    registered under a crc32-based placeholder and replaced by it, and
     *    the surrounding script tags are kept;
     *  - otherwise the whole element (with "<\/" turned back into "</") is
     *    renamed from "script" to self::$domHtmlSpecialScriptHelper.
     * The argument is modified in place.
     *
     * @param string $html markup to process (passed by reference)
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        $quotedTypes = [];
        foreach ($this->specialScriptTags as $type) {
            $quotedTypes[] = \preg_quote($type, '/');
        }
        $tags = \implode('|', $quotedTypes);

        $protect = function ($matches) {
            // Templates with EJS/ERB-style logic often look like invalid
            // HTML, so their content is swapped for a placeholder.
            foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicMarker) {
                if (\strpos($matches['innerContent'], $logicMarker) === false) {
                    continue;
                }

                // remove the html5 fallback
                $inner = \str_replace('<\/', '</', $matches['innerContent']);

                $placeholder = self::$domHtmlBrokenHtmlHelper . \crc32($inner);
                $this->registerDynamicDomBrokenReplaceHelper($inner, $placeholder);

                return $matches['start'] . $placeholder . $matches['end'];
            }

            // remove the html5 fallback
            $element = \str_replace('<\/', '</', $matches[0]);

            $helperTag = self::$domHtmlSpecialScriptHelper;
            $renamedOpen = '<' . $helperTag . \substr($element, \strlen('<script'));

            return \substr($renamedOpen, 0, -\strlen('</script>')) . '</' . $helperTag . '>';
        };

        $html = (string) \preg_replace_callback(
            '/(?<start>(<script [^>]*type=["\']?(?:' . $tags . ')+[^>]*>))(?<innerContent>.*)(?<end><\/script>)/isU',
            $protect,
            $html
        );
    }
1737

1738
    /**
1739
     * @param bool $keepBrokenHtml
1740
     *
1741
     * @return $this
1742
     */
1743
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        // Remember the flag on this instance; the same instance is returned (fluent setter).
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1749

1750
    /**
1751
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
1752
     *
1753
     * @return $this
1754
     */
1755
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        // Validate first: a single non-string entry rejects the whole list and nothing is stored.
        foreach ($templateLogicSyntaxInSpecialScriptTags as $syntax) {
            if (\is_string($syntax)) {
                continue;
            }

            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        // All entries are strings: replace the current list and return the same instance.
        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1767

1768
    /**
1769
     * @param string[] $specialScriptTags
1770
     *
1771
     * @return $this
1772
     */
1773
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        // Validate first: a single non-string entry rejects the whole list and nothing is stored.
        foreach ($specialScriptTags as $scriptType) {
            if (\is_string($scriptType)) {
                continue;
            }

            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        // All entries are strings: replace the current list and return the same instance.
        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1785

1786
    /**
1787
     * @param callable $callbackXPathBeforeQuery
1788
     *
1789
     * @phpstan-param callable(string $cssSelectorString, string $xPathString,\DOMXPath,\voku\helper\HtmlDomParser): string $callbackXPathBeforeQuery
1790
     *
1791
     * @return $this
1792
     */
1793
    public function setCallbackXPathBeforeQuery(callable $callbackXPathBeforeQuery): self
    {
        // Keep the callable on this instance; the same instance is returned (fluent setter).
        $this->callbackXPathBeforeQuery = $callbackXPathBeforeQuery;

        return $this;
    }
1799

1800
    /**
1801
     * @param callable $callbackBeforeCreateDom
1802
     *
1803
     * @phpstan-param callable(string $htmlString, \voku\helper\HtmlDomParser): string $callbackBeforeCreateDom
1804
     *
1805
     * @return $this
1806
     */
1807
    public function setCallbackBeforeCreateDom(callable $callbackBeforeCreateDom): self
    {
        // Keep the callable on this instance; the same instance is returned (fluent setter).
        $this->callbackBeforeCreateDom = $callbackBeforeCreateDom;

        return $this;
    }
1813
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc