• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

voku / simple_html_dom / 9233656301

25 May 2024 06:51AM UTC coverage: 66.091%. Remained the same
9233656301

push

github

web-flow
Merge pull request #106 from devteam-emroc/php-83

Adding 'id' to DOMNodes with type string.

0 of 1 new or added line in 1 file covered. (0.0%)

4 existing lines in 2 files now uncovered.

1109 of 1678 relevant lines covered (66.09%)

102.94 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.24
/src/voku/helper/HtmlDomParser.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace voku\helper;
6

7
/**
8
 * @property-read string $outerText
9
 *                                 <p>Get dom node's outer html (alias for "outerHtml").</p>
10
 * @property-read string $outerHtml
11
 *                                 <p>Get dom node's outer html.</p>
12
 * @property-read string $innerText
13
 *                                 <p>Get dom node's inner html (alias for "innerHtml").</p>
14
 * @property-read string $innerHtml
15
 *                                 <p>Get dom node's inner html.</p>
16
 * @property-read string $plaintext
17
 *                                 <p>Get dom node's plain text.</p>
18
 *
19
 * @method string outerText()
20
 *                                 <p>Get dom node's outer html (alias for "outerHtml()").</p>
21
 * @method string outerHtml()
22
 *                                 <p>Get dom node's outer html.</p>
23
 * @method string innerText()
24
 *                                 <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 * @method HtmlDomParser load(string $html)
26
 *                                 <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file(string $html)
28
 *                                 <p>Load HTML from file.</p>
29
 * @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null)
30
 *                                 <p>Load HTML from file.</p>
31
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null)
32
 *                                 <p>Load HTML from string.</p>
33
 */
34
class HtmlDomParser extends AbstractDomParser
35
{
36
    /**
     * Optional hook that find() calls right before running the XPath query.
     * It receives the CSS selector, the generated XPath string, the \DOMXPath
     * instance and this parser; its return value becomes the query that is run.
     *
     * @var callable|null
     *
     * @phpstan-var null|callable(string $cssSelectorString, string $xPathString, \DOMXPath, \voku\helper\HtmlDomParser): string
     */
    private $callbackXPathBeforeQuery;

    /**
     * Optional hook that createDOMDocument() calls first; it receives the raw
     * HTML string and this parser, and its return value replaces the HTML.
     *
     * @var callable|null
     *
     * @phpstan-var null|callable(string $htmlString, \voku\helper\HtmlDomParser): string
     */
    private $callbackBeforeCreateDom;

    /**
     * Lower-cased legacy method names mapped to the real method that __call()
     * forwards to.
     *
     * @var string[]
     */
    protected static $functionAliases = [
        'outertext' => 'html',
        'outerhtml' => 'html',
        'innertext' => 'innerHtml',
        'innerhtml' => 'innerHtml',
        'load'      => 'loadHtml',
        'load_file' => 'loadHtmlFile',
    ];

    /**
     * Markers of template syntax inside special script tags.
     *
     * NOTE(review): not read anywhere in the visible part of this class;
     * presumably consulted by the script-tag helpers - confirm.
     *
     * @var string[]
     */
    protected $templateLogicSyntaxInSpecialScriptTags = [
        '+',
        '<%',
        '{%',
        '{{',
    ];

    /**
     * Script "type" values that mark a <script> tag as a template container.
     * createDOMDocument() calls keepSpecialScriptTags() when the input
     * contains "<script" and one of these strings.
     *
     * ```php
     * protected $specialScriptTags = [
     *     'text/html',
     *     'text/x-custom-template',
     *     'text/x-handlebars-template'
     * ]
     * ```
     *
     * @var string[]
     */
    protected $specialScriptTags = [
        'text/html',
        'text/x-custom-template',
        'text/x-handlebars-template',
    ];

    /**
     * Tag names treated as self-closing: createDOMDocument() rewrites "<tag>"
     * to "<tag/>" before parsing, and fixHtmlOutput() strips "</tag>" again.
     *
     * @var string[]
     */
    protected $selfClosingTags = [
        'area',
        'base',
        'br',
        'col',
        'command',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr',
    ];

    /**
     * Set by createDOMDocument() when the input contains no "<" at all.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtml = false;

    /**
     * Set by createDOMDocument() when the input contains "<" but does not
     * start with it (ignoring leading whitespace).
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutWrapper = false;

    /**
     * Set by createDOMDocument() when the input starts with "<!--"
     * (ignoring leading whitespace).
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithCommentWrapper = false;

    /**
     * Set by createDOMDocument() when the input has neither "<head " nor "<head>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

    /**
     * Set by createDOMDocument() when the input has neither "<p " nor "<p>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutPTagWrapper = false;

    /**
     * Set by createDOMDocument() when the input has neither "<html " nor "<html>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

    /**
     * Set by createDOMDocument() when the input has neither "<body " nor "<body>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutBodyWrapper = false;

    /**
     * Set by createDOMDocument() when the input has no html/body wrapper and
     * the multi-root heuristic finds nothing but sibling root elements.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithMultiRoot = false;

    /**
     * Set by createDOMDocument() when the input has no "</script>" but does
     * contain the escaped form "<\/script>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithFakeEndScript = false;

    /**
     * When true, createDOMDocument() passes the trimmed input through
     * keepBrokenHtml() and wraps doctype-less input in the helper root element.
     *
     * @var bool
     */
    protected $keepBrokenHtml = false;
162

163
    /**
     * Create a parser, optionally seeded with markup or an existing DOM node.
     *
     * A SimpleHtmlDomInterface is unwrapped to its node; a \DOMNode is deep-imported
     * into a fresh document; any other non-null value is handed to loadHtml().
     *
     * @param \DOMNode|SimpleHtmlDomInterface|string $element HTML code or SimpleHtmlDomInterface, \DOMNode
     */
    public function __construct($element = null)
    {
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        // Unwrap the library's element wrapper down to the underlying node.
        $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

        if ($source === null) {
            return;
        }

        if (!($source instanceof \DOMNode)) {
            $this->loadHtml($source);

            return;
        }

        // Deep-copy the foreign node into our own document before attaching it.
        $imported = $this->document->importNode($source, true);
        if ($imported instanceof \DOMNode) {
            $this->document->appendChild($imported);
        }
    }
192

193
    /**
     * Forward legacy (case-insensitive) method names to their real implementation.
     *
     * @param string $name      called method name; looked up lower-cased in self::$functionAliases
     * @param array  $arguments arguments passed through unchanged
     *
     * @throws \BadMethodCallException if the name is not a known alias
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = [$this, self::$functionAliases[$alias]];

        // Deliberately dispatched via call_user_func_array(): a direct call from this
        // strict_types file would apply strict scalar type checks to the arguments.
        return \call_user_func_array($target, $arguments);
    }
209

210
    /**
     * Static factory shortcuts: "str_get_html" and "file_get_html".
     *
     * @param string $name      static method name (matched case-sensitively)
     * @param array  $arguments [0] HTML string or file path (default ''), [1] extra libxml options (default null)
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     *
     * @return static
     */
    public static function __callStatic($name, $arguments)
    {
        $source = $arguments[0] ?? '';
        $extraOptions = $arguments[1] ?? null;

        // Supported static shortcut => instance loader that does the work.
        $loaders = [
            'str_get_html'  => 'loadHtml',
            'file_get_html' => 'loadHtmlFile',
        ];

        if (!isset($loaders[$name])) {
            throw new \BadMethodCallException('Method does not exist');
        }

        $parser = new static();

        return $parser->{$loaders[$name]}($source, $extraOptions);
    }
239

240
    /** @noinspection MagicMethodsValidityInspection */
241

242
    /**
     * Magic read access for the virtual properties (name is case-insensitive):
     * outerHtml/outerText, innerHtml/innerText, innerHtmlKeep, text/plaintext.
     *
     * @param string $name
     *
     * @return string|null null for any unknown property name
     */
    public function __get($name)
    {
        $property = \strtolower($name);

        if ($property === 'outerhtml' || $property === 'outertext') {
            return $this->html();
        }

        if ($property === 'innerhtml' || $property === 'innertext') {
            return $this->innerHtml();
        }

        if ($property === 'innerhtmlkeep') {
            return $this->innerHtml(false, false);
        }

        if ($property === 'text' || $property === 'plaintext') {
            return $this->text();
        }

        return null;
    }
267

268
    /**
     * String conversion yields the same output as html().
     *
     * @return string
     */
    public function __toString()
    {
        $outerHtml = $this->html();

        return $outerHtml;
    }
275

276
    /**
     * No-op kept only for API compatibility; always reports success.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // Intentionally does nothing.
        return true;
    }
287

288
    /**
     * Create DOMDocument from HTML.
     *
     * Steps, in order: run the optional "before create" callback, drop anything in
     * front of the doctype, record which wrappers (html/head/body/p/...) the input
     * lacks so fixHtmlOutput() can remove the ones the parser adds, protect
     * script/svg content, normalise self-closing tags, then try to parse as XML
     * and fall back to DOMDocument::loadHTML().
     *
     * The libxml error mode (and, before PHP 8, the entity loader setting) is
     * changed for the duration of the call and restored afterwards.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions      extra LIBXML_* flags OR-ed into the options
     * @param bool     $useDefaultLibXMLOptions whether to start from the default LIBXML_* flag set
     *
     * @return \DOMDocument
     */
    protected function createDOMDocument(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): \DOMDocument
    {
        if ($this->callbackBeforeCreateDom) {
            $html = \call_user_func($this->callbackBeforeCreateDom, $html, $this);
        }

        // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
        $isDOMDocumentCreatedWithDoctype = false;
        if (\stripos($html, '<!DOCTYPE') !== false) {
            $isDOMDocumentCreatedWithDoctype = true;
            if (
                \preg_match('/(^.*?)<!DOCTYPE(?: [^>]*)?>/sui', $html, $matches_before_doctype)
                &&
                \trim($matches_before_doctype[1]) !== ''
            ) {
                // The capture is anchored at the start of the input, so cut exactly that
                // prefix. (A str_replace() here would also delete every later occurrence
                // of the same text inside the document.)
                $html = (string) \substr($html, \strlen($matches_before_doctype[1]));
            }
        }

        if ($this->keepBrokenHtml) {
            $html = $this->keepBrokenHtml(\trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos(\ltrim($html), '<!--') === 0) {
            $this->isDOMDocumentCreatedWithCommentWrapper = true;
        }

        /** @noinspection HtmlRequiredLangAttribute */
        if (
            \strpos($html, '<html ') === false
            &&
            \strpos($html, '<html>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (
            \strpos($html, '<body ') === false
            &&
            \strpos($html, '<body>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
        }

        /** @noinspection HtmlRequiredTitleElement */
        if (
            \strpos($html, '<head ') === false
            &&
            \strpos($html, '<head>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        if (
            \strpos($html, '<p ') === false
            &&
            \strpos($html, '<p>') === false
        ) {
            $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
        }

        // Only an escaped closing script tag is present (e.g. inside a JS string).
        if (
            \strpos($html, '</script>') === false
            &&
            \strpos($html, '<\/script>') !== false
        ) {
            $this->isDOMDocumentCreatedWithFakeEndScript = true;
        }

        // Move content that trails "</html>" back inside the html element.
        if (\stripos($html, '</html>') !== false) {
            /** @noinspection NestedPositiveIfStatementsInspection */
            if (
                \preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
                &&
                \trim($matches_after_html[1])
            ) {
                $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
            }
        }

        // NOTE(review): the return values of the three helpers below are unused, so
        // they presumably rewrite $html through a by-reference parameter - confirm.
        if (\strpos($html, '<script') !== false) {
            $this->html5FallbackForScriptTags($html);

            foreach ($this->specialScriptTags as $tag) {
                if (\strpos($html, $tag) !== false) {
                    $this->keepSpecialScriptTags($html);
                }
            }
        }

        if (\strpos($html, '<svg') !== false) {
            $this->keepSpecialSvgTags($html);
        }

        // Multi-root heuristic: without an html/body wrapper, check whether the input
        // is nothing but several sibling elements.
        if (
            $this->isDOMDocumentCreatedWithoutHtmlWrapper
            &&
            $this->isDOMDocumentCreatedWithoutBodyWrapper
        ) {
            if (\substr_count($html, '</') >= 2) {
                $regexForMultiRootDetection = '#<(.*)>.*?</(\1)>#su';
                \preg_match($regexForMultiRootDetection, $html, $matches);
                if (($matches[0] ?? '') !== $html) {
                    $htmlTmp = \preg_replace($regexForMultiRootDetection, '', $html);
                    if ($htmlTmp !== null && trim($htmlTmp) === '') {
                        $this->isDOMDocumentCreatedWithMultiRoot = true;
                    }
                }
            }
        }

        // Rewrite bare "<br>"-style tags to "<br/>" for every known self-closing tag.
        $html = \str_replace(
            \array_map(static function ($e) {
                return '<' . $e . '>';
            }, $this->selfClosingTags),
            \array_map(static function ($e) {
                return '<' . $e . '/>';
            }, $this->selfClosingTags),
            $html
        );

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        $optionsXml = 0;
        if ($useDefaultLibXMLOptions) {
            $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

            if (\defined('LIBXML_BIGLINES')) {
                $optionsXml |= \LIBXML_BIGLINES;
            }

            if (\defined('LIBXML_COMPACT')) {
                $optionsXml |= \LIBXML_COMPACT;
            }

            if (\defined('LIBXML_HTML_NODEFDTD')) {
                $optionsXml |= \LIBXML_HTML_NODEFDTD;
            }
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        // Give fragments a single artificial root element; fixHtmlOutput() strips it again.
        if (
            $this->isDOMDocumentCreatedWithMultiRoot
            ||
            $this->isDOMDocumentCreatedWithoutWrapper
            ||
            $this->isDOMDocumentCreatedWithCommentWrapper
            ||
            (
                !$isDOMDocumentCreatedWithDoctype
                &&
                $this->keepBrokenHtml
            )
        ) {
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        // First attempt: strict XML parsing; accepted only if libxml reported no error.
        $documentFound = false;
        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
            $domElementTmp = \dom_import_simplexml($sxe);
            if ($domElementTmp->ownerDocument instanceof \DOMDocument) {
                $documentFound = true;
                $this->document = $domElementTmp->ownerDocument;
            }
        }

        if ($documentFound === false) {
            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            //
            // NOTE(review): the stripos() arguments look reversed (haystack '<?xml',
            // needle $html), so the declaration is prepended for practically every
            // non-empty input. Left unchanged on purpose: correcting it would alter how
            // documents that carry their own XML declaration are decoded - confirm
            // before changing.
            $xmlHackUsed = false;
            if (\stripos('<?xml', $html) !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            if ($html !== '') {
                $this->document->loadHTML($html, $optionsXml);
            }

            // remove the "xml-encoding" hack
            if ($xmlHackUsed) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === \XML_PI_NODE) {
                        $this->document->removeChild($child);

                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if (\PHP_VERSION_ID < 80000 && isset($disableEntityLoader)) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }
516

517
    /**
     * Find list of nodes with a CSS selector.
     *
     * The selector is converted to XPath; the optional "before query" callback may
     * rewrite the XPath expression before it is executed.
     *
     * @param string   $selector
     * @param int|null $idx null returns every match; an int returns that single
     *                      match, negative values counting from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function find(string $selector, $idx = null)
    {
        $query = SelectorConverter::toXPath($selector);
        $xPath = new \DOMXPath($this->document);

        if ($this->callbackXPathBeforeQuery) {
            $query = \call_user_func($this->callbackXPathBeforeQuery, $selector, $query, $xPath, $this);
        }

        $matches = $xPath->query($query);

        // A failed query yields a falsy result; treat it as "no matches".
        $found = new SimpleHtmlDomNode();
        foreach ($matches ?: [] as $domNode) {
            $found[] = new SimpleHtmlDom($domNode);
        }

        // return all elements
        if ($idx === null) {
            return \count($found) === 0 ? new SimpleHtmlDomNodeBlank() : $found;
        }

        // handle negative values
        $position = $idx < 0 ? \count($found) + $idx : $idx;

        // return one element
        return $found[$position] ?? new SimpleHtmlDomBlank();
    }
562

563
    /**
     * Find every node matching a CSS selector (find() without an index).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMulti(string $selector): SimpleHtmlDomNodeInterface
    {
        $allMatches = $this->find($selector, null);

        return $allMatches;
    }
574

575
    /**
     * Find every node matching a CSS selector, or false when nothing matches.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function findMultiOrFalse(string $selector)
    {
        $allMatches = $this->find($selector, null);

        // find() signals "no match" with the blank list placeholder.
        return $allMatches instanceof SimpleHtmlDomNodeBlank ? false : $allMatches;
    }
592

593
    /**
     * Find the first node matching a CSS selector (find() with index 0).
     *
     * @param string $selector
     *
     * @return SimpleHtmlDomInterface
     */
    public function findOne(string $selector): SimpleHtmlDomInterface
    {
        $firstMatch = $this->find($selector, 0);

        return $firstMatch;
    }
604

605
    /**
     * Find the first node matching a CSS selector, or false when nothing matches.
     *
     * @param string $selector
     *
     * @return false|SimpleHtmlDomInterface
     */
    public function findOneOrFalse(string $selector)
    {
        $firstMatch = $this->find($selector, 0);

        // find() signals "no match" with the blank element placeholder.
        return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
    }
622

623
    /**
     * Undo the wrappers and artefacts the parser added to serialized output.
     *
     * Tags the original input did not contain (html/head/body/p, the default
     * doctype, the internal helper elements, closing tags of self-closing
     * elements) are stripped, then HTML entities are decoded and the preserved
     * placeholders are put back.
     *
     * @param string $content                  serialized HTML/XML to clean up
     * @param bool   $multiDecodeNewHtmlEntity forwarded to decodeHtmlEntity()
     * @param bool   $putBrokenReplacedBack    forwarded to putReplacedBackToPreserveHtmlEntities()
     *
     * @return string
     */
    public function fixHtmlOutput(
        string $content,
        bool $multiDecodeNewHtmlEntity = false,
        bool $putBrokenReplacedBack = true
    ): string {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            /** @noinspection HtmlRequiredLangAttribute */
            $content = \str_replace(['<html>', '</html>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
            /** @noinspection HtmlRequiredTitleElement */
            $content = \str_replace(['<head>', '</head>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
            $content = \str_replace(['<body>', '</body>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
            $content = \str_replace('</script>', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
            // Only a leading "<p>" is removed, but every "</p>" is.
            // NOTE(review): the second pattern is not anchored - confirm that is intended.
            $content = (string) \preg_replace('/^<p>/', '', $content);
            $content = (string) \preg_replace('/<\/p>/', '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
            $content = \str_replace(['<p>', '</p>'], '', $content);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
            $content = \str_replace(
                '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                '',
                $content
            );
        }

        // https://bugs.php.net/bug.php?id=73175
        $closingTagsOfSelfClosing = [];
        foreach ($this->selfClosingTags as $tagName) {
            $closingTagsOfSelfClosing[] = '</' . $tagName . '>';
        }
        $content = \str_replace($closingTagsOfSelfClosing, '', $content);

        // Drop the internal helper elements ...
        $content = \str_replace(
            [
                '<simpleHtmlDomHtml>',
                '</simpleHtmlDomHtml>',
                '<simpleHtmlDomP>',
                '</simpleHtmlDomP>',
            ],
            '',
            $content
        );

        // ... then collapse a doubled head wrapper into a single one.
        /** @noinspection HtmlRequiredTitleElement */
        $content = \str_replace(
            ['<head><head>', '</head></head>'],
            ['<head>', '</head>'],
            $content
        );

        $content = \trim($content);

        $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded, $putBrokenReplacedBack);
    }
741

742
    /**
     * Return elements by ".class" (delegates to findMulti()).
     *
     * @param string $class class name without the leading dot
     *
     * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
    {
        $selector = '.' . $class;

        return $this->findMulti($selector);
    }
753

754
    /**
     * Return element by #id (delegates to findOne()).
     *
     * @param string $id id without the leading hash
     *
     * @return SimpleHtmlDomInterface
     */
    public function getElementById(string $id): SimpleHtmlDomInterface
    {
        $selector = '#' . $id;

        return $this->findOne($selector);
    }
765

766
    /**
     * Return the first element with the given tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDomInterface a blank element placeholder when no such tag exists
     */
    public function getElementByTagName(string $name): SimpleHtmlDomInterface
    {
        $firstNode = $this->document->getElementsByTagName($name)->item(0);

        return $firstNode === null
            ? new SimpleHtmlDomBlank()
            : new SimpleHtmlDom($firstNode);
    }
783

784
    /**
     * Returns elements by "#id" (delegates to find()).
     *
     * @param string   $id  id without the leading hash
     * @param int|null $idx null for all matches, otherwise the index of a single match
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function getElementsById(string $id, $idx = null)
    {
        $selector = '#' . $id;

        return $this->find($selector, $idx);
    }
796

797
    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param int|null $idx null returns every match; an int returns that single
     *                      match, negative values counting from the end
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     *                      all matches (blank list placeholder if none) when $idx is null,
     *                      otherwise one element (blank element placeholder if out of range)
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        $elements = new SimpleHtmlDomNode();

        foreach ($this->document->getElementsByTagName($name) as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if ($idx === null) {
            if (\count($elements) === 0) {
                return new SimpleHtmlDomNodeBlank();
            }

            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element; a miss yields the blank *element* placeholder (as in
        // find() and getElementByTagName()), not the blank list placeholder
        return $elements[$idx] ?? new SimpleHtmlDomBlank();
    }
832

833
    /**
     * Get dom node's outer html.
     *
     * If a static callback is registered it is invoked with [$this] first. When the
     * input had no html wrapper only the document element is serialized, otherwise
     * the whole document; the result is passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $putBrokenReplacedBack
     *
     * @return string empty string if serialization fails
     */
    public function html(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        if (static::$callback !== null) {
            \call_user_func(static::$callback, [$this]);
        }

        $serialized = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
            ? $this->document->saveHTML($this->document->documentElement)
            : $this->document->saveHTML();

        if ($serialized === false) {
            return '';
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
    }
859

860
    /**
     * Load HTML from string, replacing the current document.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions      extra LIBXML_* flags
     * @param bool     $useDefaultLibXMLOptions whether to start from the default LIBXML_* flag set
     *
     * @return $this
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
    {
        $parsedDocument = $this->createDOMDocument($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
        $this->document = $parsedDocument;

        return $this;
    }
875

876
    /**
     * Load HTML from file.
     *
     * Paths starting with "http://" or "https://" skip the local existence check.
     * The content is read through \voku\helper\UTF8::file_get_contents() when
     * that class is available, otherwise through \file_get_contents().
     *
     * @param string   $filePath                local path or http(s) URL
     * @param int|null $libXMLExtraOptions      extra LIBXML_* flags
     * @param bool     $useDefaultLibXMLOptions whether to start from the default LIBXML_* flag set
     *
     * @throws \RuntimeException if a local file does not exist or the content cannot be read
     *
     * @return $this
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null, $useDefaultLibXMLOptions = true): DomParserInterface
    {
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException('File ' . $filePath . ' not found');
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                $html = \voku\helper\UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // Chain the original exception so the root cause stays visible to callers.
            throw new \RuntimeException('Could not load file ' . $filePath, 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException('Could not load file ' . $filePath);
        }

        return $this->loadHtml($html, $libXMLExtraOptions, $useDefaultLibXMLOptions);
    }
913

914
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to the entity decoding step
     * @param bool $htmlToXml                true: clean the output via fixHtmlOutput();
     *                                       false: only decode entities and restore placeholders
     * @param bool $removeXmlHeader          strip the "<?xml ... ?>" declaration
     * @param int  $options                  libxml options passed to saveXML()
     *
     * @return string empty string if serialization fails
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim($withoutHeader);
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
        }

        $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
949

950
    /**
     * Allow the parser object to be called like a function; delegates to find().
     *
     * @param string $selector
     * @param int    $idx
     *
     * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
     */
    public function __invoke($selector, $idx = null)
    {
        $found = $this->find($selector, $idx);

        return $found;
    }
960

961
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithoutHeadWrapper" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }
968

969
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithoutPTagWrapper" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutPTagWrapper;
    }
976

977
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithoutHtml" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }
984

985
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithoutBodyWrapper" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutBodyWrapper;
    }
992

993
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithMultiRoot" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithMultiRoot(): bool
    {
        return $this->isDOMDocumentCreatedWithMultiRoot;
    }
1000

1001
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithoutHtmlWrapper" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }
1008

1009
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithoutWrapper" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }
1016

1017
    /**
     * Read accessor for the internal "isDOMDocumentCreatedWithFakeEndScript" flag.
     *
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
    {
        return $this->isDOMDocumentCreatedWithFakeEndScript;
    }
1024

1025
    /**
     * Swap unbalanced ("broken") closing tags for hash tokens before parsing.
     *
     * Pass 1 repeatedly masks the angle brackets of every properly paired
     * <tag ...>...</tag> with placeholder tokens. Pass 2 then treats the runs of
     * closing tags that are still written with real angle brackets as broken:
     * each run is recorded in self::$domBrokenReplaceHelper and replaced by a
     * token built from self::$domHtmlBrokenHtmlHelper and crc32() of the run.
     * Finally the placeholders are turned back into real brackets.
     *
     * If PCRE reports an error (preg_replace_callback() returns null), the
     * markup seen so far is kept instead of being collapsed to an empty string.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        $placeholders = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
        $brackets = ['</', '<', '>'];

        // Pass 1: mask one balanced element per iteration until nothing changes.
        do {
            $original = $html;

            $masked = \preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                static function ($matches) {
                    return $matches['start'] .
                        '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                        $matches['value'] .
                        '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                        $matches['end'];
                },
                $html
            );

            if ($masked === null) {
                // PCRE error: balanced elements could not be identified reliably,
                // so undo the masking done so far and leave the markup untouched.
                return \str_replace($placeholders, $brackets, $html);
            }

            $html = $masked;
        } while ($original !== $html);

        // Pass 2: replace the remaining (unmasked) closing-tag runs by hash tokens.
        do {
            $original = $html;

            $replaced = \preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:<\/\w+(?:\s+\w+=\"[^"]+\")*+[^<]+>)+)(?<end>.*)/u',
                static function ($matches) use ($placeholders, $brackets) {
                    // The broken run may contain masked elements; store it with real brackets.
                    $matches['broken'] = \str_replace($placeholders, $brackets, $matches['broken']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

            if ($replaced === null) {
                // PCRE error: keep what has been processed so far.
                break;
            }

            $html = $replaced;
        } while ($original !== $html);

        return \str_replace($placeholders, $brackets, $html);
    }
1075

1076
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Replaces the inline "<svg ...>...</svg>" part of a parenthesized
     * "data:image/svg..." payload with a hash token and records the original
     * markup in self::$domBrokenReplaceHelper. $html is modified in place and
     * is left unchanged when the regular expression fails.
     *
     * @param string $html
     *
     * @return void
     */
    protected function keepSpecialSvgTags(string &$html)
    {
        // regEx for e.g.: [mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg">...</svg>')]
        /** @noinspection HtmlDeprecatedTag */
        $replaced = \preg_replace_callback(
            '/\((["\'])?(?<start>data:image\/svg.*)<svg(?<attr>[^>]*?)>(?<content>.*)<\/svg>\1\)/isU',
            static function ($match) {
                // Nothing worth protecting: hand the match back untouched.
                if (empty($match['content'])) {
                    return $match[0];
                }

                $svgMarkup = '<svg' . $match['attr'] . '>' . $match['content'] . '</svg>';
                $token = self::$domHtmlBrokenHtmlHelper . \crc32($svgMarkup);

                self::$domBrokenReplaceHelper['orig'][] = $svgMarkup;
                self::$domBrokenReplaceHelper['tmp'][] = $token;

                // $match[1] is the optional quote character captured after "(".
                return '(' . $match[1] . $match['start'] . $token . $match[1] . ')';
            },
            $html
        );

        if ($replaced === null) {
            return;
        }

        $html = $replaced;
    }
1108

1109
    /**
     * Protect "<script type=...>" blocks whose type is listed in
     * $this->specialScriptTags (e.g. html templates inside script tags).
     *
     * When the script content contains one of the markers from
     * $this->templateLogicSyntaxInSpecialScriptTags, the content is replaced by
     * a hash token and stored in self::$domBrokenReplaceHelper. Otherwise the
     * script element is renamed to the self::$domHtmlSpecialScriptHelper tag.
     *
     * $html is modified in place; it is left unchanged when PCRE reports an
     * error (preg_replace_callback() returns null).
     *
     * @param string $html
     *
     * @return void
     */
    protected function keepSpecialScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
        $tags = \implode('|', \array_map(
            static function ($value) {
                return \preg_quote($value, '/');
            },
            $this->specialScriptTags
        ));

        $result = \preg_replace_callback(
            '/(?<start>(<script [^>]*type=["\']?(?:' . $tags . ')+[^>]*>))(?<innerContent>.*)(?<end><\/script>)/isU',
            function ($matches) {
                // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
                // because often this looks like non-valid html in the template itself.
                foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                    if (\strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) !== false) {
                        // remove the html5 fallback
                        $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                        self::$domBrokenReplaceHelper['orig'][] = $matches['innerContent'];
                        self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);

                        return $matches['start'] . $matchesHash . $matches['end'];
                    }
                }

                // remove the html5 fallback
                $matches[0] = \str_replace('<\/', '</', $matches[0]);

                // Rename the opening "<script" and the closing "</script>" to the helper tag.
                $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

                return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
            },
            $html
        );

        // A null result signals a PCRE error; casting it to string would wipe the document.
        if ($result !== null) {
            $html = $result;
        }
    }
1150

1151
    /**
     * Store the "keep broken html" switch on this parser (fluent setter).
     *
     * @param bool $keepBrokenHtml
     *
     * @return $this
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }
1162

1163
    /**
     * Replace the list of template-logic markers that is consulted for
     * special script tags.
     *
     * @param string[] $templateLogicSyntaxInSpecialScriptTags
     *
     * @throws \InvalidArgumentException if any entry is not a string
     *
     * @return $this
     */
    public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
    {
        // Validate every entry before anything is stored.
        foreach ($templateLogicSyntaxInSpecialScriptTags as $syntax) {
            if (\is_string($syntax)) {
                continue;
            }

            throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
        }

        $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

        return $this;
    }
1180

1181
    /**
     * Replace the list of script "type" values that are treated as special
     * script tags.
     *
     * @param string[] $specialScriptTags
     *
     * @throws \InvalidArgumentException if any entry is not a string
     *
     * @return $this
     */
    public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
    {
        // Validate every entry before anything is stored.
        foreach ($specialScriptTags as $scriptType) {
            if (\is_string($scriptType)) {
                continue;
            }

            throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
        }

        $this->specialScriptTags = $specialScriptTags;

        return $this;
    }
1198

1199
    /**
     * Register the callback stored as "callbackXPathBeforeQuery" (fluent setter).
     *
     * @param callable $callbackXPathBeforeQuery
     *
     * @phpstan-param callable(string $cssSelectorString, string $xPathString,\DOMXPath,\voku\helper\HtmlDomParser): string $callbackXPathBeforeQuery
     *
     * @return $this
     */
    public function setCallbackXPathBeforeQuery(callable $callbackXPathBeforeQuery): self
    {
        $this->callbackXPathBeforeQuery = $callbackXPathBeforeQuery;

        return $this;
    }
1212

1213
    /**
     * Register the callback stored as "callbackBeforeCreateDom" (fluent setter).
     *
     * @param callable $callbackBeforeCreateDom
     *
     * @phpstan-param callable(string $htmlString, \voku\helper\HtmlDomParser): string $callbackBeforeCreateDom
     *
     * @return $this
     */
    public function setCallbackBeforeCreateDom(callable $callbackBeforeCreateDom): self
    {
        $this->callbackBeforeCreateDom = $callbackBeforeCreateDom;

        return $this;
    }
1226
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc