• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

voku / simple_html_dom / 24713207173

21 Apr 2026 08:51AM UTC coverage: 96.791% (+0.8%) from 96.034%
24713207173

push

github

voku
[+]: add more tests

2172 of 2244 relevant lines covered (96.79%)

287.34 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.72
/src/voku/helper/AbstractDomParser.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace voku\helper;
6

7
abstract class AbstractDomParser implements DomParserInterface
8
{
9
    /**
10
     * @var string
11
     */
12
    // Keep this helper tag non-hyphenated: older libxml HTML serializers treat
13
    // unknown hyphenated elements as block-level and inject formatting newlines.
14
    protected static $domHtmlWrapperHelper = 'simplevokuwrapper';
15

16
    /**
17
     * @var string
18
     */
19
    protected static $domHtmlBrokenHtmlHelper = 'simplevokubroken';
20

21
    /**
22
     * @var string
23
     */
24
    protected static $domHtmlSpecialScriptHelper = 'simplevokuspecialscript';
25

26
    /**
27
     * @var array<string, array<int, string>>
28
     */
29
    protected static $domBrokenReplaceHelper = [];
30

31
    /**
32
     * @var string[][]
33
     */
34
    protected static $domLinkReplaceHelper = [
35
        'orig' => ['[', ']', '{', '}'],
36
        'tmp'  => [
37
            'SHDOM_SQUARE_BRACKET_LEFT',
38
            'SHDOM_SQUARE_BRACKET_RIGHT',
39
            'SHDOM_BRACKET_LEFT',
40
            'SHDOM_BRACKET_RIGHT',
41
        ],
42
    ];
43

44
    /**
45
     * @var string[][]
46
     */
47
    protected static $domReplaceHelper = [
48
        'orig' => ['&', '|', '+', '%', '@', '<html ⚡'],
49
        'tmp'  => [
50
            'SHDOM_AMP',
51
            'SHDOM_PIPE',
52
            'SHDOM_PLUS',
53
            'SHDOM_PERCENT',
54
            'SHDOM_AT',
55
            '<html SHDOM_GOOGLE_AMP="true"',
56
        ],
57
    ];
58

59
    /**
60
     * @var callable|null
61
     *
62
     * @phpstan-var null|callable(array{0: \voku\helper\XmlDomParser|\voku\helper\HtmlDomParser}): void
63
     */
64
    protected static $callback;
65

66
    /**
67
     * @var string[]
68
     */
69
    protected static $functionAliases = [];
70

71
    /**
72
     * @var string[]
73
     */
74
    protected $dynamicDomBrokenReplaceHelperKeys = [];
75

76
    /**
     * Drop the placeholder mappings registered by this parser instance from
     * the shared (static) broken-HTML replacement table, so the instance can
     * be re-parsed without leaving stale entries behind.
     *
     * Entries registered by other instances are kept; the remaining entries
     * are re-indexed so that "tmp" and "orig" stay positionally aligned.
     *
     * @return void
     */
    protected function resetDynamicDomHelpers()
    {
        $ownTokens = $this->dynamicDomBrokenReplaceHelperKeys;
        if (empty($ownTokens)) {
            // Nothing was registered by this instance.
            return;
        }

        // Snapshot of the shared placeholders, used only to look up positions.
        $placeholders = self::$domBrokenReplaceHelper['tmp'] ?? [];

        foreach ($ownTokens as $ownToken) {
            $positions = \array_keys($placeholders, $ownToken, true);

            foreach ($positions as $position) {
                // Remove the placeholder together with the original it stands for.
                unset(
                    self::$domBrokenReplaceHelper['tmp'][$position],
                    self::$domBrokenReplaceHelper['orig'][$position]
                );
            }
        }

        if (!empty(self::$domBrokenReplaceHelper['tmp'])) {
            // Close the gaps left by unset() in both parallel lists.
            self::$domBrokenReplaceHelper['tmp'] = \array_values(self::$domBrokenReplaceHelper['tmp']);
            self::$domBrokenReplaceHelper['orig'] = \array_values(self::$domBrokenReplaceHelper['orig']);
        } else {
            self::$domBrokenReplaceHelper = [];
        }

        $this->dynamicDomBrokenReplaceHelperKeys = [];
    }
103

104
    /**
     * Register a placeholder for a piece of original markup.
     *
     * The pair is appended to the shared (static) replacement table and the
     * token is remembered on this instance, so that resetDynamicDomHelpers()
     * can remove it again later.
     *
     * @param string $original the markup the placeholder stands for
     * @param string $token    the placeholder inserted in its place
     *
     * @return void
     */
    protected function registerDynamicDomBrokenReplaceHelper(string $original, string $token)
    {
        // Remember which token belongs to this instance ...
        $this->dynamicDomBrokenReplaceHelperKeys[] = $token;

        // ... and publish the pair in the shared, positionally aligned lists.
        self::$domBrokenReplaceHelper['tmp'][] = $token;
        self::$domBrokenReplaceHelper['orig'][] = $original;
    }
116

117
    /**
118
     * @var \DOMDocument
119
     */
120
    protected $document;
121

122
    /**
123
     * @var string
124
     */
125
    protected $encoding = 'UTF-8';
126

127
    /**
     * Forward calls to unknown instance methods to their registered alias.
     *
     * The method name is lower-cased before it is looked up in the alias map.
     *
     * @param string       $name
     * @param array<mixed> $arguments
     *
     * @throws \BadMethodCallException if no alias is registered for the name
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        $target = self::$functionAliases[$alias];

        return $this->{$target}(...$arguments);
    }
145

146
    /**
     * Handle calls to unknown static methods; implemented by the concrete parser.
     *
     * @param string       $name
     * @param array<mixed> $arguments
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     *
     * @return static
     */
    abstract public static function __callStatic($name, $arguments);
156

157
    /**
     * Give the cloned parser its own copy of the underlying DOMDocument
     * instead of sharing the document object with the source parser.
     *
     * @return void
     */
    public function __clone()
    {
        $documentCopy = clone $this->document;

        $this->document = $documentCopy;
    }
161

162
    /**
     * Magic property access; implemented by the concrete parser.
     *
     * @param string $name
     *
     * @return string|null
     */
    abstract public function __get($name);
168

169
    /**
     * String representation of the parser; implemented by the concrete parser.
     *
     * @return string
     */
    abstract public function __toString();
173

174
    /**
     * No-op that always reports success; it exists only for
     * api-compatibility-reasons.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // Intentionally does nothing.
        return true;
    }
185

186
    /**
     * Create DOMDocument from HTML; implemented by the concrete parser.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return \DOMDocument
     */
    abstract protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument;
195

196
    /**
     * Decode HTML entities and raw-url-encoded sequences in the given content.
     *
     * When the \voku\helper\UTF8 class is available, decoding is delegated to
     * UTF8::rawurldecode(). Otherwise html_entity_decode() + rawurldecode()
     * are applied once, or repeatedly until the content no longer changes if
     * $multiDecodeNewHtmlEntity is true.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
    {
        if (\class_exists('\voku\helper\UTF8')) {
            return UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
        }

        // Native fallback: always decode once; keep going only in multi-decode
        // mode and only while a pass still changes the content.
        do {
            $previous = $content;

            $decodedEntities = \html_entity_decode($content, \ENT_QUOTES | \ENT_HTML5);
            $content = \rawurldecode($decodedEntities);
        } while ($multiDecodeNewHtmlEntity && $previous !== $content);

        return $content;
    }
235

236
    /**
     * Find list of nodes with a CSS selector.
     *
     * @param string   $selector CSS selector
     * @param int|null $idx
     *
     * @return mixed
     */
    abstract public function find(string $selector, $idx = null);
245

246
    /**
     * Find nodes with a CSS selector.
     *
     * @param string $selector CSS selector
     *
     * @return mixed
     */
    abstract public function findMulti(string $selector);
254

255
    /**
     * Find nodes with a CSS selector, or false if no element is found.
     *
     * @param string $selector CSS selector
     *
     * @return mixed
     */
    abstract public function findMultiOrFalse(string $selector);
263

264
    /**
     * Find nodes with a CSS selector, or null if no element is found.
     *
     * @param string $selector CSS selector
     *
     * @return mixed
     */
    abstract public function findMultiOrNull(string $selector);
272

273
    /**
     * Find one node with a CSS selector.
     *
     * @param string $selector CSS selector
     *
     * @return mixed
     */
    abstract public function findOne(string $selector);
281

282
    /**
     * Find one node with a CSS selector, or false if no element is found.
     *
     * @param string $selector CSS selector
     *
     * @return mixed
     */
    abstract public function findOneOrFalse(string $selector);
290

291
    /**
     * Find one node with a CSS selector, or null if no element is found.
     *
     * @param string $selector CSS selector
     *
     * @return mixed
     */
    abstract public function findOneOrNull(string $selector);
299

300
    /**
     * Get the DOMDocument this parser operates on.
     *
     * The parser's own document object is returned, not a copy.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        $currentDocument = $this->document;

        return $currentDocument;
    }
307

308
    /**
     * Get dom node's outer html; implemented by the concrete parser.
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $putBrokenReplacedBack
     *
     * @return string
     */
    abstract public function html(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string;
317

318
    /**
     * Get dom node's inner html.
     *
     * Serializes every child of the document element with saveHTML() and
     * passes the concatenated markup through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $putBrokenReplacedBack
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        $markup = '';
        $root = $this->document->documentElement;

        // A document without a root element contributes no markup.
        if ($root) {
            foreach ($root->childNodes as $child) {
                $markup .= $this->document->saveHTML($child);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
    }
339

340
    /**
     * Get dom node's inner xml.
     *
     * Serializes every child of the document element with saveXML() and
     * passes the concatenated markup through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerXml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $markup = '';
        $root = $this->document->documentElement;

        // A document without a root element contributes no markup.
        if ($root) {
            foreach ($root->childNodes as $child) {
                $markup .= $this->document->saveXML($child);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
360

361
    /**
     * Load HTML from string; implemented by the concrete parser.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return DomParserInterface
     */
    abstract public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface;
370

371
    /**
     * Load HTML from file; implemented by the concrete parser.
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     *
     * @throws \RuntimeException
     *
     * @return DomParserInterface
     */
    abstract public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface;
382

383
    /**
     * Save the html-dom as string, optionally writing it to a file as well.
     *
     * The result of the file write is not checked: the rendered html is
     * returned whether or not the write succeeded.
     *
     * @param string $filepath target file; an empty string skips the write
     *
     * @return string the rendered html
     */
    public function save(string $filepath = ''): string
    {
        $markup = $this->html();

        if ($filepath === '') {
            return $markup;
        }

        // Exclusive lock while writing.
        \file_put_contents($filepath, $markup, \LOCK_EX);

        return $markup;
    }
399

400
    /**
     * Store the callback in the static (class-level) callback slot.
     *
     * The slot is written through late static binding (static::), so the
     * value is not tied to this single instance.
     *
     * @param callable $functionName
     *
     * @phpstan-param callable(array{0: \voku\helper\XmlDomParser|\voku\helper\HtmlDomParser}): void $functionName
     *
     * @return void
     */
    public function set_callback($functionName)
    {
        $callback = $functionName;

        static::$callback = $callback;
    }
411

412
    /**
     * Get dom node's plain text.
     *
     * Takes the document's textContent and passes it through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
423

424
    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $htmlToXml                true: post-process the output via fixHtmlOutput();
     *                                       false: only decode entities and restore the
     *                                       placeholder replacements
     * @param bool $removeXmlHeader          strip the leading "<?xml ... ?>" declaration
     * @param int  $options                  libxml options passed to DOMDocument::saveXML()
     *
     * @return string the serialized document, or an empty string if saveXML() fails
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            // Remove only the XML declaration at the very start of the output.
            // An unanchored, greedy pattern would also delete other processing
            // instructions such as "<?xml-stylesheet ...?>" and, on a single
            // line, everything between the first "<?xml" and the last "?>".
            $withoutHeader = \preg_replace('/^\s*<\?xml\s.*?\?>/s', '', $xml, 1);
            if ($withoutHeader !== null) {
                // On a PCRE error keep the document instead of returning ''.
                $xml = $withoutHeader;
            }

            $xml = \ltrim($xml);
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        }

        $xml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($xml);
    }
459

460
    /**
     * Get the encoding to use.
     *
     * @return string the value of the $encoding property
     */
    protected function getEncoding(): string
    {
        $currentEncoding = $this->encoding;

        return $currentEncoding;
    }
469

470
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Rewrites <script> elements in place so that "</" sequences inside the
     * script content do not confuse the HTML parser. If a regular expression
     * step fails (returns null), the input is left as it was for that step.
     *
     * @param string $html the markup to prepare; modified by reference
     *
     * @return void
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // Normalize self-closing <script ... /> to <script ...></script> so
        // that the regex below does not treat the self-closing form as an
        // opening tag whose "content" extends to the next </script>.
        $normalized = \preg_replace('/<script([^>]*)\/>/', '<script$1></script>', $html);
        if ($normalized !== null) {
            // null signals a PCRE error: keep the input instead of blanking
            // it, exactly like the callback step below does.
            $html = $normalized;
        }

        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        /** @noinspection HtmlDeprecatedTag */
        $regExSpecialScript = '/<script(?<attr>[^>]*?)>(?<content>.*)<\/script>/isU';

        if (\PHP_VERSION_ID < 80000) {
            // On PHP < 8.0, older libxml's HTML parser can mishandle <\/ inside
            // <script> content, causing content after the sequence to leak outside
            // the element. Use a placeholder to protect any script content that
            // contains literal < characters so that loadHTML() receives safe input.
            $rewriteScript = function ($scripts) {
                if (empty($scripts['content'])) {
                    return $scripts[0];
                }

                // Revert any existing <\/ escaping to check for bare < chars.
                $contentReverted = \str_replace('<\/', '</', $scripts['content']);

                if (\strpos($contentReverted, '<') === false) {
                    return $scripts[0];
                }

                // Apply the same </ -> <\/ escaping that PHP 8+ applies so that
                // when the placeholder is restored the output matches PHP 8+
                // behaviour. Any <\/ already present is left untouched because
                // str_replace('</', ...) only matches the two-char sequence
                // '<' + '/' and '<\/' has '\' in between.
                $storedContent = \str_replace('</', '<\/', $scripts['content']);
                $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($storedContent);
                $this->registerDynamicDomBrokenReplaceHelper($storedContent, $matchesHash);

                return '<script' . $scripts['attr'] . '>' . $matchesHash . '</script>';
            };
        } else {
            // PHP 8+: escaping "</" inside the script content is sufficient.
            $rewriteScript = static function ($scripts) {
                if (empty($scripts['content'])) {
                    return $scripts[0];
                }

                return '<script' . $scripts['attr'] . '>' . \str_replace('</', '<\/', $scripts['content']) . '</script>';
            };
        }

        $htmlTmp = \preg_replace_callback($regExSpecialScript, $rewriteScript, $html);

        if ($htmlTmp !== null) {
            $html = $htmlTmp;
        }
    }
544

545
    /**
     * Replace the internal placeholders in the given markup with the text
     * they stand for (case-insensitive).
     *
     * The dynamically registered "broken html" placeholders are restored
     * first (only if requested and present), then the fixed placeholders:
     * link brackets, special characters, the wrapper helper tag (removed)
     * and the special-script helper tag (turned back into "script").
     *
     * @param string $html
     * @param bool   $putBrokenReplacedBack also restore the dynamically registered placeholders
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html, bool $putBrokenReplacedBack = true): string
    {
        // Built once and reused: the fixed placeholder => original lookup lists.
        static $replaceCache = null;

        if ($replaceCache === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $scriptTag = self::$domHtmlSpecialScriptHelper;

            // Both lists must keep the same order: str_ireplace() pairs the
            // entries by position and applies them one after another.
            $replaceCache = [
                'tmp' => \array_merge(
                    self::$domLinkReplaceHelper['tmp'],
                    self::$domReplaceHelper['tmp'],
                    [
                        'html_wrapper__start'               => '<' . $wrapperTag . '>',
                        'html_wrapper__end'                 => '</' . $wrapperTag . '>',
                        'html_wrapper__start_broken'        => $wrapperTag . '>',
                        'html_wrapper__end_broken'          => '</' . $wrapperTag,
                        'html_special_script__start'        => '<' . $scriptTag,
                        'html_special_script__end'          => '</' . $scriptTag . '>',
                        'html_special_script__start_broken' => $scriptTag,
                        'html_special_script__end_broken'   => '</' . $scriptTag,
                    ]
                ),
                'orig' => \array_merge(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domReplaceHelper['orig'],
                    [
                        'html_wrapper__start'               => '',
                        'html_wrapper__end'                 => '',
                        'html_wrapper__start_broken'        => '',
                        'html_wrapper__end_broken'          => '',
                        'html_special_script__start'        => '<script',
                        'html_special_script__end'          => '</script>',
                        'html_special_script__start_broken' => 'script',
                        'html_special_script__end_broken'   => '</script',
                    ]
                ),
            ];
        }

        $hasBrokenPlaceholders = isset(self::$domBrokenReplaceHelper['tmp'])
            && \count(self::$domBrokenReplaceHelper['tmp']) > 0;

        if ($putBrokenReplacedBack === true && $hasBrokenPlaceholders) {
            $html = \str_ireplace(
                self::$domBrokenReplaceHelper['tmp'],
                self::$domBrokenReplaceHelper['orig'],
                $html
            );
        }

        return \str_ireplace($replaceCache['tmp'], $replaceCache['orig'], $html);
    }
601

602
    /**
     * Replace characters that must survive parsing unchanged with placeholders.
     *
     * The special characters listed in $domReplaceHelper are always replaced.
     * If the markup contains http(s) URLs, the square and curly brackets
     * inside each detected URL are replaced with their placeholders as well.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // Cheap pre-check before running the URL regex.
        if (\strpos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\(\w+\)|[^[:punct:]\s]|\/|}|]))/i';

            $matches = [];
            \preg_match_all($regExUrl, $html, $matches);

            $urls = $matches[1] ?? [];
            if (!empty($urls)) {
                // Mask the brackets in every URL at once; keys are preserved,
                // so $maskedUrls lines up with $urls entry by entry.
                $maskedUrls = \str_replace(
                    self::$domLinkReplaceHelper['orig'],
                    self::$domLinkReplaceHelper['tmp'],
                    $urls
                );

                // URL replacements run first, then the special characters.
                $search = \array_merge($urls, $search);
                $replace = \array_merge($maskedUrls, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
641
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc