• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

voku / simple_html_dom / 24291591710

11 Apr 2026 09:05PM UTC coverage: 69.094%. Remained the same
24291591710

push

github

web-flow
Merge pull request #118 from devteam-emroc/master

Symfony 8.0 support

1243 of 1799 relevant lines covered (69.09%)

248.2 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.82
/src/voku/helper/AbstractDomParser.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace voku\helper;
6

7
abstract class AbstractDomParser implements DomParserInterface
8
{
9
    /**
10
     * @var string
11
     */
12
    // Keep this helper tag non-hyphenated: older libxml HTML serializers treat
13
    // unknown hyphenated elements as block-level and inject formatting newlines.
14
    protected static $domHtmlWrapperHelper = 'simplevokuwrapper';
15

16
    /**
17
     * @var string
18
     */
19
    protected static $domHtmlBrokenHtmlHelper = 'simplevokubroken';
20

21
    /**
22
     * @var string
23
     */
24
    protected static $domHtmlSpecialScriptHelper = 'simplevokuspecialscript';
25

26
    /**
27
     * @var array
28
     */
29
    protected static $domBrokenReplaceHelper = [];
30

31
    /**
32
     * @var string[][]
33
     */
34
    protected static $domLinkReplaceHelper = [
35
        'orig' => ['[', ']', '{', '}'],
36
        'tmp'  => [
37
            'SHDOM_SQUARE_BRACKET_LEFT',
38
            'SHDOM_SQUARE_BRACKET_RIGHT',
39
            'SHDOM_BRACKET_LEFT',
40
            'SHDOM_BRACKET_RIGHT',
41
        ],
42
    ];
43

44
    /**
45
     * @var string[][]
46
     */
47
    protected static $domReplaceHelper = [
48
        'orig' => ['&', '|', '+', '%', '@', '<html ⚡'],
49
        'tmp'  => [
50
            'SHDOM_AMP',
51
            'SHDOM_PIPE',
52
            'SHDOM_PLUS',
53
            'SHDOM_PERCENT',
54
            'SHDOM_AT',
55
            '<html SHDOM_GOOGLE_AMP="true"',
56
        ],
57
    ];
58

59
    /**
60
     * @var callable|null
61
     *
62
     * @phpstan-var null|callable(\voku\helper\XmlDomParser|\voku\helper\HtmlDomParser): void
63
     */
64
    protected static $callback;
65

66
    /**
67
     * @var string[]
68
     */
69
    protected static $functionAliases = [];
70

71
    /**
72
     * @var string[]
73
     */
74
    protected $dynamicDomBrokenReplaceHelperKeys = [];
75

76
    /**
     * Drop every placeholder mapping that this parser instance registered
     * from the shared (static) replacement table, then forget the instance's
     * own list of registered tokens.
     *
     * @return void
     */
    protected function resetDynamicDomHelpers()
    {
        $ownTokens = $this->dynamicDomBrokenReplaceHelperKeys;
        if (empty($ownTokens)) {
            return;
        }

        // Walk a snapshot of the token column once; matching entries are
        // removed pairwise from the 'tmp' and 'orig' columns of the shared table.
        foreach (self::$domBrokenReplaceHelper['tmp'] ?? [] as $index => $registeredToken) {
            if (\in_array($registeredToken, $ownTokens, true)) {
                unset(self::$domBrokenReplaceHelper['tmp'][$index], self::$domBrokenReplaceHelper['orig'][$index]);
            }
        }

        if (empty(self::$domBrokenReplaceHelper['tmp'])) {
            // Nothing left: reset the table to its initial empty state.
            self::$domBrokenReplaceHelper = [];
        } else {
            // Re-index both columns so they remain parallel lists.
            self::$domBrokenReplaceHelper['tmp'] = \array_values(self::$domBrokenReplaceHelper['tmp']);
            self::$domBrokenReplaceHelper['orig'] = \array_values(self::$domBrokenReplaceHelper['orig']);
        }

        $this->dynamicDomBrokenReplaceHelperKeys = [];
    }
2✔
103

104
    /**
     * Register a placeholder for a piece of original markup in the shared
     * replacement table and remember the placeholder on this instance.
     *
     * @param string $original the text the placeholder stands for
     * @param string $token    the placeholder token
     *
     * @return void
     */
    protected function registerDynamicDomBrokenReplaceHelper(string $original, string $token)
    {
        // 'orig' and 'tmp' are parallel lists, so each gets exactly one new entry.
        self::$domBrokenReplaceHelper['orig'][] = $original;
        self::$domBrokenReplaceHelper['tmp'][] = $token;

        // Tracked per instance so resetDynamicDomHelpers() can remove it again.
        $this->dynamicDomBrokenReplaceHelperKeys[] = $token;
    }
30✔
116

117
    /**
118
     * @var \DOMDocument
119
     */
120
    protected $document;
121

122
    /**
123
     * @var string
124
     */
125
    protected $encoding = 'UTF-8';
126

127
    /**
     * Forward calls to unknown instance methods to their registered alias.
     *
     * The method name is lower-cased before it is looked up in
     * self::$functionAliases.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if no alias is registered for the name
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $alias = \strtolower($name);

        if (!isset(self::$functionAliases[$alias])) {
            throw new \BadMethodCallException('Method does not exist: ' . $alias);
        }

        return \call_user_func_array([$this, self::$functionAliases[$alias]], $arguments);
    }
143

144
    /**
145
     * @param string $name
146
     * @param array  $arguments
147
     *
148
     * @throws \BadMethodCallException
149
     * @throws \RuntimeException
150
     *
151
     * @return static
152
     */
153
    abstract public static function __callStatic($name, $arguments);
154

155
    /**
     * Clone the underlying DOMDocument as well, instead of sharing it
     * between the original parser and its clone.
     */
    public function __clone()
    {
        $this->document = clone $this->document;
    }
159

160
    /**
161
     * @param string $name
162
     *
163
     * @return string|null
164
     */
165
    abstract public function __get($name);
166

167
    /**
168
     * @return string
169
     */
170
    abstract public function __toString();
171

172
    /**
     * No-op that always reports success; kept only for API compatibility.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        return true;
    }
183

184
    /**
185
     * Create DOMDocument from HTML.
186
     *
187
     * @param string   $html
188
     * @param int|null $libXMLExtraOptions
189
     *
190
     * @return \DOMDocument
191
     */
192
    abstract protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument;
193

194
    /**
     * Decode HTML entities and URL-encoding in the given content.
     *
     * If the class \voku\helper\UTF8 is available the work is delegated to
     * UTF8::rawurldecode(); otherwise html_entity_decode() + rawurldecode()
     * are used directly.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity when true, decode repeatedly until the content no longer changes
     *
     * @return string
     */
    protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
    {
        if (\class_exists('\voku\helper\UTF8')) {
            return UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
        }

        if (!$multiDecodeNewHtmlEntity) {
            // single pass
            return \rawurldecode(
                \html_entity_decode(
                    $content,
                    \ENT_QUOTES | \ENT_HTML5
                )
            );
        }

        // repeat until a pass leaves the content untouched
        do {
            $previous = $content;

            $content = \rawurldecode(
                \html_entity_decode(
                    $content,
                    \ENT_QUOTES | \ENT_HTML5
                )
            );
        } while ($previous !== $content);

        return $content;
    }
233

234
    /**
235
     * Find list of nodes with a CSS selector.
236
     *
237
     * @param string   $selector
238
     * @param int|null $idx
239
     *
240
     * @return mixed
241
     */
242
    abstract public function find(string $selector, $idx = null);
243

244
    /**
245
     * Find nodes with a CSS selector.
246
     *
247
     * @param string $selector
248
     *
249
     * @return mixed
250
     */
251
    abstract public function findMulti(string $selector);
252

253
    /**
254
     * Find nodes with a CSS selector or false, if no element is found.
255
     *
256
     * @param string $selector
257
     *
258
     * @return mixed
259
     */
260
    abstract public function findMultiOrFalse(string $selector);
261

262
    /**
263
     * Find one node with a CSS selector.
264
     *
265
     * @param string $selector
266
     *
267
     * @return mixed
268
     */
269
    abstract public function findOne(string $selector);
270

271
    /**
272
     * Find one node with a CSS selector or false, if no element is found.
273
     *
274
     * @param string $selector
275
     *
276
     * @return mixed
277
     */
278
    abstract public function findOneOrFalse(string $selector);
279

280
    /**
     * Get the DOMDocument this parser operates on.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        return $this->document;
    }
287

288
    /**
289
     * Get dom node's outer html.
290
     *
291
     * @param bool $multiDecodeNewHtmlEntity
292
     * @param bool $putBrokenReplacedBack
293
     *
294
     * @return string
295
     */
296
    abstract public function html(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string;
297

298
    /**
     * Get the inner html of the document element, i.e. the HTML
     * serialization of each of its child nodes, concatenated.
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $putBrokenReplacedBack
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        $markup = '';
        $root = $this->document->documentElement;

        if ($root) {
            foreach ($root->childNodes as $child) {
                $markup .= $this->document->saveHTML($child);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
    }
319

320
    /**
     * Get the inner xml of the document element, i.e. the XML
     * serialization of each of its child nodes, concatenated.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerXml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $markup = '';
        $root = $this->document->documentElement;

        if ($root) {
            foreach ($root->childNodes as $child) {
                $markup .= $this->document->saveXML($child);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
340

341
    /**
342
     * Load HTML from string.
343
     *
344
     * @param string   $html
345
     * @param int|null $libXMLExtraOptions
346
     *
347
     * @return DomParserInterface
348
     */
349
    abstract public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface;
350

351
    /**
352
     * Load HTML from file.
353
     *
354
     * @param string   $filePath
355
     * @param int|null $libXMLExtraOptions
356
     *
357
     * @throws \RuntimeException
358
     *
359
     * @return DomParserInterface
360
     */
361
    abstract public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface;
362

363
    /**
     * Serialize the dom via html() and optionally write it to a file.
     *
     * The file is written with an exclusive lock (LOCK_EX). The result of
     * the write is not checked; the serialized string is returned either way.
     *
     * @param string $filepath target file, or '' to skip writing
     *
     * @return string the serialized html
     */
    public function save(string $filepath = ''): string
    {
        $serialized = $this->html();

        if ($filepath === '') {
            return $serialized;
        }

        \file_put_contents($filepath, $serialized, \LOCK_EX);

        return $serialized;
    }
379

380
    /**
     * Store a callback in the static $callback property (late static binding).
     *
     * @param callable $functionName
     *
     * @phpstan-param callable(\voku\helper\XmlDomParser|\voku\helper\HtmlDomParser): void $functionName
     *
     * @return void
     */
    public function set_callback($functionName)
    {
        static::$callback = $functionName;
    }
391

392
    /**
     * Get the plain text (textContent) of the document, passed through
     * fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
403

404
    /**
     * Serialize the document as XML.
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $htmlToXml        when true the result is passed through fixHtmlOutput();
     *                               otherwise entities are decoded and the preserved placeholders put back
     * @param bool $removeXmlHeader  when true "<?xml ... ?>" is stripped and leading whitespace trimmed
     * @param int  $options          libxml options handed to DOMDocument::saveXML()
     *
     * @return string the XML, or '' if serialization failed
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            // NOTE(review): the pattern is greedy and not anchored, so it strips any
            // "<?xml ... ?>" span on a line, not only the leading declaration.
            $serialized = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $serialized));
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
        }

        $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

        return self::putReplacedBackToPreserveHtmlEntities($decoded);
    }
439

440
    /**
     * Get the encoding configured on this parser instance.
     *
     * @return string
     */
    protected function getEncoding(): string
    {
        return $this->encoding;
    }
449

450
    /**
451
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
452
     *
453
     * @param string $html
454
     *
455
     * @return void
456
     */
457
    protected function html5FallbackForScriptTags(string &$html)
458
    {
459
        // Normalize self-closing <script ... /> to <script ...></script> so
460
        // that the regex below does not treat the self-closing form as an
461
        // opening tag whose "content" extends to the next </script>.
462
        $html = (string) \preg_replace('/<script([^>]*)\/>/', '<script$1></script>', $html);
215✔
463

464
        // regEx for e.g.: [<script id="elements-image-2">...<script>]
465
        /** @noinspection HtmlDeprecatedTag */
466
        $regExSpecialScript = '/<script(?<attr>[^>]*?)>(?<content>.*)<\/script>/isU';
215✔
467

468
        if (\PHP_VERSION_ID < 80000) {
215✔
469
            // On PHP < 8.0, older libxml's HTML parser can mishandle <\/ inside
470
            // <script> content, causing content after the sequence to leak outside
471
            // the element. Use a placeholder to protect any script content that
472
            // contains literal < characters so that loadHTML() receives safe input.
473
            $htmlTmp = \preg_replace_callback(
95✔
474
                $regExSpecialScript,
95✔
475
                function ($scripts) {
47✔
476
                    if (empty($scripts['content'])) {
87✔
477
                        return $scripts[0];
43✔
478
                    }
479

480
                    // Revert any existing <\/ escaping to check for bare < chars.
481
                    $contentReverted = \str_replace('<\/', '</', $scripts['content']);
71✔
482

483
                    if (\strpos($contentReverted, '<') === false) {
71✔
484
                        return $scripts[0];
47✔
485
                    }
486

487
                    // Apply the same </ → <\/ escaping that PHP 8+ applies so that
488
                    // when the placeholder is restored the output matches PHP 8+
489
                    // behaviour.  Any <\/ already present is left untouched because
490
                    // str_replace('</', ...) only matches the two-char sequence
491
                    // '<' + '/' and '<\/' has '\' in between.
492
                    $storedContent = \str_replace('</', '<\/', $scripts['content']);
40✔
493
                    $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($storedContent);
40✔
494
                    $this->registerDynamicDomBrokenReplaceHelper($storedContent, $matchesHash);
40✔
495

496
                    return '<script' . $scripts['attr'] . '>' . $matchesHash . '</script>';
40✔
497
                },
95✔
498
                $html
95✔
499
            );
47✔
500

501
            if ($htmlTmp !== null) {
95✔
502
                $html = $htmlTmp;
95✔
503
            }
504

505
            return;
95✔
506
        }
507

508
        $htmlTmp = \preg_replace_callback(
120✔
509
            $regExSpecialScript,
120✔
510
            static function ($scripts) {
120✔
511
                if (empty($scripts['content'])) {
110✔
512
                    return $scripts[0];
55✔
513
                }
514

515
                return '<script' . $scripts['attr'] . '>' . \str_replace('</', '<\/', $scripts['content']) . '</script>';
90✔
516
            },
120✔
517
            $html
120✔
518
        );
120✔
519

520
        if ($htmlTmp !== null) {
120✔
521
            $html = $htmlTmp;
120✔
522
        }
523
    }
524

525
    /**
     * Replace the temporary placeholders in the given markup with their
     * original text again.
     *
     * The dynamic "broken html" placeholders are restored first (optional),
     * then the static placeholders: link brackets, special characters, and
     * the wrapper / special-script helper tags. All replacements are
     * case-insensitive (str_ireplace).
     *
     * @param string $html
     * @param bool   $putBrokenReplacedBack whether to restore the dynamic "broken html" placeholders as well
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html, bool $putBrokenReplacedBack = true): string
    {
        // Built once per request and reused on every later call.
        static $staticReplaceMap = null;

        if ($staticReplaceMap === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $scriptTag = self::$domHtmlSpecialScriptHelper;

            // key => [placeholder, original]; pairs are applied in this order,
            // complete tags before their "broken" (partial) variants.
            $helperTags = [
                'html_wrapper__start'               => ['<' . $wrapperTag . '>', ''],
                'html_wrapper__end'                 => ['</' . $wrapperTag . '>', ''],
                'html_wrapper__start_broken'        => [$wrapperTag . '>', ''],
                'html_wrapper__end_broken'          => ['</' . $wrapperTag, ''],
                'html_special_script__start'        => ['<' . $scriptTag, '<script'],
                'html_special_script__end'          => ['</' . $scriptTag . '>', '</script>'],
                'html_special_script__start_broken' => [$scriptTag, 'script'],
                'html_special_script__end_broken'   => ['</' . $scriptTag, '</script'],
            ];

            $placeholders = \array_merge(
                self::$domLinkReplaceHelper['tmp'],
                self::$domReplaceHelper['tmp']
            );
            $originals = \array_merge(
                self::$domLinkReplaceHelper['orig'],
                self::$domReplaceHelper['orig']
            );

            foreach ($helperTags as $key => $pair) {
                $placeholders[$key] = $pair[0];
                $originals[$key] = $pair[1];
            }

            $staticReplaceMap = [
                'tmp'  => $placeholders,
                'orig' => $originals,
            ];
        }

        if (
            $putBrokenReplacedBack === true
            &&
            isset(self::$domBrokenReplaceHelper['tmp'])
            &&
            \count(self::$domBrokenReplaceHelper['tmp']) > 0
        ) {
            $html = \str_ireplace(self::$domBrokenReplaceHelper['tmp'], self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_ireplace($staticReplaceMap['tmp'], $staticReplaceMap['orig'], $html);
    }
581

582
    /**
     * Replace characters that should survive parsing unchanged with
     * temporary placeholders.
     *
     * Always replaces the entries of self::$domReplaceHelper. In addition,
     * for every http(s) URL found in the markup, the brackets listed in
     * self::$domLinkReplaceHelper are replaced inside that URL. URLs are
     * detected case-insensitively, including the cheap pre-check, so that
     * e.g. "HTTPS://..." is handled like "https://...".
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // Cheap pre-check before running the URL regex; case-insensitive to
        // match the "i" modifier of the regex below.
        if (\stripos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\(\w+\)|[^[:punct:]\s]|\/|}|]))/i';
            $matches = [];
            \preg_match_all($regExUrl, $html, $matches);

            if (!empty($matches[1])) {
                $linksOld = (array) $matches[1];
                $linksNew = [];
                foreach ($linksOld as $linkKey => $linkOld) {
                    $linksNew[$linkKey] = \str_replace(
                        self::$domLinkReplaceHelper['orig'],
                        self::$domLinkReplaceHelper['tmp'],
                        $linkOld
                    );
                }

                // Links go first so whole URLs are rewritten before the
                // single-character replacements are applied.
                $search = \array_merge($linksOld, $search);
                $replace = \array_merge($linksNew, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
621
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc