• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

voku / simple_html_dom / 24632839975

19 Apr 2026 03:42PM UTC coverage: 77.11% (+6.3%) from 70.769%
24632839975

push

github

web-flow
Merge pull request #135 from voku/copilot/fix-html-parsing-newline-issue

Preserve node HTML formatting when serializing nested elements

4 of 24 new or added lines in 1 file covered. (16.67%)

51 existing lines in 6 files now uncovered.

1654 of 2145 relevant lines covered (77.11%)

262.05 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.0
/src/voku/helper/AbstractDomParser.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace voku\helper;
6

7
abstract class AbstractDomParser implements DomParserInterface
8
{
9
    /**
10
     * @var string
11
     */
12
    // Keep this helper tag non-hyphenated: older libxml HTML serializers treat
13
    // unknown hyphenated elements as block-level and inject formatting newlines.
14
    protected static $domHtmlWrapperHelper = 'simplevokuwrapper';
15

16
    /**
17
     * @var string
18
     */
19
    protected static $domHtmlBrokenHtmlHelper = 'simplevokubroken';
20

21
    /**
22
     * @var string
23
     */
24
    protected static $domHtmlSpecialScriptHelper = 'simplevokuspecialscript';
25

26
    /**
27
     * @var array
28
     */
29
    protected static $domBrokenReplaceHelper = [];
30

31
    /**
32
     * @var string[][]
33
     */
34
    protected static $domLinkReplaceHelper = [
35
        'orig' => ['[', ']', '{', '}'],
36
        'tmp'  => [
37
            'SHDOM_SQUARE_BRACKET_LEFT',
38
            'SHDOM_SQUARE_BRACKET_RIGHT',
39
            'SHDOM_BRACKET_LEFT',
40
            'SHDOM_BRACKET_RIGHT',
41
        ],
42
    ];
43

44
    /**
45
     * @var string[][]
46
     */
47
    protected static $domReplaceHelper = [
48
        'orig' => ['&', '|', '+', '%', '@', '<html ⚡'],
49
        'tmp'  => [
50
            'SHDOM_AMP',
51
            'SHDOM_PIPE',
52
            'SHDOM_PLUS',
53
            'SHDOM_PERCENT',
54
            'SHDOM_AT',
55
            '<html SHDOM_GOOGLE_AMP="true"',
56
        ],
57
    ];
58

59
    /**
60
     * @var callable|null
61
     *
62
     * @phpstan-var null|callable(\voku\helper\XmlDomParser|\voku\helper\HtmlDomParser): void
63
     */
64
    protected static $callback;
65

66
    /**
67
     * @var string[]
68
     */
69
    protected static $functionAliases = [];
70

71
    /**
72
     * @var string[]
73
     */
74
    protected $dynamicDomBrokenReplaceHelperKeys = [];
75

76
    /**
     * Drop every placeholder mapping registered by this parser instance from
     * the class-wide broken-HTML replacement table and forget its tokens.
     *
     * Returns immediately when this instance has no registered tokens.
     *
     * @return void
     */
    protected function resetDynamicDomHelpers()
    {
        if (empty($this->dynamicDomBrokenReplaceHelperKeys)) {
            return;
        }

        foreach ($this->dynamicDomBrokenReplaceHelperKeys as $placeholder) {
            // strict search: only entries that are exactly this token
            $positions = \array_keys(self::$domBrokenReplaceHelper['tmp'] ?? [], $placeholder, true);

            foreach ($positions as $position) {
                // 'tmp' and 'orig' are parallel lists, so drop the same index in both
                unset(
                    self::$domBrokenReplaceHelper['tmp'][$position],
                    self::$domBrokenReplaceHelper['orig'][$position]
                );
            }
        }

        $this->dynamicDomBrokenReplaceHelperKeys = [];

        if (!empty(self::$domBrokenReplaceHelper['tmp'])) {
            // close the gaps left by unset() in both lists
            self::$domBrokenReplaceHelper['tmp'] = \array_values(self::$domBrokenReplaceHelper['tmp']);
            self::$domBrokenReplaceHelper['orig'] = \array_values(self::$domBrokenReplaceHelper['orig']);

            return;
        }

        // nothing left: reset the table to its initial empty state
        self::$domBrokenReplaceHelper = [];
    }
103

104
    /**
     * Add one "original => placeholder token" pair to the class-wide
     * broken-HTML replacement table and remember the token on this instance,
     * so that resetDynamicDomHelpers() can find it again.
     *
     * @param string $original the content the token stands for
     * @param string $token    the placeholder token
     *
     * @return void
     */
    protected function registerDynamicDomBrokenReplaceHelper(string $original, string $token)
    {
        $this->dynamicDomBrokenReplaceHelperKeys[] = $token;

        self::$domBrokenReplaceHelper['tmp'][] = $token;
        self::$domBrokenReplaceHelper['orig'][] = $original;
    }
116

117
    /**
118
     * @var \DOMDocument
119
     */
120
    protected $document;
121

122
    /**
123
     * @var string
124
     */
125
    protected $encoding = 'UTF-8';
126

127
    /**
128
     * @param string $name
129
     * @param array  $arguments
130
     *
131
     * @return bool|mixed
132
     */
133
    public function __call($name, $arguments)
134
    {
135
        $name = \strtolower($name);
×
136

137
        if (isset(self::$functionAliases[$name])) {
×
138
            return \call_user_func_array([$this, self::$functionAliases[$name]], $arguments);
×
139
        }
140

141
        throw new \BadMethodCallException('Method does not exist: ' . $name);
×
142
    }
143

144
    /**
145
     * @param string $name
146
     * @param array  $arguments
147
     *
148
     * @throws \BadMethodCallException
149
     * @throws \RuntimeException
150
     *
151
     * @return static
152
     */
153
    abstract public static function __callStatic($name, $arguments);
154

155
    /**
     * Replace the document held by the cloned parser with a clone of it, so
     * the copy does not keep referencing the same \DOMDocument object.
     *
     * @return void
     */
    public function __clone()
    {
        $documentCopy = clone $this->document;

        $this->document = $documentCopy;
    }
159

160
    /**
161
     * @param string $name
162
     *
163
     * @return string|null
164
     */
165
    abstract public function __get($name);
166

167
    /**
168
     * @return string
169
     */
170
    abstract public function __toString();
171

172
    /**
     * No-op that is only kept for api-compatibility-reasons.
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        // there is nothing to release here
        return true;
    }
183

184
    /**
185
     * Create DOMDocument from HTML.
186
     *
187
     * @param string   $html
188
     * @param int|null $libXMLExtraOptions
189
     *
190
     * @return \DOMDocument
191
     */
192
    abstract protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument;
193

194
    /**
     * Decode HTML entities and percent-encoded sequences in the given content.
     *
     * Delegates to \voku\helper\UTF8::rawurldecode() when that class exists,
     * otherwise falls back to html_entity_decode() + rawurldecode().
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity forwarded to UTF8::rawurldecode(); in the
     *                                         fallback, repeat the decoding until the
     *                                         content no longer changes
     *
     * @return string
     */
    protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
    {
        if (\class_exists('\voku\helper\UTF8')) {
            return UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
        }

        // fallback: one pass of entity decoding followed by url decoding
        $decodeOnce = static function (string $encoded): string {
            return \rawurldecode(
                \html_entity_decode($encoded, \ENT_QUOTES | \ENT_HTML5)
            );
        };

        if (!$multiDecodeNewHtmlEntity) {
            return $decodeOnce($content);
        }

        // keep decoding until a pass leaves the content unchanged
        do {
            $previous = $content;
            $content = $decodeOnce($previous);
        } while ($previous !== $content);

        return $content;
    }
233

234
    /**
235
     * Find list of nodes with a CSS selector.
236
     *
237
     * @param string   $selector
238
     * @param int|null $idx
239
     *
240
     * @return mixed
241
     */
242
    abstract public function find(string $selector, $idx = null);
243

244
    /**
245
     * Find nodes with a CSS selector.
246
     *
247
     * @param string $selector
248
     *
249
     * @return mixed
250
     */
251
    abstract public function findMulti(string $selector);
252

253
    /**
254
     * Find nodes with a CSS selector or false, if no element is found.
255
     *
256
     * @param string $selector
257
     *
258
     * @return mixed
259
     */
260
    abstract public function findMultiOrFalse(string $selector);
261

262
    /**
263
     * Find nodes with a CSS selector or null, if no element is found.
264
     *
265
     * @param string $selector
266
     *
267
     * @return mixed
268
     */
269
    abstract public function findMultiOrNull(string $selector);
270

271
    /**
272
     * Find one node with a CSS selector.
273
     *
274
     * @param string $selector
275
     *
276
     * @return mixed
277
     */
278
    abstract public function findOne(string $selector);
279

280
    /**
281
     * Find one node with a CSS selector or false, if no element is found.
282
     *
283
     * @param string $selector
284
     *
285
     * @return mixed
286
     */
287
    abstract public function findOneOrFalse(string $selector);
288

289
    /**
290
     * Find one node with a CSS selector or null, if no element is found.
291
     *
292
     * @param string $selector
293
     *
294
     * @return mixed
295
     */
296
    abstract public function findOneOrNull(string $selector);
297

298
    /**
     * Get the \DOMDocument instance held by this parser.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        $document = $this->document;

        return $document;
    }
305

306
    /**
307
     * Get dom node's outer html.
308
     *
309
     * @param bool $multiDecodeNewHtmlEntity
310
     * @param bool $putBrokenReplacedBack
311
     *
312
     * @return string
313
     */
314
    abstract public function html(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string;
315

316
    /**
     * Get the inner html: the HTML serialization of every child node of the
     * document element, concatenated and passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput()
     * @param bool $putBrokenReplacedBack    forwarded to fixHtmlOutput()
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false, bool $putBrokenReplacedBack = true): string
    {
        $markup = '';
        $root = $this->document->documentElement;

        // a document without a root element simply yields an empty string
        if ($root) {
            foreach ($root->childNodes as $child) {
                $markup .= $this->document->saveHTML($child);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity, $putBrokenReplacedBack);
    }
337

338
    /**
     * Get the inner xml: the XML serialization of every child node of the
     * document element, concatenated and passed through fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput()
     *
     * @return string
     */
    public function innerXml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $markup = '';
        $root = $this->document->documentElement;

        // a document without a root element simply yields an empty string
        if ($root) {
            foreach ($root->childNodes as $child) {
                $markup .= $this->document->saveXML($child);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }
358

359
    /**
360
     * Load HTML from string.
361
     *
362
     * @param string   $html
363
     * @param int|null $libXMLExtraOptions
364
     *
365
     * @return DomParserInterface
366
     */
367
    abstract public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface;
368

369
    /**
370
     * Load HTML from file.
371
     *
372
     * @param string   $filePath
373
     * @param int|null $libXMLExtraOptions
374
     *
375
     * @throws \RuntimeException
376
     *
377
     * @return DomParserInterface
378
     */
379
    abstract public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface;
380

381
    /**
     * Get the html-dom as string and, if a file path is given, also write
     * that string to the file (using an exclusive lock).
     *
     * @param string $filepath target file; an empty string skips the write
     *
     * @return string the result of html()
     */
    public function save(string $filepath = ''): string
    {
        $markup = $this->html();

        if ($filepath === '') {
            return $markup;
        }

        // NOTE: the result of the write is not checked
        \file_put_contents($filepath, $markup, \LOCK_EX);

        return $markup;
    }
397

398
    /**
     * Store the given callable in the static callback property.
     *
     * @param callable $functionName
     *
     * @phpstan-param callable(\voku\helper\XmlDomParser|\voku\helper\HtmlDomParser): void $functionName
     *
     * @return void
     */
    public function set_callback($functionName)
    {
        $callback = $functionName;

        static::$callback = $callback;
    }
409

410
    /**
     * Get the plain text: the document's text content, passed through
     * fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput()
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        $plainText = $this->document->textContent;

        return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
    }
421

422
    /**
     * Serialize the document as XML.
     *
     * @param bool $multiDecodeNewHtmlEntity forwarded to fixHtmlOutput() / decodeHtmlEntity()
     * @param bool $htmlToXml                true: pass the result through fixHtmlOutput();
     *                                       false: only decode entities and put the
     *                                       replaced placeholders back
     * @param bool $removeXmlHeader          strip "<?xml ... ?>" and leading whitespace
     * @param int  $options                  libxml options for \DOMDocument::saveXML()
     *
     * @return string the XML, or an empty string if the serialization failed
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $serialized = $this->document->saveXML(null, $options);
        if ($serialized === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);
            $serialized = \ltrim($withoutHeader);
        }

        if (!$htmlToXml) {
            $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

            return self::putReplacedBackToPreserveHtmlEntities($decoded);
        }

        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }
457

458
    /**
     * Get the encoding configured on this parser ('UTF-8' unless changed).
     *
     * @return string
     */
    protected function getEncoding(): string
    {
        $encoding = $this->encoding;

        return $encoding;
    }
467

468
    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Rewrites the <script> elements of the given HTML in place:
     * self-closing script tags are expanded to an open/close pair and every
     * "</" inside script content is escaped to "<\/". On PHP < 8.0, script
     * content containing a literal "<" is additionally swapped for a
     * placeholder token that is registered in the broken-HTML replacement
     * table.
     *
     * If a regular expression fails (PCRE error), the affected step is
     * skipped and the HTML is left as it was before that step.
     *
     * @param string $html HTML to prepare; modified by reference
     *
     * @return void
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // Normalize self-closing <script ... /> to <script ...></script> so
        // that the regex below does not treat the self-closing form as an
        // opening tag whose "content" extends to the next </script>.
        // Case-insensitive, because the regex below matches <SCRIPT> as well.
        $htmlNormalized = \preg_replace('/<script([^>]*)\/>/i', '<script$1></script>', $html);
        if ($htmlNormalized !== null) {
            // preg_replace() returns null on error: keep the input instead of
            // casting the null to '' and throwing the whole document away.
            $html = $htmlNormalized;
        }

        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        /** @noinspection HtmlDeprecatedTag */
        $regExSpecialScript = '/<script(?<attr>[^>]*?)>(?<content>.*)<\/script>/isU';

        if (\PHP_VERSION_ID < 80000) {
            // On PHP < 8.0, older libxml's HTML parser can mishandle <\/ inside
            // <script> content, causing content after the sequence to leak outside
            // the element. Use a placeholder to protect any script content that
            // contains literal < characters so that loadHTML() receives safe input.
            $scriptCallback = function ($scripts) {
                if (empty($scripts['content'])) {
                    return $scripts[0];
                }

                // Revert any existing <\/ escaping to check for bare < chars.
                $contentReverted = \str_replace('<\/', '</', $scripts['content']);

                if (\strpos($contentReverted, '<') === false) {
                    return $scripts[0];
                }

                // Apply the same </ → <\/ escaping that PHP 8+ applies so that
                // when the placeholder is restored the output matches PHP 8+
                // behaviour.  Any <\/ already present is left untouched because
                // str_replace('</', ...) only matches the two-char sequence
                // '<' + '/' and '<\/' has '\' in between.
                $storedContent = \str_replace('</', '<\/', $scripts['content']);
                $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($storedContent);
                $this->registerDynamicDomBrokenReplaceHelper($storedContent, $matchesHash);

                return '<script' . $scripts['attr'] . '>' . $matchesHash . '</script>';
            };
        } else {
            $scriptCallback = static function ($scripts) {
                if (empty($scripts['content'])) {
                    return $scripts[0];
                }

                return '<script' . $scripts['attr'] . '>' . \str_replace('</', '<\/', $scripts['content']) . '</script>';
            };
        }

        $htmlTmp = \preg_replace_callback($regExSpecialScript, $scriptCallback, $html);

        // null means the regex failed; keep the HTML from the previous step
        if ($htmlTmp !== null) {
            $html = $htmlTmp;
        }
    }
542

543
    /**
     * Turn the placeholders used while parsing back into the original text:
     * optionally the dynamically registered broken-HTML placeholders first,
     * then the link / entity placeholders and the helper tags (wrapper tag is
     * removed, special script tag becomes "script" again).
     *
     * All replacements are case-insensitive.
     *
     * @param string $html
     * @param bool   $putBrokenReplacedBack also restore the entries of the
     *                                      broken-HTML replacement table
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html, bool $putBrokenReplacedBack = true): string
    {
        static $DOM_REPLACE__HELPER_CACHE = null;

        // the static part of the replacement table is built once per process
        if ($DOM_REPLACE__HELPER_CACHE === null) {
            $wrapperTag = self::$domHtmlWrapperHelper;
            $scriptTag = self::$domHtmlSpecialScriptHelper;

            // key => [placeholder, original]; the order of this list is the
            // order in which str_ireplace() applies the replacements
            $helperTagPairs = [
                'html_wrapper__start'               => ['<' . $wrapperTag . '>', ''],
                'html_wrapper__end'                 => ['</' . $wrapperTag . '>', ''],
                'html_wrapper__start_broken'        => [$wrapperTag . '>', ''],
                'html_wrapper__end_broken'          => ['</' . $wrapperTag, ''],
                'html_special_script__start'        => ['<' . $scriptTag, '<script'],
                'html_special_script__end'          => ['</' . $scriptTag . '>', '</script>'],
                'html_special_script__start_broken' => [$scriptTag, 'script'],
                'html_special_script__end_broken'   => ['</' . $scriptTag, '</script'],
            ];

            $placeholders = \array_merge(
                self::$domLinkReplaceHelper['tmp'],
                self::$domReplaceHelper['tmp']
            );
            $originals = \array_merge(
                self::$domLinkReplaceHelper['orig'],
                self::$domReplaceHelper['orig']
            );

            foreach ($helperTagPairs as $key => $pair) {
                $placeholders[$key] = $pair[0];
                $originals[$key] = $pair[1];
            }

            $DOM_REPLACE__HELPER_CACHE = [
                'tmp'  => $placeholders,
                'orig' => $originals,
            ];
        }

        if ($putBrokenReplacedBack === true && !empty(self::$domBrokenReplaceHelper['tmp'])) {
            $html = \str_ireplace(self::$domBrokenReplaceHelper['tmp'], self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_ireplace($DOM_REPLACE__HELPER_CACHE['tmp'], $DOM_REPLACE__HELPER_CACHE['orig'], $html);
    }
599

600
    /**
     * Replace characters that would not survive the DOM round trip with
     * placeholders: "[", "]", "{" and "}" inside http(s) URLs, and "&", "|",
     * "+", "%", "@" and the "<html ⚡" marker everywhere.
     *
     * URLs are detected case-insensitively, so "HTTPS://..." is handled like
     * "https://...".
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        // cheap pre-check before running the regex; it must be case-insensitive
        // because the regex below is case-insensitive as well
        if (\stripos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\(\w+\)|[^[:punct:]\s]|\/|}|]))/i';
            $matches = [];
            \preg_match_all($regExUrl, $html, $matches);

            if (!empty($matches[1])) {
                $linksOld = $matches[1];
                $linksNew = [];
                foreach ($linksOld as $linkKey => $linkOld) {
                    $linksNew[$linkKey] = \str_replace(
                        self::$domLinkReplaceHelper['orig'],
                        self::$domLinkReplaceHelper['tmp'],
                        $linkOld
                    );
                }

                // links go first, so they are replaced while they still match
                // the original text (before "&", "%", ... are rewritten)
                $search = \array_merge($linksOld, $search);
                $replace = \array_merge($linksNew, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
639
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc