• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

equalizedigital / accessibility-checker / 14623446636

23 Apr 2025 04:39PM UTC coverage: 26.595% (-1.2%) from 27.823%
14623446636

push

github

web-flow
Merge pull request #928 from equalizedigital/william/pro-51-convert-image-alt-redundant-rule-to-js

Convert image alt redundant rule to js

0 of 1 new or added line in 1 file covered. (0.0%)

68 existing lines in 3 files now uncovered.

1805 of 6787 relevant lines covered (26.59%)

1.32 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

46.27
/includes/simplehtmldom/simple_html_dom.php
1
<?php
2
// Exit if called directly.
3
if ( ! defined( 'ABSPATH' ) ) die;
4
/**
5
 * Website: http://sourceforge.net/projects/simplehtmldom/
6
 * Additional projects: http://sourceforge.net/projects/debugobject/
7
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
8
 *
9
 * Licensed under The MIT License
10
 * See the LICENSE file in the project root for more information.
11
 *
12
 * Authors:
13
 *   S.C. Chen
14
 *   John Schlick
15
 *   Rus Carroll
16
 *   logmanoriginal
17
 *
18
 * Contributors:
19
 *   Yousuke Kumakura
20
 *   Vadim Voituk
21
 *   Antcs
22
 *
23
 * Version Rev. 1.9 (290)
24
 */
25

26
// Node types (stored in simple_html_dom_node::$nodetype).
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles (compared against $_[HDOM_INFO_QUOTE] entries).
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node bookkeeping array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Defaults that the including application may pre-define to override.
if (!defined('DEFAULT_TARGET_CHARSET')) {
        define('DEFAULT_TARGET_CHARSET', 'UTF-8');
}
if (!defined('DEFAULT_BR_TEXT')) {
        define('DEFAULT_BR_TEXT', "\r\n");
}
if (!defined('DEFAULT_SPAN_TEXT')) {
        define('DEFAULT_SPAN_TEXT', ' ');
}
if (!defined('MAX_FILE_SIZE')) {
        define('MAX_FILE_SIZE', 6000000);
}

define('HDOM_SMARTY_AS_TEXT', 1);
49

50
/**
 * Read a file or URL with file_get_contents() and load it into a new
 * simple_html_dom instance.
 *
 * @param string        $url              Path or URL handed to file_get_contents().
 * @param bool          $use_include_path Forwarded to file_get_contents().
 * @param resource|null $context          Stream context forwarded to file_get_contents().
 * @param int           $offset           Read offset forwarded to file_get_contents().
 * @param int           $maxLen           Largest accepted content size in bytes;
 *                                        values <= 0 fall back to MAX_FILE_SIZE.
 * @param bool          $lowercase        Forwarded to the simple_html_dom constructor and to load().
 * @param bool          $forceTagsClosed  Forwarded to the simple_html_dom constructor.
 * @param string        $target_charset   Forwarded to the simple_html_dom constructor.
 * @param bool          $stripRN          Forwarded to the simple_html_dom constructor and to load().
 * @param string        $defaultBRText    Forwarded to the simple_html_dom constructor.
 * @param string        $defaultSpanText  Forwarded to the simple_html_dom constructor.
 *
 * @return mixed Whatever simple_html_dom::load() returns, or false when nothing
 *               could be read or the content is larger than $maxLen.
 */
function file_get_html(
        $url,
        $use_include_path = false,
        $context = null,
        $offset = 0,
        $maxLen = -1,
        $lowercase = true,
        $forceTagsClosed = true,
        $target_charset = DEFAULT_TARGET_CHARSET,
        $stripRN = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT)
{
        if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

        $dom = new simple_html_dom(
                null,
                $lowercase,
                $forceTagsClosed,
                $target_charset,
                $stripRN,
                $defaultBRText,
                $defaultSpanText
        );

        // Request one byte more than the limit. Reading exactly $maxLen bytes
        // would silently truncate oversized input and make the size check
        // below impossible to trigger.
        $contents = file_get_contents(
                $url,
                $use_include_path,
                $context,
                $offset,
                $maxLen + 1
        );

        // Covers a failed read (false), an empty document and oversized input.
        if (empty($contents) || strlen($contents) > $maxLen) {
                $dom->clear();
                return false;
        }

        return $dom->load($contents, $lowercase, $stripRN);
}
95

96
/**
 * Load an HTML string into a new simple_html_dom instance.
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Forwarded to the simple_html_dom constructor and to load().
 * @param bool   $forceTagsClosed Forwarded to the simple_html_dom constructor.
 * @param string $target_charset  Forwarded to the simple_html_dom constructor.
 * @param bool   $stripRN         Forwarded to the simple_html_dom constructor and to load().
 * @param string $defaultBRText   Forwarded to the simple_html_dom constructor.
 * @param string $defaultSpanText Forwarded to the simple_html_dom constructor.
 *
 * @return mixed Whatever simple_html_dom::load() returns, or false when $str
 *               is empty or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
        $str,
        $lowercase = true,
        $forceTagsClosed = true,
        $target_charset = DEFAULT_TARGET_CHARSET,
        $stripRN = true,
        $defaultBRText = DEFAULT_BR_TEXT,
        $defaultSpanText = DEFAULT_SPAN_TEXT)
{
        $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

        $usable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;

        if ($usable) {
                return $dom->load($str, $lowercase, $stripRN);
        }

        // Nothing to parse: release the unused DOM before reporting failure.
        $dom->clear();
        return false;
}
122

123
/**
 * Print the tree below $node by delegating to the node's dump() method.
 *
 * @param object $node      Node exposing dump($show_attr, $depth).
 * @param bool   $show_attr Whether attributes are printed next to each tag.
 * @param int    $deep      Indentation depth to start from.
 *
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
        // Forward the caller's options. Previously the node itself was passed
        // as $show_attr and $deep was dropped.
        $node->dump($show_attr, $deep);
}
127

128
class simple_html_dom_node
129
{
130
        public $nodetype = HDOM_TYPE_TEXT;
131
        public $tag = 'text';
132
        public $attr = array();
133
        public $children = array();
134
        public $nodes = array();
135
        public $parent = null;
136
        public $_ = array();
137
        public $tag_start = 0;
138
        private $dom = null;
139

140
        /**
         * Attach the new node to its owning DOM.
         *
         * @param object $dom Owning DOM; the node appends itself to $dom->nodes.
         */
        public function __construct($dom)
        {
                $dom->nodes[] = $this;
                $this->dom = $dom;
        }
145

146
        /**
         * Drop all references held by this node when it is destroyed.
         */
        public function __destruct()
        {
                $this->clear();
        }
150

151
        /**
         * String conversion yields the node's outer markup.
         *
         * @return string Result of outertext().
         */
        public function __toString()
        {
                $markup = $this->outertext();
                return $markup;
        }
155

156
        /**
         * Null out the links to the DOM, the parent and the child lists.
         *
         * Note that $nodes and $children become null (not empty arrays) afterwards.
         *
         * @return void
         */
        public function clear()
        {
                $this->dom      = null;
                $this->nodes    = null;
                $this->parent   = null;
                $this->children = null;
        }
163

164
        /**
         * Echo this node's tag (and optionally its attributes) on one line,
         * then recurse into $this->nodes one indentation level deeper.
         *
         * @param bool $show_attr Print the attribute list after the tag name.
         * @param int  $depth     Number of leading tab characters.
         *
         * @return void
         */
        public function dump($show_attr = true, $depth = 0)
        {
                $indent = str_repeat("\t", $depth);
                echo $indent . $this->tag;

                $with_attributes = $show_attr && count($this->attr) > 0;

                if ($with_attributes) {
                        echo '(';
                        foreach ($this->attr as $name => $value) {
                                echo "[$name]=>\"$value\", ";
                        }
                        echo ')';
                }

                echo "\n";

                // Leaf node (or cleared node): nothing further to print.
                if (!$this->nodes) {
                        return;
                }

                $child_depth = $depth + 1;

                foreach ($this->nodes as $child) {
                        $child->dump($show_attr, $child_depth);
                }
        }
184

185
        /**
         * Build a one-line debug description of this node: tag, attributes,
         * the contents of the bookkeeping array $_, text (when set), inner
         * info, child/node counts and tag_start.
         *
         * @param bool $echo When true the description is echoed and nothing is
         *                   returned; when false the description is returned.
         *
         * @return string|void
         */
        public function dump_node($echo = true)
        {
                $string = $this->tag;

                if (count($this->attr) > 0) {
                        $string .= '(';
                        foreach ($this->attr as $k => $v) {
                                $string .= "[$k]=>\"$v\", ";
                        }
                        $string .= ')';
                }

                if (count($this->_) > 0) {
                        $string .= ' $_ (';
                        foreach ($this->_ as $k => $v) {
                                if (is_array($v)) {
                                        // One level of nesting is expanded inline.
                                        $string .= "[$k]=>(";
                                        foreach ($v as $k2 => $v2) {
                                                $string .= "[$k2]=>\"$v2\", ";
                                        }
                                        $string .= ')';
                                } else {
                                        $string .= "[$k]=>\"$v\", ";
                                }
                        }
                        $string .= ')';
                }

                if (isset($this->text)) {
                        $string .= " text: ({$this->text})";
                }

                $string .= ' HDOM_INNER_INFO: ';

                // Read the inner info from this node. The previous code tested an
                // undefined $node variable, so this branch could never be taken.
                if (isset($this->_[HDOM_INFO_INNER])) {
                        $string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
                } else {
                        $string .= ' NULL ';
                }

                $string .= ' children: ' . count($this->children);
                $string .= ' nodes: ' . count($this->nodes);
                $string .= ' tag_start: ' . $this->tag_start;
                $string .= "\n";

                if ($echo) {
                        echo $string;
                        return;
                } else {
                        return $string;
                }
        }
237

238
        /**
         * Get the parent node, or set it when $parent is given.
         *
         * Known limitation (noted by the original authors): when re-parenting,
         * the node is NOT removed from its previous parent's nodes/children
         * lists first; it is only appended to the new parent's lists.
         *
         * @param object|null $parent New parent node, or null to only read.
         *
         * @return object|null The (possibly updated) parent.
         */
        public function parent($parent = null)
        {
                if ($parent === null) {
                        return $this->parent;
                }

                $this->parent = $parent;
                $parent->nodes[] = $this;
                $parent->children[] = $this;

                return $this->parent;
        }
251

252
        /**
         * Tell whether the node has at least one entry in $children.
         *
         * @return bool
         */
        public function has_child()
        {
                if (empty($this->children)) {
                        return false;
                }

                return true;
        }
256

257
        /**
         * Return all children, or a single child by index.
         *
         * @param int $idx Index into $children; -1 (the default) returns the whole list.
         *
         * @return mixed The children list, the child at $idx, or null when
         *               no child exists at that index.
         */
        public function children($idx = -1)
        {
                if ($idx === -1) {
                        return $this->children;
                }

                return isset($this->children[$idx]) ? $this->children[$idx] : null;
        }
269

270
        /**
         * Return the first entry of $children.
         *
         * @return object|null Null when there are no children.
         */
        public function first_child()
        {
                return (count($this->children) > 0) ? $this->children[0] : null;
        }
277

278
        /**
         * Return the last entry of $children.
         *
         * @return object|null Null when there are no children.
         */
        public function last_child()
        {
                return (count($this->children) > 0) ? end($this->children) : null;
        }
285

286
        /**
         * Return the node that follows this one in the parent's $children list.
         *
         * @return object|null Null when there is no parent, this node is not
         *                     in the parent's children, or it is the last one.
         */
        public function next_sibling()
        {
                if ($this->parent === null) {
                        return null;
                }

                $siblings = $this->parent->children;
                $position = array_search($this, $siblings, true);

                if ($position === false || !isset($siblings[$position + 1])) {
                        return null;
                }

                return $siblings[$position + 1];
        }
300

301
        /**
         * Return the node that precedes this one in the parent's $children list.
         *
         * @return object|null Null when there is no parent, this node is not
         *                     in the parent's children, or it is the first one.
         */
        public function prev_sibling()
        {
                if ($this->parent === null) {
                        return null;
                }

                $siblings = $this->parent->children;
                $position = array_search($this, $siblings, true);

                if ($position === false || $position <= 0) {
                        return null;
                }

                return $siblings[$position - 1];
        }
315

316
        /**
         * Walk up the parent chain and return the nearest ancestor whose tag
         * is identical to $tag.
         *
         * When the global $debug_object is an object, the call and each
         * visited ancestor are logged through it.
         *
         * @param string $tag Tag name compared with === against each ancestor's tag.
         *
         * @return object|null The matching ancestor, or null when none matches.
         */
        public function find_ancestor_tag($tag)
        {
                global $debug_object;
                $debugging = is_object($debug_object);

                if ($debugging) { $debug_object->debug_log_entry(1); }

                // Ends with $ancestor === null when the chain is exhausted.
                for ($ancestor = $this->parent; !is_null($ancestor); $ancestor = $ancestor->parent) {
                        if (is_object($debug_object)) {
                                $debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
                        }

                        if ($ancestor->tag === $tag) {
                                break;
                        }
                }

                return $ancestor;
        }
341

342
        /**
         * Return the markup inside this node.
         *
         * Precedence: a stored HDOM_INFO_INNER value, then a stored
         * HDOM_INFO_TEXT value (passed through the DOM's restore_noise()),
         * otherwise the concatenated outertext() of every entry in $nodes.
         *
         * @return string
         */
        public function innertext()
        {
                $info = $this->_;

                if (isset($info[HDOM_INFO_INNER])) {
                        return $info[HDOM_INFO_INNER];
                }

                if (isset($info[HDOM_INFO_TEXT])) {
                        return $this->dom->restore_noise($info[HDOM_INFO_TEXT]);
                }

                $markup = '';

                foreach ($this->nodes as $child) {
                        $markup .= $child->outertext();
                }

                return $markup;
        }
360

361
        /**
         * Return the markup of this node including its own tag.
         *
         * Order of evaluation:
         *  1. optional debug logging through the global $debug_object;
         *  2. the root node returns its innertext();
         *  3. the DOM's callback (when set) is invoked with this node;
         *  4. a stored HDOM_INFO_OUTER value is returned as is;
         *  5. a stored HDOM_INFO_TEXT value is returned via restore_noise();
         *  6. otherwise the opening tag (makeup()), the inner content and,
         *     when HDOM_INFO_END is set and non-zero, the closing tag are joined.
         *
         * @return string
         */
        public function outertext()
        {
                global $debug_object;

                if (is_object($debug_object)) {
                        $suffix = '';

                        if ($this->tag === 'text' && !empty($this->text)) {
                                $suffix = ' with text: ' . $this->text;
                        }

                        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
                }

                if ($this->tag === 'root') {
                        return $this->innertext();
                }

                // todo: What is the use of this callback? Remove?
                if ($this->dom && $this->dom->callback !== null) {
                        call_user_func_array($this->dom->callback, array($this));
                }

                // Read $this->_ only after the callback ran, since the callback
                // receives this node and could change it.
                if (isset($this->_[HDOM_INFO_OUTER])) {
                        return $this->_[HDOM_INFO_OUTER];
                }

                if (isset($this->_[HDOM_INFO_TEXT])) {
                        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
                }

                $markup = '';

                // Opening tag, rendered by the node registered at HDOM_INFO_BEGIN.
                if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
                        $markup = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
                }

                if (isset($this->_[HDOM_INFO_INNER])) {
                        // todo: <br> should either never have HDOM_INFO_INNER or always
                        if ($this->tag !== 'br') {
                                $markup .= $this->_[HDOM_INFO_INNER];
                        }
                } elseif ($this->nodes) {
                        foreach ($this->nodes as $child) {
                                $markup .= $this->convert_text($child->outertext());
                        }
                }

                $has_end_tag = isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0;

                if ($has_end_tag) {
                        $markup .= '</' . $this->tag . '>';
                }

                return $markup;
        }
417

418
        /**
         * Return the plain-text content of this node.
         *
         * A stored HDOM_INFO_INNER value wins. Text nodes return their stored
         * text via restore_noise(); comment and unknown nodes, as well as
         * script and style elements, return ''. For everything else the text
         * of each entry in $nodes is concatenated, with a blank line started
         * before every <p> and the DOM's default_span_text appended after
         * every <span>.
         *
         * @return string
         */
        public function text()
        {
                if (isset($this->_[HDOM_INFO_INNER])) {
                        return $this->_[HDOM_INFO_INNER];
                }

                switch ($this->nodetype) {
                        case HDOM_TYPE_TEXT:
                                return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
                        case HDOM_TYPE_COMMENT:
                        case HDOM_TYPE_UNKNOWN:
                                return '';
                }

                if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0) {
                        return '';
                }

                $plain = '';

                // $this->nodes can be null (the original authors observed this for
                // some span and p elements without an explicit clear()); treat
                // that as "no content".
                if (is_null($this->nodes)) {
                        return $plain;
                }

                foreach ($this->nodes as $child) {
                        // Start paragraph after a blank line
                        if ($child->tag === 'p') {
                                $plain = trim($plain) . "\n\n";
                        }

                        $plain .= $this->convert_text($child->text());

                        // Separate consecutive spans so their text does not run together.
                        if ($child->tag === 'span') {
                                $plain .= $this->dom->default_span_text;
                        }
                }

                return $plain;
        }
459

460
        /**
         * Return innertext() with CDATA wrappers removed: every '<![CDATA['
         * (matched case-insensitively) and every ']]>' is stripped.
         *
         * @return string
         */
        public function xmltext()
        {
                $without_open = str_ireplace('<![CDATA[', '', $this->innertext());

                return str_replace(']]>', '', $without_open);
        }
467

468
        /**
         * Render this node's opening tag including its attributes.
         *
         * Nodes carrying HDOM_INFO_TEXT (text, comment, unknown) return that
         * text via restore_noise() instead. Attributes whose value is null or
         * false are skipped; a value of true renders the bare attribute name.
         * Spacing and quoting come from the HDOM_INFO_SPACE and
         * HDOM_INFO_QUOTE entries at the attribute's position.
         *
         * @return string
         */
        public function makeup()
        {
                // text, comment, unknown
                if (isset($this->_[HDOM_INFO_TEXT])) {
                        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
                }

                $tag = '<' . $this->tag;
                $position = -1;

                foreach ($this->attr as $name => $value) {
                        // Counts every attribute, including skipped ones, so it stays
                        // aligned with the HDOM_INFO_SPACE / HDOM_INFO_QUOTE lists.
                        $position++;

                        // skip removed attribute
                        if ($value === null || $value === false) {
                                continue;
                        }

                        $tag .= $this->_[HDOM_INFO_SPACE][$position][0];

                        // no value attr: nowrap, checked selected...
                        if ($value === true) {
                                $tag .= $name;
                                continue;
                        }

                        // Loose comparison on purpose: mirrors switch semantics.
                        $quote_type = $this->_[HDOM_INFO_QUOTE][$position];

                        if ($quote_type == HDOM_QUOTE_DOUBLE) {
                                $quote = '"';
                        } elseif ($quote_type == HDOM_QUOTE_SINGLE) {
                                $quote = '\'';
                        } else {
                                $quote = '';
                        }

                        $tag .= $name
                                . $this->_[HDOM_INFO_SPACE][$position][1]
                                . '='
                                . $this->_[HDOM_INFO_SPACE][$position][2]
                                . $quote
                                . $value
                                . $quote;
                }

                $tag = $this->dom->restore_noise($tag);

                return $tag . $this->_[HDOM_INFO_ENDSPACE] . '>';
        }
510

511
        /**
         * Find nodes below this node that match a selector string.
         *
         * The selector is parsed by parse_selector() into a list of selector
         * chains. Each chain is resolved level by level with seek(), starting
         * from this node; matches from all chains are merged, ordered by their
         * key in the DOM's node list and returned.
         *
         * @param string   $selector  Selector expression.
         * @param int|null $idx       Null returns all matches; otherwise the
         *                            match at that index (negative counts from the end).
         * @param bool     $lowercase Forwarded to seek().
         *
         * @return mixed Array of matches when $idx is null, otherwise the
         *               selected node or null when that index does not exist.
         */
        public function find($selector, $idx = null, $lowercase = false)
        {
                $selectors = $this->parse_selector($selector);
                $count = count($selectors);

                if ($count === 0) { return array(); }

                $found_keys = array();

                // find each selector
                for ($c = 0; $c < $count; ++$c) {
                        // Per sourceforge tracker id 2788009 the level count must be
                        // taken from the current chain, not from the first one.
                        $levels = count($selectors[$c]);

                        if ($levels === 0) { return array(); }
                        if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

                        $head = array($this->_[HDOM_INFO_BEGIN] => 1);
                        $combinator = ' ';

                        // handle descendant selectors, no recursive!
                        for ($l = 0; $l < $levels; ++$l) {
                                $matches = array();

                                foreach ($head as $key => $unused) {
                                        $start = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                                        // PaperG - Pass this optional parameter on to the seek function.
                                        $start->seek($selectors[$c][$l], $matches, $combinator, $lowercase);
                                }

                                $head = $matches;
                                $combinator = $selectors[$c][$l][4]; // Next Combinator
                        }

                        foreach ($head as $key => $unused) {
                                if (!isset($found_keys[$key])) {
                                        $found_keys[$key] = 1;
                                }
                        }
                }

                // sort keys
                ksort($found_keys);

                $found = array();

                foreach ($found_keys as $key => $unused) {
                        $found[] = $this->dom->nodes[$key];
                }

                // return nth-element or array
                if (is_null($idx)) {
                        return $found;
                }

                if ($idx < 0) {
                        $idx = count($found) + $idx;
                }

                return isset($found[$idx]) ? $found[$idx] : null;
        }
562

563
        protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
564
        {
565
                global $debug_object;
2✔
566
                if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2✔
567

568
                list($tag, $id, $class, $attributes, $cmb) = $selector;
2✔
569
                $nodes = array();
2✔
570

571
                if ($parent_cmd === ' ') { // Descendant Combinator
2✔
572
                        // Find parent closing tag if the current element doesn't have a closing
573
                        // tag (i.e. void element)
574
                        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
2✔
575
                        if ($end == 0) {
2✔
576
                                $parent = $this->parent;
×
577
                                while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) {
×
578
                                        $end -= 1;
×
579
                                        $parent = $parent->parent;
×
580
                                }
581
                                $end += $parent->_[HDOM_INFO_END];
×
582
                        }
583

584
                        // Get list of target nodes
585
                        $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
2✔
586
                        $nodes_count = $end - $nodes_start;
2✔
587
                        $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
2✔
588
                } elseif ($parent_cmd === '>') { // Child Combinator
×
589
                        $nodes = $this->children;
×
590
                } elseif ($parent_cmd === '+'
×
591
                        && $this->parent
×
592
                        && in_array($this, $this->parent->children, true)) { // Next-Sibling Combinator
×
593
                                $index = array_search($this, $this->parent->children, true) + 1;
×
594
                                if ($index < count($this->parent->children))
×
595
                                        $nodes[] = $this->parent->children[$index];
×
596
                } elseif ($parent_cmd === '~'
×
597
                        && $this->parent
×
598
                        && in_array($this, $this->parent->children, true)) { // Subsequent Sibling Combinator
×
599
                                $index = array_search($this, $this->parent->children, true);
×
600
                                $nodes = array_slice($this->parent->children, $index);
×
601
                }
602

603
                // Go throgh each element starting at this element until the end tag
604
                // Note: If this element is a void tag, any previous void element is
605
                // skipped.
606
                foreach($nodes as $node) {
2✔
607
                        $pass = true;
2✔
608

609
                        // Skip root nodes
610
                        if(!$node->parent) {
2✔
611
                                $pass = false;
2✔
612
                        }
613

614
                        // Skip if node isn't a child node (i.e. text nodes)
615
                        if($pass && !in_array($node, $node->parent->children, true)) {
2✔
616
                                $pass = false;
2✔
617
                        }
618

619
                        // Skip if tag doesn't match
620
                        if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
2✔
621
                                $pass = false;
2✔
622
                        }
623

624
                        // Skip if ID doesn't exist
625
                        if ($pass && $id !== '' && !isset($node->attr['id'])) {
2✔
626
                                $pass = false;
2✔
627
                        }
628

629
                        // Check if ID matches
630
                        if ($pass && $id !== '' && isset($node->attr['id'])) {
2✔
631
                                // Note: Only consider the first ID (as browsers do)
632
                                $node_id = explode(' ', trim($node->attr['id']))[0];
×
633

634
                                if($id !== $node_id) { $pass = false; }
×
635
                        }
636

637
                        // Check if all class(es) exist
638
                        if ($pass && $class !== '' && is_array($class) && !empty($class)) {
2✔
639
                                if (isset($node->attr['class'])) {
2✔
640
                                        $node_classes = explode(' ', $node->attr['class']);
×
641

642
                                        if ($lowercase) {
×
643
                                                $node_classes = array_map('strtolower', $node_classes);
×
644
                                        }
645

646
                                        foreach($class as $c) {
×
647
                                                if(!in_array($c, $node_classes)) {
×
648
                                                        $pass = false;
×
649
                                                        break;
×
650
                                                }
651
                                        }
652
                                } else {
653
                                        $pass = false;
2✔
654
                                }
655
                        }
656

657
                        // Check attributes
658
                        if ($pass
2✔
659
                                && $attributes !== ''
2✔
660
                                && is_array($attributes)
2✔
661
                                && !empty($attributes)) {
2✔
662
                                        foreach($attributes as $a) {
2✔
663
                                                list (
2✔
664
                                                        $att_name,
2✔
665
                                                        $att_expr,
2✔
666
                                                        $att_val,
2✔
667
                                                        $att_inv,
2✔
668
                                                        $att_case_sensitivity
2✔
669
                                                ) = $a;
2✔
670

671
                                                // Handle indexing attributes (i.e. "[2]")
672
                                                /**
673
                                                 * Note: This is not supported by the CSS Standard but adds
674
                                                 * the ability to select items compatible to XPath (i.e.
675
                                                 * the 3rd element within it's parent).
676
                                                 *
677
                                                 * Note: This doesn't conflict with the CSS Standard which
678
                                                 * doesn't work on numeric attributes anyway.
679
                                                 */
680
                                                if (is_numeric($att_name)
2✔
681
                                                        && $att_expr === ''
2✔
682
                                                        && $att_val === '') {
2✔
683
                                                                $count = 0;
×
684

685
                                                                // Find index of current element in parent
686
                                                                foreach ($node->parent->children as $c) {
×
687
                                                                        if ($c->tag === $node->tag) ++$count;
×
688
                                                                        if ($c === $node) break;
×
689
                                                                }
690

691
                                                                // If this is the correct node, continue with next
692
                                                                // attribute
693
                                                                if ($count === (int)$att_name) continue;
×
694
                                                }
695

696
                                                // Check attribute availability
697
                                                if ($att_inv) { // Attribute should NOT be set
2✔
698
                                                        if (isset($node->attr[$att_name])) {
×
699
                                                                $pass = false;
×
700
                                                                break;
×
701
                                                        }
702
                                                } else { // Attribute should be set
703
                                                        // todo: "plaintext" is not a valid CSS selector!
704
                                                        if ($att_name !== 'plaintext'
2✔
705
                                                                && !isset($node->attr[$att_name])) {
2✔
706
                                                                        $pass = false;
2✔
707
                                                                        break;
2✔
708
                                                        }
709
                                                }
710

711
                                                // Continue with next attribute if expression isn't defined
712
                                                if ($att_expr === '') continue;
2✔
713

714
                                                // If they have told us that this is a "plaintext"
715
                                                // search then we want the plaintext of the node - right?
716
                                                // todo "plaintext" is not a valid CSS selector!
717
                                                if ($att_name === 'plaintext') {
2✔
718
                                                        $nodeKeyValue = $node->text();
×
719
                                                } else {
720
                                                        $nodeKeyValue = $node->attr[$att_name];
2✔
721
                                                }
722

723
                                                if (is_object($debug_object)) {
2✔
724
                                                        $debug_object->debug_log(2,
×
725
                                                                'testing node: '
×
726
                                                                . $node->tag
×
727
                                                                . ' for attribute: '
×
728
                                                                . $att_name
×
729
                                                                . $att_expr
×
730
                                                                . $att_val
×
731
                                                                . ' where nodes value is: '
×
732
                                                                . $nodeKeyValue
×
733
                                                        );
×
734
                                                }
735

736
                                                // If lowercase is set, do a case insensitive test of
737
                                                // the value of the selector.
738
                                                if ($lowercase) {
2✔
739
                                                        $check = $this->match(
2✔
740
                                                                $att_expr,
2✔
741
                                                                strtolower($att_val),
2✔
742
                                                                strtolower($nodeKeyValue),
2✔
743
                                                                $att_case_sensitivity
2✔
744
                                                        );
2✔
745
                                                } else {
UNCOV
746
                                                        $check = $this->match(
×
UNCOV
747
                                                                $att_expr,
×
UNCOV
748
                                                                $att_val,
×
UNCOV
749
                                                                $nodeKeyValue,
×
UNCOV
750
                                                                $att_case_sensitivity
×
UNCOV
751
                                                        );
×
752
                                                }
753

754
                                                if (is_object($debug_object)) {
2✔
755
                                                        $debug_object->debug_log(2,
×
756
                                                                'after match: '
×
757
                                                                . ($check ? 'true' : 'false')
×
758
                                                        );
×
759
                                                }
760

761
                                                if (!$check) {
2✔
UNCOV
762
                                                        $pass = false;
×
UNCOV
763
                                                        break;
×
764
                                                }
765
                                        }
766
                        }
767

768
                        // Found a match. Add to list and clear node
769
                        if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
2✔
770
                        unset($node);
2✔
771
                }
772
                // It's passed by reference so this is actually what this function returns.
773
                if (is_object($debug_object)) {
2✔
774
                        $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
×
775
                }
776
        }
777

778
        /**
         * Test a node value against the value given in an attribute selector.
         *
         * @param string $exp              Selector operator: '=', '!=', '^=', '$=',
         *                                 '*=', '|=' or '~='.
         * @param string $pattern          Value written in the selector.
         * @param string $value            Value taken from the node.
         * @param string $case_sensitivity 'i' lowercases both operands before
         *                                 comparing; any other value leaves them
         *                                 untouched.
         *
         * @return bool True when $value satisfies the expression. Unknown
         *              operators never match.
         */
        protected function match($exp, $pattern, $value, $case_sensitivity)
        {
                global $debug_object;
                if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

                if ($case_sensitivity === 'i') {
                        $pattern = strtolower($pattern);
                        $value = strtolower($value);
                }

                switch ($exp) {
                        case '=':
                                return ($value === $pattern);
                        case '!=':
                                return ($value !== $pattern);
                        case '^=':
                                // preg_match() yields 1/0/false; normalise to a boolean.
                                return preg_match('/^' . preg_quote($pattern, '/') . '/', $value) === 1;
                        case '$=':
                                return preg_match('/' . preg_quote($pattern, '/') . '$/', $value) === 1;
                        case '*=':
                                return preg_match('/' . preg_quote($pattern, '/') . '/', $value) === 1;
                        case '|=':
                                /**
                                 * [att|=val]
                                 *
                                 * Represents an element with the att attribute, its value
                                 * either being exactly "val" or beginning with "val"
                                 * immediately followed by "-" (U+002D).
                                 *
                                 * A bare prefix test would also accept "english" for
                                 * "en", so the hyphen is made part of the prefix.
                                 */
                                return $value === $pattern
                                        || strpos($value, $pattern . '-') === 0;
                        case '~=':
                                /**
                                 * [att~=val]
                                 *
                                 * Represents an element with the att attribute whose value is a
                                 * whitespace-separated list of words, one of which is exactly
                                 * "val". If "val" contains whitespace, it will never represent
                                 * anything (since the words are separated by spaces). Also if
                                 * "val" is the empty string, it will never represent anything.
                                 */
                                if ($pattern === '' || preg_match('/\s/', $pattern) === 1) {
                                        return false;
                                }

                                // Split on any run of whitespace (tabs and newlines
                                // included) and drop the empty words.
                                $words = preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY);

                                return in_array($pattern, $words, true);
                }
                return false;
        }
822

823
        /**
         * Break a CSS selector string into its parsed parts.
         *
         * The result is a list with one entry per comma separated selector.
         * Each entry is a list of compound selectors, every one an array of
         *   [0] tag name (lowercased when the DOM is in lowercase mode),
         *   [1] id name,
         *   [2] '' or the list of class names,
         *   [3] '' or a list of array(name, expression, value, inverted flag,
         *       case sensitivity),
         *   [4] separator leading to the next compound (' ' for descendant,
         *       '' when the compound was followed by a comma).
         *
         * @param string $selector_string Selector to parse.
         *
         * @return array Parsed selectors as described above.
         */
        protected function parse_selector($selector_string)
        {
                global $debug_object;
                if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

                /**
                 * Compound selector pattern, modified from mootools
                 * (https://mootools.net/). Capture groups:
                 *
                 * [1] tag name, [2] id name, [3] class names (dot separated),
                 * [4] attribute list, [5] separator.
                 *
                 * Attribute names may contain a colon (<tag attr:ibute="x">) and
                 * may be written with a leading "@" that is not captured.
                 */
                // phpcs:ignore Generic.Files.LineLength
                $compound_pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

                /**
                 * Single attribute pattern (based on the pattern above). Groups:
                 * [1] name (a "!" prefix negates it), [2] expression, [3] value,
                 * [4] case sensitivity.
                 */
                $attribute_pattern = "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is";

                // The appended ' ' acts as a pseudo separator for the last compound.
                $subject = trim($selector_string) . ' ';
                preg_match_all($compound_pattern, $subject, $matches, PREG_SET_ORDER);

                if (is_object($debug_object)) {
                        $debug_object->debug_log(2, 'Matches Array: ', $matches);
                }

                $selectors = array();
                $current = array();

                foreach ($matches as $m) {
                        // Skip NoOps
                        $full_match = trim($m[0]);
                        if ($full_match === '' || $full_match === '/' || $full_match === '//') {
                                continue;
                        }

                        $tag = $this->dom->lowercase ? strtolower($m[1]) : $m[1];
                        $id = $m[2];
                        $classes = ($m[3] === '') ? $m[3] : explode('.', $m[3]);

                        $attributes = $m[4];
                        if ($attributes !== '') {
                                preg_match_all(
                                        $attribute_pattern,
                                        trim($attributes),
                                        $attribute_matches,
                                        PREG_SET_ORDER
                                );

                                $attributes = array();

                                foreach ($attribute_matches as $att) {
                                        // Skip empty matches
                                        if (trim($att[0]) === '') {
                                                continue;
                                        }

                                        $inverted = (isset($att[1][0]) && $att[1][0] === '!');
                                        $att_name = $inverted ? substr($att[1], 1) : $att[1];
                                        $att_expr = isset($att[2]) ? $att[2] : '';
                                        $att_value = isset($att[3]) ? $att[3] : '';
                                        $att_case = isset($att[4]) ? strtolower($att[4]) : '';

                                        $attributes[] = array(
                                                $att_name,
                                                $att_expr,
                                                $att_value,
                                                $inverted,
                                                $att_case,
                                        );
                                }
                        }

                        // A separator made only of whitespace is the descendant
                        // combinator; anything else is reduced to its trimmed form.
                        $separator = trim($m[5]);
                        if ($separator === '' && $m[5] !== '') {
                                $separator = ' ';
                        }

                        // A comma closes the current selector and carries no separator.
                        $ends_entry = ($separator === ',');
                        if ($ends_entry) {
                                $separator = '';
                        }

                        $current[] = array($tag, $id, $classes, $attributes, $separator);

                        if ($ends_entry) {
                                $selectors[] = $current;
                                $current = array();
                        }
                }

                if (count($current) > 0) {
                        $selectors[] = $current;
                }

                return $selectors;
        }
961

962
        /**
         * Magic getter for attributes and the virtual text properties.
         *
         * A set (non-null) attribute is returned after passing through
         * convert_text(). Otherwise 'outertext', 'innertext', 'plaintext' and
         * 'xmltext' return the result of the matching method, and any other
         * name returns whether the attribute key exists at all.
         *
         * @param string $name Attribute or virtual property name.
         *
         * @return mixed See description.
         */
        function __get($name)
        {
                if (!isset($this->attr[$name])) {
                        switch ($name) {
                                case 'outertext':
                                        return $this->outertext();
                                case 'innertext':
                                        return $this->innertext();
                                case 'plaintext':
                                        return $this->text();
                                case 'xmltext':
                                        return $this->xmltext();
                        }

                        // Unknown name: report whether the key exists (covers
                        // attributes stored with a null value).
                        return array_key_exists($name, $this->attr);
                }

                return $this->convert_text($this->attr[$name]);
        }
975

976
        /**
         * Magic setter for attributes and the virtual text properties.
         *
         * 'outertext' overwrites the stored outer text. 'innertext' overwrites
         * the stored text entry when one exists and the inner text entry
         * otherwise. Every other name is stored as an attribute; an attribute
         * that was not set before also gets default spacing and double-quote
         * entries appended.
         *
         * @param string $name  Attribute or virtual property name.
         * @param mixed  $value Value to store.
         *
         * @return mixed The stored value for 'outertext'/'innertext', nothing
         *               otherwise.
         */
        function __set($name, $value)
        {
                global $debug_object;
                if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

                switch ($name) {
                        case 'outertext':
                                $this->_[HDOM_INFO_OUTER] = $value;
                                return $value;
                        case 'innertext':
                                $slot = isset($this->_[HDOM_INFO_TEXT])
                                        ? HDOM_INFO_TEXT
                                        : HDOM_INFO_INNER;
                                $this->_[$slot] = $value;
                                return $value;
                }

                $is_new_attribute = !isset($this->attr[$name]);

                if ($is_new_attribute) {
                        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
                        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
                }

                $this->attr[$name] = $value;
        }
997

998
        /**
         * Magic isset handler.
         *
         * The virtual properties 'outertext', 'innertext' and 'plaintext' are
         * always reported as set. Any other name is set when the attribute key
         * exists, which includes attributes without a value (nowrap, checked,
         * selected, ...).
         *
         * @param string $name Attribute or virtual property name.
         *
         * @return bool
         */
        function __isset($name)
        {
                switch ($name) {
                        case 'outertext':
                        case 'innertext':
                        case 'plaintext':
                                return true;
                }

                // A missing key can never satisfy isset(), so the key test alone
                // decides the result.
                return array_key_exists($name, $this->attr);
        }
1008

1009
        /**
         * Magic unset handler: drops an attribute that is currently set.
         *
         * Attributes that are missing or hold null are left untouched.
         *
         * @param string $name Attribute name.
         */
        function __unset($name)
        {
                if (!isset($this->attr[$name])) {
                        return;
                }

                unset($this->attr[$name]);
        }
1013

1014
        /**
         * Convert text from the document charset to the target charset.
         *
         * Both charsets are read from the owning DOM. Conversion through
         * iconv() only happens when both are known and differ; it is skipped
         * when the target is UTF-8 and the text already passes is_utf8(). When
         * the target is UTF-8 a byte order mark at the start or the end of the
         * text is removed.
         *
         * @param string $text Text to convert.
         *
         * @return mixed The converted text (whatever iconv() returned when a
         *               conversion took place).
         */
        function convert_text($text)
        {
                global $debug_object;
                if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

                $from = '';
                $to = '';

                if ($this->dom) {
                        $from = strtoupper($this->dom->_charset);
                        $to = strtoupper($this->dom->_target_charset);
                }

                if (is_object($debug_object)) {
                        $message = 'source charset: ' . $from . ' target charaset: ' . $to;
                        $debug_object->debug_log(3, $message);
                }

                $output = $text;

                $charsets_differ = !empty($from)
                        && !empty($to)
                        && (strcasecmp($from, $to) != 0);

                if ($charsets_differ) {
                        // The reported encoding may be wrong: text that is already
                        // valid UTF-8 is not converted again.
                        $already_utf8 = (strcasecmp($to, 'UTF-8') == 0)
                                && ($this->is_utf8($text));

                        if (!$already_utf8) {
                                $output = iconv($from, $to, $text);
                        }
                }

                // Make sure no BOM ends up in UTF-8 output.
                if ($to === 'UTF-8') {
                        $bom = "\xef\xbb\xbf";

                        if (substr($output, 0, 3) === $bom) {
                                $output = substr($output, 3);
                        }

                        if (substr($output, -3) === $bom) {
                                $output = substr($output, 0, -3);
                        }
                }

                return $output;
        }
1063

1064
        /**
         * Check whether a string is valid UTF-8.
         *
         * Validation is delegated to PCRE: an empty pattern with the "u"
         * modifier matches every well-formed UTF-8 subject and fails on
         * malformed input. That rejects stray continuation bytes (a lone 0x80
         * included), truncated sequences, overlong encodings and the obsolete
         * 5 and 6 byte forms.
         *
         * @param string $str Bytes to validate.
         *
         * @return bool True when $str is well-formed UTF-8. The empty string
         *              is valid.
         */
        static function is_utf8($str)
        {
                // preg_match() returns false (not 0) for a malformed subject.
                return preg_match('//u', (string) $str) === 1;
        }
1090

1091
        /**
         * Determine the display size of an image element.
         *
         * The width and height attributes are used when present. A dimension
         * still unknown afterwards is taken from the inline style attribute,
         * provided it is given as a whole number of pixels (for example
         * "120px"). Zero is a valid size and surrounding whitespace in the
         * style value is ignored.
         *
         * Not implemented: sizes that come from classes, ids, parent elements
         * or external style sheets.
         *
         * @return array|false array('height' => ..., 'width' => ...) where an
         *                     unknown dimension is -1, or false when this node
         *                     is not an img element.
         */
        function get_display_size()
        {
                if ($this->tag !== 'img') {
                        return false;
                }

                $size = array(
                        'height' => -1,
                        'width' => -1
                );
                $dimensions = array('width', 'height');

                // See if there is a height or width attribute in the tag itself.
                foreach ($dimensions as $dimension) {
                        if (isset($this->attr[$dimension])) {
                                $size[$dimension] = $this->attr[$dimension];
                        }
                }

                // Now look for an inline style.
                if (isset($this->attr['style'])) {
                        // Thanks to user gnarf from stackoverflow for this regular expression.
                        $declarations = array();

                        preg_match_all(
                                '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
                                $this->attr['style'],
                                $matches,
                                PREG_SET_ORDER
                        );

                        foreach ($matches as $match) {
                                $declarations[$match[1]] = $match[2];
                        }

                        foreach ($dimensions as $dimension) {
                                // Attribute values are strings, hence the loose
                                // comparison against the -1 marker.
                                if (!isset($declarations[$dimension]) || $size[$dimension] != -1) {
                                        continue;
                                }

                                // The value capture is greedy and may carry
                                // whitespace that preceded the ";".
                                $declared = trim($declarations[$dimension]);

                                // Only pixel values are understood.
                                if (strtolower(substr($declared, -2)) !== 'px') {
                                        continue;
                                }

                                $pixels = substr($declared, 0, -2);

                                // filter_var() returns the integer itself, so 0 must
                                // be told apart from the false it returns on failure.
                                if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
                                        $size[$dimension] = $pixels;
                                }
                        }
                }

                return $size;
        }
1175

1176
        /**
         * Render this node and optionally write the result to a file.
         *
         * @param string $filepath Target file; nothing is written when empty.
         *                         The file is written under an exclusive lock.
         *
         * @return string The outer text of this node.
         */
        function save($filepath = '')
        {
                $markup = $this->outertext();

                if ($filepath === '') {
                        return $markup;
                }

                file_put_contents($filepath, $markup, LOCK_EX);

                return $markup;
        }
1186

1187
        /**
         * Add one or more class names to the class attribute.
         *
         * Names already present are not added twice, and empty names (such as
         * those produced by repeated spaces in a string argument) are skipped.
         * An argument of any other type is only reported to the debug object,
         * when one is installed.
         *
         * @param string|array $class Space separated class names, or a list
         *                            of class names.
         */
        function addClass($class)
        {
                // Without this declaration the debug branch below would read an
                // undefined local variable.
                global $debug_object;

                if (is_string($class)) {
                        $class = explode(' ', $class);
                }

                if (!is_array($class)) {
                        if (is_object($debug_object)) {
                                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
                        }
                        return;
                }

                foreach ($class as $c) {
                        if ($c === '') {
                                continue;
                        }

                        if (!isset($this->class)) {
                                $this->class = $c;
                                continue;
                        }

                        if (!$this->hasClass($c)) {
                                $this->class .= ' ' . $c;
                        }
                }
        }
1211

1212
        /**
         * Check whether the class attribute contains a class name.
         *
         * @param string $class Class name to look for. Any other type is
         *                      reported to the debug object (when one is
         *                      installed) and never matches.
         *
         * @return bool True when $class is one of the space separated names
         *              in the class attribute.
         */
        function hasClass($class)
        {
                // Without this declaration the debug branch below would read an
                // undefined local variable.
                global $debug_object;

                if (!is_string($class)) {
                        if (is_object($debug_object)) {
                                $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
                        }
                        return false;
                }

                if (!isset($this->class)) {
                        return false;
                }

                return in_array($class, explode(' ', $this->class), true);
        }
1226

1227
        /**
         * Remove class names from the class attribute.
         *
         * Does nothing when no class attribute is set. Without an argument
         * the whole attribute is removed; the same happens when no name is
         * left after the removal.
         *
         * @param string|array|null $class Space separated class names, a list
         *                                 of class names, or null for all.
         */
        function removeClass($class = null)
        {
                if (!isset($this->class)) {
                        return;
                }

                if ($class === null) {
                        $this->removeAttribute('class');
                        return;
                }

                if (is_string($class)) {
                        $class = explode(' ', $class);
                }

                if (!is_array($class)) {
                        return;
                }

                $remaining = array_diff(explode(' ', $this->class), $class);

                if (count($remaining) === 0) {
                        $this->removeAttribute('class');
                        return;
                }

                $this->class = implode(' ', $remaining);
        }
1251

1252
        /**
         * Get the attribute array of this node.
         *
         * @return array Attribute values keyed by attribute name.
         */
        function getAllAttributes()
        {
                $attributes = $this->attr;

                return $attributes;
        }
1256

1257
        /**
         * Read an attribute by name; forwards to __get().
         *
         * Use this for names that cannot be written as a PHP property, such
         * as names containing a colon.
         *
         * @param string $name Attribute name.
         *
         * @return mixed Whatever __get() returns for $name.
         */
        function getAttribute($name)
        {
                $value = $this->__get($name);

                return $value;
        }
1261

1262
        /**
         * Write an attribute by name; forwards to __set().
         *
         * @param string $name  Attribute name.
         * @param mixed  $value Value to store.
         */
        function setAttribute($name, $value)
        {
                $this->__set($name, $value);

                return;
        }
1266

1267
        /**
         * Check for an attribute by name; forwards to __isset().
         *
         * @param string $name Attribute name.
         *
         * @return bool Whatever __isset() reports for $name.
         */
        function hasAttribute($name)
        {
                $exists = $this->__isset($name);

                return $exists;
        }
1271

1272
        /**
         * Remove an attribute by storing null for it through __set().
         *
         * @param string $name Attribute name.
         */
        function removeAttribute($name)
        {
                $removed_marker = null;

                $this->__set($name, $removed_marker);
        }
1276

1277
        /**
         * Detach this node from its parent.
         *
         * Does nothing when the node has no parent.
         */
        function remove()
        {
                if (!$this->parent) {
                        return;
                }

                $this->parent->removeChild($this);
        }
1283

1284
        /**
         * Remove a child node, including everything below it.
         *
         * The node is only removed when it is found in this node's node list,
         * in its children list and in the node list of the DOM. Its children
         * are removed recursively first, then its remaining nodes are dropped
         * from both the node itself and the DOM, and finally the node is
         * cleared.
         *
         * @param object $node Child node to remove.
         */
        function removeChild($node)
        {
                $node_index = array_search($node, $this->nodes, true);
                $child_index = array_search($node, $this->children, true);
                $dom_index = array_search($node, $this->dom->nodes, true);

                if ($node_index === false || $child_index === false || $dom_index === false) {
                        return;
                }

                foreach ($node->children as $descendant) {
                        $node->removeChild($descendant);
                }

                foreach ($node->nodes as $entity) {
                        $entity_index = array_search($entity, $node->nodes, true);
                        $entity_dom_index = array_search($entity, $node->dom->nodes, true);

                        if ($entity_index === false || $entity_dom_index === false) {
                                continue;
                        }

                        unset($node->nodes[$entity_index]);
                        unset($node->dom->nodes[$entity_dom_index]);
                }

                unset(
                        $this->nodes[$node_index],
                        $this->children[$child_index],
                        $this->dom->nodes[$dom_index]
                );

                $node->clear();
        }
1314

1315
        /**
         * Look up an element by id: calls find() with "#<id>" and index 0.
         *
         * @param string $id Element id.
         *
         * @return mixed Result of find().
         */
        function getElementById($id)
        {
                $selector = '#' . $id;

                return $this->find($selector, 0);
        }
1319

1320
        /**
         * Look up elements by id: calls find() with "#<id>" and $idx.
         *
         * @param string   $id  Element id.
         * @param int|null $idx Index handed to find().
         *
         * @return mixed Result of find().
         */
        function getElementsById($id, $idx = null)
        {
                $selector = '#' . $id;

                return $this->find($selector, $idx);
        }
1324

1325
        /**
         * Look up an element by tag name: calls find() with $name and index 0.
         *
         * @param string $name Tag name (used as the selector).
         *
         * @return mixed Result of find().
         */
        function getElementByTagName($name)
        {
                $first_index = 0;

                return $this->find($name, $first_index);
        }
1329

1330
        /**
         * Look up elements by tag name: calls find() with $name and $idx.
         *
         * @param string   $name Tag name (used as the selector).
         * @param int|null $idx  Index handed to find().
         *
         * @return mixed Result of find().
         */
        function getElementsByTagName($name, $idx = null)
        {
                $found = $this->find($name, $idx);

                return $found;
        }
1334

1335
        /**
         * camelCase alias of parent().
         *
         * @return mixed Result of parent().
         */
        function parentNode()
        {
                $parent = $this->parent();

                return $parent;
        }
1339

1340
        /**
         * camelCase alias of children().
         *
         * @param int $idx Index handed to children(); defaults to -1.
         *
         * @return mixed Result of children().
         */
        function childNodes($idx = -1)
        {
                $children = $this->children($idx);

                return $children;
        }
1344

1345
        /**
         * camelCase alias of first_child().
         *
         * @return mixed Result of first_child().
         */
        function firstChild()
        {
                $child = $this->first_child();

                return $child;
        }
1349

1350
        /**
         * camelCase alias of last_child().
         *
         * @return mixed Result of last_child().
         */
        function lastChild()
        {
                $child = $this->last_child();

                return $child;
        }
1354

1355
        /**
         * camelCase alias of next_sibling().
         *
         * @return mixed Result of next_sibling().
         */
        function nextSibling()
        {
                $sibling = $this->next_sibling();

                return $sibling;
        }
1359

1360
        /**
         * camelCase alias of prev_sibling().
         *
         * @return mixed Result of prev_sibling().
         */
        function previousSibling()
        {
                $sibling = $this->prev_sibling();

                return $sibling;
        }
1364

1365
        /**
         * camelCase alias of has_child().
         *
         * @return mixed Result of has_child().
         */
        function hasChildNodes()
        {
                $has_children = $this->has_child();

                return $has_children;
        }
1369

1370
        /**
         * Get the tag name of this node.
         *
         * @return mixed The value of the tag property.
         */
        function nodeName()
        {
                $tag_name = $this->tag;

                return $tag_name;
        }
1374

1375
        /**
         * Attach a node to this one by calling its parent() with this node.
         *
         * @param object $node Node to attach.
         *
         * @return object The node that was passed in.
         */
        function appendChild($node)
        {
                $new_parent = $this;
                $node->parent($new_parent);

                return $node;
        }
1380

1381
}
1382

1383
class simple_html_dom
1384
{
1385
        public $root = null;
1386
        public $nodes = array();
1387
        public $callback = null;
1388
        public $lowercase = false;
1389
        public $original_size;
1390
        public $size;
1391

1392
        protected $pos;
1393
        protected $doc;
1394
        protected $char;
1395

1396
        protected $cursor;
1397
        protected $parent;
1398
        protected $noise = array();
1399
        protected $token_blank = " \t\r\n";
1400
        protected $token_equal = ' =/>';
1401
        protected $token_slash = " />\r\n\t";
1402
        protected $token_attr = ' >';
1403

1404
        public $_charset = '';
1405
        public $_target_charset = '';
1406

1407
        protected $default_br_text = '';
1408

1409
        public $default_span_text = '';
1410

1411
        protected $self_closing_tags = array(
1412
                'area' => 1,
1413
                'base' => 1,
1414
                'br' => 1,
1415
                'col' => 1,
1416
                'embed' => 1,
1417
                'hr' => 1,
1418
                'img' => 1,
1419
                'input' => 1,
1420
                'link' => 1,
1421
                'meta' => 1,
1422
                'param' => 1,
1423
                'source' => 1,
1424
                'track' => 1,
1425
                'wbr' => 1
1426
        );
1427
        protected $block_tags = array(
1428
                'body' => 1,
1429
                'div' => 1,
1430
                'form' => 1,
1431
                'root' => 1,
1432
                'span' => 1,
1433
                'table' => 1
1434
        );
1435
        protected $optional_closing_tags = array(
1436
                // Not optional, see
1437
                // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1438
                'b' => array('b' => 1),
1439
                'dd' => array('dd' => 1, 'dt' => 1),
1440
                // Not optional, see
1441
                // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1442
                'dl' => array('dd' => 1, 'dt' => 1),
1443
                'dt' => array('dd' => 1, 'dt' => 1),
1444
                'li' => array('li' => 1),
1445
                'optgroup' => array('optgroup' => 1, 'option' => 1),
1446
                'option' => array('optgroup' => 1, 'option' => 1),
1447
                'p' => array('p' => 1),
1448
                'rp' => array('rp' => 1, 'rt' => 1),
1449
                'rt' => array('rp' => 1, 'rt' => 1),
1450
                'td' => array('td' => 1, 'th' => 1),
1451
                'th' => array('td' => 1, 'th' => 1),
1452
                'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1453
        );
1454

1455
        /**
         * Create a DOM container and, when $str is non-empty, parse it.
         *
         * A string starting with "http://" (case-insensitive) or naming an
         * existing file is handed to load_file(); anything else is parsed as
         * markup via load().
         *
         * @param string|null $str             Markup, file path or http URL; falsy values skip parsing.
         * @param bool        $lowercase       Forwarded to load().
         * @param bool        $forceTagsClosed When false, the optional-closing-tag table is emptied.
         * @param string      $target_charset  Stored in $this->_target_charset.
         * @param bool        $stripRN         Forwarded to load().
         * @param string      $defaultBRText   Forwarded to load().
         * @param string      $defaultSpanText Forwarded to load().
         * @param int         $options         Forwarded to load().
         */
        function __construct(
                $str = null,
                $lowercase = true,
                $forceTagsClosed = true,
                $target_charset = DEFAULT_TARGET_CHARSET,
                $stripRN = true,
                $defaultBRText = DEFAULT_BR_TEXT,
                $defaultSpanText = DEFAULT_SPAN_TEXT,
                $options = 0)
        {
                // Forcing tags to be closed implies that we don't trust the html, but
                // it can lead to parsing errors if we SHOULD trust the html.
                //
                // This must target the declared $optional_closing_tags property (the
                // one read_tag() consults) and must happen before any parsing below,
                // otherwise the flag has no effect on the document being loaded.
                if (!$forceTagsClosed) {
                        $this->optional_closing_tags = array();
                }

                $this->_target_charset = $target_charset;

                if ($str) {
                        if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
                                // Note: load_file() parses with its own defaults; the
                                // remaining constructor arguments are not forwarded.
                                $this->load_file($str);
                        } else {
                                $this->load(
                                        $str,
                                        $lowercase,
                                        $stripRN,
                                        $defaultBRText,
                                        $defaultSpanText,
                                        $options
                                );
                        }
                }
        }
1487

1488
        /**
         * Release everything held by this DOM when the object is destroyed.
         *
         * All of the work is done by clear().
         */
        function __destruct()
        {
                $this->clear();
        }
1492

1493
        /**
         * Parse an HTML string into this DOM.
         *
         * Steps, in order: reset state via prepare(), pull "noise" out of the
         * document with remove_noise() (scripts, CDATA, comments, styles,
         * <code> blocks, server-side scripts and optionally Smarty tags), run
         * the parser, then detect the charset.
         *
         * @param string $str             Markup to parse.
         * @param bool   $lowercase       Forwarded to prepare().
         * @param bool   $stripRN         When true, every "\r" and "\n" becomes a space.
         * @param string $defaultBRText   Forwarded to prepare().
         * @param string $defaultSpanText Forwarded to prepare().
         * @param int    $options         Bit mask; HDOM_SMARTY_AS_TEXT enables Smarty stripping.
         * @return $this To allow call chaining.
         */
        function load(
                $str,
                $lowercase = true,
                $stripRN = true,
                $defaultBRText = DEFAULT_BR_TEXT,
                $defaultSpanText = DEFAULT_SPAN_TEXT,
                $options = 0)
        {
                // Imported for parity with the other methods; not read in this one.
                global $debug_object;

                // Reset internal state for the new document.
                $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

                // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
                // script removal precedes style removal.
                // <script ...>...</script> with attributes, then the bare form.
                $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
                $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");

                if ($stripRN) {
                        // Turn carriage returns and line feeds into spaces when asked to.
                        $this->doc = str_replace(array("\r", "\n"), ' ', $this->doc);

                        // The content was rewritten, so refresh the cached length.
                        $this->size = strlen($this->doc);
                }

                // CDATA sections
                $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
                // HTML comments
                $this->remove_noise("'<!--(.*?)-->'is");
                // <style ...>...</style> with attributes, then the bare form
                $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
                $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
                // preformatted <code> blocks
                $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
                // server-side scripts (<? ... ?>)
                $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

                // Smarty scripts, only when the caller opted in.
                if ($options & HDOM_SMARTY_AS_TEXT) {
                        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
                }

                // Build the node tree.
                $this->parse();

                // Close the root node at the final cursor position.
                $this->root->_[HDOM_INFO_END] = $this->cursor;
                $this->parse_charset();

                // Chainable.
                return $this;
        }
1546

1547
        /**
         * Read a document with file_get_contents() and parse it.
         *
         * Every argument given to this method is passed on to
         * file_get_contents() unchanged. The content is then parsed with
         * load($doc, true).
         *
         * @return false|null False when file_get_contents() fails; otherwise
         *                    nothing is returned explicitly.
         */
        function load_file()
        {
                $contents = call_user_func_array('file_get_contents', func_get_args());

                if ($contents === false) {
                        return false;
                }

                $this->load($contents, true);
        }
1557

1558
        /**
         * Register a callback on this DOM.
         *
         * The value is only stored in $this->callback here; where and how it
         * is invoked is outside this method.
         *
         * @param mixed $function_name Callback to store.
         */
        function set_callback($function_name)
        {
                $this->callback = $function_name;
        }
1562

1563
        /**
         * Unregister the callback by resetting $this->callback to null.
         */
        function remove_callback()
        {
                $this->callback = null;
        }
1567

1568
        /**
         * Serialise the document and optionally write it to disk.
         *
         * @param string $filepath Destination file; '' (the default) skips writing.
         * @return mixed The root node's innertext().
         */
        function save($filepath = '')
        {
                $markup = $this->root->innertext();

                if ($filepath === '') {
                        return $markup;
                }

                // Exclusive lock while writing.
                file_put_contents($filepath, $markup, LOCK_EX);

                return $markup;
        }
1574

1575
        /**
         * Run a selector query against the whole document.
         *
         * All three arguments are forwarded unchanged to the root node's find().
         *
         * @param string   $selector  Selector expression.
         * @param int|null $idx       Index forwarded to the root node; null by default.
         * @param bool     $lowercase Forwarded to the root node; false by default.
         * @return mixed Whatever the root node's find() returns.
         */
        function find($selector, $idx = null, $lowercase = false)
        {
                $root = $this->root;

                return $root->find($selector, $idx, $lowercase);
        }
1579

1580
        /**
         * Release the nodes and buffers held by this DOM.
         *
         * Calls clear() on every entry of the node collections, clears and
         * unsets the parent and root links, then unsets the document buffer
         * and the noise table.
         */
        function clear()
        {
                // Node collections first. The `children` check comes from the
                // sourceforge fix 2977248 for memory leaks that persisted even when
                // clear() was used; it is skipped whenever the property is not set.
                foreach (array('nodes', 'children') as $collection) {
                        if (!isset($this->$collection)) {
                                continue;
                        }

                        foreach ($this->$collection as $member) {
                                $member->clear();
                        }
                }

                // Then the single-node links, parent before root.
                foreach (array('parent', 'root') as $link) {
                        if (isset($this->$link)) {
                                $this->$link->clear();
                                unset($this->$link);
                        }
                }

                unset($this->doc, $this->noise);
        }
1612

1613
        /**
         * Debug helper: delegate to the root node's dump().
         *
         * @param bool $show_attr Forwarded unchanged to the root node; true by default.
         */
        function dump($show_attr = true)
        {
                $root = $this->root;
                $root->dump($show_attr);
        }
1617

1618
        /**
         * Reset the parser state and install a fresh root node for $str.
         *
         * @param string $str             Markup to parse; it is trimmed before use.
         * @param bool   $lowercase       Stored in $this->lowercase.
         * @param string $defaultBRText   Stored in $this->default_br_text.
         * @param string $defaultSpanText Stored in $this->default_span_text.
         */
        protected function prepare(
                $str, $lowercase = true,
                $defaultBRText = DEFAULT_BR_TEXT,
                $defaultSpanText = DEFAULT_SPAN_TEXT)
        {
                // Drop whatever a previous load left behind.
                $this->clear();

                // Document buffer and its length.
                $this->doc = trim($str);
                $this->size = strlen($this->doc);
                $this->original_size = $this->size; // length before any noise removal

                // Read position, node cursor and bookkeeping tables.
                $this->pos = 0;
                $this->cursor = 1;
                $this->noise = array();
                $this->nodes = array();

                // Caller-supplied options.
                $this->lowercase = $lowercase;
                $this->default_br_text = $defaultBRText;
                $this->default_span_text = $defaultSpanText;

                // Root node; it also becomes the initial parent for parsing.
                $root = new simple_html_dom_node($this);
                $this->root = $root;
                $root->tag = 'root';
                $root->_[HDOM_INFO_BEGIN] = -1;
                $root->nodetype = HDOM_TYPE_ROOT;
                $this->parent = $root;

                // Prime the current character unless the document is empty.
                if ($this->size > 0) {
                        $this->char = $this->doc[0];
                }
        }
1642

1643
        /**
         * Main parsing loop: alternate between text runs and tags.
         *
         * Text found before the next '<' becomes a text node linked into the
         * tree. When there is no such text, read_tag() is asked to consume a
         * tag. The loop ends once read_tag() reports failure.
         *
         * @return bool Always true.
         */
        protected function parse()
        {
                for (;;) {
                        $text = $this->copy_until_char('<');

                        if ($text === '') {
                                // Nothing between the current position and the next
                                // opening tag: read the tag, or stop if there is none.
                                if (!$this->read_tag()) {
                                        return true;
                                }

                                continue;
                        }

                        // Text between tags becomes its own node.
                        $node = new simple_html_dom_node($this);
                        ++$this->cursor;
                        $node->_[HDOM_INFO_TEXT] = $text;
                        $this->link_nodes($node, false);
                }
        }
1663

1664
        /**
         * Work out the document's character set and store it in $this->_charset.
         *
         * Sources are tried in this order, stopping at the first non-empty one:
         *   1. the charset of the last HTTP Content-Type header, when the
         *      function get_last_retrieve_url_contents_content_type() exists;
         *   2. a <meta http-equiv="Content-Type"> element (ISO-8859-1 is
         *      assumed when its content carries no charset);
         *   3. a <meta charset> element;
         *   4. mb_detect_encoding(), when the mbstring extension is available;
         *   5. a plain UTF-8 fallback.
         * ISO-8859-1 and its "latin1"/"latin-1" spellings are finally mapped
         * to CP1252.
         *
         * Progress is reported through the global $debug_object when it is an
         * object.
         *
         * @return string The detected charset.
         */
        protected function parse_charset()
        {
                global $debug_object;

                $charset = null;

                // 1. HTTP header, if the host code exposes it.
                if (function_exists('get_last_retrieve_url_contents_content_type')) {
                        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
                        $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
                        if ($success) {
                                $charset = $matches[1];
                                if (is_object($debug_object)) {
                                        $debug_object->debug_log(2,
                                                'header content-type found charset of: '
                                                . $charset
                                        );
                                }
                        }
                }

                // 2. <meta http-equiv="Content-Type" content="...; charset=...">
                if (empty($charset)) {
                        // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
                        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

                        if (!empty($el)) {
                                $fullvalue = $el->content;
                                if (is_object($debug_object)) {
                                        $debug_object->debug_log(2,
                                                'meta content-type tag found'
                                                . $fullvalue
                                        );
                                }

                                if (!empty($fullvalue)) {
                                        $success = preg_match(
                                                '/charset=(.+)/i',
                                                $fullvalue,
                                                $matches
                                        );

                                        if ($success) {
                                                $charset = $matches[1];
                                        } else {
                                                // If there is a meta tag, and they don't specify the
                                                // character set, research says that it's typically
                                                // ISO-8859-1
                                                if (is_object($debug_object)) {
                                                        $debug_object->debug_log(2,
                                                                'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
                                                        );
                                                }

                                                $charset = 'ISO-8859-1';
                                        }
                                }
                        }
                }

                // 3. <meta charset="...">
                if (empty($charset)) {
                        // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
                        if ($meta = $this->root->find('meta[charset]', 0)) {
                                $charset = $meta->charset;
                                if (is_object($debug_object)) {
                                        $debug_object->debug_log(2, 'meta charset: ' . $charset);
                                }
                        }
                }

                // 4. Guess from the content itself.
                if (empty($charset)) {
                        // Requires Multibyte String (mbstring) support (optional)
                        if (function_exists('mb_detect_encoding')) {
                                /**
                                 * mb_detect_encoding() is not intended to distinguish between
                                 * charsets, especially single-byte charsets. Its primary
                                 * purpose is to detect which multibyte encoding is in use,
                                 * i.e. UTF-8, UTF-16, shift-JIS, etc.
                                 *
                                 * -- https://bugs.php.net/bug.php?id=38138
                                 *
                                 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
                                 * always result in CP1251/ISO-8859-5 and vice versa.
                                 *
                                 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
                                 * to stay compatible.
                                 */
                                $encoding = mb_detect_encoding(
                                        $this->doc,
                                        array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
                                );

                                if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
                                        // Due to a limitation of mb_detect_encoding
                                        // 'CP1251'/'ISO-8859-5' will be detected as
                                        // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
                                        // which case we can simply assume it is the other charset.
                                        if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
                                                $encoding = 'CP1251';
                                        }
                                }

                                if ($encoding !== false) {
                                        $charset = $encoding;
                                        if (is_object($debug_object)) {
                                                $debug_object->debug_log(2, 'mb_detect: ' . $charset);
                                        }
                                }
                        }
                }

                // 5. Last resort.
                if (empty($charset)) {
                        // Assume it's UTF-8 as it is the most likely charset to be used
                        $charset = 'UTF-8';
                        if (is_object($debug_object)) {
                                $debug_object->debug_log(2, 'No match found, assume ' . $charset);
                        }
                }

                // Since CP1252 is a superset, if we get one of its subsets, we want
                // it instead.
                $charset_lower = strtolower($charset);
                if ($charset_lower === 'iso-8859-1'
                        || $charset_lower === 'latin1'
                        || $charset_lower === 'latin-1') {
                        // Log before overwriting so the message names the charset that
                        // is actually being replaced.
                        if (is_object($debug_object)) {
                                $debug_object->debug_log(2,
                                        'replacing ' . $charset . ' with CP1252 as its a superset'
                                );
                        }

                        $charset = 'CP1252';
                }

                if (is_object($debug_object)) {
                        $debug_object->debug_log(1, 'EXIT - ' . $charset);
                }

                return $this->_charset = $charset;
        }
1801

1802
        protected function read_tag()
1803
        {
1804
                // Set end position if no further tags found
1805
                if ($this->char !== '<') {
2✔
1806
                        $this->root->_[HDOM_INFO_END] = $this->cursor;
2✔
1807
                        return false;
2✔
1808
                }
1809

1810
                $begin_tag_pos = $this->pos;
2✔
1811
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2✔
1812

1813
                // end tag
1814
                if ($this->char === '/') {
2✔
1815
                        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2✔
1816

1817
                        // Skip whitespace in end tags (i.e. in "</   html>")
1818
                        $this->skip($this->token_blank);
2✔
1819
                        $tag = $this->copy_until_char('>');
2✔
1820

1821
                        // Skip attributes in end tags
1822
                        if (($pos = strpos($tag, ' ')) !== false) {
2✔
1823
                                $tag = substr($tag, 0, $pos);
×
1824
                        }
1825

1826
                        $parent_lower = strtolower($this->parent->tag);
2✔
1827
                        $tag_lower = strtolower($tag);
2✔
1828

1829
                        // The end tag is supposed to close the parent tag. Handle situations
1830
                        // when it doesn't
1831
                        if ($parent_lower !== $tag_lower) {
2✔
1832
                                // Parent tag does not have to be closed necessarily (optional closing tag)
1833
                                // Current tag is a block tag, so it may close an ancestor
1834
                                if (isset($this->optional_closing_tags[$parent_lower])
×
1835
                                        && isset($this->block_tags[$tag_lower])) {
×
1836

1837
                                        $this->parent->_[HDOM_INFO_END] = 0;
×
1838
                                        $org_parent = $this->parent;
×
1839

1840
                                        // Traverse ancestors to find a matching opening tag
1841
                                        // Stop at root node
1842
                                        while (($this->parent->parent)
×
1843
                                                && strtolower($this->parent->tag) !== $tag_lower
×
1844
                                        ){
1845
                                                $this->parent = $this->parent->parent;
×
1846
                                        }
1847

1848
                                        // If we don't have a match add current tag as text node
1849
                                        if (strtolower($this->parent->tag) !== $tag_lower) {
×
1850
                                                $this->parent = $org_parent; // restore origonal parent
×
1851

1852
                                                if ($this->parent->parent) {
×
1853
                                                        $this->parent = $this->parent->parent;
×
1854
                                                }
1855

1856
                                                $this->parent->_[HDOM_INFO_END] = $this->cursor;
×
1857
                                                return $this->as_text_node($tag);
×
1858
                                        }
1859
                                } elseif (($this->parent->parent)
×
1860
                                        && isset($this->block_tags[$tag_lower])
×
1861
                                ) {
1862
                                        // Grandparent exists and current tag is a block tag, so our
1863
                                        // parent doesn't have an end tag
1864
                                        $this->parent->_[HDOM_INFO_END] = 0; // No end tag
×
1865
                                        $org_parent = $this->parent;
×
1866

1867
                                        // Traverse ancestors to find a matching opening tag
1868
                                        // Stop at root node
1869
                                        while (($this->parent->parent)
×
1870
                                                && strtolower($this->parent->tag) !== $tag_lower
×
1871
                                        ) {
1872
                                                $this->parent = $this->parent->parent;
×
1873
                                        }
1874

1875
                                        // If we don't have a match add current tag as text node
1876
                                        if (strtolower($this->parent->tag) !== $tag_lower) {
×
1877
                                                $this->parent = $org_parent; // restore origonal parent
×
1878
                                                $this->parent->_[HDOM_INFO_END] = $this->cursor;
×
1879
                                                return $this->as_text_node($tag);
×
1880
                                        }
1881
                                } elseif (($this->parent->parent)
×
1882
                                        && strtolower($this->parent->parent->tag) === $tag_lower
×
1883
                                ) { // Grandparent exists and current tag closes it
1884
                                        $this->parent->_[HDOM_INFO_END] = 0;
×
1885
                                        $this->parent = $this->parent->parent;
×
1886
                                } else { // Random tag, add as text node
1887
                                        return $this->as_text_node($tag);
×
1888
                                }
1889
                        }
1890

1891
                        // Set end position of parent tag to current cursor position
1892
                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
2✔
1893

1894
                        if ($this->parent->parent) {
2✔
1895
                                $this->parent = $this->parent->parent;
2✔
1896
                        }
1897

1898
                        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2✔
1899
                        return true;
2✔
1900
                }
1901

1902
                // start tag
1903
                $node = new simple_html_dom_node($this);
2✔
1904
                $node->_[HDOM_INFO_BEGIN] = $this->cursor;
2✔
1905
                ++$this->cursor;
2✔
1906
                $tag = $this->copy_until($this->token_slash); // Get tag name
2✔
1907
                $node->tag_start = $begin_tag_pos;
2✔
1908

1909
                // doctype, cdata & comments...
1910
                // <!DOCTYPE html>
1911
                // <![CDATA[ ... ]]>
1912
                // <!-- Comment -->
1913
                if (isset($tag[0]) && $tag[0] === '!') {
2✔
1914
                        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
2✔
1915

1916
                        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
2✔
1917
                                $node->nodetype = HDOM_TYPE_COMMENT;
×
1918
                                $node->tag = 'comment';
×
1919
                        } else { // Could be doctype or CDATA but we don't care
1920
                                $node->nodetype = HDOM_TYPE_UNKNOWN;
2✔
1921
                                $node->tag = 'unknown';
2✔
1922
                        }
1923

1924
                        if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2✔
1925

1926
                        $this->link_nodes($node, true);
2✔
1927
                        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2✔
1928
                        return true;
2✔
1929
                }
1930

1931
                // The start tag cannot contain another start tag, if so add as text
1932
                // i.e. "<<html>"
1933
                if ($pos = strpos($tag, '<') !== false) {
2✔
1934
                        $tag = '<' . substr($tag, 0, -1);
×
1935
                        $node->_[HDOM_INFO_TEXT] = $tag;
×
1936
                        $this->link_nodes($node, false);
×
1937
                        $this->char = $this->doc[--$this->pos]; // prev
×
1938
                        return true;
×
1939
                }
1940

1941
                // Handle invalid tag names (i.e. "<html#doc>")
1942
                if (!preg_match('/^\w[\w:-]*$/', $tag)) {
2✔
1943
                        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
×
1944

1945
                        // Next char is the beginning of a new tag, don't touch it.
1946
                        if ($this->char === '<') {
×
1947
                                $this->link_nodes($node, false);
×
1948
                                return true;
×
1949
                        }
1950

1951
                        // Next char closes current tag, add and be done with it.
1952
                        if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
×
1953
                        $this->link_nodes($node, false);
×
1954
                        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
×
1955
                        return true;
×
1956
                }
1957

1958
                // begin tag, add new node
1959
                $node->nodetype = HDOM_TYPE_ELEMENT;
2✔
1960
                $tag_lower = strtolower($tag);
2✔
1961
                $node->tag = ($this->lowercase) ? $tag_lower : $tag;
2✔
1962

1963
                // handle optional closing tags
1964
                if (isset($this->optional_closing_tags[$tag_lower])) {
2✔
1965
                        // Traverse ancestors to close all optional closing tags
1966
                        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
2✔
1967
                                $this->parent->_[HDOM_INFO_END] = 0;
×
1968
                                $this->parent = $this->parent->parent;
×
1969
                        }
1970
                        $node->parent = $this->parent;
2✔
1971
                }
1972

1973
                $guard = 0; // prevent infinity loop
2✔
1974

1975
                // [0] Space between tag and first attribute
1976
                $space = array($this->copy_skip($this->token_blank), '', '');
2✔
1977

1978
                // attributes
1979
                do {
1980
                        // Everything until the first equal sign should be the attribute name
1981
                        $name = $this->copy_until($this->token_equal);
2✔
1982

1983
                        if ($name === '' && $this->char !== null && $space[0] === '') {
2✔
1984
                                break;
2✔
1985
                        }
1986

1987
                        if ($guard === $this->pos) { // Escape infinite loop
2✔
1988
                                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
×
1989
                                continue;
×
1990
                        }
1991

1992
                        $guard = $this->pos;
2✔
1993

1994
                        // handle endless '<'
1995
                        // Out of bounds before the tag ended
1996
                        if ($this->pos >= $this->size - 1 && $this->char !== '>') {
2✔
1997
                                $node->nodetype = HDOM_TYPE_TEXT;
×
1998
                                $node->_[HDOM_INFO_END] = 0;
×
1999
                                $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
×
2000
                                $node->tag = 'text';
×
2001
                                $this->link_nodes($node, false);
×
2002
                                return true;
×
2003
                        }
2004

2005
                        // handle mismatch '<'
2006
                        // Attributes cannot start after opening tag
2007
                        if ($this->doc[$this->pos - 1] == '<') {
2✔
2008
                                $node->nodetype = HDOM_TYPE_TEXT;
×
2009
                                $node->tag = 'text';
×
2010
                                $node->attr = array();
×
2011
                                $node->_[HDOM_INFO_END] = 0;
×
2012
                                $node->_[HDOM_INFO_TEXT] = substr(
×
2013
                                        $this->doc,
×
2014
                                        $begin_tag_pos,
×
2015
                                        $this->pos - $begin_tag_pos - 1
×
2016
                                );
×
2017
                                $this->pos -= 2;
×
2018
                                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
×
2019
                                $this->link_nodes($node, false);
×
2020
                                return true;
×
2021
                        }
2022

2023
                        if ($name !== '/' && $name !== '') { // this is a attribute name
2✔
2024
                                // [1] Whitespace after attribute name
2025
                                $space[1] = $this->copy_skip($this->token_blank);
2✔
2026

2027
                                $name = $this->restore_noise($name); // might be a noisy name
2✔
2028

2029
                                if ($this->lowercase) { $name = strtolower($name); }
2✔
2030

2031
                                if ($this->char === '=') { // attribute with value
2✔
2032
                                        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2✔
2033
                                        $this->parse_attr($node, $name, $space); // get attribute value
2✔
2034
                                } else {
2035
                                        //no value attr: nowrap, checked selected...
2036
                                        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
×
2037
                                        $node->attr[$name] = true;
×
2038
                                        if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
×
2039
                                }
2040

2041
                                $node->_[HDOM_INFO_SPACE][] = $space;
2✔
2042

2043
                                // prepare for next attribute
2044
                                $space = array(
2✔
2045
                                        $this->copy_skip($this->token_blank),
2✔
2046
                                        '',
2✔
2047
                                        ''
2✔
2048
                                );
2✔
2049
                        } else { // no more attributes
2050
                                break;
×
2051
                        }
2052
                } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
2✔
2053

2054
                $this->link_nodes($node, true);
2✔
2055
                $node->_[HDOM_INFO_ENDSPACE] = $space[0];
2✔
2056

2057
                // handle empty tags (i.e. "<div/>")
2058
                if ($this->copy_until_char('>') === '/') {
2✔
2059
                        $node->_[HDOM_INFO_ENDSPACE] .= '/';
2✔
2060
                        $node->_[HDOM_INFO_END] = 0;
2✔
2061
                } else {
2062
                        // reset parent
2063
                        if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
2✔
2064
                                $this->parent = $node;
2✔
2065
                        }
2066
                }
2067

2068
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2✔
2069

2070
                // If it's a BR tag, we need to set it's text to the default text.
2071
                // This way when we see it in plaintext, we can generate formatting that the user wants.
2072
                // since a br tag never has sub nodes, this works well.
2073
                if ($node->tag === 'br') {
2✔
2074
                        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
×
2075
                }
2076

2077
                return true;
2✔
2078
        }
2079

2080
        /**
         * Read the value of attribute $name at the current parser position and
         * store it on $node.
         *
         * The value may be double-quoted, single-quoted or unquoted. If $node
         * already has an attribute called $name, the value is consumed from the
         * document but discarded, so the first occurrence wins.
         *
         * @param object $node  Node whose `attr` and `_[HDOM_INFO_QUOTE]` entries are filled in.
         * @param string $name  Attribute name (the caller has already consumed the "=").
         * @param array  $space Passed by reference; index 2 receives the whitespace
         *                      found between "=" and the value (first occurrence only).
         * @return void
         */
        protected function parse_attr($node, $name, &$space)
        {
                $is_duplicate = isset($node->attr[$name]);

                // Always consume the whitespace between "=" and the value. Skipping
                // this for duplicates left the parser sitting on the blank, which
                // produced an empty value and made the caller read the real value
                // (e.g. "y" in `href = "y"`) as a bogus value-less attribute.
                $blank = $this->copy_skip($this->token_blank);

                if (!$is_duplicate) {
                        // Only the first occurrence contributes to the recorded spacing.
                        $space[2] = $blank;
                }

                if ($this->char === '"' || $this->char === '\'') {
                        // Quoted value: everything up to the matching quote character.
                        $quote = $this->char;
                        $quote_type = ($quote === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
                        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step over opening quote
                        $value = $this->copy_until_char($quote);
                        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step over closing quote
                } else {
                        // Unquoted value: runs until one of the token_attr characters.
                        $quote_type = HDOM_QUOTE_NO;
                        $value = $this->copy_until($this->token_attr);
                }

                $value = $this->restore_noise($value);

                // PaperG: Attributes should not have \r or \n in them, that counts as
                // html whitespace.
                $value = str_replace(array("\r", "\n"), '', $value);

                // PaperG: If this is a "class" selector, lets get rid of the preceding
                // and trailing space since some people leave it in the multi class case.
                if ($name === 'class') {
                        $value = trim($value);
                }

                if (!$is_duplicate) {
                        $node->_[HDOM_INFO_QUOTE][] = $quote_type;
                        $node->attr[$name] = $value;
                }
        }
2123

2124
        /**
         * Attach $node to the current parent.
         *
         * The node's `parent` is set to `$this->parent` and the node is appended
         * to that parent's `nodes` list; it is also appended to the parent's
         * `children` list when $is_child is truthy.
         *
         * @param object $node     Node to attach (received by reference).
         * @param bool   $is_child Whether to register the node in `children` as well.
         * @return void
         */
        protected function link_nodes(&$node, $is_child)
        {
                $node->parent = $this->parent;
                $this->parent->nodes[] = $node;

                if (!$is_child) {
                        return;
                }

                $this->parent->children[] = $node;
        }
2132

2133
        /**
         * Emit a closing tag as a plain node holding the literal text "</tag>".
         *
         * A new node is created, the cursor is incremented, the node is linked
         * under the current parent (not as a child element) and the parser
         * advances by one character.
         *
         * @param string $tag Tag name to render between "</" and ">".
         * @return bool Always true.
         */
        protected function as_text_node($tag)
        {
                $text_node = new simple_html_dom_node($this);
                ++$this->cursor;

                $text_node->_[HDOM_INFO_TEXT] = "</{$tag}>";
                $this->link_nodes($text_node, false);

                // Advance to the next character, or null at end of document.
                ++$this->pos;
                if ($this->pos < $this->size) {
                        $this->char = $this->doc[$this->pos];
                } else {
                        $this->char = null;
                }

                return true;
        }
2142

2143
        /**
         * Advance the parser past every leading character contained in $chars.
         *
         * Updates `$this->pos` and `$this->char`; `$this->char` becomes null
         * when the end of the document is reached.
         *
         * @param string $chars Set of characters to skip over.
         * @return void
         */
        protected function skip($chars)
        {
                $run_length = strspn($this->doc, $chars, $this->pos);
                $this->pos += $run_length;

                if ($this->pos < $this->size) {
                        $this->char = $this->doc[$this->pos];
                } else {
                        $this->char = null;
                }
        }
2148

2149
        /**
         * Advance past the leading run of characters contained in $chars and
         * return that run.
         *
         * Updates `$this->pos` and `$this->char`; `$this->char` becomes null
         * when the end of the document is reached.
         *
         * @param string $chars Set of characters to consume.
         * @return string The consumed characters, or '' when none matched.
         */
        protected function copy_skip($chars)
        {
                $start = $this->pos;
                $run_length = strspn($this->doc, $chars, $start);

                $this->pos = $start + $run_length;
                if ($this->pos < $this->size) {
                        $this->char = $this->doc[$this->pos];
                } else {
                        $this->char = null;
                }

                return ($run_length === 0) ? '' : substr($this->doc, $start, $run_length);
        }
2158

2159
        /**
         * Advance up to (not including) the first character contained in $chars
         * and return the text that was passed over.
         *
         * Updates `$this->pos` and `$this->char`; `$this->char` becomes null
         * when the end of the document is reached.
         *
         * @param string $chars Set of stop characters.
         * @return string The text between the old and the new position.
         */
        protected function copy_until($chars)
        {
                $start = $this->pos;
                $run_length = strcspn($this->doc, $chars, $start);

                $this->pos = $start + $run_length;
                if ($this->pos < $this->size) {
                        $this->char = $this->doc[$this->pos];
                } else {
                        $this->char = null;
                }

                return substr($this->doc, $start, $run_length);
        }
2167

2168
        /**
         * Advance up to (not including) the next occurrence of $char and return
         * the text that was passed over.
         *
         * - Returns '' without moving when the parser is already at the end
         *   (`$this->char === null`) or already positioned on $char.
         * - When $char does not occur again, the remainder of the document is
         *   returned and the parser is moved to the end (`$this->char` = null).
         *
         * @param string $char Character (or string) to search for.
         * @return string The text between the old position and the match.
         */
        protected function copy_until_char($char)
        {
                if ($this->char === null) {
                        return '';
                }

                $start = $this->pos;
                $found = strpos($this->doc, $char, $start);

                if ($found === false) {
                        // No further occurrence: hand back everything that is left.
                        $remainder = substr($this->doc, $start, $this->size - $start);
                        $this->char = null;
                        $this->pos = $this->size;
                        return $remainder;
                }

                if ($found === $start) {
                        return '';
                }

                $this->char = $this->doc[$found];
                $this->pos = $found;

                return substr($this->doc, $start, $found - $start);
        }
2186

2187
        /**
         * Replace every match of $pattern in the document with a placeholder key
         * and remember the replaced text in `$this->noise`.
         *
         * Keys have the form "___noise___" followed by a 5-character,
         * space-padded number (current noise count + 1000). Matches are handled
         * from last to first, so replacing one match does not shift the offsets
         * of the matches still to be processed. Afterwards `$this->size` is
         * refreshed and, for a non-empty document, `$this->char` is reset to
         * the first character.
         *
         * @param string $pattern    PCRE pattern to search for.
         * @param bool   $remove_tag True to replace the entire match (group 0),
         *                           false to replace only the first sub-match (group 1).
         * @return void
         */
        protected function remove_noise($pattern, $remove_tag = false)
        {
                global $debug_object;
                if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

                // 0 = entire match, 1 = submatch
                $group = $remove_tag ? 0 : 1;

                $total = preg_match_all(
                        $pattern,
                        $this->doc,
                        $found,
                        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
                );

                $i = $total - 1;
                while ($i > -1) {
                        $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

                        if (is_object($debug_object)) {
                                $debug_object->debug_log(2, 'key is: ' . $key);
                        }

                        // With PREG_OFFSET_CAPTURE each capture is array(text, offset).
                        $capture = $found[$i][$group];
                        $this->noise[$key] = $capture[0];
                        $this->doc = substr_replace($this->doc, $key, $capture[1], strlen($capture[0]));

                        --$i;
                }

                // reset the length of content
                $this->size = strlen($this->doc);

                if ($this->size > 0) {
                        $this->char = $this->doc[0];
                }
        }
2218

2219
        /**
         * Replace every "___noise___NNNNN" placeholder in $text with the text
         * stored under that key in `$this->noise`.
         *
         * Replacement text is scanned again, so a stored entry that itself
         * contains an (older) placeholder is expanded as well.
         *
         * Two situations used to make this loop spin forever and are now
         * bounded:
         * - A placeholder whose key is unknown is rewritten to
         *   "UNDEFINED NOISE FOR KEY: <key>". That text still contains the
         *   marker, so scanning resumes just behind the re-inserted marker
         *   instead of from the start of the string.
         * - A stored entry that (directly or indirectly) contains its own key.
         *   The number of successful expansions per call is limited to the
         *   number of stored noise entries; once exceeded, the remaining
         *   placeholders are left in the returned text.
         *
         * @param string $text Text that may contain noise placeholders.
         * @return string $text with placeholders substituted.
         */
        function restore_noise($text)
        {
                global $debug_object;
                if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

                $marker = '___noise___';
                $marker_len = strlen($marker); // 11
                $key_len = $marker_len + 5;    // marker + 5-character number = 16
                $undefined_prefix = 'UNDEFINED NOISE FOR KEY: ';

                // Lowest offset that still needs scanning; only raised after an
                // undefined key so that its re-inserted marker is not found again.
                $offset = 0;

                // Upper bound on expansions; guards against self-referencing entries.
                $expansions_left = count($this->noise);

                while (($pos = strpos($text, $marker, $offset)) !== false) {
                        // Sometimes there is a broken piece of markup, and we don't GET the
                        // pos+11 etc... token which indicates a problem outside of us...
                        if (strlen($text) > $pos + 15) {
                                $key = substr($text, $pos, $key_len);

                                if (is_object($debug_object)) {
                                        $debug_object->debug_log(2, 'located key of: ' . $key);
                                }

                                if (isset($this->noise[$key])) {
                                        if ($expansions_left < 1) {
                                                // More expansions than stored entries: the noise
                                                // table is cyclic for this text, stop here.
                                                break;
                                        }
                                        --$expansions_left;

                                        $text = substr($text, 0, $pos)
                                        . $this->noise[$key]
                                        . substr($text, $pos + $key_len);
                                } else {
                                        $text = substr($text, 0, $pos)
                                        . $undefined_prefix
                                        . $key
                                        . substr($text, $pos + $key_len);

                                        // Continue right behind the marker that was just
                                        // written back; everything after it is unchanged input.
                                        $offset = $pos + strlen($undefined_prefix) + $marker_len;
                                }
                        } else {
                                // There is no valid key being given back to us... We must get
                                // rid of the ___noise___ or we will have a problem.
                                $text = substr($text, 0, $pos)
                                . 'NO NUMERIC NOISE KEY'
                                . substr($text, $pos + $marker_len);
                        }
                }

                return $text;
        }
2264

2265
        /**
         * Find the first stored noise entry that contains $text.
         *
         * @param string $text Substring to look for.
         * @return string|null The matching noise entry, or null when none contains $text.
         */
        function search_noise($text)
        {
                global $debug_object;
                if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

                foreach ($this->noise as $candidate) {
                        if (strpos($candidate, $text) === false) {
                                continue;
                        }

                        return $candidate;
                }

                return null;
        }
2276

2277
        /**
         * String conversion: returns the result of `innertext()` on the root node.
         *
         * @return string
         */
        function __toString()
        {
                $root = $this->root;

                return $root->innertext();
        }
2281

2282
        /**
         * Magic getter for a fixed set of virtual properties.
         *
         * - `outertext` and `innertext` both return `$this->root->innertext()`.
         * - `plaintext` returns `$this->root->text()`.
         * - `charset` returns `$this->_charset`.
         * - `target_charset` returns `$this->_target_charset`.
         *
         * Any other name yields null (nothing is returned).
         *
         * @param string $name Requested property name.
         * @return mixed
         */
        function __get($name)
        {
                switch ($name) {
                        case 'outertext':
                        case 'innertext':
                                // On the document both map to the root's inner text.
                                return $this->root->innertext();

                        case 'plaintext':
                                return $this->root->text();

                        case 'charset':
                                return $this->_charset;

                        case 'target_charset':
                                return $this->_target_charset;
                }
        }
2297

2298
        /**
         * camelCase alias: forwards to `childNodes()` on the root node.
         *
         * @param int $idx Passed through unchanged (default -1).
         * @return mixed Whatever the root node's `childNodes()` returns.
         */
        function childNodes($idx = -1)
        {
                $root = $this->root;

                return $root->childNodes($idx);
        }
2302

2303
        /**
         * camelCase alias: forwards to `first_child()` on the root node.
         *
         * @return mixed Whatever the root node's `first_child()` returns.
         */
        function firstChild()
        {
                $root = $this->root;

                return $root->first_child();
        }
2307

2308
        /**
         * camelCase alias: forwards to `last_child()` on the root node.
         *
         * @return mixed Whatever the root node's `last_child()` returns.
         */
        function lastChild()
        {
                $root = $this->root;

                return $root->last_child();
        }
2312

2313
        /**
         * Build an element by parsing the markup "<name>value</name>" with
         * `str_get_html()` and returning the `firstChild()` of the result.
         *
         * The whole expression runs under the `@` operator, so diagnostics
         * raised while building or parsing the markup are suppressed.
         *
         * @param string      $name  Tag name.
         * @param string|null $value Content placed between the tags (default null).
         * @return mixed The first child of the parsed fragment.
         */
        function createElement($name, $value = null)
        {
                return @str_get_html('<' . $name . '>' . $value . '</' . $name . '>')->firstChild();
        }
2317

2318
        /**
         * Parse $value with `str_get_html()` and return the last entry of the
         * resulting document's `nodes` list.
         *
         * The expression runs under the `@` operator, so diagnostics raised
         * while parsing or while calling `end()` are suppressed.
         *
         * @param string $value Text to parse.
         * @return mixed The last node, as returned by `end()`.
         */
        function createTextNode($value)
        {
                return @end(
                        str_get_html($value)->nodes
                );
        }
2322

2323
        /**
         * camelCase helper: runs `find()` with the selector "#<id>" and index 0.
         *
         * @param string $id Value appended to "#" to form the selector.
         * @return mixed Whatever `find()` returns for index 0.
         */
        function getElementById($id)
        {
                $selector = '#' . $id;

                return $this->find($selector, 0);
        }
2327

2328
        /**
         * camelCase helper: runs `find()` with the selector "#<id>".
         *
         * @param string   $id  Value appended to "#" to form the selector.
         * @param int|null $idx Passed through to `find()` unchanged (default null).
         * @return mixed Whatever `find()` returns.
         */
        function getElementsById($id, $idx = null)
        {
                $selector = '#' . $id;

                return $this->find($selector, $idx);
        }
2332

2333
        /**
         * camelCase helper: runs `find()` with $name as the selector and index 0.
         *
         * @param string $name Selector handed to `find()`.
         * @return mixed Whatever `find()` returns for index 0.
         */
        function getElementByTagName($name)
        {
                $first_index = 0;

                return $this->find($name, $first_index);
        }
2337

2338
        /**
         * camelCase helper: runs `find()` with $name as the selector.
         *
         * @param string $name Selector handed to `find()`.
         * @param int    $idx  Passed through to `find()` unchanged (default -1).
         * @return mixed Whatever `find()` returns.
         */
        function getElementsByTagName($name, $idx = -1)
        {
                $result = $this->find($name, $idx);

                return $result;
        }
2342

2343
        /**
         * camelCase alias for `load_file()`.
         *
         * All arguments are forwarded one-to-one. Previously the argument list
         * was passed on as a single array argument, so `load_file()` received
         * an array where it expects the first positional argument, and its
         * result was dropped.
         *
         * NOTE(review): `load_file()` is not visible in this chunk; this assumes
         * it reads its own arguments positionally / via func_get_args() — confirm.
         *
         * @return mixed Whatever `load_file()` returns.
         */
        function loadFile()
        {
                $args = func_get_args();

                // Spread the arguments instead of handing over the array itself.
                return call_user_func_array(array($this, 'load_file'), $args);
        }
2348
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc