• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

brick / structured-data / 19978974834

05 Dec 2025 11:31PM UTC coverage: 76.101% (-1.6%) from 77.67%
19978974834

Pull #7

github

web-flow
Merge 6695ae9b0 into 251e970ec
Pull Request #7: Add compatibility with PHP 8.4 Dom\HTMLDocument

15 of 21 new or added lines in 4 files covered. (71.43%)

3 existing lines in 1 file now uncovered.

242 of 318 relevant lines covered (76.1%)

1.82 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

74.7
/src/Reader/MicrodataReader.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace Brick\StructuredData\Reader;
6

7
use Brick\StructuredData\Item;
8
use Brick\StructuredData\Reader;
9
use Dom\Document;
10
use Dom\Node;
11
use Dom\XPath;
12
use DOMDocument;
13
use DOMNode;
14
use DOMXPath;
15
use Override;
16
use Sabre\Uri\InvalidUriException;
17

18
use function array_filter;
19
use function array_map;
20
use function array_values;
21
use function assert;
22
use function class_exists;
23
use function explode;
24
use function in_array;
25
use function iterator_to_array;
26
use function preg_replace;
27
use function Sabre\Uri\resolve;
28
use function str_contains;
29
use function strpos;
30
use function strrpos;
31
use function substr;
32
use function trim;
33

34
/**
35
 * Reads Microdata embedded into an HTML document.
36
 *
37
 * https://www.w3.org/TR/microdata/
38
 *
39
 * @todo support for the itemref attribute
40
 */
41
final class MicrodataReader implements Reader
42
{
43
    /**
     * Collects every top-level Microdata item found in the given document.
     *
     * Both the Dom\Document API and the legacy DOMDocument API are accepted; each is queried through its own
     * XPath implementation.
     *
     * @param Document|DOMDocument $document The parsed document to read items from.
     * @param string               $url      The URL the document was retrieved from, for relative URL resolution.
     *
     * @return Item[] One item per matching element, in the order returned by the XPath query.
     */
    #[Override]
    public function read(Document|DOMDocument $document, string $url): array
    {
        if ($document instanceof DOMDocument) {
            $xpath = new DOMXPath($document);
        } else {
            assert(class_exists(XPath::class));
            $xpath = new XPath($document);
        }

        /**
         * An item is a top-level Microdata item if its element does not have an itemprop attribute.
         *
         * https://www.w3.org/TR/microdata/#associating-names-with-items
         */
        $topLevelElements = iterator_to_array($xpath->query('//*[@itemscope and not(@itemprop)]'));

        $items = [];

        foreach ($topLevelElements as $element) {
            $items[] = $this->nodeToItem($element, $xpath, $url);
        }

        return $items;
    }
66

67
    /**
     * Extracts information from a (DOM)Node into an Item.
     *
     * The item's identifier comes from the itemid attribute, its types from the itemtype attribute, and its
     * properties from every descendant element carrying an itemprop attribute that is not enclosed in a nested item.
     *
     * @param DOMNode|Node   $node  A (DOM)Node representing an element with the itemscope attribute.
     * @param DOMXPath|XPath $xpath A (DOM)XPath object created from the node's document element.
     * @param string         $url   The URL the document was retrieved from, for relative URL resolution.
     */
    private function nodeToItem(DOMNode|Node $node, DOMXPath|XPath $xpath, string $url): Item
    {
        $id = null;
        $itemid = $node->attributes->getNamedItem('itemid');

        if ($itemid !== null) {
            /**
             * The global identifier of an item is the value of its element's itemid attribute, if it has one, resolved
             * relative to the element on which the attribute is specified. If the itemid attribute is missing or if
             * resolving it fails, it is said to have no global identifier.
             *
             * https://www.w3.org/TR/microdata/#items
             */
            try {
                $id = resolve($url, $itemid->textContent);
            } catch (InvalidUriException) {
                // Resolving failed: the item has no global identifier, same as when itemid is absent.
                $id = null;
            }
        }

        /**
         * The item types of an item are the tokens obtained by splitting the element's itemtype attribute's value
         * on spaces. If the itemtype attribute is missing or parsing it in this way finds no tokens, the item is said
         * to have no item types.
         *
         * https://www.w3.org/TR/microdata/#items
         */
        $itemtype = $node->attributes->getNamedItem('itemtype');
        $types = $itemtype !== null ? $this->splitOnSpaces($itemtype->textContent) : [];

        $item = new Item($id, ...$types);

        // Find all nested properties
        $itemprops = $xpath->query('.//*[@itemprop]', $node);
        $itemprops = iterator_to_array($itemprops);

        // Exclude properties that are inside a nested item; XPath does not seem to provide a way to do this.
        // See: https://stackoverflow.com/q/26365495/759866
        $itemprops = array_filter($itemprops, function (DOMNode|Node $itemprop) use ($node) {
            // Walk up the ancestors: reaching $node first means the property belongs to this item, while meeting
            // another itemscope element first means it belongs to a nested item.
            for (; ;) {
                $itemprop = $itemprop->parentNode;

                if ($itemprop->isSameNode($node)) {
                    return true;
                }

                if ($itemprop->attributes->getNamedItem('itemscope')) {
                    return false;
                }
            }
        });

        $vocabularyIdentifier = $this->getVocabularyIdentifier($types);

        /** @var array<DOMNode|Node> $itemprops */
        foreach ($itemprops as $itemprop) {
            /**
             * An element introducing a property can introduce multiple properties at once, to avoid duplication when
             * some of the properties have the same value.
             *
             * https://www.w3.org/TR/microdata/#ex-multival
             *
             * Empty tokens (caused by repeated, leading or trailing spaces) are discarded, so that they do not
             * register a property whose name is just the vocabulary identifier.
             */
            $names = $this->splitOnSpaces($itemprop->attributes->getNamedItem('itemprop')->textContent);

            foreach ($names as $name) {
                /**
                 * Each token must be either a valid absolute URL or a string that contains no "." (U+002E) characters
                 * and no ":" (U+003A) characters.
                 *
                 * https://www.w3.org/TR/microdata/#items
                 *
                 * We therefore consider anything containing these characters as an absolute URL, and only prepend the
                 * vocabulary identifier if none of these characters are found.
                 */
                if (! str_contains($name, '.') && ! str_contains($name, ':')) {
                    $name = $vocabularyIdentifier . $name;
                }

                // The value is computed once per name, so each name receives its own value instance.
                $value = $this->getPropertyValue($itemprop, $xpath, $url);

                $item->addProperty($name, $value);
            }
        }

        return $item;
    }

    /**
     * Splits an attribute value into its space-separated tokens, discarding empty tokens.
     *
     * Runs of space, tab, line feed, form feed and carriage return characters all act as a single separator.
     *
     * @param string $value The raw attribute value.
     *
     * @return string[] The tokens in their original order, indexed from zero.
     */
    private function splitOnSpaces(string $value): array
    {
        // Collapse every run of space characters to a single U+0020 so that explode() sees one separator;
        // fall back to the raw value in the unlikely event that preg_replace() reports an error (null).
        $normalized = preg_replace('/[ \t\n\f\r]+/', ' ', $value) ?? $value;

        return array_values(array_filter(
            explode(' ', $normalized),
            static fn (string $token): bool => $token !== '',
        ));
    }
168

169
    /**
     * Determines the value of a property from the element that introduces it.
     *
     * The rules are tried in this order and the first one that applies wins:
     *
     *  1. the element has an itemscope attribute: the value is the item created by the element;
     *  2. the element has a content attribute: the value is that attribute's textContent;
     *  3. audio, embed, iframe, img, source, track, video with a src attribute; a, area, link with an href
     *     attribute; object with a data attribute: the value is the attribute resolved against $url, or the empty
     *     string if resolving fails;
     *  4. data, meter with a value attribute; time with a datetime attribute: the value is the attribute's
     *     textContent;
     *  5. otherwise: the value is the element's textContent.
     *
     * NOTE(review): element names are compared case-sensitively against lowercase literals; confirm that nodeName
     * is lowercase for every document type passed in.
     *
     * @see https://www.w3.org/TR/microdata/#values
     *
     * @param DOMNode|Node   $node  A (DOM)Node representing an element with the itemprop attribute.
     * @param DOMXPath|XPath $xpath A (DOM)XPath object created from the node's document element.
     * @param string         $url   The URL the document was retrieved from, for relative URL resolution.
     */
    private function getPropertyValue(DOMNode|Node $node, DOMXPath|XPath $xpath, string $url): Item|string
    {
        $attributes = $node->attributes;

        // Rule 1: a property element that is itself an item yields that nested item.
        if ($attributes->getNamedItem('itemscope') !== null) {
            return $this->nodeToItem($node, $xpath, $url);
        }

        // Rule 2: an explicit content attribute takes precedence over any element-specific attribute.
        $content = $attributes->getNamedItem('content');

        if ($content !== null) {
            return $content->textContent;
        }

        // Rules 3 and 4: which attribute carries the value for this element, and whether it holds a URL.
        [$attributeName, $isUrl] = match ($node->nodeName) {
            'audio', 'embed', 'iframe', 'img', 'source', 'track', 'video' => ['src', true],
            'a', 'area', 'link' => ['href', true],
            'object' => ['data', true],
            'data', 'meter' => ['value', false],
            'time' => ['datetime', false],
            default => [null, false],
        };

        $carrier = $attributeName === null ? null : $attributes->getNamedItem($attributeName);

        if ($carrier !== null) {
            if (! $isUrl) {
                return $carrier->textContent;
            }

            try {
                return resolve($url, $carrier->textContent);
            } catch (InvalidUriException) {
                // A value that does not resolve to a valid absolute URL becomes the empty string.
                return '';
            }
        }

        /**
         * Rule 5. Note that even though this is not suggested by the spec, we remove extra whitespace that's likely
         * to be an artifact of HTML formatting.
         */
        return trim(preg_replace('/\s+/', ' ', $node->textContent));
    }
283

284
    /**
     * Derives the vocabulary identifier from the first of the given types.
     *
     * The identifier is the type up to and including its first "#"; failing that, up to and including its last "/";
     * failing that, the whole type followed by "/". An empty list of types yields the empty string.
     *
     * https://www.w3.org/TR/microdata/#dfn-vocabulary-identifier
     *
     * @param string[] $types The types, as valid absolute URLs.
     */
    private function getVocabularyIdentifier(array $types): string
    {
        if ($types === []) {
            return '';
        }

        $first = $types[0];

        // Prefer the first fragment separator; only look for the last path separator when there is none.
        $cut = strpos($first, '#');

        if ($cut === false) {
            $cut = strrpos($first, '/');
        }

        return $cut === false
            ? $first . '/'
            : substr($first, 0, $cut + 1);
    }
313
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc