• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

brick / structured-data / 17475567799

04 Sep 2025 08:14PM UTC coverage: 77.67%. Remained the same
17475567799

push

github

BenMorel
Apply coding standard

14 of 15 new or added lines in 5 files covered. (93.33%)

59 existing lines in 7 files now uncovered.

240 of 309 relevant lines covered (77.67%)

1.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

75.31
/src/Reader/MicrodataReader.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace Brick\StructuredData\Reader;
6

7
use Brick\StructuredData\Item;
8
use Brick\StructuredData\Reader;
9
use DOMDocument;
10
use DOMNode;
11
use DOMXPath;
12
use Override;
13
use Sabre\Uri\InvalidUriException;
14

15
use function array_filter;
16
use function array_map;
17
use function array_values;
18
use function explode;
19
use function in_array;
20
use function iterator_to_array;
21
use function preg_replace;
22
use function Sabre\Uri\resolve;
23
use function str_contains;
24
use function strpos;
25
use function strrpos;
26
use function substr;
27
use function trim;
28

29
/**
 * Reads Microdata embedded into a HTML document.
 *
 * https://www.w3.org/TR/microdata/
 *
 * @todo support for the itemref attribute
 */
final class MicrodataReader implements Reader
{
    /**
     * Extracts every top-level Microdata item found in the document.
     *
     * @param DOMDocument $document The parsed HTML document.
     * @param string      $url      The URL the document was retrieved from, for relative URL resolution.
     *
     * @return Item[] One Item per top-level itemscope element, in the order returned by the XPath query.
     */
    #[Override]
    public function read(DOMDocument $document, string $url): array
    {
        $xpath = new DOMXPath($document);

        /**
         * An item is a top-level Microdata item if its element does not have an itemprop attribute.
         *
         * https://www.w3.org/TR/microdata/#associating-names-with-items
         */
        $nodes = $xpath->query('//*[@itemscope and not(@itemprop)]');

        return array_map(
            fn (DOMNode $node): Item => $this->nodeToItem($node, $xpath, $url),
            iterator_to_array($nodes),
        );
    }

    /**
     * Extracts information from a DOMNode into an Item.
     *
     * @param DOMNode  $node  A DOMNode representing an element with the itemscope attribute.
     * @param DOMXPath $xpath A DOMXPath object created from the node's document element.
     * @param string   $url   The URL the document was retrieved from, for relative URL resolution.
     */
    private function nodeToItem(DOMNode $node, DOMXPath $xpath, string $url): Item
    {
        /**
         * The global identifier of an item is the value of its element's itemid attribute, if it has one, resolved
         * relative to the element on which the attribute is specified. If the itemid attribute is missing or if
         * resolving it fails, it is said to have no global identifier.
         *
         * https://www.w3.org/TR/microdata/#items
         */
        $id = null;
        $itemid = $node->attributes->getNamedItem('itemid');

        if ($itemid !== null) {
            try {
                $id = resolve($url, $itemid->textContent);
            } catch (InvalidUriException) {
                // Resolution failed: the item has no global identifier, so one bad itemid
                // must not abort reading the rest of the document.
                $id = null;
            }
        }

        /**
         * The item types of an item are the tokens obtained by splitting the element's itemtype attribute's value
         * on spaces. If the itemtype attribute is missing or parsing it in this way finds no tokens, the item is
         * said to have no item types.
         *
         * https://www.w3.org/TR/microdata/#items
         */
        $itemtype = $node->attributes->getNamedItem('itemtype');
        $types = $itemtype !== null ? $this->splitTokens($itemtype->textContent) : [];

        $item = new Item($id, ...$types);

        // Find all nested properties
        $itemprops = iterator_to_array($xpath->query('.//*[@itemprop]', $node));

        // Exclude properties that are inside a nested item; XPath does not seem to provide a way to do this.
        // See: https://stackoverflow.com/q/26365495/759866
        $itemprops = array_filter($itemprops, static function (DOMNode $itemprop) use ($node): bool {
            // Walk up the ancestors: reaching $node first means the property belongs to this item,
            // reaching another itemscope element first means it belongs to a nested item.
            for ($ancestor = $itemprop->parentNode; $ancestor !== null; $ancestor = $ancestor->parentNode) {
                if ($ancestor->isSameNode($node)) {
                    return true;
                }

                if ($ancestor->attributes->getNamedItem('itemscope') !== null) {
                    return false;
                }
            }

            // Not expected for descendants of $node; treat a detached chain as "not ours".
            return false;
        });

        $vocabularyIdentifier = $this->getVocabularyIdentifier($types);

        /** @var DOMNode[] $itemprops */
        foreach ($itemprops as $itemprop) {
            /**
             * An element introducing a property can introduce multiple properties at once, to avoid duplication when
             * some of the properties have the same value.
             *
             * https://www.w3.org/TR/microdata/#ex-multival
             *
             * Empty tokens (empty attribute, leading/trailing/repeated whitespace) are dropped so that they do not
             * create a property whose name is only the vocabulary identifier.
             */
            $names = $this->splitTokens($itemprop->attributes->getNamedItem('itemprop')->textContent);

            foreach ($names as $name) {
                /**
                 * Each token must be either a valid absolute URL or a string that contains no "." (U+002E) characters
                 * and no ":" (U+003A) characters.
                 *
                 * https://www.w3.org/TR/microdata/#items
                 *
                 * We therefore consider anything containing these characters as an absolute URL, and only prepend the
                 * vocabulary identifier if none of these characters are found.
                 */
                if (! str_contains($name, '.') && ! str_contains($name, ':')) {
                    $name = $vocabularyIdentifier . $name;
                }

                // Computed once per name on purpose: a nested item yields a distinct Item instance for each name.
                $value = $this->getPropertyValue($itemprop, $xpath, $url);

                $item->addProperty($name, $value);
            }
        }

        return $item;
    }

    /**
     * Returns the value of a property element: a nested Item, or a string.
     *
     * @see https://www.w3.org/TR/microdata/#values
     *
     * @param DOMNode  $node  A DOMNode representing an element with the itemprop attribute.
     * @param DOMXPath $xpath A DOMXPath object created from the node's document element.
     * @param string   $url   The URL the document was retrieved from, for relative URL resolution.
     */
    private function getPropertyValue(DOMNode $node, DOMXPath $xpath, string $url): Item|string
    {
        /**
         * If the element also has an itemscope attribute: the value is the item created by the element.
         */
        if ($node->attributes->getNamedItem('itemscope') !== null) {
            return $this->nodeToItem($node, $xpath, $url);
        }

        /**
         * If the element has a content attribute: the value is the textContent of the element's content attribute.
         */
        $attr = $node->attributes->getNamedItem('content');

        if ($attr !== null) {
            return $attr->textContent;
        }

        /**
         * If the element is an audio, embed, iframe, img, source, track, or video element (src attribute), an a, area,
         * or link element (href attribute), or an object element (data attribute): if the element has that attribute,
         * let proposed value be the result of resolving that attribute's textContent. If proposed value is a valid
         * absolute URL: The value is proposed value. Otherwise the value is the empty string.
         */
        $urlAttribute = match ($node->nodeName) {
            'audio', 'embed', 'iframe', 'img', 'source', 'track', 'video' => 'src',
            'a', 'area', 'link' => 'href',
            'object' => 'data',
            default => null,
        };

        if ($urlAttribute !== null) {
            $attr = $node->attributes->getNamedItem($urlAttribute);

            if ($attr !== null) {
                try {
                    return resolve($url, $attr->textContent);
                } catch (InvalidUriException) {
                    return '';
                }
            }
        }

        /**
         * If the element is a data or meter element: if the element has a value attribute, the value is that
         * attribute's textContent. If the element is a time element: if the element has a datetime attribute, the
         * value is that attribute's textContent.
         */
        $textAttribute = match ($node->nodeName) {
            'data', 'meter' => 'value',
            'time' => 'datetime',
            default => null,
        };

        if ($textAttribute !== null) {
            $attr = $node->attributes->getNamedItem($textAttribute);

            if ($attr !== null) {
                return $attr->textContent;
            }
        }

        /**
         * Otherwise: the value is the element's textContent.
         *
         * Note that even though this is not suggested by the spec, we remove extra whitespace that's likely to be
         * an artifact of HTML formatting.
         */
        $text = $node->textContent;

        // preg_replace() returns null on failure; fall back to the raw text rather than passing null to trim().
        return trim(preg_replace('/\s+/', ' ', $text) ?? $text);
    }

    /**
     * Returns the vocabulary identifier for a given type.
     *
     * https://www.w3.org/TR/microdata/#dfn-vocabulary-identifier
     *
     * @param string[] $types The types, as valid absolute URLs.
     *
     * @return string The first type up to and including its first "#", else up to and including its last "/",
     *                else the type followed by "/"; the empty string when there are no types.
     */
    private function getVocabularyIdentifier(array $types): string
    {
        if (! $types) {
            return '';
        }

        $type = $types[0];

        $pos = strpos($type, '#');

        if ($pos === false) {
            $pos = strrpos($type, '/');
        }

        if ($pos !== false) {
            return substr($type, 0, $pos + 1);
        }

        return $type . '/';
    }

    /**
     * Splits a space-separated attribute value (itemtype, itemprop) into its non-empty tokens.
     *
     * Tokens are separated by any run of ASCII whitespace (space, tab, LF, FF, CR), not only U+0020.
     *
     * @param string $value The raw attribute value.
     *
     * @return string[] The tokens in document order, re-indexed from 0; empty if there are none.
     */
    private function splitTokens(string $value): array
    {
        // Collapse every whitespace run to a single space so explode() sees one separator.
        $normalized = preg_replace('/[ \t\n\f\r]+/', ' ', $value) ?? $value;

        return array_values(array_filter(
            explode(' ', $normalized),
            static fn (string $token): bool => $token !== '',
        ));
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc