• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

brick / structured-data / 17475567799

04 Sep 2025 08:14PM UTC coverage: 77.67%. Remained the same
17475567799

push

github

BenMorel
Apply coding standard

14 of 15 new or added lines in 5 files covered. (93.33%)

59 existing lines in 7 files now uncovered.

240 of 309 relevant lines covered (77.67%)

1.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

74.19
/src/Reader/RdfaLiteReader.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace Brick\StructuredData\Reader;
6

7
use Brick\StructuredData\Item;
8
use Brick\StructuredData\Reader;
9
use DOMDocument;
10
use DOMNode;
11
use DOMXPath;
12
use Override;
13
use Sabre\Uri\InvalidUriException;
14

15
use function array_filter;
16
use function array_map;
17
use function array_values;
18
use function count;
19
use function explode;
20
use function iterator_to_array;
21
use function preg_replace;
22
use function Sabre\Uri\build;
23
use function Sabre\Uri\parse;
24
use function Sabre\Uri\resolve;
25
use function trim;
26

27
/**
 * Reads RDFa Lite embedded into a HTML document.
 *
 * https://www.w3.org/TR/rdfa-lite/
 *
 * Top-level items (elements with a typeof attribute and no property attribute) are converted to Item objects;
 * nested items are attached as property values of their closest enclosing item.
 *
 * @todo support for the prefix attribute; only predefined prefixes are supported right now
 */
final class RdfaLiteReader implements Reader
{
    /**
     * The predefined RDFa prefixes.
     *
     * https://www.w3.org/2011/rdfa-context/rdfa-1.1
     */
    private const PREDEFINED_PREFIXES = [
        'as' => 'https://www.w3.org/ns/activitystreams#',
        'csvw' => 'http://www.w3.org/ns/csvw#',
        'cat' => 'http://www.w3.org/ns/dcat#',
        'cc' => 'http://creativecommons.org/ns#',
        'cnt' => 'http://www.w3.org/2008/content#',
        'ctag' => 'http://commontag.org/ns#',
        'dc' => 'http://purl.org/dc/terms/',
        'dc11' => 'http://purl.org/dc/elements/1.1/',
        'dcat' => 'http://www.w3.org/ns/dcat#',
        'dcterms' => 'http://purl.org/dc/terms/',
        'dqv' => 'http://www.w3.org/ns/dqv#',
        'duv' => 'https://www.w3.org/TR/vocab-duv#',
        'earl' => 'http://www.w3.org/ns/earl#',
        'foaf' => 'http://xmlns.com/foaf/0.1/',
        'gldp' => 'http://www.w3.org/ns/people#',
        'gr' => 'http://purl.org/goodrelations/v1#',
        'grddl' => 'http://www.w3.org/2003/g/data-view#',
        'ht' => 'http://www.w3.org/2006/http#',
        'ical' => 'http://www.w3.org/2002/12/cal/icaltzd#',
        'ldp' => 'http://www.w3.org/ns/ldp#',
        'ma' => 'http://www.w3.org/ns/ma-ont#',
        'oa' => 'http://www.w3.org/ns/oa#',
        'odrl' => 'http://www.w3.org/ns/odrl/2/',
        'og' => 'http://ogp.me/ns#',
        'org' => 'http://www.w3.org/ns/org#',
        'owl' => 'http://www.w3.org/2002/07/owl#',
        'prov' => 'http://www.w3.org/ns/prov#',
        'ptr' => 'http://www.w3.org/2009/pointers#',
        'qb' => 'http://purl.org/linked-data/cube#',
        'rev' => 'http://purl.org/stuff/rev#',
        'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        'rdfa' => 'http://www.w3.org/ns/rdfa#',
        'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
        'rif' => 'http://www.w3.org/2007/rif#',
        'rr' => 'http://www.w3.org/ns/r2rml#',
        'schema' => 'http://schema.org/',
        'sd' => 'http://www.w3.org/ns/sparql-service-description#',
        'sioc' => 'http://rdfs.org/sioc/ns#',
        'skos' => 'http://www.w3.org/2004/02/skos/core#',
        'skosxl' => 'http://www.w3.org/2008/05/skos-xl#',
        'ssn' => 'http://www.w3.org/ns/ssn/',
        'sosa' => 'http://www.w3.org/ns/sosa/',
        'time' => 'http://www.w3.org/2006/time#',
        'v' => 'http://rdf.data-vocabulary.org/#',
        'vcard' => 'http://www.w3.org/2006/vcard/ns#',
        'void' => 'http://rdfs.org/ns/void#',
        'wdr' => 'http://www.w3.org/2007/05/powder#',
        'wdrs' => 'http://www.w3.org/2007/05/powder-s#',
        'xhv' => 'http://www.w3.org/1999/xhtml/vocab#',
        'xml' => 'http://www.w3.org/XML/1998/namespace',
        'xsd' => 'http://www.w3.org/2001/XMLSchema#',
    ];

    /**
     * Extracts every top-level RDFa Lite item from the document.
     *
     * Items start with the predefined prefixes and no vocabulary; a vocabulary only comes into effect through a
     * vocab attribute on the item element itself or on one of its property elements.
     */
    #[Override]
    public function read(DOMDocument $document, string $url): array
    {
        $xpath = new DOMXPath($document);

        // Top-level items have a typeof attribute and no property attribute.
        $nodes = iterator_to_array($xpath->query('//*[@typeof and not(@property)]'));

        return array_map(
            fn (DOMNode $node): Item => $this->nodeToItem($node, $xpath, $url, self::PREDEFINED_PREFIXES, null),
            $nodes,
        );
    }

    /**
     * Extracts information from a DOMNode into an Item.
     *
     * @param DOMNode     $node       A DOMNode representing an element with the typeof attribute.
     * @param DOMXPath    $xpath      A DOMXPath object created from the node's document element.
     * @param string      $url        The URL the document was retrieved from, for relative URL resolution.
     * @param string[]    $prefixes   The prefixes in use, as a map of prefix to vocabulary URL.
     * @param string|null $vocabulary The URL of the vocabulary in use, if any.
     *                                This is the content of the vocab attribute of the closest item ancestor.
     */
    private function nodeToItem(DOMNode $node, DOMXPath $xpath, string $url, array $prefixes, ?string $vocabulary): Item
    {
        $vocabulary = $this->updateVocabulary($node, $vocabulary);

        /**
         * The resource attribute holds the item identifier, that must be resolved relative to the current URL.
         *
         * https://www.w3.org/TR/rdfa-lite/#resource
         */
        $id = null;
        $resource = $node->attributes->getNamedItem('resource');

        if ($resource !== null) {
            try {
                $id = resolve($url, $resource->textContent);
            } catch (InvalidUriException) {
                // A malformed identifier must not abort the whole read; the item is kept without an id,
                // mirroring the lenient handling of invalid href/src values in getPropertyValue().
                $id = null;
            }
        }

        // Multiple types can be specified, separated with whitespace; unresolvable ones are dropped.
        $types = [];
        $typeof = $node->attributes->getNamedItem('typeof');

        foreach ($this->splitTokens($typeof->textContent) as $token) {
            $type = $this->resolveTerm($token, $prefixes, $vocabulary);

            if ($type !== null) {
                $types[] = $type;
            }
        }

        $item = new Item($id, ...$types);

        // Find all nested properties, then exclude those that are inside a nested item;
        // XPath does not seem to provide a way to do this.
        // See: https://stackoverflow.com/q/26365495/759866
        $properties = array_filter(
            iterator_to_array($xpath->query('.//*[@property]', $node)),
            fn (DOMNode $property): bool => $this->isOwnProperty($property, $node),
        );

        /** @var DOMNode[] $properties */
        foreach ($properties as $property) {
            // A vocab attribute on the property element applies to the resolution of its own names.
            // It is the same for every name of this element, so it is computed once.
            $nameVocabulary = $this->updateVocabulary($property, $vocabulary);

            // Multiple property names can be specified, separated with whitespace.
            // Empty tokens are skipped: they would otherwise resolve to the bare vocabulary URL.
            $tokens = $this->splitTokens($property->attributes->getNamedItem('property')->textContent);

            foreach ($tokens as $token) {
                $name = $this->resolveTerm($token, $prefixes, $nameVocabulary);

                if ($name === null) {
                    continue;
                }

                // The value is extracted once per name, so each name receives its own value (and its own
                // Item instance when the property is a nested item).
                $item->addProperty($name, $this->getPropertyValue($property, $xpath, $url, $prefixes, $vocabulary));
            }
        }

        return $item;
    }

    /**
     * Splits a whitespace-separated attribute value into its non-empty tokens.
     *
     * Any run of whitespace (spaces, tabs, line breaks) acts as a single separator.
     *
     * @return string[] The tokens, in document order, as a zero-indexed list.
     */
    private function splitTokens(string $value): array
    {
        // preg_replace() returns null on failure; fall back to the raw value rather than passing null to trim().
        $normalized = preg_replace('/\s+/', ' ', $value) ?? $value;

        return array_values(array_filter(
            explode(' ', trim($normalized)),
            static fn (string $token): bool => $token !== '',
        ));
    }

    /**
     * Returns whether a property element belongs directly to the given item element.
     *
     * Walks up the ancestors of the property: reaching the item first means the property is its own; meeting
     * another element with a typeof attribute first means the property belongs to that nested item instead.
     *
     * @param DOMNode $property A descendant of $item carrying a property attribute.
     * @param DOMNode $item     The element of the item being built.
     */
    private function isOwnProperty(DOMNode $property, DOMNode $item): bool
    {
        for ($ancestor = $property->parentNode; $ancestor !== null; $ancestor = $ancestor->parentNode) {
            if ($ancestor->isSameNode($item)) {
                return true;
            }

            if ($ancestor->attributes?->getNamedItem('typeof') !== null) {
                return false;
            }
        }

        // Only reached if $property is not a descendant of $item.
        return false;
    }

    /**
     * Resolves a term to an absolute URL.
     *
     * Absolute URLs are returned as is; 'prefix:reference' terms are expanded using the known prefixes;
     * any other term is appended to the current vocabulary URL.
     *
     * @param string      $term       The term to resolve, e.g. 'name', 'schema:name', or 'http://schema.org/name'.
     * @param string[]    $prefixes   The prefixes in use, as a map of prefix to vocabulary URL.
     * @param string|null $vocabulary The current vocabulary URL, if any.
     *
     * @return string|null An absolute URL, or null if the term cannot be resolved.
     */
    private function resolveTerm(string $term, array $prefixes, ?string $vocabulary): ?string
    {
        if ($this->isValidAbsoluteURL($term)) {
            return $term;
        }

        $parts = explode(':', $term);

        if (count($parts) === 2) {
            [$prefix, $reference] = $parts;

            // An unknown prefix makes the term unresolvable; it does not fall back to the vocabulary.
            return isset($prefixes[$prefix]) ? $prefixes[$prefix] . $reference : null;
        }

        if ($vocabulary === null) {
            return null;
        }

        return $vocabulary . $term;
    }

    /**
     * Returns whether the given URL is a valid absolute URL, i.e. it parses and has both a scheme and a host.
     */
    private function isValidAbsoluteURL(string $url): bool
    {
        try {
            $parts = parse($url);
        } catch (InvalidUriException) {
            return false;
        }

        return $parts['scheme'] !== null && $parts['host'] !== null;
    }

    /**
     * Replaces the current vocabulary with the one from the vocab attribute of the current node, if set.
     *
     * Note that a vocab attribute holding an invalid URL resets the vocabulary to null.
     *
     * @param DOMNode     $node       The DOMNode that may contain a vocab attribute.
     * @param string|null $vocabulary The URL of the vocabulary in use, if any.
     *
     * @return string|null The updated vocabulary URL, if any.
     */
    private function updateVocabulary(DOMNode $node, ?string $vocabulary): ?string
    {
        $vocab = $node->attributes->getNamedItem('vocab');

        if ($vocab === null) {
            return $vocabulary;
        }

        return $this->checkVocabularyUrl($vocab->textContent);
    }

    /**
     * Ensures that the vocabulary URL is a valid absolute URL, and ensure that it has a path.
     *
     * Example: http://schema.org would return http://schema.org/
     *
     * @return string|null An absolute URL, or null if the input is not valid.
     */
    private function checkVocabularyUrl(string $url): ?string
    {
        try {
            $parts = parse($url);
        } catch (InvalidUriException) {
            return null;
        }

        if ($parts['scheme'] === null || $parts['host'] === null) {
            return null;
        }

        // Terms are appended directly to the vocabulary URL, so it needs at least a root path.
        $parts['path'] ??= '/';

        return build($parts);
    }

    /**
     * Extracts the value of a property element.
     *
     * The first match wins, in this order: a nested item (typeof attribute), the content attribute, the href
     * attribute, the src attribute, and finally the whitespace-normalized text content of the element.
     * href and src values are resolved against the document URL; an invalid one yields an empty string.
     *
     * NOTE(review): the link below points to the Microdata specification, not RDFa; confirm it is intended.
     *
     * @see https://www.w3.org/TR/microdata/#values
     *
     * @param DOMNode     $node       A DOMNode representing an element with the property attribute.
     * @param DOMXPath    $xpath      A DOMXPath object created from the node's document element.
     * @param string      $url        The URL the document was retrieved from, for relative URL resolution.
     * @param string[]    $prefixes   The prefixes in use, as a map of prefix to vocabulary URL.
     * @param string|null $vocabulary The URL of the vocabulary in use, if any.
     */
    private function getPropertyValue(DOMNode $node, DOMXPath $xpath, string $url, array $prefixes, ?string $vocabulary): Item|string
    {
        $attributes = $node->attributes;

        // If the element also has a typeof attribute, create an item from the element
        if ($attributes->getNamedItem('typeof') !== null) {
            return $this->nodeToItem($node, $xpath, $url, $prefixes, $vocabulary);
        }

        // Look for a content attribute
        $content = $attributes->getNamedItem('content');

        if ($content !== null) {
            return $content->textContent;
        }

        // Look for an href attribute, then for a src attribute; both hold URLs relative to the document
        foreach (['href', 'src'] as $attributeName) {
            $link = $attributes->getNamedItem($attributeName);

            if ($link === null) {
                continue;
            }

            try {
                return resolve($url, $link->textContent);
            } catch (InvalidUriException) {
                return '';
            }
        }

        // Otherwise, take the value of the element's textContent. Note that even though this is not suggested by the
        // spec, we remove extra whitespace that's likely to be an artifact of HTML formatting.
        $text = $node->textContent;

        // preg_replace() returns null on failure; fall back to the raw text rather than passing null to trim().
        return trim(preg_replace('/\s+/', ' ', $text) ?? $text);
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc