• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

brick / structured-data / 19978974834

05 Dec 2025 11:31PM UTC coverage: 76.101% (-1.6%) from 77.67%
19978974834

Pull #7

github

web-flow
Merge 6695ae9b0 into 251e970ec
Pull Request #7: Add compatibility with PHP 8.4 Dom\HTMLDocument

15 of 21 new or added lines in 4 files covered. (71.43%)

3 existing lines in 1 file now uncovered.

242 of 318 relevant lines covered (76.1%)

1.82 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

74.47
/src/Reader/RdfaLiteReader.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace Brick\StructuredData\Reader;
6

7
use Brick\StructuredData\Item;
8
use Brick\StructuredData\Reader;
9
use Dom\Document;
10
use Dom\Element;
11
use Dom\Node;
12
use Dom\XPath;
13
use DOMDocument;
14
use DOMNode;
15
use DOMXPath;
16
use Override;
17
use Sabre\Uri\InvalidUriException;
18

19
use function array_filter;
20
use function array_map;
21
use function array_values;
22
use function assert;
23
use function class_exists;
24
use function count;
25
use function explode;
26
use function iterator_to_array;
27
use function preg_replace;
28
use function Sabre\Uri\build;
29
use function Sabre\Uri\parse;
30
use function Sabre\Uri\resolve;
31
use function trim;
32

33
/**
34
 * Reads RDFa Lite embedded into a HTML document.
35
 *
36
 * https://www.w3.org/TR/rdfa-lite/
37
 *
38
 * @todo support for the prefix attribute; only predefined prefixes are supported right now
39
 */
40
final class RdfaLiteReader implements Reader
41
{
42
    /**
43
     * The predefined RDFa prefixes.
44
     *
     * Maps each prefix (the part before the colon in a term such as 'schema:name') to the vocabulary URL that is
     * prepended to the rest of the term. Some URLs are listed under two prefixes ('cat'/'dcat', 'dc'/'dcterms').
     *
45
     * https://www.w3.org/2011/rdfa-context/rdfa-1.1
     *
     * @var array<string, string>
46
     */
47
    private const PREDEFINED_PREFIXES = [
48
        'as' => 'https://www.w3.org/ns/activitystreams#',
49
        'csvw' => 'http://www.w3.org/ns/csvw#',
50
        'cat' => 'http://www.w3.org/ns/dcat#',
51
        'cc' => 'http://creativecommons.org/ns#',
52
        'cnt' => 'http://www.w3.org/2008/content#',
53
        'ctag' => 'http://commontag.org/ns#',
54
        'dc' => 'http://purl.org/dc/terms/',
55
        'dc11' => 'http://purl.org/dc/elements/1.1/',
56
        'dcat' => 'http://www.w3.org/ns/dcat#',
57
        'dcterms' => 'http://purl.org/dc/terms/',
58
        'dqv' => 'http://www.w3.org/ns/dqv#',
59
        'duv' => 'https://www.w3.org/TR/vocab-duv#',
60
        'earl' => 'http://www.w3.org/ns/earl#',
61
        'foaf' => 'http://xmlns.com/foaf/0.1/',
62
        'gldp' => 'http://www.w3.org/ns/people#',
63
        'gr' => 'http://purl.org/goodrelations/v1#',
64
        'grddl' => 'http://www.w3.org/2003/g/data-view#',
65
        'ht' => 'http://www.w3.org/2006/http#',
66
        'ical' => 'http://www.w3.org/2002/12/cal/icaltzd#',
67
        'ldp' => 'http://www.w3.org/ns/ldp#',
68
        'ma' => 'http://www.w3.org/ns/ma-ont#',
69
        'oa' => 'http://www.w3.org/ns/oa#',
70
        'odrl' => 'http://www.w3.org/ns/odrl/2/',
71
        'og' => 'http://ogp.me/ns#',
72
        'org' => 'http://www.w3.org/ns/org#',
73
        'owl' => 'http://www.w3.org/2002/07/owl#',
74
        'prov' => 'http://www.w3.org/ns/prov#',
75
        'ptr' => 'http://www.w3.org/2009/pointers#',
76
        'qb' => 'http://purl.org/linked-data/cube#',
77
        'rev' => 'http://purl.org/stuff/rev#',
78
        'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
79
        'rdfa' => 'http://www.w3.org/ns/rdfa#',
80
        'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
81
        'rif' => 'http://www.w3.org/2007/rif#',
82
        'rr' => 'http://www.w3.org/ns/r2rml#',
83
        'schema' => 'http://schema.org/',
84
        'sd' => 'http://www.w3.org/ns/sparql-service-description#',
85
        'sioc' => 'http://rdfs.org/sioc/ns#',
86
        'skos' => 'http://www.w3.org/2004/02/skos/core#',
87
        'skosxl' => 'http://www.w3.org/2008/05/skos-xl#',
88
        'ssn' => 'http://www.w3.org/ns/ssn/',
89
        'sosa' => 'http://www.w3.org/ns/sosa/',
90
        'time' => 'http://www.w3.org/2006/time#',
91
        'v' => 'http://rdf.data-vocabulary.org/#',
92
        'vcard' => 'http://www.w3.org/2006/vcard/ns#',
93
        'void' => 'http://rdfs.org/ns/void#',
94
        'wdr' => 'http://www.w3.org/2007/05/powder#',
95
        'wdrs' => 'http://www.w3.org/2007/05/powder-s#',
96
        'xhv' => 'http://www.w3.org/1999/xhtml/vocab#',
97
        'xml' => 'http://www.w3.org/XML/1998/namespace',
98
        'xsd' => 'http://www.w3.org/2001/XMLSchema#',
99
    ];
100

101
    /**
     * Reads every top-level RDFa Lite item found in the document.
     *
     * An element is a top-level item when it carries a typeof attribute but no property attribute; elements that
     * have both are reached through the item that owns the property.
     *
     * @param Document|DOMDocument $document The parsed HTML document, from either DOM API.
     * @param string               $url      The URL the document was retrieved from, for relative URL resolution.
     *
     * @return Item[] One item per top-level element, in the order returned by the XPath query.
     */
    #[Override]
    public function read(Document|DOMDocument $document, string $url): array
    {
        // Each DOM API ships its own XPath implementation; pick the one matching the document.
        if (! $document instanceof Document) {
            $xpath = new DOMXPath($document);
        } else {
            assert(class_exists(XPath::class));
            $xpath = new XPath($document);
        }

        $topLevelNodes = iterator_to_array($xpath->query('//*[@typeof and not(@property)]'));

        $items = [];

        foreach ($topLevelNodes as $index => $topLevelNode) {
            // Top-level items start from the predefined prefixes and without any vocabulary.
            $items[$index] = $this->nodeToItem($topLevelNode, $xpath, $url, self::PREDEFINED_PREFIXES, null);
        }

        return $items;
    }
122

123
    /**
     * Extracts information from a DOMNode into an Item.
     *
     * A malformed resource attribute does not abort the extraction: the item is then created without an
     * identifier, in line with how malformed href/src values are tolerated when reading property values.
     *
     * @param DOMNode|Node   $node       A (DOM)Node representing an element with the typeof attribute.
     * @param DOMXPath|XPath $xpath      A (DOM)XPath object created from the node's document element.
     * @param string         $url        The URL the document was retrieved from, for relative URL resolution.
     * @param string[]       $prefixes   The prefixes in use, as a map of prefix to vocabulary URL.
     * @param string|null    $vocabulary The URL of the vocabulary in use, if any.
     *                                   This is the content of the vocab attribute of the closest item ancestor.
     */
    private function nodeToItem(DOMNode|Node $node, DOMXPath|XPath $xpath, string $url, array $prefixes, ?string $vocabulary): Item
    {
        $vocabulary = $this->updateVocabulary($node, $vocabulary);

        /**
         * The resource attribute holds the item identifier, that must be resolved relative to the current URL.
         *
         * https://www.w3.org/TR/rdfa-lite/#resource
         */
        $resource = $node->attributes->getNamedItem('resource');
        $id = null;

        if ($resource !== null) {
            try {
                $id = resolve($url, $resource->textContent);
            } catch (InvalidUriException) {
                // An unparseable identifier must not abort the whole document; keep the item without an id.
                $id = null;
            }
        }

        $typeof = $node->attributes->getNamedItem('typeof');

        // Multiple types can be specified, separated with whitespace; unresolvable ones are dropped.
        $types = [];

        foreach ($this->splitTokens($typeof->textContent) as $type) {
            $type = $this->resolveTerm($type, $prefixes, $vocabulary);

            if ($type !== null && $type !== '') {
                $types[] = $type;
            }
        }

        $item = new Item($id, ...$types);

        // Find all nested properties
        $properties = $xpath->query('.//*[@property]', $node);
        $properties = iterator_to_array($properties);

        // Exclude properties that are inside a nested item; XPath does not seem to provide a way to do this.
        // See: https://stackoverflow.com/q/26365495/759866
        $properties = array_filter($properties, function (DOMNode|Node $itemprop) use ($node) {
            // Walk up towards $node; every candidate is a descendant of $node, so the walk always terminates.
            for (; ;) {
                $itemprop = $itemprop->parentNode;

                if ($itemprop->isSameNode($node)) {
                    return true;
                }

                // An intermediate element with typeof is a nested item that owns this property.
                if ($itemprop->attributes->getNamedItem('typeof')) {
                    return false;
                }
            }
        });

        /** @var array<DOMNode|Node> $properties */
        foreach ($properties as $property) {
            // Multiple property names can be specified, separated with whitespace
            $names = $this->splitTokens($property->attributes->getNamedItem('property')->textContent);

            // A vocab attribute on the property element applies to all of its names, so resolve it only once.
            $propertyVocabulary = $this->updateVocabulary($property, $vocabulary);

            foreach ($names as $name) {
                $name = $this->resolveTerm($name, $prefixes, $propertyVocabulary);

                if ($name === null) {
                    continue;
                }

                $value = $this->getPropertyValue($property, $xpath, $url, $prefixes, $vocabulary);

                $item->addProperty($name, $value);
            }
        }

        return $item;
    }

    /**
     * Splits a whitespace-separated attribute value into its non-empty tokens.
     *
     * Runs of whitespace (spaces, tabs, line breaks) count as a single separator, and leading or trailing
     * whitespace never produces an empty token.
     *
     * @return string[] The tokens in document order; an empty array if the value holds only whitespace.
     */
    private function splitTokens(string $value): array
    {
        $normalized = trim(preg_replace('/\s+/', ' ', $value), ' ');

        if ($normalized === '') {
            return [];
        }

        return explode(' ', $normalized);
    }
215

216
    /**
     * Resolves a term to an absolute URL.
     *
     * Resolution is attempted in this order:
     *
     * - a term that is already a valid absolute URL is returned unchanged;
     * - a 'prefix:reference' term is expanded with the URL registered for the prefix;
     * - any other term is appended to the current vocabulary URL.
     *
     * @param string      $term       The term to resolve, e.g. 'name', 'schema:name', or 'http://schema.org/name'.
     * @param string[]    $prefixes   The prefixes in use, as a map of prefix to vocabulary URL.
     * @param string|null $vocabulary The current vocabulary URL, if any.
     *
     * @return string|null An absolute URL, or null if the term cannot be resolved.
     */
    private function resolveTerm(string $term, array $prefixes, ?string $vocabulary): ?string
    {
        // An empty term would otherwise resolve to the bare vocabulary URL.
        if ($term === '') {
            return null;
        }

        if ($this->isValidAbsoluteURL($term)) {
            return $term;
        }

        // Only the first colon separates the prefix; the reference part may itself contain colons.
        $parts = explode(':', $term, 2);

        if (count($parts) === 2) {
            [$prefix, $reference] = $parts;

            if (! isset($prefixes[$prefix])) {
                return null;
            }

            return $prefixes[$prefix] . $reference;
        }

        if ($vocabulary === null) {
            return null;
        }

        return $vocabulary . $term;
    }
249

250
    /**
     * Returns whether the given string parses as a URL that has both a scheme and a host.
     *
     * @param string $url The candidate URL.
     *
     * @return bool False if the string cannot be parsed, or if its scheme or host is missing.
     */
    private function isValidAbsoluteURL(string $url): bool
    {
        try {
            $components = parse($url);
        } catch (InvalidUriException) {
            return false;
        }

        return $components['scheme'] !== null && $components['host'] !== null;
    }
268

269
    /**
     * Determines the vocabulary that applies at the given node.
     *
     * When the node carries a vocab attribute, its checked value takes over, which means an invalid vocab value
     * yields null rather than falling back to the inherited vocabulary. Without the attribute, the inherited
     * vocabulary is returned untouched.
     *
     * @param DOMNode|Node $node       The (DOM)Node that may contain a vocab attribute.
     * @param string|null  $vocabulary The URL of the vocabulary in use, if any.
     *
     * @return string|null The updated vocabulary URL, if any.
     */
    private function updateVocabulary(DOMNode|Node $node, ?string $vocabulary): ?string
    {
        $vocabAttribute = $node->attributes->getNamedItem('vocab');

        return $vocabAttribute === null
            ? $vocabulary
            : $this->checkVocabularyUrl($vocabAttribute->textContent);
    }
287

288
    /**
     * Validates a vocabulary URL and makes sure it has a path.
     *
     * The URL must parse and must have both a scheme and a host. A missing path is replaced with '/', so that
     * terms can be appended directly to the result.
     *
     * Example: http://schema.org would return http://schema.org/
     *
     * @return string|null An absolute URL, or null if the input is not valid.
     */
    private function checkVocabularyUrl(string $url): ?string
    {
        try {
            $components = parse($url);
        } catch (InvalidUriException) {
            return null;
        }

        if ($components['scheme'] === null || $components['host'] === null) {
            return null;
        }

        $components['path'] ??= '/';

        return build($components);
    }
317

318
    /**
     * Returns the value of a property element.
     *
     * The first match in this order wins:
     *
     * - the element has a typeof attribute: the value is a nested Item built from the element;
     * - the element has a content attribute: its text is the value;
     * - the element has an href attribute, else a src attribute: the value is that URL resolved against $url,
     *   or an empty string if it cannot be resolved;
     * - otherwise: the element's text content with whitespace collapsed and trimmed.
     *
     * @param DOMNode|Node   $node       A (DOM)Node representing an element with the property attribute.
     * @param DOMXPath|XPath $xpath      A (DOM)XPath object created from the node's document element.
     * @param string         $url        The URL the document was retrieved from, for relative URL resolution.
     * @param string[]       $prefixes   The prefixes in use, as a map of prefix to vocabulary URL.
     * @param string|null    $vocabulary The URL of the vocabulary in use, if any.
     */
    private function getPropertyValue(DOMNode|Node $node, DOMXPath|XPath $xpath, string $url, array $prefixes, ?string $vocabulary): Item|string
    {
        $attributes = $node->attributes;

        // An element that is itself typed becomes a nested item
        if ($attributes->getNamedItem('typeof') !== null) {
            return $this->nodeToItem($node, $xpath, $url, $prefixes, $vocabulary);
        }

        // An explicit content attribute overrides anything else the element holds
        $content = $attributes->getNamedItem('content');

        if ($content !== null) {
            return $content->textContent;
        }

        // Link-like attributes, in order of precedence; the first one present decides the value
        foreach (['href', 'src'] as $linkAttributeName) {
            $link = $attributes->getNamedItem($linkAttributeName);

            if ($link === null) {
                continue;
            }

            try {
                return resolve($url, $link->textContent);
            } catch (InvalidUriException) {
                return '';
            }
        }

        // Fall back to the element's text. Even though this is not suggested by the spec, extra whitespace is
        // removed as it is likely to be an artifact of HTML formatting.
        $collapsed = preg_replace('/\s+/', ' ', $node->textContent);

        return trim($collapsed);
    }
369
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc