• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

brick / structured-data / 19978974834

05 Dec 2025 11:31PM UTC coverage: 76.101% (-1.6%) from 77.67%
19978974834

Pull #7

github

web-flow
Merge 6695ae9b0 into 251e970ec
Pull Request #7: Add compatibility with PHP 8.4 Dom\HTMLDocument

15 of 21 new or added lines in 4 files covered. (71.43%)

3 existing lines in 1 file now uncovered.

242 of 318 relevant lines covered (76.1%)

1.82 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

79.57
/src/Reader/JsonLdReader.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace Brick\StructuredData\Reader;
6

7
use Brick\StructuredData\Item;
8
use Brick\StructuredData\Reader;
9
use Dom\Document;
10
use Dom\Node;
11
use Dom\XPath;
12
use DOMDocument;
13
use DOMNode;
14
use DOMXPath;
15
use Override;
16
use Sabre\Uri\InvalidUriException;
17

18
use function array_filter;
19
use function array_map;
20
use function array_merge;
21
use function array_values;
22
use function array_walk_recursive;
23
use function assert;
24
use function class_exists;
25
use function in_array;
26
use function is_array;
27
use function is_bool;
28
use function is_object;
29
use function is_scalar;
30
use function is_string;
31
use function iterator_to_array;
32
use function json_decode;
33
use function Sabre\Uri\build;
34
use function Sabre\Uri\parse;
35
use function Sabre\Uri\resolve;
36

37
use const JSON_THROW_ON_ERROR;
38

39
/**
40
 * Reads JSON-LD documents embedded in an HTML document.
41
 *
42
 * This first implementation is a rudimentary parser that only implements a subset of the JSON-LD spec, only allows a
43
 * string in `@context`, and considers this string a vocabulary identifier and not an external context file.
44
 *
45
 * This may look like it's missing a lot (it is), like it will make mistakes (it will), but this should be enough to
46
 * parse most of the web pages embedding schema.org data, as long as they follow the simple syntax used in the examples.
47
 *
48
 * https://json-ld.org/spec/latest/json-ld/
49
 */
50
final class JsonLdReader implements Reader
51
{
52
    /**
     * Creates a JSON-LD reader.
     *
     * This naive implementation cannot parse JSON-LD contexts, so it relies on a hardcoded list of properties whose
     * string values are treated as IRIs and resolved relative to the URL of the document being read.
     *
     * Example: ['http://schema.org/image', 'http://schema.org/url']
     *
     * @param string[] $iriProperties The names of the properties whose values are IRIs.
     */
    public function __construct(
        private readonly array $iriProperties = [],
    ) {
    }
71

72
    /**
     * Collects the items of every `<script type="application/ld+json">` element found in the document.
     *
     * Both the legacy DOMDocument and the newer Dom\Document are accepted; each one is queried through its own
     * XPath implementation.
     *
     * @param Document|DOMDocument $document The parsed HTML document.
     * @param string               $url      The URL the document was retrieved from, for relative URL resolution.
     *
     * @return Item[] The items of all scripts, in document order; empty when the document has no JSON-LD script.
     */
    #[Override]
    public function read(Document|DOMDocument $document, string $url): array
    {
        if (! $document instanceof Document) {
            $xpath = new DOMXPath($document);
        } else {
            assert(class_exists(XPath::class));
            $xpath = new XPath($document);
        }

        $scripts = iterator_to_array($xpath->query('//script[@type="application/ld+json"]'));

        $items = [];

        foreach ($scripts as $script) {
            // Append this script's items to those gathered so far.
            $items = [...$items, ...$this->readJson($script->textContent, $url)];
        }

        return $items;
    }
96

97
    /**
     * Reads a list of items from a JSON-LD string.
     *
     * If the JSON is not valid, an empty array is returned.
     *
     * A top-level object is read as a single item, unless it has an `@graph` array, in which case every object of
     * that array is read as an item. A top-level array is handled like an `@graph` array. Any other JSON value
     * (null, scalar) yields an empty array.
     *
     * @param string $json The JSON string.
     * @param string $url  The URL the document was retrieved from, for relative URL resolution.
     *
     * @return Item[]
     */
    private function readJson(string $json, string $url): array
    {
        try {
            $data = json_decode($json, flags: JSON_THROW_ON_ERROR);
        } catch (\JsonException) {
            // With JSON_THROW_ON_ERROR, malformed JSON is reported through an exception instead of a null return
            // value. A single broken script must not prevent the other scripts of the page from being read.
            return [];
        }

        if (is_object($data)) {
            if (! isset($data->{'@graph'}) || ! is_array($data->{'@graph'})) {
                return [$this->readItem($data, $url, null)];
            }

            // NOTE(review): the graph entries are read without a vocabulary, so a string `@context` declared next
            // to `@graph` is not applied to them; each entry has to carry its own `@context`.
            $data = $data->{'@graph'};
        }

        if (! is_array($data)) {
            // The JSON literal `null` or a scalar: nothing to read.
            return [];
        }

        $items = array_map(
            fn (mixed $item) => is_object($item) ? $this->readItem($item, $url, null) : null,
            $data,
        );

        // Drop the entries that were not objects, and reindex the list.
        return array_values(array_filter($items));
    }
139

140
    /**
     * Builds an Item from a decoded JSON object.
     *
     * Keywords (property names starting with `@`) and empty property names are not exported as properties; `@context`,
     * `@id` and `@type` are the only keywords that are interpreted.
     *
     * @param object      $item       A decoded JSON object representing an item.
     * @param string      $url        The URL the document was retrieved from, for relative URL resolution.
     * @param string|null $vocabulary The current vocabulary URL, if any.
     */
    private function readItem(object $item, string $url, ?string $vocabulary): Item
    {
        $context = $item->{'@context'} ?? null;

        if (is_string($context)) {
            // A string context replaces the inherited vocabulary with whatever checkVocabularyUrl() returns.
            $vocabulary = $this->checkVocabularyUrl($context);
        }

        $id = null;
        $rawId = $item->{'@id'} ?? null;

        if (is_string($rawId)) {
            try {
                // Always relative to the document URL, there is no support for @base.
                $id = resolve($url, $rawId);
            } catch (InvalidUriException) {
                // An identifier that cannot be resolved is ignored: the item is left without an id.
            }
        }

        $types = [];
        $declaredType = $item->{'@type'} ?? null;

        if (is_string($declaredType)) {
            $types[] = $this->resolveTerm($declaredType, $vocabulary);
        } elseif (is_array($declaredType)) {
            foreach ($declaredType as $candidate) {
                if (! is_string($candidate)) {
                    continue;
                }

                $resolvedType = $this->resolveTerm($candidate, $vocabulary);

                // Only truthy type names are kept.
                if ($resolvedType) {
                    $types[] = $resolvedType;
                }
            }
        }

        $result = new Item($id, ...$types);

        foreach ($item as $name => $value) {
            if ($name === '' || $name[0] === '@') {
                continue;
            }

            $name = $this->resolveTerm($name, $vocabulary);

            // Flatten the arrays: not sure if this is required by the JSON-LD standard, but some websites output
            // nested arrays such as "offer": [[ { ... } ]], and Google Structured Data Testing Tool does recognize
            // this syntax, so we're doing the same here. A single value is handled as a list of one value.
            $values = is_array($value) ? $this->flattenArray($value) : [$value];

            foreach ($values as $singleValue) {
                $propertyValue = $this->getPropertyValue($name, $singleValue, $url, $vocabulary);

                if ($propertyValue !== null) {
                    $result->addProperty($name, $propertyValue);
                }
            }
        }

        return $result;
    }
215

216
    /**
     * Flattens a potentially multidimensional array.
     *
     * Nested arrays are expanded in place, depth first, so the result is a list that contains no arrays; the original
     * keys are discarded.
     */
    private function flattenArray(array $array): array
    {
        $flat = [];

        foreach ($array as $element) {
            if (! is_array($element)) {
                $flat[] = $element;

                continue;
            }

            foreach ($this->flattenArray($element) as $nested) {
                $flat[] = $nested;
            }
        }

        return $flat;
    }
231

232
    /**
     * Prefixes a term with the vocabulary URL.
     *
     * @param string      $term       The term, such as a type or property name.
     * @param string|null $vocabulary The current vocabulary URL, if any.
     *
     * @return string The vocabulary followed by the term, or the term as is when there is no vocabulary.
     */
    private function resolveTerm(string $term, ?string $vocabulary): string
    {
        return $vocabulary === null ? $term : $vocabulary . $term;
    }
240

241
    /**
     * Converts a decoded JSON value to a property value.
     *
     * Strings are returned as is, unless the property is one of the configured IRI properties, in which case they are
     * resolved against the document URL when possible. Booleans become 'true' or 'false', other scalars are cast to
     * string, and objects are read as nested items.
     *
     * @param string      $name       The property name.
     * @param mixed       $value      The property value. Any JSON type.
     * @param string      $url        The URL the document was retrieved from, for relative URL resolution.
     * @param string|null $vocabulary The current vocabulary URL, if any.
     *
     * @return Item|string|null The value, or NULL if the input value is NULL or an array.
     */
    private function getPropertyValue(string $name, mixed $value, string $url, ?string $vocabulary): null|Item|string
    {
        if (is_string($value)) {
            if (! in_array($name, $this->iriProperties, true)) {
                return $value;
            }

            try {
                return resolve($url, $value);
            } catch (InvalidUriException) {
                // The value cannot be resolved: keep it untouched.
                return $value;
            }
        }

        return match (true) {
            is_bool($value) => $value ? 'true' : 'false',
            is_scalar($value) => (string) $value,
            is_object($value) => $this->readItem($value, $url, $vocabulary),
            default => null,
        };
    }
275

276
    /**
     * Validates a vocabulary URL and normalizes it so that it always has a path.
     *
     * The URL must have both a scheme and a host. When it has no path, '/' is used.
     *
     * Example: http://schema.org would return http://schema.org/
     *
     * @return string|null An absolute URL, or null if the input is not valid.
     */
    private function checkVocabularyUrl(string $url): ?string
    {
        try {
            $components = parse($url);
        } catch (InvalidUriException) {
            return null;
        }

        // Only absolute URLs qualify as a vocabulary.
        if ($components['scheme'] === null || $components['host'] === null) {
            return null;
        }

        if ($components['path'] === null) {
            $components['path'] = '/';
        }

        return build($components);
    }
305
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc