• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

kit-data-manager / ro-crate-java / #359

16 Apr 2025 04:37PM UTC coverage: 85.989%. First build
#359

Pull #233

github

Pfeil
fix: add null and existence checks for JSON-LD file in JsonLdExpander
Pull Request #233: Version 2.1.0

532 of 694 new or added lines in 19 files covered. (76.66%)

1878 of 2184 relevant lines covered (85.99%)

0.86 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.54
/src/main/java/edu/kit/datamanager/ro_crate/reader/RoCrateReader.java
1
package edu.kit.datamanager.ro_crate.reader;
2

3
import com.fasterxml.jackson.databind.JsonNode;
4
import com.fasterxml.jackson.databind.node.ArrayNode;
5
import com.fasterxml.jackson.databind.node.ObjectNode;
6

7
import edu.kit.datamanager.ro_crate.RoCrate;
8
import edu.kit.datamanager.ro_crate.context.CrateMetadataContext;
9
import edu.kit.datamanager.ro_crate.context.RoCrateMetadataContext;
10
import edu.kit.datamanager.ro_crate.entities.contextual.ContextualEntity;
11
import edu.kit.datamanager.ro_crate.entities.data.DataEntity;
12
import edu.kit.datamanager.ro_crate.entities.data.RootDataEntity;
13
import edu.kit.datamanager.ro_crate.special.IdentifierUtils;
14
import edu.kit.datamanager.ro_crate.special.JsonUtilFunctions;
15

16
import edu.kit.datamanager.ro_crate.validation.JsonSchemaValidation;
17
import edu.kit.datamanager.ro_crate.validation.Validator;
18

19
import java.io.File;
20
import java.io.InputStream;
21
import java.util.*;
22
import java.util.stream.Collectors;
23
import java.util.stream.StreamSupport;
24
import org.slf4j.Logger;
25
import org.slf4j.LoggerFactory;
26

27
/**
28
 * This class allows reading crates from the outside into the library in order
29
 * to inspect or modify it.
30
 * <p>
31
 * The constructor takes a strategy to support different ways of importing the
32
 * crates. (from zip, folder, etc.).
33
 * <p>
34
 * The reader considers "hasPart" and "isPartOf" properties and treats all
35
 * entities (in-)directly connected to the root entity ("./") as DataEntities.
36
 */
37
public class RoCrateReader {
38

39
    /** Class-wide logger; never reassigned, hence final. */
    private static final Logger logger = LoggerFactory.getLogger(RoCrateReader.class);
1✔
40

41
    /**
42
     * This is a private inner class that shall not be exposed. **Do not make it
43
     * public or protected.** It serves only the purpose of unsafe operations
44
     * while reading a crate and may be specific to this implementation.
45
     */
46
    private static class RoCrateUnsafe extends RoCrate {
47

48
        public void addDataEntityWithoutRootHasPart(DataEntity entity) {
49
            this.metadataContext.checkEntity(entity);
1✔
50
            this.roCratePayload.addDataEntity(entity);
1✔
51
        }
1✔
52
    }
53

54
    /**
55
     * If the number of JSON entities in the crate is larger than this number,
56
     * parallelization will be used.
57
     */
58
    private static final int PARALLELIZATION_THRESHOLD = 100;
59

60
    private static final String FILE_PREVIEW_FILES = "ro-crate-preview_files";
61
    private static final String FILE_PREVIEW_HTML = "ro-crate-preview.html";
62
    private static final String FILE_METADATA_JSON = "ro-crate-metadata.json";
63

64
    protected static final String SPECIFICATION_PREFIX = "https://w3id.org/ro/crate/";
65

66
    protected static final String PROP_ABOUT = "about";
67
    protected static final String PROP_CONTEXT = "@context";
68
    protected static final String PROP_CONFORMS_TO = "conformsTo";
69
    protected static final String PROP_GRAPH = "@graph";
70
    protected static final String PROP_HAS_PART = "hasPart";
71
    protected static final String PROP_ID = "@id";
72

73
    private final ReaderStrategy reader;
74

75
    /**
     * Creates a reader which uses the given strategy to access crates.
     *
     * @param reader the strategy used to read metadata and content of a
     * crate (e.g. from a folder or a zip file); must not be {@code null}
     * @throws NullPointerException if {@code reader} is {@code null}
     */
    public RoCrateReader(ReaderStrategy reader) {
        // Fail fast: a missing strategy would otherwise only surface later as
        // an unexplained NullPointerException or a misleading log message.
        this.reader = Objects.requireNonNull(reader, "reader");
    }
1✔
78

79
    /**
80
     * This function will read the location (using one of the specified
81
     * strategies) and then build the relation between the entities.
82
     *
83
     * @param source the input stream to read from
84
     *
85
     * @return the read RO-crate
86
     */
87
    public RoCrate readCrate(InputStream source) {
88
        RoCrate result = null;
1✔
89
        if (reader instanceof StreamReaderStrategy streamReaderStrategy) {
1✔
90
            ObjectNode metadata = streamReaderStrategy.readMetadataJson(source);
1✔
91
            File content = streamReaderStrategy.readContent(source);
1✔
92
            HashSet<String> usedFiles = new HashSet<>();
1✔
93
            usedFiles.add(content.toPath().resolve(FILE_METADATA_JSON).toFile().getPath());
1✔
94
            usedFiles.add(content.toPath().resolve(FILE_PREVIEW_HTML).toFile().getPath());
1✔
95
            usedFiles.add(content.toPath().resolve(FILE_PREVIEW_FILES).toFile().getPath());
1✔
96
            result = rebuildCrate(metadata, content, usedFiles);
1✔
97
        } else {
1✔
NEW
98
            logger.error("Provided writer does not implement StreamReaderStrategy. Please use 'readCrate(String location)'.");
×
99
        }
100
        return result;
1✔
101
    }
102

103
    /**
104
     * This function will read the location (using one of the specified
105
     * strategies) and then build the relation between the entities.
106
     *
107
     * @param location the location of the ro-crate to be read
108
     * @return the read RO-crate
109
     */
110
    public RoCrate readCrate(String location) {
111
        // get the ro-crate-medata.json
112
        ObjectNode metadataJson = reader.readMetadataJson(location);
1✔
113
        // get the content of the crate
114
        File files = reader.readContent(location);
1✔
115

116
        // this set will contain the files that are associated with entities
117
        HashSet<String> usedFiles = new HashSet<>();
1✔
118
        usedFiles.add(files.toPath().resolve(FILE_METADATA_JSON).toFile().getPath());
1✔
119
        usedFiles.add(files.toPath().resolve(FILE_PREVIEW_HTML).toFile().getPath());
1✔
120
        usedFiles.add(files.toPath().resolve(FILE_PREVIEW_FILES).toFile().getPath());
1✔
121
        return rebuildCrate(metadataJson, files, usedFiles);
1✔
122
    }
123

124
    private RoCrate rebuildCrate(ObjectNode metadataJson, File files, HashSet<String> usedFiles) {
125
        if (metadataJson == null) {
1✔
NEW
126
            logger.error("Metadata JSON is null, cannot rebuild crate");
×
NEW
127
            return null;
×
128
        }
129
        if (files == null) {
1✔
NEW
130
            logger.error("Content files directory is null, cannot rebuild crate");
×
NEW
131
            return null;
×
132
        }
133
        JsonNode context = metadataJson.get(PROP_CONTEXT);
1✔
134

135
        CrateMetadataContext crateContext = new RoCrateMetadataContext(context);
1✔
136
        RoCrateUnsafe crate = new RoCrateUnsafe();
1✔
137
        crate.setMetadataContext(crateContext);
1✔
138
        JsonNode graph = metadataJson.get(PROP_GRAPH);
1✔
139

140
        if (graph.isArray()) {
1✔
141
            moveRootEntitiesFromGraphToCrate(crate, (ArrayNode) graph);
1✔
142
            RootDataEntity root = crate.getRootDataEntity();
1✔
143
            if (root != null) {
1✔
144
                Set<String> dataEntityIds = getDataEntityIds(root, graph);
1✔
145
                for (JsonNode entityJson : graph) {
1✔
146
                    String eId = unpackId(entityJson);
1✔
147
                    if (dataEntityIds.contains(eId)) {
1✔
148
                        // data entity
149
                        DataEntity.DataEntityBuilder dataEntity = new DataEntity.DataEntityBuilder()
1✔
150
                                .setAll(entityJson.deepCopy());
1✔
151

152
                        // Handle data entities with corresponding file
153
                        checkFolderHasFile(entityJson.get(PROP_ID).asText(), files).ifPresent(file -> {
1✔
154
                            usedFiles.add(file.getPath());
1✔
155
                            dataEntity.setLocationWithExceptions(file.toPath())
1✔
156
                                    .setId(file.getName());
1✔
157
                        });
1✔
158

159
                        crate.addDataEntityWithoutRootHasPart(dataEntity.build());
1✔
160
                    } else {
1✔
161
                        // contextual entity
162
                        crate.addContextualEntity(
1✔
163
                                new ContextualEntity.ContextualEntityBuilder()
164
                                        .setAll(entityJson.deepCopy())
1✔
165
                                        .build());
1✔
166
                    }
167
                }
1✔
168
            }
169
        }
170

171
        Collection<File> untrackedFiles = Arrays.stream(
1✔
172
                Optional.ofNullable(files.listFiles()).orElse(new File[0]))
1✔
173
                .filter(f -> !usedFiles.contains(f.getPath()))
1✔
174
                .collect(Collectors.toSet());
1✔
175

176
        crate.setUntrackedFiles(untrackedFiles);
1✔
177
        Validator defaultValidation = new Validator(new JsonSchemaValidation());
1✔
178
        defaultValidation.validate(crate);
1✔
179
        return crate;
1✔
180
    }
181

182
    /**
183
     * Extracts graph connections from top to bottom.
184
     * <p>
185
     * Example: (connections.get(parent) -> children)
186
     *
187
     * @param graph the ArrayNode with all Entities.
188
     * @return the graph connections.
189
     */
190
    protected Map<String, Set<String>> makeEntityGraph(JsonNode graph) {
191
        Map<String, Set<String>> connections = new HashMap<>();
1✔
192

193
        Map<String, JsonNode> idToNodes = new HashMap<>();
1✔
194
        StreamSupport.stream(graph.spliterator(), false)
1✔
195
                .forEach(jsonNode -> idToNodes.put(unpackId(jsonNode), jsonNode));
1✔
196

197
        for (JsonNode entityNode : graph) {
1✔
198
            String currentId = unpackId(entityNode);
1✔
199
            StreamSupport.stream(entityNode.path("hasPart").spliterator(), false)
1✔
200
                    .map(this::unpackId)
1✔
201
                    .map(s -> idToNodes.getOrDefault(s, null))
1✔
202
                    .filter(Objects::nonNull)
1✔
203
                    .forEach(child -> connections.computeIfAbsent(currentId, key -> new HashSet<>())
1✔
204
                    .add(unpackId(child)));
1✔
205
            StreamSupport.stream(entityNode.path("isPartOf").spliterator(), false)
1✔
206
                    .map(this::unpackId)
1✔
207
                    .map(s -> idToNodes.getOrDefault(s, null))
1✔
208
                    .filter(Objects::nonNull)
1✔
209
                    .forEach(parent -> connections.computeIfAbsent(unpackId(parent), key -> new HashSet<>())
1✔
210
                    .add(currentId));
1✔
211
        }
1✔
212
        return connections;
1✔
213
    }
214

215
    protected Set<String> getDataEntityIds(RootDataEntity root, JsonNode graph) {
216
        if (root == null) {
1✔
NEW
217
            return Set.of();
×
218
        }
219
        Map<String, Set<String>> network = makeEntityGraph(graph);
1✔
220
        Set<String> directDataEntities = new HashSet<>(root.hasPart);
1✔
221

222
        Stack<String> processingQueue = new Stack<>();
1✔
223
        processingQueue.addAll(directDataEntities);
1✔
224
        Set<String> result = new HashSet<>();
1✔
225

226
        while (!processingQueue.empty()) {
1✔
227
            String currentId = processingQueue.pop();
1✔
228
            result.add(currentId);
1✔
229
            network.getOrDefault(currentId, new HashSet<>()).stream()
1✔
230
                    .filter(subId -> !result.contains(subId)) // avoid loops!
1✔
231
                    .forEach(subId -> {
1✔
232
                        result.add(subId);
1✔
233
                        processingQueue.add(subId);
1✔
234
                    });
1✔
235
        }
1✔
236
        return result;
1✔
237
    }
238

239
    protected String unpackId(JsonNode node) {
240
        if (node.isTextual()) {
1✔
241
            return node.asText();
1✔
242
        } else /*if (node.isObject())*/ {
243
            return node.path(PROP_ID).asText();
1✔
244
        }
245
    }
246

247
    protected Optional<File> checkFolderHasFile(String filepathOrId, File folder) {
248
        if (IdentifierUtils.isUrl(filepathOrId)) {
1✔
NEW
249
            return Optional.empty();
×
250
        }
251
        return IdentifierUtils.decode(filepathOrId)
1✔
252
                .map(decoded -> folder.toPath().resolve(decoded).toFile())
1✔
253
                .filter(File::exists);
1✔
254
    }
255

256
    /**
257
     * Moves the descriptor and the root entity from the graph to the crate.
258
     * <p>
259
     * Extracts the root data entity and the Metadata File Descriptor from the
260
     * graph and inserts them into the crate object. It also deletes it from the
261
     * graph. We will need the root dataset to distinguish between data entities
262
     * and contextual entities.
263
     *
264
     * @param crate the crate, which will receive the entities, if available in
265
     * the graph.
266
     * @param graph the graph of the Metadata JSON file, where the entities are
267
     * extracted and removed from.
268
     */
269
    protected void moveRootEntitiesFromGraphToCrate(RoCrate crate, ArrayNode graph) {
270
        Optional<JsonNode> maybeDescriptor = getMetadataDescriptor(graph);
1✔
271

272
        maybeDescriptor.ifPresent(descriptor -> {
1✔
273
            setCrateDescriptor(crate, descriptor);
1✔
274
            JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, descriptor);
1✔
275

276
            Optional<ObjectNode> maybeRoot = extractRoot(graph, descriptor);
1✔
277

278
            maybeRoot.ifPresent(root -> {
1✔
279
                Set<String> hasPartIds = extractHasPartIds(root);
1✔
280

281
                crate.setRootDataEntity(
1✔
282
                        new RootDataEntity.RootDataEntityBuilder()
283
                                .setAll(root.deepCopy())
1✔
284
                                .setHasPart(hasPartIds)
1✔
285
                                .build());
1✔
286

287
                JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, root);
1✔
288
            });
1✔
289
        });
1✔
290
    }
1✔
291

292
    /**
293
     * Find the metadata descriptor.
294
     * <p>
295
     * Currently prefers algorithm of version 1.1 over the one of 1.2-DRAFT.
296
     *
297
     * @param graph the graph to search the descriptor in.
298
     * @return the metadata descriptor of the crate.
299
     */
300
    protected Optional<JsonNode> getMetadataDescriptor(ArrayNode graph) {
301
        boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
1✔
302
        // use the algorithm described here:
303
        // https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
304
        Optional<JsonNode> maybeDescriptor = StreamSupport.stream(graph.spliterator(), isParallel)
1✔
305
                // "2. if the conformsTo property is a URI that starts with
306
                // https://w3id.org/ro/crate/"
307
                .filter(node -> node.path(PROP_CONFORMS_TO).path(PROP_ID).asText().startsWith(SPECIFICATION_PREFIX))
1✔
308
                // "3. from this entity’s about object keep the @id URI as variable root"
309
                .filter(node -> node.path(PROP_ABOUT).path(PROP_ID).isTextual())
1✔
310
                // There should be only one descriptor. If multiple exist, we take the first
311
                // one.
312
                .findFirst();
1✔
313
        return maybeDescriptor.or(()
1✔
314
                -> // from https://www.researchobject.org/ro-crate/1.2-DRAFT/root-data-entity.html#finding-the-root-data-entity
315
                StreamSupport.stream(graph.spliterator(), isParallel)
1✔
316
                        .filter(node -> node.path(PROP_ID).asText().equals(FILE_METADATA_JSON))
1✔
317
                        .findFirst()
1✔
318
        );
319
    }
320

321
    /**
322
     * Extracts the root entity from the graph, using the information from the
323
     * descriptor.
324
     * <p>
325
     * Basically implements step 5 of the algorithm described here:
326
     * <a href="https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity">
327
     * https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
328
     * </a>
329
     *
330
     * @param graph the graph from the metadata JSON-LD file
331
     * @param descriptor the RO-Crate descriptor
332
     * @return the root entity, if found
333
     */
334
    private Optional<ObjectNode> extractRoot(ArrayNode graph, JsonNode descriptor) {
335
        String rootId = descriptor.get(PROP_ABOUT).get(PROP_ID).asText();
1✔
336
        boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
1✔
337
        return StreamSupport.stream(graph.spliterator(), isParallel)
1✔
338
                // root is an object (filter + conversion)
339
                .filter(JsonNode::isObject)
1✔
340
                .map(JsonNode::<ObjectNode>deepCopy)
1✔
341
                // "5. if the entity has an @id URI that matches root return it"
342
                .filter(node -> node.path(PROP_ID).asText().equals(rootId))
1✔
343
                .findFirst();
1✔
344
    }
345

346
    private Set<String> extractHasPartIds(ObjectNode root) {
347
        JsonNode hasPartNode = root.path(PROP_HAS_PART);
1✔
348
        boolean isParallel = hasPartNode.isArray() && hasPartNode.size() > PARALLELIZATION_THRESHOLD;
1✔
349
        Set<String> hasPartIds = StreamSupport.stream(hasPartNode.spliterator(), isParallel)
1✔
350
                .map(hasPart -> hasPart.path(PROP_ID).asText())
1✔
351
                .filter(text -> !text.isBlank())
1✔
352
                .collect(Collectors.toSet());
1✔
353
        if (hasPartIds.isEmpty() && hasPartNode.path(PROP_ID).isTextual()) {
1✔
354
            hasPartIds.add(hasPartNode.path(PROP_ID).asText());
1✔
355
        }
356
        return hasPartIds;
1✔
357
    }
358

359
    private void setCrateDescriptor(RoCrate crate, JsonNode descriptor) {
360
        ContextualEntity descriptorEntity = new ContextualEntity.ContextualEntityBuilder()
1✔
361
                .setAll(descriptor.deepCopy())
1✔
362
                .build();
1✔
363
        crate.setJsonDescriptor(descriptorEntity);
1✔
364
    }
1✔
365
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc