• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

kit-data-manager / ro-crate-java / #356

15 Apr 2025 01:27PM UTC coverage: 87.033%. First build
#356

Pull #233

github

web-flow
Merge pull request #245 from kit-data-manager/241-getting-key-value-pairs-from-the-context

Fix #241: Getting key value pairs from the context
Pull Request #233: Version 2.1.0

538 of 676 new or added lines in 19 files covered. (79.59%)

1886 of 2167 relevant lines covered (87.03%)

0.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.01
/src/main/java/edu/kit/datamanager/ro_crate/reader/RoCrateReader.java
1
package edu.kit.datamanager.ro_crate.reader;
2

3
import com.fasterxml.jackson.databind.JsonNode;
4
import com.fasterxml.jackson.databind.node.ArrayNode;
5
import com.fasterxml.jackson.databind.node.ObjectNode;
6

7
import edu.kit.datamanager.ro_crate.RoCrate;
8
import edu.kit.datamanager.ro_crate.context.CrateMetadataContext;
9
import edu.kit.datamanager.ro_crate.context.RoCrateMetadataContext;
10
import edu.kit.datamanager.ro_crate.entities.contextual.ContextualEntity;
11
import edu.kit.datamanager.ro_crate.entities.data.DataEntity;
12
import edu.kit.datamanager.ro_crate.entities.data.RootDataEntity;
13
import edu.kit.datamanager.ro_crate.preview.model.ROCratePreviewModel.ROCrate;
14
import edu.kit.datamanager.ro_crate.special.IdentifierUtils;
15
import edu.kit.datamanager.ro_crate.special.JsonUtilFunctions;
16

17
import edu.kit.datamanager.ro_crate.validation.JsonSchemaValidation;
18
import edu.kit.datamanager.ro_crate.validation.Validator;
19

20
import java.io.File;
21
import java.io.InputStream;
22
import java.util.*;
23
import java.util.stream.Collectors;
24
import java.util.stream.StreamSupport;
25
import org.slf4j.Logger;
26
import org.slf4j.LoggerFactory;
27

28
/**
29
 * This class allows reading crates from the outside into the library in order
30
 * to inspect or modify it.
31
 * <p>
32
 * The constructor takes a strategy to support different ways of importing the
33
 * crates. (from zip, folder, etc.).
34
 * <p>
35
 * The reader considers "hasPart" and "isPartOf" properties and considers all
36
 * entities (in-)directly connected to the root entity ("./") as DataEntities.
37
 */
38
public class RoCrateReader {
39

40
    /** Logger shared by all reader instances; final so it cannot be reassigned. */
    private static final Logger logger = LoggerFactory.getLogger(RoCrateReader.class);
1✔
41

42
    /**
43
     * This is a private inner class that shall not be exposed. **Do not make it
44
     * public or protected.** It serves only the purpose of unsafe operations
45
     * while reading a crate and may be specific to this implementation.
46
     */
47
    private static class RoCrateUnsafe extends RoCrate {
48

49
        public void addDataEntityWithoutRootHasPart(DataEntity entity) {
50
            this.metadataContext.checkEntity(entity);
1✔
51
            this.roCratePayload.addDataEntity(entity);
1✔
52
        }
1✔
53
    }
54

55
    /**
56
     * If the number of JSON entities in the crate is larger than this number,
57
     * parallelization will be used.
58
     */
59
    private static final int PARALLELIZATION_THRESHOLD = 100;
60

61
    // Well-known entries of a crate folder. The readCrate methods resolve these
    // names against the content folder and register them as "used", so they do
    // not end up in the untracked files of the crate.
    private static final String FILE_PREVIEW_FILES = "ro-crate-preview_files";
62
    private static final String FILE_PREVIEW_HTML = "ro-crate-preview.html";
63
    // Also used as the fallback "@id" when searching the metadata descriptor.
    private static final String FILE_METADATA_JSON = "ro-crate-metadata.json";
64

65
    // The "conformsTo" id of the metadata descriptor has to start with this prefix.
    protected static final String SPECIFICATION_PREFIX = "https://w3id.org/ro/crate/";
66

67
    // Property names looked up in the metadata JSON document.
    protected static final String PROP_ABOUT = "about";
68
    protected static final String PROP_CONTEXT = "@context";
69
    protected static final String PROP_CONFORMS_TO = "conformsTo";
70
    protected static final String PROP_GRAPH = "@graph";
71
    protected static final String PROP_HAS_PART = "hasPart";
72
    protected static final String PROP_ID = "@id";
73

74
    // Strategy that provides the metadata JSON and the content folder of a crate.
    private final ReaderStrategy reader;
75

76
    /**
     * Creates a reader that imports crates using the given strategy.
     *
     * @param reader the strategy used to obtain the metadata JSON and the
     * content of a crate (from zip, folder, etc.)
     */
    public RoCrateReader(ReaderStrategy reader) {
        this.reader = reader;
    }
1✔
79

80
    /**
81
     * This function will read the location (using one of the specified
82
     * strategies) and then build the relation between the entities.
83
     *
84
     * @param source the input stream to read from
85
     *
86
     * @return the read RO-crate
87
     */
88
    public RoCrate readCrate(InputStream source) {
89
        RoCrate result = null;
1✔
90
        if (reader instanceof StreamReaderStrategy streamReaderStrategy) {
1✔
91
            ObjectNode metadata = streamReaderStrategy.readMetadataJson(source);
1✔
92
            File content = streamReaderStrategy.readContent(source);
1✔
93
            HashSet<String> usedFiles = new HashSet<>();
1✔
94
            usedFiles.add(content.toPath().resolve(FILE_METADATA_JSON).toFile().getPath());
1✔
95
            usedFiles.add(content.toPath().resolve(FILE_PREVIEW_HTML).toFile().getPath());
1✔
96
            usedFiles.add(content.toPath().resolve(FILE_PREVIEW_FILES).toFile().getPath());
1✔
97
            result = rebuildCrate(metadata, content, usedFiles);
1✔
98
        } else {
1✔
NEW
99
            logger.error("Provided writer does not implement StreamWriterStrategy. Please use 'save(Crate crate, String destination)'.");
×
100
        }
101
        return result;
1✔
102
    }
103

104
    /**
105
     * This function will read the location (using one of the specified
106
     * strategies) and then build the relation between the entities.
107
     *
108
     * @param location the location of the ro-crate to be read
109
     * @return the read RO-crate
110
     */
111
    public RoCrate readCrate(String location) {
112
        // get the ro-crate-medata.json
113
        ObjectNode metadataJson = reader.readMetadataJson(location);
1✔
114
        // get the content of the crate
115
        File files = reader.readContent(location);
1✔
116

117
        // this set will contain the files that are associated with entities
118
        HashSet<String> usedFiles = new HashSet<>();
1✔
119
        usedFiles.add(files.toPath().resolve(FILE_METADATA_JSON).toFile().getPath());
1✔
120
        usedFiles.add(files.toPath().resolve(FILE_PREVIEW_HTML).toFile().getPath());
1✔
121
        usedFiles.add(files.toPath().resolve(FILE_PREVIEW_FILES).toFile().getPath());
1✔
122
        return rebuildCrate(metadataJson, files, usedFiles);
1✔
123
    }
124

125
    private RoCrate rebuildCrate(ObjectNode metadataJson, File files, HashSet<String> usedFiles) {
126
        JsonNode context = metadataJson.get(PROP_CONTEXT);
1✔
127

128
        CrateMetadataContext crateContext = new RoCrateMetadataContext(context);
1✔
129
        RoCrateUnsafe crate = new RoCrateUnsafe();
1✔
130
        crate.setMetadataContext(crateContext);
1✔
131
        JsonNode graph = metadataJson.get(PROP_GRAPH);
1✔
132

133
        if (graph.isArray()) {
1✔
134
            moveRootEntitiesFromGraphToCrate(crate, (ArrayNode) graph);
1✔
135
            RootDataEntity root = crate.getRootDataEntity();
1✔
136
            if (root != null) {
1✔
137
                Set<String> dataEntityIds = getDataEntityIds(root, graph);
1✔
138
                for (JsonNode entityJson : graph) {
1✔
139
                    String eId = unpackId(entityJson);
1✔
140
                    if (dataEntityIds.contains(eId)) {
1✔
141
                        // data entity
142
                        DataEntity.DataEntityBuilder dataEntity = new DataEntity.DataEntityBuilder()
1✔
143
                                .setAll(entityJson.deepCopy());
1✔
144

145
                        // Handle data entities with corresponding file
146
                        checkFolderHasFile(entityJson.get(PROP_ID).asText(), files).ifPresent(file -> {
1✔
147
                            usedFiles.add(file.getPath());
1✔
148
                            dataEntity.setLocationWithExceptions(file.toPath())
1✔
149
                                    .setId(file.getName());
1✔
150
                        });
1✔
151

152
                        crate.addDataEntityWithoutRootHasPart(dataEntity.build());
1✔
153
                    } else {
1✔
154
                        // contextual entity
155
                        crate.addContextualEntity(
1✔
156
                                new ContextualEntity.ContextualEntityBuilder()
157
                                        .setAll(entityJson.deepCopy())
1✔
158
                                        .build());
1✔
159
                    }
160
                }
1✔
161
            }
162
        }
163

164
        Collection<File> untrackedFiles = Arrays.stream(
1✔
165
                Optional.ofNullable(files.listFiles()).orElse(new File[0]))
1✔
166
                .filter(f -> !usedFiles.contains(f.getPath()))
1✔
167
                .collect(Collectors.toSet());
1✔
168

169
        crate.setUntrackedFiles(untrackedFiles);
1✔
170
        Validator defaultValidation = new Validator(new JsonSchemaValidation());
1✔
171
        defaultValidation.validate(crate);
1✔
172
        return crate;
1✔
173
    }
174

175
    /**
176
     * Extracts graph connections from top to bottom.
177
     * <p>
178
     * Example: (connections.get(parent) -> children)
179
     *
180
     * @param graph the ArrayNode with all Entities.
181
     * @return the graph connections.
182
     */
183
    protected Map<String, Set<String>> makeEntityGraph(JsonNode graph) {
184
        Map<String, Set<String>> connections = new HashMap<>();
1✔
185

186
        Map<String, JsonNode> idToNodes = new HashMap<>();
1✔
187
        StreamSupport.stream(graph.spliterator(), false)
1✔
188
                .forEach(jsonNode -> idToNodes.put(unpackId(jsonNode), jsonNode));
1✔
189

190
        for (JsonNode entityNode : graph) {
1✔
191
            String currentId = unpackId(entityNode);
1✔
192
            StreamSupport.stream(entityNode.path("hasPart").spliterator(), false)
1✔
193
                    .map(this::unpackId)
1✔
194
                    .map(s -> idToNodes.getOrDefault(s, null))
1✔
195
                    .filter(Objects::nonNull)
1✔
196
                    .forEach(child -> connections.computeIfAbsent(currentId, key -> new HashSet<>())
1✔
197
                    .add(unpackId(child)));
1✔
198
            StreamSupport.stream(entityNode.path("isPartOf").spliterator(), false)
1✔
199
                    .map(this::unpackId)
1✔
200
                    .map(s -> idToNodes.getOrDefault(s, null))
1✔
201
                    .filter(Objects::nonNull)
1✔
202
                    .forEach(parent -> connections.computeIfAbsent(unpackId(parent), key -> new HashSet<>())
1✔
203
                    .add(currentId));
1✔
204
        }
1✔
205
        return connections;
1✔
206
    }
207

208
    protected Set<String> getDataEntityIds(RootDataEntity root, JsonNode graph) {
209
        if (root == null) {
1✔
NEW
210
            return Set.of();
×
211
        }
212
        Map<String, Set<String>> network = makeEntityGraph(graph);
1✔
213
        Set<String> directDataEntities = new HashSet<>(root.hasPart);
1✔
214

215
        Stack<String> processingQueue = new Stack<>();
1✔
216
        processingQueue.addAll(directDataEntities);
1✔
217
        Set<String> result = new HashSet<>();
1✔
218

219
        while (!processingQueue.empty()) {
1✔
220
            String currentId = processingQueue.pop();
1✔
221
            result.add(currentId);
1✔
222
            network.getOrDefault(currentId, new HashSet<>()).stream()
1✔
223
                    .filter(subId -> !result.contains(subId)) // avoid loops!
1✔
224
                    .forEach(subId -> {
1✔
225
                        result.add(subId);
1✔
226
                        processingQueue.add(subId);
1✔
227
                    });
1✔
228
        }
1✔
229
        return result;
1✔
230
    }
231

232
    protected String unpackId(JsonNode node) {
233
        if (node.isTextual()) {
1✔
234
            return node.asText();
1✔
235
        } else /*if (node.isObject())*/ {
236
            return node.path(PROP_ID).asText();
1✔
237
        }
238
    }
239

240
    protected Optional<File> checkFolderHasFile(String filepathOrId, File folder) {
241
        if (IdentifierUtils.isUrl(filepathOrId)) {
1✔
NEW
242
            return Optional.empty();
×
243
        }
244
        return IdentifierUtils.decode(filepathOrId)
1✔
245
                .map(decoded -> folder.toPath().resolve(decoded).toFile())
1✔
246
                .filter(File::exists);
1✔
247
    }
248

249
    /**
250
     * Moves the descriptor and the root entity from the graph to the crate.
251
     * <p>
252
     * Extracts the root data entity and the Metadata File Descriptor from the
253
     * graph and inserts them into the crate object. It also deletes it from the
254
     * graph. We will need the root dataset to distinguish between data entities
255
     * and contextual entities.
256
     *
257
     * @param crate the crate, which will receive the entities, if available in
258
     * the graph.
259
     * @param graph the graph of the Metadata JSON file, where the entities are
260
     * extracted and removed from.
261
     */
262
    protected void moveRootEntitiesFromGraphToCrate(RoCrate crate, ArrayNode graph) {
263
        Optional<JsonNode> maybeDescriptor = getMetadataDescriptor(graph);
1✔
264

265
        maybeDescriptor.ifPresent(descriptor -> {
1✔
266
            setCrateDescriptor(crate, descriptor);
1✔
267
            JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, descriptor);
1✔
268

269
            Optional<ObjectNode> maybeRoot = extractRoot(graph, descriptor);
1✔
270

271
            maybeRoot.ifPresent(root -> {
1✔
272
                Set<String> hasPartIds = extractHasPartIds(root);
1✔
273

274
                crate.setRootDataEntity(
1✔
275
                        new RootDataEntity.RootDataEntityBuilder()
276
                                .setAll(root.deepCopy())
1✔
277
                                .setHasPart(hasPartIds)
1✔
278
                                .build());
1✔
279

280
                JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, root);
1✔
281
            });
1✔
282
        });
1✔
283
    }
1✔
284

285
    /**
286
     * Find the metadata descriptor.
287
     * <p>
288
     * Currently prefers algorithm of version 1.1 over the one of 1.2-DRAFT.
289
     *
290
     * @param graph the graph to search the descriptor in.
291
     * @return the metadata descriptor of the crate.
292
     */
293
    protected Optional<JsonNode> getMetadataDescriptor(ArrayNode graph) {
294
        boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
1✔
295
        // use the algorithm described here:
296
        // https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
297
        Optional<JsonNode> maybeDescriptor = StreamSupport.stream(graph.spliterator(), isParallel)
1✔
298
                // "2. if the conformsTo property is a URI that starts with
299
                // https://w3id.org/ro/crate/"
300
                .filter(node -> node.path(PROP_CONFORMS_TO).path(PROP_ID).asText().startsWith(SPECIFICATION_PREFIX))
1✔
301
                // "3. from this entity’s about object keep the @id URI as variable root"
302
                .filter(node -> node.path(PROP_ABOUT).path(PROP_ID).isTextual())
1✔
303
                // There should be only one descriptor. If multiple exist, we take the first
304
                // one.
305
                .findFirst();
1✔
306
        return maybeDescriptor.or(()
1✔
307
                -> // from https://www.researchobject.org/ro-crate/1.2-DRAFT/root-data-entity.html#finding-the-root-data-entity
308
                StreamSupport.stream(graph.spliterator(), isParallel)
1✔
309
                        .filter(node -> node.path(PROP_ID).asText().equals(FILE_METADATA_JSON))
1✔
310
                        .findFirst()
1✔
311
        );
312
    }
313

314
    /**
315
     * Extracts the root entity from the graph, using the information from the
316
     * descriptor.
317
     * <p>
318
     * Basically implements step 5 of the algorithm described here:
319
     * <a href="https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity">
320
     * https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
321
     * </a>
322
     *
323
     * @param graph the graph from the metadata JSON-LD file
324
     * @param descriptor the RO-Crate descriptor
325
     * @return the root entity, if found
326
     */
327
    private Optional<ObjectNode> extractRoot(ArrayNode graph, JsonNode descriptor) {
328
        String rootId = descriptor.get(PROP_ABOUT).get(PROP_ID).asText();
1✔
329
        boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
1✔
330
        return StreamSupport.stream(graph.spliterator(), isParallel)
1✔
331
                // root is an object (filter + conversion)
332
                .filter(JsonNode::isObject)
1✔
333
                .map(JsonNode::<ObjectNode>deepCopy)
1✔
334
                // "5. if the entity has an @id URI that matches root return it"
335
                .filter(node -> node.path(PROP_ID).asText().equals(rootId))
1✔
336
                .findFirst();
1✔
337
    }
338

339
    private Set<String> extractHasPartIds(ObjectNode root) {
340
        JsonNode hasPartNode = root.path(PROP_HAS_PART);
1✔
341
        boolean isParallel = hasPartNode.isArray() && hasPartNode.size() > PARALLELIZATION_THRESHOLD;
1✔
342
        Set<String> hasPartIds = StreamSupport.stream(hasPartNode.spliterator(), isParallel)
1✔
343
                .map(hasPart -> hasPart.path(PROP_ID).asText())
1✔
344
                .filter(text -> !text.isBlank())
1✔
345
                .collect(Collectors.toSet());
1✔
346
        if (hasPartIds.isEmpty() && hasPartNode.path(PROP_ID).isTextual()) {
1✔
347
            hasPartIds.add(hasPartNode.path(PROP_ID).asText());
1✔
348
        }
349
        return hasPartIds;
1✔
350
    }
351

352
    private void setCrateDescriptor(RoCrate crate, JsonNode descriptor) {
353
        ContextualEntity descriptorEntity = new ContextualEntity.ContextualEntityBuilder()
1✔
354
                .setAll(descriptor.deepCopy())
1✔
355
                .build();
1✔
356
        crate.setJsonDescriptor(descriptorEntity);
1✔
357
    }
1✔
358
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc