• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

kit-data-manager / ro-crate-java / #432

19 May 2025 03:13PM UTC coverage: 90.791% (+0.6%) from 90.169%
#432

Pull #258

github

web-flow
Merge 1e2da1171 into 810d1995c
Pull Request #258: Support .ELN-style crates in all zip readers and writers

235 of 253 new or added lines in 25 files covered. (92.89%)

1 existing line in 1 file now uncovered.

1962 of 2161 relevant lines covered (90.79%)

0.91 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.3
/src/main/java/edu/kit/datamanager/ro_crate/reader/CrateReader.java
1
package edu.kit.datamanager.ro_crate.reader;
2

3
import com.fasterxml.jackson.databind.JsonNode;
4
import com.fasterxml.jackson.databind.node.ArrayNode;
5
import com.fasterxml.jackson.databind.node.ObjectNode;
6
import edu.kit.datamanager.ro_crate.RoCrate;
7
import edu.kit.datamanager.ro_crate.context.CrateMetadataContext;
8
import edu.kit.datamanager.ro_crate.context.RoCrateMetadataContext;
9
import edu.kit.datamanager.ro_crate.entities.contextual.ContextualEntity;
10
import edu.kit.datamanager.ro_crate.entities.data.DataEntity;
11
import edu.kit.datamanager.ro_crate.entities.data.RootDataEntity;
12
import edu.kit.datamanager.ro_crate.special.IdentifierUtils;
13
import edu.kit.datamanager.ro_crate.special.JsonUtilFunctions;
14
import edu.kit.datamanager.ro_crate.validation.JsonSchemaValidation;
15
import edu.kit.datamanager.ro_crate.validation.Validator;
16
import org.slf4j.Logger;
17
import org.slf4j.LoggerFactory;
18

19
import java.io.File;
20
import java.io.IOException;
21
import java.nio.file.Path;
22
import java.util.*;
23
import java.util.stream.Collectors;
24
import java.util.stream.StreamSupport;
25

26
/**
27
 * This class allows reading crates from the outside into the library in order
28
 * to inspect or modify them.
29
 * <p>
30
 * The constructor takes a strategy to support different ways of importing the
31
 * crates. (from zip, folder, etc.).
32
 * <p>
33
 * The reader considers "hasPart" and "isPartOf" properties and considers all
34
 * entities (in-)directly connected to the root entity ("./") as DataEntities.
35
 *
36
 * @param <T> the type of the location parameter
37
 */
38
public class CrateReader<T> {
39

40
    private static final Logger logger = LoggerFactory.getLogger(CrateReader.class);
1✔
41

42
    /**
43
     * This is a private inner class that shall not be exposed. **Do not make it
44
     * public or protected.** It serves only the purpose of unsafe operations
45
     * while reading a crate and may be specific to this implementation.
46
     */
47
    private static class RoCrateUnsafe extends RoCrate {
48

49
        public void addDataEntityWithoutRootHasPart(DataEntity entity) {
50
            this.metadataContext.checkEntity(entity);
1✔
51
            this.roCratePayload.addDataEntity(entity);
1✔
52
        }
1✔
53
    }
54

55
    /**
56
     * If the number of JSON entities in the crate is larger than this number,
57
     * parallelization will be used.
58
     */
59
    private static final int PARALLELIZATION_THRESHOLD = 100;
60

61
    private static final String FILE_PREVIEW_FILES = "ro-crate-preview_files";
62
    private static final String FILE_PREVIEW_HTML = "ro-crate-preview.html";
63
    private static final String FILE_METADATA_JSON = "ro-crate-metadata.json";
64

65
    protected static final String SPECIFICATION_PREFIX = "https://w3id.org/ro/crate/";
66

67
    protected static final String PROP_ABOUT = "about";
68
    protected static final String PROP_CONTEXT = "@context";
69
    protected static final String PROP_CONFORMS_TO = "conformsTo";
70
    protected static final String PROP_GRAPH = "@graph";
71
    protected static final String PROP_HAS_PART = "hasPart";
72
    protected static final String PROP_ID = "@id";
73

74
    private final GenericReaderStrategy<T> strategy;
75

76
    /**
     * Creates a reader which uses the given strategy to obtain the metadata
     * document and the content folder of a crate.
     *
     * @param strategy the strategy used to access crate locations of type {@code T}
     */
    public CrateReader(GenericReaderStrategy<T> strategy) {
        this.strategy = strategy;
    }
1✔
79

80
    /**
81
     * This function will read the location (using one of the specified
82
     * strategies) and then build the relation between the entities.
83
     *
84
     * @param location the location of the ro-crate to be read
85
     * @return the read RO-crate
86
     */
87
    public RoCrate readCrate(T location) throws IOException {
88
        // get the ro-crate-metadata.json
89
        ObjectNode metadataJson = strategy.readMetadataJson(location);
1✔
90
        // get the content of the crate
91
        File files = strategy.readContent(location);
1✔
92

93
        // this set will contain the files that are associated with entities
94
        HashSet<String> usedFiles = new HashSet<>();
1✔
95
        usedFiles.add(files.toPath().resolve(FILE_METADATA_JSON).toFile().getPath());
1✔
96
        usedFiles.add(files.toPath().resolve(FILE_PREVIEW_HTML).toFile().getPath());
1✔
97
        usedFiles.add(files.toPath().resolve(FILE_PREVIEW_FILES).toFile().getPath());
1✔
98
        return rebuildCrate(metadataJson, files, usedFiles);
1✔
99
    }
100

101
    private RoCrate rebuildCrate(ObjectNode metadataJson, File files, HashSet<String> usedFiles) {
102
        Objects.requireNonNull(metadataJson,
1✔
103
                "metadataJson must not be null – did the strategy fail to locate 'ro-crate-metadata.json'?");
104
        Objects.requireNonNull(files,
1✔
105
                "files directory must not be null – check GenericReaderStrategy.readContent()");
106
        JsonNode context = metadataJson.get(PROP_CONTEXT);
1✔
107

108
        CrateMetadataContext crateContext = new RoCrateMetadataContext(context);
1✔
109
        RoCrateUnsafe crate = new RoCrateUnsafe();
1✔
110
        crate.setMetadataContext(crateContext);
1✔
111
        JsonNode graph = metadataJson.get(PROP_GRAPH);
1✔
112

113
        if (graph.isArray()) {
1✔
114
            moveRootEntitiesFromGraphToCrate(crate, (ArrayNode) graph);
1✔
115
            RootDataEntity root = crate.getRootDataEntity();
1✔
116
            if (root != null) {
1✔
117
                Set<String> dataEntityIds = getDataEntityIds(root, graph);
1✔
118
                for (JsonNode entityJson : graph) {
1✔
119
                    String eId = unpackId(entityJson);
1✔
120
                    if (dataEntityIds.contains(eId)) {
1✔
121
                        // data entity
122
                        DataEntity.DataEntityBuilder dataEntity = new DataEntity.DataEntityBuilder()
1✔
123
                                .setAllUnsafe(entityJson.deepCopy());
1✔
124

125
                        // Handle data entities with corresponding file
126
                        checkFolderHasFile(entityJson.get(PROP_ID).asText(), files).ifPresent(file -> {
1✔
127
                            usedFiles.add(file.getPath());
1✔
128
                            dataEntity.setLocationWithExceptions(file.toPath())
1✔
129
                                    .setId(file.getName());
1✔
130
                        });
1✔
131

132
                        crate.addDataEntityWithoutRootHasPart(dataEntity.build());
1✔
133
                    } else {
1✔
134
                        // contextual entity
135
                        crate.addContextualEntity(
1✔
136
                                new ContextualEntity.ContextualEntityBuilder()
137
                                        .setAllUnsafe(entityJson.deepCopy())
1✔
138
                                        .build());
1✔
139
                    }
140
                }
1✔
141
            }
142
        }
143

144
        Collection<File> untrackedFiles = Arrays.stream(
1✔
145
                Optional.ofNullable(files.listFiles()).orElse(new File[0]))
1✔
146
                .filter(f -> !usedFiles.contains(f.getPath()))
1✔
147
                .collect(Collectors.toSet());
1✔
148

149
        crate.setUntrackedFiles(untrackedFiles);
1✔
150
        Validator defaultValidation = new Validator(new JsonSchemaValidation());
1✔
151
        defaultValidation.validate(crate);
1✔
152
        return crate;
1✔
153
    }
154

155
    /**
156
     * Extracts graph connections from top to bottom.
157
     * <p>
158
     * Example: (connections.get(parent) -> children)
159
     *
160
     * @param graph the ArrayNode with all Entities.
161
     * @return the graph connections.
162
     */
163
    protected Map<String, Set<String>> makeEntityGraph(JsonNode graph) {
164
        Map<String, Set<String>> connections = new HashMap<>();
1✔
165

166
        Map<String, JsonNode> idToNodes = new HashMap<>();
1✔
167
        StreamSupport.stream(graph.spliterator(), false)
1✔
168
                .forEach(jsonNode -> idToNodes.put(unpackId(jsonNode), jsonNode));
1✔
169

170
        for (JsonNode entityNode : graph) {
1✔
171
            String currentId = unpackId(entityNode);
1✔
172
            StreamSupport.stream(entityNode.path("hasPart").spliterator(), false)
1✔
173
                    .map(this::unpackId)
1✔
174
                    .map(s -> idToNodes.getOrDefault(s, null))
1✔
175
                    .filter(Objects::nonNull)
1✔
176
                    .forEach(child -> connections.computeIfAbsent(currentId, key -> new HashSet<>())
1✔
177
                    .add(unpackId(child)));
1✔
178
            StreamSupport.stream(entityNode.path("isPartOf").spliterator(), false)
1✔
179
                    .map(this::unpackId)
1✔
180
                    .map(s -> idToNodes.getOrDefault(s, null))
1✔
181
                    .filter(Objects::nonNull)
1✔
182
                    .forEach(parent -> connections.computeIfAbsent(unpackId(parent), key -> new HashSet<>())
1✔
183
                    .add(currentId));
1✔
184
        }
1✔
185
        return connections;
1✔
186
    }
187

188
    protected Set<String> getDataEntityIds(RootDataEntity root, JsonNode graph) {
189
        if (root == null) {
1✔
UNCOV
190
            return Set.of();
×
191
        }
192
        Map<String, Set<String>> network = makeEntityGraph(graph);
1✔
193
        Set<String> directDataEntities = new HashSet<>(root.hasPart);
1✔
194

195
        Stack<String> processingQueue = new Stack<>();
1✔
196
        processingQueue.addAll(directDataEntities);
1✔
197
        Set<String> result = new HashSet<>();
1✔
198

199
        while (!processingQueue.empty()) {
1✔
200
            String currentId = processingQueue.pop();
1✔
201
            result.add(currentId);
1✔
202
            network.getOrDefault(currentId, new HashSet<>()).stream()
1✔
203
                    .filter(subId -> !result.contains(subId)) // avoid loops!
1✔
204
                    .forEach(subId -> {
1✔
205
                        result.add(subId);
1✔
206
                        processingQueue.add(subId);
1✔
207
                    });
1✔
208
        }
1✔
209
        return result;
1✔
210
    }
211

212
    protected String unpackId(JsonNode node) {
213
        if (node.isTextual()) {
1✔
214
            return node.asText();
1✔
215
        } else /*if (node.isObject())*/ {
216
            return node.path(PROP_ID).asText();
1✔
217
        }
218
    }
219

220
    protected Optional<File> checkFolderHasFile(String filepathOrId, File folder) {
221
        if (IdentifierUtils.isUrl(filepathOrId)) {
1✔
222
            return Optional.empty();
1✔
223
        }
224
        return IdentifierUtils.decode(filepathOrId)
1✔
225
                .map(decoded -> folder.toPath().resolve(decoded).normalize())
1✔
226
                // defence-in-depth: ensure we are still inside the crate folder
227
                .filter(resolved -> resolved.startsWith(folder.toPath()))
1✔
228
                .map(Path::toFile)
1✔
229
                .filter(File::exists);
1✔
230
    }
231

232
    /**
233
     * Moves the descriptor and the root entity from the graph to the crate.
234
     * <p>
235
     * Extracts the root data entity and the Metadata File Descriptor from the
236
     * graph and inserts them into the crate object. It also deletes them from the
237
     * graph. We will need the root dataset to distinguish between data entities
238
     * and contextual entities.
239
     *
240
     * @param crate the crate, which will receive the entities, if available in
241
     * the graph.
242
     * @param graph the graph of the Metadata JSON file, where the entities are
243
     * extracted and removed from.
244
     */
245
    protected void moveRootEntitiesFromGraphToCrate(RoCrate crate, ArrayNode graph) {
246
        Optional<JsonNode> maybeDescriptor = getMetadataDescriptor(graph);
1✔
247

248
        maybeDescriptor.ifPresent(descriptor -> {
1✔
249
            setCrateDescriptor(crate, descriptor);
1✔
250
            JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, descriptor);
1✔
251

252
            Optional<ObjectNode> maybeRoot = extractRoot(graph, descriptor);
1✔
253

254
            maybeRoot.ifPresent(root -> {
1✔
255
                Set<String> hasPartIds = extractHasPartIds(root);
1✔
256

257
                crate.setRootDataEntity(
1✔
258
                        new RootDataEntity.RootDataEntityBuilder()
259
                                .setAllUnsafe(root.deepCopy())
1✔
260
                                .setHasPart(hasPartIds)
1✔
261
                                .build());
1✔
262

263
                JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, root);
1✔
264
            });
1✔
265
        });
1✔
266
    }
1✔
267

268
    /**
269
     * Find the metadata descriptor.
270
     * <p>
271
     * Currently prefers the algorithm of version 1.1 over the one of 1.2-DRAFT.
272
     *
273
     * @param graph the graph to search the descriptor in.
274
     * @return the metadata descriptor of the crate.
275
     */
276
    protected Optional<JsonNode> getMetadataDescriptor(ArrayNode graph) {
277
        boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
1✔
278
        // use the algorithm described here:
279
        // https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
280
        Optional<JsonNode> maybeDescriptor = StreamSupport.stream(graph.spliterator(), isParallel)
1✔
281
                // "2. if the conformsTo property is a URI that starts with
282
                // https://w3id.org/ro/crate/"
283
                .filter(node -> node.path(PROP_CONFORMS_TO).path(PROP_ID).asText().startsWith(SPECIFICATION_PREFIX))
1✔
284
                // "3. from this entity’s about object keep the @id URI as variable root"
285
                .filter(node -> node.path(PROP_ABOUT).path(PROP_ID).isTextual())
1✔
286
                // There should be only one descriptor. If multiple exist, we take the first
287
                // one.
288
                .findFirst();
1✔
289
        return maybeDescriptor.or(()
1✔
290
                -> // from https://www.researchobject.org/ro-crate/1.2-DRAFT/root-data-entity.html#finding-the-root-data-entity
291
                StreamSupport.stream(graph.spliterator(), isParallel)
1✔
292
                        .filter(node -> node.path(PROP_ID).asText().equals(FILE_METADATA_JSON))
1✔
293
                        .findFirst()
1✔
294
        );
295
    }
296

297
    /**
298
     * Extracts the root entity from the graph, using the information from the
299
     * descriptor.
300
     * <p>
301
     * Basically implements step 5 of the algorithm described here:
302
     * <a href="https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity">
303
     * https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
304
     * </a>
305
     *
306
     * @param graph the graph from the metadata JSON-LD file
307
     * @param descriptor the RO-Crate descriptor
308
     * @return the root entity, if found
309
     */
310
    private Optional<ObjectNode> extractRoot(ArrayNode graph, JsonNode descriptor) {
311
        String rootId = descriptor.get(PROP_ABOUT).get(PROP_ID).asText();
1✔
312
        boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
1✔
313
        return StreamSupport.stream(graph.spliterator(), isParallel)
1✔
314
                // root is an object (filter + conversion)
315
                .filter(JsonNode::isObject)
1✔
316
                .map(JsonNode::<ObjectNode>deepCopy)
1✔
317
                // "5. if the entity has an @id URI that matches root return it"
318
                .filter(node -> node.path(PROP_ID).asText().equals(rootId))
1✔
319
                .findFirst();
1✔
320
    }
321

322
    private Set<String> extractHasPartIds(ObjectNode root) {
323
        JsonNode hasPartNode = root.path(PROP_HAS_PART);
1✔
324
        boolean isParallel = hasPartNode.isArray() && hasPartNode.size() > PARALLELIZATION_THRESHOLD;
1✔
325
        Set<String> hasPartIds = StreamSupport.stream(hasPartNode.spliterator(), isParallel)
1✔
326
                .map(hasPart -> hasPart.path(PROP_ID).asText())
1✔
327
                .filter(text -> !text.isBlank())
1✔
328
                .collect(Collectors.toSet());
1✔
329
        if (hasPartIds.isEmpty() && hasPartNode.path(PROP_ID).isTextual()) {
1✔
330
            hasPartIds.add(hasPartNode.path(PROP_ID).asText());
1✔
331
        }
332
        return hasPartIds;
1✔
333
    }
334

335
    private void setCrateDescriptor(RoCrate crate, JsonNode descriptor) {
336
        ContextualEntity descriptorEntity = new ContextualEntity.ContextualEntityBuilder()
1✔
337
                .setAllUnsafe(descriptor.deepCopy())
1✔
338
                .build();
1✔
339
        crate.setJsonDescriptor(descriptorEntity);
1✔
340
    }
1✔
341
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc