• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

kit-data-manager / ro-crate-java / #395

29 Apr 2025 06:46PM UTC coverage: 89.357%. First build
#395

Pull #233

github

Pfeil
fix: remove unnecessary Javadoc build step from CI configuration
Pull Request #233: Version 2.1.0

608 of 700 new or added lines in 28 files covered. (86.86%)

1889 of 2114 relevant lines covered (89.36%)

0.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.6
/src/main/java/edu/kit/datamanager/ro_crate/reader/CrateReader.java
1
package edu.kit.datamanager.ro_crate.reader;
2

3
import com.fasterxml.jackson.databind.JsonNode;
4
import com.fasterxml.jackson.databind.node.ArrayNode;
5
import com.fasterxml.jackson.databind.node.ObjectNode;
6
import edu.kit.datamanager.ro_crate.RoCrate;
7
import edu.kit.datamanager.ro_crate.context.CrateMetadataContext;
8
import edu.kit.datamanager.ro_crate.context.RoCrateMetadataContext;
9
import edu.kit.datamanager.ro_crate.entities.contextual.ContextualEntity;
10
import edu.kit.datamanager.ro_crate.entities.data.DataEntity;
11
import edu.kit.datamanager.ro_crate.entities.data.RootDataEntity;
12
import edu.kit.datamanager.ro_crate.special.IdentifierUtils;
13
import edu.kit.datamanager.ro_crate.special.JsonUtilFunctions;
14
import edu.kit.datamanager.ro_crate.validation.JsonSchemaValidation;
15
import edu.kit.datamanager.ro_crate.validation.Validator;
16
import org.slf4j.Logger;
17
import org.slf4j.LoggerFactory;
18

19
import java.io.File;
20
import java.nio.file.Path;
21
import java.util.*;
22
import java.util.stream.Collectors;
23
import java.util.stream.StreamSupport;
24

25
/**
26
 * This class allows reading crates from the outside into the library in order
27
 * to inspect or modify it.
28
 * <p>
29
 * The constructor takes a strategy to support different ways of importing the
30
 * crates (from zip, folder, etc.).
31
 * <p>
32
 * The reader considers "hasPart" and "isPartOf" properties and treats all
33
 * entities (in-)directly connected to the root entity ("./") as DataEntities.
34
 *
35
 * @param <T> the type of the location parameter
36
 */
37
public class CrateReader<T> {
38

39
    private static final Logger logger = LoggerFactory.getLogger(CrateReader.class);
1✔
40

41
    /**
42
     * This is a private inner class that shall not be exposed. **Do not make it
43
     * public or protected.** It serves only the purpose of unsafe operations
44
     * while reading a crate and may be specific to this implementation.
45
     */
46
    private static class RoCrateUnsafe extends RoCrate {
47

48
        public void addDataEntityWithoutRootHasPart(DataEntity entity) {
49
            this.metadataContext.checkEntity(entity);
1✔
50
            this.roCratePayload.addDataEntity(entity);
1✔
51
        }
1✔
52
    }
53

54
    /**
55
     * If the number of JSON entities in the crate is larger than this number,
56
     * parallelization will be used.
57
     */
58
    private static final int PARALLELIZATION_THRESHOLD = 100;
59

60
    private static final String FILE_PREVIEW_FILES = "ro-crate-preview_files";
61
    private static final String FILE_PREVIEW_HTML = "ro-crate-preview.html";
62
    private static final String FILE_METADATA_JSON = "ro-crate-metadata.json";
63

64
    protected static final String SPECIFICATION_PREFIX = "https://w3id.org/ro/crate/";
65

66
    protected static final String PROP_ABOUT = "about";
67
    protected static final String PROP_CONTEXT = "@context";
68
    protected static final String PROP_CONFORMS_TO = "conformsTo";
69
    protected static final String PROP_GRAPH = "@graph";
70
    protected static final String PROP_HAS_PART = "hasPart";
71
    protected static final String PROP_ID = "@id";
72

73
    private final GenericReaderStrategy<T> strategy;
74

75
    /**
     * Creates a reader that obtains the metadata JSON and the crate content
     * through the given strategy.
     *
     * @param strategy the strategy used to access the crate location; must
     * not be {@code null}
     * @throws NullPointerException if {@code strategy} is {@code null}
     */
    public CrateReader(GenericReaderStrategy<T> strategy) {
        // Fail here instead of with an anonymous NPE on the first readCrate() call.
        this.strategy = Objects.requireNonNull(strategy, "strategy must not be null");
    }
1✔
78

79
    /**
80
     * This function will read the location (using one of the specified
81
     * strategies) and then build the relation between the entities.
82
     *
83
     * @param location the location of the ro-crate to be read
84
     * @return the read RO-crate
85
     */
86
    public RoCrate readCrate(T location) {
87
        // get the ro-crate-metadata.json
88
        ObjectNode metadataJson = strategy.readMetadataJson(location);
1✔
89
        // get the content of the crate
90
        File files = strategy.readContent(location);
1✔
91

92
        // this set will contain the files that are associated with entities
93
        HashSet<String> usedFiles = new HashSet<>();
1✔
94
        usedFiles.add(files.toPath().resolve(FILE_METADATA_JSON).toFile().getPath());
1✔
95
        usedFiles.add(files.toPath().resolve(FILE_PREVIEW_HTML).toFile().getPath());
1✔
96
        usedFiles.add(files.toPath().resolve(FILE_PREVIEW_FILES).toFile().getPath());
1✔
97
        return rebuildCrate(metadataJson, files, usedFiles);
1✔
98
    }
99

100
    private RoCrate rebuildCrate(ObjectNode metadataJson, File files, HashSet<String> usedFiles) {
101
        Objects.requireNonNull(metadataJson,
1✔
102
                "metadataJson must not be null – did the strategy fail to locate 'ro-crate-metadata.json'?");
103
        Objects.requireNonNull(files,
1✔
104
                "files directory must not be null – check GenericReaderStrategy.readContent()");
105
        JsonNode context = metadataJson.get(PROP_CONTEXT);
1✔
106

107
        CrateMetadataContext crateContext = new RoCrateMetadataContext(context);
1✔
108
        RoCrateUnsafe crate = new RoCrateUnsafe();
1✔
109
        crate.setMetadataContext(crateContext);
1✔
110
        JsonNode graph = metadataJson.get(PROP_GRAPH);
1✔
111

112
        if (graph.isArray()) {
1✔
113
            moveRootEntitiesFromGraphToCrate(crate, (ArrayNode) graph);
1✔
114
            RootDataEntity root = crate.getRootDataEntity();
1✔
115
            if (root != null) {
1✔
116
                Set<String> dataEntityIds = getDataEntityIds(root, graph);
1✔
117
                for (JsonNode entityJson : graph) {
1✔
118
                    String eId = unpackId(entityJson);
1✔
119
                    if (dataEntityIds.contains(eId)) {
1✔
120
                        // data entity
121
                        DataEntity.DataEntityBuilder dataEntity = new DataEntity.DataEntityBuilder()
1✔
122
                                .setAll(entityJson.deepCopy());
1✔
123

124
                        // Handle data entities with corresponding file
125
                        checkFolderHasFile(entityJson.get(PROP_ID).asText(), files).ifPresent(file -> {
1✔
126
                            usedFiles.add(file.getPath());
1✔
127
                            dataEntity.setLocationWithExceptions(file.toPath())
1✔
128
                                    .setId(file.getName());
1✔
129
                        });
1✔
130

131
                        crate.addDataEntityWithoutRootHasPart(dataEntity.build());
1✔
132
                    } else {
1✔
133
                        // contextual entity
134
                        crate.addContextualEntity(
1✔
135
                                new ContextualEntity.ContextualEntityBuilder()
136
                                        .setAll(entityJson.deepCopy())
1✔
137
                                        .build());
1✔
138
                    }
139
                }
1✔
140
            }
141
        }
142

143
        Collection<File> untrackedFiles = Arrays.stream(
1✔
144
                Optional.ofNullable(files.listFiles()).orElse(new File[0]))
1✔
145
                .filter(f -> !usedFiles.contains(f.getPath()))
1✔
146
                .collect(Collectors.toSet());
1✔
147

148
        crate.setUntrackedFiles(untrackedFiles);
1✔
149
        Validator defaultValidation = new Validator(new JsonSchemaValidation());
1✔
150
        defaultValidation.validate(crate);
1✔
151
        return crate;
1✔
152
    }
153

154
    /**
155
     * Extracts graph connections from top to bottom.
156
     * <p>
157
     * Example: (connections.get(parent) -> children)
158
     *
159
     * @param graph the ArrayNode with all Entities.
160
     * @return the graph connections.
161
     */
162
    protected Map<String, Set<String>> makeEntityGraph(JsonNode graph) {
163
        Map<String, Set<String>> connections = new HashMap<>();
1✔
164

165
        Map<String, JsonNode> idToNodes = new HashMap<>();
1✔
166
        StreamSupport.stream(graph.spliterator(), false)
1✔
167
                .forEach(jsonNode -> idToNodes.put(unpackId(jsonNode), jsonNode));
1✔
168

169
        for (JsonNode entityNode : graph) {
1✔
170
            String currentId = unpackId(entityNode);
1✔
171
            StreamSupport.stream(entityNode.path("hasPart").spliterator(), false)
1✔
172
                    .map(this::unpackId)
1✔
173
                    .map(s -> idToNodes.getOrDefault(s, null))
1✔
174
                    .filter(Objects::nonNull)
1✔
175
                    .forEach(child -> connections.computeIfAbsent(currentId, key -> new HashSet<>())
1✔
176
                    .add(unpackId(child)));
1✔
177
            StreamSupport.stream(entityNode.path("isPartOf").spliterator(), false)
1✔
178
                    .map(this::unpackId)
1✔
179
                    .map(s -> idToNodes.getOrDefault(s, null))
1✔
180
                    .filter(Objects::nonNull)
1✔
181
                    .forEach(parent -> connections.computeIfAbsent(unpackId(parent), key -> new HashSet<>())
1✔
182
                    .add(currentId));
1✔
183
        }
1✔
184
        return connections;
1✔
185
    }
186

187
    protected Set<String> getDataEntityIds(RootDataEntity root, JsonNode graph) {
188
        if (root == null) {
1✔
NEW
189
            return Set.of();
×
190
        }
191
        Map<String, Set<String>> network = makeEntityGraph(graph);
1✔
192
        Set<String> directDataEntities = new HashSet<>(root.hasPart);
1✔
193

194
        Stack<String> processingQueue = new Stack<>();
1✔
195
        processingQueue.addAll(directDataEntities);
1✔
196
        Set<String> result = new HashSet<>();
1✔
197

198
        while (!processingQueue.empty()) {
1✔
199
            String currentId = processingQueue.pop();
1✔
200
            result.add(currentId);
1✔
201
            network.getOrDefault(currentId, new HashSet<>()).stream()
1✔
202
                    .filter(subId -> !result.contains(subId)) // avoid loops!
1✔
203
                    .forEach(subId -> {
1✔
204
                        result.add(subId);
1✔
205
                        processingQueue.add(subId);
1✔
206
                    });
1✔
207
        }
1✔
208
        return result;
1✔
209
    }
210

211
    protected String unpackId(JsonNode node) {
212
        if (node.isTextual()) {
1✔
213
            return node.asText();
1✔
214
        } else /*if (node.isObject())*/ {
215
            return node.path(PROP_ID).asText();
1✔
216
        }
217
    }
218

219
    protected Optional<File> checkFolderHasFile(String filepathOrId, File folder) {
220
        if (IdentifierUtils.isUrl(filepathOrId)) {
1✔
NEW
221
            return Optional.empty();
×
222
        }
223
        return IdentifierUtils.decode(filepathOrId)
1✔
224
                .map(decoded -> folder.toPath().resolve(decoded).normalize())
1✔
225
                // defence-in-depth: ensure we are still inside the crate folder
226
                .filter(resolved -> resolved.startsWith(folder.toPath()))
1✔
227
                .map(Path::toFile)
1✔
228
                .filter(File::exists);
1✔
229
    }
230

231
    /**
232
     * Moves the descriptor and the root entity from the graph to the crate.
233
     * <p>
234
     * Extracts the root data entity and the Metadata File Descriptor from the
235
     * graph and inserts them into the crate object. It also deletes it from the
236
     * graph. We will need the root dataset to distinguish between data entities
237
     * and contextual entities.
238
     *
239
     * @param crate the crate, which will receive the entities, if available in
240
     * the graph.
241
     * @param graph the graph of the Metadata JSON file, where the entities are
242
     * extracted and removed from.
243
     */
244
    protected void moveRootEntitiesFromGraphToCrate(RoCrate crate, ArrayNode graph) {
245
        Optional<JsonNode> maybeDescriptor = getMetadataDescriptor(graph);
1✔
246

247
        maybeDescriptor.ifPresent(descriptor -> {
1✔
248
            setCrateDescriptor(crate, descriptor);
1✔
249
            JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, descriptor);
1✔
250

251
            Optional<ObjectNode> maybeRoot = extractRoot(graph, descriptor);
1✔
252

253
            maybeRoot.ifPresent(root -> {
1✔
254
                Set<String> hasPartIds = extractHasPartIds(root);
1✔
255

256
                crate.setRootDataEntity(
1✔
257
                        new RootDataEntity.RootDataEntityBuilder()
258
                                .setAll(root.deepCopy())
1✔
259
                                .setHasPart(hasPartIds)
1✔
260
                                .build());
1✔
261

262
                JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, root);
1✔
263
            });
1✔
264
        });
1✔
265
    }
1✔
266

267
    /**
268
     * Find the metadata descriptor.
269
     * <p>
270
     * Currently prefers algorithm of version 1.1 over the one of 1.2-DRAFT.
271
     *
272
     * @param graph the graph to search the descriptor in.
273
     * @return the metadata descriptor of the crate.
274
     */
275
    protected Optional<JsonNode> getMetadataDescriptor(ArrayNode graph) {
276
        boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
1✔
277
        // use the algorithm described here:
278
        // https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
279
        Optional<JsonNode> maybeDescriptor = StreamSupport.stream(graph.spliterator(), isParallel)
1✔
280
                // "2. if the conformsTo property is a URI that starts with
281
                // https://w3id.org/ro/crate/"
282
                .filter(node -> node.path(PROP_CONFORMS_TO).path(PROP_ID).asText().startsWith(SPECIFICATION_PREFIX))
1✔
283
                // "3. from this entity’s about object keep the @id URI as variable root"
284
                .filter(node -> node.path(PROP_ABOUT).path(PROP_ID).isTextual())
1✔
285
                // There should be only one descriptor. If multiple exist, we take the first
286
                // one.
287
                .findFirst();
1✔
288
        return maybeDescriptor.or(()
1✔
289
                -> // from https://www.researchobject.org/ro-crate/1.2-DRAFT/root-data-entity.html#finding-the-root-data-entity
290
                StreamSupport.stream(graph.spliterator(), isParallel)
1✔
291
                        .filter(node -> node.path(PROP_ID).asText().equals(FILE_METADATA_JSON))
1✔
292
                        .findFirst()
1✔
293
        );
294
    }
295

296
    /**
297
     * Extracts the root entity from the graph, using the information from the
298
     * descriptor.
299
     * <p>
300
     * Basically implements step 5 of the algorithm described here:
301
     * <a href="https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity">
302
     * https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
303
     * </a>
304
     *
305
     * @param graph the graph from the metadata JSON-LD file
306
     * @param descriptor the RO-Crate descriptor
307
     * @return the root entity, if found
308
     */
309
    private Optional<ObjectNode> extractRoot(ArrayNode graph, JsonNode descriptor) {
310
        String rootId = descriptor.get(PROP_ABOUT).get(PROP_ID).asText();
1✔
311
        boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
1✔
312
        return StreamSupport.stream(graph.spliterator(), isParallel)
1✔
313
                // root is an object (filter + conversion)
314
                .filter(JsonNode::isObject)
1✔
315
                .map(JsonNode::<ObjectNode>deepCopy)
1✔
316
                // "5. if the entity has an @id URI that matches root return it"
317
                .filter(node -> node.path(PROP_ID).asText().equals(rootId))
1✔
318
                .findFirst();
1✔
319
    }
320

321
    private Set<String> extractHasPartIds(ObjectNode root) {
322
        JsonNode hasPartNode = root.path(PROP_HAS_PART);
1✔
323
        boolean isParallel = hasPartNode.isArray() && hasPartNode.size() > PARALLELIZATION_THRESHOLD;
1✔
324
        Set<String> hasPartIds = StreamSupport.stream(hasPartNode.spliterator(), isParallel)
1✔
325
                .map(hasPart -> hasPart.path(PROP_ID).asText())
1✔
326
                .filter(text -> !text.isBlank())
1✔
327
                .collect(Collectors.toSet());
1✔
328
        if (hasPartIds.isEmpty() && hasPartNode.path(PROP_ID).isTextual()) {
1✔
329
            hasPartIds.add(hasPartNode.path(PROP_ID).asText());
1✔
330
        }
331
        return hasPartIds;
1✔
332
    }
333

334
    private void setCrateDescriptor(RoCrate crate, JsonNode descriptor) {
335
        ContextualEntity descriptorEntity = new ContextualEntity.ContextualEntityBuilder()
1✔
336
                .setAll(descriptor.deepCopy())
1✔
337
                .build();
1✔
338
        crate.setJsonDescriptor(descriptorEntity);
1✔
339
    }
1✔
340
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc