• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

kit-data-manager / ro-crate-java / #373

28 Apr 2025 09:57AM CUT coverage: 86.891% (+0.9%) from 85.949%
#373

Pull #247

github

web-flow
Merge a4e0ad782 into 0ab3f3ca9
Pull Request #247: Generalize reading and writing crates

249 of 278 new or added lines in 16 files covered. (89.57%)

2 existing lines in 2 files now uncovered.

1889 of 2174 relevant lines covered (86.89%)

0.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.86
/src/main/java/edu/kit/datamanager/ro_crate/reader/CrateReader.java
1
package edu.kit.datamanager.ro_crate.reader;
2

3
import com.fasterxml.jackson.databind.JsonNode;
4
import com.fasterxml.jackson.databind.node.ArrayNode;
5
import com.fasterxml.jackson.databind.node.ObjectNode;
6
import edu.kit.datamanager.ro_crate.RoCrate;
7
import edu.kit.datamanager.ro_crate.context.CrateMetadataContext;
8
import edu.kit.datamanager.ro_crate.context.RoCrateMetadataContext;
9
import edu.kit.datamanager.ro_crate.entities.contextual.ContextualEntity;
10
import edu.kit.datamanager.ro_crate.entities.data.DataEntity;
11
import edu.kit.datamanager.ro_crate.entities.data.RootDataEntity;
12
import edu.kit.datamanager.ro_crate.special.IdentifierUtils;
13
import edu.kit.datamanager.ro_crate.special.JsonUtilFunctions;
14
import edu.kit.datamanager.ro_crate.validation.JsonSchemaValidation;
15
import edu.kit.datamanager.ro_crate.validation.Validator;
16
import org.slf4j.Logger;
17
import org.slf4j.LoggerFactory;
18

19
import java.io.File;
20
import java.util.*;
21
import java.util.stream.Collectors;
22
import java.util.stream.StreamSupport;
23

24
/**
25
 * This class allows reading crates from the outside into the library in order
26
 * to inspect or modify them.
27
 * <p>
28
 * The constructor takes a strategy to support different ways of importing the
29
 * crates (from zip, folder, etc.).
30
 * <p>
31
 * The reader considers "hasPart" and "isPartOf" properties and treats all
32
 * entities (in-)directly connected to the root entity ("./") as DataEntities.
33
 *
34
 * @param <T> the type of the location parameter
35
 */
36
public class CrateReader<T> {
37

38
    private static final Logger logger = LoggerFactory.getLogger(CrateReader.class);
1✔
39

40
    /**
41
     * This is a private inner class that shall not be exposed. **Do not make it
42
     * public or protected.** It serves only the purpose of unsafe operations
43
     * while reading a crate and may be specific to this implementation.
44
     */
45
    private static class RoCrateUnsafe extends RoCrate {
46

47
        public void addDataEntityWithoutRootHasPart(DataEntity entity) {
48
            this.metadataContext.checkEntity(entity);
1✔
49
            this.roCratePayload.addDataEntity(entity);
1✔
50
        }
1✔
51
    }
52

53
    /**
54
     * If the number of JSON entities in the crate is larger than this number,
55
     * parallelization will be used.
56
     */
57
    private static final int PARALLELIZATION_THRESHOLD = 100;
58

59
    private static final String FILE_PREVIEW_FILES = "ro-crate-preview_files";
60
    private static final String FILE_PREVIEW_HTML = "ro-crate-preview.html";
61
    private static final String FILE_METADATA_JSON = "ro-crate-metadata.json";
62

63
    protected static final String SPECIFICATION_PREFIX = "https://w3id.org/ro/crate/";
64

65
    protected static final String PROP_ABOUT = "about";
66
    protected static final String PROP_CONTEXT = "@context";
67
    protected static final String PROP_CONFORMS_TO = "conformsTo";
68
    protected static final String PROP_GRAPH = "@graph";
69
    protected static final String PROP_HAS_PART = "hasPart";
70
    protected static final String PROP_ID = "@id";
71

72
    /** Strategy that knows how to obtain metadata and content from a location. */
    private final GenericReaderStrategy<T> strategy;

    /**
     * Creates a reader that delegates all access to the crate location to
     * the given strategy.
     *
     * @param strategy the strategy used to read the metadata JSON and the
     * crate content
     */
    public CrateReader(GenericReaderStrategy<T> strategy) {
        this.strategy = strategy;
    }
1✔
77

78
    /**
79
     * This function will read the location (using one of the specified
80
     * strategies) and then build the relation between the entities.
81
     *
82
     * @param location the location of the ro-crate to be read
83
     * @return the read RO-crate
84
     */
85
    public RoCrate readCrate(T location) {
86
        // get the ro-crate-metadata.json
87
        ObjectNode metadataJson = strategy.readMetadataJson(location);
1✔
88
        // get the content of the crate
89
        File files = strategy.readContent(location);
1✔
90

91
        // this set will contain the files that are associated with entities
92
        HashSet<String> usedFiles = new HashSet<>();
1✔
93
        usedFiles.add(files.toPath().resolve(FILE_METADATA_JSON).toFile().getPath());
1✔
94
        usedFiles.add(files.toPath().resolve(FILE_PREVIEW_HTML).toFile().getPath());
1✔
95
        usedFiles.add(files.toPath().resolve(FILE_PREVIEW_FILES).toFile().getPath());
1✔
96
        return rebuildCrate(metadataJson, files, usedFiles);
1✔
97
    }
98

99
    private RoCrate rebuildCrate(ObjectNode metadataJson, File files, HashSet<String> usedFiles) {
100
        if (metadataJson == null) {
1✔
NEW
101
            logger.error("Metadata JSON is null, cannot rebuild crate");
×
NEW
102
            return null;
×
103
        }
104
        if (files == null) {
1✔
NEW
105
            logger.error("Content files directory is null, cannot rebuild crate");
×
NEW
106
            return null;
×
107
        }
108
        JsonNode context = metadataJson.get(PROP_CONTEXT);
1✔
109

110
        CrateMetadataContext crateContext = new RoCrateMetadataContext(context);
1✔
111
        RoCrateUnsafe crate = new RoCrateUnsafe();
1✔
112
        crate.setMetadataContext(crateContext);
1✔
113
        JsonNode graph = metadataJson.get(PROP_GRAPH);
1✔
114

115
        if (graph.isArray()) {
1✔
116
            moveRootEntitiesFromGraphToCrate(crate, (ArrayNode) graph);
1✔
117
            RootDataEntity root = crate.getRootDataEntity();
1✔
118
            if (root != null) {
1✔
119
                Set<String> dataEntityIds = getDataEntityIds(root, graph);
1✔
120
                for (JsonNode entityJson : graph) {
1✔
121
                    String eId = unpackId(entityJson);
1✔
122
                    if (dataEntityIds.contains(eId)) {
1✔
123
                        // data entity
124
                        DataEntity.DataEntityBuilder dataEntity = new DataEntity.DataEntityBuilder()
1✔
125
                                .setAll(entityJson.deepCopy());
1✔
126

127
                        // Handle data entities with corresponding file
128
                        checkFolderHasFile(entityJson.get(PROP_ID).asText(), files).ifPresent(file -> {
1✔
129
                            usedFiles.add(file.getPath());
1✔
130
                            dataEntity.setLocationWithExceptions(file.toPath())
1✔
131
                                    .setId(file.getName());
1✔
132
                        });
1✔
133

134
                        crate.addDataEntityWithoutRootHasPart(dataEntity.build());
1✔
135
                    } else {
1✔
136
                        // contextual entity
137
                        crate.addContextualEntity(
1✔
138
                                new ContextualEntity.ContextualEntityBuilder()
139
                                        .setAll(entityJson.deepCopy())
1✔
140
                                        .build());
1✔
141
                    }
142
                }
1✔
143
            }
144
        }
145

146
        Collection<File> untrackedFiles = Arrays.stream(
1✔
147
                Optional.ofNullable(files.listFiles()).orElse(new File[0]))
1✔
148
                .filter(f -> !usedFiles.contains(f.getPath()))
1✔
149
                .collect(Collectors.toSet());
1✔
150

151
        crate.setUntrackedFiles(untrackedFiles);
1✔
152
        Validator defaultValidation = new Validator(new JsonSchemaValidation());
1✔
153
        defaultValidation.validate(crate);
1✔
154
        return crate;
1✔
155
    }
156

157
    /**
158
     * Extracts graph connections from top to bottom.
159
     * <p>
160
     * Example: (connections.get(parent) -> children)
161
     *
162
     * @param graph the ArrayNode with all Entities.
163
     * @return the graph connections.
164
     */
165
    protected Map<String, Set<String>> makeEntityGraph(JsonNode graph) {
166
        Map<String, Set<String>> connections = new HashMap<>();
1✔
167

168
        Map<String, JsonNode> idToNodes = new HashMap<>();
1✔
169
        StreamSupport.stream(graph.spliterator(), false)
1✔
170
                .forEach(jsonNode -> idToNodes.put(unpackId(jsonNode), jsonNode));
1✔
171

172
        for (JsonNode entityNode : graph) {
1✔
173
            String currentId = unpackId(entityNode);
1✔
174
            StreamSupport.stream(entityNode.path("hasPart").spliterator(), false)
1✔
175
                    .map(this::unpackId)
1✔
176
                    .map(s -> idToNodes.getOrDefault(s, null))
1✔
177
                    .filter(Objects::nonNull)
1✔
178
                    .forEach(child -> connections.computeIfAbsent(currentId, key -> new HashSet<>())
1✔
179
                    .add(unpackId(child)));
1✔
180
            StreamSupport.stream(entityNode.path("isPartOf").spliterator(), false)
1✔
181
                    .map(this::unpackId)
1✔
182
                    .map(s -> idToNodes.getOrDefault(s, null))
1✔
183
                    .filter(Objects::nonNull)
1✔
184
                    .forEach(parent -> connections.computeIfAbsent(unpackId(parent), key -> new HashSet<>())
1✔
185
                    .add(currentId));
1✔
186
        }
1✔
187
        return connections;
1✔
188
    }
189

190
    protected Set<String> getDataEntityIds(RootDataEntity root, JsonNode graph) {
191
        if (root == null) {
1✔
NEW
192
            return Set.of();
×
193
        }
194
        Map<String, Set<String>> network = makeEntityGraph(graph);
1✔
195
        Set<String> directDataEntities = new HashSet<>(root.hasPart);
1✔
196

197
        Stack<String> processingQueue = new Stack<>();
1✔
198
        processingQueue.addAll(directDataEntities);
1✔
199
        Set<String> result = new HashSet<>();
1✔
200

201
        while (!processingQueue.empty()) {
1✔
202
            String currentId = processingQueue.pop();
1✔
203
            result.add(currentId);
1✔
204
            network.getOrDefault(currentId, new HashSet<>()).stream()
1✔
205
                    .filter(subId -> !result.contains(subId)) // avoid loops!
1✔
206
                    .forEach(subId -> {
1✔
207
                        result.add(subId);
1✔
208
                        processingQueue.add(subId);
1✔
209
                    });
1✔
210
        }
1✔
211
        return result;
1✔
212
    }
213

214
    protected String unpackId(JsonNode node) {
215
        if (node.isTextual()) {
1✔
216
            return node.asText();
1✔
217
        } else /*if (node.isObject())*/ {
218
            return node.path(PROP_ID).asText();
1✔
219
        }
220
    }
221

222
    protected Optional<File> checkFolderHasFile(String filepathOrId, File folder) {
223
        if (IdentifierUtils.isUrl(filepathOrId)) {
1✔
NEW
224
            return Optional.empty();
×
225
        }
226
        return IdentifierUtils.decode(filepathOrId)
1✔
227
                .map(decoded -> folder.toPath().resolve(decoded).toFile())
1✔
228
                .filter(File::exists);
1✔
229
    }
230

231
    /**
232
     * Moves the descriptor and the root entity from the graph to the crate.
233
     * <p>
234
     * Extracts the root data entity and the Metadata File Descriptor from the
235
     * graph and inserts them into the crate object. It also deletes it from the
236
     * graph. We will need the root dataset to distinguish between data entities
237
     * and contextual entities.
238
     *
239
     * @param crate the crate, which will receive the entities, if available in
240
     * the graph.
241
     * @param graph the graph of the Metadata JSON file, where the entities are
242
     * extracted and removed from.
243
     */
244
    protected void moveRootEntitiesFromGraphToCrate(RoCrate crate, ArrayNode graph) {
245
        Optional<JsonNode> maybeDescriptor = getMetadataDescriptor(graph);
1✔
246

247
        maybeDescriptor.ifPresent(descriptor -> {
1✔
248
            setCrateDescriptor(crate, descriptor);
1✔
249
            JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, descriptor);
1✔
250

251
            Optional<ObjectNode> maybeRoot = extractRoot(graph, descriptor);
1✔
252

253
            maybeRoot.ifPresent(root -> {
1✔
254
                Set<String> hasPartIds = extractHasPartIds(root);
1✔
255

256
                crate.setRootDataEntity(
1✔
257
                        new RootDataEntity.RootDataEntityBuilder()
258
                                .setAll(root.deepCopy())
1✔
259
                                .setHasPart(hasPartIds)
1✔
260
                                .build());
1✔
261

262
                JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, root);
1✔
263
            });
1✔
264
        });
1✔
265
    }
1✔
266

267
    /**
268
     * Find the metadata descriptor.
269
     * <p>
270
     * Currently prefers algorithm of version 1.1 over the one of 1.2-DRAFT.
271
     *
272
     * @param graph the graph to search the descriptor in.
273
     * @return the metadata descriptor of the crate.
274
     */
275
    protected Optional<JsonNode> getMetadataDescriptor(ArrayNode graph) {
276
        boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
1✔
277
        // use the algorithm described here:
278
        // https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
279
        Optional<JsonNode> maybeDescriptor = StreamSupport.stream(graph.spliterator(), isParallel)
1✔
280
                // "2. if the conformsTo property is a URI that starts with
281
                // https://w3id.org/ro/crate/"
282
                .filter(node -> node.path(PROP_CONFORMS_TO).path(PROP_ID).asText().startsWith(SPECIFICATION_PREFIX))
1✔
283
                // "3. from this entity’s about object keep the @id URI as variable root"
284
                .filter(node -> node.path(PROP_ABOUT).path(PROP_ID).isTextual())
1✔
285
                // There should be only one descriptor. If multiple exist, we take the first
286
                // one.
287
                .findFirst();
1✔
288
        return maybeDescriptor.or(()
1✔
289
                -> // from https://www.researchobject.org/ro-crate/1.2-DRAFT/root-data-entity.html#finding-the-root-data-entity
290
                StreamSupport.stream(graph.spliterator(), isParallel)
1✔
291
                        .filter(node -> node.path(PROP_ID).asText().equals(FILE_METADATA_JSON))
1✔
292
                        .findFirst()
1✔
293
        );
294
    }
295

296
    /**
297
     * Extracts the root entity from the graph, using the information from the
298
     * descriptor.
299
     * <p>
300
     * Basically implements step 5 of the algorithm described here:
301
     * <a href="https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity">
302
     * https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
303
     * </a>
304
     *
305
     * @param graph the graph from the metadata JSON-LD file
306
     * @param descriptor the RO-Crate descriptor
307
     * @return the root entity, if found
308
     */
309
    private Optional<ObjectNode> extractRoot(ArrayNode graph, JsonNode descriptor) {
310
        String rootId = descriptor.get(PROP_ABOUT).get(PROP_ID).asText();
1✔
311
        boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
1✔
312
        return StreamSupport.stream(graph.spliterator(), isParallel)
1✔
313
                // root is an object (filter + conversion)
314
                .filter(JsonNode::isObject)
1✔
315
                .map(JsonNode::<ObjectNode>deepCopy)
1✔
316
                // "5. if the entity has an @id URI that matches root return it"
317
                .filter(node -> node.path(PROP_ID).asText().equals(rootId))
1✔
318
                .findFirst();
1✔
319
    }
320

321
    private Set<String> extractHasPartIds(ObjectNode root) {
322
        JsonNode hasPartNode = root.path(PROP_HAS_PART);
1✔
323
        boolean isParallel = hasPartNode.isArray() && hasPartNode.size() > PARALLELIZATION_THRESHOLD;
1✔
324
        Set<String> hasPartIds = StreamSupport.stream(hasPartNode.spliterator(), isParallel)
1✔
325
                .map(hasPart -> hasPart.path(PROP_ID).asText())
1✔
326
                .filter(text -> !text.isBlank())
1✔
327
                .collect(Collectors.toSet());
1✔
328
        if (hasPartIds.isEmpty() && hasPartNode.path(PROP_ID).isTextual()) {
1✔
329
            hasPartIds.add(hasPartNode.path(PROP_ID).asText());
1✔
330
        }
331
        return hasPartIds;
1✔
332
    }
333

334
    private void setCrateDescriptor(RoCrate crate, JsonNode descriptor) {
335
        ContextualEntity descriptorEntity = new ContextualEntity.ContextualEntityBuilder()
1✔
336
                .setAll(descriptor.deepCopy())
1✔
337
                .build();
1✔
338
        crate.setJsonDescriptor(descriptorEntity);
1✔
339
    }
1✔
340
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc