• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Nanopublication / nanopub-java / 19040279779

03 Nov 2025 03:42PM UTC coverage: 51.843% (-0.02%) from 51.861%
19040279779

push

github

Ziroli Plutschow
Preparation 2 for batch importing of RO-Crates

- improved statistics

Warning: may contain bugs or imperfect RO-Crate interpretation details.
 There are still open issues e.g.
 - do we use 1 Nanopublication per 1 RO-Crate's index file, or do we split it, since sometimes there are more than 1000 triples in a ro-crate-metadata.json.

1007 of 2912 branches covered (34.58%)

Branch coverage included in aggregate %.

5209 of 9078 relevant lines covered (57.38%)

2.69 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

57.41
src/main/java/org/nanopub/RoCrateParser.java
1
package org.nanopub;
2

3
import org.apache.commons.lang3.StringUtils;
4
import org.apache.commons.logging.Log;
5
import org.apache.commons.logging.LogFactory;
6
import org.eclipse.rdf4j.model.IRI;
7
import org.eclipse.rdf4j.model.Model;
8
import org.eclipse.rdf4j.model.Statement;
9
import org.eclipse.rdf4j.model.ValueFactory;
10
import org.eclipse.rdf4j.model.impl.LinkedHashModel;
11
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
12
import org.eclipse.rdf4j.model.vocabulary.PROV;
13
import org.eclipse.rdf4j.model.vocabulary.RDF;
14
import org.eclipse.rdf4j.model.vocabulary.RDFS;
15
import org.eclipse.rdf4j.rio.RDFFormat;
16
import org.eclipse.rdf4j.rio.RDFParser;
17
import org.eclipse.rdf4j.rio.Rio;
18
import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
19
import org.eclipse.rdf4j.rio.helpers.StatementCollector;
20
import org.eclipse.rdf4j.rio.jsonld.JSONLDSettings;
21
import org.jspecify.annotations.NonNull;
22
import org.nanopub.vocabulary.NPX;
23
import org.nanopub.vocabulary.SCHEMA;
24

25
import java.io.IOException;
26
import java.io.InputStream;
27
import java.net.URI;
28
import java.net.URISyntaxException;
29
import java.net.http.HttpClient;
30
import java.net.http.HttpRequest;
31
import java.net.http.HttpResponse;
32
import java.util.Collection;
33
import java.util.Optional;
34
import java.util.regex.Matcher;
35
import java.util.regex.Pattern;
36
import java.util.stream.Collectors;
37

38
/**
39
 * This class represents a parser for RO-Crate metadata files.
40
 */
41
public class RoCrateParser {
3✔
42

43
    private static final Log LOG = LogFactory.getLog(RoCrateParser.class);
3✔
44
    private static final ValueFactory vf = SimpleValueFactory.getInstance();
2✔
45

46
    private static final HttpClient client = HttpClient.newHttpClient();
3✔
47

48
    public static InputStream downloadRoCreateMetadataFile(String uri) throws URISyntaxException, IOException, InterruptedException {
49
        HttpRequest req = HttpRequest.newBuilder().GET().uri(new URI(uri)).build();
×
50
        HttpResponse<InputStream> httpResponse = client.send(req, HttpResponse.BodyHandlers.ofInputStream());
×
51
        return httpResponse.body();
×
52
    }
53

54
    /**
55
     * Parses a RO-Crate metadata file from a given URL.
56
     *
57
     * @param url          the url where the metadata file is published (including trailing "/")
58
     * @param roCrateMetadata the ro-create metadata file name, may be the empty string
59
     * @return a signed Nanopub object containing the parsed data.
60
     * @throws org.nanopub.MalformedNanopubException if the parsed data does not conform to the expected structure.
61
     * @throws java.io.IOException                   if an I/O error occurs while reading the metadata file.
62
     * @throws java.lang.InterruptedException        if the operation is interrupted.
63
     * @throws java.net.URISyntaxException           if the URL is malformed.
64
     */
65
    public Nanopub parseRoCreate(String url, InputStream roCrateMetadata) throws MalformedNanopubException, IOException, NanopubAlreadyFinalizedException {
66
        RDFParser parser = Rio.createParser(RDFFormat.JSONLD);
3✔
67

68
        // Configure parser settings
69
        parser.getParserConfig().set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
7✔
70

71
        // We do not accept spaces in urls (@id elements)
72
        // parser.getParserConfig().set(BasicParserSettings.VERIFY_URI_SYNTAX, false);
73

74
        // Since JSONLDSettings.WHITELIST does not contain "https://w3id.org/ro/crate/1.0/context" we disable SECURE MODE
75
        parser.getParserConfig().set(JSONLDSettings.SECURE_MODE, false);
7✔
76

77
        Model model = new LinkedHashModel();
4✔
78
        StatementCollector handler = new StatementCollector(model);
5✔
79

80
        parser.setRDFHandler(handler);
4✔
81
        IRI globalRoCrateRef = constructRoCrateUrl(url, roCrateMetadata);
4✔
82
        parser.parse(roCrateMetadata, globalRoCrateRef.stringValue());
5✔
83

84
        // Create Nanopub
85
        NanopubCreator npCreator = new NanopubCreator(true);
5✔
86
        Collection<Statement> metadataStatements = handler.getStatements();
3✔
87
        npCreator.addAssertionStatements(metadataStatements);
3✔
88

89
        // Extract some special statements
90
        IRI identifier = extractToplevelIdentifierOrBackup(metadataStatements, globalRoCrateRef.stringValue(), null);
7✔
91
        String label = extractToplevelName(metadataStatements, identifier);
5✔
92

93
        // as provenance statement WAS_DERIVED_FROM we always use the specified name: "ro-crate-metadata.json"
94
        npCreator.addProvenanceStatement(PROV.WAS_DERIVED_FROM, vf.createIRI(url+ "ro-crate-metadata.json"));
7✔
95
        npCreator.addPubinfoStatement(NPX.INTRODUCES,  identifier);
4✔
96
        npCreator.addPubinfoStatement(RDFS.LABEL, vf.createLiteral(label));
6✔
97
        npCreator.addPubinfoStatement(RDF.TYPE, NPX.RO_CRATE_NANOPUB);
4✔
98

99
        return npCreator.finalizeNanopub(true);
4✔
100
    }
101

102
    /**
103
     * Find the ID of the RO-Crate.
104
     * @param url where we get the RO-crate
105
     * @param roCrateMetadata LATER not yet implemented
106
     * @return our current best guess for the ID_IRI
107
     */
108
    // default access for testing
109
    static IRI constructRoCrateUrl(String url, InputStream roCrateMetadata) {
110
        String id;
111
        final String BASE_ROCRATE_API_URL = "https://api.rohub.org/api/ros/";
2✔
112
        final String BASE_ROCRATE_API_URL_SUFFIX = "crate/download/";
2✔
113
        final String BASE_ROHUB_URL = "https://w3id.org/ro-id/";
2✔
114
        final String patternUrlUntilLastSlash = "(https?://.*/)(.*)";
2✔
115
        if (url.startsWith("http")) {
4!
116
            if (url.startsWith("https://api.rohub.org/api/ros/")) {
4✔
117
                id = StringUtils.substringAfter(url, BASE_ROCRATE_API_URL);
4✔
118
                id = StringUtils.removeEnd(id, BASE_ROCRATE_API_URL_SUFFIX);
4✔
119
                return vf.createIRI(BASE_ROHUB_URL + id);
5✔
120
            } else if (url.endsWith("/")) {
4✔
121
                return vf.createIRI(url);
4✔
122
            } else if (url.matches(patternUrlUntilLastSlash)) {
4!
123
                // probably ends in  ./metadata.json or something like that, we remove it anyway
124
                Pattern p = Pattern.compile(patternUrlUntilLastSlash);
3✔
125
                Matcher m = p.matcher(url); m.matches();
7✔
126
                String resultingUrl = m.group(1);
4✔
127
                if (LOG.isDebugEnabled()) {
3!
128
                    try {
129
                        String filename = m.group(2);
×
130
                        if (filename.equals("ro-crate-metadata.json") || filename.equals("ro-crate-metadata.jsonld")) {
×
131
                            // standard case, no logging
132
                        } else {
133
                            LOG.debug("Unexpected filename for RO-Create Metadata: " + filename);
×
134
                            LOG.debug("Stripping the filename anyway and use '" + resultingUrl + "' as RO-Crate base.");
×
135
                        }
136
                    } catch (IllegalStateException | IndexOutOfBoundsException e) {
×
137
                      // there was no trailing filename, all good
138
                    }
×
139
                }
140
                if (resultingUrl == null) {
2!
141
                    LOG.warn("Could not determine RO-Crate base URL with input url: " + url);
×
142
                }
143
                return vf.createIRI(resultingUrl);
4✔
144
            } else {
145
                // TODO extract from roCrateMetadata
146
                return vf.createIRI(url);
×
147
            }
148
        }
149
        return vf.createIRI(url);
×
150
    }
151

152
    /* @return jsonld graph -> top_level_name max 212 chars */
153
    @NonNull
154
    private String extractToplevelName(Collection<Statement> metadataStatements, IRI subj) {
155
        Collection<Statement> nameCandidates = metadataStatements.stream()
4✔
156
                .filter(st -> st.getSubject().equals(subj))
7✔
157
                .filter(st -> st.getPredicate().equals(SCHEMA.NAME))
6✔
158
                .collect(Collectors.toSet());
4✔
159
        if (nameCandidates.size() != 1) {
4!
160
            LOG.info(String.format("This RO-Crate has an invalid number (%n) of names: %s", nameCandidates.size(), subj.stringValue()));
×
161
            nameCandidates.stream().forEach(possibleName -> LOG.debug(possibleName.toString()));
×
162
        }
163

164
        String name;
165
        Optional<Statement> nameCandidate = nameCandidates.stream().findFirst();
4✔
166
        if (nameCandidate.isPresent()) {
3!
167
            name = nameCandidate.get().getObject().stringValue();
7✔
168
        } else {
169
            nameCandidate = metadataStatements.stream()
×
170
                    .filter(st -> st.getSubject().equals(subj)
×
171
                            && st.getPredicate().equals(SCHEMA.DESCRIPTION))
×
172
                    .findFirst();
×
173
            if (nameCandidate.isPresent()) {
×
174
                name = nameCandidate.get().getObject().toString();
×
175
            }
176
            // the very last fallback
177
            name = subj.stringValue();
×
178
        }
179
        return StringUtils.substring(name, 0, 212); // 212 is just our convention;-) 222 was a good choice, too
5✔
180
    }
181

182
    @NonNull
183
    private IRI extractToplevelIdentifierOrBackup(Collection<Statement> metadataStatements, String bestGuess, String latestBackupIdentifier) {
184
        if (bestGuess != null) {
2!
185
            return vf.createIRI(bestGuess);
4✔
186
        }
187
        // TODO verify if this is correct, and check if sometimes the backup is an even better choice
188
        IRI identifier = (IRI) metadataStatements.stream()
×
189
                .filter(st -> st.getPredicate().equals(SCHEMA.RO_CRATE_HAS_PART))
×
190
                .findFirst().get().getSubject(); // TODO or do we need the Object-Value???
×
191
        if (identifier == null) {
×
192
            identifier = vf.createIRI(latestBackupIdentifier);
×
193
            // TODO, probably the best first backup choice is the download url if available in the metadate,
194
            // the url from above is only the second backup, so we never have any null pointer issues.
195
        }
196
        return identifier;
×
197
    }
198

199
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc