• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

CeON / dataverse / 1359

02 Apr 2024 09:47AM UTC coverage: 25.104%. First build
1359

push

jenkins

web-flow
Closes #2440: Improved shapefile handling (#2443)

* Closes #2440: Improved shapefile handler, error handling and simpler api, use commons-compress for extraction supporting unicode extra fields in zip

* review comments

96 of 123 new or added lines in 6 files covered. (78.05%)

17423 of 69404 relevant lines covered (25.1%)

0.25 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.0
/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/datafile/FileTypeDetector.java
1
package edu.harvard.iq.dataverse.datafile;
2

3
import edu.harvard.iq.dataverse.ingest.IngestableDataChecker;
4
import edu.harvard.iq.dataverse.util.JhoveFileType;
5
import edu.harvard.iq.dataverse.util.ShapefileHandler;
6
import org.apache.commons.io.IOUtils;
7
import org.apache.commons.lang.StringUtils;
8
import org.apache.tika.Tika;
9
import org.apache.tika.mime.MediaType;
10
import org.slf4j.Logger;
11
import org.slf4j.LoggerFactory;
12

13
import javax.activation.MimetypesFileTypeMap;
14
import javax.ejb.EJBException;
15
import javax.enterprise.context.ApplicationScoped;
16
import javax.xml.stream.XMLInputFactory;
17
import javax.xml.stream.XMLStreamConstants;
18
import javax.xml.stream.XMLStreamException;
19
import javax.xml.stream.XMLStreamReader;
20

21
import java.io.File;
22
import java.io.FileInputStream;
23
import java.io.FileReader;
24
import java.io.IOException;
25
import java.io.InputStream;
26
import java.nio.charset.StandardCharsets;
27
import java.util.Arrays;
28
import java.util.zip.GZIPInputStream;
29

30
/**
31
 * our check is fairly weak (it appears to be hard to really
32
 * really recognize a FITS file without reading the entire
33
 * stream...), so in version 3.* we used to nsist on *both*
34
 * the ".fits" extension and the header check;
35
 * in 4.0, we'll accept either the extension, or the valid
36
 * magic header
37
 *
38
 */
39
@ApplicationScoped
40
public class FileTypeDetector {
1✔
41
    private static final Logger logger = LoggerFactory.getLogger(FileTypeDetector.class);
1✔
42

43
    private static final MimetypesFileTypeMap MIME_TYPE_MAP = new MimetypesFileTypeMap();
1✔
44

45
    /**
46
     * Detects file type based on file content and filename
47
     */
48
    public String determineFileType(File f, String fileName) throws IOException {
49
        String fileType = "application/octet-stream";
1✔
50

51
        // step 1:
52
        // Apply our custom methods to try and recognize data files that can be
53
        // converted to tabular data
54
        logger.debug("Attempting to identify potential tabular data files;");
1✔
55
        fileType = detectTabularFileType(f, fileType);
1✔
56
        logger.debug("determineFileType: tabular data checker found " + fileType);
1✔
57

58
        // step 2: check the mime type of this file with Jhove
59
        if (!isContentTypeSpecificEnough(fileType)) {
1✔
60
            JhoveFileType jw = new JhoveFileType();
1✔
61
            String jHovemimeType = jw.getFileMimeType(f);
1✔
62
            if (jHovemimeType != null) {
1✔
63
                // remove parameter (eg. text/plain; charset=US-ASCII -> text/plain)
64
                MediaType mediaType = MediaType.parse(jHovemimeType);
1✔
65
                fileType = mediaType.getBaseType().toString();
1✔
66
            }
67
        }
68

69
        // step 3: check the mime type of this file with Tika
70
        if (!isContentTypeSpecificEnough(fileType)) {
1✔
71
            fileType = new Tika().detect(f);
1✔
72
        }
73

74

75
        // step 3: Check if xml is an graphml xml
76
        if ("application/xml".equals(fileType) && isGraphMLFile(f)) {
1✔
77
            fileType = "text/xml-graphml";
1✔
78
        }
79
        
80
        // step 4:
81
        // if this is a compressed file - zip or gzip - we'll check the
82
        // file(s) inside the compressed stream and see if it's one of our
83
        // recognized formats that we want to support compressed:
84

85
        if ("application/x-gzip".equals(fileType) || "application/gzip".equals(fileType)) {
1✔
86
            logger.debug("we'll run additional checks on this gzipped file.");
1✔
87
            // We want to be able to support gzipped FITS files, same way as
88
            // if they were just regular FITS files:
89
            // (new FileInputStream() can throw a "filen not found" exception;
90
            // however, if we've made it this far, it really means that the
91
            // file does exist and can be opened)
92
            try (InputStream uncompressedIn = new GZIPInputStream(new FileInputStream(f))) {
1✔
93
                if (isFITSFile(uncompressedIn)) {
1✔
94
                    fileType = "application/fits-gzipped";
1✔
95
                }
96
            } catch (IOException e) {
×
97
                logger.warn("file {} does not seems to be a gzip", fileName);
×
98
            }
1✔
99
        }
100
        if ("application/zip".equals(fileType)) {
1✔
101

102
            // Is this a zipped Shapefile?
103
            // Check for shapefile extensions as described here: http://en.wikipedia.org/wiki/Shapefile
104
            try {
105
                ShapefileHandler shapefileHandler = new ShapefileHandler(f);
1✔
106
                if (shapefileHandler.containsShapefile()) {
1✔
107
                    fileType = ShapefileHandler.SHAPEFILE_FILE_TYPE;
1✔
108
                }
NEW
109
            } catch (Exception ex) {
×
NEW
110
                logger.warn("Shapefile inspection failed for file {}", fileName, ex);
×
111
            }
1✔
112
        }
113
        
114
        // step 5:
115
        // Additional processing; if we haven't gotten much useful information
116
        // back from previous steps, we'll try and make an educated guess based on
117
        // the file extension:
118

119
        if (!isContentTypeSpecificEnough(fileType)) {
1✔
120
            
121
            logger.debug("Type by extension, for " + fileName + ": " + MIME_TYPE_MAP.getContentType(fileName));
1✔
122
            String fileTypeByExtension = MIME_TYPE_MAP.getContentType(fileName);
1✔
123
            if (!"application/octet-stream".equals(fileTypeByExtension)) {
1✔
124
                fileType = fileTypeByExtension;
1✔
125
                logger.debug("mime type recognized by extension: " + fileType);
1✔
126
            }
127
        }
128

129
        logger.debug("returning fileType " + fileType);
1✔
130
        return fileType;
1✔
131
    }
132

133
    public String detectTabularFileType(File file, String fallbackContentType) {
134
        IngestableDataChecker tabChecker = new IngestableDataChecker();
1✔
135
        return StringUtils.defaultString(tabChecker.detectTabularDataFormat(file), fallbackContentType);
1✔
136
    }
137
    
138
    // -------------------- PRIVATE --------------------
139

140
    private boolean isContentTypeSpecificEnough(String contentType) {
141
        return !"text/plain".equals(contentType) && !"application/octet-stream".equals(contentType);
1✔
142
    }
143

144
    /**
145
     * Custom method for identifying FITS files:
146
     * TODO:
147
     * the existing check for the "magic header" is very weak (see below);
148
     * it should probably be replaced by attempting to parse and read at
149
     * least the primary HDU, using the NOM fits parser.
150
     * -- L.A. 4.0 alpha
151
     */
152
    private boolean isFITSFile(InputStream ins) throws IOException {
153

154
        // number of header bytes read for identification:
155
        byte[] magicWord = "SIMPLE".getBytes(StandardCharsets.UTF_8);
1✔
156
        int magicWordLength = magicWord.length;
1✔
157

158
        byte[] b = new byte[magicWordLength];
1✔
159
        logger.debug("attempting to read " + magicWordLength + " bytes from the FITS format candidate stream.");
1✔
160
        IOUtils.read(ins, b);
1✔
161

162
        if (Arrays.equals(magicWord, b)) {
1✔
163
            logger.debug("yes, this is FITS file!");
1✔
164
            return true;
1✔
165
        }
166

167
        return false;
1✔
168
    }
169

170
    private boolean isGraphMLFile(File file) {
171
        boolean isGraphML = false;
1✔
172
        logger.debug("begin isGraphMLFile()");
1✔
173
        
174
        try (FileReader fileReader = new FileReader(file)) {
1✔
175
            XMLInputFactory xmlif = XMLInputFactory.newInstance();
1✔
176
            xmlif.setProperty("javax.xml.stream.isCoalescing", Boolean.TRUE);
1✔
177

178
            XMLStreamReader xmlr = xmlif.createXMLStreamReader(fileReader);
1✔
179
            for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
1✔
180
                if (event == XMLStreamConstants.START_ELEMENT) {
1✔
181
                    if (xmlr.getLocalName().equals("graphml")) {
1✔
182
                        String schema = xmlr.getAttributeValue("http://www.w3.org/2001/XMLSchema-instance",
1✔
183
                                                               "schemaLocation");
184
                        logger.debug("schema = " + schema);
1✔
185
                        if (schema != null && schema.contains("http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd")) {
1✔
186
                            logger.debug("graphML is true");
1✔
187
                            isGraphML = true;
1✔
188
                        }
189
                    }
1✔
190
                    break;
191
                }
192
            }
193
        } catch (XMLStreamException e) {
×
194
            logger.debug("XML error - this is not a valid graphML file.");
×
195
            isGraphML = false;
×
196
        } catch (IOException e) {
×
197
            throw new EJBException(e);
×
198
        }
1✔
199
        logger.debug("end isGraphML()");
1✔
200
        return isGraphML;
1✔
201
    }
202
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc