• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

CeON / dataverse / 1359

02 Apr 2024 09:47AM UTC coverage: 25.104%. First build
1359

push

jenkins

web-flow
Closes #2440: Improved shapefile handling (#2443)

* Closes #2440: Improved shapefile handler, error handling and simpler api, use commons-compress for extraction supporting unicode extra fields in zip

* review comments

96 of 123 new or added lines in 6 files covered. (78.05%)

17423 of 69404 relevant lines covered (25.1%)

0.25 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

76.06
/dataverse-webapp/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java
1
package edu.harvard.iq.dataverse.util;
2

3
import edu.harvard.iq.dataverse.common.files.mime.ShapefileMimeType;
4
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
5
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
6
import org.apache.commons.compress.archivers.zip.ZipFile;
7
import org.apache.commons.io.FileUtils;
8
import org.apache.commons.io.IOUtils;
9

10
import java.io.File;
11
import java.io.IOException;
12
import java.io.OutputStream;
13
import java.nio.file.Files;
14
import java.nio.file.Path;
15
import java.util.ArrayList;
16
import java.util.Arrays;
17
import java.util.Collections;
18
import java.util.Enumeration;
19
import java.util.HashMap;
20
import java.util.HashSet;
21
import java.util.List;
22
import java.util.Map;
23
import java.util.Optional;
24
import java.util.Set;
25
import java.util.logging.Level;
26
import java.util.logging.Logger;
27
import java.util.stream.Collectors;
28
import java.util.stream.Stream;
29
import java.util.zip.ZipEntry;
30

31
/**
32
 * Used to identify, "repackage", and extract data from Shapefiles in .zip format
33
 * <p>
34
 * (1) Identify if a .zip contains a shapefile:
35
 * boolean containsShapefile()
36
 * <p>
37
 * <p>
38
 * <p>
39
 * (2) Unpack/"Repackage" .zip:
40
 * (a) All files extracted
41
 * (b) Each group of files that make up a shapefile are made into individual .zip files
42
 * (c) Non shapefile-related files left on their own
43
 * <p>
44
 * If the original .zip contains:  "shape1.shp", "shape1.shx", "shape1.dbf", "shape1.prj", "shape1.ain",  "shape1.aih",
45
 * "shape2.shp", "shape2.shx", "shape2.dbf", "shape2.prj",
46
 * "shape1.pdf", "README.md", "shape_notes.txt"
47
 * The repackaging results in a folder containing:
48
 * "shape1.zip",
49
 * "shape2.zip",
50
 * "shape1.pdf", "README.md", "shape_notes.txt"
51
 * <p>
52
 * Code Example:
53
 * <pre>{@code
54
 * try {
55
 *   ShapefileHandler shp_handler = new ShapefileHandler(new File("zipped_shapefile.zip"));
56
 *   if (shp_handler.containsShapefile()){
57
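 *     File unzip_folder = new File("/tmp/folder_for_unzipping");  // must exist and be empty
 *     File rezip_folder = new File("/tmp/folder_for_rezipping");  // must exist and be empty
 *     List<File> reZippedFiles = shp_handler.reZipShapefileSets(unzip_folder, rezip_folder);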
59
 *     // ...
60
 *   }
61
 * } catch(Exception e) {
62
 *   System.out.println(e.getMessage());
63
 * }
64
 * }</pre>
65
 *
66
 * @author raprasad
67
 */
68
public class ShapefileHandler {
69

70
    private static final Logger logger = Logger.getLogger(ShapefileHandler.class.getCanonicalName());
1✔
71

72
    // Reference for these extensions: http://en.wikipedia.org/wiki/Shapefile
73
    public final static String SHAPEFILE_FILE_TYPE = ShapefileMimeType.SHAPEFILE_FILE_TYPE.getMimeValue();
1✔
74
    public final static List<String> SHAPEFILE_MANDATORY_EXTENSIONS = Arrays.asList("shp", "shx", "dbf", "prj");
1✔
75
    public final static String SHP_XML_EXTENSION = "shp.xml";
76
    public final static String BLANK_EXTENSION = "__PLACEHOLDER-FOR-BLANK-EXTENSION__";
77
    public final static List<String> SHAPEFILE_ALL_EXTENSIONS = Arrays.asList("shp", "shx", "dbf", "prj", "sbn", "sbx", "fbn", "fbx", "ain", "aih", "ixs", "mxs", "atx", "cpg", SHP_XML_EXTENSION);
1✔
78
    private final File zipfile;
79

80
    /**
81
     * Hash of file basenames and a list of extensions.
82
     * e.g.  { "subway_shapefile" : [ ".dbf", ".prj", ".sbn", ".sbx", ".shp", ".shx"],
83
     *         "shapefile_info" : [".docx"],
84
     *,        "README" : ["md"],
85
     *         "Notes" : [""]
86
     * }
87
     */
88
    private final Map<String, List<String>> baseNameExtensions = new HashMap<>();
1✔
89

90
    // -------------------- CONSTRUCTOR --------------------
91

92
    /**
     * Creates a handler for the given zip file and inspects its entries right away.
     *
     * @param zipFile zip archive to examine
     * @throws IllegalArgumentException if {@code zipFile} is null or not a regular file
     * @throws IllegalStateException    if the archive cannot be read, or two entries share
     *                                  the same file name once directories are stripped
     */
    public ShapefileHandler(File zipFile) {
        zipfile = zipFile;
        examineZipFile();
    }
1✔
97

98
    // -------------------- GETTERS --------------------
99

100
    Map<String, List<String>> getBaseNameExtensions() {
NEW
101
        return this.baseNameExtensions;
×
102
    }
103

104
    // -------------------- LOGIC --------------------
105

106
    /**
107
     * Re-group the shapefile(s) into a given directory.
108
     *
109
     * Creates to subdirectories:
110
     * - unzipped: directory into which the zip-file is extracted
111
     * - rezipped: contains the resulting re-pack
112
     *
113
     * @return List of resulting files after re-packaging.
114
     */
115
    public List<File> reZipShapefileSets(File unzipDirectory, File reZipDirectory) throws IOException {
116
        logger.fine("rezipShapefileSets");
1✔
117

118
        if (!containsShapefile()) {
1✔
NEW
119
            throw new IllegalArgumentException("No shapefiles in zip");
×
120
        }
121

122
        verifyDestinationDirectories(unzipDirectory, reZipDirectory);
1✔
123
        try {
124
            // Unzip files!
125
            unzipFilesToDirectory(unzipDirectory.toPath());
1✔
126

127
            // Redistribute files!
128
            redistributeFilesFromZip(unzipDirectory.toPath(), reZipDirectory.toPath());
1✔
129

130
            return Optional.ofNullable(reZipDirectory.listFiles()).map(Arrays::asList).orElse(Collections.emptyList());
1✔
131
        } finally {
132
            logger.fine(() -> "Post redistribute, unzipped files:" + Optional.ofNullable(unzipDirectory.listFiles())
1✔
NEW
133
                    .map(Arrays::stream).orElse(Stream.empty()).map(File::getName)
×
NEW
134
                    .collect(Collectors.joining(",")));
×
135
        }
136
    }
137

138
    /**
139
     * Does this zip file contain a shapefile set?
140
     */
141
    public boolean containsShapefile() {
142
        for (Map.Entry<String, List<String>> entry : baseNameExtensions.entrySet()) {
1✔
143
            List<String> extenstionList = entry.getValue();
1✔
144
            if (doesListContainShapefileExtensions(extenstionList)) {
1✔
145
                return true;
1✔
146
            }
147
        }
1✔
148

149
        return false;
1✔
150
    }
151

152
    // -------------------- PRIVATE --------------------
153

154
    private void verifyDestinationDirectories(File... directories) throws IOException {
155
        for(File dir : directories) {
1✔
156
            if (dir == null || !dir.isDirectory() || !FileUtils.isEmptyDirectory(dir)) {
1✔
NEW
157
                throw new IllegalArgumentException("Invalid target directory:" + dir);
×
158
            }
159
        }
160
    } // createDirectories
1✔
161

162
    private String getFileBasename(String fileName) {
163
        if (fileName == null) {
1✔
164
            return null;
×
165
        }
166
        String unzipFileName = new File(fileName).getName();
1✔
167
        if (unzipFileName.isEmpty()) {
1✔
168
            logger.info("getFileBasename.  fileName is an empty string: " + fileName);
×
169
            return null;
×
170
        }
171
        return unzipFileName;
1✔
172
    }
173

174
    /**
175
     * Unzip the files to the directory, FLATTENING the directory structure
176
     */
177
    private void unzipFilesToDirectory(Path unzipDirectory) {
178
        try(ZipArchiveInputStream zipStream = new ZipArchiveInputStream(Files.newInputStream(zipfile.toPath()))) {
1✔
179
            ZipEntry origEntry;
180
            while ((origEntry = zipStream.getNextEntry()) != null) {
1✔
181
                String zentryFileName = origEntry.getName();
1✔
182
                String unzipFileName = getFileBasename(zentryFileName);
1✔
183

184
                if (isFileToSkip(unzipFileName)) {
1✔
185
                    logger.fine("Skip file");
×
186
                    continue;
×
187
                }
188

189
                // Create sub-directory, if needed
190
                if (origEntry.isDirectory()) {
1✔
191
                    logger.fine("Skip directory");
×
NEW
192
                    continue; // Continue to next Entry
×
193
                }
194

195
                logger.fine("file found!");
1✔
196

197
                // Write the file
198
                Path outpath = unzipDirectory.resolve(unzipFileName);
1✔
199
                logger.fine("Write zip file: " + outpath);
1✔
200
                try (OutputStream fileOutputStream = Files.newOutputStream(outpath)) {
1✔
201
                    IOUtils.copy(zipStream, fileOutputStream);
1✔
202
                }
203
            } // end outer while
1✔
204
        } catch (IOException ex) {
×
NEW
205
            logger.log(Level.SEVERE, "Failed to open ZipInputStream entry", ex);
×
NEW
206
            throw new IllegalStateException("Failed to unzip:" + ex.getMessage());
×
207
        }
1✔
208
    }
1✔
209

210
    private Path getFilePath(Path directory, String file_basename, String file_ext) {
211
        if (file_ext.equals(BLANK_EXTENSION)) {
1✔
212
            return directory.resolve(file_basename);
1✔
213
        }
214
        return directory.resolve(file_basename + "." + file_ext);
1✔
215
    }
216

217
    /**
218
     * Create new zipped shapefile
219
     */
220
    private void redistributeFilesFromZip(Path unzipDirectory, Path rezipDirectory) throws IOException {
221
        logger.fine("redistributeFilesFromZip. source: '" + unzipDirectory + "'  target: '" + rezipDirectory + "'");
1✔
222

223
        int cnt = 0;
1✔
224
        /* START: Redistribute files by iterating through the Map of basenames + extensions
225
           example key: "shape1"
226
           example ext_list: ["shp", "shx", "dbf", "prj"]
227
        */
228
        for (Map.Entry<String, List<String>> entry : baseNameExtensions.entrySet()) {
1✔
229
            cnt++;
1✔
230
            String baseName = entry.getKey();
1✔
231
            List<String> ext_list = entry.getValue();
1✔
232

233
            logger.fine("\n(" + cnt + ") Basename: " + baseName);
1✔
234
            logger.fine("Extensions: " + Arrays.toString(ext_list.toArray()));
1✔
235

236
            // Is this a shapefile?  If so, rezip it
237
            if (doesListContainShapefileExtensions(ext_list)) {
1✔
238
                Path reZippedFileName = rezipDirectory.resolve(baseName + ".zip");
1✔
239
                try (ZipFileBuilder shapefileZip = new ZipFileBuilder(reZippedFileName)) {
1✔
240
                    for (String ext_name : ext_list) {
1✔
241
                        Path sourceFile = getFilePath(unzipDirectory, baseName, ext_name);
1✔
242
                        if (!isShapefileExtension(ext_name)) {
1✔
243
                            // Another file with similar basename as shapefile.
244
                            // e.g. if shapefile basename is "census", this might be "census.xls", "census.pdf", or another non-shapefile extension
245
                            moveFile(sourceFile, getFilePath(rezipDirectory, baseName, ext_name));
1✔
246
                        } else {
247
                            shapefileZip.addToZipFile(sourceFile);
1✔
248
                            Files.delete(sourceFile);
1✔
249
                        }
250
                    }
1✔
251
                }
252
                // rezip it
253
            } else {
1✔
254
                // Non-shapefiles
255
                for (String ext_name : ext_list) {
1✔
256
                    moveFile(getFilePath(unzipDirectory, baseName, ext_name),
1✔
257
                            getFilePath(rezipDirectory, baseName, ext_name));
1✔
258
                }
1✔
259
            }
260
        }
1✔
261

262
    }  // end: redistributeFilesFromZip
1✔
263

264
    private void moveFile(Path sourceFileName, Path targetFileName) {
265
        try {
266
            Files.move(sourceFileName, targetFileName);
1✔
267
        } catch (IOException ex) {
×
NEW
268
            throw new IllegalStateException("Failed to move file. Source: " + sourceFileName + " Target: " + targetFileName, ex);
×
269
        }
1✔
270
    }
1✔
271

272
    private boolean isShapefileExtension(String ext_name) {
273
        if (ext_name == null) {
1✔
274
            return false;
×
275
        }
276
        return SHAPEFILE_ALL_EXTENSIONS.contains(ext_name);
1✔
277
    }
278

279
    /**
280
     * Does a list of file extensions match those required for a shapefile set?
281
     */
282
    private boolean doesListContainShapefileExtensions(List<String> ext_list) {
283
        return new HashSet<>(ext_list).containsAll(SHAPEFILE_MANDATORY_EXTENSIONS);
1✔
284
    }
285

286
    private void addToFileGroupHash(String basename, String ext) {
287
        if ((basename == null) || (ext == null)) {
1✔
288
            return;
×
289
        }
290
        List<String> extension_list = baseNameExtensions.computeIfAbsent(basename, k -> new ArrayList<>());
1✔
291
        if (!(extension_list.contains(ext))) {
1✔
292
            extension_list.add(ext);
1✔
293
        }
294
    }   // end addToFileGroupHash
1✔
295

296
    /**
297
     * Update the fileGroup hash which contains a { base_filename : [ext1, ext2, etc ]}
298
     * This is used to determine whether a .zip contains a shapefile set
299
     * #
300
     *
301
     * @param fname filename in String format
302
     */
303
    private void updateFileGroupHash(String fname) {
304
        if (fname == null) {
1✔
305
            return;
×
306
        }
307

308
        // Split filename into basename and extension.  No extension yields only basename
309
        //
310
        if (fname.toLowerCase().endsWith(SHP_XML_EXTENSION)) {
1✔
311
            int idx = fname.toLowerCase().indexOf("." + SHP_XML_EXTENSION);
×
312
            if (idx >= 1) {   // if idx==0, then the file name is ".shp.xml""
×
313
                String basename = fname.substring(0, idx);
×
314
                String ext = fname.substring(idx + 1);
×
315
                addToFileGroupHash(basename, ext);
×
316
                return;
×
317
            }
318
        }
319

320
        String[] tokens = fname.split("\\.(?=[^\\.]+$)");
1✔
321
        if (tokens.length == 1) {
1✔
322
            addToFileGroupHash(tokens[0], BLANK_EXTENSION);      // file basename, no extension
1✔
323

324
        } else if (tokens.length == 2) {
1✔
325
            addToFileGroupHash(tokens[0], tokens[1]);  // file basename, extension
1✔
326
        }
327
    } // end updateFileGroupHash
1✔
328

329
    private boolean isFileToSkip(String fname) {
330
        if ((fname == null) || (fname.equals(""))) {
1✔
331
            return true;
×
332
        }
333

334
        if (fname.startsWith("__")) {
1✔
335
            return true;
×
336
        }
337

338
        if (fname.startsWith("._")) {
1✔
339
            return true;
×
340
        }
341

342
        return fname.endsWith(".DS_Store");
1✔
343
    }
344

345
    /**
346
     * Iterate through the zip file contents.
347
     * Does it contain any shapefiles?
348
     */
349
    private void examineZipFile() {
350
        if (zipfile == null || !zipfile.isFile()) {
1✔
NEW
351
            throw new IllegalArgumentException("Invalid zip file: " + zipfile);
×
352
        }
353

354
        try (ZipFile zipFile = ZipFile.builder().setFile(zipfile).get()) {
1✔
355
            Set<String> fileNamesInZip = new HashSet<>();
1✔
356
            Enumeration<ZipArchiveEntry> zipFileEntries = zipFile.getEntries();
1✔
357
            while(zipFileEntries.hasMoreElements()) {
1✔
358
                ZipArchiveEntry zipFileEntry = zipFileEntries.nextElement();
1✔
359
                String unzipFileName = getFileBasename(zipFileEntry.getName());
1✔
360

361
                if (isFileToSkip(unzipFileName) || zipFileEntry.isDirectory()) {
1✔
362
                    continue;
×
363
                }
364
                
365
                if (fileNamesInZip.contains(unzipFileName)) {
1✔
NEW
366
                    throw new IllegalStateException("Found file-name collision: " + unzipFileName);
×
367
                }
368
                fileNamesInZip.add(unzipFileName);
1✔
369
                updateFileGroupHash(unzipFileName);
1✔
370
            }
1✔
371
        } catch (IOException ex) {
×
NEW
372
            throw new IllegalStateException("Error inspecting zip file", ex);
×
373
        }
1✔
374
    }
1✔
375
} // end ShapefileHandler
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc