• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IQSS / dataverse / #22002

01 Apr 2024 07:56PM CUT coverage: 20.716% (+0.5%) from 20.173%
#22002

push

github

web-flow
Merge pull request #10453 from IQSS/develop

Merge 6.2 into master

704 of 2679 new or added lines in 152 files covered. (26.28%)

81 existing lines in 49 files now uncovered.

17160 of 82836 relevant lines covered (20.72%)

0.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.63
/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/CSVFileReader.java
1
/*
2
 Copyright (C) 2005-2013, by the President and Fellows of Harvard College.
3

4
 Licensed under the Apache License, Version 2.0 (the "License");
5
 you may not use this file except in compliance with the License.
6
 You may obtain a copy of the License at
7

8
 http://www.apache.org/licenses/LICENSE-2.0
9

10
 Unless required by applicable law or agreed to in writing, software
11
 distributed under the License is distributed on an "AS IS" BASIS,
12
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 See the License for the specific language governing permissions and
14
 limitations under the License.
15

16
 Dataverse Network - A web application to share, preserve and analyze research data.
17
 Developed at the Institute for Quantitative Social Science, Harvard University.
18
 Version 3.0.
19
 */
20
package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.csv;
21

22
import java.io.FileReader;
23
import java.io.InputStreamReader;
24

25
import edu.harvard.iq.dataverse.DataTable;
26
import edu.harvard.iq.dataverse.datavariable.DataVariable;
27

28
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader;
29
import edu.harvard.iq.dataverse.ingest.tabulardata.spi.TabularDataFileReaderSpi;
30
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest;
31
import edu.harvard.iq.dataverse.util.BundleUtil;
32
import java.io.BufferedInputStream;
33
import java.io.BufferedReader;
34
import java.io.File;
35
import java.io.FileWriter;
36
import java.io.IOException;
37
import java.io.PrintWriter;
38
import java.math.BigDecimal;
39
import java.math.MathContext;
40
import java.math.RoundingMode;
41
import java.text.ParseException;
42
import java.text.ParsePosition;
43
import java.text.SimpleDateFormat;
44
import java.util.ArrayList;
45
import java.util.Arrays;
46
import java.util.Date;
47
import java.util.HashSet;
48
import java.util.List;
49
import java.util.Map;
50
import java.util.Set;
51
import java.util.logging.Logger;
52
import org.apache.commons.csv.CSVFormat;
53
import org.apache.commons.lang3.StringUtils;
54
import org.apache.commons.csv.CSVParser;
55
import org.apache.commons.csv.CSVPrinter;
56
import org.apache.commons.csv.CSVRecord;
57

58
/**
59
 * Dataverse 4.0 implementation of <code>TabularDataFileReader</code> for the
60
 * plain CSV file with a variable name header.
61
 *
62
 *
63
 * @author Oscar Smith
64
 *
65
 * This implementation uses the Apache CSV Parser
66
 */
67
public class CSVFileReader extends TabularDataFileReader {
68

69
    private static final Logger logger = Logger.getLogger(CSVFileReader.class.getPackage().getName());
1✔
70
    private static final int DIGITS_OF_PRECISION_DOUBLE = 15;
71
    private static final String FORMAT_IEEE754 = "%+#." + DIGITS_OF_PRECISION_DOUBLE + "e";
72
    private MathContext doubleMathContext;
73
    private CSVFormat inFormat;
74
    //private final Set<Character> firstNumCharSet = new HashSet<>();
75

76
    // DATE FORMATS
77
    private static SimpleDateFormat[] DATE_FORMATS = new SimpleDateFormat[]{
1✔
78
        new SimpleDateFormat("yyyy-MM-dd"), //new SimpleDateFormat("yyyy/MM/dd"),
79
    //new SimpleDateFormat("MM/dd/yyyy"),
80
    //new SimpleDateFormat("MM-dd-yyyy"),
81
    };
82

83
    // TIME FORMATS
84
    private static SimpleDateFormat[] TIME_FORMATS = new SimpleDateFormat[]{
1✔
85
        // Date-time up to seconds with timezone, e.g. 2013-04-08 13:14:23 -0500
86
        new SimpleDateFormat("yyyy-MM-dd HH:mm:ss z"),
87
        // Date-time up to seconds and no timezone, e.g. 2013-04-08 13:14:23
88
        new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
89
    };
90

91
    /**
     * Creates a reader for comma- or tab-delimited input.
     *
     * @param originator the SPI instance handed to the superclass constructor
     * @param delim the field delimiter: a comma selects
     * <code>CSVFormat.EXCEL</code>, a tab selects <code>CSVFormat.TDF</code>
     * @throws IllegalArgumentException if <code>delim</code> is any other
     * character; previously such a reader was created with no input format
     * and only failed later, with a NullPointerException, when reading
     */
    public CSVFileReader(TabularDataFileReaderSpi originator, char delim) {
        super(originator);
        switch (delim) {
            case ',':
                inFormat = CSVFormat.EXCEL;
                break;
            case '\t':
                inFormat = CSVFormat.TDF;
                break;
            default:
                // Fail fast instead of leaving inFormat null.
                throw new IllegalArgumentException(
                        String.format("Unsupported delimiter (char code 0x%04x); only ',' and tab are supported", (int) delim));
        }
    }
1✔
99

100
    private void init() throws IOException {
101
        doubleMathContext = new MathContext(DIGITS_OF_PRECISION_DOUBLE, RoundingMode.HALF_EVEN);
1✔
102
        //firstNumCharSet.addAll(Arrays.asList(new Character[]{'+', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}));
103
    }
1✔
104

105
    /**
106
     * Reads a CSV file, converts it into a dataverse DataTable.
107
     *
108
     * @param stream a <code>BufferedInputStream</code>.
109
     * @return an <code>TabularDataIngest</code> object
110
     * @throws java.io.IOException if a reading error occurs.
111
     */
112
    @Override
113
    public TabularDataIngest read(BufferedInputStream stream, boolean saveWithVariableHeader, File dataFile) throws IOException {
114
        init();
1✔
115

116
        if (stream == null) {
1✔
117
            throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.nullStream"));
1✔
118
        }
119
        TabularDataIngest ingesteddata = new TabularDataIngest();
1✔
120
        DataTable dataTable = new DataTable();
1✔
121

122
        BufferedReader localBufferedReader = new BufferedReader(new InputStreamReader(stream));
1✔
123

124
        File tabFileDestination = File.createTempFile("data-", ".tab");
1✔
125
        PrintWriter tabFileWriter = new PrintWriter(tabFileDestination.getAbsolutePath());
1✔
126

127
        int lineCount = readFile(localBufferedReader, dataTable, saveWithVariableHeader, tabFileWriter);
1✔
128

129
        logger.fine("Tab file produced: " + tabFileDestination.getAbsolutePath());
1✔
130

131
        dataTable.setUnf("UNF:6:NOTCALCULATED");
1✔
132

133
        ingesteddata.setTabDelimitedFile(tabFileDestination);
1✔
134
        ingesteddata.setDataTable(dataTable);
1✔
135
        return ingesteddata;
1✔
136

137
    }
138

139
    public int readFile(BufferedReader csvReader, DataTable dataTable, boolean saveWithVariableHeader, PrintWriter finalOut) throws IOException {
140

141
        List<DataVariable> variableList = new ArrayList<>();
1✔
142
        CSVParser parser = new CSVParser(csvReader, inFormat.withHeader());
1✔
143
        Map<String, Integer> headers = parser.getHeaderMap();
1✔
144

145
        int i = 0;
1✔
146
        String variableNameHeader = null;
1✔
147
        
148
        for (String varName : headers.keySet()) {
1✔
149
            // @todo: is .keySet() guaranteed to return the names in the right order?
150
            if (varName == null || varName.isEmpty()) {
1✔
151
                // TODO:
152
                // Add a sensible variable name validation algorithm.
153
                // -- L.A. 4.0 alpha 1
154
                throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.invalidHeader"));
×
155
            }
156

157
            DataVariable dv = new DataVariable(i, dataTable);
1✔
158
            dv.setName(varName);
1✔
159
            dv.setLabel(varName);
1✔
160
            variableList.add(dv);
1✔
161

162
            dv.setTypeCharacter();
1✔
163
            dv.setIntervalDiscrete();
1✔
164
            
165
            if (saveWithVariableHeader) {
1✔
NEW
166
                    variableNameHeader = variableNameHeader == null
×
NEW
167
                            ? varName 
×
NEW
168
                            : variableNameHeader.concat("\t" + varName);
×
169
                }
170
            
171
            i++;
1✔
172
        }
1✔
173

174
        dataTable.setVarQuantity((long) variableList.size());
1✔
175
        dataTable.setDataVariables(variableList);
1✔
176

177
        boolean[] isNumericVariable = new boolean[headers.size()];
1✔
178
        boolean[] isIntegerVariable = new boolean[headers.size()];
1✔
179
        boolean[] isTimeVariable = new boolean[headers.size()];
1✔
180
        boolean[] isDateVariable = new boolean[headers.size()];
1✔
181

182
        for (i = 0; i < headers.size(); i++) {
1✔
183
            // OK, let's assume that every variable is numeric;
184
            // but we'll go through the file and examine every value; the
185
            // moment we find a value that's not a legit numeric one, we'll
186
            // assume that it is in fact a String.
187
            isNumericVariable[i] = true;
1✔
188
            isIntegerVariable[i] = true;
1✔
189
            isDateVariable[i] = true;
1✔
190
            isTimeVariable[i] = true;
1✔
191
        }
192

193
        // First, "learning" pass.
194
        // (we'll save the incoming stream in another temp file:)
195
        SimpleDateFormat[] selectedDateTimeFormat = new SimpleDateFormat[headers.size()];
1✔
196
        SimpleDateFormat[] selectedDateFormat = new SimpleDateFormat[headers.size()];
1✔
197

198
        File firstPassTempFile = File.createTempFile("firstpass-", ".csv");
1✔
199

200
        try (CSVPrinter csvFilePrinter = new CSVPrinter(
1✔
201
                // TODO allow other parsers of tabular data to use this parser by changin inFormat
202
                new FileWriter(firstPassTempFile.getAbsolutePath()), inFormat)) {
1✔
203
            //Write  headers
204
            csvFilePrinter.printRecord(headers.keySet());
1✔
205
            for (CSVRecord record : parser.getRecords()) {
1✔
206
                // Checks if #records = #columns in header
207
                if (!record.isConsistent()) {
1✔
208
                    List<String> args = Arrays.asList(new String[]{"" + (parser.getCurrentLineNumber() - 1),
1✔
209
                                                                   "" + headers.size(),
1✔
210
                                                                   "" + record.size()});
1✔
211
                    throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
1✔
212
                }
213

214
                for (i = 0; i < headers.size(); i++) {
1✔
215
                    String varString = record.get(i);
1✔
216
                    isIntegerVariable[i] = isIntegerVariable[i]
1✔
217
                                           && varString != null
218
                                           && (varString.isEmpty()
1✔
219
                                               || varString.equals("null")
1✔
220
                                               || (StringUtils.isNumeric(varString)
1✔
221
                                                    || (varString.substring(0,1).matches("[+-]") 
1✔
222
                                                        && StringUtils.isNumeric(varString.substring(1)))));
1✔
223
                    if (isNumericVariable[i]) {
1✔
224
                        // If variable might be "numeric" test to see if this value is a parsable number:
225
                        if (varString != null && !varString.isEmpty()) {
1✔
226

227
                            boolean isNumeric = false;
1✔
228
                            boolean isInteger = false;
1✔
229

230
                            if (varString.equalsIgnoreCase("NaN")
1✔
231
                                || varString.equalsIgnoreCase("NA")
1✔
232
                                || varString.equalsIgnoreCase("Inf")
1✔
233
                                || varString.equalsIgnoreCase("+Inf")
1✔
234
                                || varString.equalsIgnoreCase("-Inf")
1✔
235
                                || varString.equalsIgnoreCase("null")) {
1✔
236
                                continue;
1✔
237
                            } else {
238
                                try {
239
                                    Double testDoubleValue = new Double(varString);
1✔
240
                                    continue;
1✔
241
                                } catch (NumberFormatException ex) {
1✔
242
                                    // the token failed to parse as a double
243
                                    // so the column is a string variable.
244
                                }
245
                            }
246
                            isNumericVariable[i] = false;
1✔
247
                        }
248
                    }
249

250
                    // If this is not a numeric column, see if it is a date collumn
251
                    // by parsing the cell as a date or date-time value:
252
                    if (!isNumericVariable[i]) {
1✔
253

254
                        Date dateResult = null;
1✔
255

256
                        if (isTimeVariable[i]) {
1✔
257
                            if (varString != null && !varString.isEmpty()) {
1✔
258
                                boolean isTime = false;
1✔
259

260
                                if (selectedDateTimeFormat[i] != null) {
1✔
261
                                    ParsePosition pos = new ParsePosition(0);
1✔
262
                                    dateResult = selectedDateTimeFormat[i].parse(varString, pos);
1✔
263

264
                                    if (dateResult != null && pos.getIndex() == varString.length()) {
1✔
265
                                        // OK, successfully parsed a value!
266
                                        isTime = true;
1✔
267
                                    }
268
                                } else {
1✔
269
                                    for (SimpleDateFormat format : TIME_FORMATS) {
1✔
270
                                        ParsePosition pos = new ParsePosition(0);
1✔
271
                                        dateResult = format.parse(varString, pos);
1✔
272
                                        if (dateResult != null && pos.getIndex() == varString.length()) {
1✔
273
                                            // OK, successfully parsed a value!
274
                                            isTime = true;
1✔
275
                                            selectedDateTimeFormat[i] = format;
1✔
276
                                            break;
1✔
277
                                        }
278
                                    }
279
                                }
280
                                if (!isTime) {
1✔
281
                                    isTimeVariable[i] = false;
1✔
282
                                    // if the token didn't parse as a time value,
283
                                    // we will still try to parse it as a date, below.
284
                                    // unless this column is NOT a date.
285
                                } else {
286
                                    // And if it is a time value, we are going to assume it's
287
                                    // NOT a date.
288
                                    isDateVariable[i] = false;
1✔
289
                                }
290
                            }
291
                        }
292

293
                        if (isDateVariable[i]) {
1✔
294
                            if (varString != null && !varString.isEmpty()) {
1✔
295
                                boolean isDate = false;
1✔
296

297
                                // TODO:
298
                                // Strictly speaking, we should be doing the same thing
299
                                // here as with the time formats above; select the
300
                                // first one that works, then insist that all the
301
                                // other values in this column match it... but we
302
                                // only have one, as of now, so it should be ok.
303
                                // -- L.A. 4.0 beta
304
                                for (SimpleDateFormat format : DATE_FORMATS) {
1✔
305
                                    // Strict parsing - it will throw an
306
                                    // exception if it doesn't parse!
307
                                    format.setLenient(false);
1✔
308
                                    try {
309
                                        format.parse(varString);
1✔
310
                                        isDate = true;
1✔
311
                                        selectedDateFormat[i] = format;
1✔
312
                                        break;
1✔
313
                                    } catch (ParseException ex) {
1✔
314
                                        //Do nothing
315
                                    }
316
                                }
317
                                isDateVariable[i] = isDate;
1✔
318
                            }
319
                        }
320
                    }
321
                }
322

323
                csvFilePrinter.printRecord(record);
1✔
324
            }
1✔
325
        }
326
        dataTable.setCaseQuantity(parser.getRecordNumber());
1✔
327
        parser.close();
1✔
328
        csvReader.close();
1✔
329

330
        // Re-type the variables that we've determined are numerics:
331
        for (i = 0; i < headers.size(); i++) {
1✔
332
            if (isNumericVariable[i]) {
1✔
333
                dataTable.getDataVariables().get(i).setTypeNumeric();
1✔
334

335
                if (isIntegerVariable[i]) {
1✔
336
                    dataTable.getDataVariables().get(i).setIntervalDiscrete();
1✔
337
                } else {
338
                    dataTable.getDataVariables().get(i).setIntervalContinuous();
1✔
339
                }
340
            } else if (isDateVariable[i] && selectedDateFormat[i] != null) {
1✔
341
                // Dates are still Strings, i.e., they are "character" and "discrete";
342
                // But we add special format values for them:
343
                dataTable.getDataVariables().get(i).setFormat(DATE_FORMATS[0].toPattern());
1✔
344
                dataTable.getDataVariables().get(i).setFormatCategory("date");
1✔
345
            } else if (isTimeVariable[i] && selectedDateTimeFormat[i] != null) {
1✔
346
                // Same for time values:
347
                dataTable.getDataVariables().get(i).setFormat(selectedDateTimeFormat[i].toPattern());
1✔
348
                dataTable.getDataVariables().get(i).setFormatCategory("time");
1✔
349
            }
350
        }
351
        // Second, final pass.
352
        try (BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) {
1✔
353
            parser = new CSVParser(secondPassReader, inFormat.withHeader());
1✔
354
            String[] caseRow = new String[headers.size()];
1✔
355
            
356
            // Save the variable name header, if requested
357
            if (saveWithVariableHeader) {
1✔
NEW
358
                if (variableNameHeader == null) {
×
NEW
359
                    throw new IOException("failed to generate the Variable Names header");
×
360
                }
NEW
361
                finalOut.println(variableNameHeader);
×
362
            }
363

364
            for (CSVRecord record : parser) {
1✔
365
                if (!record.isConsistent()) {
1✔
366
                    List<String> args = Arrays.asList(new String[]{"" + (parser.getCurrentLineNumber() - 1),
×
367
                                                                   "" + headers.size(),
×
368
                                                                   "" + record.size()});
×
369
                    throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
×
370
                }
371

372
                for (i = 0; i < headers.size(); i++) {
1✔
373
                    String varString = record.get(i);
1✔
374
                    if (isNumericVariable[i]) {
1✔
375
                        if (varString == null || varString.isEmpty() || varString.equalsIgnoreCase("NA")) {
1✔
376
                            // Missing value - represented as an empty string in
377
                            // the final tab file
378
                            caseRow[i] = "";
×
379
                        } else if (varString.equalsIgnoreCase("NaN")) {
1✔
380
                            // "Not a Number" special value:
381
                            caseRow[i] = "NaN";
1✔
382
                        } else if (varString.equalsIgnoreCase("Inf")
1✔
383
                                || varString.equalsIgnoreCase("+Inf")) {
1✔
384
                            // Positive infinity:
385
                            caseRow[i] = "Inf";
1✔
386
                        } else if (varString.equalsIgnoreCase("-Inf")) {
1✔
387
                            // Negative infinity:
388
                            caseRow[i] = "-Inf";
1✔
389
                        } else if (varString.equalsIgnoreCase("null")) {
1✔
390
                            // By request from Gus - "NULL" is recognized as a
391
                            // numeric zero:
392
                            caseRow[i] = isIntegerVariable[i] ? "0" : "0.0";
1✔
393
                        } else {
394
                            /* No re-formatting is done on any other numeric values.
395
                             * We'll save them as they were, for archival purposes.
396
                             * The alternative solution - formatting in sci. notation
397
                             * is commented-out below.
398
                             */
399
                            caseRow[i] = varString;
1✔
400
                            /*
401
                             if (isIntegerVariable[i]) {
402
                                try {
403
                                    Integer testIntegerValue = new Integer(varString);
404
                                    caseRow[i] = testIntegerValue.toString();
405
                                } catch (NumberFormatException ex) {
406
                                    throw new IOException("Failed to parse a value recognized as an integer in the first pass! (?)");
407
                                }
408
                            } else {
409
                                try {
410
                                    Double testDoubleValue = new Double(varString);
411
                                    if (testDoubleValue.equals(0.0)) {
412
                                        caseRow[i] = "0.0";
413
                                    } else {
414
                                                                            // One possible implementation:
415
                                        //
416
                                        // Round our fractional values to 15 digits
417
                                        // (minimum number of digits of precision guaranteed by
418
                                        // type Double) and format the resulting representations
419
                                        // in a IEEE 754-like "scientific notation" - for ex.,
420
                                        // 753.24 will be encoded as 7.5324e2
421
                                        BigDecimal testBigDecimal = new BigDecimal(varString, doubleMathContext);
422
                                        caseRow[i] = String.format(FORMAT_IEEE754, testBigDecimal);
423

424
                                        // Strip meaningless zeros and extra + signs:
425
                                        caseRow[i] = caseRow[i].replaceFirst("00*e", "e");
426
                                        caseRow[i] = caseRow[i].replaceFirst("\\.e", ".0e");
427
                                        caseRow[i] = caseRow[i].replaceFirst("e\\+00", "");
428
                                        caseRow[i] = caseRow[i].replaceFirst("^\\+", "");
429
                                    }
430
                                } catch (NumberFormatException ex) {
431
                                    throw new IOException("Failed to parse a value recognized as numeric in the first pass! (?)");
432
                                }
433
                            }
434
                             */
435
                        }
436
                    } else if (isTimeVariable[i] || isDateVariable[i]) {
1✔
437
                        // Time and Dates are stored NOT quoted (don't ask).
438
                        if (varString != null) {
1✔
439
                            // Dealing with quotes:
440
                            // remove the leading and trailing quotes, if present:
441
                            varString = varString.replaceFirst("^\"*", "");
1✔
442
                            varString = varString.replaceFirst("\"*$", "");
1✔
443
                            caseRow[i] = varString;
1✔
444
                        } else {
445
                            caseRow[i] = "";
×
446
                        }
447
                    } else {
448
                        // Treat as a String:
449
                        // Strings are stored in tab files quoted;
450
                        // Missing values are stored as an empty string
451
                        // between two tabs (or one tab and the new line);
452
                        // Empty strings stored as "" (quoted empty string).
453
                        // For the purposes  of this CSV ingest reader, we are going
454
                        // to assume that all the empty strings in the file are
455
                        // indeed empty strings, and NOT missing values:
456
                        if (varString != null) {
1✔
457
                            // escape the quotes, newlines, and tabs:
458
                            varString = varString.replace("\"", "\\\"");
1✔
459
                            varString = varString.replace("\n", "\\n");
1✔
460
                            varString = varString.replace("\t", "\\t");
1✔
461
                            // final pair of quotes:
462
                            varString = "\"" + varString + "\"";
1✔
463
                            caseRow[i] = varString;
1✔
464
                        } else {
465
                            caseRow[i] = "\"\"";
×
466
                        }
467
                    }
468
                }
469
                finalOut.println(StringUtils.join(caseRow, "\t"));
1✔
470
            }
1✔
471
        }
472
        long linecount = parser.getRecordNumber();
1✔
473
        finalOut.close();
1✔
474
        parser.close();
1✔
475
        logger.fine("Tmp File: " + firstPassTempFile);
1✔
476
        // Firstpass file is deleted to prevent tmp from filling up.
477
        firstPassTempFile.delete();
1✔
478
        if (dataTable.getCaseQuantity().intValue() != linecount) {
1✔
479
            List<String> args = Arrays.asList(new String[]{"" + dataTable.getCaseQuantity().intValue(),
×
480
                                                           "" + linecount});
481
            throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.line_mismatch", args));
×
482
        }
483
        return (int) linecount;
1✔
484
    }
485

486
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc