• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IQSS / dataverse / #22002

01 Apr 2024 07:56PM CUT coverage: 20.716% (+0.5%) from 20.173%
#22002

push

github

web-flow
Merge pull request #10453 from IQSS/develop

Merge 6.2 into master

704 of 2679 new or added lines in 152 files covered. (26.28%)

81 existing lines in 49 files now uncovered.

17160 of 82836 relevant lines covered (20.72%)

0.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/rdata/RTabFileParser.java
1
/*
2
   Copyright (C) 2005-2012, by the President and Fellows of Harvard College.
3

4
   Licensed under the Apache License, Version 2.0 (the "License");
5
   you may not use this file except in compliance with the License.
6
   You may obtain a copy of the License at
7

8
         http://www.apache.org/licenses/LICENSE-2.0
9

10
   Unless required by applicable law or agreed to in writing, software
11
   distributed under the License is distributed on an "AS IS" BASIS,
12
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
   See the License for the specific language governing permissions and
14
   limitations under the License.
15

16
   Dataverse Network - A web application to share, preserve and analyze research data.
17
   Developed at the Institute for Quantitative Social Science, Harvard University.
18
   Version 3.0.
19
*/
20
package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.rdata;
21

22
import java.io.*;
23
import java.util.Arrays;
24
import java.util.logging.*;
25

26
import edu.harvard.iq.dataverse.util.BundleUtil;
27
import org.apache.commons.lang3.StringUtils;
28

29
import edu.harvard.iq.dataverse.DataTable;
30
import edu.harvard.iq.dataverse.datavariable.DataVariable;
31

32
/**
33
 * This is a customized version of CSVFileReader;
34
 
35
 * Tab files saved by R need some special post-processing, unique to the
36
 * R Ingest, so a specialized version of the file parser was needed. 
37
 * 
38
 *
39
 * @author Leonid Andreev
40
 *
41
 */
42
public class RTabFileParser implements java.io.Serializable {
43
    private char delimiterChar='\t';
×
44

45
    private static Logger dbgLog =
×
46
       Logger.getLogger(RTabFileParser.class.getPackage().getName());
×
47

48

49
    public RTabFileParser () {
×
50
    }
×
51

52
    public RTabFileParser (char delimiterChar) {
×
53
        this.delimiterChar = delimiterChar;
×
54
    }
×
55

56

57
    // version of the read method that parses the CSV file and stores
58
    // its content in the data table matrix (in memory).
59
    // TODO: remove this method.
60
    // Only the version that reads the file and stores it in a TAB file
61
    // should be used.
62

63

64
    public int read(BufferedReader csvReader, DataTable dataTable, boolean saveWithVariableHeader, PrintWriter pwout) throws IOException {
NEW
65
        dbgLog.fine("RTabFileParser: Inside R Tab file parser");
×
66
      
67
        int varQnty = 0;
×
68

69
        try {
70
            varQnty = dataTable.getVarQuantity().intValue();
×
71
        } catch (Exception ex) {
×
72
            //return -1;
73
            throw new IOException (BundleUtil.getStringFromBundle("rtabfileparser.ioexception.parser1"));
×
74
        }
×
75

76
        if (varQnty == 0) {
×
77
            //return -1;
78
            throw new IOException (BundleUtil.getStringFromBundle("rtabfileparser.ioexception.parser2"));
×
79
        }
80

81
        dbgLog.fine("CSV reader; varQnty: "+varQnty);
×
82
        dbgLog.fine("CSV reader; delimiter: "+delimiterChar);
×
83

84

85
        String[] caseRow = new String[varQnty];
×
86

87
        String line;
88
        String[] valueTokens;
89

90
        int lineCounter = 0;
×
91

92
        boolean[] isCharacterVariable = new boolean[varQnty];
×
93
        boolean[] isContinuousVariable = new boolean[varQnty];
×
94
        boolean[] isTimeVariable = new boolean[varQnty];
×
95
        boolean[] isBooleanVariable = new boolean[varQnty];
×
96
        
NEW
97
        String variableNameHeader = null;
×
98
        
99
        if (dataTable.getDataVariables() != null) {
×
100
            for (int i = 0; i < varQnty; i++) {
×
101
                DataVariable var = dataTable.getDataVariables().get(i);
×
102
                if (var == null) {
×
NEW
103
                    throw new IOException ("null dataVariable passed to the parser");
×
104
                    
105
                }
106
                if (var.getType() == null) {
×
NEW
107
                    throw new IOException ("null dataVariable type passed to the parser");
×
108
                }
109
                if (var.isTypeCharacter()) {
×
110
                    isCharacterVariable[i] = true; 
×
111
                    isContinuousVariable[i] = false; 
×
112
                    
113
                    if (var.getFormatCategory() != null && 
×
114
                            (var.getFormatCategory().startsWith("date") || var.getFormatCategory().startsWith("time"))) {
×
115
                            isTimeVariable[i] = true; 
×
116
                        }
117
                    
118
                } else if (var.isTypeNumeric()) {
×
119
                    isCharacterVariable[i] = false; 
×
120
                    
121
                    if (var.getInterval() == null) {
×
122
                        // throw exception!
123
                    }
124
                    if (var.isIntervalContinuous()) {
×
125
                        isContinuousVariable[i] = true;
×
126
                    } else {
127
                        // discrete by default:
128
                        isContinuousVariable[i] = false; 
×
129
                        if (var.getFormatCategory() != null && var.getFormatCategory().equals("Boolean")) {
×
130
                            isBooleanVariable[i] = true; 
×
131
                        }
132
                    }
133
                } else {
NEW
134
                     throw new IOException ("unknown dataVariable format passed to the parser");
×
135
                }
136
                
NEW
137
                if (saveWithVariableHeader) {
×
NEW
138
                    variableNameHeader = variableNameHeader == null  
×
NEW
139
                            ? var.getName() 
×
NEW
140
                            : variableNameHeader.concat("\t" + var.getName());
×
141
                }
142
            }
143
        } else {
NEW
144
            throw new IOException ("null dataVariables list passed to the parser");
×
145
        }
146
        
NEW
147
        if (saveWithVariableHeader) {
×
NEW
148
            if (variableNameHeader == null) {
×
NEW
149
                throw new IOException ("failed to generate the Variable Names header");
×
150
            }
NEW
151
            pwout.println(variableNameHeader);
×
152
        }
153
        
154
        while ((line = csvReader.readLine()) != null) {
×
155
            // chop the line:
156
            line = line.replaceFirst("[\r\n]*$", "");
×
157
            valueTokens = line.split(""+delimiterChar, -2);
×
158

159
            if (valueTokens == null) {
×
160
                throw new IOException(BundleUtil.getStringFromBundle("rtabfileparser.ioexception.failed" , Arrays.asList(Integer.toString(lineCounter + 1))));
×
161

162
            }
163

164
            if (valueTokens.length != varQnty) {
×
165
                throw new IOException(BundleUtil.getStringFromBundle("rtabfileparser.ioexception.mismatch" , Arrays.asList(Integer.toString(lineCounter + 1),Integer.toString(varQnty),Integer.toString(valueTokens.length))));
×
166
            }
167

168
            //dbgLog.fine("case: "+lineCounter);
169

170
            for ( int i = 0; i < varQnty; i++ ) {
×
171
                //dbgLog.fine("value: "+valueTokens[i]);
172

173
                if (isCharacterVariable[i]) {
×
174
                    // String. Adding to the table, quoted.
175
                    // Empty strings stored as " " (one white space):
176
                    if (valueTokens[i] != null && (!valueTokens[i].equals(""))) {
×
177
                        String charToken = valueTokens[i];
×
178
                        // Dealing with quotes: 
179
                        // remove the leading and trailing quotes, if present:
180
                        charToken = charToken.replaceFirst("^\"", "");
×
181
                        charToken = charToken.replaceFirst("\"$", "");
×
182
                        // escape the remaining ones:
183
                        charToken = charToken.replace("\"", "\\\"");
×
184
                        // final pair of quotes:
185
                        if (isTimeVariable==null || (!isTimeVariable[i])) {
×
186
                            charToken = "\"" + charToken + "\"";
×
187
                        }
188
                        caseRow[i] = charToken;
×
189
                    } else {
×
190
                        // missing value:
191
                           caseRow[i] = ""; 
×
192
                    }
193

194
                } else if (isContinuousVariable[i]) {
×
195
                    // Numeric, Double:
196
                    // This is the major case of special/custom processing,
197
                    // specific for R ingest. It was found to be impossible
198
                    // to write a numeric/continuous column into the tab file
199
                    // while unambiguously preserving both NA and NaNs, if both
200
                    // are present. At least, not if using the standard 
201
                    // write.table function. So it seemed easier to treat this
202
                    // as a special case, rather than write our own write.table
203
                    // equivalent in R. On the R side, if any special values 
204
                    // are present in the columns, the values will be 
205
                    // converted into a character vector. The NAs and NaNs will 
206
                    // be replaced with the character tokens "NA" and "NaN" 
207
                    // respectively. Of course R will add double quotes around 
208
                    // the tokens, hence the post-processing - we'll just need 
209
                    // to remove all these quotes, and then we'll be fine. 
210
                    
211
                    dbgLog.fine("R Tab File Parser; double value: "+valueTokens[i]); 
×
212
                    // Dealing with quotes: 
213
                    // remove the leading and trailing quotes, if present:
214
                    valueTokens[i] = valueTokens[i].replaceFirst("^\"", "");
×
215
                    valueTokens[i] = valueTokens[i].replaceFirst("\"$", "");
×
216
                    if (valueTokens[i] != null && valueTokens[i].equalsIgnoreCase("NA")) {
×
217
                        caseRow[i] = "";
×
218
                    } else if (valueTokens[i] != null && valueTokens[i].equalsIgnoreCase("NaN")) {
×
219
                        caseRow[i] = "NaN";
×
220
                    } else if (valueTokens[i] != null && 
×
221
                            ( valueTokens[i].equalsIgnoreCase("Inf")
×
222
                            || valueTokens[i].equalsIgnoreCase("+Inf"))) {
×
223
                        caseRow[i] = "Inf";
×
224
                    } else if (valueTokens[i] != null && valueTokens[i].equalsIgnoreCase("-Inf")) {
×
225
                        caseRow[i] = "-Inf";
×
226
                    } else {
227
                        try {
228
                            Double testDoubleValue = new Double(valueTokens[i]);
×
229
                            caseRow[i] = testDoubleValue.toString();//valueTokens[i];
×
230
                        } catch (Exception ex) {
×
231
                            dbgLog.fine("caught exception reading numeric value; variable: " + i + ", case: " + lineCounter + "; value: " + valueTokens[i]);
×
232

233
                            //dataTable[i][lineCounter] = (new Double(0)).toString();
234
                            caseRow[i] = "";
×
235
                            
236
                            // TODO:
237
                            // decide if we should rather throw an exception and exit here; 
238
                            // all the values in this file at this point must be 
239
                            // legit numeric values (?) -- L.A.
240
                        }
×
241
                    }
242
                } else if (isBooleanVariable[i]) {
×
243
                    if (valueTokens[i] != null) {
×
244
                        String charToken = valueTokens[i];
×
245
                        // remove the leading and trailing quotes, if present:
246
                        charToken = charToken.replaceFirst("^\"", "");
×
247
                        charToken = charToken.replaceFirst("\"$", "");
×
248
                        
249
                        if (charToken.equals("FALSE")) {
×
250
                            caseRow[i] = "0";
×
251
                        } else if (charToken.equals("TRUE")) {
×
252
                            caseRow[i] = "1";
×
253
                        } else if (charToken.equals("")) {
×
254
                            // Legit case - Missing Value!
255
                            caseRow[i] = charToken;
×
256
                        } else {
257
                            throw new IOException(BundleUtil.getStringFromBundle("rtabfileparser.ioexception.boolean" , Arrays.asList(Integer.toString( +i)))+charToken);
×
258
                        }
259
                    } else {
×
260
                        throw new IOException(BundleUtil.getStringFromBundle("rtabfileparser.ioexception.read" , Arrays.asList(Integer.toString(i))));
×
261
                    }
262

263
                    
264
                } else {
265
                    // Numeric, Integer:
266
                    // One special case first: R NA (missing value) needs to be 
267
                    // converted into the DVN's missing value - an empty String;
268
                    // (strictly speaking, this isn't necessary - an attempt to 
269
                    // create an Integer object from the String "NA" would
270
                    // result in an exception, that would be intercepted below,
271
                    // with the same end result)
272
                    dbgLog.fine("R Tab File Parser; integer value: "+valueTokens[i]);
×
273
                    if (valueTokens[i] != null && valueTokens[i].equalsIgnoreCase("NA")) {
×
274
                        caseRow[i] = "";
×
275
                    } else {
276
                        try {
277
                            Integer testIntegerValue = new Integer(valueTokens[i]);
×
278
                            caseRow[i] = testIntegerValue.toString();
×
279
                        } catch (Exception ex) {
×
280
                            dbgLog.fine("caught exception reading numeric value; variable: " + i + ", case: " + lineCounter + "; value: " + valueTokens[i]);
×
281

282
                            //dataTable[i][lineCounter] = "0";
283
                            caseRow[i] = "";
×
284
                        }
×
285
                    }
286
                }
287
            }
288

289
            pwout.println(StringUtils.join(caseRow, "\t"));
×
290

291
            lineCounter++;
×
292
        }
293

294
        //csvData.setData(dataTable);
295
        //return csvData;
296

297
        pwout.close();
×
298
        return lineCounter;
×
299
    }
300

301
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc