• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IQSS / dataverse / #22002

01 Apr 2024 07:56PM CUT coverage: 20.716% (+0.5%) from 20.173%
#22002

push

github

web-flow
Merge pull request #10453 from IQSS/develop

Merge 6.2 into master

704 of 2679 new or added lines in 152 files covered. (26.28%)

81 existing lines in 49 files now uncovered.

17160 of 82836 relevant lines covered (20.72%)

0.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

48.78
/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/dta/DTAFileReader.java
1
/*
2
   Copyright (C) 2005-2012, by the President and Fellows of Harvard College.
3

4
   Licensed under the Apache License, Version 2.0 (the "License");
5
   you may not use this file except in compliance with the License.
6
   You may obtain a copy of the License at
7

8
         http://www.apache.org/licenses/LICENSE-2.0
9

10
   Unless required by applicable law or agreed to in writing, software
11
   distributed under the License is distributed on an "AS IS" BASIS,
12
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
   See the License for the specific language governing permissions and
14
   limitations under the License.
15

16
   Dataverse Network - A web application to share, preserve and analyze research data.
17
   Developed at the Institute for Quantitative Social Science, Harvard University.
18
   Version 3.0.
19
*/
20

21
package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta;
22

23
import java.io.BufferedInputStream;
24
import java.io.File;
25
import java.io.FileOutputStream;
26
import java.io.IOException;
27
import java.io.InvalidObjectException;
28
import java.io.OutputStreamWriter;
29
import java.io.PrintWriter;
30
import java.nio.ByteBuffer;
31
import java.nio.ByteOrder;
32
import java.text.DecimalFormat;
33
import java.text.NumberFormat;
34
import java.text.ParseException;
35
import java.text.SimpleDateFormat;
36
import java.util.ArrayList;
37
import java.util.Arrays;
38
import java.util.Calendar;
39
import java.util.Date;
40
import java.util.GregorianCalendar;
41
import java.util.HashMap;
42
import java.util.HashSet;
43
import java.util.LinkedHashMap;
44
import java.util.List;
45
import java.util.Map;
46
import java.util.Set;
47
import java.util.TimeZone;
48

49
import java.util.logging.Level;
50
import java.util.logging.Logger;
51
import java.util.regex.Matcher;
52

53
import org.apache.commons.codec.binary.Hex;
54

55
import edu.harvard.iq.dataverse.DataTable;
56
import edu.harvard.iq.dataverse.datavariable.DataVariable;
57
import edu.harvard.iq.dataverse.datavariable.VariableCategory;
58

59
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader;
60
import edu.harvard.iq.dataverse.ingest.tabulardata.spi.TabularDataFileReaderSpi;
61
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest;
62
import org.apache.commons.lang3.StringUtils;
63

64

65
/**
66
 * ingest plugin for Stata DTA file format.
67
 *
68
 * This reader plugin has been fully re-implemented for the DVN 4.0;
69
 * It still borrows heavily from, and builds on the basis of the 
70
 * old implementation by Akio Sone, that was in use in the versions 
71
 * 2-3 of the DVN.
72
 * 
73
 * @author Akio Sone at UNC-Odum
74
 * @author landreev
75
 */
76

77
public class DTAFileReader extends TabularDataFileReader{
78

79
    private static final Logger logger = Logger.getLogger(DTAFileReader.class.getCanonicalName());
1✔
80

81
    //@Inject
82
    //VariableServiceBean varService;
83
    // static fields, STATA-specific constants, etc. 
84
    // (should it all be isolated in some other class?) 
85

86
    private static Map<Integer, String> STATA_RELEASE_NUMBER = 
1✔
87
            new HashMap<>();
88
    private static Map<String, Integer> release105type = new LinkedHashMap<>();
1✔
89
    private static Map<String, Integer> release111type = new LinkedHashMap<>();
1✔
90

91
    private static Map<Integer, Map<String, Integer>> CONSTATNT_TABLE = new LinkedHashMap<>();
1✔
92

93
    private static Map<String, Integer> release104constant = new LinkedHashMap<>();
1✔
94
                                        
95
    private static Map<String, Integer> release105constant = new LinkedHashMap<>();
1✔
96
                                        
97
    private static Map<String, Integer> release108constant = new LinkedHashMap<>();
1✔
98
                                        
99
    private static Map<String, Integer> release110constant = new LinkedHashMap<>();
1✔
100
                                        
101
    private static Map<String, Integer> release111constant = new LinkedHashMap<>();
1✔
102
                                        
103
    private static Map<String, Integer> release113constant = new LinkedHashMap<>();
1✔
104
                                        
105
    private static Map<String, Integer> release114constant = new LinkedHashMap<>();
1✔
106
      
107
    private static Map<String, Integer> release115constant = new LinkedHashMap<>();
1✔
108
    
109
    private static Map<Byte, Integer> byteLengthTable105 = new HashMap<>();
1✔
110
    private static Map<Byte, Integer> byteLengthTable111 = new HashMap<>();
1✔
111
                                        
112
    private static Map<Byte, String> variableTypeTable105 = new LinkedHashMap<>();
1✔
113
    private static Map<Byte, String> variableTypeTable111 = new LinkedHashMap<>();
1✔
114
    
115
    private static Map<String, Integer> variableTypeMap = new LinkedHashMap<>();
1✔
116

117
    private static final int[] LENGTH_HEADER = {60, 109};
1✔
118
    private static final int[] LENGTH_LABEL = {32, 81};
1✔
119
    private static final int[] LENGTH_NAME = {9, 33};
1✔
120
    private static final int[] LENGTH_FORMAT_FIELD = {7, 12, 49};
1✔
121
    private static final int[] LENGTH_EXPANSION_FIELD ={0, 2, 4};
1✔
122
    private static final int[] DBL_MV_PWR = {333, 1023};
1✔
123
 
124
    static {
125
        
126
        STATA_RELEASE_NUMBER.put(104, "rel_3");
1✔
127
        STATA_RELEASE_NUMBER.put(105, "rel_4or5");
1✔
128
        STATA_RELEASE_NUMBER.put(108, "rel_6");
1✔
129
        STATA_RELEASE_NUMBER.put(110, "rel_7first");
1✔
130
        STATA_RELEASE_NUMBER.put(111, "rel_7scnd");
1✔
131
        STATA_RELEASE_NUMBER.put(113, "rel_8_or_9");
1✔
132
        STATA_RELEASE_NUMBER.put(114, "rel_10");    // reading stata docs suggests
1✔
133
                                                    // 114 means release 11 - ?
134
                                                    // confused. -- L.A.
135
        STATA_RELEASE_NUMBER.put(115, "rel_12");
1✔
136
        // 115 is the *last* development of the "classic" Stata format. 
137
        // STATA v. 13 introduced format 117 (116 was an in-house, 
138
        // experimental version that was never released), it is a completely
139
        // new development, incompatible with the old format. 
140
        
141
        release105type.put("STRING",  127);
1✔
142
        release105type.put("BYTE",     98);
1✔
143
        release105type.put("INT",     105);
1✔
144
        release105type.put("LONG",    108);
1✔
145
        release105type.put("FLOAT",   102);
1✔
146
        release105type.put("DOUBLE0", 100);
1✔
147
        
148
        release111type.put("STRING",   0);
1✔
149
        release111type.put("BYTE",   -5);
1✔
150
        release111type.put("INT",    -4);
1✔
151
        release111type.put("LONG",   -3);
1✔
152
        release111type.put("FLOAT",  -2);
1✔
153
        release111type.put("DOUBLE", -1);
1✔
154

155
        
156
        
157
        release104constant.put("HEADER",     LENGTH_HEADER[0]);
1✔
158
        release104constant.put("LABEL",     LENGTH_LABEL[0]);
1✔
159
        release104constant.put("NAME",      LENGTH_NAME[0]);
1✔
160
        release104constant.put("FORMAT",    LENGTH_FORMAT_FIELD[0]);
1✔
161
        release104constant.put("EXPANSION", LENGTH_EXPANSION_FIELD[0]);
1✔
162
        release104constant.put("DBL_MV_PWR",DBL_MV_PWR[0]);
1✔
163
        CONSTATNT_TABLE.put(104, release104constant);
1✔
164

165
        release105constant.put("HEADER",     LENGTH_HEADER[0]);
1✔
166
        release105constant.put("LABEL",     LENGTH_LABEL[0]);
1✔
167
        release105constant.put("NAME",      LENGTH_NAME[0]);
1✔
168
        release105constant.put("FORMAT",    LENGTH_FORMAT_FIELD[1]);
1✔
169
        release105constant.put("EXPANSION", LENGTH_EXPANSION_FIELD[1]);
1✔
170
        release105constant.put("DBL_MV_PWR",DBL_MV_PWR[0]);
1✔
171
        CONSTATNT_TABLE.put(105, release105constant);
1✔
172
        
173
        release108constant.put("HEADER",     LENGTH_HEADER[1]);
1✔
174
        release108constant.put("LABEL",     LENGTH_LABEL[1]);
1✔
175
        release108constant.put("NAME",      LENGTH_NAME[0]);
1✔
176
        release108constant.put("FORMAT",    LENGTH_FORMAT_FIELD[1]);
1✔
177
        release108constant.put("EXPANSION", LENGTH_EXPANSION_FIELD[1]);
1✔
178
        release108constant.put("DBL_MV_PWR",DBL_MV_PWR[1]);
1✔
179
        CONSTATNT_TABLE.put(108, release108constant);
1✔
180
        
181
        release110constant.put("HEADER",     LENGTH_HEADER[1]);
1✔
182
        release110constant.put("LABEL",     LENGTH_LABEL[1]);
1✔
183
        release110constant.put("NAME",      LENGTH_NAME[1]);
1✔
184
        release110constant.put("FORMAT",    LENGTH_FORMAT_FIELD[1]);
1✔
185
        release110constant.put("EXPANSION", LENGTH_EXPANSION_FIELD[2]);
1✔
186
        release110constant.put("DBL_MV_PWR",DBL_MV_PWR[1]);
1✔
187
        CONSTATNT_TABLE.put(110, release110constant);
1✔
188
        
189
        release111constant.put("HEADER",     LENGTH_HEADER[1]);
1✔
190
        release111constant.put("LABEL",     LENGTH_LABEL[1]);
1✔
191
        release111constant.put("NAME",      LENGTH_NAME[1]);
1✔
192
        release111constant.put("FORMAT",    LENGTH_FORMAT_FIELD[1]);
1✔
193
        release111constant.put("EXPANSION", LENGTH_EXPANSION_FIELD[2]);
1✔
194
        release111constant.put("DBL_MV_PWR",DBL_MV_PWR[1]);
1✔
195
        CONSTATNT_TABLE.put(111, release111constant);
1✔
196
        
197
        release113constant.put("HEADER",     LENGTH_HEADER[1]);
1✔
198
        release113constant.put("LABEL",     LENGTH_LABEL[1]);
1✔
199
        release113constant.put("NAME",      LENGTH_NAME[1]);
1✔
200
        release113constant.put("FORMAT",    LENGTH_FORMAT_FIELD[1]);
1✔
201
        release113constant.put("EXPANSION", LENGTH_EXPANSION_FIELD[2]);
1✔
202
        release113constant.put("DBL_MV_PWR",DBL_MV_PWR[1]);
1✔
203
        CONSTATNT_TABLE.put(113, release113constant);
1✔
204
        
205
        release114constant.put("HEADER",     LENGTH_HEADER[1]);
1✔
206
        release114constant.put("LABEL",     LENGTH_LABEL[1]);
1✔
207
        release114constant.put("NAME",      LENGTH_NAME[1]);
1✔
208
        release114constant.put("FORMAT",    LENGTH_FORMAT_FIELD[2]);
1✔
209
        release114constant.put("EXPANSION", LENGTH_EXPANSION_FIELD[2]);
1✔
210
        release114constant.put("DBL_MV_PWR",DBL_MV_PWR[1]);
1✔
211
        CONSTATNT_TABLE.put(114, release114constant);
1✔
212
        
213
        release115constant.put("HEADER",     LENGTH_HEADER[1]);
1✔
214
        release115constant.put("LABEL",     LENGTH_LABEL[1]);
1✔
215
        release115constant.put("NAME",      LENGTH_NAME[1]);
1✔
216
        release115constant.put("FORMAT",    LENGTH_FORMAT_FIELD[2]);
1✔
217
        release115constant.put("EXPANSION", LENGTH_EXPANSION_FIELD[2]);
1✔
218
        release115constant.put("DBL_MV_PWR",DBL_MV_PWR[1]);
1✔
219
        CONSTATNT_TABLE.put(115, release115constant);
1✔
220
        
221
        byteLengthTable105.put((byte) 98,1);
1✔
222
        byteLengthTable105.put((byte)105,2);
1✔
223
        byteLengthTable105.put((byte)108,4);
1✔
224
        byteLengthTable105.put((byte)102,4);
1✔
225
        byteLengthTable105.put((byte)100,8);
1✔
226
        
227
        byteLengthTable111.put((byte)-5,1);
1✔
228
        byteLengthTable111.put((byte)-4,2);
1✔
229
        byteLengthTable111.put((byte)-3,4);
1✔
230
        byteLengthTable111.put((byte)-2,4);
1✔
231
        byteLengthTable111.put((byte)-1,8);
1✔
232

233
       
234
        variableTypeTable105.put((byte) 98,"Byte");
1✔
235
        variableTypeTable105.put((byte)105,"Integer");
1✔
236
        variableTypeTable105.put((byte)108,"Long");
1✔
237
        variableTypeTable105.put((byte)102,"Float");
1✔
238
        variableTypeTable105.put((byte)100,"Double");
1✔
239
        
240
        variableTypeTable111.put((byte)-5,"Byte");
1✔
241
        variableTypeTable111.put((byte)-4,"Integer");
1✔
242
        variableTypeTable111.put((byte)-3,"Long");
1✔
243
        variableTypeTable111.put((byte)-2,"Float");
1✔
244
        variableTypeTable111.put((byte)-1,"Double");
1✔
245

246

247
        variableTypeMap.put("Byte",   -5);
1✔
248
        variableTypeMap.put("Integer",-4);
1✔
249
        variableTypeMap.put("Long",   -3);
1✔
250
        variableTypeMap.put("Float",  -2);
1✔
251
        variableTypeMap.put("Double", -1);
1✔
252
        variableTypeMap.put("String",  0);
1✔
253
        
254

255
    }
256

257
   
258
    private static String[] MIME_TYPE = {"application/x-stata"};
1✔
259

260

261

262
    /** format-related constants */
263

264
    private static final int DTA_MAGIC_NUMBER_LENGTH = 4;
265
    private static final int NVAR_FIELD_LENGTH       = 2;
266
    private static final int NOBS_FIELD_LENGTH       = 4;
267
    private static final int TIME_STAMP_LENGTH      = 18;
268
    private static final int VAR_SORT_FIELD_LENGTH   = 2;
269
    private static final int VALUE_LABEL_HEADER_PADDING_LENGTH = 3;
270

271

272
    private static int MISSING_VALUE_BIAS = 26;
1✔
273

274
    private byte BYTE_MISSING_VALUE = Byte.MAX_VALUE;
1✔
275
    private short INT_MISSIG_VALUE = Short.MAX_VALUE;
1✔
276
    private int LONG_MISSING_VALUE = Integer.MAX_VALUE;
1✔
277
    
278
  
279

280
    private static final List<Float> FLOAT_MISSING_VALUES = Arrays.asList(
1✔
281
        0x1.000p127f, 0x1.001p127f, 0x1.002p127f, 0x1.003p127f,
1✔
282
        0x1.004p127f, 0x1.005p127f, 0x1.006p127f, 0x1.007p127f,
1✔
283
        0x1.008p127f, 0x1.009p127f, 0x1.00ap127f, 0x1.00bp127f,
1✔
284
        0x1.00cp127f, 0x1.00dp127f, 0x1.00ep127f, 0x1.00fp127f,
1✔
285
        0x1.010p127f, 0x1.011p127f, 0x1.012p127f, 0x1.013p127f,
1✔
286
        0x1.014p127f, 0x1.015p127f, 0x1.016p127f, 0x1.017p127f,
1✔
287
        0x1.018p127f, 0x1.019p127f, 0x1.01ap127f);
1✔
288

289
    private Set<Float> FLOAT_MISSING_VALUE_SET = new HashSet<>(FLOAT_MISSING_VALUES);
1✔
290

291
    private static final List<Double> DOUBLE_MISSING_VALUE_LIST = Arrays.asList(
1✔
292
        0x1.000p1023, 0x1.001p1023, 0x1.002p1023, 0x1.003p1023, 0x1.004p1023,
1✔
293
        0x1.005p1023, 0x1.006p1023, 0x1.007p1023, 0x1.008p1023, 0x1.009p1023,
1✔
294
        0x1.00ap1023, 0x1.00bp1023, 0x1.00cp1023, 0x1.00dp1023, 0x1.00ep1023,
1✔
295
        0x1.00fp1023, 0x1.010p1023, 0x1.011p1023, 0x1.012p1023, 0x1.013p1023,
1✔
296
        0x1.014p1023, 0x1.015p1023, 0x1.016p1023, 0x1.017p1023, 0x1.018p1023,
1✔
297
        0x1.019p1023, 0x1.01ap1023);
1✔
298

299
    private Set<Double> DOUBLE_MISSING_VALUE_SET = new HashSet<>(DOUBLE_MISSING_VALUE_LIST);
1✔
300

301
    private static SimpleDateFormat sdf_ymdhmsS = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); // sdf
1✔
302

303

304
    private static SimpleDateFormat sdf_ymd = new SimpleDateFormat("yyyy-MM-dd"); // sdf2
1✔
305

306

307
    private static SimpleDateFormat sdf_hms = new SimpleDateFormat("HH:mm:ss"); // stf
1✔
308

309

310
    private static SimpleDateFormat sdf_yw = new SimpleDateFormat("yyyy-'W'ww");
1✔
311

312

313

314
    // stata's calendar
315
    private static Calendar GCO_STATA = new GregorianCalendar(TimeZone.getTimeZone("GMT"));
1✔
316

317
    private static String[] DATE_TIME_FORMAT= {
1✔
318
        "%tc", "%td", "%tw", "%tq","%tm", "%th", "%ty", 
319
        "%d",  "%w",  "%q", "%m",  "h", "%tb"
320
    };
321
    // New "business calendar format" has been added in Stata 12. -- L.A. 
322
    private static String[] DATE_TIME_CATEGORY={
1✔
323
        "time", "date", "date", "date", "date", "date", "date",
324
        "date", "date", "date", "date", "date", "date"
325
    };
326
    private static Map<String, String> DATE_TIME_FORMAT_TABLE=  new LinkedHashMap<>();
1✔
327

328
    private static long MILLISECONDS_PER_DAY = 24L * 60 * 60 * 1000;
1✔
329

330
    private static long STATA_BIAS_TO_EPOCH;
331

332
    static {
333
     
334
        sdf_ymdhmsS.setTimeZone(TimeZone.getTimeZone("GMT"));
1✔
335
        sdf_ymd.setTimeZone(TimeZone.getTimeZone("GMT"));
1✔
336
        sdf_hms.setTimeZone(TimeZone.getTimeZone("GMT"));
1✔
337
        sdf_yw.setTimeZone(TimeZone.getTimeZone("GMT"));
1✔
338

339
        // set stata's calendar
340
        GCO_STATA.set(1, 1960);// year
1✔
341
        GCO_STATA.set(2, 0); // month
1✔
342
        GCO_STATA.set(5, 1);// day of month
1✔
343
        GCO_STATA.set(9, 0);// AM(0) or PM(1)
1✔
344
        GCO_STATA.set(10, 0);// hh
1✔
345
        GCO_STATA.set(12, 0);// mm
1✔
346
        GCO_STATA.set(13, 0);// ss
1✔
347
        GCO_STATA.set(14, 0); // SS millisecond
1✔
348

349

350
        STATA_BIAS_TO_EPOCH  = GCO_STATA.getTimeInMillis(); // =  -315619200000
1✔
351
        
352
        for (int i=0; i<DATE_TIME_FORMAT.length; i++){
1✔
353
            DATE_TIME_FORMAT_TABLE.put(DATE_TIME_FORMAT[i],DATE_TIME_CATEGORY[i]);
1✔
354
        }
355

356
    }
357
    
358

359

360

361
    // instance fields //
362

363
    private static Logger dbgLog = Logger.getLogger(DTAFileReader.class.getPackage().getName());
1✔
364

365
    
366
    // TODO: 
367
    // add a comment explaining what this table is for: 
368
    // -- L.A. 4.0
369
    private String[] valueLabelsLookupTable = null; 
1✔
370
    
371
    /* 
372
     * StringLengthTable stores the byte lengths of string variables (these are 
373
     * the same fixed values for every string column). 
374
     * -- L.A. 4.0
375
     */
376
    private Map<Integer, Integer> StringLengthTable = new LinkedHashMap<>();
1✔
377
    
378

379
    private Map<String, Integer> typeOffsetTable ;
380

381
    private Map<String, Integer> constantTable ;
382

383
    private Map<Byte, Integer> byteLengthTable;
384

385
    private Map<Byte, String> variableTypeTable;
386

387

388

389
    private NumberFormat twoDigitFormatter = new DecimalFormat("00");
1✔
390

391
    private NumberFormat doubleNumberFormatter = new DecimalFormat();
1✔
392

393
    TabularDataIngest ingesteddata = new TabularDataIngest();
1✔
394

395
    private DataTable dataTable = new DataTable();
1✔
396

397
    private int releaseNumber;
398

399
    private int headerLength;
400

401
    private int dataLabelLength;
402

403
    private boolean isLittleEndian = false;
1✔
404

405
    private int bytes_per_row;
406

407
    
408

409
    /* variableTypes is a list of string values representing the type of 
410
     * data values *stored* in the file - "byte", "integer", "float", "string", 
411
     * etc. We need this information as we're reading the data, to know how
412
     * many bytes to read for every object type and how to convert the binary
413
     * data into the proper Java type.
414
     * It's important to note that these types are *Stata* types - the types
415
     * of the variables on the DVN side may change (see below).
416
     * The variableTypesFinal will describe the data values once they have 
417
     * been read and stored in the tab. file. This is an important distinction: 
418
     * for example, the time/data values are stored as binary numeric values 
419
     * in Stata files, but we'll be storing them as strings in the DVN tabular
420
     * files.
421
     */
422
    
423
    private String[] variableTypes=null;
1✔
424
    
425
    private String[] dateVariableFormats=null; 
1✔
426
  
427
    private int value_label_table_length;
428
    
429
    private static final String MissingValueForTabDelimitedFile = "";
430
  
431
    // Constructor -----------------------------------------------------------//
432

433
    /**
     * Creates a reader for Stata DTA files.
     *
     * @param originator the <code>TabularDataFileReaderSpi</code> service
     *                   provider passed on to the superclass constructor.
     */
    public DTAFileReader(TabularDataFileReaderSpi originator) {
        super(originator);
    }
1✔
442

443
    // Methods ---------------------------------------------------------------//
444

445
    /*
446
     * This method configures Stata's release-specific parameters:
447
     */
448
    private void init() throws IOException {
449
        //
450
        if (dbgLog.isLoggable(Level.INFO)) dbgLog.info("release number="+releaseNumber);
1✔
451
        
452
        if (releaseNumber < 111) {
1✔
453
            typeOffsetTable = release105type;
×
454
            variableTypeTable = variableTypeTable105;
×
455
            byteLengthTable = byteLengthTable105;
×
456
        } else {
457
            typeOffsetTable = release111type;
1✔
458
            variableTypeTable = variableTypeTable111;
1✔
459
            byteLengthTable = byteLengthTable111;
1✔
460
            BYTE_MISSING_VALUE   -= MISSING_VALUE_BIAS;
1✔
461
            INT_MISSIG_VALUE     -= MISSING_VALUE_BIAS;
1✔
462
            LONG_MISSING_VALUE   -= MISSING_VALUE_BIAS;
1✔
463
        }
464
        
465
        if (releaseNumber <= 105){
1✔
466
            value_label_table_length = 2;
×
467
        } else {
468
            value_label_table_length = 4;
1✔
469
        }
470
        
471
        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("type-offset table to be used:\n"+typeOffsetTable);
1✔
472

473
        constantTable = CONSTATNT_TABLE.get(releaseNumber);
1✔
474

475
        headerLength = constantTable.get("HEADER") - DTA_MAGIC_NUMBER_LENGTH;
1✔
476
        
477
        dataLabelLength = headerLength - (NVAR_FIELD_LENGTH +
1✔
478
            NOBS_FIELD_LENGTH + TIME_STAMP_LENGTH);
479
        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("data_label_length="+dataLabelLength);
1✔
480

481
        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("constant table to be used:\n"+constantTable);
1✔
482
       
483

484
        doubleNumberFormatter.setGroupingUsed(false);
1✔
485
        doubleNumberFormatter.setMaximumFractionDigits(340);
1✔
486
        /* 
487
         * it's no longer necessary to use the variable service to look up 
488
         * various type entities: 
489
         * -- L.A. 4.0 beta 9
490
        
491
        Context ctx = null; 
492
        try {
493
            ctx = new InitialContext();
494
            varService = (VariableServiceBean) ctx.lookup("java:global/dataverse-4.0/VariableServiceBean");
495
        } catch (NamingException nex) {
496
            try {
497
                ctx = new InitialContext();
498
                varService = (VariableServiceBean) ctx.lookup("java:global/dataverse/VariableServiceBean");
499
            } catch (NamingException nex2) {
500
                if (dbgLog.isLoggable(Level.INFO)) dbgLog.info("Could not look up initial context, or the variable service in JNDI!");
501
                throw new IOException ("Could not look up initial context, or the variable service in JNDI!"); 
502
            }
503
        }
504
        */
505
    }
1✔
506

507
    @Override
508
    public TabularDataIngest read(BufferedInputStream stream, boolean storeWithVariableHeader, File dataFile) throws IOException {
509
        dbgLog.info("***** DTAFileReader: read() start *****");
1✔
510
        
511
        if (dataFile != null) {
1✔
512
            throw new IOException ("this plugin does not support external raw data files");
×
513
        }
514

515
        try {
516
            decodeHeader(stream);
1✔
517
            decodeDescriptors(stream);
1✔
518
            decodeVariableLabels(stream);
1✔
519
            if (releaseNumber!=104) {
1✔
520
                decodeExpansionFields(stream);
1✔
521
            }
522
            decodeData(stream, storeWithVariableHeader);
1✔
523
            decodeValueLabels(stream);
1✔
524

525
            ingesteddata.setDataTable(dataTable);
1✔
526
        } catch (IllegalArgumentException iaex) {
×
527
            throw new IOException(iaex.getMessage());
×
528
        }
1✔
529
        
530
        dbgLog.info("***** DTAFileReader: read() end *****");
1✔
531
        return ingesteddata;
1✔
532
    }
533

534

535

536
    private void decodeHeader(BufferedInputStream stream) throws IOException {
537
        dbgLog.fine("***** decodeHeader(): start *****");
1✔
538

539
        if (stream == null) {
1✔
540
            throw new IllegalArgumentException("stream == null!");
×
541
        }
542

543
        dbgLog.fine("reading the header segument 1: 4 byte\n");
1✔
544
        byte[] magic_number = new byte[DTA_MAGIC_NUMBER_LENGTH];
1✔
545

546
        int nbytes = stream.read(magic_number, 0, DTA_MAGIC_NUMBER_LENGTH);
1✔
547

548
        if (nbytes == 0) {
1✔
549
            throw new IOException();
×
550
        }
551

552
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
553
            dbgLog.fine("hex dump: 1st 4bytes =>"
×
554
                    + new String(Hex.encodeHex(magic_number)) + "<-");
×
555
        }
556

557
        logger.fine("magic_number[0]: " + magic_number[0]);
1✔
558
        logger.fine("magic_number[1]: " + magic_number[1]);
1✔
559
        logger.fine("magic_number[2]: " + magic_number[2]);
1✔
560
        if (magic_number[2] != 1) {
1✔
561
            dbgLog.fine("3rd byte is not 1: given file is not stata-dta type");
×
562
            // FIXME: Figure out the magic number for Stata 14.
563
            // FIXME: Figure out the magic number for Stata 15.
564
            throw new IllegalArgumentException("The file is not in a STATA format that we can read or support.");
×
565
        } else if ((magic_number[1] != 1) && (magic_number[1] != 2)) {
1✔
566
            dbgLog.fine("2nd byte is neither 0 nor 1: this file is not stata-dta type");
×
567
            throw new IllegalArgumentException("given file is not stata-dta type");
×
568
        } else if (!STATA_RELEASE_NUMBER.containsKey((int) magic_number[0])) {
1✔
569
            dbgLog.fine("1st byte (" + magic_number[0]
×
570
                    + ") is not within the ingestable range [rel. 3-10]:"
571
                    + "we cannot ingest this Stata file.");
572
            throw new IllegalArgumentException("given file is not stata-dta type");
×
573
        } else {
574
            releaseNumber = magic_number[0];
1✔
575
            init();
1✔
576

577
            dataTable.setOriginalFileFormat(MIME_TYPE[0]);
1✔
578
            /* 
579
             * releaseNumber: 
580
             * for storing in the datatable, we are converting the numeric Stata
581
             * release number into a more user friendly "version number"; 
582
             * e.g., "release number 115" = "Stata v. 12"
583
             * -- L.A. 4.0 
584
             */
585
            dataTable.setOriginalFormatVersion(STATA_RELEASE_NUMBER.get(releaseNumber));
1✔
586
            dataTable.setUnf("UNF:6:FILEFILEFILEFILE");
1✔
587

588
            if (dbgLog.isLoggable(Level.FINE)) {
1✔
589
                dbgLog.fine("this file is stata-dta type: "
×
590
                        + STATA_RELEASE_NUMBER.get(releaseNumber)
×
591
                        + " (that means Stata version " + releaseNumber + ")");
592
            }
593
            if (dbgLog.isLoggable(Level.FINE)) {
1✔
594
                dbgLog.fine("Endian(file)(Big: 1; Little:2)=" + magic_number[1]);
×
595
            }
596

597
            /* 
598
             * byte order: defined in the second byte of the "magic number": 
599
             */
600
            if (magic_number[1] == 2) {
1✔
601
                isLittleEndian = true;
1✔
602
                dbgLog.fine("Reversal of the bytes is necessary to decode "
1✔
603
                        + "multi-byte fields");
604
            }
605
            if (dbgLog.isLoggable(Level.FINE)) {
1✔
606
                dbgLog.fine("Endian of this platform:" + ByteOrder.nativeOrder().toString());
×
607
            }
608
        }
609

610
        dbgLog.fine("reading the remaining header segument 2: 60 or 109-byte");
1✔
611

612
        byte[] header = new byte[headerLength];
1✔
613
        nbytes = stream.read(header, 0, headerLength);
1✔
614

615
        // 1. number of variables: short (2 bytes)
616
        ByteBuffer bbnvar = ByteBuffer.wrap(header, 0, NVAR_FIELD_LENGTH);
1✔
617
        ByteBuffer dupnvar = bbnvar.duplicate();
1✔
618
        short short_nvar = dupnvar.getShort();
1✔
619

620
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
621
            dbgLog.fine("get original short view(nvar)=" + short_nvar);
×
622
        }
623
        if (isLittleEndian) {
1✔
624
            bbnvar.order(ByteOrder.LITTLE_ENDIAN);
1✔
625

626
        }
627

628
        short shrt_nvar = bbnvar.getShort();
1✔
629
        dataTable.setVarQuantity(new Long(shrt_nvar));
1✔
630
        int nvar = shrt_nvar;
1✔
631
        
632
        if (dbgLog.isLoggable(Level.INFO)) {
1✔
633
            dbgLog.info("number of variables(nvar)=" + nvar);
1✔
634
        }
635

636
        // 4.0 Initialize dataverse variable objects: 
637
        List<DataVariable> variableList = new ArrayList<>();
1✔
638

639
        for (int i = 0; i < nvar; i++) {
1✔
640
            DataVariable dv = new DataVariable(i, dataTable);
1✔
641
            variableList.add(dv);
1✔
642
        }
643

644
        dataTable.setDataVariables(variableList);
1✔
645

646
        // setup variableTypeList
647
        variableTypes = new String[nvar];
1✔
648
        // and the date/time format list:
649
        dateVariableFormats = new String[nvar];
1✔
650

651
        // 2. number of observations: int (4 bytes)
652
        ByteBuffer nobs = ByteBuffer.wrap(header, NVAR_FIELD_LENGTH,
1✔
653
                NOBS_FIELD_LENGTH);
654
        ByteBuffer dupnobs = nobs.duplicate();
1✔
655
        int int_dupnobs = dupnobs.getInt();
1✔
656
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
657
            dbgLog.fine("raw nobs=" + int_dupnobs);
×
658
        }
659
        if (isLittleEndian) {
1✔
660
            nobs.order(ByteOrder.LITTLE_ENDIAN);
1✔
661
        }
662
        int int_nobs = nobs.getInt();
1✔
663
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
664
            dbgLog.fine("reversed nobs=" + int_nobs);
×
665
        }
666

667
        // smd.getFileInformation().put("caseQnty", new Integer(int_nobs));
668
        dataTable.setCaseQuantity(new Long(int_nobs));
1✔
669

670
        /* 
671
         the "data label" - 
672
         note that we are not using this label for anything 
673
         (wonder what it is though? can we use it somewhere?)
674
         but we still need to extract it from the byte stream, 
675
         since the offsets of the objects stored further up
676
         are calculated relative to it. -- L.A., 4.0
677
         */
678
        // 3. data_label: 32 or 81 bytes
679
        int dl_offset = NVAR_FIELD_LENGTH + NOBS_FIELD_LENGTH;
1✔
680
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
681
            dbgLog.fine("dl_offset=" + dl_offset);
×
682
        }
683
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
684
            dbgLog.fine("data_label_length=" + dataLabelLength);
×
685
        }
686

687
        String data_label = new String(Arrays.copyOfRange(header, dl_offset,
1✔
688
                (dl_offset + dataLabelLength)), "ISO-8859-1");
689

690
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
691
            dbgLog.fine("data_label_length=" + data_label.length());
×
692
        }
693
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
694
            dbgLog.fine("loation of the null character=" + data_label.indexOf(0));
×
695
        }
696

697
        String dataLabel = getNullStrippedString(data_label);
1✔
698
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
699
            dbgLog.fine("data_label_length=" + dataLabel.length());
×
700
        }
701
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
702
            dbgLog.fine("data_label=[" + dataLabel + "]");
×
703
        }
704

705
        // smd.getFileInformation().put("dataLabel", dataLabel);
706

707
        /* end of "data label" */
708
        // 4. time_stamp: ASCII String (18 bytes)
709
        // added after release 4
710
        if (releaseNumber > 104) {
1✔
711
            int ts_offset = dl_offset + dataLabelLength;
1✔
712
            String time_stamp = new String(Arrays.copyOfRange(header, ts_offset,
1✔
713
                    ts_offset + TIME_STAMP_LENGTH), "ISO-8859-1");
714
            if (dbgLog.isLoggable(Level.FINE)) {
1✔
715
                dbgLog.fine("time_stamp_length=" + time_stamp.length());
×
716
            }
717
            if (dbgLog.isLoggable(Level.FINE)) {
1✔
718
                dbgLog.fine("loation of the null character=" + time_stamp.indexOf(0));
×
719
            }
720

721
            String timeStamp = getNullStrippedString(time_stamp);
1✔
722
            if (dbgLog.isLoggable(Level.FINE)) {
1✔
723
                dbgLog.fine("timeStamp_length=" + timeStamp.length());
×
724
            }
725
            if (dbgLog.isLoggable(Level.FINE)) {
1✔
726
                dbgLog.fine("timeStamp=[" + timeStamp + "]");
×
727
            }
728

729
        }
730
    }
1✔
731

732

733

734
    private void decodeDescriptors(BufferedInputStream stream) throws IOException {
735

736
        dbgLog.fine("decodeDescriptors(): start");
1✔
737

738
        if (stream == null) {
1✔
739
            throw new IllegalArgumentException("stream == null!");
×
740
        }
741
        int nvar = dataTable.getVarQuantity().intValue();
1✔
742

743
        // part 1: variable type list
744
        decodeDescriptorVarTypeList(stream, nvar);
1✔
745

746
        // part 2: Variable_Name List
747
        // name length= 9(release 105) or 33 (release 111) each null terminated
748
        decodeDescriptorVarNameList(stream, nvar);
1✔
749

750
        // Part 3: variable sort list
751
        // length of this field = short(2bytes)*(nvar +1)
752
        decodeDescriptorVarSortList(stream, nvar);
1✔
753

754
        // Part 4: variable format list
755
        // VAR_FORMAT_FIELD_LENGTH (7,12, 49 bytes) * navar
756
        // null-terminated string
757
        decodeDescriptorVariableFormat(stream, nvar);
1✔
758

759
        // Part 5: value-label list
760
        // variable_name * nvar null-terminated String
761
        decodeDescriptorValueLabel(stream, nvar);
1✔
762

763
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
764
            dbgLog.fine("decodeDescriptors(): end");
×
765
        }
766

767
    }
1✔
768

769
    private void decodeDescriptorVarTypeList(BufferedInputStream stream, int nvar) throws IOException {
770
        byte[] typeList = new byte[nvar];
1✔
771

772
        // note: the offset param of read() is relative to
773
        // the current position, not absolute position
774
        int nbytes = stream.read(typeList, 0, nvar);
1✔
775
        //printHexDump(typeList, "variable type list");
776
        if (nbytes == 0) {
1✔
777
            throw new IOException("reading the descriptior: no byte was read");
×
778
        }
779
        /*
780
         Stata internal constants representing variable type information; 
781
         these were kindly provided by Akio:
782
        111 type
783
        Type:   b   i   l   f   d (byte, int, long, float, double)
784
        byte:  -5  -4  -3  -2  -1 (signed byte = java's byte type)
785
        byte: 251 252 253 254 255 (unsigned byte)
786
        HEX:  FB  FC  FD  FE  FF
787

788
        105 type(type chars correspond to their hex/decimal expressions
789
        Type:   b   i   l   f   d (byte, int, long, float, double)
790
        byte:  98 105 108 102 100 (signed byte = java's byte type)
791
        byte:  98 105 108 102 100 (unsigned byte)
792
        HEX:  62  69  6C  66  64
793
         */
794
        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("type_offset_table:\n" + typeOffsetTable);
1✔
795

796

797
        bytes_per_row = 0;
1✔
798
        
799
        for (int i = 0; i < typeList.length; i++) {
1✔
800
            if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine(i + "-th value=" + typeList[i]);
1✔
801
            DataVariable dataVariable = dataTable.getDataVariables().get(i);
1✔
802
            /*
803
             * How Stata types correspond to the DVN types: 
804
             * "Byte", "Integer" and "Long" become Numeric, Discrete (unless date value); 
805
             * "Float" and "Double" become Numeric, Continuous (unless date value);
806
             * "String" becomes String;
807
             * Date/time values stored as numeric types above, are converted into 
808
             * Strings.
809
             * -- L.A. 4.0
810
             */
811

812
            if (byteLengthTable.containsKey(typeList[i])) {
1✔
813
                bytes_per_row += byteLengthTable.get(typeList[i]);
1✔
814
                variableTypes[i] = variableTypeTable.get(typeList[i]);
1✔
815
                String typeLabel = variableTypes[i];
1✔
816
                
817
                if (typeLabel != null) {
1✔
818
                    dataVariable.setTypeNumeric();
1✔
819
                    if (typeLabel.equals("Byte") || typeLabel.equals("Integer") || typeLabel.equals("Long")) {
1✔
820
                        // these are treated as discrete:
821
                        dataVariable.setIntervalDiscrete();
1✔
822
                        
823
                    } else if (typeLabel.equals("Float") || typeLabel.equals("Double")) {
×
824
                        // these are treated as contiuous:
825
                        dataVariable.setIntervalContinuous();
×
826
                        
827
                    } else {
828
                        throw new IOException("Unrecognized type label: "+typeLabel+" for Stata type value byte "+typeList[i]+".");
×
829
                    }
830
                } else {
831
                    throw new IOException("No entry in the known types table for Stata type value byte "+typeList[i]+".");
×
832
                }
833
            } else {
1✔
834
                // pre-111 string type
835
                if (releaseNumber < 111) {
×
836
                    int stringType = 256 + typeList[i];
×
837
                    if (stringType >= typeOffsetTable.get("STRING")) {
×
838
                        int string_var_length = stringType - typeOffsetTable.get("STRING");
×
839
                        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("string_var_length=" + string_var_length);
×
840
                        bytes_per_row += string_var_length;
×
841

842
                        variableTypes[i] = "String";
×
843
                        dataVariable.setTypeCharacter();
×
844
                        dataVariable.setIntervalDiscrete();
×
845
                        StringLengthTable.put(i, string_var_length);
×
846

847

848
                    } else {
×
849
                        throw new IOException(
×
850
                                "unknown variable type was detected: reading errors?");
851
                    }
852
                } else if (releaseNumber >= 111) {
×
853
                    // post-111 string type
854
                    if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("DTA reader: typeList[" + i + "]=" + typeList[i]);
×
855

856
                    // if the size of strXXX type is less than 128,
857
                    // the value of typeList[i] will be equal to that;
858
                    // if however it is >= 128, typeList[i] = (size - 256)
859
                    // i.e. it'll be a negative value:
860

861
                    int stringType = ((typeList[i] > 0) &&
×
862
                            (typeList[i] <= 127)) ? typeList[i] : 256 + typeList[i];
×
863

864
                    if (stringType >= typeOffsetTable.get("STRING")) {
×
865
                        int string_var_length = stringType - typeOffsetTable.get("STRING");
×
866
                        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("DTA reader: string_var_length=" + string_var_length);
×
867
                        bytes_per_row += string_var_length;
×
868

869
                        variableTypes[i] = "String";
×
870
                        dataVariable.setTypeCharacter();
×
871
                        dataVariable.setIntervalDiscrete();
×
872
                        StringLengthTable.put(i, string_var_length);
×
873

874

875
                    } else {
×
876
                        throw new IOException(
×
877
                                "unknown variable type was detected: reading errors?");
878
                    }
879
                } else {
×
880
                    throw new IOException("uknown release number ");
×
881
                }
882

883
            }
884
            if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine(i + "=th\t sum=" + bytes_per_row);
1✔
885
        }
886
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
887
            dbgLog.fine("bytes_per_row(final)=" + bytes_per_row);
×
888
            dbgLog.fine("variableTypes:\n" + Arrays.deepToString(variableTypes));
×
889
            dbgLog.fine("StringLengthTable=" + StringLengthTable);
×
890
        }
891

892
    }
1✔
893

894

895

896
    private void decodeDescriptorVarNameList(BufferedInputStream stream, int nvar) throws IOException {
897
        int length_var_name = constantTable.get("NAME");
1✔
898
        int length_var_name_list = length_var_name * nvar;
1✔
899
        
900
        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("length_var_name_list=" + length_var_name_list);
1✔
901

902
        byte[] variableNameBytes = new byte[length_var_name_list];
1✔
903

904
        int nbytes = stream.read(variableNameBytes, 0, length_var_name_list);
1✔
905

906

907
        if (nbytes == 0) {
1✔
908
            throw new IOException("reading the var name list: no var name was read");
×
909
        }
910
        int offset_start = 0;
1✔
911
        int offset_end = 0;
1✔
912
        for (DataVariable dataVariable: dataTable.getDataVariables()) {
1✔
913
            offset_end += length_var_name;
1✔
914
            String vari = new String(Arrays.copyOfRange(variableNameBytes, offset_start,
1✔
915
                    offset_end), "ISO-8859-1");
916
            String varName = getNullStrippedString(vari);
1✔
917
            dataVariable.setName(varName);
1✔
918
            dbgLog.fine("next name=[" + varName + "]");
1✔
919
            offset_start = offset_end;
1✔
920
        }
1✔
921
    }
1✔
922

923
    private void decodeDescriptorVarSortList(BufferedInputStream stream, int nvar) throws IOException {
924
        /* 
925
         * Whatever this "var sort list" is, we don't seem to be using this 
926
         * information for any purposes in particular. However, we need to read
927
         * the bytes, to skip to the next section in the stream, if nothing else. 
928
         * -- L.A. 4.0
929
         */
930
        int length_var_sort_list = VAR_SORT_FIELD_LENGTH * (nvar + 1);
1✔
931
        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("length_var_sort_list=" + length_var_sort_list);
1✔
932

933
        byte[] varSortList = new byte[length_var_sort_list];
1✔
934
        short[] variableSortList = new short[nvar + 1];
1✔
935

936

937
        int nbytes = stream.read(varSortList, 0, length_var_sort_list);
1✔
938

939
        if (nbytes == 0) {
1✔
940
            throw new IOException("reading error: the varSortList");
×
941
        }
942

943
        int offset_start = 0;
1✔
944
        for (int i = 0; i <= nvar; i++) {
1✔
945

946

947
            ByteBuffer bb_varSortList = ByteBuffer.wrap(varSortList,
1✔
948
                    offset_start, VAR_SORT_FIELD_LENGTH);
949
            if (isLittleEndian) {
1✔
950
                bb_varSortList.order(ByteOrder.LITTLE_ENDIAN);
1✔
951
            }
952
            variableSortList[i] = bb_varSortList.getShort();
1✔
953

954
            offset_start += VAR_SORT_FIELD_LENGTH;
1✔
955
        }
956
        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("variableSortList=" + Arrays.toString(variableSortList));
1✔
957

958
    }
1✔
959

960

961
    /* Variable Formats are used exclusively for time and date variables. 
962
     *      -- L.A. 4.0
963
     */
964
    private void decodeDescriptorVariableFormat(BufferedInputStream stream, int nvar) throws IOException {
965
        int length_var_format = constantTable.get("FORMAT");
1✔
966
        int length_var_format_list = length_var_format * nvar;
1✔
967
        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("length_var_format_list=" + length_var_format_list);
1✔
968

969
        byte[] variableFormatList = new byte[length_var_format_list];
1✔
970

971
        int nbytes = stream.read(variableFormatList, 0, length_var_format_list);
1✔
972

973
        if (nbytes == 0) {
1✔
974
            throw new IOException("reading var formats: no format was read");
×
975
        }
976
        int offset_start = 0;
1✔
977
        int offset_end = 0;
1✔
978
        for (int i = 0; i < nvar; i++) {
1✔
979
            offset_end += length_var_format;
1✔
980
            String vari = new String(Arrays.copyOfRange(variableFormatList, offset_start,
1✔
981
                    offset_end), "ISO-8859-1");
982
            String variableFormat = getNullStrippedString(vari);
1✔
983
            if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine(i + "-th format=[" + variableFormat + "]");
1✔
984
                        
985
            String variableFormatKey = null;
1✔
986
            if (variableFormat.startsWith("%t")) {
1✔
987
                variableFormatKey = variableFormat.substring(0, 3);
×
988
            } else {
989
                variableFormatKey = variableFormat.substring(0, 2);
1✔
990
            }
991
            if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine(i + " th variableFormatKey=" + variableFormatKey);
1✔
992

993
            /* 
994
             * Now, let's check if this format is a known time or date format. 
995
             * If so, note that this changes the storage type of the variable!
996
             * i.e., times and dates are stored as binary numeric values, but on 
997
             * the DVN side/in the tab files they will become strings. 
998
             * TODO: it kinda does look like we can get rid of the variableFormats[]
999
             * list; these formats are only used if this is a recognized 
1000
             * "date/time datum" (see below); so then it looks like we can 
1001
             * extract this info from the DataVariable "formatschemaname". 
1002
             * -- L.A. 4.0
1003
             */
1004
            if (DATE_TIME_FORMAT_TABLE.containsKey(variableFormatKey)) {
1✔
1005
                DataVariable dataVariable = dataTable.getDataVariables().get(i);
×
1006
                // TODO: revisit the whole "formatschemaname" thing; -- L.A. 
1007
                // Instead of populating this field with the Stata's internal 
1008
                // format token (??), we should put the actual format of the 
1009
                // values that we store in the tab file. And the internal 
1010
                // STATA format we'll keep in this array for now: 
1011
                dateVariableFormats[i] = variableFormat; 
×
1012
                //dataTable.getDataVariables().get(i).setFormatSchemaName(variableFormat);
1013
                // TODO: make sure we do save the real format (as .setFormat() somewhere else!)
1014
                dataVariable.setFormatCategory(DATE_TIME_FORMAT_TABLE.get(variableFormatKey));
×
1015
                if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine(i + "th var: category=" +
×
1016
                        DATE_TIME_FORMAT_TABLE.get(variableFormatKey));
×
1017
                dataVariable.setTypeCharacter();
×
1018
                dataVariable.setIntervalDiscrete();
×
1019
            } 
1020

1021
            
1022
            offset_start = offset_end;
1✔
1023
        }
1024

1025

1026
    }
1✔
1027
    
1028
    private void decodeDescriptorValueLabel(BufferedInputStream stream, int nvar) throws IOException {
1029
        valueLabelsLookupTable = new String[nvar];
1✔
1030
        
1031
        int length_label_name = constantTable.get("NAME");
1✔
1032
        int length_label_name_list = length_label_name * nvar;
1✔
1033
        dbgLog.fine("length_label_name=" + length_label_name_list);
1✔
1034

1035
        byte[] labelNameList = new byte[length_label_name_list];
1✔
1036
        String[] labelNames = new String[nvar];
1✔
1037

1038
        int nbytes = stream.read(labelNameList, 0, length_label_name_list);
1✔
1039

1040
        if (nbytes == 0) {
1✔
1041
            throw new IOException("reading value-label list:: no var name was read");
×
1042
        }
1043
        int offset_start = 0;
1✔
1044
        int offset_end = 0;
1✔
1045
        for (int i = 0; i < nvar; i++) {
1✔
1046
            offset_end += length_label_name;
1✔
1047
            String vari = new String(Arrays.copyOfRange(labelNameList, offset_start,
1✔
1048
                    offset_end), "ISO-8859-1");
1049
            labelNames[i] = getNullStrippedString(vari);
1✔
1050
            dbgLog.fine(i + "-th label=[" + labelNames[i] + "]");
1✔
1051
            offset_start = offset_end;
1✔
1052
        }
1053
        dbgLog.fine("labelNames=\n" + StringUtils.join(labelNames, ",\n") + "\n");
1✔
1054

1055
        for (int i = 0; i < nvar; i++) {
1✔
1056
            if ((labelNames[i] != null) && (!labelNames[i].isEmpty())) {
1✔
1057
                valueLabelsLookupTable[i] = labelNames[i];
×
1058
            }
1059
        }
1060
    }
1✔
1061

1062

1063
    private void decodeVariableLabels(BufferedInputStream stream) throws IOException {
1064

1065
        dbgLog.fine("decodeVariableLabels(): start");
1✔
1066

1067
        if (stream == null) {
1✔
1068
            throw new IllegalArgumentException("stream == null!");
×
1069
        }
1070

1071
        // variable label length (32 or 81 bytes)*nvar, each null-terminated
1072
        // int nvar = (Integer)smd.getFileInformation().get("varQnty");
1073
        int nvar = dataTable.getVarQuantity().intValue();
1✔
1074

1075
        int length_var_label = constantTable.get("LABEL");
1✔
1076
        int length_var_label_list = length_var_label * nvar;
1✔
1077

1078
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
1079
            dbgLog.fine("length_label_name=" + length_var_label_list);
×
1080
        }
1081

1082
        byte[] variableLabelBytes = new byte[length_var_label_list];
1✔
1083
        int nbytes = stream.read(variableLabelBytes, 0, length_var_label_list);
1✔
1084

1085
        if (nbytes == 0) {
1✔
1086
            throw new IOException("reading variable label list: no label was read");
×
1087
        }
1088
        int offset_start = 0;
1✔
1089
        int offset_end = 0;
1✔
1090
        for (int i = 0; i < nvar; i++) {
1✔
1091
            offset_end += length_var_label;
1✔
1092
            String vari = new String(Arrays.copyOfRange(variableLabelBytes, offset_start,
1✔
1093
                    offset_end), "ISO-8859-1");
1094
            
1095
            String variableLabelParsed = getNullStrippedString(vari);
1✔
1096
            if (dbgLog.isLoggable(Level.FINE)) {
1✔
1097
                dbgLog.fine(i + "-th label=[" + variableLabelParsed + "]");
×
1098
            }
1099
            offset_start = offset_end;
1✔
1100

1101
            dataTable.getDataVariables().get(i).setLabel(variableLabelParsed);
1✔
1102
        }
1103

1104
        dbgLog.fine("decodeVariableLabels(): end");
1✔
1105

1106
    }
1✔
1107
    
1108

1109
    /* 
1110
     * We don't seem to be using any of these "expansion fields" - whatever 
1111
     * they are; but we need to read the section, to skip to the next one in 
1112
     * the byte stream, if nothing else. 
1113
     * -- L.A. 4.0
1114
     * TODO: ok, need to figure out what these are. -- AUG 6 2014
1115
     */
1116
    private void decodeExpansionFields(BufferedInputStream stream) throws IOException {
1117

1118
        dbgLog.fine("***** decodeExpansionFields(): start *****");
1✔
1119
        
1120
        if (stream ==null){
1✔
1121
            throw new IllegalArgumentException("stream == null!");
×
1122
        }
1123
        
1124
        // Added since release 105
1125
        // [1-byte byte_field][short(2)/int(4)_field][variable_field whose
1126
        // length is specified by the previous short/int field]
1127
        
1128
        int int_type_expansion_field = constantTable.get("EXPANSION");
1✔
1129
        if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("int_type_expansion_field="+int_type_expansion_field);
1✔
1130
        while(true){
1131
            byte[] firstByte = new byte[1];
1✔
1132
            byte[] lengthBytes = new byte[int_type_expansion_field];
1✔
1133
          
1134
            int nbyte = stream.read(firstByte, 0, 1);
1✔
1135
            dbgLog.fine("read 1st byte");
1✔
1136
            int nbytes = stream.read(lengthBytes, 0, int_type_expansion_field);
1✔
1137
            dbgLog.fine("read next integer");
1✔
1138

1139
            ByteBuffer bb_field_length = ByteBuffer.wrap(lengthBytes);
1✔
1140

1141
            if (isLittleEndian){
1✔
1142
                bb_field_length.order(ByteOrder.LITTLE_ENDIAN);
1✔
1143
                dbgLog.fine("byte reversed");
1✔
1144
            }
1145

1146
            int field_length;
1147

1148
            if (int_type_expansion_field == 2){
1✔
1149
                field_length = bb_field_length.getShort();
×
1150
            } else {
1151
                field_length = bb_field_length.getInt();
1✔
1152
            }
1153
            
1154
            if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("field_length="+field_length);
1✔
1155
            if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("firstByte[0]="+firstByte[0]);
1✔
1156
            if ((field_length + firstByte[0]) == 0){
1✔
1157
                // reached the end of this field
1158
                break;
1✔
1159
            } else {
1160
                byte[] stringField = new byte[field_length];
×
1161
                nbyte = stream.read(stringField, 0, field_length);
×
1162

1163
              
1164
            }
1165
        }
×
1166

1167
        dbgLog.fine("decodeExpansionFields(): end");
1✔
1168

1169
    }
1✔
1170
    
1171
    /**
1172
     *
1173
     * @param stream
1174
     */
1175
    private void decodeValueLabels(BufferedInputStream stream) throws IOException {
1176

1177
        dbgLog.fine("decodeValueLabels(): start");
1✔
1178

1179
        if (stream == null) {
1✔
1180
            throw new IllegalArgumentException("stream == null!");
×
1181
        }
1182

1183
        if (stream.available() != 0) {
1✔
1184
            if (releaseNumber <= 105) {
×
1185
                parseValueLabelsRelease105(stream);
×
1186
            } else {
1187
                parseValueLabelsReleasel108(stream);
×
1188
            }
1189
        } else {
1190
            dbgLog.fine("no value-label table: end of file");
1✔
1191
        }
1192
        dbgLog.fine("decodeValueLabels(): end");
1✔
1193
    }
1✔
1194
    
1195
    
1196
    void parseValueLabelsRelease105(BufferedInputStream stream) throws IOException {
1197

1198
        dbgLog.fine("parseValueLabelsRelease105(): start");
×
1199

1200
        if (stream == null) {
×
1201
            throw new IllegalArgumentException("stream == null!");
×
1202
        }
1203

1204
        int nvar = dataTable.getVarQuantity().intValue();
×
1205
        int length_label_name = constantTable.get("NAME") + 1;
×
1206
        // note: caution +1 as the null character, not 9 byte
1207

1208
        int length_value_label_header = value_label_table_length
×
1209
                + length_label_name;
1210

1211
        if (dbgLog.isLoggable(Level.FINE)) {
×
1212
            dbgLog.fine("value_label_table_length=" + value_label_table_length);
×
1213
        }
1214
        if (dbgLog.isLoggable(Level.FINE)) {
×
1215
            dbgLog.fine("length_value_label_header=" + length_value_label_header);
×
1216
        }
1217

1218
        int length_lable_name_field = 8;
×
1219

1220
        /*
1221
         Seg  field         byte    type
1222
         1-1. no of pairs      2    int  (= m)
1223
         1-2. vlt_name        10    includes char+(\0) == name used in Sec2.part 5
1224
         -----------------------------------
1225
         11
1226
         2-1. values         2*n    int[]
1227
         2-2. labels         8*n    char
1228
         */
1229
        
1230
        // This map will hold a temporary lookup table for all the categorical
1231
        // value-label groups we are going to find here:
1232
        // These groups have unique names, and a group *may be shared* between
1233
        // multiple variables. In the method decodeDescriptorValueLabel above
1234
        // we have populated a lookup table where variables are linked to the 
1235
        // corresponding value-label groups by name. Thus we must fully populate 
1236
        // the full map of all the variable groups, then go through the list 
1237
        // of variables and create the dataverse variable categories from 
1238
        // them. -- L.A. 4.0       
1239
        Map<String, Map<String, String>> tempValueLabelTable = new LinkedHashMap<>();
×
1240
        
1241
        for (int i = 0; i < nvar; i++) {
×
1242
            if (dbgLog.isLoggable(Level.FINE)) {
×
1243
                dbgLog.fine("\n\n" + i + "th value-label table header");
×
1244
            }
1245

1246
            byte[] valueLabelHeader = new byte[length_value_label_header];
×
1247

1248
            // Part 1: reading the header of a value-label table if exists
1249
            int nbytes = stream.read(valueLabelHeader, 0,
×
1250
                    length_value_label_header);
1251

1252
            if (nbytes == 0) {
×
1253
                throw new IOException("reading value label header: no datum");
×
1254
            }
1255

1256
            // 1.1 number of value-label pairs in this table (= m)
1257
            ByteBuffer bb_value_label_pairs
×
1258
                    = ByteBuffer.wrap(valueLabelHeader, 0,
×
1259
                            value_label_table_length);
1260
            if (isLittleEndian) {
×
1261
                bb_value_label_pairs.order(ByteOrder.LITTLE_ENDIAN);
×
1262
                //if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("value lable table lenth: byte reversed");
1263
            }
1264
            int no_value_label_pairs = bb_value_label_pairs.getShort();
×
1265

1266
            if (dbgLog.isLoggable(Level.FINE)) {
×
1267
                dbgLog.fine("no_value_label_pairs=" + no_value_label_pairs);
×
1268
            }
1269

1270
            // 1.2 labelName
1271
            String rawLabelName = new String(Arrays.copyOfRange(
×
1272
                    valueLabelHeader,
1273
                    value_label_table_length,
1274
                    (value_label_table_length + length_label_name)),
1275
                    "ISO-8859-1");
1276

1277
            if (dbgLog.isLoggable(Level.FINE)) {
×
1278
                dbgLog.fine("rawLabelName(length)=" + rawLabelName.length());
×
1279
            }
1280
            String labelName = rawLabelName.substring(0, rawLabelName.indexOf(0));
×
1281

1282
            if (dbgLog.isLoggable(Level.FINE)) {
×
1283
                dbgLog.fine("label name = " + labelName + "\n");
×
1284
            }
1285

1286
            if (dbgLog.isLoggable(Level.FINE)) {
×
1287
                dbgLog.fine(i + "-th value-label table");
×
1288
            }
1289
            // Part 2: reading the value-label table
1290
            // the length of the value-label table is: 2*m + 8*m = 10*m
1291
            int length_value_label_table = (value_label_table_length
×
1292
                    + length_lable_name_field) * no_value_label_pairs;
1293

1294
            if (dbgLog.isLoggable(Level.FINE)) {
×
1295
                dbgLog.fine("length_value_label_table=" + length_value_label_table);
×
1296
            }
1297

1298
            byte[] valueLabelTable_i = new byte[length_value_label_table];
×
1299
            int noBytes = stream.read(valueLabelTable_i, 0,
×
1300
                    length_value_label_table);
1301
            if (noBytes == 0) {
×
1302
                throw new IOException("reading value label table: no datum");
×
1303
            }
1304

1305
            // 2-1. 2-byte-integer array (2*m): value array (sorted)
1306
            short[] valueList = new short[no_value_label_pairs];
×
1307
            int offset_value = 0;
×
1308

1309
            for (int k = 0; k < no_value_label_pairs; k++) {
×
1310

1311
                ByteBuffer bb_value_list
×
1312
                        = ByteBuffer.wrap(valueLabelTable_i, offset_value,
×
1313
                                value_label_table_length);
1314
                if (isLittleEndian) {
×
1315
                    bb_value_list.order(ByteOrder.LITTLE_ENDIAN);
×
1316
                }
1317
                valueList[k] = bb_value_list.getShort();
×
1318

1319
                offset_value += value_label_table_length;
×
1320
            }
1321

1322
            if (dbgLog.isLoggable(Level.FINE)) {
×
1323
                dbgLog.fine("value_list=" + Arrays.toString(valueList) + "\n");
×
1324
            }
1325

1326
            // 2-2. 8-byte chars that store label data (m units of labels)
1327
            if (dbgLog.isLoggable(Level.FINE)) {
×
1328
                dbgLog.fine("current offset_value=" + offset_value);
×
1329
            }
1330

1331
            int offset_start = offset_value;
×
1332
            int offset_end = offset_value + length_lable_name_field;
×
1333
            String[] labelList = new String[no_value_label_pairs];
×
1334

1335
            for (int l = 0; l < no_value_label_pairs; l++) {
×
1336

1337
                String string_l = new String(Arrays.copyOfRange(valueLabelTable_i, offset_start,
×
1338
                        offset_end), "ISO-8859-1");
1339

1340
                int null_position = string_l.indexOf(0);
×
1341
                if (null_position != -1) {
×
1342
                    labelList[l] = string_l.substring(0, null_position);
×
1343
                } else {
1344
                    labelList[l] = string_l;
×
1345
                }
1346

1347
                offset_start = offset_end;
×
1348
                offset_end += length_lable_name_field;
×
1349
            }
1350

1351
            // Finally, we've reached the actual value-label pairs. We'll go 
1352
            // through them and put them on the temporary lookup map: 
1353
            
1354
            tempValueLabelTable.put(labelName, new LinkedHashMap<>());
×
1355
            
1356
            for (int j = 0; j < no_value_label_pairs; j++) {
×
1357
                if (dbgLog.isLoggable(Level.FINE)) {
×
1358
                    dbgLog.fine(j + "-th pair:" + valueList[j] + "[" + labelList[j] + "]");
×
1359
                }
1360
                
1361
                // TODO: do we need any null/empty string checks here? -- L.A. 4.0
1362
                tempValueLabelTable.get(labelName).put(Integer.toString(valueList[j]), labelList[j]);
×
1363
            }
1364
            
1365

1366
            if (stream.available() == 0) {
×
1367
                // reached the end of the file
1368
                if (dbgLog.isLoggable(Level.FINE)) {
×
1369
                    dbgLog.fine("reached the end of file at " + i + "th value-label Table.");
×
1370
                }
1371
                break;
1372
            }
1373

1374
        } // for nvar loop
1375

1376
        // And now we can go through the list of variables, see if any have 
1377
        // value-label groups linked, then build dataverse VariableCategory 
1378
        // objects for them, using the values stored in the temporary map 
1379
        // we've just built:
1380
       
1381
        for (int i = 0; i < nvar; i++) {
×
1382
            if (valueLabelsLookupTable[i] != null) {
×
1383
                if (tempValueLabelTable.get(valueLabelsLookupTable[i]) != null) {
×
1384
                    // What if it is null? -- is it a legit condition, that 
1385
                    // a variable was advertised as having categorical values,
1386
                    // but no such cat value group exists under this name?
1387
                    // -- L.A.
1388
                    for (String value : tempValueLabelTable.get(valueLabelsLookupTable[i]).keySet()) {
×
1389
                        VariableCategory cat = new VariableCategory();
×
1390
                        
1391
                        cat.setValue(value);
×
1392
                        cat.setLabel(tempValueLabelTable.get(valueLabelsLookupTable[i]).get(value));
×
1393

1394
                        /* cross-link the variable and category to each other: */
1395
                        cat.setDataVariable(dataTable.getDataVariables().get(i));
×
1396
                        dataTable.getDataVariables().get(i).getCategories().add(cat);
×
1397
                    }
×
1398
                }
1399
            }
1400
        }
1401

1402
        dbgLog.fine("parseValueLabelsRelease105(): end");
×
1403

1404
    }
×
1405

1406

1407
    private void parseValueLabelsReleasel108(BufferedInputStream stream) throws IOException {
1408

1409
        dbgLog.fine("parseValueLabelsRelease108(): start");
×
1410

1411
        if (stream == null) {
×
1412
            throw new IllegalArgumentException("stream == null!");
×
1413
        }
1414

1415
        int nvar = dataTable.getVarQuantity().intValue();
×
1416
        int length_label_name = constantTable.get("NAME");
×
1417
        int length_value_label_header = value_label_table_length
×
1418
                + length_label_name
1419
                + VALUE_LABEL_HEADER_PADDING_LENGTH;
1420

1421
        if (dbgLog.isLoggable(Level.FINE)) {
×
1422
            dbgLog.fine("value_label_table_length=" + value_label_table_length);
×
1423
        }
1424
        if (dbgLog.isLoggable(Level.FINE)) {
×
1425
            dbgLog.fine("length_value_label_header=" + length_value_label_header);
×
1426
        }
1427
        /*
1428
         Seg  field         byte    type
1429
         1-1. len_vlt(Seg.2)   4    int
1430
         1-2. vlt_name      9/33    char+(\0) == name used in Sec2.part 5
1431
         1-3. padding          3    byte
1432
         -----------------------------------
1433
         16/40
1434
         2-1. n(# of vls)      4    int
1435
         2-2. m(len_labels)    4    int
1436
         2-3. label_offsets    4*n  int[]
1437
         2-4. values           4*n  int[]
1438
         2-5. labels           m    char
1439
         */
1440

1441
        // This map will hold a temporary lookup table for all the categorical
1442
        // value-label groups:
1443
        // These groups have unique names, and a group *may be shared* between
1444
        // multiple variables. In the method decodeDescriptorValueLabel above
1445
        // we have populated a lookup table where variables are linked to the 
1446
        // corresponding value-label groups by name. Thus we must fully populate 
1447
        // the full map of all the variable group, then go through the list 
1448
        // of variables and create the dataverse variable categories from 
1449
        // them. -- L.A. 4.0
1450
        
1451
        Map<String, Map<String, String>> tempValueLabelTable = new LinkedHashMap<>();
×
1452

1453
        
1454
        for (int i = 0; i < nvar; i++) {
×
1455
            if (dbgLog.isLoggable(Level.FINE)) {
×
1456
                dbgLog.fine("\n\n" + i + "th value-label table header");
×
1457
            }
1458

1459
            byte[] valueLabelHeader = new byte[length_value_label_header];
×
1460

1461
            // Part 1: reading the header of a value-label table if exists
1462
            int nbytes = stream.read(valueLabelHeader, 0,
×
1463
                    length_value_label_header);
1464

1465
            if (nbytes == 0) {
×
1466
                throw new IOException("reading value label header: no datum");
×
1467
            }
1468

1469
            // 1.1 length_value_label_table
1470
            ByteBuffer bb_value_label_header
×
1471
                    = ByteBuffer.wrap(valueLabelHeader, 0,
×
1472
                            value_label_table_length);
1473
            if (isLittleEndian) {
×
1474
                bb_value_label_header.order(ByteOrder.LITTLE_ENDIAN);
×
1475
            }
1476
            int length_value_label_table = bb_value_label_header.getInt();
×
1477

1478
            if (dbgLog.isLoggable(Level.FINE)) {
×
1479
                dbgLog.fine("length of this value-label table="
×
1480
                        + length_value_label_table);
1481
            }
1482

1483
            // 1.2 labelName
1484
            String rawLabelName = new String(Arrays.copyOfRange(
×
1485
                    valueLabelHeader,
1486
                    value_label_table_length,
1487
                    (value_label_table_length + length_label_name)),
1488
                    "ISO-8859-1");
1489
            String labelName = getNullStrippedString(rawLabelName);
×
1490

1491
            if (dbgLog.isLoggable(Level.FINE)) {
×
1492
                dbgLog.fine("label name = " + labelName + "\n");
×
1493
            }
1494

1495
            if (dbgLog.isLoggable(Level.FINE)) {
×
1496
                dbgLog.fine(i + "-th value-label table");
×
1497
            }
1498
            // Part 2: reading the value-label table
1499
            byte[] valueLabelTable_i = new byte[length_value_label_table];
×
1500
            int noBytes = stream.read(valueLabelTable_i, 0,
×
1501
                    length_value_label_table);
1502
            if (noBytes == 0) {
×
1503
                throw new IOException("reading value label table: no datum");
×
1504
            }
1505

1506
            // 2-1. 4-byte-integer: number of units in this table (n)
1507
            int valueLabelTable_offset = 0;
×
1508
            ByteBuffer bb_value_label_pairs
×
1509
                    = ByteBuffer.wrap(valueLabelTable_i, valueLabelTable_offset,
×
1510
                            value_label_table_length);
1511
            if (isLittleEndian) {
×
1512
                bb_value_label_pairs.order(ByteOrder.LITTLE_ENDIAN);
×
1513
            }
1514

1515
            int no_value_label_pairs = bb_value_label_pairs.getInt();
×
1516

1517
            valueLabelTable_offset += value_label_table_length;
×
1518

1519
            if (dbgLog.isLoggable(Level.FINE)) {
×
1520
                dbgLog.fine("no_value_label_pairs=" + no_value_label_pairs);
×
1521
            }
1522

1523
                // 2-2. 4-byte-integer: length of the label section (m bytes)
1524
            ByteBuffer bb_length_label_segment
×
1525
                    = ByteBuffer.wrap(valueLabelTable_i, valueLabelTable_offset,
×
1526
                            value_label_table_length);
1527
            if (isLittleEndian) {
×
1528
                bb_length_label_segment.order(ByteOrder.LITTLE_ENDIAN);
×
1529
            }
1530

1531
            int length_label_segment = bb_length_label_segment.getInt();
×
1532
            valueLabelTable_offset += value_label_table_length;
×
1533

1534
            // 2-3. 4-byte-integer array (4xm): offset values for the label sec.
1535
            // these "label offsets" actually appear to represent the byte
1536
            // offsets of the label strings, as stored in the next section.
1537
            // as of now, these are not used for anything, and the code
1538
            // below assumes that the labels are already in the same
1539
            // order as the numeric values! -- L.A.
1540
            int[] label_offsets = new int[no_value_label_pairs];
×
1541
            int byte_offset = valueLabelTable_offset;
×
1542

1543
            for (int j = 0; j < no_value_label_pairs; j++) {
×
1544

1545
                // note: 4-byte singed, not java's long
1546
                ByteBuffer bb_label_offset
×
1547
                        = ByteBuffer.wrap(valueLabelTable_i, byte_offset,
×
1548
                                value_label_table_length);
1549
                if (isLittleEndian) {
×
1550
                    bb_label_offset.order(ByteOrder.LITTLE_ENDIAN);
×
1551
                    dbgLog.fine("label offset: byte reversed");
×
1552
                }
1553
                label_offsets[j] = bb_label_offset.getInt();
×
1554
                dbgLog.fine("label offset [" + j + "]: " + label_offsets[j]);
×
1555

1556
                byte_offset += value_label_table_length;
×
1557

1558
            }
1559

1560
            // 2-4. 4-byte-integer array (4xm): value array (sorted)
1561
            dbgLog.fine("value array");
×
1562

1563
            int[] valueList = new int[no_value_label_pairs];
×
1564
            int offset_value = byte_offset;
×
1565

1566
            for (int k = 0; k < no_value_label_pairs; k++) {
×
1567

1568
                ByteBuffer bb_value_list
×
1569
                        = ByteBuffer.wrap(valueLabelTable_i, offset_value,
×
1570
                                value_label_table_length);
1571
                if (isLittleEndian) {
×
1572
                    bb_value_list.order(ByteOrder.LITTLE_ENDIAN);
×
1573
                }
1574
                valueList[k] = bb_value_list.getInt();
×
1575

1576
                offset_value += value_label_table_length;
×
1577

1578
            }
1579

1580
            // 2-5. m-byte chars that store label data (m units of labels)
1581
            String label_segment = new String(
×
1582
                    Arrays.copyOfRange(valueLabelTable_i,
×
1583
                            offset_value,
1584
                            (length_label_segment + offset_value)), "ISO-8859-1");
1585

1586
            // L.A. -- 2011.2.25:
1587
            // This assumes that the labels are already stored in the right
1588
            // order: (see my comment for the section 2.3 above)
1589
            //String[] labelList = label_segment.split("\0");
1590
            // Instead, we should be using the offset values obtained in
1591
            // the section 2.3 above, and select the corresponding
1592
            // substrings:
1593
            String[] labelList = new String[no_value_label_pairs];
×
1594

1595
            for (int l = 0; l < no_value_label_pairs; l++) {
×
1596
                String lblString = null;
×
1597
                int lblOffset = label_offsets[l];
×
1598

1599
                lblString = label_segment.substring(lblOffset);
×
1600

1601
                int nullIndx = lblString.indexOf('\000');
×
1602

1603
                if (nullIndx > -1) {
×
1604
                    lblString = lblString.substring(0, nullIndx);
×
1605
                }
1606

1607
                labelList[l] = lblString;
×
1608
            }
1609

1610
            // this should work! -- L.A.
1611
            // (TODO: check the v105 value label parsing method, to see if
1612
            // something similar applies there)
1613
            
1614
            // Finally, we've reached the actual value-label pairs. We'll go 
1615
            // through them and put them on the temporary lookup map: 
1616
            
1617
            tempValueLabelTable.put(labelName, new LinkedHashMap<>());
×
1618

1619
            for (int l = 0; l < no_value_label_pairs; l++) {
×
1620
                if (dbgLog.isLoggable(Level.FINE)) {
×
1621
                    dbgLog.fine(l + "-th pair:" + valueList[l] + "[" + labelList[l] + "]");
×
1622
                }
1623

1624
                // TODO: do we need any null/empty string checks here? -- L.A. 4.0
1625
                tempValueLabelTable.get(labelName).put(Integer.toString(valueList[l]), labelList[l]);
×
1626
            }
1627

1628
            if (stream.available() == 0) {
×
1629
                // reached the end of the file
1630
                dbgLog.fine("reached the end of the file at " + i + "th value-label Table");
×
1631
                break;
×
1632
            }
1633

1634
        }  // for nvar loop
1635

1636
        // And now we can go through the list of variables, see if any have 
1637
        // value-label groups linked, then build dataverse VariableCategory 
1638
        // objects for them, using the values stored in the temporary map 
1639
        // we've just built:
1640
       
1641
        // TODO: this code is duplicated between this, and the "105 version" of
1642
        // this method, above. Maybe it should be isolated in its own method.
1643
        // -- L.A. 4.0
1644
        for (int i = 0; i < nvar; i++) {
×
1645
            if (valueLabelsLookupTable[i] != null) {
×
1646
                if (tempValueLabelTable.get(valueLabelsLookupTable[i]) != null) {
×
1647
                    // What if it is null? -- is it a legit condition, that 
1648
                    // a variable was advertised as having categorical values,
1649
                    // but no such cat value group exists under this name?
1650
                    // -- L.A.
1651
                    for (String value : tempValueLabelTable.get(valueLabelsLookupTable[i]).keySet()) {
×
1652
                        VariableCategory cat = new VariableCategory();
×
1653
                        
1654
                        cat.setValue(value);
×
1655
                        cat.setLabel(tempValueLabelTable.get(valueLabelsLookupTable[i]).get(value));
×
1656

1657
                        /* cross-link the variable and category to each other: */
1658
                        cat.setDataVariable(dataTable.getDataVariables().get(i));
×
1659
                        dataTable.getDataVariables().get(i).getCategories().add(cat);
×
1660
                    }
×
1661
                }
1662
            }
1663
        }
1664
        
1665
        dbgLog.fine("parseValueLabelsRelease108(): end");
×
1666
    }
×
1667

1668
    private void decodeData(BufferedInputStream stream, boolean saveWithVariableHeader) throws IOException {
1669

1670
        dbgLog.fine("\n***** decodeData(): start *****");
1✔
1671

1672
        if (stream == null) {
1✔
1673
            throw new IllegalArgumentException("stream == null!");
×
1674
        }
1675

1676
        //int nvar = (Integer)smd.getFileInformation().get("varQnty");
1677
        int nvar = dataTable.getVarQuantity().intValue();
1✔
1678
        //int nobs = (Integer)smd.getFileInformation().get("caseQnty");
1679
        int nobs = dataTable.getCaseQuantity().intValue();
1✔
1680

1681
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
1682
            dbgLog.fine("data dimensions[observations x variables] = (" + nobs + "x" + nvar + ")");
×
1683
        }
1684
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
1685
            dbgLog.fine("bytes per row=" + bytes_per_row + " bytes");
×
1686
        }
1687

1688
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
1689
            dbgLog.fine("variableTypes=" + Arrays.deepToString(variableTypes));
×
1690
        }
1691
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
1692
            dbgLog.fine("StringLengthTable=" + StringLengthTable);
×
1693
        }
1694

1695
        // create a File object to save the tab-delimited data file
1696
        FileOutputStream fileOutTab = null;
1✔
1697
        PrintWriter pwout = null;
1✔
1698
        File tabDelimitedDataFile = File.createTempFile("tempTabfile.", ".tab");
1✔
1699

1700
        // save the temp tab-delimited file in the return ingest object:        
1701
        ingesteddata.setTabDelimitedFile(tabDelimitedDataFile);
1✔
1702

1703
        fileOutTab = new FileOutputStream(tabDelimitedDataFile);
1✔
1704
        pwout = new PrintWriter(new OutputStreamWriter(fileOutTab, "utf8"), true);
1✔
1705

1706
        /* Should we lose this dateFormat thing in 4.0? 
1707
         * the UNF should be calculatable on the app side solely from the data
1708
         * stored in the tab file and the type information stored the dataVariable
1709
         * object. 
1710
         * furthermore, the very idea of storing a format entry not just for 
1711
         * every variable, but for every value/observation is a bit strange. 
1712
         * TODO: review and confirm that, in the 3.* implementation, every
1713
         * entry in dateFormat[nvar][*] is indeed the same - except for the 
1714
         * missing value entries. -- L.A. 4.0
1715
          (OK, I got rid of the dateFormat; instead I kinda sorta assume
1716
          that the format is the same for every value in a column, save for 
1717
          the missing values... like this: 
1718
          dataTable.getDataVariables().get(columnCounter).setFormatSchemaName(ddt.format);
1719
          BUT, this needs to be reviewed/confirmed etc! 
1720
         */
1721
        //String[][] dateFormat = new String[nvar][nobs];
1722
        
1723
        // add the variable header here, if needed
1724
        if (saveWithVariableHeader) {
1✔
NEW
1725
            pwout.println(generateVariableHeader(dataTable.getDataVariables())); 
×
1726
        }
1727

1728
        for (int i = 0; i < nobs; i++) {
1✔
1729
            byte[] dataRowBytes = new byte[bytes_per_row];
1✔
1730
            Object[] dataRow = new Object[nvar];
1✔
1731

1732
            int nbytes = stream.read(dataRowBytes, 0, bytes_per_row);
1✔
1733

1734
            if (nbytes == 0) {
1✔
1735
                String errorMessage = "reading data: no data were read at("
×
1736
                        + i + "th row)";
1737
                throw new IOException(errorMessage);
×
1738
            }
1739
            // decoding each row
1740
            int byte_offset = 0;
1✔
1741
            for (int columnCounter = 0;
1✔
1742
                    columnCounter < variableTypes.length; columnCounter++) {
1✔
1743

1744
                Integer varType
1✔
1745
                        = variableTypeMap.get(variableTypes[columnCounter]);
1✔
1746

1747

1748
                // 4.0 Check if this is a time/date variable: 
1749
                boolean isDateTimeDatum = false; 
1✔
1750
                String formatCategory = dataTable.getDataVariables().get(columnCounter).getFormatCategory();
1✔
1751
                if (formatCategory != null && (formatCategory.equals("time") || formatCategory.equals("date"))) {
1✔
1752
                    isDateTimeDatum = true; 
×
1753
                }
1754

1755
                String variableFormat = dateVariableFormats[columnCounter];
1✔
1756

1757
                switch (varType != null ? varType : 256) {
1✔
1758
                    case -5:
1759
                        // Byte case
1760
                        // note: 1 byte signed
1761
                        byte byte_datum = dataRowBytes[byte_offset];
1✔
1762

1763
                        if (dbgLog.isLoggable(Level.FINER)) {
1✔
1764
                            dbgLog.finer(i + "-th row " + columnCounter
×
1765
                                    + "=th column byte =" + byte_datum);
1766
                        }
1767
                        if (byte_datum >= BYTE_MISSING_VALUE) {
1✔
1768
                            if (dbgLog.isLoggable(Level.FINER)) {
×
1769
                                dbgLog.finer(i + "-th row " + columnCounter
×
1770
                                        + "=th column byte MV=" + byte_datum);
1771
                            }
1772
                            dataRow[columnCounter] = MissingValueForTabDelimitedFile;
×
1773
                        } else {
1774
                            dataRow[columnCounter] = byte_datum;
1✔
1775
                        }
1776

1777
                        byte_offset++;
1✔
1778
                        break;
1✔
1779
                    case -4:
1780
                        // Stata-int (=java's short: 2byte) case
1781
                        // note: 2-byte signed int, not java's int
1782
                        ByteBuffer int_buffer
×
1783
                                = ByteBuffer.wrap(dataRowBytes, byte_offset, 2);
×
1784
                        if (isLittleEndian) {
×
1785
                            int_buffer.order(ByteOrder.LITTLE_ENDIAN);
×
1786

1787
                        }
1788
                        short short_datum = int_buffer.getShort();
×
1789

1790
                        if (dbgLog.isLoggable(Level.FINER)) {
×
1791
                            dbgLog.finer(i + "-th row " + columnCounter
×
1792
                                    + "=th column stata int =" + short_datum);
1793
                        }
1794
                        if (short_datum >= INT_MISSIG_VALUE) {
×
1795
                            if (dbgLog.isLoggable(Level.FINER)) {
×
1796
                                dbgLog.finer(i + "-th row " + columnCounter
×
1797
                                        + "=th column stata long missing value=" + short_datum);
1798
                            }
1799
                            dataRow[columnCounter] = MissingValueForTabDelimitedFile;
×
1800
                        } else {
1801

1802
                            if (isDateTimeDatum) {
×
1803

1804
                                DecodedDateTime ddt = decodeDateTimeData("short", variableFormat, Short.toString(short_datum));
×
1805
                                if (dbgLog.isLoggable(Level.FINER)) {
×
1806
                                    dbgLog.finer(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format);
×
1807
                                }
1808
                                dataRow[columnCounter] = ddt.decodedDateTime;
×
1809
                                //dateFormat[columnCounter][i] = ddt.format;
1810
                                dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format);
×
1811

1812
                            } else {
×
1813
                                dataRow[columnCounter] = short_datum;
×
1814
                            }
1815
                        }
1816
                        byte_offset += 2;
×
1817
                        break;
×
1818
                    case -3:
1819
                        // stata-Long (= java's int: 4 byte) case
1820
                        // note: 4-byte singed, not java's long
1821
                        //dbgLog.fine("DATreader: stata long");
1822

1823
                        ByteBuffer long_buffer
×
1824
                                = ByteBuffer.wrap(dataRowBytes, byte_offset, 4);
×
1825
                        if (isLittleEndian) {
×
1826
                            long_buffer.order(ByteOrder.LITTLE_ENDIAN);
×
1827

1828
                        }
1829
                        int int_datum = long_buffer.getInt();
×
1830

1831
                        if (dbgLog.isLoggable(Level.FINE)) {
×
1832
                            //dbgLog.fine(i + "-th row " + columnCounter
1833
                            //        + "=th column stata long =" + int_datum);
1834
                        }
1835
                        if (int_datum >= LONG_MISSING_VALUE) {
×
1836
                            if (dbgLog.isLoggable(Level.FINE)) {
×
1837
                                //dbgLog.fine(i + "-th row " + columnCounter
1838
                                //        + "=th column stata long missing value=" + int_datum);
1839
                            }
1840
                            dataRow[columnCounter] = MissingValueForTabDelimitedFile;
×
1841
                        } else {
1842
                            if (isDateTimeDatum) {
×
1843
                                DecodedDateTime ddt = decodeDateTimeData("int", variableFormat, Integer.toString(int_datum));
×
1844
                                if (dbgLog.isLoggable(Level.FINER)) {
×
1845
                                    dbgLog.finer(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format);
×
1846
                                }
1847
                                dataRow[columnCounter] = ddt.decodedDateTime;
×
1848
                                dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format);
×
1849

1850
                            } else {
×
1851
                                dataRow[columnCounter] = int_datum;
×
1852
                            }
1853

1854
                        }
1855
                        byte_offset += 4;
×
1856
                        break;
×
1857
                    case -2:
1858
                        // float case
1859
                        // note: 4-byte
1860
                        ByteBuffer float_buffer
×
1861
                                = ByteBuffer.wrap(dataRowBytes, byte_offset, 4);
×
1862
                        if (isLittleEndian) {
×
1863
                            float_buffer.order(ByteOrder.LITTLE_ENDIAN);
×
1864
                        }
1865
                        float float_datum = float_buffer.getFloat();
×
1866

1867
                        if (dbgLog.isLoggable(Level.FINER)) {
×
1868
                            dbgLog.finer(i + "-th row " + columnCounter
×
1869
                                    + "=th column float =" + float_datum);
1870
                        }
1871
                        if (FLOAT_MISSING_VALUE_SET.contains(float_datum)) {
×
1872
                            if (dbgLog.isLoggable(Level.FINER)) {
×
1873
                                dbgLog.finer(i + "-th row " + columnCounter
×
1874
                                        + "=th column float missing value=" + float_datum);
1875
                            }
1876
                            dataRow[columnCounter] = MissingValueForTabDelimitedFile;
×
1877

1878
                        } else {
1879

1880
                            if (isDateTimeDatum) {
×
1881
                                DecodedDateTime ddt = decodeDateTimeData("float", variableFormat, doubleNumberFormatter.format(float_datum));
×
1882
                                if (dbgLog.isLoggable(Level.FINER)) {
×
1883
                                    dbgLog.finer(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format);
×
1884
                                }
1885
                                dataRow[columnCounter] = ddt.decodedDateTime;
×
1886
                                dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format);
×
1887
                            } else {
×
1888
                                dataRow[columnCounter] = float_datum;
×
1889
                                // This may be temporary - but for now (as in, while I'm testing 
1890
                                // 4.0 ingest against 3.* ingest, I need to be able to tell if a 
1891
                                // floating point value was a single, or double float in the 
1892
                                // original STATA file: -- L.A. Jul. 2014
1893
                                dataTable.getDataVariables().get(columnCounter).setFormat("float");
×
1894
                            }
1895

1896
                        }
1897
                        byte_offset += 4;
×
1898
                        break;
×
1899
                    case -1:
1900
                        // double case
1901
                        // note: 8-byte
1902
                        ByteBuffer double_buffer
×
1903
                                = ByteBuffer.wrap(dataRowBytes, byte_offset, 8);
×
1904
                        if (isLittleEndian) {
×
1905
                            double_buffer.order(ByteOrder.LITTLE_ENDIAN);
×
1906
                        }
1907
                        double double_datum = double_buffer.getDouble();
×
1908

1909
                        if (DOUBLE_MISSING_VALUE_SET.contains(double_datum)) {
×
1910
                            if (dbgLog.isLoggable(Level.FINER)) {
×
1911
                                dbgLog.finer(i + "-th row " + columnCounter
×
1912
                                        + "=th column double missing value=" + double_datum);
1913
                            }
1914
                            dataRow[columnCounter] = MissingValueForTabDelimitedFile;
×
1915
                        } else {
1916

1917
                            if (isDateTimeDatum) {
×
1918
                                DecodedDateTime ddt = decodeDateTimeData("double", variableFormat, doubleNumberFormatter.format(double_datum));
×
1919
                                if (dbgLog.isLoggable(Level.FINER)) {
×
1920
                                    dbgLog.finer(i + "-th row , decodedDateTime " + ddt.decodedDateTime + ", format=" + ddt.format);
×
1921
                                }
1922
                                dataRow[columnCounter] = ddt.decodedDateTime;
×
1923
                                dataTable.getDataVariables().get(columnCounter).setFormat(ddt.format);
×
1924
                            } else {
×
1925
                                dataRow[columnCounter] = doubleNumberFormatter.format(double_datum);
×
1926
                            }
1927

1928
                        }
1929
                        byte_offset += 8;
×
1930
                        break;
×
1931
                    case 0:
1932
                        // String case
1933
                        int strVarLength = StringLengthTable.get(columnCounter);
×
1934
                        String raw_datum = new String(Arrays.copyOfRange(dataRowBytes, byte_offset,
×
1935
                                (byte_offset + strVarLength)), "ISO-8859-1");
1936
                        // TODO: 
1937
                        // is it the right thing to do, to default to "ISO-8859-1"?
1938
                        // (it may be; since there's no mechanism for specifying
1939
                        // alternative encodings in Stata, this may be their default;
1940
                        // it just needs to be verified. -- L.A. Jul. 2014)
1941
                        String string_datum = getNullStrippedString(raw_datum);
×
1942
                        if (dbgLog.isLoggable(Level.FINER)) {
×
1943
                            dbgLog.finer(i + "-th row " + columnCounter
×
1944
                                    + "=th column string =" + string_datum);
1945
                        }
1946
                        if (string_datum.isEmpty()) {
×
1947
                            if (dbgLog.isLoggable(Level.FINER)) {
×
1948
                                dbgLog.finer(i + "-th row " + columnCounter
×
1949
                                        + "=th column string missing value=" + string_datum);
1950
                            }
1951
                            // TODO: 
1952
                            /* Is this really a missing value case? 
1953
                             * Or is it an honest empty string? 
1954
                             * Is there such a thing as a missing value for a String in Stata?
1955
                             * -- L.A. 4.0
1956
                             */
1957
                            dataRow[columnCounter] = MissingValueForTabDelimitedFile;
×
1958
                        } else {
1959
                            /*
1960
                             * Some special characters, like new lines and tabs need to 
1961
                             * be escaped - otherwise they will break our TAB file 
1962
                             * structure! 
1963
                             * But before we escape anything, all the back slashes 
1964
                             * already in the string need to be escaped themselves.
1965
                             */
1966
                            String escapedString = string_datum.replace("\\", "\\\\");
×
1967
                            // escape quotes: 
1968
                            escapedString = escapedString.replaceAll("\"", Matcher.quoteReplacement("\\\""));
×
1969
                            // escape tabs and new lines:
1970
                            escapedString = escapedString.replaceAll("\t", Matcher.quoteReplacement("\\t"));
×
1971
                            escapedString = escapedString.replaceAll("\n", Matcher.quoteReplacement("\\n"));
×
1972
                            escapedString = escapedString.replaceAll("\r", Matcher.quoteReplacement("\\r"));
×
1973
                            // the escaped version of the string is stored in the tab file 
1974
                            // enclosed in double-quotes; this is in order to be able 
1975
                            // to differentiate between an empty string (tab-delimited empty string in 
1976
                            // double quotes) and a missing value (tab-delimited empty string). 
1977
                            // Although the question still remains - is it even possible 
1978
                            // to store an empty string, that's not a missing value, in Stata? 
1979
                            // - see the comment in the missing value case above. -- L.A. 4.0
1980
                            dataRow[columnCounter] = "\"" + escapedString + "\"";
×
1981
                        }
1982
                        byte_offset += strVarLength;
×
1983
                        break;
×
1984
                    default:
1985
                        dbgLog.fine("unknown variable type found");
×
1986
                        String errorMessage
×
1987
                                = "unknow variable Type found at data section";
1988
                        throw new InvalidObjectException(errorMessage);
×
1989
                } // switch
1990
            } // for-columnCounter
1991

1992
            // Dump the row of data to the tab-delimited file we are producing:
1993
            pwout.println(StringUtils.join(dataRow, "\t"));
1✔
1994

1995
            if (dbgLog.isLoggable(Level.FINE)) {
1✔
1996
                //dbgLog.fine(i + "-th row's data={" + StringUtils.join(dataRow, ",") + "};");
1997
            }
1998

1999
        }  // for- i (row)
2000

2001
        pwout.close();
1✔
2002

2003
        if (dbgLog.isLoggable(Level.FINE)) {
1✔
2004
            dbgLog.fine("variableTypes:\n" + Arrays.deepToString(variableTypes));
×
2005
        }
2006

2007
        dbgLog.fine("DTA Ingest: decodeData(): end.");
1✔
2008

2009
    }
1✔
2010

2011

2012
    private class DecodedDateTime {
×
2013
        String format;
2014
        String decodedDateTime;
2015
    }
2016

2017
    private DecodedDateTime decodeDateTimeData(String storageType, String FormatType, String rawDatum) throws IOException {
2018

2019
        if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("(storageType, FormatType, rawDatum)=("+
×
2020
        storageType +", " +FormatType +", " +rawDatum+")");
2021
        /*
2022
         *         Historical note:
2023
                   pseudofunctions,  td(), tw(), tm(), tq(), and th()
2024
                used to be called     d(),  w(),  m(),  q(), and  h().
2025
                Those names still work but are considered anachronisms.
2026

2027
        */
2028
        long milliSeconds;
2029
        String decodedDateTime=null;
×
2030
        String format = null;
×
2031

2032
        if (FormatType.matches("^%tc.*")){
×
2033
            // tc is a relatively new format
2034
            // datum is millisecond-wise
2035

2036
            milliSeconds = Math.round(new Double(rawDatum)) + STATA_BIAS_TO_EPOCH;
×
2037
            decodedDateTime = sdf_ymdhmsS.format(new Date(milliSeconds));
×
2038
            format = sdf_ymdhmsS.toPattern();
×
2039
            if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("tc: result="+decodedDateTime+", format = "+format);
×
2040
            
2041
        } else if (FormatType.matches("^%t?d.*")){
×
2042
            milliSeconds = Long.parseLong(rawDatum) * MILLISECONDS_PER_DAY + STATA_BIAS_TO_EPOCH;
×
2043
            if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("milliSeconds="+milliSeconds);
×
2044
            
2045
            decodedDateTime = sdf_ymd.format(new Date(milliSeconds));
×
2046
            format = sdf_ymd.toPattern();
×
2047
            if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("td:"+decodedDateTime+", format = "+format);
×
2048

2049
        } else if (FormatType.matches("^%t?w.*")){
×
2050

2051
            long weekYears = Math.round(new Double(rawDatum));
×
2052
            long left = Math.abs(weekYears)%52L;
×
2053
            long years;
2054
            if (weekYears < 0L){
×
2055
                left = 52L - left;
×
2056
                if (left == 52L){
×
2057
                    left = 0L;
×
2058
                }
2059
                //out.println("left="+left);
2060
                years = (Math.abs(weekYears) -1)/52L +1L;
×
2061
                years *= -1L;
×
2062
            } else {
2063
                years = weekYears/52L;
×
2064
            }
2065

2066
            String yearString  = Long.toString(1960L + years);
×
2067
            String dayInYearString = new DecimalFormat("000").format((left*7) + 1);
×
2068
            String yearDayInYearString = yearString + "-" + dayInYearString;
×
2069

2070
            Date tempDate = null;
×
2071
            try {
2072
                tempDate = new SimpleDateFormat("yyyy-DDD").parse(yearDayInYearString);
×
2073
            } catch (ParseException ex) {
×
2074
                throw new IOException(ex);
×
2075
            }
×
2076
            
2077
            decodedDateTime = sdf_ymd.format(tempDate.getTime());
×
2078
            format = sdf_ymd.toPattern();
×
2079

2080
        } else if (FormatType.matches("^%t?m.*")){
×
2081
            // month 
2082
            long monthYears = Math.round(new Double(rawDatum));
×
2083
            long left = Math.abs(monthYears)%12L;
×
2084
            long years;
2085
            if (monthYears < 0L){
×
2086
                left = 12L - left;
×
2087
                //out.println("left="+left);
2088
                years = (Math.abs(monthYears) -1)/12L +1L;
×
2089
                years *= -1L;
×
2090
            } else {
2091
                years = monthYears/12L;
×
2092
            }
2093

2094
            String month = null;
×
2095
            if (left == 12L){
×
2096
                left = 0L;
×
2097
            }
2098
            Long monthdata = (left+1);
×
2099
            month = "-"+twoDigitFormatter.format(monthdata)+"-01";
×
2100
            long year  = 1960L + years;
×
2101
            String monthYear = Long.toString(year) + month;
×
2102
            if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("rawDatum="+rawDatum+": monthYear="+monthYear);
×
2103
            
2104
            decodedDateTime = monthYear;
×
2105
            format = "yyyy-MM-dd";
×
2106
            if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("tm:"+decodedDateTime+", format:"+format);
×
2107

2108
        } else if (FormatType.matches("^%t?q.*")){
×
2109
            // quater
2110
            long quaterYears = Math.round(new Double(rawDatum));
×
2111
            long left = Math.abs(quaterYears)%4L;
×
2112
            long years;
2113
            if (quaterYears < 0L){
×
2114
                left = 4L - left;
×
2115
                //out.println("left="+left);
2116
                years = (Math.abs(quaterYears) -1)/4L +1L;
×
2117
                years *= -1L;
×
2118
            } else {
2119
                years = quaterYears/4L;
×
2120
            }
2121

2122
            String quater = null;
×
2123

2124
            if ((left == 0L) || (left == 4L)){
×
2125
                //quater ="q1"; //
2126
                quater = "-01-01";
×
2127
            } else if (left ==1L) {
×
2128
                //quater = "q2"; //
2129
                quater = "-04-01";
×
2130
            } else if (left ==2L) {
×
2131
                //quater = "q3"; //
2132
                quater = "-07-01";
×
2133
            } else if (left ==3L) {
×
2134
                //quater = "q4"; //
2135
                quater = "-11-01";
×
2136
            }
2137

2138
            long year  = 1960L + years;
×
2139
            String quaterYear = Long.toString(year) + quater;
×
2140
            if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("rawDatum="+rawDatum+": quaterYear="+quaterYear);
×
2141

2142
            decodedDateTime = quaterYear;
×
2143
            format = "yyyy-MM-dd";
×
2144
            if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("tq:"+decodedDateTime+", format:"+format);
×
2145

2146
        } else if (FormatType.matches("^%t?h.*")){
×
2147
            // half year
2148
            // odd number:2nd half
2149
            // even number: 1st half
2150
            
2151
            long halvesYears = Long.parseLong(rawDatum);
×
2152
            long left = Math.abs(halvesYears)%2L;
×
2153
            long years;
2154
            if (halvesYears < 0L){
×
2155
                years = (Math.abs(halvesYears) -1)/2L +1L;
×
2156
                years *= -1L;
×
2157
            } else {
2158
                years = halvesYears/2L;
×
2159
            }
2160

2161
            String half = null;
×
2162
            if (left != 0L){
×
2163
                // odd number => 2nd half: "h2"
2164
                //half ="h2"; //
2165
                half = "-07-01";
×
2166
            } else {
2167
                // even number => 1st half: "h1"
2168
                //half = "h1"; //
2169
                half = "-01-01";
×
2170
            }
2171
            long year  = 1960L + years;
×
2172
            String halfYear = Long.toString(year) + half;
×
2173
            if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("rawDatum="+rawDatum+": halfYear="+halfYear);
×
2174
            
2175
            decodedDateTime = halfYear;
×
2176
            format = "yyyy-MM-dd";
×
2177
            if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("th:"+decodedDateTime+", format:"+format);
×
2178
            
2179
        } else if (FormatType.matches("^%t?y.*")){
×
2180
            // year type's origin is 0 AD
2181
            decodedDateTime = rawDatum;
×
2182
            format = "yyyy";
×
2183
            if (dbgLog.isLoggable(Level.FINER)) dbgLog.finer("th:"+decodedDateTime);
×
2184
        } else {
2185
            decodedDateTime = rawDatum;
×
2186
            format=null;
×
2187
        }
2188
        DecodedDateTime retValue = new DecodedDateTime();
×
2189
        retValue.decodedDateTime = decodedDateTime;
×
2190
        retValue.format = format;
×
2191
        return retValue;
×
2192
    }
2193

2194
}
2195

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc