• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IQSS / dataverse / #22002

01 Apr 2024 07:56PM CUT coverage: 20.716% (+0.5%) from 20.173%
#22002

push

github

web-flow
Merge pull request #10453 from IQSS/develop

Merge 6.2 into master

704 of 2679 new or added lines in 152 files covered. (26.28%)

81 existing lines in 49 files now uncovered.

17160 of 82836 relevant lines covered (20.72%)

0.21 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/rdata/RDATAFileReader.java
1
/*
2
   Copyright (C) 2005-2013, by the President and Fellows of Harvard College.
3

4
   Licensed under the Apache License, Version 2.0 (the "License");
5
   you may not use this file except in compliance with the License.
6
   You may obtain a copy of the License at
7

8
         http://www.apache.org/licenses/LICENSE-2.0
9

10
   Unless required by applicable law or agreed to in writing, software
11
   distributed under the License is distributed on an "AS IS" BASIS,
12
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
   See the License for the specific language governing permissions and
14
   limitations under the License.
15

16
   Dataverse Network - A web application to share, preserve and analyze research data.
17
   Developed at the Institute for Quantitative Social Science, Harvard University.
18
   Version 3.0.
19
*/
20
package edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.rdata;
21

22

23
import java.io.*;
24
import java.io.InputStreamReader;
25
import java.text.*;
26
import java.util.logging.*;
27
import java.util.*;
28

29
import jakarta.inject.Inject;
30

31
// Rosuda Wrappers and Methods for R-calls to Rserve
32
import edu.harvard.iq.dataverse.settings.JvmSettings;
33
import org.rosuda.REngine.REXP;
34
import org.rosuda.REngine.REXPMismatchException;
35
import org.rosuda.REngine.RList;
36
import org.rosuda.REngine.Rserve.RFileInputStream;
37
import org.rosuda.REngine.Rserve.RFileOutputStream;
38
import org.rosuda.REngine.Rserve.*;
39

40
import edu.harvard.iq.dataverse.DataTable;
41
import edu.harvard.iq.dataverse.datavariable.DataVariable;
42
import edu.harvard.iq.dataverse.datavariable.VariableCategory;
43

44
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader;
45
import edu.harvard.iq.dataverse.ingest.tabulardata.spi.TabularDataFileReaderSpi;
46
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest;
47
import edu.harvard.iq.dataverse.rserve.*;
48

49

50
import org.apache.commons.lang3.RandomStringUtils;
51

52
/**
53
 * Dataverse 4.0 implementation of <code>TabularDataFileReader</code> for the 
54
 * RData Binary Format.
55
 * 
56
 * Based on the original implementation for DVN v3.*, by Matt Owen (2012-2013),
57
 * completed by Leonid Andreev in 2013. 
58
 * 
59
 * This version is a serious re-write of the plugin, using the new 4.0 
60
 * ingest plugin architecture. 
61
 * 
62
 * original 
63
 * @author Matthew Owen
64
 * @author Leonid Andreev
65
 
66
 * This implementation uses external R-Scripts to do the bulk of the processing.
67
 */
68
public class RDATAFileReader extends TabularDataFileReader {
69
    
70
// Date-time things
71
  public static final String[] FORMATS = { "other", "date", "date-time", "date-time-timezone" };
×
72

73
  // R-ingest recognition files
74
  private static final String[] FORMAT_NAMES = { "RDATA", "Rdata", "rdata" };
×
75
  private static final String[] EXTENSIONS = { "Rdata", "rdata" };
×
76
  private static final String[] MIME_TYPE = { "application/x-rlang-transport" };
×
77
  
78
  // R Scripts
79
  static private String RSCRIPT_CREATE_WORKSPACE = "";
×
80
  static private String RSCRIPT_DATASET_INFO_SCRIPT = "";
×
81
  static private String RSCRIPT_GET_DATASET = "";
×
82
  static private String RSCRIPT_GET_LABELS = "";
×
83
  static private String RSCRIPT_WRITE_DVN_TABLE = "";
×
84
  
85
  // RServe static variables
86
  private final String RSERVE_HOST;
87
  private final int RSERVE_PORT;
88
  private final String RSERVE_USER;
89
  private final String RSERVE_PASSWORD;
90
  
91
  // TODO: 
92
  // we're not using these time/data formats for anything, are we?
93
  // DATE FORMATS
94
  private static SimpleDateFormat[] DATE_FORMATS = new SimpleDateFormat[] {
×
95
    new SimpleDateFormat("yyyy-MM-dd")
96
  };
97
  
98
  // TIME FORMATS
99
  private static SimpleDateFormat[] TIME_FORMATS = new SimpleDateFormat[] {
×
100
    // Date-time up to milliseconds with timezone, e.g. 2013-04-08 13:14:23.102 -0500
101
    new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS z"),
102
    // Date-time up to milliseconds, e.g. 2013-04-08 13:14:23.102
103
    new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"),
104
    // Date-time up to seconds with timezone, e.g. 2013-04-08 13:14:23 -0500
105
    new SimpleDateFormat("yyyy-MM-dd HH:mm:ss z"),
106
    // Date-time up to seconds and no timezone, e.g. 2013-04-08 13:14:23
107
    new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
108
  };
109
  
110
  // Logger
111
  private static final Logger LOG = Logger.getLogger(RDATAFileReader.class.getPackage().getName());
×
112

113
 
114
  TabularDataIngest ingesteddata = new TabularDataIngest();
×
115
  private DataTable dataTable = new DataTable();
×
116
    
117
  // Process ID, used partially in the generation of temporary directories
118
  private String mPID;
119
  
120
  // Object containing all the information for an R-workspace (including
121
  // temporary directories on and off server)
122
  private RWorkspace mRWorkspace;
123
  
124
  
125

126
  // Number formatter
127
  NumberFormat doubleNumberFormatter = new DecimalFormat();
×
128

129
  // Builds R Requests for an R-server
130
  private RRequestBuilder mRequestBuilder;
131
  /*
132
   * Initialize Static Variables
133
   * This is primarily to construct the R-Script
134
   */
135
  static {
136
    // Load R Scripts into memory, so that we can run them via R-serve
137
    RSCRIPT_WRITE_DVN_TABLE = readLocalResource("scripts/write.table.R");
×
138
    RSCRIPT_GET_DATASET = readLocalResource("scripts/get.dataset.R");
×
139
    RSCRIPT_CREATE_WORKSPACE = readLocalResource("scripts/create.workspace.R");
×
140
    RSCRIPT_GET_LABELS = readLocalResource("scripts/get.labels.R");
×
141
    RSCRIPT_DATASET_INFO_SCRIPT = readLocalResource("scripts/dataset.info.script.R");
×
142
    
143
    
144
    LOG.finer("R SCRIPTS AS STRINGS --------------");
×
145
    LOG.finer(RSCRIPT_WRITE_DVN_TABLE);
×
146
    LOG.finer(RSCRIPT_GET_DATASET);
×
147
    LOG.fine(RSCRIPT_CREATE_WORKSPACE);
×
148
    LOG.finer(RSCRIPT_GET_LABELS);
×
149
    LOG.finer(RSCRIPT_DATASET_INFO_SCRIPT);
×
150
    LOG.finer("END OF R SCRIPTS AS STRINGS -------");
×
151
   }
×
152
  
153
  /* 
154
   * TODO: 
155
   * Switch to the implementation in iq.dataverse.rserve
156
   * -- L.A. 4.0 alpha 1
157
  */
158
  private class RWorkspace {
159
    public String mParent, mWeb, mDvn, mDsb;
160
    public File mDataFile, mCsvDataFile;
161
    public RRequest mRRequest;
162
    public BufferedInputStream mInStream;
163
    /**
164
     * 
165
     */
166
    public RWorkspace () {
×
167
      mParent = mWeb = mDvn = mDsb = "";
×
168
      mDataFile = null;
×
169
      mCsvDataFile = null;
×
170
      mInStream = null;
×
171
    }
×
172
    /**
173
     * Create the Actual R Workspace
174
     */
175
    public void create () {
176
      try {
177
        LOG.fine("RDATAFileReader: Creating R Workspace");
×
178
        RRequestBuilder scriptBuilder = mRequestBuilder.script(RSCRIPT_CREATE_WORKSPACE);
×
179
        LOG.fine("got a sript request builder");
×
180
        
181
        RRequest scriptRequest = scriptBuilder.build();
×
182
        LOG.fine("script request built.");
×
183
        
184
        /*
185
        REXP result = mRequestBuilder
186
                .script(RSCRIPT_CREATE_WORKSPACE)
187
                .build()
188
                .eval();
189
        */
190
        REXP result = scriptRequest.eval(); 
×
191
        
192
        LOG.fine("evaluated the script");
×
193
        
194
        RList directoryNames = result.asList();
×
195
        
196
        mParent = null; 
×
197
        
198
        if (directoryNames != null) {
×
199
            if (directoryNames.at("parent") != null) {
×
200
                mParent = directoryNames.at("parent").asString();
×
201
            } else {
202
                LOG.fine("WARNING: directoryNames at \"parent\" is null!");
×
203
                if(directoryNames.isEmpty()) {
×
204
                    LOG.fine("WARNING: directoryNames is empty!");
×
205
                } else {
206
                    Set<String> dirKeySet = directoryNames.keySet();
×
207
                    Iterator iter = dirKeySet.iterator();
×
208
                    String key;
209

210
                    while (iter.hasNext()) {
×
211
                        key = (String) iter.next();
×
212
                        LOG.fine("directoryNames list key: "+key);
×
213
                    }
214
                }
×
215
            }
216
            
217
        } else {
218
            LOG.fine("WARNING: directoryNames is null!");
×
219
        }
220
        
221
        LOG.fine(String.format("RDATAFileReader: Parent directory of R Workspace is %s", mParent));
×
222
        
223
        LOG.fine("RDATAFileReader: Creating file handle");
×
224
        
225
        mDataFile = new File(mParent, "data.Rdata");
×
226
      }
227
      catch (Exception E) {
×
228
        LOG.warning("RDATAFileReader: Could not create R workspace");
×
229
        mParent = mWeb = mDvn = mDsb = "";
×
230
      }
×
231
    }
×
232
    /**
233
     * Destroy the Actual R Workspace
234
     */
235
    public void destroy () {
236
      String destroyerScript = new StringBuilder("")
×
237
              .append(String.format("unlink(\"%s\", TRUE, TRUE)", mParent))
×
238
              .toString();
×
239
      
240
      try {
241
        LOG.fine("RDATAFileReader: Destroying R Workspace");
×
242

243
        mRRequest = mRequestBuilder
×
244
                .script(destroyerScript)
×
245
                .build();
×
246
        
247
        mRRequest.eval();
×
248
        
249
        LOG.fine("RDATAFileReader: DESTROYED R Workspace");
×
250
      }
251
      catch (Exception ex) {
×
252
        LOG.warning("RDATAFileReader: R Workspace was not destroyed");
×
253
        LOG.fine(ex.getMessage());
×
254
      }
×
255
    }
×
256
    /**
257
     * Create the Data File to Use for Analysis, etc.
258
     */
259
    public File dataFile (String target, String prefix, int size) {
260
      
261
      String fileName = String.format("DVN.dataframe.%s.Rdata", mPID);
×
262
      
263
      mDataFile = new File(mParent, fileName);
×
264
                
265
      RFileInputStream RInStream = null;
×
266
      OutputStream outStream = null;
×
267
      
268
      RRequest req = mRequestBuilder.build();
×
269
      
270
      try {
271
        outStream = new BufferedOutputStream(new FileOutputStream(mDataFile));
×
272
        RInStream = req.getRConnection().openFile(target);
×
273
        
274
        if (size < 1024*1024*500) {
×
275
          int bufferSize = size;
×
276
          byte [] outputBuffer = new byte[bufferSize];
×
277
          RInStream.read(outputBuffer);
×
278
          outStream.write(outputBuffer, 0, size);
×
279
        }
280
        
281
        RInStream.close();
×
282
        outStream.close();
×
283
        return mDataFile;
×
284
      }
285
      catch (FileNotFoundException exc) {
×
286
        exc.printStackTrace();
×
287
        LOG.warning("RDATAFileReader: FileNotFound exception occurred");
×
288
        return mDataFile;
×
289
      }
290
      catch (IOException exc) {
×
291
        exc.printStackTrace();
×
292
        LOG.warning("RDATAFileReader: IO exception occurred");
×
293
      }
294

295
      // Close R input data stream
296
      if (RInStream != null) {
×
297
        try {
298
          RInStream.close();
×
299
        }
300
        catch (IOException exc) {
×
301
        }
×
302
      }
303

304
      // Close output data stream
305
      if (outStream != null) {
×
306
        try {
307
          outStream.close();
×
308
        }
309
        catch (IOException ex) {
×
310
        }
×
311
      }
312
      
313
      return mDataFile;
×
314
    }
315
    /**
316
     * Set the stream
317
     * @param inStream 
318
     */
319
    public void stream (BufferedInputStream inStream) {
320
      mInStream = inStream;
×
321
    }
×
322
    /**
323
     * Save the Rdata File Temporarily
324
     */
325
    private File saveRdataFile () {
326
      LOG.fine("RDATAFileReader: Saving Rdata File from Input Stream");
×
327
      
328
      if (mInStream == null) {
×
329
        LOG.fine("RDATAFileReader: No input stream was specified. Not writing file and returning NULL");
×
330
        return null;
×
331
      }
332
      
333
      byte [] buffer = new byte [1024];
×
334
      int bytesRead = 0;
×
335
      RFileOutputStream outStream = null;
×
336
      RConnection rServerConnection = null;
×
337
      
338
      try {
339
        LOG.fine("RDATAFileReader: Opening R connection");
×
340
        rServerConnection = new RConnection(RSERVE_HOST, RSERVE_PORT);
×
341
        
342
        LOG.fine("RDATAFileReader: Logging into R connection");
×
343
        rServerConnection.login(RSERVE_USER, RSERVE_PASSWORD);
×
344
        
345
        LOG.fine("RDATAFileReader: Attempting to create file");
×
346
        outStream = rServerConnection.createFile(mDataFile.getAbsolutePath());
×
347
        
348
        LOG.fine(String.format("RDATAFileReader: File created on server at %s", mDataFile.getAbsolutePath()));
×
349
      }
350
      catch (IOException ex) {
×
351
        LOG.warning("RDATAFileReader: Could not create file on R Server");
×
352
      }
353
      catch (RserveException ex) {
×
354
        LOG.warning("RDATAFileReader: Could not connect to R Server");
×
355
      }
×
356
      
357
      /*
358
       * Read stream and write to destination file
359
       */
360
      try {
361
        // Read from local file and write to rserver 1kb at a time
362
        while (mInStream.read(buffer) != -1) {
×
363
          outStream.write(buffer);
×
364
          bytesRead++;
×
365
        }
366
      }
367
      catch (IOException ex) {
×
368
        LOG.warning("RDATAFileReader: Could not write to file");
×
369
        LOG.fine(String.format("Error message: %s", ex.getMessage()));
×
370
      }
371
      catch (NullPointerException ex) {
×
372
        LOG.warning("RDATAFileReader: Data file has not been specified");
×
373
      }
×
374
      
375
      // Closing R server connection
376
      if (rServerConnection != null) {
×
377
        LOG.fine("RDATAFileReader: Closing R server connection");
×
378
        rServerConnection.close();
×
379
      }
380
      
381
      return mDataFile;
×
382
    }
383
    private File saveCsvFile () {
384
      // Specify CSV File Location on Server
385
      mCsvDataFile = new File(mRWorkspace.getRdataFile().getParent(), "data.csv");
×
386

387
      // 
388
      String csvScript = new StringBuilder("")
×
389
        .append("options(digits.secs=3)")
×
390
        .append("\n")
×
391
        .append(RSCRIPT_WRITE_DVN_TABLE)
×
392
        .append("\n")
×
393
        .append(String.format("load(\"%s\")", mRWorkspace.getRdataAbsolutePath()))
×
394
        .append("\n")
×
395
        .append(RSCRIPT_GET_DATASET)
×
396
        .append("\n")
×
397
        .append(String.format("write.dvn.table(data.set, file=\"%s\")", mCsvDataFile.getAbsolutePath()))
×
398
        .toString();
×
399
      
400
      // 
401
      RRequest csvRequest = mRequestBuilder.build();
×
402
      
403
      LOG.fine(String.format("RDATAFileReader: Attempting to write table to `%s`", mCsvDataFile.getAbsolutePath()));
×
404
      csvRequest.script(csvScript).eval();
×
405

406
      return mCsvDataFile;
×
407
    }
408
    /**
409
     * Return Rdata File Handle on R Server
410
     * @return the <code>File</code> handle of the Rdata file (the current value of mDataFile)
411
     */
412
    public File getRdataFile () {
413
      return mDataFile;
×
414
    }
415
    /**
416
     * Return Location of Rdata File on R Server
417
     * @return the file location as a string on the (potentially) remote R server
418
     */
419
    public String getRdataAbsolutePath () {
420
      return mDataFile.getAbsolutePath();
×
421
    }
422
  }
423
  /**
424
   * Constructs a <code>RDATAFileReader</code> instance from its "Spi" Class
425
   * @param originator a <code>TabularDataFileReaderSpi</code> object.
426
   */
427
  public RDATAFileReader(TabularDataFileReaderSpi originator) {
428

429
    super(originator);
×
430
    
431
    // These settings have sane defaults in resources/META-INF/microprofile-config.properties,
432
    // ready to be overridden by a sysadmin. Every time a file would be read with this file reader,
433
    // a new reader will be created, reading from the cached config source settings with minimal overhead.
434
    this.RSERVE_HOST = JvmSettings.RSERVE_HOST.lookup();
×
435
    int port;
436
    try {
437
      port = JvmSettings.RSERVE_PORT.lookup(Integer.class);
×
438
    } catch (IllegalArgumentException e) {
×
439
      LOG.log(Level.SEVERE, "Could not parse value for " + JvmSettings.RSERVE_PORT.getScopedKey() + ", defaulting to 6311", e);
×
440
      port = 6311;
×
441
    }
×
442
    this.RSERVE_PORT = port;
×
443
    this.RSERVE_USER = JvmSettings.RSERVE_USER.lookup();
×
444
    this.RSERVE_PASSWORD = JvmSettings.RSERVE_PASSWORD.lookup();
×
445

446
    LOG.fine("RDATAFileReader: INSIDE RDATAFileReader");
×
447

448
    // Create request builder.
449
    // This object is used throughout as an RRequest factory
450
    mRequestBuilder = new RRequestBuilder()
×
451
            .host(RSERVE_HOST)
×
452
            .port(RSERVE_PORT)
×
453
            .user(RSERVE_USER)
×
454
            .password(RSERVE_PASSWORD);
×
455
    
456
    // Create R Workspace
457
    mRWorkspace = new RWorkspace();
×
458
    
459
    mPID = RandomStringUtils.randomNumeric(6);
×
460
  }
×
461

462
  private void init() throws IOException {
463
    doubleNumberFormatter.setGroupingUsed(false);
×
464
    doubleNumberFormatter.setMaximumFractionDigits(340);
×
465
    
466
  }
×
467
  
468
  /**
469
   * Read the Given RData File
470
   * @param stream a <code>BufferedInputStream</code>.
471
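   * @param saveWithVariableHeader passed on to <code>RTabFileParser.read</code>
   * @param dataFile not used by this reader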
472
   * @return an <code>TabularDataIngest</code> object
473
   * @throws java.io.IOException if a reading error occurs.
474
   */
475
    @Override
476
    public TabularDataIngest read(BufferedInputStream stream, boolean saveWithVariableHeader, File dataFile) throws IOException {
477

478
        init();
×
479

480
        // Create Request object
481
        LOG.fine("RDATAFileReader: Creating RRequest object from RRequestBuilder object");
×
482

483
        try {
484
            // Create R Workspace
485
            mRWorkspace.stream(stream);
×
486
            mRWorkspace.create();
×
487
            mRWorkspace.saveRdataFile();
×
488
            mRWorkspace.saveCsvFile();
×
489

490
            // Copy CSV file to a local, temporary directory
491
            // Additionally, this sets the "tabDelimitedDataFile" property of the FileInformation
492
            File localCsvFile = transferCsvFile(mRWorkspace.mCsvDataFile);
×
493

494
            // Generate and save all the information about data set; this creates all 
495
            // the DataVariable objects, among other things:
496
            getDataFrameInformation();
×
497

498
            // Read and parse the TAB-delimited file saved by R, above; do the 
499
            // necessary post-processinga and filtering, and save the resulting 
500
            // TAB file as tabFileDestination, below. This is the file we'll be 
501
            // using to calculate the UNF, and for the storage/preservation of the
502
            // dataset. 
503
            // IMPORTANT: this must be done *after* the variable metadata has been 
504
            // created!
505
            // - L.A. 
506
            RTabFileParser csvFileReader = new RTabFileParser('\t');
×
507
            BufferedReader localBufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(localCsvFile), "UTF-8"));
×
508

509
            File tabFileDestination = File.createTempFile("data-", ".tab");
×
510
            PrintWriter tabFileWriter = new PrintWriter(tabFileDestination.getAbsolutePath(), "UTF-8");
×
511
        
NEW
512
            int lineCount = csvFileReader.read(localBufferedReader, dataTable, saveWithVariableHeader, tabFileWriter);
×
513

514
            LOG.fine("RDATAFileReader: successfully read "+lineCount+" lines of tab-delimited data.");
×
515
        
516
            dataTable.setUnf("UNF:pending");
×
517
        
518
            ingesteddata.setTabDelimitedFile(tabFileDestination);
×
519
            ingesteddata.setDataTable(dataTable);
×
520

521
            // Destroy R workspace
522
            mRWorkspace.destroy();
×
523
        } catch (Exception ex) {
×
524
            throw new IOException ("Unknown exception occured during ingest; "+ex.getMessage());
×
525
        }
×
526

527
        LOG.fine("RDATAFileReader: Leaving \"read\" function");
×
528

529
        return ingesteddata;
×
530
    }
531
  
532
  /**
533
   * Copy Remote File on R-server to a Local Target
534
   * @param target a target on the remote r-server
535
   * @return 
536
   */
537
  private File transferCsvFile (File target) {
538
    File destination;
539
    FileOutputStream csvDestinationStream;
540
    
541
    try {
542
      destination = File.createTempFile("data", ".csv");
×
543
      LOG.fine(String.format("RDATAFileReader: Writing local CSV File to `%s`", destination.getAbsolutePath()));
×
544
      csvDestinationStream = new FileOutputStream(destination);
×
545
    }
546
    catch (IOException ex) {
×
547
      LOG.warning("RDATAFileReader: Could not create temporary file!");
×
548
      return null;
×
549
    }
×
550
    
551
    try {
552
      // Open connection to R-serve
553
      RConnection rServeConnection = new RConnection(RSERVE_HOST, RSERVE_PORT);
×
554
      rServeConnection.login(RSERVE_USER, RSERVE_PASSWORD);
×
555
      
556
      // Open file for reading from R-serve
557
      RFileInputStream rServeInputStream = rServeConnection.openFile(target.getAbsolutePath());
×
558
      
559
      int b;
560
      
561
      LOG.fine("RDATAFileReader: Beginning to write to local destination file");
×
562
      
563
      // Read from stream one character at a time
564
      while ((b = rServeInputStream.read()) != -1) {
×
565
        // Write to the *local* destination file
566
        csvDestinationStream.write(b);
×
567
      }
568
      
569
      LOG.fine(String.format("RDATAFileReader: Finished writing from destination `%s`", target.getAbsolutePath()));
×
570
      LOG.fine(String.format("RDATAFileReader: Finished copying to source `%s`", destination.getAbsolutePath()));
×
571
      
572
      
573
      LOG.fine("RDATAFileReader: Closing CSVFileReader R Connection");
×
574
      rServeConnection.close();
×
575
    }
576
    /*
577
     * TO DO: Make this error catching more intelligent
578
     */
579
    catch (Exception ex) {
×
580
    }
×
581
    
582
    return destination;
×
583
  }
584
  
585
  
586
    /**
587
     *
588
     * Runs an R-script that extracts meta-data from the *original* Rdata
589
     * object, then parses its output and creates DataVariable objects.
590
     *
591
     * Failures are logged and are not propagated to the caller.
592
     */
593
    private void getDataFrameInformation() {
594
        LOG.fine("RDATAFileReader: Entering `getDataFrameInformation` function");
×
595

596
        // Store variable names
597
        String[] variableNames = {};
×
598

599
        String parentDirectory = mRWorkspace.getRdataFile().getParent();
×
600

601
        String fileInfoScript = new StringBuilder("")
×
602
                .append(String.format("load(\"%s\")\n", mRWorkspace.getRdataAbsolutePath()))
×
603
                .append(String.format("setwd(\"%s\")\n", parentDirectory))
×
604
                .append(RSCRIPT_GET_DATASET)
×
605
                .append("\n")
×
606
                .append(RSCRIPT_DATASET_INFO_SCRIPT)
×
607
                .toString();
×
608

609
        try {
610
            RRequest request = mRequestBuilder.build();
×
611
            request.script(fileInfoScript);
×
612
            RList fileInformation = request.eval().asList();
×
613

614
            RList metaInfo = fileInformation.at("meta.info").asList();
×
615

616
            int varQnty = 0;
×
617
            variableNames = fileInformation.at("varNames").asStrings();
×
618

619
            //mDataTypes = fileInformation.at("dataTypes").asStrings();
620

621
            // Initialize variables: 
622
            List<DataVariable> variableList = new ArrayList<>();
×
623

624
            for (String varName : variableNames) {
×
625
                DataVariable dv = new DataVariable(varQnty, dataTable);
×
626
                dv.setName(varName);
×
627
                dv.setLabel(varName);
×
628
                // TODO:
629
                // Check if variables have real descriptive labels defined, 
630
                // via the mechanismm provided by that special optional package... 
631
                // (?) -- L.A.
632
                variableList.add(dv);
×
633

634
                // variableLabels.put(varName, varName);
635
                // variableNameList.add(varName);
636
                varQnty++;
×
637
            }
638

639
            dataTable.setVarQuantity(new Long(varQnty));
×
640
            dataTable.setDataVariables(variableList);
×
641
        
642
            // Get the Variable Meta Data Table while Populating 
643
            processVariableInfo(metaInfo, dataTable);
×
644
      
645
            
646
            if (fileInformation.at("caseQnty") != null) {
×
647
                int caseQuantity = 0; 
×
648
                try {
649
                    caseQuantity =  fileInformation.at("caseQnty").asInteger();
×
650
                } catch (REXPMismatchException rexp) {
×
651
                    // bummer! - but not fatal. 
652
                }
×
653
                if (caseQuantity > 0) {
×
654
                    dataTable.setCaseQuantity(new Long(caseQuantity));
×
655
                }
656
            }
657
    }
658
    catch (REXPMismatchException ex) {
×
659
      LOG.warning("RDATAFileReader: Could not put information correctly");
×
660
    }
661
    catch (Exception ex) {
×
662
      ex.printStackTrace();
×
663
      LOG.warning(ex.getMessage());
×
664
    }
×
665
    
666
    
667
  }
×
668

669
    /**
670
   * Read a Local Resource and Return Its Contents as a String
671
   * <code>readLocalResource</code> searches the local path around the class
672
   * <code>RDATAFileReader</code> for a file and returns its contents as a
673
   * string.
674
   * @param path String specifying the name of the local file to be converted
675
   * into a UTF-8 string.
676
   * @return a UTF-8 <code>String</code>
677
   */
678
    private static String readLocalResource(String path) {
679
        // Debug
680
        LOG.fine(String.format("RDATAFileReader: readLocalResource: reading local path \"%s\"", path));
×
681

682
        // Get stream
683
        InputStream resourceStream = RDATAFileReader.class.getResourceAsStream(path);
×
684
        String resourceAsString = "";
×
685

686
        // Try opening a buffered reader stream
687
        try {
688
            BufferedReader rd = new BufferedReader(new InputStreamReader(resourceStream, "UTF-8"));
×
689

690
            String line = null;
×
691
            while ((line = rd.readLine()) != null) {
×
692
                resourceAsString = resourceAsString.concat(line + "\n");
×
693
            }
694
            resourceStream.close();
×
695
        } catch (IOException ex) {
×
696
            LOG.warning(String.format("RDATAFileReader: (readLocalResource) resource stream from path \"%s\" was invalid", path));
×
697
        }
×
698

699
        // Return string
700
        return resourceAsString;
×
701
    }
702

703
  
704
    /**
705
     * Get a HashMap matching column number to meta-data used in re-creating R
706
     * Objects
707
     *
708
     * @param metaInfo an "RList" Object containing indices - type, type.string,
709
     * class, levels, and format.
710
     * @param dataTable a dataverse DataTable object
711
     */
712
    private void processVariableInfo(RList metaInfo, DataTable dataTable) throws IOException {
713
        // list(type = 1, type.string = "integer", class = class(values), levels = NULL, format = NULL)
714
        Integer variableType = -1;
×
715
        String variableTypeName = "", variableFormat = "";
×
716
        String[] variableLevels = null;
×
717

718

719
        for (int k = 0; k < metaInfo.size(); k++) {
×
720

721
            try {
722

723
                // Meta-data for a column in the data-set
724
                RList columnMeta = metaInfo.at(k).asList();
×
725

726
                // Extract information from the returned list
727
                variableType = !columnMeta.at("type").isNull() ? columnMeta.at("type").asInteger() : null;
×
728
                variableTypeName = !columnMeta.at("type.string").isNull() ? columnMeta.at("type.string").asString() : null;
×
729
                variableLevels = !columnMeta.at("levels").isNull() ? columnMeta.at("levels").asStrings() : new String[0];
×
730
                variableFormat = !columnMeta.at("format").isNull() ? columnMeta.at("format").asString() : null;
×
731

732
                LOG.fine("variable type: " + variableType);
×
733
                LOG.fine("variable type name: " + variableTypeName);
×
734
                LOG.fine("variable format: " + variableFormat);
×
735

736
                for (String variableLevel : variableLevels) {
×
737
                    LOG.fine("variable level: " + variableLevel);
×
738
                }
739

740
                //dataTable.getDataVariables().get(k).setFormatSchema("RDATA");
741

742
                if (variableTypeName == null || variableTypeName.equals("character") || variableTypeName.equals("other")) {
×
743
                    // This is a String: 
744
                    dataTable.getDataVariables().get(k).setTypeCharacter();
×
745
                    dataTable.getDataVariables().get(k).setIntervalDiscrete();
×
746
                    
747
                } else if (variableTypeName.equals("integer")) {
×
748
                    dataTable.getDataVariables().get(k).setTypeNumeric();
×
749
                    dataTable.getDataVariables().get(k).setIntervalDiscrete();
×
750
                    
751
                } else if (variableTypeName.equals("numeric") || variableTypeName.equals("double")) {
×
752
                    dataTable.getDataVariables().get(k).setTypeNumeric();
×
753
                    dataTable.getDataVariables().get(k).setIntervalContinuous();
×
754
                    
755
                } else if (variableTypeName.startsWith("Date")) {
×
756
                    dataTable.getDataVariables().get(k).setTypeCharacter();
×
757
                    dataTable.getDataVariables().get(k).setIntervalDiscrete();
×
758
                    dataTable.getDataVariables().get(k).setFormat(variableFormat);
×
759
                    
760
                    // instead:
761
                    if (variableTypeName.equals("Date")) {
×
762
                        dataTable.getDataVariables().get(k).setFormatCategory("date");
×
763
                    } else if (variableTypeName.equals("DateTime")) {
×
764
                        dataTable.getDataVariables().get(k).setFormatCategory("time");
×
765
                    }
766
                    
767
                } else if (variableTypeName.equals("factor")) {
×
768
                    
769
                    // All R factors are *string* factors!
770
                    dataTable.getDataVariables().get(k).setTypeCharacter();
×
771
                    dataTable.getDataVariables().get(k).setIntervalDiscrete();
×
772
                    if (variableLevels != null && variableLevels.length > 0) {
×
773
                        // yes, this is a factor, with levels defined.
774
                        LOG.fine("this is a factor.");
×
775
                        dataTable.getDataVariables().get(k).setFactor(true);
×
776
                        boolean ordered = false; 
×
777
                        
778
                        if (variableFormat != null && variableFormat.equals("ordered")) {
×
779
                            LOG.fine("an ordered factor, too");
×
780
                            ordered = true;
×
781
                        }
782
                        
783
                        for (int i = 0; i < variableLevels.length; i++) {
×
784
                            VariableCategory cat = new VariableCategory();
×
785
                            cat.setValue(variableLevels[i]);
×
786
                            // Sadly, R factors don't have descriptive labels;
787
                            cat.setLabel(variableLevels[i]);
×
788
                            
789
                            if (ordered) {
×
790
                                cat.setOrder(i+1);
×
791
                            }
792

793
                            /* cross-link the variable and category to each other: */
794
                            cat.setDataVariable(dataTable.getDataVariables().get(k));
×
795
                            dataTable.getDataVariables().get(k).getCategories().add(cat);
×
796
                        }
797
                        
798
                        dataTable.getDataVariables().get(k).setOrderedCategorical(ordered);
×
799

800
                    }
×
801

802
                } // And finally, a special case for logical variables: 
803
                // For all practical purposes, they are handled as numeric factors
804
                // with 0 and 1 for the values and "FALSE" and "TRUE" for the labels.
805
                // (so this can also be used as an example of ingesting a *numeric* 
806
                // categorical variable - as opposed to *string* categoricals, that
807
                // we turn R factors into - above.)
808
                else if ("logical".equals(variableTypeName)) {
×
809
                    dataTable.getDataVariables().get(k).setFormatCategory("Boolean");
×
810
                    
811
                    dataTable.getDataVariables().get(k).setTypeNumeric();
×
812
                    dataTable.getDataVariables().get(k).setIntervalDiscrete();
×
813

814
                    String booleanFactorLabels[] = new String[2];
×
815
                    booleanFactorLabels[0] = "FALSE";
×
816
                    booleanFactorLabels[1] = "TRUE";
×
817

818
                    String booleanFactorValues[] = new String[2];
×
819
                    booleanFactorValues[0] = "0";
×
820
                    booleanFactorValues[1] = "1";
×
821

822
                    for (int i = 0; i < 2; i++) {
×
823
                        VariableCategory cat = new VariableCategory();
×
824
                        cat.setValue(booleanFactorValues[i]);
×
825
                        // Unlike the string factors above, the numeric values "0"/"1" get the descriptive labels "FALSE"/"TRUE":
826
                        cat.setLabel(booleanFactorLabels[i]);
×
827

828
                        /* cross-link the variable and category to each other: */
829
                        cat.setDataVariable(dataTable.getDataVariables().get(k));
×
830
                        dataTable.getDataVariables().get(k).getCategories().add(cat);
×
831
                    }
832
                }
833

834
                // Store the meta-data in a hashmap (to return later)
835
            } catch (REXPMismatchException ex) {
×
836
                // If something went wrong, then it wasn't meant to be for that column.
837
                // And you know what? That's okay.
838
                ex.printStackTrace();
×
839
                LOG.fine(String.format("Could not process variable %d of the data frame.", k));
×
840
            }
×
841
        }
842
    }
×
843
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc