• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

HicServices / RDMP / 6245535001

20 Sep 2023 07:44AM UTC coverage: 57.013%. First build
6245535001

push

github

web-flow
8.1.0 Release (#1628)

* Bump Newtonsoft.Json from 13.0.1 to 13.0.2

Bumps [Newtonsoft.Json](https://github.com/JamesNK/Newtonsoft.Json) from 13.0.1 to 13.0.2.
- [Release notes](https://github.com/JamesNK/Newtonsoft.Json/releases)
- [Commits](https://github.com/JamesNK/Newtonsoft.Json/compare/13.0.1...13.0.2)

---
updated-dependencies:
- dependency-name: Newtonsoft.Json
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump NLog from 5.0.5 to 5.1.0

Bumps [NLog](https://github.com/NLog/NLog) from 5.0.5 to 5.1.0.
- [Release notes](https://github.com/NLog/NLog/releases)
- [Changelog](https://github.com/NLog/NLog/blob/dev/CHANGELOG.md)
- [Commits](https://github.com/NLog/NLog/compare/v5.0.5...v5.1.0)

---
updated-dependencies:
- dependency-name: NLog
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump NLog from 5.0.5 to 5.1.0

* Fix -r flag - should have been --results-directory all along

* Bump Newtonsoft.Json from 13.0.1 to 13.0.2

* Bump YamlDotNet from 12.0.2 to 12.1.0

Bumps [YamlDotNet](https://github.com/aaubry/YamlDotNet) from 12.0.2 to 12.1.0.
- [Release notes](https://github.com/aaubry/YamlDotNet/releases)
- [Commits](https://github.com/aaubry/YamlDotNet/compare/v12.0.2...v12.1.0)

---
updated-dependencies:
- dependency-name: YamlDotNet
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump Moq from 4.18.2 to 4.18.3

Bumps [Moq](https://github.com/moq/moq4) from 4.18.2 to 4.18.3.
- [Release notes](https://github.com/moq/moq4/releases)
- [Changelog](https://github.com/moq/moq4/blob/main/CHANGELOG.md)
- [Commits](https://github.com/moq/moq4/compare/v4.18.2...v4.18.3)

---
updated-dependencies:
- dependency-name: Moq
... (continued)

10732 of 20257 branches covered (0.0%)

Branch coverage included in aggregate %.

48141 of 48141 new or added lines in 1086 files covered. (100.0%)

30685 of 52388 relevant lines covered (58.57%)

7387.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

81.62
/Rdmp.Core/DataLoad/Modules/DataFlowSources/ExcelDataFlowSource.cs
1
// Copyright (c) The University of Dundee 2018-2019
2
// This file is part of the Research Data Management Platform (RDMP).
3
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
4
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
5
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.
6

7
using System;
8
using System.Collections.Generic;
9
using System.Data;
10
using System.Diagnostics;
11
using System.Globalization;
12
using System.IO;
13
using System.Linq;
14
using System.Threading;
15
using ExcelNumberFormat;
16
using FAnsi.Discovery;
17
using NPOI.HSSF.UserModel;
18
using NPOI.SS.UserModel;
19
using NPOI.XSSF.UserModel;
20
using Rdmp.Core.Curation.Data;
21
using Rdmp.Core.DataFlowPipeline;
22
using Rdmp.Core.DataFlowPipeline.Requirements;
23
using Rdmp.Core.DataLoad.Modules.Exceptions;
24
using Rdmp.Core.ReusableLibraryCode.Checks;
25
using Rdmp.Core.ReusableLibraryCode.Progress;
26

27
namespace Rdmp.Core.DataLoad.Modules.DataFlowSources;
28

29
/// <summary>
30
/// Pipeline component for reading from Microsoft Excel files.  Reads only from a single worksheet (by default the first one in the workbook).  Data read
31
/// is returned as a DataTable all read at once in one big batch.  This component requires Microsoft Office to be installed since it uses Interop.
32
/// </summary>
33
public class ExcelDataFlowSource : IPluginDataFlowSource<DataTable>, IPipelineRequirement<FlatFileToLoad>
34
{
35
    public const string WorkSheetName_DemandDescription =
36
        "Name of the worksheet to load data from (single sheet name only).  If this is empty then the first sheet in the spreadsheet will be loaded instead";
37

38
    public const string AddFilenameColumnNamed_DemandDescription =
39
        "Optional - Set to the name of a column in your RAW database (e.g. Filename).  If set this named column will be populated with the path to the file being read (e.g. c:\\myproj\\Data\\ForLoading\\MyFile.csv)";
40

41
    [DemandsInitialization(WorkSheetName_DemandDescription)]
42
    public string WorkSheetName { get; set; }
30✔
43

44
    [DemandsInitialization(DelimitedFlatFileDataFlowSource.MakeHeaderNamesSane_DemandDescription,
45
        DemandType.Unspecified, true)]
46
    public bool MakeHeaderNamesSane { get; set; }
2✔
47

48
    [DemandsInitialization(AddFilenameColumnNamed_DemandDescription)]
49
    public string AddFilenameColumnNamed { get; set; }
52✔
50

51
    private FlatFileToLoad _fileToLoad;
52

53
    private DataTable dataReadFromFile;
54
    private bool haveDispatchedDataTable;
55

56
    public DataTable GetChunk(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken)
57
    {
58
        dataReadFromFile ??= GetAllData(listener, cancellationToken);
26✔
59

60
        if (haveDispatchedDataTable)
24✔
61
            return null;
2✔
62

63
        haveDispatchedDataTable = true;
22✔
64

65
        return dataReadFromFile;
22✔
66
    }
67

68
    private DataTable GetAllData(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken)
69
    {
70
        var sw = new Stopwatch();
24✔
71
        sw.Start();
24✔
72
        if (_fileToLoad == null)
24!
73
            throw new Exception("_fileToLoad has not been set yet, possibly component has not been Initialized yet");
×
74

75
        if (!IsAcceptableFileExtension())
24!
76
            throw new Exception($"FileToLoad ({_fileToLoad.File.FullName}) extension was not XLS or XLSX, dubious");
×
77

78
        using var fs = new FileStream(_fileToLoad.File.FullName, FileMode.Open);
24✔
79
        IWorkbook wb = _fileToLoad.File.Extension == ".xls" ? new HSSFWorkbook(fs) : new XSSFWorkbook(fs);
24✔
80

81
        DataTable toReturn = null;
24✔
82

83
        try
84
        {
85
            var worksheet =
24!
86
                //if the user hasn't picked one, use the first
24✔
87
                (string.IsNullOrWhiteSpace(WorkSheetName) ? wb.GetSheetAt(0) : wb.GetSheet(WorkSheetName)) ??
24✔
88
                throw new FlatFileLoadException(
24✔
89
                    $"The Excel sheet '{WorkSheetName}' was not found in workbook '{_fileToLoad.File.Name}'");
24✔
90
            toReturn = GetAllData(worksheet, listener);
24✔
91

92
            //set the table name the file name
93
            toReturn.TableName =
24✔
94
                QuerySyntaxHelper.MakeHeaderNameSensible(Path.GetFileNameWithoutExtension(_fileToLoad.File.Name));
24✔
95

96
            if (toReturn.Columns.Count == 0)
24✔
97
                throw new FlatFileLoadException(
2✔
98
                    $"The Excel sheet '{worksheet.SheetName}' in workbook '{_fileToLoad.File.Name}' is empty");
2✔
99

100
            //if the user wants a column in the DataTable storing the filename loaded add it
101
            if (!string.IsNullOrWhiteSpace(AddFilenameColumnNamed))
22✔
102
            {
103
                toReturn.Columns.Add(AddFilenameColumnNamed);
4✔
104
                foreach (DataRow dataRow in toReturn.Rows)
48✔
105
                    dataRow[AddFilenameColumnNamed] = _fileToLoad.File.FullName;
20✔
106
            }
107
        }
18✔
108
        finally
109
        {
110
            wb.Close();
24✔
111
            toReturn?.EndLoadData();
24✔
112
        }
24✔
113

114
        return toReturn;
22✔
115
    }
22✔
116

117
    /// <summary>
118
    /// Returns all data held in the current <paramref name="worksheet"/>.  The first row of data becomes the headers.  Throws away fully blank columns/rows.
119
    /// </summary>
120
    /// <param name="worksheet"></param>
121
    /// <param name="listener"></param>
122
    /// <returns></returns>
123
    public DataTable GetAllData(ISheet worksheet, IDataLoadEventListener listener)
124
    {
125
        var toReturn = new DataTable();
38✔
126
        toReturn.BeginLoadData();
38✔
127

128
        var rowEnumerator = worksheet.GetRowEnumerator();
38✔
129
        var nColumns = -1;
38✔
130

131
        var nonBlankColumns = new Dictionary<int, DataColumn>();
38✔
132

133
        while (rowEnumerator.MoveNext())
740✔
134
        {
135
            var row = (IRow)rowEnumerator.Current;
702✔
136

137
            //if all the cells in the current row are blank skip it (eliminates top of file whitespace)
138
            if (row.Cells.All(c => string.IsNullOrWhiteSpace(c.ToString())))
1,516✔
139
                continue;
140

141
            //first row (that has any data in it) - makes headers
142
            if (nColumns == -1)
678✔
143
            {
144
                nColumns = row.Cells.Count;
36✔
145
                listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
36✔
146
                    $"Excel sheet {worksheet.SheetName} contains {nColumns}"));
36✔
147

148
                for (var i = 0; i < nColumns; i++)
740✔
149
                {
150
                    //if the cell header is blank
151
                    var cell = row.Cells[i];
334✔
152
                    var h = cell.StringCellValue;
334✔
153
                    if (string.IsNullOrWhiteSpace(h))
334✔
154
                        continue;
155

156
                    nonBlankColumns.Add(cell.ColumnIndex, toReturn.Columns.Add(h));
332✔
157
                }
158

159
                continue;
36✔
160
            }
161

162
            //the rest of the rows
163
            var r = toReturn.Rows.Add();
642✔
164

165
            var gotAtLeastOneGoodValue = false;
642✔
166

167
            foreach (var cell in row.Cells)
22,992✔
168
            {
169
                var value = GetCellValue(cell);
10,854✔
170

171
                //if the cell is blank skip it
172
                if (IsNull(value))
10,854✔
173
                    continue;
174

175
                //were we expecting this to be blank?
176
                if (!nonBlankColumns.ContainsKey(cell.ColumnIndex))
10,736✔
177
                {
178
                    listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning,
8✔
179
                        $"Discarded the following data (that was found in unnamed columns):{value}"));
8✔
180
                    continue;
8✔
181
                }
182

183
                r[nonBlankColumns[cell.ColumnIndex]] = value.ToString();
10,728✔
184
                gotAtLeastOneGoodValue = true;
10,728✔
185
            }
186

187
            //if we didn't get any values at all for the row throw it away
188
            if (!gotAtLeastOneGoodValue)
642✔
189
                toReturn.Rows.Remove(r);
8✔
190
        }
191

192
        return toReturn;
38✔
193
    }
194

195
    /// <summary>
196
    /// Returns the C# value that best represents the contents of the cell.
197
    /// </summary>
198
    /// <param name="cell">The cell whose value you want to retrieve</param>
199
    /// <param name="treatAs">Leave blank, used in recursion for dealing with Formula cells</param>
200
    /// <returns></returns>
201
    private object GetCellValue(ICell cell, CellType treatAs = CellType.Unknown)
202
    {
203
        if (cell == null)
10,856!
204
            return null;
×
205

206
        treatAs = treatAs switch
10,856!
207
        {
10,856✔
208
            CellType.Formula => throw new Exception("Cannot treat the cell contents as a Formula"),
×
209
            CellType.Unknown => cell.CellType,
10,854✔
210
            _ => treatAs
2!
211
        };
10,856✔
212

213
        switch (treatAs)
214
        {
215
            case CellType.Unknown:
216
                return cell.ToString();
×
217
            case CellType.Numeric:
218

219
                //some numerics are actually dates/times
220
                if (cell.CellStyle.DataFormat == 0) return cell.NumericCellValue;
616✔
221
                var format = cell.CellStyle.GetDataFormatString();
228✔
222
                var f = new NumberFormat(format);
228✔
223

224
                if (IsDateWithoutTime(format))
228✔
225
                    return cell.DateCellValue.ToString("yyyy-MM-dd");
40✔
226

227
                if (IsDateWithTime(format))
188✔
228
                    return cell.DateCellValue.ToString("yyyy-MM-dd HH:mm:ss");
60✔
229

230
                if (IsTimeWithoutDate(format))
128✔
231
                    return cell.DateCellValue.ToString("HH:mm:ss");
80✔
232

233
                return IsDateFormat(format)
48!
234
                    ? f.Format(cell.DateCellValue, CultureInfo.InvariantCulture)
48✔
235
                    : f.Format(cell.NumericCellValue, CultureInfo.InvariantCulture);
48✔
236

237
            case CellType.String:
238

239
                var v = cell.StringCellValue;
10,326✔
240

241
                //if it is blank or 'null' then leave it null
242
                if (string.IsNullOrWhiteSpace(v) || v.Trim().Equals("NULL", StringComparison.CurrentCultureIgnoreCase))
10,326✔
243
                    return null;
12✔
244

245
                return cell.StringCellValue;
10,314✔
246
            case CellType.Formula:
247
                return GetCellValue(cell, cell.CachedFormulaResultType);
2✔
248
            case CellType.Blank:
249
                return null;
106✔
250
            case CellType.Boolean:
251
                return cell.BooleanCellValue;
×
252
            case CellType.Error:
253
                return null;
×
254
            default:
255
                throw new ArgumentOutOfRangeException();
×
256
        }
257
    }
258

259
    private static bool IsDateWithTime(string formatString) => formatString.Contains('h') && formatString.Contains('y');
188✔
260

261
    private static bool IsDateWithoutTime(string formatString) =>
262
        formatString.Contains('y') && !formatString.Contains('h');
228✔
263

264
    private static bool IsTimeWithoutDate(string formatString) =>
265
        !formatString.Contains('y') && formatString.Contains('h');
128!
266

267
    private static bool IsDateFormat(string formatString) => !string.IsNullOrWhiteSpace(formatString) &&
48!
268
                                                             (formatString.Contains('/') ||
48✔
269
                                                              formatString.Contains('\\') ||
48✔
270
                                                              formatString.Contains(':'));
48✔
271

272
    /*
273

274
    private string IntToExcelColumnLetter(int colNumberStartingAtOne)
275
    {
276
        int dividend = colNumberStartingAtOne;
277
        string columnName = String.Empty;
278
        int modulo;
279

280
        while (dividend > 0)
281
        {
282
            modulo = (dividend - 1) % 26;
283
            columnName = Convert.ToChar(65 + modulo) + columnName;
284
            dividend = (int)((dividend - modulo) / 26);
285
        }
286

287
        return columnName;
288
    }
289

290
    private object MakeTimeRelatedDecision(double value, Range cells, string type, dynamic cell,int row, int col)
291
    {
292
        if (type.Contains("/") || type.Contains("\\") || type.Contains(":"))
293
        {
294
            if (cell != null)
295
                return cell.Text;
296

297
            return cells[row, col].Text;
298
        }
299

300
        //timeRelatedDescisions
301
        return value;
302
    }
303
    */
304
    private string[] acceptedFileExtensions =
34✔
305
    {
34✔
306
        ".xlsx",
34✔
307
        ".xls"
34✔
308
    };
34✔
309

310
    private bool IsAcceptableFileExtension() => acceptedFileExtensions.Contains(_fileToLoad.File.Extension.ToLower());
28✔
311

312
    private static bool IsNull(object o) => o == null || o == DBNull.Value || string.IsNullOrWhiteSpace(o.ToString());
10,854✔
313

314
    public void Check(ICheckNotifier notifier)
315
    {
316
        if (_fileToLoad != null)
4!
317
            if (!IsAcceptableFileExtension())
4✔
318
                notifier.OnCheckPerformed(
2✔
319
                    new CheckEventArgs(
2✔
320
                        $"File extension {_fileToLoad.File} has an invalid extension:{_fileToLoad.File.Extension} (this class only accepts:{string.Join(",", acceptedFileExtensions)})",
2✔
321
                        CheckResult.Fail));
2✔
322
            else
323
                notifier.OnCheckPerformed(
2✔
324
                    new CheckEventArgs($"File extension of file {_fileToLoad.File.Name} is acceptable",
2✔
325
                        CheckResult.Success));
2✔
326
        else
327
            notifier.OnCheckPerformed(
×
328
                new CheckEventArgs(
×
329
                    "FlatFileToLoad (Pipeline Requirement) was not met (we weren't initialized with a file)",
×
330
                    CheckResult.Warning));
×
331
    }
×
332

333
    public void Dispose(IDataLoadEventListener listener, Exception pipelineFailureExceptionIfAny)
334
    {
335
    }
2✔
336

337
    public void Abort(IDataLoadEventListener listener)
338
    {
339
    }
×
340

341
    public DataTable TryGetPreview()
342
    {
343
        var timeoutToken = new CancellationTokenSource();
×
344
        timeoutToken.CancelAfter(10000);
×
345

346
        var token = new GracefulCancellationToken(timeoutToken.Token, timeoutToken.Token);
×
347

348
        DataTable dt;
349
        try
350
        {
351
            dt = GetAllData(ThrowImmediatelyDataLoadEventListener.Quiet, token);
×
352
        }
×
353
        catch (Exception e)
×
354
        {
355
            if (timeoutToken.IsCancellationRequested)
×
356
                throw new Exception(
×
357
                    "Failed to generate preview in 10 seconds or less, giving up trying to load a preview (this doesn't mean that the source is broken, more likely you just have a big file or something)",
×
358
                    e);
×
359

360
            throw;
×
361
        }
362

363
        return dt;
×
364
    }
365

366
    public void PreInitialize(FlatFileToLoad value, IDataLoadEventListener listener)
367
    {
368
        _fileToLoad = value;
36✔
369
    }
36✔
370
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc