• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

HicServices / RDMP / 8598528338

08 Apr 2024 10:28AM UTC coverage: 56.914% (+0.08%) from 56.837%
8598528338

push

github

web-flow
Move 8.1.5 to main for autoupdater (#1797)

* bump to net8

* add sqlcli

* Bump Autoupdater.NET.Official from 1.8.4 to 1.8.5

Bumps [Autoupdater.NET.Official](https://github.com/ravibpatel/AutoUpdater.NET) from 1.8.4 to 1.8.5.
- [Release notes](https://github.com/ravibpatel/AutoUpdater.NET/releases)
- [Commits](https://github.com/ravibpatel/AutoUpdater.NET/compare/v1.8.4...v1.8.5)

---
updated-dependencies:
- dependency-name: Autoupdater.NET.Official
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Delete .nuget/packages.config (#1758)

* Delete .nuget/packages.config

Remove  hopefully obsolete package list confusing dependency tracking

* Remove .nuget folder

---------

Co-authored-by: James A Sutherland <>

* Bump svenstaro/upload-release-action from 2.7.0 to 2.9.0 (#1759)

* Bump shogo82148/actions-setup-perl from 1.28.0 to 1.29.0 (#1751)

Bumps [shogo82148/actions-setup-perl](https://github.com/shogo82148/actions-setup-perl) from 1.28.0 to 1.29.0.
- [Release notes](https://github.com/shogo82148/actions-setup-perl/releases)
- [Commits](https://github.com/shogo82148/actions-setup-perl/compare/v1.28.0...v1.29.0)

---
updated-dependencies:
- dependency-name: shogo82148/actions-setup-perl
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: James Friel <jfriel001@dundee.ac.uk>

* Bump CsvHelper from 30.0.3 to 30.1.0 (#1737)

Bumps [CsvHelper](https://github.com/JoshClose/CsvHelper) from 30.0.3 to 30.1.0.
- [Commits](https://github.com/JoshClose/CsvHelper/compare/30.0.3...30.1.0)

---
updated-dependencies:
- dependency-name: CsvHelper
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

S... (continued)

10835 of 20509 branches covered (52.83%)

Branch coverage included in aggregate %.

99 of 176 new or added lines in 33 files covered. (56.25%)

10 existing lines in 7 files now uncovered.

30856 of 52744 relevant lines covered (58.5%)

7400.67 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

74.05
/Rdmp.Core/DataLoad/Modules/DataFlowSources/ExcelDataFlowSource.cs
1
// Copyright (c) The University of Dundee 2018-2024
2
// This file is part of the Research Data Management Platform (RDMP).
3
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
4
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
5
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.
6

7
using System;
8
using System.Collections.Generic;
9
using System.Data;
10
using System.Diagnostics;
11
using System.Globalization;
12
using System.IO;
13
using System.Linq;
14
using System.Text;
15
using System.Threading;
16
using ExcelNumberFormat;
17
using FAnsi.Discovery;
18
using NPOI.HSSF.UserModel;
19
using NPOI.SS.UserModel;
20
using NPOI.XSSF.UserModel;
21
using Rdmp.Core.Curation.Data;
22
using Rdmp.Core.DataFlowPipeline;
23
using Rdmp.Core.DataFlowPipeline.Requirements;
24
using Rdmp.Core.DataLoad.Modules.Exceptions;
25
using Rdmp.Core.ReusableLibraryCode.Annotations;
26
using Rdmp.Core.ReusableLibraryCode.Checks;
27
using Rdmp.Core.ReusableLibraryCode.Progress;
28

29
namespace Rdmp.Core.DataLoad.Modules.DataFlowSources;
30

31
/// <summary>
32
/// Pipeline component for reading from Microsoft Excel files.  Reads only from a single worksheet (by default the first one in the workbook).  Data read
33
/// is returned as a DataTable all read at once in one big batch.  This component requires Microsoft Office to be installed since it uses Interop.
34
/// </summary>
35
public class ExcelDataFlowSource : IPluginDataFlowSource<DataTable>, IPipelineRequirement<FlatFileToLoad>
36
{
37
    public const string WorkSheetName_DemandDescription =
38
        "Name of the worksheet to load data from (single sheet name only).  If this is empty then the first sheet in the spreadsheet will be loaded instead";
39

40
    public const string AddFilenameColumnNamed_DemandDescription =
41
        "Optional - Set to the name of a column in your RAW database (e.g. Filename).  If set this named column will be populated with the path to the file being read (e.g. c:\\myproj\\Data\\ForLoading\\MyFile.csv)";
42

43
    [DemandsInitialization(WorkSheetName_DemandDescription)]
44
    public string WorkSheetName { get; set; }
64✔
45

46
    [DemandsInitialization(DelimitedFlatFileDataFlowSource.MakeHeaderNamesSane_DemandDescription,
47
        DemandType.Unspecified, true)]
48
    public bool MakeHeaderNamesSane { get; set; }
2✔
49

50
    [DemandsInitialization(AddFilenameColumnNamed_DemandDescription)]
51
    public string AddFilenameColumnNamed { get; set; }
86✔
52

53
    private FlatFileToLoad _fileToLoad;
54

55
    private DataTable dataReadFromFile;
56
    private bool haveDispatchedDataTable;
57

58
    public DataTable GetChunk(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken, int rowOffset = 0, int columnOffset = 0, string[] replacementHeadersSplit = null)
59
    {
60
        dataReadFromFile ??= GetAllData(listener, cancellationToken, rowOffset, columnOffset, replacementHeadersSplit);
58✔
61

62
        if (haveDispatchedDataTable)
56✔
63
            return null;
18✔
64

65
        haveDispatchedDataTable = true;
38✔
66

67
        return dataReadFromFile;
38✔
68
    }
69

70
    private DataTable GetAllData(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken, int rowOffset = 0, int columnOffset = 0, string[] replacementHeadersSplit = null)
71
    {
72
        var sw = new Stopwatch();
40✔
73
        sw.Start();
40✔
74
        if (_fileToLoad == null)
40!
75
            throw new Exception("_fileToLoad has not been set yet, possibly component has not been Initialized yet");
×
76

77
        if (!IsAcceptableFileExtension())
40!
78
            throw new Exception($"FileToLoad ({_fileToLoad.File.FullName}) extension was not XLS or XLSX, dubious");
×
79

80
        using var fs = new FileStream(_fileToLoad.File.FullName, FileMode.Open);
40✔
81
        IWorkbook wb = _fileToLoad.File.Extension == ".xls" ? new HSSFWorkbook(fs) : new XSSFWorkbook(fs);
40✔
82

83
        DataTable toReturn = null;
40✔
84

85
        try
86
        {
87
            var worksheet =
40!
88
                //if the user hasn't picked one, use the first
40✔
89
                (string.IsNullOrWhiteSpace(WorkSheetName) ? wb.GetSheetAt(0) : wb.GetSheet(WorkSheetName)) ??
40✔
90
                throw new FlatFileLoadException(
40✔
91
                    $"The Excel sheet '{WorkSheetName}' was not found in workbook '{_fileToLoad.File.Name}'");
40✔
92
            toReturn = GetAllData(worksheet, listener, rowOffset, columnOffset, replacementHeadersSplit);
40✔
93

94
            //set the table name the file name
95
            toReturn.TableName =
40✔
96
                QuerySyntaxHelper.MakeHeaderNameSensible(Path.GetFileNameWithoutExtension(_fileToLoad.File.Name));
40✔
97

98
            if (toReturn.Columns.Count == 0)
40✔
99
                throw new FlatFileLoadException(
2✔
100
                    $"The Excel sheet '{worksheet.SheetName}' in workbook '{_fileToLoad.File.Name}' is empty");
2✔
101

102
            //if the user wants a column in the DataTable storing the filename loaded add it
103
            if (!string.IsNullOrWhiteSpace(AddFilenameColumnNamed))
38✔
104
            {
105
                toReturn.Columns.Add(AddFilenameColumnNamed);
4✔
106
                foreach (DataRow dataRow in toReturn.Rows)
48✔
107
                    dataRow[AddFilenameColumnNamed] = _fileToLoad.File.FullName;
20✔
108
            }
109
        }
34✔
110
        finally
111
        {
112
            wb.Close();
40✔
113
            toReturn?.EndLoadData();
40✔
114
        }
40✔
115

116
        return toReturn;
38✔
117
    }
38✔
118

119
    /// <summary>
120
    /// Returns all data held in the current <paramref name="worksheet"/>.  The first row of data becomes the headers.  Throws away fully blank columns/rows.
121
    /// </summary>
122
    /// <param name="worksheet"></param>
123
    /// <param name="listener"></param>
124
    /// <param name="rowOffset"></param>
125
    /// <param name="columnOffset"></param>
126
    /// <param name="replacementHeadersSplit"></param>
127
    /// <returns></returns>
128
    public DataTable GetAllData(ISheet worksheet, IDataLoadEventListener listener, int rowOffset = 0, int columnOffset = 0, string[] replacementHeadersSplit = null)
129
    {
130
        var toReturn = new DataTable();
54✔
131
        toReturn.BeginLoadData();
54✔
132

133
        var rowEnumerator = worksheet.GetRowEnumerator();
54✔
134
        var nColumns = -1;
54✔
135

136
        var nonBlankColumns = new Dictionary<int, DataColumn>();
54✔
137

138
        while (rowEnumerator.MoveNext())
806✔
139
        {
140
            var row = (IRow)rowEnumerator.Current;
752✔
141
            if (rowOffset - 1 > row.RowNum) continue;// .RowNumber is 0 indexed
752✔
142

143
            //if all the cells in the current row are blank skip it (eliminates top of file whitespace)
144
            if (row.Cells.All(c => string.IsNullOrWhiteSpace(c.ToString())))
1,612✔
145
                continue;
146

147
            //first row (that has any data in it) - makes headers
148
            if (nColumns == -1)
726✔
149
            {
150
                nColumns = row.Cells.Count;
52✔
151
                listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
52✔
152
                    $"Excel sheet {worksheet.SheetName} contains {nColumns}"));
52✔
153

154

155
                if (replacementHeadersSplit is not null && replacementHeadersSplit.Any() && replacementHeadersSplit.Length != nColumns)
52!
NEW
156
                    listener.OnNotify(this,
×
NEW
157
                        new NotifyEventArgs(ProgressEventType.Error,
×
NEW
158
                            $"ForceReplacementHeaders was set but it had {replacementHeadersSplit.Length} column header names while the file had {nColumns} (there must be the same number of replacement headers as headers in the excel file)"));
×
159

160
                string[] originalHeaders = new string[nColumns];
52✔
161
                for (var i = 0; i < nColumns; i++)
848✔
162
                {
163
                    //if the cell header is blank
164
                    var cell = row.Cells[i];
372✔
165
                    if (cell.ColumnIndex < columnOffset) continue;
372✔
166
                    string h;
167
                    try
168
                    {
169
                        h = cell.StringCellValue;
366✔
170
                    }
366✔
171
                    catch (Exception)
×
172
                    {
173
                        h = cell.NumericCellValue.ToString();
×
174
                    }
×
175
                    if (replacementHeadersSplit is not null && replacementHeadersSplit.Any() && replacementHeadersSplit.Length == nColumns)
366!
176
                    {
NEW
177
                        originalHeaders[i] = h;
×
NEW
178
                        h = replacementHeadersSplit[i];
×
179
                    }
180
                    if (string.IsNullOrWhiteSpace(h))
366✔
181
                        continue;
182

183
                    nonBlankColumns.Add(cell.ColumnIndex, toReturn.Columns.Add(h));
364✔
184
                }
185
                if (replacementHeadersSplit is not null && replacementHeadersSplit.Any())
52!
NEW
186
                    listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
×
NEW
187
                    $"Force headers will make the following header changes:{GenerateASCIIArtOfSubstitutions(originalHeaders, replacementHeadersSplit)}"));
×
188

UNCOV
189
                continue;
×
190
            }
191

192
            //the rest of the rows
193
            var r = toReturn.Rows.Add();
674✔
194

195
            var gotAtLeastOneGoodValue = false;
674✔
196

197
            foreach (var cell in row.Cells)
23,208✔
198
            {
199
                var value = GetCellValue(cell);
10,930✔
200

201
                //if the cell is blank skip it
202
                if (IsNull(value))
10,930✔
203
                    continue;
204

205
                //were we expecting this to be blank?
206
                if (!nonBlankColumns.ContainsKey(cell.ColumnIndex))
10,812✔
207
                {
208
                    listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning,
20✔
209
                        $"Discarded the following data (that was found in unnamed columns):{value}"));
20✔
210
                    continue;
20✔
211
                }
212

213
                r[nonBlankColumns[cell.ColumnIndex]] = value.ToString();
10,792✔
214
                gotAtLeastOneGoodValue = true;
10,792✔
215
            }
216

217
            //if we didn't get any values at all for the row throw it away
218
            if (!gotAtLeastOneGoodValue)
674✔
219
                toReturn.Rows.Remove(r);
8✔
220
        }
221

222
        return toReturn;
54✔
223
    }
224

225
    private static string GenerateASCIIArtOfSubstitutions(string[] headers,
226
      string[] replacements)
227
    {
NEW
228
        var sb = new StringBuilder("");
×
229

NEW
230
        var max = Math.Max(replacements.Length, headers.Length);
×
231

NEW
232
        for (var i = 0; i < max; i++)
×
233
        {
NEW
234
            var replacement = i >= replacements.Length ? "???" : replacements[i];
×
NEW
235
            var original = i >= headers.Length ? "???" : headers[i];
×
236

NEW
237
            sb.Append($"{Environment.NewLine}[{i}]{original}>>>{replacement}");
×
238
        }
239

NEW
240
        return sb.ToString();
×
241
    }
242

243
    /// <summary>
244
    /// Returns the C# value that best represents the contents of the cell.
245
    /// </summary>
246
    /// <param name="cell">The cell whose value you want to retrieve</param>
247
    /// <param name="treatAs">Leave blank, used in recursion for dealing with Formula cells</param>
248
    /// <returns></returns>
249
    private object GetCellValue([CanBeNull] ICell cell, CellType treatAs = CellType.Unknown)
250
    {
251
        if (cell == null)
10,932!
252
            return null;
×
253

254
        treatAs = treatAs switch
10,932!
255
        {
10,932✔
256
            CellType.Formula => throw new Exception("Cannot treat the cell contents as a Formula"),
×
257
            CellType.Unknown => cell.CellType,
10,930✔
258
            _ => treatAs
2!
259
        };
10,932✔
260

261
        switch (treatAs)
262
        {
263
            case CellType.Unknown:
264
                return cell.ToString();
×
265
            case CellType.Numeric:
266

267
                //some numerics are actually dates/times
268
                if (cell.CellStyle.DataFormat == 0) return cell.NumericCellValue;
616✔
269

270
                var format = cell.CellStyle.GetDataFormatString();
228✔
271

272
                if (IsDateWithoutTime(format))
228✔
273
                    return cell.DateCellValue.HasValue ? cell.DateCellValue.Value.ToString("yyyy-MM-dd") : null;
40!
274

275
                if (IsDateWithTime(format))
188✔
276
                    return cell.DateCellValue.HasValue ? cell.DateCellValue.Value.ToString("yyyy-MM-dd HH:mm:ss") : null;
60!
277

278
                if (IsTimeWithoutDate(format))
128✔
279
                    return cell.DateCellValue.HasValue ? cell.DateCellValue.Value.ToString("HH:mm:ss") : null;
80!
280

281
                return new NumberFormat(format).Format(
48!
282
                    IsDateFormat(format) ? cell.DateCellValue : cell.NumericCellValue, CultureInfo.InvariantCulture);
48✔
283

284
            case CellType.String:
285

286
                //if it is blank or 'null' then leave it null
287
                return string.IsNullOrWhiteSpace(cell.StringCellValue) ||
10,402✔
288
                       cell.StringCellValue.Trim().Equals("NULL", StringComparison.CurrentCultureIgnoreCase)
10,402✔
289
                    ? null
10,402✔
290
                    : cell.StringCellValue;
10,402✔
291

292
            case CellType.Formula:
293
                return GetCellValue(cell, cell.CachedFormulaResultType);
2✔
294
            case CellType.Blank:
295
                return null;
106✔
296
            case CellType.Boolean:
297
                return cell.BooleanCellValue;
×
298
            case CellType.Error:
299
                return null;
×
300
            default:
301
                throw new ArgumentOutOfRangeException(nameof(treatAs));
×
302
        }
303
    }
304

305
    private static bool IsDateWithTime(string formatString) => formatString.Contains('h') && formatString.Contains('y');
188✔
306

307
    private static bool IsDateWithoutTime(string formatString) =>
308
        formatString.Contains('y') && !formatString.Contains('h');
228✔
309

310
    private static bool IsTimeWithoutDate(string formatString) =>
311
        !formatString.Contains('y') && formatString.Contains('h');
128!
312

313
    private static bool IsDateFormat(string formatString) => !string.IsNullOrWhiteSpace(formatString) &&
48!
314
                                                             (formatString.Contains('/') ||
48✔
315
                                                              formatString.Contains('\\') ||
48✔
316
                                                              formatString.Contains(':'));
48✔
317

318
    /*
319

320
    private string IntToExcelColumnLetter(int colNumberStartingAtOne)
321
    {
322
        int dividend = colNumberStartingAtOne;
323
        string columnName = String.Empty;
324
        int modulo;
325

326
        while (dividend > 0)
327
        {
328
            modulo = (dividend - 1) % 26;
329
            columnName = Convert.ToChar(65 + modulo) + columnName;
330
            dividend = (int)((dividend - modulo) / 26);
331
        }
332

333
        return columnName;
334
    }
335

336
    private object MakeTimeRelatedDecision(double value, Range cells, string type, dynamic cell,int row, int col)
337
    {
338
        if (type.Contains("/") || type.Contains("\\") || type.Contains(":"))
339
        {
340
            if (cell != null)
341
                return cell.Text;
342

343
            return cells[row, col].Text;
344
        }
345

346
        //timeRelatedDescisions
347
        return value;
348
    }
349
    */
350
    private string[] acceptedFileExtensions =
52✔
351
    {
52✔
352
        ".xlsx",
52✔
353
        ".xls"
52✔
354
    };
52✔
355

356
    private bool IsAcceptableFileExtension() => acceptedFileExtensions.Contains(_fileToLoad.File.Extension.ToLower());
44✔
357

358
    private static bool IsNull(object o) => o == null || o == DBNull.Value || string.IsNullOrWhiteSpace(o.ToString());
10,930✔
359

360
    public void Check(ICheckNotifier notifier)
361
    {
362
        if (_fileToLoad != null)
4!
363
            if (!IsAcceptableFileExtension())
4✔
364
                notifier.OnCheckPerformed(
2✔
365
                    new CheckEventArgs(
2✔
366
                        $"File extension {_fileToLoad.File} has an invalid extension:{_fileToLoad.File.Extension} (this class only accepts:{string.Join(",", acceptedFileExtensions)})",
2✔
367
                        CheckResult.Fail));
2✔
368
            else
369
                notifier.OnCheckPerformed(
2✔
370
                    new CheckEventArgs($"File extension of file {_fileToLoad.File.Name} is acceptable",
2✔
371
                        CheckResult.Success));
2✔
372
        else
373
            notifier.OnCheckPerformed(
×
374
                new CheckEventArgs(
×
375
                    "FlatFileToLoad (Pipeline Requirement) was not met (we weren't initialized with a file)",
×
376
                    CheckResult.Warning));
×
377
    }
×
378

379
    public void Dispose(IDataLoadEventListener listener, Exception pipelineFailureExceptionIfAny)
380
    {
381
    }
2✔
382

383
    public void Abort(IDataLoadEventListener listener)
384
    {
385
    }
×
386

387
    public DataTable TryGetPreview()
388
    {
389
        var timeoutToken = new CancellationTokenSource();
×
390
        timeoutToken.CancelAfter(10000);
×
391

392
        var token = new GracefulCancellationToken(timeoutToken.Token, timeoutToken.Token);
×
393

394
        DataTable dt;
395
        try
396
        {
397
            dt = GetAllData(ThrowImmediatelyDataLoadEventListener.Quiet, token);
×
398
        }
×
399
        catch (Exception e)
×
400
        {
401
            if (timeoutToken.IsCancellationRequested)
×
402
                throw new Exception(
×
403
                    "Failed to generate preview in 10 seconds or less, giving up trying to load a preview (this doesn't mean that the source is broken, more likely you just have a big file or something)",
×
404
                    e);
×
405

406
            throw;
×
407
        }
408

409
        return dt;
×
410
    }
411

412
    public void PreInitialize(FlatFileToLoad value, IDataLoadEventListener listener)
413
    {
414
        _fileToLoad = value;
54✔
415
    }
54✔
416

417
    public DataTable GetChunk(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken)
418
    {
419
        return GetChunk(listener, cancellationToken, 0, 0);
26✔
420
    }
421
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc