• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

HicServices / RDMP / 7194961165

13 Dec 2023 12:07PM UTC coverage: 56.776% (-0.2%) from 57.013%
7194961165

push

github

web-flow
Merge Latest Release into main (#1702)

* Bump YamlDotNet from 13.3.1 to 13.4.0

Bumps [YamlDotNet](https://github.com/aaubry/YamlDotNet) from 13.3.1 to 13.4.0.
- [Release notes](https://github.com/aaubry/YamlDotNet/releases)
- [Commits](https://github.com/aaubry/YamlDotNet/compare/v13.3.1...v13.4.0)

---
updated-dependencies:
- dependency-name: YamlDotNet
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Bump shogo82148/actions-setup-perl from 1.23.1 to 1.24.1

Bumps [shogo82148/actions-setup-perl](https://github.com/shogo82148/actions-setup-perl) from 1.23.1 to 1.24.1.
- [Release notes](https://github.com/shogo82148/actions-setup-perl/releases)
- [Commits](https://github.com/shogo82148/actions-setup-perl/compare/v1.23.1...v1.24.1)

---
updated-dependencies:
- dependency-name: shogo82148/actions-setup-perl
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* fix checkbox issue

* improve confirmation text (#1639)

* improve confirmation text
* Loop tidyup, use var where possible

---------

Co-authored-by: jas88 <j.a.sutherland@dundee.ac.uk>

* correct typo in create logging sql (#1640)

* Feature/ci codescan (#1641)

* Move SecurityCodescan.VS2019 to run on Github CI alone, integrate results with CodeQL
* Remove SecurityCodescan from Packages.md, no longer used via Nuget

---------

Co-authored-by: James A Sutherland <j@sutherland.pw>

* hide source control when not available

* Remove old Plugin object bits, tidy up (#1636)

* Remove old Plugin object bits, tidy up

* Purge remaining bits of AllExpiredPluginsNode

* Fix plugin display name in tree

* Update CreateNewDataExtractionProjectUI.cs

Casting fix

* Feature/rdmp42 delete plugins (#1642)

* add ui plugin delete functionality

* Warning and inherita... (continued)

10722 of 20351 branches covered (0.0%)

Branch coverage included in aggregate %.

215 of 789 new or added lines in 63 files covered. (27.25%)

39 existing lines in 16 files now uncovered.

30650 of 52518 relevant lines covered (58.36%)

7294.17 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

81.47
/Rdmp.Core/DataLoad/Modules/DataFlowSources/ExcelDataFlowSource.cs
1
// Copyright (c) The University of Dundee 2018-2019
2
// This file is part of the Research Data Management Platform (RDMP).
3
// RDMP is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
4
// RDMP is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
5
// You should have received a copy of the GNU General Public License along with RDMP. If not, see <https://www.gnu.org/licenses/>.
6

7
using System;
8
using System.Collections.Generic;
9
using System.Data;
10
using System.Diagnostics;
11
using System.Globalization;
12
using System.IO;
13
using System.Linq;
14
using System.Threading;
15
using ExcelNumberFormat;
16
using FAnsi.Discovery;
17
using NPOI.HSSF.UserModel;
18
using NPOI.SS.UserModel;
19
using NPOI.XSSF.UserModel;
20
using Rdmp.Core.Curation.Data;
21
using Rdmp.Core.DataFlowPipeline;
22
using Rdmp.Core.DataFlowPipeline.Requirements;
23
using Rdmp.Core.DataLoad.Modules.Exceptions;
24
using Rdmp.Core.ReusableLibraryCode.Annotations;
25
using Rdmp.Core.ReusableLibraryCode.Checks;
26
using Rdmp.Core.ReusableLibraryCode.Progress;
27

28
namespace Rdmp.Core.DataLoad.Modules.DataFlowSources;
29

30
/// <summary>
31
/// Pipeline component for reading from Microsoft Excel files.  Reads only from a single worksheet (by default the first one in the workbook).  Data read
32
/// is returned as a DataTable all read at once in one big batch.  This component requires Microsoft Office to be installed since it uses Interop.
33
/// </summary>
34
public class ExcelDataFlowSource : IPluginDataFlowSource<DataTable>, IPipelineRequirement<FlatFileToLoad>
35
{
36
    public const string WorkSheetName_DemandDescription =
37
        "Name of the worksheet to load data from (single sheet name only).  If this is empty then the first sheet in the spreadsheet will be loaded instead";
38

39
    public const string AddFilenameColumnNamed_DemandDescription =
40
        "Optional - Set to the name of a column in your RAW database (e.g. Filename).  If set this named column will be populated with the path to the file being read (e.g. c:\\myproj\\Data\\ForLoading\\MyFile.csv)";
41

42
    [DemandsInitialization(WorkSheetName_DemandDescription)]
43
    public string WorkSheetName { get; set; }
30✔
44

45
    [DemandsInitialization(DelimitedFlatFileDataFlowSource.MakeHeaderNamesSane_DemandDescription,
46
        DemandType.Unspecified, true)]
47
    public bool MakeHeaderNamesSane { get; set; }
2✔
48

49
    [DemandsInitialization(AddFilenameColumnNamed_DemandDescription)]
50
    public string AddFilenameColumnNamed { get; set; }
52✔
51

52
    private FlatFileToLoad _fileToLoad;
53

54
    private DataTable dataReadFromFile;
55
    private bool haveDispatchedDataTable;
56

57
    public DataTable GetChunk(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken)
58
    {
59
        dataReadFromFile ??= GetAllData(listener, cancellationToken);
26✔
60

61
        if (haveDispatchedDataTable)
24✔
62
            return null;
2✔
63

64
        haveDispatchedDataTable = true;
22✔
65

66
        return dataReadFromFile;
22✔
67
    }
68

69
    private DataTable GetAllData(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken)
70
    {
71
        var sw = new Stopwatch();
24✔
72
        sw.Start();
24✔
73
        if (_fileToLoad == null)
24!
74
            throw new Exception("_fileToLoad has not been set yet, possibly component has not been Initialized yet");
×
75

76
        if (!IsAcceptableFileExtension())
24!
77
            throw new Exception($"FileToLoad ({_fileToLoad.File.FullName}) extension was not XLS or XLSX, dubious");
×
78

79
        using var fs = new FileStream(_fileToLoad.File.FullName, FileMode.Open);
24✔
80
        IWorkbook wb = _fileToLoad.File.Extension == ".xls" ? new HSSFWorkbook(fs) : new XSSFWorkbook(fs);
24✔
81

82
        DataTable toReturn = null;
24✔
83

84
        try
85
        {
86
            var worksheet =
24!
87
                //if the user hasn't picked one, use the first
24✔
88
                (string.IsNullOrWhiteSpace(WorkSheetName) ? wb.GetSheetAt(0) : wb.GetSheet(WorkSheetName)) ??
24✔
89
                throw new FlatFileLoadException(
24✔
90
                    $"The Excel sheet '{WorkSheetName}' was not found in workbook '{_fileToLoad.File.Name}'");
24✔
91
            toReturn = GetAllData(worksheet, listener);
24✔
92

93
            //set the table name the file name
94
            toReturn.TableName =
24✔
95
                QuerySyntaxHelper.MakeHeaderNameSensible(Path.GetFileNameWithoutExtension(_fileToLoad.File.Name));
24✔
96

97
            if (toReturn.Columns.Count == 0)
24✔
98
                throw new FlatFileLoadException(
2✔
99
                    $"The Excel sheet '{worksheet.SheetName}' in workbook '{_fileToLoad.File.Name}' is empty");
2✔
100

101
            //if the user wants a column in the DataTable storing the filename loaded add it
102
            if (!string.IsNullOrWhiteSpace(AddFilenameColumnNamed))
22✔
103
            {
104
                toReturn.Columns.Add(AddFilenameColumnNamed);
4✔
105
                foreach (DataRow dataRow in toReturn.Rows)
48✔
106
                    dataRow[AddFilenameColumnNamed] = _fileToLoad.File.FullName;
20✔
107
            }
108
        }
18✔
109
        finally
110
        {
111
            wb.Close();
24✔
112
            toReturn?.EndLoadData();
24✔
113
        }
24✔
114

115
        return toReturn;
22✔
116
    }
22✔
117

118
    /// <summary>
119
    /// Returns all data held in the current <paramref name="worksheet"/>.  The first row of data becomes the headers.  Throws away fully blank columns/rows.
120
    /// </summary>
121
    /// <param name="worksheet"></param>
122
    /// <param name="listener"></param>
123
    /// <returns></returns>
124
    public DataTable GetAllData(ISheet worksheet, IDataLoadEventListener listener)
125
    {
126
        var toReturn = new DataTable();
38✔
127
        toReturn.BeginLoadData();
38✔
128

129
        var rowEnumerator = worksheet.GetRowEnumerator();
38✔
130
        var nColumns = -1;
38✔
131

132
        var nonBlankColumns = new Dictionary<int, DataColumn>();
38✔
133

134
        while (rowEnumerator.MoveNext())
740✔
135
        {
136
            var row = (IRow)rowEnumerator.Current;
702✔
137

138
            //if all the cells in the current row are blank skip it (eliminates top of file whitespace)
139
            if (row.Cells.All(c => string.IsNullOrWhiteSpace(c.ToString())))
1,516✔
140
                continue;
141

142
            //first row (that has any data in it) - makes headers
143
            if (nColumns == -1)
678✔
144
            {
145
                nColumns = row.Cells.Count;
36✔
146
                listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
36✔
147
                    $"Excel sheet {worksheet.SheetName} contains {nColumns}"));
36✔
148

149
                for (var i = 0; i < nColumns; i++)
740✔
150
                {
151
                    //if the cell header is blank
152
                    var cell = row.Cells[i];
334✔
153
                    var h = cell.StringCellValue;
334✔
154
                    if (string.IsNullOrWhiteSpace(h))
334✔
155
                        continue;
156

157
                    nonBlankColumns.Add(cell.ColumnIndex, toReturn.Columns.Add(h));
332✔
158
                }
159

160
                continue;
36✔
161
            }
162

163
            //the rest of the rows
164
            var r = toReturn.Rows.Add();
642✔
165

166
            var gotAtLeastOneGoodValue = false;
642✔
167

168
            foreach (var cell in row.Cells)
22,992✔
169
            {
170
                var value = GetCellValue(cell);
10,854✔
171

172
                //if the cell is blank skip it
173
                if (IsNull(value))
10,854✔
174
                    continue;
175

176
                //were we expecting this to be blank?
177
                if (!nonBlankColumns.ContainsKey(cell.ColumnIndex))
10,736✔
178
                {
179
                    listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning,
8✔
180
                        $"Discarded the following data (that was found in unnamed columns):{value}"));
8✔
181
                    continue;
8✔
182
                }
183

184
                r[nonBlankColumns[cell.ColumnIndex]] = value.ToString();
10,728✔
185
                gotAtLeastOneGoodValue = true;
10,728✔
186
            }
187

188
            //if we didn't get any values at all for the row throw it away
189
            if (!gotAtLeastOneGoodValue)
642✔
190
                toReturn.Rows.Remove(r);
8✔
191
        }
192

193
        return toReturn;
38✔
194
    }
195

196
    /// <summary>
197
    /// Returns the C# value that best represents the contents of the cell.
198
    /// </summary>
199
    /// <param name="cell">The cell whose value you want to retrieve</param>
200
    /// <param name="treatAs">Leave blank, used in recursion for dealing with Formula cells</param>
201
    /// <returns></returns>
202
    private object GetCellValue([CanBeNull] ICell cell, CellType treatAs = CellType.Unknown)
203
    {
204
        if (cell == null)
10,856!
205
            return null;
×
206

207
        treatAs = treatAs switch
10,856!
208
        {
10,856✔
209
            CellType.Formula => throw new Exception("Cannot treat the cell contents as a Formula"),
×
210
            CellType.Unknown => cell.CellType,
10,854✔
211
            _ => treatAs
2!
212
        };
10,856✔
213

214
        switch (treatAs)
215
        {
216
            case CellType.Unknown:
217
                return cell.ToString();
×
218
            case CellType.Numeric:
219

220
                //some numerics are actually dates/times
221
                if (cell.CellStyle.DataFormat == 0) return cell.NumericCellValue;
616✔
222

223
                var format = cell.CellStyle.GetDataFormatString();
228✔
224

225
                if (IsDateWithoutTime(format))
228✔
226
                    return cell.DateCellValue.ToString("yyyy-MM-dd");
40✔
227

228
                if (IsDateWithTime(format))
188✔
229
                    return cell.DateCellValue.ToString("yyyy-MM-dd HH:mm:ss");
60✔
230

231
                if (IsTimeWithoutDate(format))
128✔
232
                    return cell.DateCellValue.ToString("HH:mm:ss");
80✔
233

234
                return new NumberFormat(format).Format(
48!
235
                    IsDateFormat(format) ? cell.DateCellValue : cell.NumericCellValue, CultureInfo.InvariantCulture);
48✔
236

237
            case CellType.String:
238

239
                //if it is blank or 'null' then leave it null
240
                return string.IsNullOrWhiteSpace(cell.StringCellValue) ||
10,326✔
241
                       cell.StringCellValue.Trim().Equals("NULL", StringComparison.CurrentCultureIgnoreCase)
10,326✔
242
                    ? null
10,326✔
243
                    : cell.StringCellValue;
10,326✔
244

245
            case CellType.Formula:
246
                return GetCellValue(cell, cell.CachedFormulaResultType);
2✔
247
            case CellType.Blank:
248
                return null;
106✔
249
            case CellType.Boolean:
250
                return cell.BooleanCellValue;
×
251
            case CellType.Error:
252
                return null;
×
253
            default:
NEW
254
                throw new ArgumentOutOfRangeException(nameof(treatAs));
×
255
        }
256
    }
257

258
    private static bool IsDateWithTime(string formatString) => formatString.Contains('h') && formatString.Contains('y');
188✔
259

260
    private static bool IsDateWithoutTime(string formatString) =>
261
        formatString.Contains('y') && !formatString.Contains('h');
228✔
262

263
    private static bool IsTimeWithoutDate(string formatString) =>
264
        !formatString.Contains('y') && formatString.Contains('h');
128!
265

266
    private static bool IsDateFormat(string formatString) => !string.IsNullOrWhiteSpace(formatString) &&
48!
267
                                                             (formatString.Contains('/') ||
48✔
268
                                                              formatString.Contains('\\') ||
48✔
269
                                                              formatString.Contains(':'));
48✔
270

271
    /*
272

273
    private string IntToExcelColumnLetter(int colNumberStartingAtOne)
274
    {
275
        int dividend = colNumberStartingAtOne;
276
        string columnName = String.Empty;
277
        int modulo;
278

279
        while (dividend > 0)
280
        {
281
            modulo = (dividend - 1) % 26;
282
            columnName = Convert.ToChar(65 + modulo) + columnName;
283
            dividend = (int)((dividend - modulo) / 26);
284
        }
285

286
        return columnName;
287
    }
288

289
    private object MakeTimeRelatedDecision(double value, Range cells, string type, dynamic cell,int row, int col)
290
    {
291
        if (type.Contains("/") || type.Contains("\\") || type.Contains(":"))
292
        {
293
            if (cell != null)
294
                return cell.Text;
295

296
            return cells[row, col].Text;
297
        }
298

299
        //timeRelatedDescisions
300
        return value;
301
    }
302
    */
303
    private string[] acceptedFileExtensions =
34✔
304
    {
34✔
305
        ".xlsx",
34✔
306
        ".xls"
34✔
307
    };
34✔
308

309
    private bool IsAcceptableFileExtension() => acceptedFileExtensions.Contains(_fileToLoad.File.Extension.ToLower());
28✔
310

311
    private static bool IsNull(object o) => o == null || o == DBNull.Value || string.IsNullOrWhiteSpace(o.ToString());
10,854✔
312

313
    public void Check(ICheckNotifier notifier)
314
    {
315
        if (_fileToLoad != null)
4!
316
            if (!IsAcceptableFileExtension())
4✔
317
                notifier.OnCheckPerformed(
2✔
318
                    new CheckEventArgs(
2✔
319
                        $"File extension {_fileToLoad.File} has an invalid extension:{_fileToLoad.File.Extension} (this class only accepts:{string.Join(",", acceptedFileExtensions)})",
2✔
320
                        CheckResult.Fail));
2✔
321
            else
322
                notifier.OnCheckPerformed(
2✔
323
                    new CheckEventArgs($"File extension of file {_fileToLoad.File.Name} is acceptable",
2✔
324
                        CheckResult.Success));
2✔
325
        else
326
            notifier.OnCheckPerformed(
×
327
                new CheckEventArgs(
×
328
                    "FlatFileToLoad (Pipeline Requirement) was not met (we weren't initialized with a file)",
×
329
                    CheckResult.Warning));
×
330
    }
×
331

332
    public void Dispose(IDataLoadEventListener listener, Exception pipelineFailureExceptionIfAny)
333
    {
334
    }
2✔
335

336
    public void Abort(IDataLoadEventListener listener)
337
    {
338
    }
×
339

340
    public DataTable TryGetPreview()
341
    {
342
        var timeoutToken = new CancellationTokenSource();
×
343
        timeoutToken.CancelAfter(10000);
×
344

345
        var token = new GracefulCancellationToken(timeoutToken.Token, timeoutToken.Token);
×
346

347
        DataTable dt;
348
        try
349
        {
350
            dt = GetAllData(ThrowImmediatelyDataLoadEventListener.Quiet, token);
×
351
        }
×
352
        catch (Exception e)
×
353
        {
354
            if (timeoutToken.IsCancellationRequested)
×
355
                throw new Exception(
×
356
                    "Failed to generate preview in 10 seconds or less, giving up trying to load a preview (this doesn't mean that the source is broken, more likely you just have a big file or something)",
×
357
                    e);
×
358

359
            throw;
×
360
        }
361

362
        return dt;
×
363
    }
364

365
    public void PreInitialize(FlatFileToLoad value, IDataLoadEventListener listener)
366
    {
367
        _fileToLoad = value;
36✔
368
    }
36✔
369
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc